diff options
Diffstat (limited to 'Godeps')
26 files changed, 6540 insertions, 15 deletions
diff --git a/Godeps/Godeps.json b/Godeps/Godeps.json index e4b37a12e..01bda12bf 100644 --- a/Godeps/Godeps.json +++ b/Godeps/Godeps.json @@ -16,8 +16,8 @@ }, { "ImportPath": "github.com/ethereum/ethash", - "Comment": "v23.1-234-g062e40a", - "Rev": "062e40a1a1671f5a5102862b56e4c56f68a732f5" + "Comment": "v23.1-235-gb39e007", + "Rev": "b39e007d393ab5945b4c0748a7415b7e31c5db04" }, { "ImportPath": "github.com/fatih/color", diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/cl.go b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/cl.go new file mode 100644 index 000000000..3d577b2b6 --- /dev/null +++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/cl.go @@ -0,0 +1,26 @@ +/* +Package cl provides a binding to the OpenCL api. It's mostly a low-level +wrapper that avoids adding functionality while still making the interface +a little more friendly and easy to use. + +Resource life-cycle management: + +For any CL object that gets created (buffer, queue, kernel, etc..) you should +call object.Release() when finished with it to free the CL resources. This +explicitely calls the related clXXXRelease method for the type. However, +as a fallback there is a finalizer set for every resource item that takes +care of it (eventually) if Release isn't called. In this way you can have +better control over the life cycle of resources while having a fall back +to avoid leaks. This is similar to how file handles and such are handled +in the Go standard packages. +*/ +package cl + +// #include "headers/1.2/opencl.h" +// #cgo CFLAGS: -Iheaders/1.2 +// #cgo darwin LDFLAGS: -framework OpenCL +// #cgo linux LDFLAGS: -lOpenCL +import "C" +import "errors" + +var ErrUnsupported = errors.New("cl: unsupported") diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/cl_test.go b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/cl_test.go new file mode 100644 index 000000000..7659ce14f --- /dev/null +++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/cl_test.go @@ -0,0 +1,254 @@ +package cl + +import ( + "math/rand" + "reflect" + "strings" + "testing" +) + +var kernelSource = ` +__kernel void square( + __global float* input, + __global float* output, + const unsigned int count) +{ + int i = get_global_id(0); + if(i < count) + output[i] = input[i] * input[i]; +} +` + +func getObjectStrings(object interface{}) map[string]string { + v := reflect.ValueOf(object) + t := reflect.TypeOf(object) + + strs := make(map[string]string) + + numMethods := t.NumMethod() + for i := 0; i < numMethods; i++ { + method := t.Method(i) + if method.Type.NumIn() == 1 && method.Type.NumOut() == 1 && method.Type.Out(0).Kind() == reflect.String { + // this is a string-returning method with (presumably) only a pointer receiver parameter + // call it + outs := v.Method(i).Call([]reflect.Value{}) + // put the result in our map + strs[method.Name] = (outs[0].Interface()).(string) + } + } + + return strs +} + +func TestPlatformStringsContainNoNULs(t *testing.T) { + platforms, err := GetPlatforms() + if err != nil { + t.Fatalf("Failed to get platforms: %+v", err) + } + + for _, p := range platforms { + for key, value := range getObjectStrings(p) { + if strings.Contains(value, "\x00") { + t.Fatalf("platform string %q = %+q contains NUL", key, value) + } + } + } +} + +func TestDeviceStringsContainNoNULs(t *testing.T) { + platforms, err := GetPlatforms() + if err != nil { + t.Fatalf("Failed to get platforms: %+v", err) + } + + for _, p := range platforms { + devs, err := p.GetDevices(DeviceTypeAll) + if err != nil { + t.Fatalf("Failed to get devices for platform %q: %+v", p.Name(), err) + } + + for _, d := range devs { + for key, value := range getObjectStrings(d) { + if strings.Contains(value, "\x00") { + t.Fatalf("device string %q = %+q contains NUL", key, value) + } + } + } + } +} + +func TestHello(t *testing.T) { + var data [1024]float32 + for i := 0; i < len(data); i++ { + data[i] = rand.Float32() + } + + platforms, err := GetPlatforms() + if err != nil { + t.Fatalf("Failed to get platforms: %+v", err) + } + for i, p := range platforms { + t.Logf("Platform %d:", i) + t.Logf(" Name: %s", p.Name()) + t.Logf(" Vendor: %s", p.Vendor()) + t.Logf(" Profile: %s", p.Profile()) + t.Logf(" Version: %s", p.Version()) + t.Logf(" Extensions: %s", p.Extensions()) + } + platform := platforms[0] + + devices, err := platform.GetDevices(DeviceTypeAll) + if err != nil { + t.Fatalf("Failed to get devices: %+v", err) + } + if len(devices) == 0 { + t.Fatalf("GetDevices returned no devices") + } + deviceIndex := -1 + for i, d := range devices { + if deviceIndex < 0 && d.Type() == DeviceTypeGPU { + deviceIndex = i + } + t.Logf("Device %d (%s): %s", i, d.Type(), d.Name()) + t.Logf(" Address Bits: %d", d.AddressBits()) + t.Logf(" Available: %+v", d.Available()) + // t.Logf(" Built-In Kernels: %s", d.BuiltInKernels()) + t.Logf(" Compiler Available: %+v", d.CompilerAvailable()) + t.Logf(" Double FP Config: %s", d.DoubleFPConfig()) + t.Logf(" Driver Version: %s", d.DriverVersion()) + t.Logf(" Error Correction Supported: %+v", d.ErrorCorrectionSupport()) + t.Logf(" Execution Capabilities: %s", d.ExecutionCapabilities()) + t.Logf(" Extensions: %s", d.Extensions()) + t.Logf(" Global Memory Cache Type: %s", d.GlobalMemCacheType()) + t.Logf(" Global Memory Cacheline Size: %d KB", d.GlobalMemCachelineSize()/1024) + t.Logf(" Global Memory Size: %d MB", d.GlobalMemSize()/(1024*1024)) + t.Logf(" Half FP Config: %s", d.HalfFPConfig()) + t.Logf(" Host Unified Memory: %+v", d.HostUnifiedMemory()) + t.Logf(" Image Support: %+v", d.ImageSupport()) + t.Logf(" Image2D Max Dimensions: %d x %d", d.Image2DMaxWidth(), d.Image2DMaxHeight()) + t.Logf(" Image3D Max Dimenionns: %d x %d x %d", d.Image3DMaxWidth(), d.Image3DMaxHeight(), d.Image3DMaxDepth()) + // t.Logf(" Image Max Buffer Size: %d", d.ImageMaxBufferSize()) + // t.Logf(" Image Max Array Size: %d", d.ImageMaxArraySize()) + // t.Logf(" Linker Available: %+v", d.LinkerAvailable()) + t.Logf(" Little Endian: %+v", d.EndianLittle()) + t.Logf(" Local Mem Size Size: %d KB", d.LocalMemSize()/1024) + t.Logf(" Local Mem Type: %s", d.LocalMemType()) + t.Logf(" Max Clock Frequency: %d", d.MaxClockFrequency()) + t.Logf(" Max Compute Units: %d", d.MaxComputeUnits()) + t.Logf(" Max Constant Args: %d", d.MaxConstantArgs()) + t.Logf(" Max Constant Buffer Size: %d KB", d.MaxConstantBufferSize()/1024) + t.Logf(" Max Mem Alloc Size: %d KB", d.MaxMemAllocSize()/1024) + t.Logf(" Max Parameter Size: %d", d.MaxParameterSize()) + t.Logf(" Max Read-Image Args: %d", d.MaxReadImageArgs()) + t.Logf(" Max Samplers: %d", d.MaxSamplers()) + t.Logf(" Max Work Group Size: %d", d.MaxWorkGroupSize()) + t.Logf(" Max Work Item Dimensions: %d", d.MaxWorkItemDimensions()) + t.Logf(" Max Work Item Sizes: %d", d.MaxWorkItemSizes()) + t.Logf(" Max Write-Image Args: %d", d.MaxWriteImageArgs()) + t.Logf(" Memory Base Address Alignment: %d", d.MemBaseAddrAlign()) + t.Logf(" Native Vector Width Char: %d", d.NativeVectorWidthChar()) + t.Logf(" Native Vector Width Short: %d", d.NativeVectorWidthShort()) + t.Logf(" Native Vector Width Int: %d", d.NativeVectorWidthInt()) + t.Logf(" Native Vector Width Long: %d", d.NativeVectorWidthLong()) + t.Logf(" Native Vector Width Float: %d", d.NativeVectorWidthFloat()) + t.Logf(" Native Vector Width Double: %d", d.NativeVectorWidthDouble()) + t.Logf(" Native Vector Width Half: %d", d.NativeVectorWidthHalf()) + t.Logf(" OpenCL C Version: %s", d.OpenCLCVersion()) + // t.Logf(" Parent Device: %+v", d.ParentDevice()) + t.Logf(" Profile: %s", d.Profile()) + t.Logf(" Profiling Timer Resolution: %d", d.ProfilingTimerResolution()) + t.Logf(" Vendor: %s", d.Vendor()) + t.Logf(" Version: %s", d.Version()) + } + if deviceIndex < 0 { + deviceIndex = 0 + } + device := devices[deviceIndex] + t.Logf("Using device %d", deviceIndex) + context, err := CreateContext([]*Device{device}) + if err != nil { + t.Fatalf("CreateContext failed: %+v", err) + } + // imageFormats, err := context.GetSupportedImageFormats(0, MemObjectTypeImage2D) + // if err != nil { + // t.Fatalf("GetSupportedImageFormats failed: %+v", err) + // } + // t.Logf("Supported image formats: %+v", imageFormats) + queue, err := context.CreateCommandQueue(device, 0) + if err != nil { + t.Fatalf("CreateCommandQueue failed: %+v", err) + } + program, err := context.CreateProgramWithSource([]string{kernelSource}) + if err != nil { + t.Fatalf("CreateProgramWithSource failed: %+v", err) + } + if err := program.BuildProgram(nil, ""); err != nil { + t.Fatalf("BuildProgram failed: %+v", err) + } + kernel, err := program.CreateKernel("square") + if err != nil { + t.Fatalf("CreateKernel failed: %+v", err) + } + for i := 0; i < 3; i++ { + name, err := kernel.ArgName(i) + if err == ErrUnsupported { + break + } else if err != nil { + t.Errorf("GetKernelArgInfo for name failed: %+v", err) + break + } else { + t.Logf("Kernel arg %d: %s", i, name) + } + } + input, err := context.CreateEmptyBuffer(MemReadOnly, 4*len(data)) + if err != nil { + t.Fatalf("CreateBuffer failed for input: %+v", err) + } + output, err := context.CreateEmptyBuffer(MemReadOnly, 4*len(data)) + if err != nil { + t.Fatalf("CreateBuffer failed for output: %+v", err) + } + if _, err := queue.EnqueueWriteBufferFloat32(input, true, 0, data[:], nil); err != nil { + t.Fatalf("EnqueueWriteBufferFloat32 failed: %+v", err) + } + if err := kernel.SetArgs(input, output, uint32(len(data))); err != nil { + t.Fatalf("SetKernelArgs failed: %+v", err) + } + + local, err := kernel.WorkGroupSize(device) + if err != nil { + t.Fatalf("WorkGroupSize failed: %+v", err) + } + t.Logf("Work group size: %d", local) + size, _ := kernel.PreferredWorkGroupSizeMultiple(nil) + t.Logf("Preferred Work Group Size Multiple: %d", size) + + global := len(data) + d := len(data) % local + if d != 0 { + global += local - d + } + if _, err := queue.EnqueueNDRangeKernel(kernel, nil, []int{global}, []int{local}, nil); err != nil { + t.Fatalf("EnqueueNDRangeKernel failed: %+v", err) + } + + if err := queue.Finish(); err != nil { + t.Fatalf("Finish failed: %+v", err) + } + + results := make([]float32, len(data)) + if _, err := queue.EnqueueReadBufferFloat32(output, true, 0, results, nil); err != nil { + t.Fatalf("EnqueueReadBufferFloat32 failed: %+v", err) + } + + correct := 0 + for i, v := range data { + if results[i] == v*v { + correct++ + } + } + + if correct != len(data) { + t.Fatalf("%d/%d correct values", correct, len(data)) + } +} diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/context.go b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/context.go new file mode 100644 index 000000000..67441bb5f --- /dev/null +++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/context.go @@ -0,0 +1,161 @@ +package cl + +// #include <stdlib.h> +// #ifdef __APPLE__ +// #include "OpenCL/opencl.h" +// #else +// #include "cl.h" +// #endif +import "C" + +import ( + "runtime" + "unsafe" +) + +const maxImageFormats = 256 + +type Context struct { + clContext C.cl_context + devices []*Device +} + +type MemObject struct { + clMem C.cl_mem + size int +} + +func releaseContext(c *Context) { + if c.clContext != nil { + C.clReleaseContext(c.clContext) + c.clContext = nil + } +} + +func releaseMemObject(b *MemObject) { + if b.clMem != nil { + C.clReleaseMemObject(b.clMem) + b.clMem = nil + } +} + +func newMemObject(mo C.cl_mem, size int) *MemObject { + memObject := &MemObject{clMem: mo, size: size} + runtime.SetFinalizer(memObject, releaseMemObject) + return memObject +} + +func (b *MemObject) Release() { + releaseMemObject(b) +} + +// TODO: properties +func CreateContext(devices []*Device) (*Context, error) { + deviceIds := buildDeviceIdList(devices) + var err C.cl_int + clContext := C.clCreateContext(nil, C.cl_uint(len(devices)), &deviceIds[0], nil, nil, &err) + if err != C.CL_SUCCESS { + return nil, toError(err) + } + if clContext == nil { + return nil, ErrUnknown + } + context := &Context{clContext: clContext, devices: devices} + runtime.SetFinalizer(context, releaseContext) + return context, nil +} + +func (ctx *Context) GetSupportedImageFormats(flags MemFlag, imageType MemObjectType) ([]ImageFormat, error) { + var formats [maxImageFormats]C.cl_image_format + var nFormats C.cl_uint + if err := C.clGetSupportedImageFormats(ctx.clContext, C.cl_mem_flags(flags), C.cl_mem_object_type(imageType), maxImageFormats, &formats[0], &nFormats); err != C.CL_SUCCESS { + return nil, toError(err) + } + fmts := make([]ImageFormat, nFormats) + for i, f := range formats[:nFormats] { + fmts[i] = ImageFormat{ + ChannelOrder: ChannelOrder(f.image_channel_order), + ChannelDataType: ChannelDataType(f.image_channel_data_type), + } + } + return fmts, nil +} + +func (ctx *Context) CreateCommandQueue(device *Device, properties CommandQueueProperty) (*CommandQueue, error) { + var err C.cl_int + clQueue := C.clCreateCommandQueue(ctx.clContext, device.id, C.cl_command_queue_properties(properties), &err) + if err != C.CL_SUCCESS { + return nil, toError(err) + } + if clQueue == nil { + return nil, ErrUnknown + } + commandQueue := &CommandQueue{clQueue: clQueue, device: device} + runtime.SetFinalizer(commandQueue, releaseCommandQueue) + return commandQueue, nil +} + +func (ctx *Context) CreateProgramWithSource(sources []string) (*Program, error) { + cSources := make([]*C.char, len(sources)) + for i, s := range sources { + cs := C.CString(s) + cSources[i] = cs + defer C.free(unsafe.Pointer(cs)) + } + var err C.cl_int + clProgram := C.clCreateProgramWithSource(ctx.clContext, C.cl_uint(len(sources)), &cSources[0], nil, &err) + if err != C.CL_SUCCESS { + return nil, toError(err) + } + if clProgram == nil { + return nil, ErrUnknown + } + program := &Program{clProgram: clProgram, devices: ctx.devices} + runtime.SetFinalizer(program, releaseProgram) + return program, nil +} + +func (ctx *Context) CreateBufferUnsafe(flags MemFlag, size int, dataPtr unsafe.Pointer) (*MemObject, error) { + var err C.cl_int + clBuffer := C.clCreateBuffer(ctx.clContext, C.cl_mem_flags(flags), C.size_t(size), dataPtr, &err) + if err != C.CL_SUCCESS { + return nil, toError(err) + } + if clBuffer == nil { + return nil, ErrUnknown + } + return newMemObject(clBuffer, size), nil +} + +func (ctx *Context) CreateEmptyBuffer(flags MemFlag, size int) (*MemObject, error) { + return ctx.CreateBufferUnsafe(flags, size, nil) +} + +func (ctx *Context) CreateEmptyBufferFloat32(flags MemFlag, size int) (*MemObject, error) { + return ctx.CreateBufferUnsafe(flags, 4*size, nil) +} + +func (ctx *Context) CreateBuffer(flags MemFlag, data []byte) (*MemObject, error) { + return ctx.CreateBufferUnsafe(flags, len(data), unsafe.Pointer(&data[0])) +} + +//float64 +func (ctx *Context) CreateBufferFloat32(flags MemFlag, data []float32) (*MemObject, error) { + return ctx.CreateBufferUnsafe(flags, 4*len(data), unsafe.Pointer(&data[0])) +} + +func (ctx *Context) CreateUserEvent() (*Event, error) { + var err C.cl_int + clEvent := C.clCreateUserEvent(ctx.clContext, &err) + if err != C.CL_SUCCESS { + return nil, toError(err) + } + return newEvent(clEvent), nil +} + +func (ctx *Context) Release() { + releaseContext(ctx) +} + +// http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clCreateSubBuffer.html +// func (memObject *MemObject) CreateSubBuffer(flags MemFlag, bufferCreateType BufferCreateType, ) diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/device.go b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/device.go new file mode 100644 index 000000000..d62a6fb71 --- /dev/null +++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/device.go @@ -0,0 +1,510 @@ +package cl + +// #ifdef __APPLE__ +// #include "OpenCL/opencl.h" +// #else +// #include "cl.h" +// #include "cl_ext.h" +// #endif +import "C" + +import ( + "strings" + "unsafe" +) + +const maxDeviceCount = 64 + +type DeviceType uint + +const ( + DeviceTypeCPU DeviceType = C.CL_DEVICE_TYPE_CPU + DeviceTypeGPU DeviceType = C.CL_DEVICE_TYPE_GPU + DeviceTypeAccelerator DeviceType = C.CL_DEVICE_TYPE_ACCELERATOR + DeviceTypeDefault DeviceType = C.CL_DEVICE_TYPE_DEFAULT + DeviceTypeAll DeviceType = C.CL_DEVICE_TYPE_ALL +) + +type FPConfig int + +const ( + FPConfigDenorm FPConfig = C.CL_FP_DENORM // denorms are supported + FPConfigInfNaN FPConfig = C.CL_FP_INF_NAN // INF and NaNs are supported + FPConfigRoundToNearest FPConfig = C.CL_FP_ROUND_TO_NEAREST // round to nearest even rounding mode supported + FPConfigRoundToZero FPConfig = C.CL_FP_ROUND_TO_ZERO // round to zero rounding mode supported + FPConfigRoundToInf FPConfig = C.CL_FP_ROUND_TO_INF // round to positive and negative infinity rounding modes supported + FPConfigFMA FPConfig = C.CL_FP_FMA // IEEE754-2008 fused multiply-add is supported + FPConfigSoftFloat FPConfig = C.CL_FP_SOFT_FLOAT // Basic floating-point operations (such as addition, subtraction, multiplication) are implemented in software +) + +var fpConfigNameMap = map[FPConfig]string{ + FPConfigDenorm: "Denorm", + FPConfigInfNaN: "InfNaN", + FPConfigRoundToNearest: "RoundToNearest", + FPConfigRoundToZero: "RoundToZero", + FPConfigRoundToInf: "RoundToInf", + FPConfigFMA: "FMA", + FPConfigSoftFloat: "SoftFloat", +} + +func (c FPConfig) String() string { + var parts []string + for bit, name := range fpConfigNameMap { + if c&bit != 0 { + parts = append(parts, name) + } + } + if parts == nil { + return "" + } + return strings.Join(parts, "|") +} + +func (dt DeviceType) String() string { + var parts []string + if dt&DeviceTypeCPU != 0 { + parts = append(parts, "CPU") + } + if dt&DeviceTypeGPU != 0 { + parts = append(parts, "GPU") + } + if dt&DeviceTypeAccelerator != 0 { + parts = append(parts, "Accelerator") + } + if dt&DeviceTypeDefault != 0 { + parts = append(parts, "Default") + } + if parts == nil { + parts = append(parts, "None") + } + return strings.Join(parts, "|") +} + +type Device struct { + id C.cl_device_id +} + +func buildDeviceIdList(devices []*Device) []C.cl_device_id { + deviceIds := make([]C.cl_device_id, len(devices)) + for i, d := range devices { + deviceIds[i] = d.id + } + return deviceIds +} + +// Obtain the list of devices available on a platform. 'platform' refers +// to the platform returned by GetPlatforms or can be nil. If platform +// is nil, the behavior is implementation-defined. +func GetDevices(platform *Platform, deviceType DeviceType) ([]*Device, error) { + var deviceIds [maxDeviceCount]C.cl_device_id + var numDevices C.cl_uint + var platformId C.cl_platform_id + if platform != nil { + platformId = platform.id + } + if err := C.clGetDeviceIDs(platformId, C.cl_device_type(deviceType), C.cl_uint(maxDeviceCount), &deviceIds[0], &numDevices); err != C.CL_SUCCESS { + return nil, toError(err) + } + if numDevices > maxDeviceCount { + numDevices = maxDeviceCount + } + devices := make([]*Device, numDevices) + for i := 0; i < int(numDevices); i++ { + devices[i] = &Device{id: deviceIds[i]} + } + return devices, nil +} + +func (d *Device) nullableId() C.cl_device_id { + if d == nil { + return nil + } + return d.id +} + +func (d *Device) GetInfoString(param C.cl_device_info, panicOnError bool) (string, error) { + var strC [1024]C.char + var strN C.size_t + if err := C.clGetDeviceInfo(d.id, param, 1024, unsafe.Pointer(&strC), &strN); err != C.CL_SUCCESS { + if panicOnError { + panic("Should never fail") + } + return "", toError(err) + } + + // OpenCL strings are NUL-terminated, and the terminator is included in strN + // Go strings aren't NUL-terminated, so subtract 1 from the length + return C.GoStringN((*C.char)(unsafe.Pointer(&strC)), C.int(strN-1)), nil +} + +func (d *Device) getInfoUint(param C.cl_device_info, panicOnError bool) (uint, error) { + var val C.cl_uint + if err := C.clGetDeviceInfo(d.id, param, C.size_t(unsafe.Sizeof(val)), unsafe.Pointer(&val), nil); err != C.CL_SUCCESS { + if panicOnError { + panic("Should never fail") + } + return 0, toError(err) + } + return uint(val), nil +} + +func (d *Device) getInfoSize(param C.cl_device_info, panicOnError bool) (int, error) { + var val C.size_t + if err := C.clGetDeviceInfo(d.id, param, C.size_t(unsafe.Sizeof(val)), unsafe.Pointer(&val), nil); err != C.CL_SUCCESS { + if panicOnError { + panic("Should never fail") + } + return 0, toError(err) + } + return int(val), nil +} + +func (d *Device) getInfoUlong(param C.cl_device_info, panicOnError bool) (int64, error) { + var val C.cl_ulong + if err := C.clGetDeviceInfo(d.id, param, C.size_t(unsafe.Sizeof(val)), unsafe.Pointer(&val), nil); err != C.CL_SUCCESS { + if panicOnError { + panic("Should never fail") + } + return 0, toError(err) + } + return int64(val), nil +} + +func (d *Device) getInfoBool(param C.cl_device_info, panicOnError bool) (bool, error) { + var val C.cl_bool + if err := C.clGetDeviceInfo(d.id, param, C.size_t(unsafe.Sizeof(val)), unsafe.Pointer(&val), nil); err != C.CL_SUCCESS { + if panicOnError { + panic("Should never fail") + } + return false, toError(err) + } + return val == C.CL_TRUE, nil +} + +func (d *Device) Name() string { + str, _ := d.GetInfoString(C.CL_DEVICE_NAME, true) + return str +} + +func (d *Device) Vendor() string { + str, _ := d.GetInfoString(C.CL_DEVICE_VENDOR, true) + return str +} + +func (d *Device) Extensions() string { + str, _ := d.GetInfoString(C.CL_DEVICE_EXTENSIONS, true) + return str +} + +func (d *Device) OpenCLCVersion() string { + str, _ := d.GetInfoString(C.CL_DEVICE_OPENCL_C_VERSION, true) + return str +} + +func (d *Device) Profile() string { + str, _ := d.GetInfoString(C.CL_DEVICE_PROFILE, true) + return str +} + +func (d *Device) Version() string { + str, _ := d.GetInfoString(C.CL_DEVICE_VERSION, true) + return str +} + +func (d *Device) DriverVersion() string { + str, _ := d.GetInfoString(C.CL_DRIVER_VERSION, true) + return str +} + +// The default compute device address space size specified as an +// unsigned integer value in bits. Currently supported values are 32 or 64 bits. +func (d *Device) AddressBits() int { + val, _ := d.getInfoUint(C.CL_DEVICE_ADDRESS_BITS, true) + return int(val) +} + +// Size of global memory cache line in bytes. +func (d *Device) GlobalMemCachelineSize() int { + val, _ := d.getInfoUint(C.CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, true) + return int(val) +} + +// Maximum configured clock frequency of the device in MHz. +func (d *Device) MaxClockFrequency() int { + val, _ := d.getInfoUint(C.CL_DEVICE_MAX_CLOCK_FREQUENCY, true) + return int(val) +} + +// The number of parallel compute units on the OpenCL device. +// A work-group executes on a single compute unit. The minimum value is 1. +func (d *Device) MaxComputeUnits() int { + val, _ := d.getInfoUint(C.CL_DEVICE_MAX_COMPUTE_UNITS, true) + return int(val) +} + +// Max number of arguments declared with the __constant qualifier in a kernel. +// The minimum value is 8 for devices that are not of type CL_DEVICE_TYPE_CUSTOM. +func (d *Device) MaxConstantArgs() int { + val, _ := d.getInfoUint(C.CL_DEVICE_MAX_CONSTANT_ARGS, true) + return int(val) +} + +// Max number of simultaneous image objects that can be read by a kernel. +// The minimum value is 128 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE. +func (d *Device) MaxReadImageArgs() int { + val, _ := d.getInfoUint(C.CL_DEVICE_MAX_READ_IMAGE_ARGS, true) + return int(val) +} + +// Maximum number of samplers that can be used in a kernel. The minimum +// value is 16 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE. (Also see sampler_t.) +func (d *Device) MaxSamplers() int { + val, _ := d.getInfoUint(C.CL_DEVICE_MAX_SAMPLERS, true) + return int(val) +} + +// Maximum dimensions that specify the global and local work-item IDs used +// by the data parallel execution model. (Refer to clEnqueueNDRangeKernel). +// The minimum value is 3 for devices that are not of type CL_DEVICE_TYPE_CUSTOM. +func (d *Device) MaxWorkItemDimensions() int { + val, _ := d.getInfoUint(C.CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, true) + return int(val) +} + +// Max number of simultaneous image objects that can be written to by a +// kernel. The minimum value is 8 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE. +func (d *Device) MaxWriteImageArgs() int { + val, _ := d.getInfoUint(C.CL_DEVICE_MAX_WRITE_IMAGE_ARGS, true) + return int(val) +} + +// The minimum value is the size (in bits) of the largest OpenCL built-in +// data type supported by the device (long16 in FULL profile, long16 or +// int16 in EMBEDDED profile) for devices that are not of type CL_DEVICE_TYPE_CUSTOM. +func (d *Device) MemBaseAddrAlign() int { + val, _ := d.getInfoUint(C.CL_DEVICE_MEM_BASE_ADDR_ALIGN, true) + return int(val) +} + +func (d *Device) NativeVectorWidthChar() int { + val, _ := d.getInfoUint(C.CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, true) + return int(val) +} + +func (d *Device) NativeVectorWidthShort() int { + val, _ := d.getInfoUint(C.CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, true) + return int(val) +} + +func (d *Device) NativeVectorWidthInt() int { + val, _ := d.getInfoUint(C.CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, true) + return int(val) +} + +func (d *Device) NativeVectorWidthLong() int { + val, _ := d.getInfoUint(C.CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, true) + return int(val) +} + +func (d *Device) NativeVectorWidthFloat() int { + val, _ := d.getInfoUint(C.CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, true) + return int(val) +} + +func (d *Device) NativeVectorWidthDouble() int { + val, _ := d.getInfoUint(C.CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, true) + return int(val) +} + +func (d *Device) NativeVectorWidthHalf() int { + val, _ := d.getInfoUint(C.CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, true) + return int(val) +} + +// Max height of 2D image in pixels. The minimum value is 8192 +// if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE. +func (d *Device) Image2DMaxHeight() int { + val, _ := d.getInfoSize(C.CL_DEVICE_IMAGE2D_MAX_HEIGHT, true) + return int(val) +} + +// Max width of 2D image or 1D image not created from a buffer object in +// pixels. The minimum value is 8192 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE. +func (d *Device) Image2DMaxWidth() int { + val, _ := d.getInfoSize(C.CL_DEVICE_IMAGE2D_MAX_WIDTH, true) + return int(val) +} + +// Max depth of 3D image in pixels. The minimum value is 2048 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE. +func (d *Device) Image3DMaxDepth() int { + val, _ := d.getInfoSize(C.CL_DEVICE_IMAGE3D_MAX_DEPTH, true) + return int(val) +} + +// Max height of 3D image in pixels. The minimum value is 2048 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE. +func (d *Device) Image3DMaxHeight() int { + val, _ := d.getInfoSize(C.CL_DEVICE_IMAGE3D_MAX_HEIGHT, true) + return int(val) +} + +// Max width of 3D image in pixels. The minimum value is 2048 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE. +func (d *Device) Image3DMaxWidth() int { + val, _ := d.getInfoSize(C.CL_DEVICE_IMAGE3D_MAX_WIDTH, true) + return int(val) +} + +// Max size in bytes of the arguments that can be passed to a kernel. The +// minimum value is 1024 for devices that are not of type CL_DEVICE_TYPE_CUSTOM. +// For this minimum value, only a maximum of 128 arguments can be passed to a kernel. +func (d *Device) MaxParameterSize() int { + val, _ := d.getInfoSize(C.CL_DEVICE_MAX_PARAMETER_SIZE, true) + return int(val) +} + +// Maximum number of work-items in a work-group executing a kernel on a +// single compute unit, using the data parallel execution model. (Refer +// to clEnqueueNDRangeKernel). The minimum value is 1. +func (d *Device) MaxWorkGroupSize() int { + val, _ := d.getInfoSize(C.CL_DEVICE_MAX_WORK_GROUP_SIZE, true) + return int(val) +} + +// Describes the resolution of device timer. This is measured in nanoseconds. +func (d *Device) ProfilingTimerResolution() int { + val, _ := d.getInfoSize(C.CL_DEVICE_PROFILING_TIMER_RESOLUTION, true) + return int(val) +} + +// Size of local memory arena in bytes. The minimum value is 32 KB for +// devices that are not of type CL_DEVICE_TYPE_CUSTOM. +func (d *Device) LocalMemSize() int64 { + val, _ := d.getInfoUlong(C.CL_DEVICE_LOCAL_MEM_SIZE, true) + return val +} + +// Max size in bytes of a constant buffer allocation. The minimum value is +// 64 KB for devices that are not of type CL_DEVICE_TYPE_CUSTOM. +func (d *Device) MaxConstantBufferSize() int64 { + val, _ := d.getInfoUlong(C.CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, true) + return val +} + +// Max size of memory object allocation in bytes. The minimum value is max +// (1/4th of CL_DEVICE_GLOBAL_MEM_SIZE, 128*1024*1024) for devices that are +// not of type CL_DEVICE_TYPE_CUSTOM. +func (d *Device) MaxMemAllocSize() int64 { + val, _ := d.getInfoUlong(C.CL_DEVICE_MAX_MEM_ALLOC_SIZE, true) + return val +} + +// Size of global device memory in bytes. +func (d *Device) GlobalMemSize() int64 { + val, _ := d.getInfoUlong(C.CL_DEVICE_GLOBAL_MEM_SIZE, true) + return val +} + +func (d *Device) Available() bool { + val, _ := d.getInfoBool(C.CL_DEVICE_AVAILABLE, true) + return val +} + +func (d *Device) CompilerAvailable() bool { + val, _ := d.getInfoBool(C.CL_DEVICE_COMPILER_AVAILABLE, true) + return val +} + +func (d *Device) EndianLittle() bool { + val, _ := d.getInfoBool(C.CL_DEVICE_ENDIAN_LITTLE, true) + return val +} + +// Is CL_TRUE if the device implements error correction for all +// accesses to compute device memory (global and constant). Is +// CL_FALSE if the device does not implement such error correction. +func (d *Device) ErrorCorrectionSupport() bool { + val, _ := d.getInfoBool(C.CL_DEVICE_ERROR_CORRECTION_SUPPORT, true) + return val +} + +func (d *Device) HostUnifiedMemory() bool { + val, _ := d.getInfoBool(C.CL_DEVICE_HOST_UNIFIED_MEMORY, true) + return val +} + +func (d *Device) ImageSupport() bool { + val, _ := d.getInfoBool(C.CL_DEVICE_IMAGE_SUPPORT, true) + return val +} + +func (d *Device) Type() DeviceType { + var deviceType C.cl_device_type + if err := C.clGetDeviceInfo(d.id, C.CL_DEVICE_TYPE, C.size_t(unsafe.Sizeof(deviceType)), unsafe.Pointer(&deviceType), nil); err != C.CL_SUCCESS { + panic("Failed to get device type") + } + return DeviceType(deviceType) +} + +// Describes double precision floating-point capability of the OpenCL device +func (d *Device) DoubleFPConfig() FPConfig { + var fpConfig C.cl_device_fp_config + if err := C.clGetDeviceInfo(d.id, C.CL_DEVICE_DOUBLE_FP_CONFIG, C.size_t(unsafe.Sizeof(fpConfig)), unsafe.Pointer(&fpConfig), nil); err != C.CL_SUCCESS { + panic("Failed to get double FP config") + } + return FPConfig(fpConfig) +} + +// Describes the OPTIONAL half precision floating-point capability of the OpenCL device +func (d *Device) HalfFPConfig() FPConfig { + var fpConfig C.cl_device_fp_config + err := C.clGetDeviceInfo(d.id, C.CL_DEVICE_HALF_FP_CONFIG, C.size_t(unsafe.Sizeof(fpConfig)), unsafe.Pointer(&fpConfig), nil) + if err != C.CL_SUCCESS { + return FPConfig(0) + } + return FPConfig(fpConfig) +} + +// Type of local memory supported. This can be set to CL_LOCAL implying dedicated +// local memory storage such as SRAM, or CL_GLOBAL. For custom devices, CL_NONE +// can also be returned indicating no local memory support. +func (d *Device) LocalMemType() LocalMemType { + var memType C.cl_device_local_mem_type + if err := C.clGetDeviceInfo(d.id, C.CL_DEVICE_LOCAL_MEM_TYPE, C.size_t(unsafe.Sizeof(memType)), unsafe.Pointer(&memType), nil); err != C.CL_SUCCESS { + return LocalMemType(C.CL_NONE) + } + return LocalMemType(memType) +} + +// Describes the execution capabilities of the device. The mandated minimum capability is CL_EXEC_KERNEL. +func (d *Device) ExecutionCapabilities() ExecCapability { + var execCap C.cl_device_exec_capabilities + if err := C.clGetDeviceInfo(d.id, C.CL_DEVICE_EXECUTION_CAPABILITIES, C.size_t(unsafe.Sizeof(execCap)), unsafe.Pointer(&execCap), nil); err != C.CL_SUCCESS { + panic("Failed to get execution capabilities") + } + return ExecCapability(execCap) +} + +func (d *Device) GlobalMemCacheType() MemCacheType { + var memType C.cl_device_mem_cache_type + if err := C.clGetDeviceInfo(d.id, C.CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, C.size_t(unsafe.Sizeof(memType)), unsafe.Pointer(&memType), nil); err != C.CL_SUCCESS { + return MemCacheType(C.CL_NONE) + } + return MemCacheType(memType) +} + +// Maximum number of work-items that can be specified in each dimension of the work-group to clEnqueueNDRangeKernel. +// +// Returns n size_t entries, where n is the value returned by the query for CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS. +// +// The minimum value is (1, 1, 1) for devices that are not of type CL_DEVICE_TYPE_CUSTOM. +func (d *Device) MaxWorkItemSizes() []int { + dims := d.MaxWorkItemDimensions() + sizes := make([]C.size_t, dims) + if err := C.clGetDeviceInfo(d.id, C.CL_DEVICE_MAX_WORK_ITEM_SIZES, C.size_t(int(unsafe.Sizeof(sizes[0]))*dims), unsafe.Pointer(&sizes[0]), nil); err != C.CL_SUCCESS { + panic("Failed to get max work item sizes") + } + intSizes := make([]int, dims) + for i, s := range sizes { + intSizes[i] = int(s) + } + return intSizes +} diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/device12.go b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/device12.go new file mode 100644 index 000000000..b4b559a6a --- /dev/null +++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/device12.go @@ -0,0 +1,51 @@ +// +build cl12 + +package cl + +// #ifdef __APPLE__ +// #include "OpenCL/opencl.h" +// #else +// #include "cl.h" +// #endif +import "C" +import "unsafe" + +const FPConfigCorrectlyRoundedDivideSqrt FPConfig = C.CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT + +func init() { + fpConfigNameMap[FPConfigCorrectlyRoundedDivideSqrt] = "CorrectlyRoundedDivideSqrt" +} + +func (d *Device) BuiltInKernels() string { + str, _ := d.getInfoString(C.CL_DEVICE_BUILT_IN_KERNELS, true) + return str +} + +// Is CL_FALSE if the implementation does not have a linker available. Is CL_TRUE if the linker is available. This can be CL_FALSE for the embedded platform profile only. This must be CL_TRUE if CL_DEVICE_COMPILER_AVAILABLE is CL_TRUE +func (d *Device) LinkerAvailable() bool { + val, _ := d.getInfoBool(C.CL_DEVICE_LINKER_AVAILABLE, true) + return val +} + +func (d *Device) ParentDevice() *Device { + var deviceId C.cl_device_id + if err := C.clGetDeviceInfo(d.id, C.CL_DEVICE_PARENT_DEVICE, C.size_t(unsafe.Sizeof(deviceId)), unsafe.Pointer(&deviceId), nil); err != C.CL_SUCCESS { + panic("ParentDevice failed") + } + if deviceId == nil { + return nil + } + return &Device{id: deviceId} +} + +// Max number of pixels for a 1D image created from a buffer object. The minimum value is 65536 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE. +func (d *Device) ImageMaxBufferSize() int { + val, _ := d.getInfoSize(C.CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, true) + return int(val) +} + +// Max number of images in a 1D or 2D image array. The minimum value is 2048 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE +func (d *Device) ImageMaxArraySize() int { + val, _ := d.getInfoSize(C.CL_DEVICE_IMAGE_MAX_ARRAY_SIZE, true) + return int(val) +} diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl.h b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl.h new file mode 100644 index 000000000..c38aa1eda --- /dev/null +++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl.h @@ -0,0 +1,1210 @@ +/******************************************************************************* + * Copyright (c) 2008 - 2012 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + ******************************************************************************/ + +#ifndef __OPENCL_CL_H +#define __OPENCL_CL_H + +#include <cl_platform.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/******************************************************************************/ + +typedef struct _cl_platform_id * cl_platform_id; +typedef struct _cl_device_id * cl_device_id; +typedef struct _cl_context * cl_context; +typedef struct _cl_command_queue * cl_command_queue; +typedef struct _cl_mem * cl_mem; +typedef struct _cl_program * cl_program; +typedef struct _cl_kernel * cl_kernel; +typedef struct _cl_event * cl_event; +typedef struct _cl_sampler * cl_sampler; + +typedef cl_uint cl_bool; /* WARNING! Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */ +typedef cl_ulong cl_bitfield; +typedef cl_bitfield cl_device_type; +typedef cl_uint cl_platform_info; +typedef cl_uint cl_device_info; +typedef cl_bitfield cl_device_fp_config; +typedef cl_uint cl_device_mem_cache_type; +typedef cl_uint cl_device_local_mem_type; +typedef cl_bitfield cl_device_exec_capabilities; +typedef cl_bitfield cl_command_queue_properties; +typedef intptr_t cl_device_partition_property; +typedef cl_bitfield cl_device_affinity_domain; + +typedef intptr_t cl_context_properties; +typedef cl_uint cl_context_info; +typedef cl_uint cl_command_queue_info; +typedef cl_uint cl_channel_order; +typedef cl_uint cl_channel_type; +typedef cl_bitfield cl_mem_flags; +typedef cl_uint cl_mem_object_type; +typedef cl_uint cl_mem_info; +typedef cl_bitfield cl_mem_migration_flags; +typedef cl_uint cl_image_info; +typedef cl_uint cl_buffer_create_type; +typedef cl_uint cl_addressing_mode; +typedef cl_uint cl_filter_mode; +typedef cl_uint cl_sampler_info; +typedef cl_bitfield cl_map_flags; +typedef cl_uint cl_program_info; +typedef cl_uint cl_program_build_info; +typedef cl_uint cl_program_binary_type; +typedef cl_int cl_build_status; +typedef cl_uint cl_kernel_info; +typedef cl_uint cl_kernel_arg_info; +typedef cl_uint cl_kernel_arg_address_qualifier; +typedef cl_uint cl_kernel_arg_access_qualifier; +typedef cl_bitfield cl_kernel_arg_type_qualifier; +typedef cl_uint cl_kernel_work_group_info; +typedef cl_uint cl_event_info; +typedef cl_uint cl_command_type; +typedef cl_uint cl_profiling_info; + + +typedef struct _cl_image_format { + cl_channel_order image_channel_order; + cl_channel_type image_channel_data_type; +} cl_image_format; + +typedef struct _cl_image_desc { + cl_mem_object_type image_type; + size_t image_width; + size_t image_height; + size_t image_depth; + size_t image_array_size; + size_t image_row_pitch; + size_t image_slice_pitch; + cl_uint num_mip_levels; + cl_uint num_samples; + cl_mem buffer; +} cl_image_desc; + +typedef struct _cl_buffer_region { + size_t origin; + size_t size; +} cl_buffer_region; + + +/******************************************************************************/ + +/* Error Codes */ +#define CL_SUCCESS 0 +#define CL_DEVICE_NOT_FOUND -1 +#define CL_DEVICE_NOT_AVAILABLE -2 +#define CL_COMPILER_NOT_AVAILABLE -3 +#define CL_MEM_OBJECT_ALLOCATION_FAILURE -4 +#define CL_OUT_OF_RESOURCES -5 +#define CL_OUT_OF_HOST_MEMORY -6 +#define CL_PROFILING_INFO_NOT_AVAILABLE -7 +#define CL_MEM_COPY_OVERLAP -8 +#define CL_IMAGE_FORMAT_MISMATCH -9 +#define CL_IMAGE_FORMAT_NOT_SUPPORTED -10 +#define CL_BUILD_PROGRAM_FAILURE -11 +#define CL_MAP_FAILURE -12 +#define CL_MISALIGNED_SUB_BUFFER_OFFSET -13 +#define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14 +#define CL_COMPILE_PROGRAM_FAILURE -15 +#define CL_LINKER_NOT_AVAILABLE -16 +#define CL_LINK_PROGRAM_FAILURE -17 +#define CL_DEVICE_PARTITION_FAILED -18 +#define CL_KERNEL_ARG_INFO_NOT_AVAILABLE -19 + +#define CL_INVALID_VALUE -30 +#define CL_INVALID_DEVICE_TYPE -31 +#define CL_INVALID_PLATFORM -32 +#define CL_INVALID_DEVICE -33 +#define CL_INVALID_CONTEXT -34 +#define CL_INVALID_QUEUE_PROPERTIES -35 +#define CL_INVALID_COMMAND_QUEUE -36 +#define CL_INVALID_HOST_PTR -37 +#define CL_INVALID_MEM_OBJECT -38 +#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR -39 +#define CL_INVALID_IMAGE_SIZE -40 +#define CL_INVALID_SAMPLER -41 +#define CL_INVALID_BINARY -42 +#define CL_INVALID_BUILD_OPTIONS -43 +#define CL_INVALID_PROGRAM -44 +#define CL_INVALID_PROGRAM_EXECUTABLE -45 +#define CL_INVALID_KERNEL_NAME -46 +#define CL_INVALID_KERNEL_DEFINITION -47 +#define CL_INVALID_KERNEL -48 +#define CL_INVALID_ARG_INDEX -49 +#define CL_INVALID_ARG_VALUE -50 +#define CL_INVALID_ARG_SIZE -51 +#define CL_INVALID_KERNEL_ARGS -52 +#define CL_INVALID_WORK_DIMENSION -53 +#define CL_INVALID_WORK_GROUP_SIZE -54 +#define CL_INVALID_WORK_ITEM_SIZE -55 +#define CL_INVALID_GLOBAL_OFFSET -56 +#define CL_INVALID_EVENT_WAIT_LIST -57 +#define CL_INVALID_EVENT -58 +#define CL_INVALID_OPERATION -59 +#define CL_INVALID_GL_OBJECT -60 +#define CL_INVALID_BUFFER_SIZE -61 +#define CL_INVALID_MIP_LEVEL -62 +#define CL_INVALID_GLOBAL_WORK_SIZE -63 +#define CL_INVALID_PROPERTY -64 +#define CL_INVALID_IMAGE_DESCRIPTOR -65 +#define CL_INVALID_COMPILER_OPTIONS -66 +#define CL_INVALID_LINKER_OPTIONS -67 +#define CL_INVALID_DEVICE_PARTITION_COUNT -68 + +/* OpenCL Version */ +#define CL_VERSION_1_0 1 +#define CL_VERSION_1_1 1 +#define CL_VERSION_1_2 1 + +/* cl_bool */ +#define CL_FALSE 0 +#define CL_TRUE 1 +#define CL_BLOCKING CL_TRUE +#define CL_NON_BLOCKING CL_FALSE + +/* cl_platform_info */ +#define CL_PLATFORM_PROFILE 0x0900 +#define CL_PLATFORM_VERSION 0x0901 +#define CL_PLATFORM_NAME 0x0902 +#define CL_PLATFORM_VENDOR 0x0903 +#define CL_PLATFORM_EXTENSIONS 0x0904 + +/* cl_device_type - bitfield */ +#define CL_DEVICE_TYPE_DEFAULT (1 << 0) +#define CL_DEVICE_TYPE_CPU (1 << 1) +#define CL_DEVICE_TYPE_GPU (1 << 2) +#define CL_DEVICE_TYPE_ACCELERATOR (1 << 3) +#define CL_DEVICE_TYPE_CUSTOM (1 << 4) +#define CL_DEVICE_TYPE_ALL 0xFFFFFFFF + +/* cl_device_info */ +#define CL_DEVICE_TYPE 0x1000 +#define CL_DEVICE_VENDOR_ID 0x1001 +#define CL_DEVICE_MAX_COMPUTE_UNITS 0x1002 +#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 0x1003 +#define CL_DEVICE_MAX_WORK_GROUP_SIZE 0x1004 +#define CL_DEVICE_MAX_WORK_ITEM_SIZES 0x1005 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR 0x1006 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT 0x1007 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT 0x1008 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG 0x1009 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE 0x100B +#define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C +#define CL_DEVICE_ADDRESS_BITS 0x100D +#define CL_DEVICE_MAX_READ_IMAGE_ARGS 0x100E +#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS 0x100F +#define CL_DEVICE_MAX_MEM_ALLOC_SIZE 0x1010 +#define CL_DEVICE_IMAGE2D_MAX_WIDTH 0x1011 +#define CL_DEVICE_IMAGE2D_MAX_HEIGHT 0x1012 +#define CL_DEVICE_IMAGE3D_MAX_WIDTH 0x1013 +#define CL_DEVICE_IMAGE3D_MAX_HEIGHT 0x1014 +#define CL_DEVICE_IMAGE3D_MAX_DEPTH 0x1015 +#define CL_DEVICE_IMAGE_SUPPORT 0x1016 +#define CL_DEVICE_MAX_PARAMETER_SIZE 0x1017 +#define CL_DEVICE_MAX_SAMPLERS 0x1018 +#define CL_DEVICE_MEM_BASE_ADDR_ALIGN 0x1019 +#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE 0x101A +#define CL_DEVICE_SINGLE_FP_CONFIG 0x101B +#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE 0x101C +#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE 0x101D +#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE 0x101E +#define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F +#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE 0x1020 +#define CL_DEVICE_MAX_CONSTANT_ARGS 0x1021 +#define CL_DEVICE_LOCAL_MEM_TYPE 0x1022 +#define CL_DEVICE_LOCAL_MEM_SIZE 0x1023 +#define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024 +#define CL_DEVICE_PROFILING_TIMER_RESOLUTION 0x1025 +#define CL_DEVICE_ENDIAN_LITTLE 0x1026 +#define CL_DEVICE_AVAILABLE 0x1027 +#define CL_DEVICE_COMPILER_AVAILABLE 0x1028 +#define CL_DEVICE_EXECUTION_CAPABILITIES 0x1029 +#define CL_DEVICE_QUEUE_PROPERTIES 0x102A +#define CL_DEVICE_NAME 0x102B +#define CL_DEVICE_VENDOR 0x102C +#define CL_DRIVER_VERSION 0x102D +#define CL_DEVICE_PROFILE 0x102E +#define CL_DEVICE_VERSION 0x102F +#define CL_DEVICE_EXTENSIONS 0x1030 +#define CL_DEVICE_PLATFORM 0x1031 +#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032 +/* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG */ +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF 0x1034 +#define CL_DEVICE_HOST_UNIFIED_MEMORY 0x1035 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR 0x1036 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT 0x1037 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT 0x1038 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG 0x1039 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT 0x103A +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE 0x103B +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF 0x103C +#define CL_DEVICE_OPENCL_C_VERSION 0x103D +#define CL_DEVICE_LINKER_AVAILABLE 0x103E +#define CL_DEVICE_BUILT_IN_KERNELS 0x103F +#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE 0x1040 +#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE 0x1041 +#define CL_DEVICE_PARENT_DEVICE 0x1042 +#define CL_DEVICE_PARTITION_MAX_SUB_DEVICES 0x1043 +#define CL_DEVICE_PARTITION_PROPERTIES 0x1044 +#define CL_DEVICE_PARTITION_AFFINITY_DOMAIN 0x1045 +#define CL_DEVICE_PARTITION_TYPE 0x1046 +#define CL_DEVICE_REFERENCE_COUNT 0x1047 +#define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC 0x1048 +#define CL_DEVICE_PRINTF_BUFFER_SIZE 0x1049 +#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT 0x104A +#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT 0x104B + +/* cl_device_fp_config - bitfield */ +#define CL_FP_DENORM (1 << 0) +#define CL_FP_INF_NAN (1 << 1) +#define CL_FP_ROUND_TO_NEAREST (1 << 2) +#define CL_FP_ROUND_TO_ZERO (1 << 3) +#define CL_FP_ROUND_TO_INF (1 << 4) +#define CL_FP_FMA (1 << 5) +#define CL_FP_SOFT_FLOAT (1 << 6) +#define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT (1 << 7) + +/* cl_device_mem_cache_type */ +#define CL_NONE 0x0 +#define CL_READ_ONLY_CACHE 0x1 +#define CL_READ_WRITE_CACHE 0x2 + +/* cl_device_local_mem_type */ +#define CL_LOCAL 0x1 +#define CL_GLOBAL 0x2 + +/* cl_device_exec_capabilities - bitfield */ +#define CL_EXEC_KERNEL (1 << 0) +#define CL_EXEC_NATIVE_KERNEL (1 << 1) + +/* cl_command_queue_properties - bitfield */ +#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE (1 << 0) +#define CL_QUEUE_PROFILING_ENABLE (1 << 1) + +/* cl_context_info */ +#define CL_CONTEXT_REFERENCE_COUNT 0x1080 +#define CL_CONTEXT_DEVICES 0x1081 +#define CL_CONTEXT_PROPERTIES 0x1082 +#define CL_CONTEXT_NUM_DEVICES 0x1083 + +/* cl_context_properties */ +#define CL_CONTEXT_PLATFORM 0x1084 +#define CL_CONTEXT_INTEROP_USER_SYNC 0x1085 + +/* cl_device_partition_property */ +#define CL_DEVICE_PARTITION_EQUALLY 0x1086 +#define CL_DEVICE_PARTITION_BY_COUNTS 0x1087 +#define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END 0x0 +#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN 0x1088 + +/* cl_device_affinity_domain */ +#define CL_DEVICE_AFFINITY_DOMAIN_NUMA (1 << 0) +#define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE (1 << 1) +#define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE (1 << 2) +#define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE (1 << 3) +#define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE (1 << 4) +#define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE (1 << 5) + +/* cl_command_queue_info */ +#define CL_QUEUE_CONTEXT 0x1090 +#define CL_QUEUE_DEVICE 0x1091 +#define CL_QUEUE_REFERENCE_COUNT 0x1092 +#define CL_QUEUE_PROPERTIES 0x1093 + +/* cl_mem_flags - bitfield */ +#define CL_MEM_READ_WRITE (1 << 0) +#define CL_MEM_WRITE_ONLY (1 << 1) +#define CL_MEM_READ_ONLY (1 << 2) +#define CL_MEM_USE_HOST_PTR (1 << 3) +#define CL_MEM_ALLOC_HOST_PTR (1 << 4) +#define CL_MEM_COPY_HOST_PTR (1 << 5) +/* reserved (1 << 6) */ +#define CL_MEM_HOST_WRITE_ONLY (1 << 7) +#define CL_MEM_HOST_READ_ONLY (1 << 8) +#define CL_MEM_HOST_NO_ACCESS (1 << 9) + +/* cl_mem_migration_flags - bitfield */ +#define CL_MIGRATE_MEM_OBJECT_HOST (1 << 0) +#define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED (1 << 1) + +/* cl_channel_order */ +#define CL_R 0x10B0 +#define CL_A 0x10B1 +#define CL_RG 0x10B2 +#define CL_RA 0x10B3 +#define CL_RGB 0x10B4 +#define CL_RGBA 0x10B5 +#define CL_BGRA 0x10B6 +#define CL_ARGB 0x10B7 +#define CL_INTENSITY 0x10B8 +#define CL_LUMINANCE 0x10B9 +#define CL_Rx 0x10BA +#define CL_RGx 0x10BB +#define CL_RGBx 0x10BC +#define CL_DEPTH 0x10BD +#define CL_DEPTH_STENCIL 0x10BE + +/* cl_channel_type */ +#define CL_SNORM_INT8 0x10D0 +#define CL_SNORM_INT16 0x10D1 +#define CL_UNORM_INT8 0x10D2 +#define CL_UNORM_INT16 0x10D3 +#define CL_UNORM_SHORT_565 0x10D4 +#define CL_UNORM_SHORT_555 0x10D5 +#define CL_UNORM_INT_101010 0x10D6 +#define CL_SIGNED_INT8 0x10D7 +#define CL_SIGNED_INT16 0x10D8 +#define CL_SIGNED_INT32 0x10D9 +#define CL_UNSIGNED_INT8 0x10DA +#define CL_UNSIGNED_INT16 0x10DB +#define CL_UNSIGNED_INT32 0x10DC +#define CL_HALF_FLOAT 0x10DD +#define CL_FLOAT 0x10DE +#define CL_UNORM_INT24 0x10DF + +/* cl_mem_object_type */ +#define CL_MEM_OBJECT_BUFFER 0x10F0 +#define CL_MEM_OBJECT_IMAGE2D 0x10F1 +#define CL_MEM_OBJECT_IMAGE3D 0x10F2 +#define CL_MEM_OBJECT_IMAGE2D_ARRAY 0x10F3 +#define CL_MEM_OBJECT_IMAGE1D 0x10F4 +#define CL_MEM_OBJECT_IMAGE1D_ARRAY 0x10F5 +#define CL_MEM_OBJECT_IMAGE1D_BUFFER 0x10F6 + +/* cl_mem_info */ +#define CL_MEM_TYPE 0x1100 +#define CL_MEM_FLAGS 0x1101 +#define CL_MEM_SIZE 0x1102 +#define CL_MEM_HOST_PTR 0x1103 +#define CL_MEM_MAP_COUNT 0x1104 +#define CL_MEM_REFERENCE_COUNT 0x1105 +#define CL_MEM_CONTEXT 0x1106 +#define CL_MEM_ASSOCIATED_MEMOBJECT 0x1107 +#define CL_MEM_OFFSET 0x1108 + +/* cl_image_info */ +#define CL_IMAGE_FORMAT 0x1110 +#define CL_IMAGE_ELEMENT_SIZE 0x1111 +#define CL_IMAGE_ROW_PITCH 0x1112 +#define CL_IMAGE_SLICE_PITCH 0x1113 +#define CL_IMAGE_WIDTH 0x1114 +#define CL_IMAGE_HEIGHT 0x1115 +#define CL_IMAGE_DEPTH 0x1116 +#define CL_IMAGE_ARRAY_SIZE 0x1117 +#define CL_IMAGE_BUFFER 0x1118 +#define CL_IMAGE_NUM_MIP_LEVELS 0x1119 +#define CL_IMAGE_NUM_SAMPLES 0x111A + +/* cl_addressing_mode */ +#define CL_ADDRESS_NONE 0x1130 +#define CL_ADDRESS_CLAMP_TO_EDGE 0x1131 +#define CL_ADDRESS_CLAMP 0x1132 +#define CL_ADDRESS_REPEAT 0x1133 +#define CL_ADDRESS_MIRRORED_REPEAT 0x1134 + +/* cl_filter_mode */ +#define CL_FILTER_NEAREST 0x1140 +#define CL_FILTER_LINEAR 0x1141 + +/* cl_sampler_info */ +#define CL_SAMPLER_REFERENCE_COUNT 0x1150 +#define CL_SAMPLER_CONTEXT 0x1151 +#define CL_SAMPLER_NORMALIZED_COORDS 0x1152 +#define CL_SAMPLER_ADDRESSING_MODE 0x1153 +#define CL_SAMPLER_FILTER_MODE 0x1154 + +/* cl_map_flags - bitfield */ +#define CL_MAP_READ (1 << 0) +#define CL_MAP_WRITE (1 << 1) +#define CL_MAP_WRITE_INVALIDATE_REGION (1 << 2) + +/* cl_program_info */ +#define CL_PROGRAM_REFERENCE_COUNT 0x1160 +#define CL_PROGRAM_CONTEXT 0x1161 +#define CL_PROGRAM_NUM_DEVICES 0x1162 +#define CL_PROGRAM_DEVICES 0x1163 +#define CL_PROGRAM_SOURCE 0x1164 +#define CL_PROGRAM_BINARY_SIZES 0x1165 +#define CL_PROGRAM_BINARIES 0x1166 +#define CL_PROGRAM_NUM_KERNELS 0x1167 +#define CL_PROGRAM_KERNEL_NAMES 0x1168 + +/* cl_program_build_info */ +#define CL_PROGRAM_BUILD_STATUS 0x1181 +#define CL_PROGRAM_BUILD_OPTIONS 0x1182 +#define CL_PROGRAM_BUILD_LOG 0x1183 +#define CL_PROGRAM_BINARY_TYPE 0x1184 + +/* cl_program_binary_type */ +#define CL_PROGRAM_BINARY_TYPE_NONE 0x0 +#define CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT 0x1 +#define CL_PROGRAM_BINARY_TYPE_LIBRARY 0x2 +#define CL_PROGRAM_BINARY_TYPE_EXECUTABLE 0x4 + +/* cl_build_status */ +#define CL_BUILD_SUCCESS 0 +#define CL_BUILD_NONE -1 +#define CL_BUILD_ERROR -2 +#define CL_BUILD_IN_PROGRESS -3 + +/* cl_kernel_info */ +#define CL_KERNEL_FUNCTION_NAME 0x1190 +#define CL_KERNEL_NUM_ARGS 0x1191 +#define CL_KERNEL_REFERENCE_COUNT 0x1192 +#define CL_KERNEL_CONTEXT 0x1193 +#define CL_KERNEL_PROGRAM 0x1194 +#define CL_KERNEL_ATTRIBUTES 0x1195 + +/* cl_kernel_arg_info */ +#define CL_KERNEL_ARG_ADDRESS_QUALIFIER 0x1196 +#define CL_KERNEL_ARG_ACCESS_QUALIFIER 0x1197 +#define CL_KERNEL_ARG_TYPE_NAME 0x1198 +#define CL_KERNEL_ARG_TYPE_QUALIFIER 0x1199 +#define CL_KERNEL_ARG_NAME 0x119A + +/* cl_kernel_arg_address_qualifier */ +#define CL_KERNEL_ARG_ADDRESS_GLOBAL 0x119B +#define CL_KERNEL_ARG_ADDRESS_LOCAL 0x119C +#define CL_KERNEL_ARG_ADDRESS_CONSTANT 0x119D +#define CL_KERNEL_ARG_ADDRESS_PRIVATE 0x119E + +/* cl_kernel_arg_access_qualifier */ +#define CL_KERNEL_ARG_ACCESS_READ_ONLY 0x11A0 +#define CL_KERNEL_ARG_ACCESS_WRITE_ONLY 0x11A1 +#define CL_KERNEL_ARG_ACCESS_READ_WRITE 0x11A2 +#define CL_KERNEL_ARG_ACCESS_NONE 0x11A3 + +/* cl_kernel_arg_type_qualifer */ +#define CL_KERNEL_ARG_TYPE_NONE 0 +#define CL_KERNEL_ARG_TYPE_CONST (1 << 0) +#define CL_KERNEL_ARG_TYPE_RESTRICT (1 << 1) +#define CL_KERNEL_ARG_TYPE_VOLATILE (1 << 2) + +/* cl_kernel_work_group_info */ +#define CL_KERNEL_WORK_GROUP_SIZE 0x11B0 +#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE 0x11B1 +#define CL_KERNEL_LOCAL_MEM_SIZE 0x11B2 +#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3 +#define CL_KERNEL_PRIVATE_MEM_SIZE 0x11B4 +#define CL_KERNEL_GLOBAL_WORK_SIZE 0x11B5 + +/* cl_event_info */ +#define CL_EVENT_COMMAND_QUEUE 0x11D0 +#define CL_EVENT_COMMAND_TYPE 0x11D1 +#define CL_EVENT_REFERENCE_COUNT 0x11D2 +#define CL_EVENT_COMMAND_EXECUTION_STATUS 0x11D3 +#define CL_EVENT_CONTEXT 0x11D4 + +/* cl_command_type */ +#define CL_COMMAND_NDRANGE_KERNEL 0x11F0 +#define CL_COMMAND_TASK 0x11F1 +#define CL_COMMAND_NATIVE_KERNEL 0x11F2 +#define CL_COMMAND_READ_BUFFER 0x11F3 +#define CL_COMMAND_WRITE_BUFFER 0x11F4 +#define CL_COMMAND_COPY_BUFFER 0x11F5 +#define CL_COMMAND_READ_IMAGE 0x11F6 +#define CL_COMMAND_WRITE_IMAGE 0x11F7 +#define CL_COMMAND_COPY_IMAGE 0x11F8 +#define CL_COMMAND_COPY_IMAGE_TO_BUFFER 0x11F9 +#define CL_COMMAND_COPY_BUFFER_TO_IMAGE 0x11FA +#define CL_COMMAND_MAP_BUFFER 0x11FB +#define CL_COMMAND_MAP_IMAGE 0x11FC +#define CL_COMMAND_UNMAP_MEM_OBJECT 0x11FD +#define CL_COMMAND_MARKER 0x11FE +#define CL_COMMAND_ACQUIRE_GL_OBJECTS 0x11FF +#define CL_COMMAND_RELEASE_GL_OBJECTS 0x1200 +#define CL_COMMAND_READ_BUFFER_RECT 0x1201 +#define CL_COMMAND_WRITE_BUFFER_RECT 0x1202 +#define CL_COMMAND_COPY_BUFFER_RECT 0x1203 +#define CL_COMMAND_USER 0x1204 +#define CL_COMMAND_BARRIER 0x1205 +#define CL_COMMAND_MIGRATE_MEM_OBJECTS 0x1206 +#define CL_COMMAND_FILL_BUFFER 0x1207 +#define CL_COMMAND_FILL_IMAGE 0x1208 + +/* command execution status */ +#define CL_COMPLETE 0x0 +#define CL_RUNNING 0x1 +#define CL_SUBMITTED 0x2 +#define CL_QUEUED 0x3 + +/* cl_buffer_create_type */ +#define CL_BUFFER_CREATE_TYPE_REGION 0x1220 + +/* cl_profiling_info */ +#define CL_PROFILING_COMMAND_QUEUED 0x1280 +#define CL_PROFILING_COMMAND_SUBMIT 0x1281 +#define CL_PROFILING_COMMAND_START 0x1282 +#define CL_PROFILING_COMMAND_END 0x1283 + +/********************************************************************************************************/ + +/* Platform API */ +extern CL_API_ENTRY cl_int CL_API_CALL +clGetPlatformIDs(cl_uint /* num_entries */, + cl_platform_id * /* platforms */, + cl_uint * /* num_platforms */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetPlatformInfo(cl_platform_id /* platform */, + cl_platform_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Device APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceIDs(cl_platform_id /* platform */, + cl_device_type /* device_type */, + cl_uint /* num_entries */, + cl_device_id * /* devices */, + cl_uint * /* num_devices */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceInfo(cl_device_id /* device */, + cl_device_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCreateSubDevices(cl_device_id /* in_device */, + const cl_device_partition_property * /* properties */, + cl_uint /* num_devices */, + cl_device_id * /* out_devices */, + cl_uint * /* num_devices_ret */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainDevice(cl_device_id /* device */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseDevice(cl_device_id /* device */) CL_API_SUFFIX__VERSION_1_2; + +/* Context APIs */ +extern CL_API_ENTRY cl_context CL_API_CALL +clCreateContext(const cl_context_properties * /* properties */, + cl_uint /* num_devices */, + const cl_device_id * /* devices */, + void (CL_CALLBACK * /* pfn_notify */)(const char *, const void *, size_t, void *), + void * /* user_data */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_context CL_API_CALL +clCreateContextFromType(const cl_context_properties * /* properties */, + cl_device_type /* device_type */, + void (CL_CALLBACK * /* pfn_notify*/ )(const char *, const void *, size_t, void *), + void * /* user_data */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetContextInfo(cl_context /* context */, + cl_context_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Command Queue APIs */ +extern CL_API_ENTRY cl_command_queue CL_API_CALL +clCreateCommandQueue(cl_context /* context */, + cl_device_id /* device */, + cl_command_queue_properties /* properties */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetCommandQueueInfo(cl_command_queue /* command_queue */, + cl_command_queue_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Memory Object APIs */ +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateBuffer(cl_context /* context */, + cl_mem_flags /* flags */, + size_t /* size */, + void * /* host_ptr */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateSubBuffer(cl_mem /* buffer */, + cl_mem_flags /* flags */, + cl_buffer_create_type /* buffer_create_type */, + const void * /* buffer_create_info */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateImage(cl_context /* context */, + cl_mem_flags /* flags */, + const cl_image_format * /* image_format */, + const cl_image_desc * /* image_desc */, + void * /* host_ptr */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSupportedImageFormats(cl_context /* context */, + cl_mem_flags /* flags */, + cl_mem_object_type /* image_type */, + cl_uint /* num_entries */, + cl_image_format * /* image_formats */, + cl_uint * /* num_image_formats */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetMemObjectInfo(cl_mem /* memobj */, + cl_mem_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetImageInfo(cl_mem /* image */, + cl_image_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetMemObjectDestructorCallback( cl_mem /* memobj */, + void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), + void * /*user_data */ ) CL_API_SUFFIX__VERSION_1_1; + +/* Sampler APIs */ +extern CL_API_ENTRY cl_sampler CL_API_CALL +clCreateSampler(cl_context /* context */, + cl_bool /* normalized_coords */, + cl_addressing_mode /* addressing_mode */, + cl_filter_mode /* filter_mode */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSamplerInfo(cl_sampler /* sampler */, + cl_sampler_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Program Object APIs */ +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithSource(cl_context /* context */, + cl_uint /* count */, + const char ** /* strings */, + const size_t * /* lengths */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithBinary(cl_context /* context */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const size_t * /* lengths */, + const unsigned char ** /* binaries */, + cl_int * /* binary_status */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithBuiltInKernels(cl_context /* context */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* kernel_names */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clBuildProgram(cl_program /* program */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* options */, + void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), + void * /* user_data */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCompileProgram(cl_program /* program */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* options */, + cl_uint /* num_input_headers */, + const cl_program * /* input_headers */, + const char ** /* header_include_names */, + void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), + void * /* user_data */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_program CL_API_CALL +clLinkProgram(cl_context /* context */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* options */, + cl_uint /* num_input_programs */, + const cl_program * /* input_programs */, + void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), + void * /* user_data */, + cl_int * /* errcode_ret */ ) CL_API_SUFFIX__VERSION_1_2; + + +extern CL_API_ENTRY cl_int CL_API_CALL +clUnloadPlatformCompiler(cl_platform_id /* platform */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetProgramInfo(cl_program /* program */, + cl_program_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetProgramBuildInfo(cl_program /* program */, + cl_device_id /* device */, + cl_program_build_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Kernel Object APIs */ +extern CL_API_ENTRY cl_kernel CL_API_CALL +clCreateKernel(cl_program /* program */, + const char * /* kernel_name */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCreateKernelsInProgram(cl_program /* program */, + cl_uint /* num_kernels */, + cl_kernel * /* kernels */, + cl_uint * /* num_kernels_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelArg(cl_kernel /* kernel */, + cl_uint /* arg_index */, + size_t /* arg_size */, + const void * /* arg_value */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelInfo(cl_kernel /* kernel */, + cl_kernel_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelArgInfo(cl_kernel /* kernel */, + cl_uint /* arg_indx */, + cl_kernel_arg_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelWorkGroupInfo(cl_kernel /* kernel */, + cl_device_id /* device */, + cl_kernel_work_group_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Event Object APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clWaitForEvents(cl_uint /* num_events */, + const cl_event * /* event_list */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetEventInfo(cl_event /* event */, + cl_event_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_event CL_API_CALL +clCreateUserEvent(cl_context /* context */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetUserEventStatus(cl_event /* event */, + cl_int /* execution_status */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetEventCallback( cl_event /* event */, + cl_int /* command_exec_callback_type */, + void (CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *), + void * /* user_data */) CL_API_SUFFIX__VERSION_1_1; + +/* Profiling APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clGetEventProfilingInfo(cl_event /* event */, + cl_profiling_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Flush and Finish APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clFlush(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clFinish(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +/* Enqueued Commands APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadBuffer(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_read */, + size_t /* offset */, + size_t /* size */, + void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadBufferRect(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_read */, + const size_t * /* buffer_offset */, + const size_t * /* host_offset */, + const size_t * /* region */, + size_t /* buffer_row_pitch */, + size_t /* buffer_slice_pitch */, + size_t /* host_row_pitch */, + size_t /* host_slice_pitch */, + void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteBuffer(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_write */, + size_t /* offset */, + size_t /* size */, + const void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteBufferRect(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_write */, + const size_t * /* buffer_offset */, + const size_t * /* host_offset */, + const size_t * /* region */, + size_t /* buffer_row_pitch */, + size_t /* buffer_slice_pitch */, + size_t /* host_row_pitch */, + size_t /* host_slice_pitch */, + const void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueFillBuffer(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + const void * /* pattern */, + size_t /* pattern_size */, + size_t /* offset */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBuffer(cl_command_queue /* command_queue */, + cl_mem /* src_buffer */, + cl_mem /* dst_buffer */, + size_t /* src_offset */, + size_t /* dst_offset */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBufferRect(cl_command_queue /* command_queue */, + cl_mem /* src_buffer */, + cl_mem /* dst_buffer */, + const size_t * /* src_origin */, + const size_t * /* dst_origin */, + const size_t * /* region */, + size_t /* src_row_pitch */, + size_t /* src_slice_pitch */, + size_t /* dst_row_pitch */, + size_t /* dst_slice_pitch */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadImage(cl_command_queue /* command_queue */, + cl_mem /* image */, + cl_bool /* blocking_read */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + size_t /* row_pitch */, + size_t /* slice_pitch */, + void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteImage(cl_command_queue /* command_queue */, + cl_mem /* image */, + cl_bool /* blocking_write */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + size_t /* input_row_pitch */, + size_t /* input_slice_pitch */, + const void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueFillImage(cl_command_queue /* command_queue */, + cl_mem /* image */, + const void * /* fill_color */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyImage(cl_command_queue /* command_queue */, + cl_mem /* src_image */, + cl_mem /* dst_image */, + const size_t * /* src_origin[3] */, + const size_t * /* dst_origin[3] */, + const size_t * /* region[3] */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyImageToBuffer(cl_command_queue /* command_queue */, + cl_mem /* src_image */, + cl_mem /* dst_buffer */, + const size_t * /* src_origin[3] */, + const size_t * /* region[3] */, + size_t /* dst_offset */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBufferToImage(cl_command_queue /* command_queue */, + cl_mem /* src_buffer */, + cl_mem /* dst_image */, + size_t /* src_offset */, + const size_t * /* dst_origin[3] */, + const size_t * /* region[3] */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY void * CL_API_CALL +clEnqueueMapBuffer(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_map */, + cl_map_flags /* map_flags */, + size_t /* offset */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY void * CL_API_CALL +clEnqueueMapImage(cl_command_queue /* command_queue */, + cl_mem /* image */, + cl_bool /* blocking_map */, + cl_map_flags /* map_flags */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + size_t * /* image_row_pitch */, + size_t * /* image_slice_pitch */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueUnmapMemObject(cl_command_queue /* command_queue */, + cl_mem /* memobj */, + void * /* mapped_ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueMigrateMemObjects(cl_command_queue /* command_queue */, + cl_uint /* num_mem_objects */, + const cl_mem * /* mem_objects */, + cl_mem_migration_flags /* flags */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueNDRangeKernel(cl_command_queue /* command_queue */, + cl_kernel /* kernel */, + cl_uint /* work_dim */, + const size_t * /* global_work_offset */, + const size_t * /* global_work_size */, + const size_t * /* local_work_size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueTask(cl_command_queue /* command_queue */, + cl_kernel /* kernel */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueNativeKernel(cl_command_queue /* command_queue */, + void (CL_CALLBACK * /*user_func*/)(void *), + void * /* args */, + size_t /* cb_args */, + cl_uint /* num_mem_objects */, + const cl_mem * /* mem_list */, + const void ** /* args_mem_loc */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueMarkerWithWaitList(cl_command_queue /* command_queue */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueBarrierWithWaitList(cl_command_queue /* command_queue */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + + +/* Extension function access + * + * Returns the extension function address for the given function name, + * or NULL if a valid function can not be found. The client must + * check to make sure the address is not NULL, before using or + * calling the returned function address. + */ +extern CL_API_ENTRY void * CL_API_CALL +clGetExtensionFunctionAddressForPlatform(cl_platform_id /* platform */, + const char * /* func_name */) CL_API_SUFFIX__VERSION_1_2; + + +/* Deprecated OpenCL 1.1 APIs */ +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL +clCreateImage2D(cl_context /* context */, + cl_mem_flags /* flags */, + const cl_image_format * /* image_format */, + size_t /* image_width */, + size_t /* image_height */, + size_t /* image_row_pitch */, + void * /* host_ptr */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL +clCreateImage3D(cl_context /* context */, + cl_mem_flags /* flags */, + const cl_image_format * /* image_format */, + size_t /* image_width */, + size_t /* image_height */, + size_t /* image_depth */, + size_t /* image_row_pitch */, + size_t /* image_slice_pitch */, + void * /* host_ptr */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clEnqueueMarker(cl_command_queue /* command_queue */, + cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clEnqueueWaitForEvents(cl_command_queue /* command_queue */, + cl_uint /* num_events */, + const cl_event * /* event_list */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clEnqueueBarrier(cl_command_queue /* command_queue */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clUnloadCompiler(void) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED void * CL_API_CALL +clGetExtensionFunctionAddress(const char * /* func_name */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_H */ + diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl_ext.h b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl_ext.h new file mode 100644 index 000000000..a998c9c51 --- /dev/null +++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl_ext.h @@ -0,0 +1,315 @@ +/******************************************************************************* + * Copyright (c) 2008-2013 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + ******************************************************************************/ + +/* $Revision: 11928 $ on $Date: 2010-07-13 09:04:56 -0700 (Tue, 13 Jul 2010) $ */ + +/* cl_ext.h contains OpenCL extensions which don't have external */ +/* (OpenGL, D3D) dependencies. */ + +#ifndef __CL_EXT_H +#define __CL_EXT_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __APPLE__ +#include <AvailabilityMacros.h> +#endif + +#include <cl.h> + +/* cl_khr_fp16 extension - no extension #define since it has no functions */ +#define CL_DEVICE_HALF_FP_CONFIG 0x1033 + +/* Memory object destruction + * + * Apple extension for use to manage externally allocated buffers used with cl_mem objects with CL_MEM_USE_HOST_PTR + * + * Registers a user callback function that will be called when the memory object is deleted and its resources + * freed. Each call to clSetMemObjectCallbackFn registers the specified user callback function on a callback + * stack associated with memobj. The registered user callback functions are called in the reverse order in + * which they were registered. The user callback functions are called and then the memory object is deleted + * and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be + * notified when the memory referenced by host_ptr, specified when the memory object is created and used as + * the storage bits for the memory object, can be reused or freed. + * + * The application may not call CL api's with the cl_mem object passed to the pfn_notify. + * + * Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS) + * before using. + */ +#define cl_APPLE_SetMemObjectDestructor 1 +cl_int CL_API_ENTRY clSetMemObjectDestructorAPPLE( cl_mem /* memobj */, + void (* /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), + void * /*user_data */ ) CL_EXT_SUFFIX__VERSION_1_0; + + +/* Context Logging Functions + * + * The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext(). + * Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS) + * before using. + * + * clLogMessagesToSystemLog fowards on all log messages to the Apple System Logger + */ +#define cl_APPLE_ContextLoggingFunctions 1 +extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE( const char * /* errstr */, + const void * /* private_info */, + size_t /* cb */, + void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0; + +/* clLogMessagesToStdout sends all log messages to the file descriptor stdout */ +extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE( const char * /* errstr */, + const void * /* private_info */, + size_t /* cb */, + void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0; + +/* clLogMessagesToStderr sends all log messages to the file descriptor stderr */ +extern void CL_API_ENTRY clLogMessagesToStderrAPPLE( const char * /* errstr */, + const void * /* private_info */, + size_t /* cb */, + void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0; + + +/************************ +* cl_khr_icd extension * +************************/ +#define cl_khr_icd 1 + +/* cl_platform_info */ +#define CL_PLATFORM_ICD_SUFFIX_KHR 0x0920 + +/* Additional Error Codes */ +#define CL_PLATFORM_NOT_FOUND_KHR -1001 + +extern CL_API_ENTRY cl_int CL_API_CALL +clIcdGetPlatformIDsKHR(cl_uint /* num_entries */, + cl_platform_id * /* platforms */, + cl_uint * /* num_platforms */); + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clIcdGetPlatformIDsKHR_fn)( + cl_uint /* num_entries */, + cl_platform_id * /* platforms */, + cl_uint * /* num_platforms */); + + +/* Extension: cl_khr_image2D_buffer + * + * This extension allows a 2D image to be created from a cl_mem buffer without a copy. + * The type associated with a 2D image created from a buffer in an OpenCL program is image2d_t. + * Both the sampler and sampler-less read_image built-in functions are supported for 2D images + * and 2D images created from a buffer. Similarly, the write_image built-ins are also supported + * for 2D images created from a buffer. + * + * When the 2D image from buffer is created, the client must specify the width, + * height, image format (i.e. channel order and channel data type) and optionally the row pitch + * + * The pitch specified must be a multiple of CL_DEVICE_IMAGE_PITCH_ALIGNMENT pixels. + * The base address of the buffer must be aligned to CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT pixels. + */ + +/************************************* + * cl_khr_initalize_memory extension * + *************************************/ + +#define CL_CONTEXT_MEMORY_INITIALIZE_KHR 0x200E + + +/************************************** + * cl_khr_terminate_context extension * + **************************************/ + +#define CL_DEVICE_TERMINATE_CAPABILITY_KHR 0x200F +#define CL_CONTEXT_TERMINATE_KHR 0x2010 + +#define cl_khr_terminate_context 1 +extern CL_API_ENTRY cl_int CL_API_CALL clTerminateContextKHR(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clTerminateContextKHR_fn)(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2; + + +/* + * Extension: cl_khr_spir + * + * This extension adds support to create an OpenCL program object from a + * Standard Portable Intermediate Representation (SPIR) instance + */ + +#define CL_DEVICE_SPIR_VERSIONS 0x40E0 +#define CL_PROGRAM_BINARY_TYPE_INTERMEDIATE 0x40E1 + + +/****************************************** +* cl_nv_device_attribute_query extension * +******************************************/ +/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */ +#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000 +#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001 +#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002 +#define CL_DEVICE_WARP_SIZE_NV 0x4003 +#define CL_DEVICE_GPU_OVERLAP_NV 0x4004 +#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005 +#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006 + +/********************************* +* cl_amd_device_attribute_query * +*********************************/ +#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD 0x4036 + +/********************************* +* cl_arm_printf extension +*********************************/ +#define CL_PRINTF_CALLBACK_ARM 0x40B0 +#define CL_PRINTF_BUFFERSIZE_ARM 0x40B1 + +#ifdef CL_VERSION_1_1 + /*********************************** + * cl_ext_device_fission extension * + ***********************************/ + #define cl_ext_device_fission 1 + + extern CL_API_ENTRY cl_int CL_API_CALL + clReleaseDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; + + typedef CL_API_ENTRY cl_int + (CL_API_CALL *clReleaseDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; + + extern CL_API_ENTRY cl_int CL_API_CALL + clRetainDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; + + typedef CL_API_ENTRY cl_int + (CL_API_CALL *clRetainDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; + + typedef cl_ulong cl_device_partition_property_ext; + extern CL_API_ENTRY cl_int CL_API_CALL + clCreateSubDevicesEXT( cl_device_id /*in_device*/, + const cl_device_partition_property_ext * /* properties */, + cl_uint /*num_entries*/, + cl_device_id * /*out_devices*/, + cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1; + + typedef CL_API_ENTRY cl_int + ( CL_API_CALL * clCreateSubDevicesEXT_fn)( cl_device_id /*in_device*/, + const cl_device_partition_property_ext * /* properties */, + cl_uint /*num_entries*/, + cl_device_id * /*out_devices*/, + cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1; + + /* cl_device_partition_property_ext */ + #define CL_DEVICE_PARTITION_EQUALLY_EXT 0x4050 + #define CL_DEVICE_PARTITION_BY_COUNTS_EXT 0x4051 + #define CL_DEVICE_PARTITION_BY_NAMES_EXT 0x4052 + #define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT 0x4053 + + /* clDeviceGetInfo selectors */ + #define CL_DEVICE_PARENT_DEVICE_EXT 0x4054 + #define CL_DEVICE_PARTITION_TYPES_EXT 0x4055 + #define CL_DEVICE_AFFINITY_DOMAINS_EXT 0x4056 + #define CL_DEVICE_REFERENCE_COUNT_EXT 0x4057 + #define CL_DEVICE_PARTITION_STYLE_EXT 0x4058 + + /* error codes */ + #define CL_DEVICE_PARTITION_FAILED_EXT -1057 + #define CL_INVALID_PARTITION_COUNT_EXT -1058 + #define CL_INVALID_PARTITION_NAME_EXT -1059 + + /* CL_AFFINITY_DOMAINs */ + #define CL_AFFINITY_DOMAIN_L1_CACHE_EXT 0x1 + #define CL_AFFINITY_DOMAIN_L2_CACHE_EXT 0x2 + #define CL_AFFINITY_DOMAIN_L3_CACHE_EXT 0x3 + #define CL_AFFINITY_DOMAIN_L4_CACHE_EXT 0x4 + #define CL_AFFINITY_DOMAIN_NUMA_EXT 0x10 + #define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT 0x100 + + /* cl_device_partition_property_ext list terminators */ + #define CL_PROPERTIES_LIST_END_EXT ((cl_device_partition_property_ext) 0) + #define CL_PARTITION_BY_COUNTS_LIST_END_EXT ((cl_device_partition_property_ext) 0) + #define CL_PARTITION_BY_NAMES_LIST_END_EXT ((cl_device_partition_property_ext) 0 - 1) + +/********************************* +* cl_qcom_ext_host_ptr extension +*********************************/ + +#define CL_MEM_EXT_HOST_PTR_QCOM (1 << 29) + +#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM 0x40A0 +#define CL_DEVICE_PAGE_SIZE_QCOM 0x40A1 +#define CL_IMAGE_ROW_ALIGNMENT_QCOM 0x40A2 +#define CL_IMAGE_SLICE_ALIGNMENT_QCOM 0x40A3 +#define CL_MEM_HOST_UNCACHED_QCOM 0x40A4 +#define CL_MEM_HOST_WRITEBACK_QCOM 0x40A5 +#define CL_MEM_HOST_WRITETHROUGH_QCOM 0x40A6 +#define CL_MEM_HOST_WRITE_COMBINING_QCOM 0x40A7 + +typedef cl_uint cl_image_pitch_info_qcom; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceImageInfoQCOM(cl_device_id device, + size_t image_width, + size_t image_height, + const cl_image_format *image_format, + cl_image_pitch_info_qcom param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret); + +typedef struct _cl_mem_ext_host_ptr +{ + /* Type of external memory allocation. */ + /* Legal values will be defined in layered extensions. */ + cl_uint allocation_type; + + /* Host cache policy for this external memory allocation. */ + cl_uint host_cache_policy; + +} cl_mem_ext_host_ptr; + +/********************************* +* cl_qcom_ion_host_ptr extension +*********************************/ + +#define CL_MEM_ION_HOST_PTR_QCOM 0x40A8 + +typedef struct _cl_mem_ion_host_ptr +{ + /* Type of external memory allocation. */ + /* Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations. */ + cl_mem_ext_host_ptr ext_host_ptr; + + /* ION file descriptor */ + int ion_filedesc; + + /* Host pointer to the ION allocated memory */ + void* ion_hostptr; + +} cl_mem_ion_host_ptr; + +#endif /* CL_VERSION_1_1 */ + +#ifdef __cplusplus +} +#endif + + +#endif /* __CL_EXT_H */ diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl_gl.h b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl_gl.h new file mode 100644 index 000000000..ad914cbd4 --- /dev/null +++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl_gl.h @@ -0,0 +1,158 @@ +/********************************************************************************** + * Copyright (c) 2008 - 2012 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + **********************************************************************************/ + +#ifndef __OPENCL_CL_GL_H +#define __OPENCL_CL_GL_H + +#include <cl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef cl_uint cl_gl_object_type; +typedef cl_uint cl_gl_texture_info; +typedef cl_uint cl_gl_platform_info; +typedef struct __GLsync *cl_GLsync; + +/* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken */ +#define CL_GL_OBJECT_BUFFER 0x2000 +#define CL_GL_OBJECT_TEXTURE2D 0x2001 +#define CL_GL_OBJECT_TEXTURE3D 0x2002 +#define CL_GL_OBJECT_RENDERBUFFER 0x2003 +#define CL_GL_OBJECT_TEXTURE2D_ARRAY 0x200E +#define CL_GL_OBJECT_TEXTURE1D 0x200F +#define CL_GL_OBJECT_TEXTURE1D_ARRAY 0x2010 +#define CL_GL_OBJECT_TEXTURE_BUFFER 0x2011 + +/* cl_gl_texture_info */ +#define CL_GL_TEXTURE_TARGET 0x2004 +#define CL_GL_MIPMAP_LEVEL 0x2005 +#define CL_GL_NUM_SAMPLES 0x2012 + + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromGLBuffer(cl_context /* context */, + cl_mem_flags /* flags */, + cl_GLuint /* bufobj */, + int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromGLTexture(cl_context /* context */, + cl_mem_flags /* flags */, + cl_GLenum /* target */, + cl_GLint /* miplevel */, + cl_GLuint /* texture */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromGLRenderbuffer(cl_context /* context */, + cl_mem_flags /* flags */, + cl_GLuint /* renderbuffer */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetGLObjectInfo(cl_mem /* memobj */, + cl_gl_object_type * /* gl_object_type */, + cl_GLuint * /* gl_object_name */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetGLTextureInfo(cl_mem /* memobj */, + cl_gl_texture_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueAcquireGLObjects(cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem * /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReleaseGLObjects(cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem * /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + + +/* Deprecated OpenCL 1.1 APIs */ +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL +clCreateFromGLTexture2D(cl_context /* context */, + cl_mem_flags /* flags */, + cl_GLenum /* target */, + cl_GLint /* miplevel */, + cl_GLuint /* texture */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL +clCreateFromGLTexture3D(cl_context /* context */, + cl_mem_flags /* flags */, + cl_GLenum /* target */, + cl_GLint /* miplevel */, + cl_GLuint /* texture */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +/* cl_khr_gl_sharing extension */ + +#define cl_khr_gl_sharing 1 + +typedef cl_uint cl_gl_context_info; + +/* Additional Error Codes */ +#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000 + +/* cl_gl_context_info */ +#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006 +#define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007 + +/* Additional cl_context_properties */ +#define CL_GL_CONTEXT_KHR 0x2008 +#define CL_EGL_DISPLAY_KHR 0x2009 +#define CL_GLX_DISPLAY_KHR 0x200A +#define CL_WGL_HDC_KHR 0x200B +#define CL_CGL_SHAREGROUP_KHR 0x200C + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetGLContextInfoKHR(const cl_context_properties * /* properties */, + cl_gl_context_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)( + const cl_context_properties * properties, + cl_gl_context_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret); + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_GL_H */ diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl_gl_ext.h b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl_gl_ext.h new file mode 100644 index 000000000..0c10fedfa --- /dev/null +++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl_gl_ext.h @@ -0,0 +1,65 @@ +/********************************************************************************** + * Copyright (c) 2008-2012 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + **********************************************************************************/ + +/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ + +/* cl_gl_ext.h contains vendor (non-KHR) OpenCL extensions which have */ +/* OpenGL dependencies. */ + +#ifndef __OPENCL_CL_GL_EXT_H +#define __OPENCL_CL_GL_EXT_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <cl_gl.h> + +/* + * For each extension, follow this template + * cl_VEN_extname extension */ +/* #define cl_VEN_extname 1 + * ... define new types, if any + * ... define new tokens, if any + * ... define new APIs, if any + * + * If you need GLtypes here, mirror them with a cl_GLtype, rather than including a GL header + * This allows us to avoid having to decide whether to include GL headers or GLES here. + */ + +/* + * cl_khr_gl_event extension + * See section 9.9 in the OpenCL 1.1 spec for more information + */ +#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D + +extern CL_API_ENTRY cl_event CL_API_CALL +clCreateEventFromGLsyncKHR(cl_context /* context */, + cl_GLsync /* cl_GLsync */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_GL_EXT_H */ diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl_platform.h b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl_platform.h new file mode 100644 index 000000000..7f6f5e8a7 --- /dev/null +++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl_platform.h @@ -0,0 +1,1278 @@ +/********************************************************************************** + * Copyright (c) 2008-2012 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + **********************************************************************************/ + +/* $Revision: 11803 $ on $Date: 2010-06-25 10:02:12 -0700 (Fri, 25 Jun 2010) $ */ + +#ifndef __CL_PLATFORM_H +#define __CL_PLATFORM_H + +#ifdef __APPLE__ + /* Contains #defines for AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER below */ + #include <AvailabilityMacros.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(_WIN32) + #define CL_API_ENTRY + #define CL_API_CALL __stdcall + #define CL_CALLBACK __stdcall +#else + #define CL_API_ENTRY + #define CL_API_CALL + #define CL_CALLBACK +#endif + +#ifdef __APPLE__ + #define CL_EXTENSION_WEAK_LINK __attribute__((weak_import)) + #define CL_API_SUFFIX__VERSION_1_0 AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_0 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER + #define CL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define GCL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 + + #ifdef AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define GCL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 + #else + #warning This path should never happen outside of internal operating system development. AvailabilityMacros do not function correctly here! + #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define GCL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #endif +#else + #define CL_EXTENSION_WEAK_LINK + #define CL_API_SUFFIX__VERSION_1_0 + #define CL_EXT_SUFFIX__VERSION_1_0 + #define CL_API_SUFFIX__VERSION_1_1 + #define CL_EXT_SUFFIX__VERSION_1_1 + #define CL_API_SUFFIX__VERSION_1_2 + #define CL_EXT_SUFFIX__VERSION_1_2 + + #ifdef __GNUC__ + #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED __attribute__((deprecated)) + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED __attribute__((deprecated)) + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #endif + #elif _WIN32 + #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED __declspec(deprecated) + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED __declspec(deprecated) + #endif + #else + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #endif +#endif + +#if (defined (_WIN32) && defined(_MSC_VER)) + +/* scalar types */ +typedef signed __int8 cl_char; +typedef unsigned __int8 cl_uchar; +typedef signed __int16 cl_short; +typedef unsigned __int16 cl_ushort; +typedef signed __int32 cl_int; +typedef unsigned __int32 cl_uint; +typedef signed __int64 cl_long; +typedef unsigned __int64 cl_ulong; + +typedef unsigned __int16 cl_half; +typedef float cl_float; +typedef double cl_double; + +/* Macro names and corresponding values defined by OpenCL */ +#define CL_CHAR_BIT 8 +#define CL_SCHAR_MAX 127 +#define CL_SCHAR_MIN (-127-1) +#define CL_CHAR_MAX CL_SCHAR_MAX +#define CL_CHAR_MIN CL_SCHAR_MIN +#define CL_UCHAR_MAX 255 +#define CL_SHRT_MAX 32767 +#define CL_SHRT_MIN (-32767-1) +#define CL_USHRT_MAX 65535 +#define CL_INT_MAX 2147483647 +#define CL_INT_MIN (-2147483647-1) +#define CL_UINT_MAX 0xffffffffU +#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) +#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) +#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL) + +#define CL_FLT_DIG 6 +#define CL_FLT_MANT_DIG 24 +#define CL_FLT_MAX_10_EXP +38 +#define CL_FLT_MAX_EXP +128 +#define CL_FLT_MIN_10_EXP -37 +#define CL_FLT_MIN_EXP -125 +#define CL_FLT_RADIX 2 +#define CL_FLT_MAX 340282346638528859811704183484516925440.0f +#define CL_FLT_MIN 1.175494350822287507969e-38f +#define CL_FLT_EPSILON 0x1.0p-23f + +#define CL_DBL_DIG 15 +#define CL_DBL_MANT_DIG 53 +#define CL_DBL_MAX_10_EXP +308 +#define CL_DBL_MAX_EXP +1024 +#define CL_DBL_MIN_10_EXP -307 +#define CL_DBL_MIN_EXP -1021 +#define CL_DBL_RADIX 2 +#define CL_DBL_MAX 179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0 +#define CL_DBL_MIN 2.225073858507201383090e-308 +#define CL_DBL_EPSILON 2.220446049250313080847e-16 + +#define CL_M_E 2.718281828459045090796 +#define CL_M_LOG2E 1.442695040888963387005 +#define CL_M_LOG10E 0.434294481903251816668 +#define CL_M_LN2 0.693147180559945286227 +#define CL_M_LN10 2.302585092994045901094 +#define CL_M_PI 3.141592653589793115998 +#define CL_M_PI_2 1.570796326794896557999 +#define CL_M_PI_4 0.785398163397448278999 +#define CL_M_1_PI 0.318309886183790691216 +#define CL_M_2_PI 0.636619772367581382433 +#define CL_M_2_SQRTPI 1.128379167095512558561 +#define CL_M_SQRT2 1.414213562373095145475 +#define CL_M_SQRT1_2 0.707106781186547572737 + +#define CL_M_E_F 2.71828174591064f +#define CL_M_LOG2E_F 1.44269502162933f +#define CL_M_LOG10E_F 0.43429449200630f +#define CL_M_LN2_F 0.69314718246460f +#define CL_M_LN10_F 2.30258512496948f +#define CL_M_PI_F 3.14159274101257f +#define CL_M_PI_2_F 1.57079637050629f +#define CL_M_PI_4_F 0.78539818525314f +#define CL_M_1_PI_F 0.31830987334251f +#define CL_M_2_PI_F 0.63661974668503f +#define CL_M_2_SQRTPI_F 1.12837922573090f +#define CL_M_SQRT2_F 1.41421353816986f +#define CL_M_SQRT1_2_F 0.70710676908493f + +#define CL_NAN (CL_INFINITY - CL_INFINITY) +#define CL_HUGE_VALF ((cl_float) 1e50) +#define CL_HUGE_VAL ((cl_double) 1e500) +#define CL_MAXFLOAT CL_FLT_MAX +#define CL_INFINITY CL_HUGE_VALF + +#else + +#include <stdint.h> + +/* scalar types */ +typedef int8_t cl_char; +typedef uint8_t cl_uchar; +typedef int16_t cl_short __attribute__((aligned(2))); +typedef uint16_t cl_ushort __attribute__((aligned(2))); +typedef int32_t cl_int __attribute__((aligned(4))); +typedef uint32_t cl_uint __attribute__((aligned(4))); +typedef int64_t cl_long __attribute__((aligned(8))); +typedef uint64_t cl_ulong __attribute__((aligned(8))); + +typedef uint16_t cl_half __attribute__((aligned(2))); +typedef float cl_float __attribute__((aligned(4))); +typedef double cl_double __attribute__((aligned(8))); + +/* Macro names and corresponding values defined by OpenCL */ +#define CL_CHAR_BIT 8 +#define CL_SCHAR_MAX 127 +#define CL_SCHAR_MIN (-127-1) +#define CL_CHAR_MAX CL_SCHAR_MAX +#define CL_CHAR_MIN CL_SCHAR_MIN +#define CL_UCHAR_MAX 255 +#define CL_SHRT_MAX 32767 +#define CL_SHRT_MIN (-32767-1) +#define CL_USHRT_MAX 65535 +#define CL_INT_MAX 2147483647 +#define CL_INT_MIN (-2147483647-1) +#define CL_UINT_MAX 0xffffffffU +#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) +#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) +#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL) + +#define CL_FLT_DIG 6 +#define CL_FLT_MANT_DIG 24 +#define CL_FLT_MAX_10_EXP +38 +#define CL_FLT_MAX_EXP +128 +#define CL_FLT_MIN_10_EXP -37 +#define CL_FLT_MIN_EXP -125 +#define CL_FLT_RADIX 2 +#define CL_FLT_MAX 0x1.fffffep127f +#define CL_FLT_MIN 0x1.0p-126f +#define CL_FLT_EPSILON 0x1.0p-23f + +#define CL_DBL_DIG 15 +#define CL_DBL_MANT_DIG 53 +#define CL_DBL_MAX_10_EXP +308 +#define CL_DBL_MAX_EXP +1024 +#define CL_DBL_MIN_10_EXP -307 +#define CL_DBL_MIN_EXP -1021 +#define CL_DBL_RADIX 2 +#define CL_DBL_MAX 0x1.fffffffffffffp1023 +#define CL_DBL_MIN 0x1.0p-1022 +#define CL_DBL_EPSILON 0x1.0p-52 + +#define CL_M_E 2.718281828459045090796 +#define CL_M_LOG2E 1.442695040888963387005 +#define CL_M_LOG10E 0.434294481903251816668 +#define CL_M_LN2 0.693147180559945286227 +#define CL_M_LN10 2.302585092994045901094 +#define CL_M_PI 3.141592653589793115998 +#define CL_M_PI_2 1.570796326794896557999 +#define CL_M_PI_4 0.785398163397448278999 +#define CL_M_1_PI 0.318309886183790691216 +#define CL_M_2_PI 0.636619772367581382433 +#define CL_M_2_SQRTPI 1.128379167095512558561 +#define CL_M_SQRT2 1.414213562373095145475 +#define CL_M_SQRT1_2 0.707106781186547572737 + +#define CL_M_E_F 2.71828174591064f +#define CL_M_LOG2E_F 1.44269502162933f +#define CL_M_LOG10E_F 0.43429449200630f +#define CL_M_LN2_F 0.69314718246460f +#define CL_M_LN10_F 2.30258512496948f +#define CL_M_PI_F 3.14159274101257f +#define CL_M_PI_2_F 1.57079637050629f +#define CL_M_PI_4_F 0.78539818525314f +#define CL_M_1_PI_F 0.31830987334251f +#define CL_M_2_PI_F 0.63661974668503f +#define CL_M_2_SQRTPI_F 1.12837922573090f +#define CL_M_SQRT2_F 1.41421353816986f +#define CL_M_SQRT1_2_F 0.70710676908493f + +#if defined( __GNUC__ ) + #define CL_HUGE_VALF __builtin_huge_valf() + #define CL_HUGE_VAL __builtin_huge_val() + #define CL_NAN __builtin_nanf( "" ) +#else + #define CL_HUGE_VALF ((cl_float) 1e50) + #define CL_HUGE_VAL ((cl_double) 1e500) + float nanf( const char * ); + #define CL_NAN nanf( "" ) +#endif +#define CL_MAXFLOAT CL_FLT_MAX +#define CL_INFINITY CL_HUGE_VALF + +#endif + +#include <stddef.h> + +/* Mirror types to GL types. Mirror types allow us to avoid deciding which 87s to load based on whether we are using GL or GLES here. */ +typedef unsigned int cl_GLuint; +typedef int cl_GLint; +typedef unsigned int cl_GLenum; + +/* + * Vector types + * + * Note: OpenCL requires that all types be naturally aligned. + * This means that vector types must be naturally aligned. + * For example, a vector of four floats must be aligned to + * a 16 byte boundary (calculated as 4 * the natural 4-byte + * alignment of the float). The alignment qualifiers here + * will only function properly if your compiler supports them + * and if you don't actively work to defeat them. For example, + * in order for a cl_float4 to be 16 byte aligned in a struct, + * the start of the struct must itself be 16-byte aligned. + * + * Maintaining proper alignment is the user's responsibility. + */ + +/* Define basic vector types */ +#if defined( __VEC__ ) + #include <altivec.h> /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. */ + typedef vector unsigned char __cl_uchar16; + typedef vector signed char __cl_char16; + typedef vector unsigned short __cl_ushort8; + typedef vector signed short __cl_short8; + typedef vector unsigned int __cl_uint4; + typedef vector signed int __cl_int4; + typedef vector float __cl_float4; + #define __CL_UCHAR16__ 1 + #define __CL_CHAR16__ 1 + #define __CL_USHORT8__ 1 + #define __CL_SHORT8__ 1 + #define __CL_UINT4__ 1 + #define __CL_INT4__ 1 + #define __CL_FLOAT4__ 1 +#endif + +#if defined( __SSE__ ) + #if defined( __MINGW64__ ) + #include <intrin.h> + #else + #include <xmmintrin.h> + #endif + #if defined( __GNUC__ ) + typedef float __cl_float4 __attribute__((vector_size(16))); + #else + typedef __m128 __cl_float4; + #endif + #define __CL_FLOAT4__ 1 +#endif + +#if defined( __SSE2__ ) + #if defined( __MINGW64__ ) + #include <intrin.h> + #else + #include <emmintrin.h> + #endif + #if defined( __GNUC__ ) + typedef cl_uchar __cl_uchar16 __attribute__((vector_size(16))); + typedef cl_char __cl_char16 __attribute__((vector_size(16))); + typedef cl_ushort __cl_ushort8 __attribute__((vector_size(16))); + typedef cl_short __cl_short8 __attribute__((vector_size(16))); + typedef cl_uint __cl_uint4 __attribute__((vector_size(16))); + typedef cl_int __cl_int4 __attribute__((vector_size(16))); + typedef cl_ulong __cl_ulong2 __attribute__((vector_size(16))); + typedef cl_long __cl_long2 __attribute__((vector_size(16))); + typedef cl_double __cl_double2 __attribute__((vector_size(16))); + #else + typedef __m128i __cl_uchar16; + typedef __m128i __cl_char16; + typedef __m128i __cl_ushort8; + typedef __m128i __cl_short8; + typedef __m128i __cl_uint4; + typedef __m128i __cl_int4; + typedef __m128i __cl_ulong2; + typedef __m128i __cl_long2; + typedef __m128d __cl_double2; + #endif + #define __CL_UCHAR16__ 1 + #define __CL_CHAR16__ 1 + #define __CL_USHORT8__ 1 + #define __CL_SHORT8__ 1 + #define __CL_INT4__ 1 + #define __CL_UINT4__ 1 + #define __CL_ULONG2__ 1 + #define __CL_LONG2__ 1 + #define __CL_DOUBLE2__ 1 +#endif + +#if defined( __MMX__ ) + #include <mmintrin.h> + #if defined( __GNUC__ ) + typedef cl_uchar __cl_uchar8 __attribute__((vector_size(8))); + typedef cl_char __cl_char8 __attribute__((vector_size(8))); + typedef cl_ushort __cl_ushort4 __attribute__((vector_size(8))); + typedef cl_short __cl_short4 __attribute__((vector_size(8))); + typedef cl_uint __cl_uint2 __attribute__((vector_size(8))); + typedef cl_int __cl_int2 __attribute__((vector_size(8))); + typedef cl_ulong __cl_ulong1 __attribute__((vector_size(8))); + typedef cl_long __cl_long1 __attribute__((vector_size(8))); + typedef cl_float __cl_float2 __attribute__((vector_size(8))); + #else + typedef __m64 __cl_uchar8; + typedef __m64 __cl_char8; + typedef __m64 __cl_ushort4; + typedef __m64 __cl_short4; + typedef __m64 __cl_uint2; + typedef __m64 __cl_int2; + typedef __m64 __cl_ulong1; + typedef __m64 __cl_long1; + typedef __m64 __cl_float2; + #endif + #define __CL_UCHAR8__ 1 + #define __CL_CHAR8__ 1 + #define __CL_USHORT4__ 1 + #define __CL_SHORT4__ 1 + #define __CL_INT2__ 1 + #define __CL_UINT2__ 1 + #define __CL_ULONG1__ 1 + #define __CL_LONG1__ 1 + #define __CL_FLOAT2__ 1 +#endif + +#if defined( __AVX__ ) + #if defined( __MINGW64__ ) + #include <intrin.h> + #else + #include <immintrin.h> + #endif + #if defined( __GNUC__ ) + typedef cl_float __cl_float8 __attribute__((vector_size(32))); + typedef cl_double __cl_double4 __attribute__((vector_size(32))); + #else + typedef __m256 __cl_float8; + typedef __m256d __cl_double4; + #endif + #define __CL_FLOAT8__ 1 + #define __CL_DOUBLE4__ 1 +#endif + +/* Define capabilities for anonymous struct members. */ +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) +#define __CL_HAS_ANON_STRUCT__ 1 +#define __CL_ANON_STRUCT__ __extension__ +#elif defined( _WIN32) && (_MSC_VER >= 1500) + /* Microsoft Developer Studio 2008 supports anonymous structs, but + * complains by default. */ +#define __CL_HAS_ANON_STRUCT__ 1 +#define __CL_ANON_STRUCT__ + /* Disable warning C4201: nonstandard extension used : nameless + * struct/union */ +#pragma warning( push ) +#pragma warning( disable : 4201 ) +#else +#define __CL_HAS_ANON_STRUCT__ 0 +#define __CL_ANON_STRUCT__ +#endif + +/* Define alignment keys */ +#if defined( __GNUC__ ) + #define CL_ALIGNED(_x) __attribute__ ((aligned(_x))) +#elif defined( _WIN32) && (_MSC_VER) + /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements */ + /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx */ + /* #include <crtdefs.h> */ + /* #define CL_ALIGNED(_x) _CRT_ALIGN(_x) */ + #define CL_ALIGNED(_x) +#else + #warning Need to implement some method to align data here + #define CL_ALIGNED(_x) +#endif + +/* Indicate whether .xyzw, .s0123 and .hi.lo are supported */ +#if __CL_HAS_ANON_STRUCT__ + /* .xyzw and .s0123...{f|F} are supported */ + #define CL_HAS_NAMED_VECTOR_FIELDS 1 + /* .hi and .lo are supported */ + #define CL_HAS_HI_LO_VECTOR_FIELDS 1 +#endif + +/* Define cl_vector types */ + +/* ---- cl_charn ---- */ +typedef union +{ + cl_char CL_ALIGNED(2) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_char x, y; }; + __CL_ANON_STRUCT__ struct{ cl_char s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_char lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2; +#endif +}cl_char2; + +typedef union +{ + cl_char CL_ALIGNED(4) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_char2 lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2[2]; +#endif +#if defined( __CL_CHAR4__) + __cl_char4 v4; +#endif +}cl_char4; + +/* cl_char3 is identical in size, alignment and behavior to cl_char4. See section 6.1.5. */ +typedef cl_char4 cl_char3; + +typedef union +{ + cl_char CL_ALIGNED(8) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_char4 lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2[4]; +#endif +#if defined( __CL_CHAR4__) + __cl_char4 v4[2]; +#endif +#if defined( __CL_CHAR8__ ) + __cl_char8 v8; +#endif +}cl_char8; + +typedef union +{ + cl_char CL_ALIGNED(16) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_char8 lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2[8]; +#endif +#if defined( __CL_CHAR4__) + __cl_char4 v4[4]; +#endif +#if defined( __CL_CHAR8__ ) + __cl_char8 v8[2]; +#endif +#if defined( __CL_CHAR16__ ) + __cl_char16 v16; +#endif +}cl_char16; + + +/* ---- cl_ucharn ---- */ +typedef union +{ + cl_uchar CL_ALIGNED(2) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uchar x, y; }; + __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_uchar lo, hi; }; +#endif +#if defined( __cl_uchar2__) + __cl_uchar2 v2; +#endif +}cl_uchar2; + +typedef union +{ + cl_uchar CL_ALIGNED(4) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_uchar2 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[2]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4; +#endif +}cl_uchar4; + +/* cl_uchar3 is identical in size, alignment and behavior to cl_uchar4. See section 6.1.5. */ +typedef cl_uchar4 cl_uchar3; + +typedef union +{ + cl_uchar CL_ALIGNED(8) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_uchar4 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[4]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4[2]; +#endif +#if defined( __CL_UCHAR8__ ) + __cl_uchar8 v8; +#endif +}cl_uchar8; + +typedef union +{ + cl_uchar CL_ALIGNED(16) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_uchar8 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[8]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4[4]; +#endif +#if defined( __CL_UCHAR8__ ) + __cl_uchar8 v8[2]; +#endif +#if defined( __CL_UCHAR16__ ) + __cl_uchar16 v16; +#endif +}cl_uchar16; + + +/* ---- cl_shortn ---- */ +typedef union +{ + cl_short CL_ALIGNED(4) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_short x, y; }; + __CL_ANON_STRUCT__ struct{ cl_short s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_short lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2; +#endif +}cl_short2; + +typedef union +{ + cl_short CL_ALIGNED(8) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_short2 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[2]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4; +#endif +}cl_short4; + +/* cl_short3 is identical in size, alignment and behavior to cl_short4. See section 6.1.5. */ +typedef cl_short4 cl_short3; + +typedef union +{ + cl_short CL_ALIGNED(16) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_short4 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[4]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4[2]; +#endif +#if defined( __CL_SHORT8__ ) + __cl_short8 v8; +#endif +}cl_short8; + +typedef union +{ + cl_short CL_ALIGNED(32) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_short8 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[8]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4[4]; +#endif +#if defined( __CL_SHORT8__ ) + __cl_short8 v8[2]; +#endif +#if defined( __CL_SHORT16__ ) + __cl_short16 v16; +#endif +}cl_short16; + + +/* ---- cl_ushortn ---- */ +typedef union +{ + cl_ushort CL_ALIGNED(4) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ushort x, y; }; + __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_ushort lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2; +#endif +}cl_ushort2; + +typedef union +{ + cl_ushort CL_ALIGNED(8) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_ushort2 lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2[2]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4; +#endif +}cl_ushort4; + +/* cl_ushort3 is identical in size, alignment and behavior to cl_ushort4. See section 6.1.5. */ +typedef cl_ushort4 cl_ushort3; + +typedef union +{ + cl_ushort CL_ALIGNED(16) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_ushort4 lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2[4]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4[2]; +#endif +#if defined( __CL_USHORT8__ ) + __cl_ushort8 v8; +#endif +}cl_ushort8; + +typedef union +{ + cl_ushort CL_ALIGNED(32) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_ushort8 lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2[8]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4[4]; +#endif +#if defined( __CL_USHORT8__ ) + __cl_ushort8 v8[2]; +#endif +#if defined( __CL_USHORT16__ ) + __cl_ushort16 v16; +#endif +}cl_ushort16; + +/* ---- cl_intn ---- */ +typedef union +{ + cl_int CL_ALIGNED(8) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_int x, y; }; + __CL_ANON_STRUCT__ struct{ cl_int s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_int lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2; +#endif +}cl_int2; + +typedef union +{ + cl_int CL_ALIGNED(16) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_int2 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[2]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4; +#endif +}cl_int4; + +/* cl_int3 is identical in size, alignment and behavior to cl_int4. See section 6.1.5. */ +typedef cl_int4 cl_int3; + +typedef union +{ + cl_int CL_ALIGNED(32) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_int4 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[4]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4[2]; +#endif +#if defined( __CL_INT8__ ) + __cl_int8 v8; +#endif +}cl_int8; + +typedef union +{ + cl_int CL_ALIGNED(64) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_int8 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[8]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4[4]; +#endif +#if defined( __CL_INT8__ ) + __cl_int8 v8[2]; +#endif +#if defined( __CL_INT16__ ) + __cl_int16 v16; +#endif +}cl_int16; + + +/* ---- cl_uintn ---- */ +typedef union +{ + cl_uint CL_ALIGNED(8) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uint x, y; }; + __CL_ANON_STRUCT__ struct{ cl_uint s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_uint lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2; +#endif +}cl_uint2; + +typedef union +{ + cl_uint CL_ALIGNED(16) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_uint2 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[2]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4; +#endif +}cl_uint4; + +/* cl_uint3 is identical in size, alignment and behavior to cl_uint4. See section 6.1.5. */ +typedef cl_uint4 cl_uint3; + +typedef union +{ + cl_uint CL_ALIGNED(32) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_uint4 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[4]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4[2]; +#endif +#if defined( __CL_UINT8__ ) + __cl_uint8 v8; +#endif +}cl_uint8; + +typedef union +{ + cl_uint CL_ALIGNED(64) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_uint8 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[8]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4[4]; +#endif +#if defined( __CL_UINT8__ ) + __cl_uint8 v8[2]; +#endif +#if defined( __CL_UINT16__ ) + __cl_uint16 v16; +#endif +}cl_uint16; + +/* ---- cl_longn ---- */ +typedef union +{ + cl_long CL_ALIGNED(16) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_long x, y; }; + __CL_ANON_STRUCT__ struct{ cl_long s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_long lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2; +#endif +}cl_long2; + +typedef union +{ + cl_long CL_ALIGNED(32) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_long2 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[2]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4; +#endif +}cl_long4; + +/* cl_long3 is identical in size, alignment and behavior to cl_long4. See section 6.1.5. */ +typedef cl_long4 cl_long3; + +typedef union +{ + cl_long CL_ALIGNED(64) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_long4 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[4]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4[2]; +#endif +#if defined( __CL_LONG8__ ) + __cl_long8 v8; +#endif +}cl_long8; + +typedef union +{ + cl_long CL_ALIGNED(128) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_long8 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[8]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4[4]; +#endif +#if defined( __CL_LONG8__ ) + __cl_long8 v8[2]; +#endif +#if defined( __CL_LONG16__ ) + __cl_long16 v16; +#endif +}cl_long16; + + +/* ---- cl_ulongn ---- */ +typedef union +{ + cl_ulong CL_ALIGNED(16) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ulong x, y; }; + __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_ulong lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2; +#endif +}cl_ulong2; + +typedef union +{ + cl_ulong CL_ALIGNED(32) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_ulong2 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[2]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4; +#endif +}cl_ulong4; + +/* cl_ulong3 is identical in size, alignment and behavior to cl_ulong4. See section 6.1.5. */ +typedef cl_ulong4 cl_ulong3; + +typedef union +{ + cl_ulong CL_ALIGNED(64) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_ulong4 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[4]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4[2]; +#endif +#if defined( __CL_ULONG8__ ) + __cl_ulong8 v8; +#endif +}cl_ulong8; + +typedef union +{ + cl_ulong CL_ALIGNED(128) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_ulong8 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[8]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4[4]; +#endif +#if defined( __CL_ULONG8__ ) + __cl_ulong8 v8[2]; +#endif +#if defined( __CL_ULONG16__ ) + __cl_ulong16 v16; +#endif +}cl_ulong16; + + +/* --- cl_floatn ---- */ + +typedef union +{ + cl_float CL_ALIGNED(8) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_float x, y; }; + __CL_ANON_STRUCT__ struct{ cl_float s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_float lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2; +#endif +}cl_float2; + +typedef union +{ + cl_float CL_ALIGNED(16) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_float2 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[2]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4; +#endif +}cl_float4; + +/* cl_float3 is identical in size, alignment and behavior to cl_float4. See section 6.1.5. */ +typedef cl_float4 cl_float3; + +typedef union +{ + cl_float CL_ALIGNED(32) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_float4 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[4]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4[2]; +#endif +#if defined( __CL_FLOAT8__ ) + __cl_float8 v8; +#endif +}cl_float8; + +typedef union +{ + cl_float CL_ALIGNED(64) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_float8 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[8]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4[4]; +#endif +#if defined( __CL_FLOAT8__ ) + __cl_float8 v8[2]; +#endif +#if defined( __CL_FLOAT16__ ) + __cl_float16 v16; +#endif +}cl_float16; + +/* --- cl_doublen ---- */ + +typedef union +{ + cl_double CL_ALIGNED(16) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_double x, y; }; + __CL_ANON_STRUCT__ struct{ cl_double s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_double lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2; +#endif +}cl_double2; + +typedef union +{ + cl_double CL_ALIGNED(32) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_double2 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[2]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4; +#endif +}cl_double4; + +/* cl_double3 is identical in size, alignment and behavior to cl_double4. See section 6.1.5. */ +typedef cl_double4 cl_double3; + +typedef union +{ + cl_double CL_ALIGNED(64) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_double4 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[4]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4[2]; +#endif +#if defined( __CL_DOUBLE8__ ) + __cl_double8 v8; +#endif +}cl_double8; + +typedef union +{ + cl_double CL_ALIGNED(128) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_double8 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[8]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4[4]; +#endif +#if defined( __CL_DOUBLE8__ ) + __cl_double8 v8[2]; +#endif +#if defined( __CL_DOUBLE16__ ) + __cl_double16 v16; +#endif +}cl_double16; + +/* Macro to facilitate debugging + * Usage: + * Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source. + * The first line ends with: CL_PROGRAM_STRING_DEBUG_INFO \" + * Each line thereafter of OpenCL C source must end with: \n\ + * The last line ends in "; + * + * Example: + * + * const char *my_program = CL_PROGRAM_STRING_DEBUG_INFO "\ + * kernel void foo( int a, float * b ) \n\ + * { \n\ + * // my comment \n\ + * *b[ get_global_id(0)] = a; \n\ + * } \n\ + * "; + * + * This should correctly set up the line, (column) and file information for your source + * string so you can do source level debugging. + */ +#define __CL_STRINGIFY( _x ) # _x +#define _CL_STRINGIFY( _x ) __CL_STRINGIFY( _x ) +#define CL_PROGRAM_STRING_DEBUG_INFO "#line " _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n" + +#ifdef __cplusplus +} +#endif + +#undef __CL_HAS_ANON_STRUCT__ +#undef __CL_ANON_STRUCT__ +#if defined( _WIN32) && (_MSC_VER >= 1500) +#pragma warning( pop ) +#endif + +#endif /* __CL_PLATFORM_H */ diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/opencl.h b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/opencl.h new file mode 100644 index 000000000..fccacd168 --- /dev/null +++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/opencl.h @@ -0,0 +1,43 @@ +/******************************************************************************* + * Copyright (c) 2008-2012 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + ******************************************************************************/ + +/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ + +#ifndef __OPENCL_H +#define __OPENCL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <cl.h> +#include <cl_gl.h> +#include <cl_gl_ext.h> +#include <cl_ext.h> + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_H */ + diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/image.go b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/image.go new file mode 100644 index 000000000..d6a996377 --- /dev/null +++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/image.go @@ -0,0 +1,83 @@ +// +build cl12 + +package cl + +// #ifdef __APPLE__ +// #include "OpenCL/opencl.h" +// #else +// #include "cl.h" +// #endif +import "C" +import ( + "image" + "unsafe" +) + +func (ctx *Context) CreateImage(flags MemFlag, imageFormat ImageFormat, imageDesc ImageDescription, data []byte) (*MemObject, error) { + format := imageFormat.toCl() + desc := imageDesc.toCl() + var dataPtr unsafe.Pointer + if data != nil { + dataPtr = unsafe.Pointer(&data[0]) + } + var err C.cl_int + clBuffer := C.clCreateImage(ctx.clContext, C.cl_mem_flags(flags), &format, &desc, dataPtr, &err) + if err != C.CL_SUCCESS { + return nil, toError(err) + } + if clBuffer == nil { + return nil, ErrUnknown + } + return newMemObject(clBuffer, len(data)), nil +} + +func (ctx *Context) CreateImageSimple(flags MemFlag, width, height int, channelOrder ChannelOrder, channelDataType ChannelDataType, data []byte) (*MemObject, error) { + format := ImageFormat{channelOrder, channelDataType} + desc := ImageDescription{ + Type: MemObjectTypeImage2D, + Width: width, + Height: height, + } + return ctx.CreateImage(flags, format, desc, data) +} + +func (ctx *Context) CreateImageFromImage(flags MemFlag, img image.Image) (*MemObject, error) { + switch m := img.(type) { + case *image.Gray: + format := ImageFormat{ChannelOrderIntensity, ChannelDataTypeUNormInt8} + desc := ImageDescription{ + Type: MemObjectTypeImage2D, + Width: m.Bounds().Dx(), + Height: m.Bounds().Dy(), + RowPitch: m.Stride, + } + return ctx.CreateImage(flags, format, desc, m.Pix) + case *image.RGBA: + format := ImageFormat{ChannelOrderRGBA, ChannelDataTypeUNormInt8} + desc := ImageDescription{ + Type: MemObjectTypeImage2D, + Width: m.Bounds().Dx(), + Height: m.Bounds().Dy(), + RowPitch: m.Stride, + } + return ctx.CreateImage(flags, format, desc, m.Pix) + } + + b := img.Bounds() + w := b.Dx() + h := b.Dy() + data := make([]byte, w*h*4) + dataOffset := 0 + for y := 0; y < h; y++ { + for x := 0; x < w; x++ { + c := img.At(x+b.Min.X, y+b.Min.Y) + r, g, b, a := c.RGBA() + data[dataOffset] = uint8(r >> 8) + data[dataOffset+1] = uint8(g >> 8) + data[dataOffset+2] = uint8(b >> 8) + data[dataOffset+3] = uint8(a >> 8) + dataOffset += 4 + } + } + return ctx.CreateImageSimple(flags, w, h, ChannelOrderRGBA, ChannelDataTypeUNormInt8, data) +} diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/kernel.go b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/kernel.go new file mode 100644 index 000000000..894a775e8 --- /dev/null +++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/kernel.go @@ -0,0 +1,127 @@ +package cl + +// #ifdef __APPLE__ +// #include "OpenCL/opencl.h" +// #else +// #include "cl.h" +// #endif +import "C" + +import ( + "fmt" + "unsafe" +) + +type ErrUnsupportedArgumentType struct { + Index int + Value interface{} +} + +func (e ErrUnsupportedArgumentType) Error() string { + return fmt.Sprintf("cl: unsupported argument type for index %d: %+v", e.Index, e.Value) +} + +type Kernel struct { + clKernel C.cl_kernel + name string +} + +type LocalBuffer int + +func releaseKernel(k *Kernel) { + if k.clKernel != nil { + C.clReleaseKernel(k.clKernel) + k.clKernel = nil + } +} + +func (k *Kernel) Release() { + releaseKernel(k) +} + +func (k *Kernel) SetArgs(args ...interface{}) error { + for index, arg := range args { + if err := k.SetArg(index, arg); err != nil { + return err + } + } + return nil +} + +func (k *Kernel) SetArg(index int, arg interface{}) error { + switch val := arg.(type) { + case uint8: + return k.SetArgUint8(index, val) + case int8: + return k.SetArgInt8(index, val) + case uint32: + return k.SetArgUint32(index, val) + case uint64: + return k.SetArgUint64(index, val) + case int32: + return k.SetArgInt32(index, val) + case float32: + return k.SetArgFloat32(index, val) + case *MemObject: + return k.SetArgBuffer(index, val) + case LocalBuffer: + return k.SetArgLocal(index, int(val)) + default: + return ErrUnsupportedArgumentType{Index: index, Value: arg} + } +} + +func (k *Kernel) SetArgBuffer(index int, buffer *MemObject) error { + return k.SetArgUnsafe(index, int(unsafe.Sizeof(buffer.clMem)), unsafe.Pointer(&buffer.clMem)) +} + +func (k *Kernel) SetArgFloat32(index int, val float32) error { + return k.SetArgUnsafe(index, int(unsafe.Sizeof(val)), unsafe.Pointer(&val)) +} + +func (k *Kernel) SetArgInt8(index int, val int8) error { + return k.SetArgUnsafe(index, int(unsafe.Sizeof(val)), unsafe.Pointer(&val)) +} + +func (k *Kernel) SetArgUint8(index int, val uint8) error { + return k.SetArgUnsafe(index, int(unsafe.Sizeof(val)), unsafe.Pointer(&val)) +} + +func (k *Kernel) SetArgInt32(index int, val int32) error { + return k.SetArgUnsafe(index, int(unsafe.Sizeof(val)), unsafe.Pointer(&val)) +} + +func (k *Kernel) SetArgUint32(index int, val uint32) error { + return k.SetArgUnsafe(index, int(unsafe.Sizeof(val)), unsafe.Pointer(&val)) +} + +func (k *Kernel) SetArgUint64(index int, val uint64) error { + return k.SetArgUnsafe(index, int(unsafe.Sizeof(val)), unsafe.Pointer(&val)) +} + +func (k *Kernel) SetArgLocal(index int, size int) error { + return k.SetArgUnsafe(index, size, nil) +} + +func (k *Kernel) SetArgUnsafe(index, argSize int, arg unsafe.Pointer) error { + //fmt.Println("FUNKY: ", index, argSize) + return toError(C.clSetKernelArg(k.clKernel, C.cl_uint(index), C.size_t(argSize), arg)) +} + +func (k *Kernel) PreferredWorkGroupSizeMultiple(device *Device) (int, error) { + var size C.size_t + err := C.clGetKernelWorkGroupInfo(k.clKernel, device.nullableId(), C.CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, C.size_t(unsafe.Sizeof(size)), unsafe.Pointer(&size), nil) + return int(size), toError(err) +} + +func (k *Kernel) WorkGroupSize(device *Device) (int, error) { + var size C.size_t + err := C.clGetKernelWorkGroupInfo(k.clKernel, device.nullableId(), C.CL_KERNEL_WORK_GROUP_SIZE, C.size_t(unsafe.Sizeof(size)), unsafe.Pointer(&size), nil) + return int(size), toError(err) +} + +func (k *Kernel) NumArgs() (int, error) { + var num C.cl_uint + err := C.clGetKernelInfo(k.clKernel, C.CL_KERNEL_NUM_ARGS, C.size_t(unsafe.Sizeof(num)), unsafe.Pointer(&num), nil) + return int(num), toError(err) +} diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/kernel10.go b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/kernel10.go new file mode 100644 index 000000000..579946068 --- /dev/null +++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/kernel10.go @@ -0,0 +1,7 @@ +// +build !cl12 + +package cl + +func (k *Kernel) ArgName(index int) (string, error) { + return "", ErrUnsupported +} diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/kernel12.go b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/kernel12.go new file mode 100644 index 000000000..d9e6f3546 --- /dev/null +++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/kernel12.go @@ -0,0 +1,20 @@ +// +build cl12 + +package cl + +// #ifdef __APPLE__ +// #include "OpenCL/opencl.h" +// #else +// #include "cl.h" +// #endif +import "C" +import "unsafe" + +func (k *Kernel) ArgName(index int) (string, error) { + var strC [1024]byte + var strN C.size_t + if err := C.clGetKernelArgInfo(k.clKernel, C.cl_uint(index), C.CL_KERNEL_ARG_NAME, 1024, unsafe.Pointer(&strC[0]), &strN); err != C.CL_SUCCESS { + return "", toError(err) + } + return string(strC[:strN]), nil +} diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/platform.go b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/platform.go new file mode 100644 index 000000000..fd1d162cf --- /dev/null +++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/platform.go @@ -0,0 +1,83 @@ +package cl + +// #ifdef __APPLE__ +// #include "OpenCL/opencl.h" +// #else +// #include "cl.h" +// #endif +import "C" + +import "unsafe" + +const maxPlatforms = 32 + +type Platform struct { + id C.cl_platform_id +} + +// Obtain the list of platforms available. +func GetPlatforms() ([]*Platform, error) { + var platformIds [maxPlatforms]C.cl_platform_id + var nPlatforms C.cl_uint + if err := C.clGetPlatformIDs(C.cl_uint(maxPlatforms), &platformIds[0], &nPlatforms); err != C.CL_SUCCESS { + return nil, toError(err) + } + platforms := make([]*Platform, nPlatforms) + for i := 0; i < int(nPlatforms); i++ { + platforms[i] = &Platform{id: platformIds[i]} + } + return platforms, nil +} + +func (p *Platform) GetDevices(deviceType DeviceType) ([]*Device, error) { + return GetDevices(p, deviceType) +} + +func (p *Platform) getInfoString(param C.cl_platform_info) (string, error) { + var strC [2048]byte + var strN C.size_t + if err := C.clGetPlatformInfo(p.id, param, 2048, unsafe.Pointer(&strC[0]), &strN); err != C.CL_SUCCESS { + return "", toError(err) + } + return string(strC[:(strN - 1)]), nil +} + +func (p *Platform) Name() string { + if str, err := p.getInfoString(C.CL_PLATFORM_NAME); err != nil { + panic("Platform.Name() should never fail") + } else { + return str + } +} + +func (p *Platform) Vendor() string { + if str, err := p.getInfoString(C.CL_PLATFORM_VENDOR); err != nil { + panic("Platform.Vendor() should never fail") + } else { + return str + } +} + +func (p *Platform) Profile() string { + if str, err := p.getInfoString(C.CL_PLATFORM_PROFILE); err != nil { + panic("Platform.Profile() should never fail") + } else { + return str + } +} + +func (p *Platform) Version() string { + if str, err := p.getInfoString(C.CL_PLATFORM_VERSION); err != nil { + panic("Platform.Version() should never fail") + } else { + return str + } +} + +func (p *Platform) Extensions() string { + if str, err := p.getInfoString(C.CL_PLATFORM_EXTENSIONS); err != nil { + panic("Platform.Extensions() should never fail") + } else { + return str + } +} diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/program.go b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/program.go new file mode 100644 index 000000000..e75f7ee0e --- /dev/null +++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/program.go @@ -0,0 +1,105 @@ +package cl + +// #include <stdlib.h> +// #ifdef __APPLE__ +// #include "OpenCL/opencl.h" +// #else +// #include "cl.h" +// #endif +import "C" + +import ( + "fmt" + "runtime" + "unsafe" +) + +type BuildError struct { + Message string + Device *Device +} + +func (e BuildError) Error() string { + if e.Device != nil { + return fmt.Sprintf("cl: build error on %q: %s", e.Device.Name(), e.Message) + } else { + return fmt.Sprintf("cl: build error: %s", e.Message) + } +} + +type Program struct { + clProgram C.cl_program + devices []*Device +} + +func releaseProgram(p *Program) { + if p.clProgram != nil { + C.clReleaseProgram(p.clProgram) + p.clProgram = nil + } +} + +func (p *Program) Release() { + releaseProgram(p) +} + +func (p *Program) BuildProgram(devices []*Device, options string) error { + var cOptions *C.char + if options != "" { + cOptions = C.CString(options) + defer C.free(unsafe.Pointer(cOptions)) + } + var deviceList []C.cl_device_id + var deviceListPtr *C.cl_device_id + numDevices := C.cl_uint(len(devices)) + if devices != nil && len(devices) > 0 { + deviceList = buildDeviceIdList(devices) + deviceListPtr = &deviceList[0] + } + if err := C.clBuildProgram(p.clProgram, numDevices, deviceListPtr, cOptions, nil, nil); err != C.CL_SUCCESS { + buffer := make([]byte, 4096) + var bLen C.size_t + var err C.cl_int + + for _, dev := range p.devices { + for i := 2; i >= 0; i-- { + err = C.clGetProgramBuildInfo(p.clProgram, dev.id, C.CL_PROGRAM_BUILD_LOG, C.size_t(len(buffer)), unsafe.Pointer(&buffer[0]), &bLen) + if err == C.CL_INVALID_VALUE && i > 0 && bLen < 1024*1024 { + // INVALID_VALUE probably means our buffer isn't large enough + buffer = make([]byte, bLen) + } else { + break + } + } + if err != C.CL_SUCCESS { + return toError(err) + } + + if bLen > 1 { + return BuildError{ + Device: dev, + Message: string(buffer[:bLen-1]), + } + } + } + + return BuildError{ + Device: nil, + Message: "build failed and produced no log entries", + } + } + return nil +} + +func (p *Program) CreateKernel(name string) (*Kernel, error) { + cName := C.CString(name) + defer C.free(unsafe.Pointer(cName)) + var err C.cl_int + clKernel := C.clCreateKernel(p.clProgram, cName, &err) + if err != C.CL_SUCCESS { + return nil, toError(err) + } + kernel := &Kernel{clKernel: clKernel, name: name} + runtime.SetFinalizer(kernel, releaseKernel) + return kernel, nil +} diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/queue.go b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/queue.go new file mode 100644 index 000000000..7762746da --- /dev/null +++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/queue.go @@ -0,0 +1,193 @@ +package cl + +// #ifdef __APPLE__ +// #include "OpenCL/opencl.h" +// #else +// #include "cl.h" +// #endif +import "C" + +import "unsafe" + +type CommandQueueProperty int + +const ( + CommandQueueOutOfOrderExecModeEnable CommandQueueProperty = C.CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE + CommandQueueProfilingEnable CommandQueueProperty = C.CL_QUEUE_PROFILING_ENABLE +) + +type CommandQueue struct { + clQueue C.cl_command_queue + device *Device +} + +func releaseCommandQueue(q *CommandQueue) { + if q.clQueue != nil { + C.clReleaseCommandQueue(q.clQueue) + q.clQueue = nil + } +} + +// Call clReleaseCommandQueue on the CommandQueue. Using the CommandQueue after Release will cause a panick. +func (q *CommandQueue) Release() { + releaseCommandQueue(q) +} + +// Blocks until all previously queued OpenCL commands in a command-queue are issued to the associated device and have completed. +func (q *CommandQueue) Finish() error { + return toError(C.clFinish(q.clQueue)) +} + +// Issues all previously queued OpenCL commands in a command-queue to the device associated with the command-queue. +func (q *CommandQueue) Flush() error { + return toError(C.clFlush(q.clQueue)) +} + +// Enqueues a command to map a region of the buffer object given by buffer into the host address space and returns a pointer to this mapped region. +func (q *CommandQueue) EnqueueMapBuffer(buffer *MemObject, blocking bool, flags MapFlag, offset, size int, eventWaitList []*Event) (*MappedMemObject, *Event, error) { + var event C.cl_event + var err C.cl_int + ptr := C.clEnqueueMapBuffer(q.clQueue, buffer.clMem, clBool(blocking), flags.toCl(), C.size_t(offset), C.size_t(size), C.cl_uint(len(eventWaitList)), eventListPtr(eventWaitList), &event, &err) + if err != C.CL_SUCCESS { + return nil, nil, toError(err) + } + ev := newEvent(event) + if ptr == nil { + return nil, ev, ErrUnknown + } + return &MappedMemObject{ptr: ptr, size: size}, ev, nil +} + +// Enqueues a command to map a region of an image object into the host address space and returns a pointer to this mapped region. +func (q *CommandQueue) EnqueueMapImage(buffer *MemObject, blocking bool, flags MapFlag, origin, region [3]int, eventWaitList []*Event) (*MappedMemObject, *Event, error) { + cOrigin := sizeT3(origin) + cRegion := sizeT3(region) + var event C.cl_event + var err C.cl_int + var rowPitch, slicePitch C.size_t + ptr := C.clEnqueueMapImage(q.clQueue, buffer.clMem, clBool(blocking), flags.toCl(), &cOrigin[0], &cRegion[0], &rowPitch, &slicePitch, C.cl_uint(len(eventWaitList)), eventListPtr(eventWaitList), &event, &err) + if err != C.CL_SUCCESS { + return nil, nil, toError(err) + } + ev := newEvent(event) + if ptr == nil { + return nil, ev, ErrUnknown + } + size := 0 // TODO: could calculate this + return &MappedMemObject{ptr: ptr, size: size, rowPitch: int(rowPitch), slicePitch: int(slicePitch)}, ev, nil +} + +// Enqueues a command to unmap a previously mapped region of a memory object. +func (q *CommandQueue) EnqueueUnmapMemObject(buffer *MemObject, mappedObj *MappedMemObject, eventWaitList []*Event) (*Event, error) { + var event C.cl_event + if err := C.clEnqueueUnmapMemObject(q.clQueue, buffer.clMem, mappedObj.ptr, C.cl_uint(len(eventWaitList)), eventListPtr(eventWaitList), &event); err != C.CL_SUCCESS { + return nil, toError(err) + } + return newEvent(event), nil +} + +// Enqueues a command to copy a buffer object to another buffer object. +func (q *CommandQueue) EnqueueCopyBuffer(srcBuffer, dstBuffer *MemObject, srcOffset, dstOffset, byteCount int, eventWaitList []*Event) (*Event, error) { + var event C.cl_event + err := toError(C.clEnqueueCopyBuffer(q.clQueue, srcBuffer.clMem, dstBuffer.clMem, C.size_t(srcOffset), C.size_t(dstOffset), C.size_t(byteCount), C.cl_uint(len(eventWaitList)), eventListPtr(eventWaitList), &event)) + return newEvent(event), err +} + +// Enqueue commands to write to a buffer object from host memory. +func (q *CommandQueue) EnqueueWriteBuffer(buffer *MemObject, blocking bool, offset, dataSize int, dataPtr unsafe.Pointer, eventWaitList []*Event) (*Event, error) { + var event C.cl_event + err := toError(C.clEnqueueWriteBuffer(q.clQueue, buffer.clMem, clBool(blocking), C.size_t(offset), C.size_t(dataSize), dataPtr, C.cl_uint(len(eventWaitList)), eventListPtr(eventWaitList), &event)) + return newEvent(event), err +} + +func (q *CommandQueue) EnqueueWriteBufferFloat32(buffer *MemObject, blocking bool, offset int, data []float32, eventWaitList []*Event) (*Event, error) { + dataPtr := unsafe.Pointer(&data[0]) + dataSize := int(unsafe.Sizeof(data[0])) * len(data) + return q.EnqueueWriteBuffer(buffer, blocking, offset, dataSize, dataPtr, eventWaitList) +} + +// Enqueue commands to read from a buffer object to host memory. +func (q *CommandQueue) EnqueueReadBuffer(buffer *MemObject, blocking bool, offset, dataSize int, dataPtr unsafe.Pointer, eventWaitList []*Event) (*Event, error) { + var event C.cl_event + err := toError(C.clEnqueueReadBuffer(q.clQueue, buffer.clMem, clBool(blocking), C.size_t(offset), C.size_t(dataSize), dataPtr, C.cl_uint(len(eventWaitList)), eventListPtr(eventWaitList), &event)) + return newEvent(event), err +} + +func (q *CommandQueue) EnqueueReadBufferFloat32(buffer *MemObject, blocking bool, offset int, data []float32, eventWaitList []*Event) (*Event, error) { + dataPtr := unsafe.Pointer(&data[0]) + dataSize := int(unsafe.Sizeof(data[0])) * len(data) + return q.EnqueueReadBuffer(buffer, blocking, offset, dataSize, dataPtr, eventWaitList) +} + +// Enqueues a command to execute a kernel on a device. +func (q *CommandQueue) EnqueueNDRangeKernel(kernel *Kernel, globalWorkOffset, globalWorkSize, localWorkSize []int, eventWaitList []*Event) (*Event, error) { + workDim := len(globalWorkSize) + var globalWorkOffsetList []C.size_t + var globalWorkOffsetPtr *C.size_t + if globalWorkOffset != nil { + globalWorkOffsetList = make([]C.size_t, len(globalWorkOffset)) + for i, off := range globalWorkOffset { + globalWorkOffsetList[i] = C.size_t(off) + } + globalWorkOffsetPtr = &globalWorkOffsetList[0] + } + var globalWorkSizeList []C.size_t + var globalWorkSizePtr *C.size_t + if globalWorkSize != nil { + globalWorkSizeList = make([]C.size_t, len(globalWorkSize)) + for i, off := range globalWorkSize { + globalWorkSizeList[i] = C.size_t(off) + } + globalWorkSizePtr = &globalWorkSizeList[0] + } + var localWorkSizeList []C.size_t + var localWorkSizePtr *C.size_t + if localWorkSize != nil { + localWorkSizeList = make([]C.size_t, len(localWorkSize)) + for i, off := range localWorkSize { + localWorkSizeList[i] = C.size_t(off) + } + localWorkSizePtr = &localWorkSizeList[0] + } + var event C.cl_event + err := toError(C.clEnqueueNDRangeKernel(q.clQueue, kernel.clKernel, C.cl_uint(workDim), globalWorkOffsetPtr, globalWorkSizePtr, localWorkSizePtr, C.cl_uint(len(eventWaitList)), eventListPtr(eventWaitList), &event)) + return newEvent(event), err +} + +// Enqueues a command to read from a 2D or 3D image object to host memory. +func (q *CommandQueue) EnqueueReadImage(image *MemObject, blocking bool, origin, region [3]int, rowPitch, slicePitch int, data []byte, eventWaitList []*Event) (*Event, error) { + cOrigin := sizeT3(origin) + cRegion := sizeT3(region) + var event C.cl_event + err := toError(C.clEnqueueReadImage(q.clQueue, image.clMem, clBool(blocking), &cOrigin[0], &cRegion[0], C.size_t(rowPitch), C.size_t(slicePitch), unsafe.Pointer(&data[0]), C.cl_uint(len(eventWaitList)), eventListPtr(eventWaitList), &event)) + return newEvent(event), err +} + +// Enqueues a command to write from a 2D or 3D image object to host memory. +func (q *CommandQueue) EnqueueWriteImage(image *MemObject, blocking bool, origin, region [3]int, rowPitch, slicePitch int, data []byte, eventWaitList []*Event) (*Event, error) { + cOrigin := sizeT3(origin) + cRegion := sizeT3(region) + var event C.cl_event + err := toError(C.clEnqueueWriteImage(q.clQueue, image.clMem, clBool(blocking), &cOrigin[0], &cRegion[0], C.size_t(rowPitch), C.size_t(slicePitch), unsafe.Pointer(&data[0]), C.cl_uint(len(eventWaitList)), eventListPtr(eventWaitList), &event)) + return newEvent(event), err +} + +func (q *CommandQueue) EnqueueFillBuffer(buffer *MemObject, pattern unsafe.Pointer, patternSize, offset, size int, eventWaitList []*Event) (*Event, error) { + var event C.cl_event + err := toError(C.clEnqueueFillBuffer(q.clQueue, buffer.clMem, pattern, C.size_t(patternSize), C.size_t(offset), C.size_t(size), C.cl_uint(len(eventWaitList)), eventListPtr(eventWaitList), &event)) + return newEvent(event), err +} + +// A synchronization point that enqueues a barrier operation. +func (q *CommandQueue) EnqueueBarrierWithWaitList(eventWaitList []*Event) (*Event, error) { + var event C.cl_event + err := toError(C.clEnqueueBarrierWithWaitList(q.clQueue, C.cl_uint(len(eventWaitList)), eventListPtr(eventWaitList), &event)) + return newEvent(event), err +} + +// Enqueues a marker command which waits for either a list of events to complete, or all previously enqueued commands to complete. +func (q *CommandQueue) EnqueueMarkerWithWaitList(eventWaitList []*Event) (*Event, error) { + var event C.cl_event + err := toError(C.clEnqueueMarkerWithWaitList(q.clQueue, C.cl_uint(len(eventWaitList)), eventListPtr(eventWaitList), &event)) + return newEvent(event), err +} diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/types.go b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/types.go new file mode 100644 index 000000000..00c846ce7 --- /dev/null +++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/types.go @@ -0,0 +1,487 @@ +package cl + +// #ifdef __APPLE__ +// #include "OpenCL/opencl.h" +// #else +// #include "cl.h" +// #endif +import "C" + +import ( + "errors" + "fmt" + "reflect" + "runtime" + "strings" + "unsafe" +) + +var ( + ErrUnknown = errors.New("cl: unknown error") // Generally an unexpected result from an OpenCL function (e.g. CL_SUCCESS but null pointer) +) + +type ErrOther int + +func (e ErrOther) Error() string { + return fmt.Sprintf("cl: error %d", int(e)) +} + +var ( + ErrDeviceNotFound = errors.New("cl: Device Not Found") + ErrDeviceNotAvailable = errors.New("cl: Device Not Available") + ErrCompilerNotAvailable = errors.New("cl: Compiler Not Available") + ErrMemObjectAllocationFailure = errors.New("cl: Mem Object Allocation Failure") + ErrOutOfResources = errors.New("cl: Out Of Resources") + ErrOutOfHostMemory = errors.New("cl: Out Of Host Memory") + ErrProfilingInfoNotAvailable = errors.New("cl: Profiling Info Not Available") + ErrMemCopyOverlap = errors.New("cl: Mem Copy Overlap") + ErrImageFormatMismatch = errors.New("cl: Image Format Mismatch") + ErrImageFormatNotSupported = errors.New("cl: Image Format Not Supported") + ErrBuildProgramFailure = errors.New("cl: Build Program Failure") + ErrMapFailure = errors.New("cl: Map Failure") + ErrMisalignedSubBufferOffset = errors.New("cl: Misaligned Sub Buffer Offset") + ErrExecStatusErrorForEventsInWaitList = errors.New("cl: Exec Status Error For Events In Wait List") + ErrCompileProgramFailure = errors.New("cl: Compile Program Failure") + ErrLinkerNotAvailable = errors.New("cl: Linker Not Available") + ErrLinkProgramFailure = errors.New("cl: Link Program Failure") + ErrDevicePartitionFailed = errors.New("cl: Device Partition Failed") + ErrKernelArgInfoNotAvailable = errors.New("cl: Kernel Arg Info Not Available") + ErrInvalidValue = errors.New("cl: Invalid Value") + ErrInvalidDeviceType = errors.New("cl: Invalid Device Type") + ErrInvalidPlatform = errors.New("cl: Invalid Platform") + ErrInvalidDevice = errors.New("cl: Invalid Device") + ErrInvalidContext = errors.New("cl: Invalid Context") + ErrInvalidQueueProperties = errors.New("cl: Invalid Queue Properties") + ErrInvalidCommandQueue = errors.New("cl: Invalid Command Queue") + ErrInvalidHostPtr = errors.New("cl: Invalid Host Ptr") + ErrInvalidMemObject = errors.New("cl: Invalid Mem Object") + ErrInvalidImageFormatDescriptor = errors.New("cl: Invalid Image Format Descriptor") + ErrInvalidImageSize = errors.New("cl: Invalid Image Size") + ErrInvalidSampler = errors.New("cl: Invalid Sampler") + ErrInvalidBinary = errors.New("cl: Invalid Binary") + ErrInvalidBuildOptions = errors.New("cl: Invalid Build Options") + ErrInvalidProgram = errors.New("cl: Invalid Program") + ErrInvalidProgramExecutable = errors.New("cl: Invalid Program Executable") + ErrInvalidKernelName = errors.New("cl: Invalid Kernel Name") + ErrInvalidKernelDefinition = errors.New("cl: Invalid Kernel Definition") + ErrInvalidKernel = errors.New("cl: Invalid Kernel") + ErrInvalidArgIndex = errors.New("cl: Invalid Arg Index") + ErrInvalidArgValue = errors.New("cl: Invalid Arg Value") + ErrInvalidArgSize = errors.New("cl: Invalid Arg Size") + ErrInvalidKernelArgs = errors.New("cl: Invalid Kernel Args") + ErrInvalidWorkDimension = errors.New("cl: Invalid Work Dimension") + ErrInvalidWorkGroupSize = errors.New("cl: Invalid Work Group Size") + ErrInvalidWorkItemSize = errors.New("cl: Invalid Work Item Size") + ErrInvalidGlobalOffset = errors.New("cl: Invalid Global Offset") + ErrInvalidEventWaitList = errors.New("cl: Invalid Event Wait List") + ErrInvalidEvent = errors.New("cl: Invalid Event") + ErrInvalidOperation = errors.New("cl: Invalid Operation") + ErrInvalidGlObject = errors.New("cl: Invalid Gl Object") + ErrInvalidBufferSize = errors.New("cl: Invalid Buffer Size") + ErrInvalidMipLevel = errors.New("cl: Invalid Mip Level") + ErrInvalidGlobalWorkSize = errors.New("cl: Invalid Global Work Size") + ErrInvalidProperty = errors.New("cl: Invalid Property") + ErrInvalidImageDescriptor = errors.New("cl: Invalid Image Descriptor") + ErrInvalidCompilerOptions = errors.New("cl: Invalid Compiler Options") + ErrInvalidLinkerOptions = errors.New("cl: Invalid Linker Options") + ErrInvalidDevicePartitionCount = errors.New("cl: Invalid Device Partition Count") +) +var errorMap = map[C.cl_int]error{ + C.CL_SUCCESS: nil, + C.CL_DEVICE_NOT_FOUND: ErrDeviceNotFound, + C.CL_DEVICE_NOT_AVAILABLE: ErrDeviceNotAvailable, + C.CL_COMPILER_NOT_AVAILABLE: ErrCompilerNotAvailable, + C.CL_MEM_OBJECT_ALLOCATION_FAILURE: ErrMemObjectAllocationFailure, + C.CL_OUT_OF_RESOURCES: ErrOutOfResources, + C.CL_OUT_OF_HOST_MEMORY: ErrOutOfHostMemory, + C.CL_PROFILING_INFO_NOT_AVAILABLE: ErrProfilingInfoNotAvailable, + C.CL_MEM_COPY_OVERLAP: ErrMemCopyOverlap, + C.CL_IMAGE_FORMAT_MISMATCH: ErrImageFormatMismatch, + C.CL_IMAGE_FORMAT_NOT_SUPPORTED: ErrImageFormatNotSupported, + C.CL_BUILD_PROGRAM_FAILURE: ErrBuildProgramFailure, + C.CL_MAP_FAILURE: ErrMapFailure, + C.CL_MISALIGNED_SUB_BUFFER_OFFSET: ErrMisalignedSubBufferOffset, + C.CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: ErrExecStatusErrorForEventsInWaitList, + C.CL_INVALID_VALUE: ErrInvalidValue, + C.CL_INVALID_DEVICE_TYPE: ErrInvalidDeviceType, + C.CL_INVALID_PLATFORM: ErrInvalidPlatform, + C.CL_INVALID_DEVICE: ErrInvalidDevice, + C.CL_INVALID_CONTEXT: ErrInvalidContext, + C.CL_INVALID_QUEUE_PROPERTIES: ErrInvalidQueueProperties, + C.CL_INVALID_COMMAND_QUEUE: ErrInvalidCommandQueue, + C.CL_INVALID_HOST_PTR: ErrInvalidHostPtr, + C.CL_INVALID_MEM_OBJECT: ErrInvalidMemObject, + C.CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: ErrInvalidImageFormatDescriptor, + C.CL_INVALID_IMAGE_SIZE: ErrInvalidImageSize, + C.CL_INVALID_SAMPLER: ErrInvalidSampler, + C.CL_INVALID_BINARY: ErrInvalidBinary, + C.CL_INVALID_BUILD_OPTIONS: ErrInvalidBuildOptions, + C.CL_INVALID_PROGRAM: ErrInvalidProgram, + C.CL_INVALID_PROGRAM_EXECUTABLE: ErrInvalidProgramExecutable, + C.CL_INVALID_KERNEL_NAME: ErrInvalidKernelName, + C.CL_INVALID_KERNEL_DEFINITION: ErrInvalidKernelDefinition, + C.CL_INVALID_KERNEL: ErrInvalidKernel, + C.CL_INVALID_ARG_INDEX: ErrInvalidArgIndex, + C.CL_INVALID_ARG_VALUE: ErrInvalidArgValue, + C.CL_INVALID_ARG_SIZE: ErrInvalidArgSize, + C.CL_INVALID_KERNEL_ARGS: ErrInvalidKernelArgs, + C.CL_INVALID_WORK_DIMENSION: ErrInvalidWorkDimension, + C.CL_INVALID_WORK_GROUP_SIZE: ErrInvalidWorkGroupSize, + C.CL_INVALID_WORK_ITEM_SIZE: ErrInvalidWorkItemSize, + C.CL_INVALID_GLOBAL_OFFSET: ErrInvalidGlobalOffset, + C.CL_INVALID_EVENT_WAIT_LIST: ErrInvalidEventWaitList, + C.CL_INVALID_EVENT: ErrInvalidEvent, + C.CL_INVALID_OPERATION: ErrInvalidOperation, + C.CL_INVALID_GL_OBJECT: ErrInvalidGlObject, + C.CL_INVALID_BUFFER_SIZE: ErrInvalidBufferSize, + C.CL_INVALID_MIP_LEVEL: ErrInvalidMipLevel, + C.CL_INVALID_GLOBAL_WORK_SIZE: ErrInvalidGlobalWorkSize, + C.CL_INVALID_PROPERTY: ErrInvalidProperty, +} + +func toError(code C.cl_int) error { + if err, ok := errorMap[code]; ok { + return err + } + return ErrOther(code) +} + +type LocalMemType int + +const ( + LocalMemTypeNone LocalMemType = C.CL_NONE + LocalMemTypeGlobal LocalMemType = C.CL_GLOBAL + LocalMemTypeLocal LocalMemType = C.CL_LOCAL +) + +var localMemTypeMap = map[LocalMemType]string{ + LocalMemTypeNone: "None", + LocalMemTypeGlobal: "Global", + LocalMemTypeLocal: "Local", +} + +func (t LocalMemType) String() string { + name := localMemTypeMap[t] + if name == "" { + name = "Unknown" + } + return name +} + +type ExecCapability int + +const ( + ExecCapabilityKernel ExecCapability = C.CL_EXEC_KERNEL // The OpenCL device can execute OpenCL kernels. + ExecCapabilityNativeKernel ExecCapability = C.CL_EXEC_NATIVE_KERNEL // The OpenCL device can execute native kernels. +) + +func (ec ExecCapability) String() string { + var parts []string + if ec&ExecCapabilityKernel != 0 { + parts = append(parts, "Kernel") + } + if ec&ExecCapabilityNativeKernel != 0 { + parts = append(parts, "NativeKernel") + } + if parts == nil { + return "" + } + return strings.Join(parts, "|") +} + +type MemCacheType int + +const ( + MemCacheTypeNone MemCacheType = C.CL_NONE + MemCacheTypeReadOnlyCache MemCacheType = C.CL_READ_ONLY_CACHE + MemCacheTypeReadWriteCache MemCacheType = C.CL_READ_WRITE_CACHE +) + +func (ct MemCacheType) String() string { + switch ct { + case MemCacheTypeNone: + return "None" + case MemCacheTypeReadOnlyCache: + return "ReadOnly" + case MemCacheTypeReadWriteCache: + return "ReadWrite" + } + return fmt.Sprintf("Unknown(%x)", int(ct)) +} + +type MemFlag int + +const ( + MemReadWrite MemFlag = C.CL_MEM_READ_WRITE + MemWriteOnly MemFlag = C.CL_MEM_WRITE_ONLY + MemReadOnly MemFlag = C.CL_MEM_READ_ONLY + MemUseHostPtr MemFlag = C.CL_MEM_USE_HOST_PTR + MemAllocHostPtr MemFlag = C.CL_MEM_ALLOC_HOST_PTR + MemCopyHostPtr MemFlag = C.CL_MEM_COPY_HOST_PTR + + MemWriteOnlyHost MemFlag = C.CL_MEM_HOST_WRITE_ONLY + MemReadOnlyHost MemFlag = C.CL_MEM_HOST_READ_ONLY + MemNoAccessHost MemFlag = C.CL_MEM_HOST_NO_ACCESS +) + +type MemObjectType int + +const ( + MemObjectTypeBuffer MemObjectType = C.CL_MEM_OBJECT_BUFFER + MemObjectTypeImage2D MemObjectType = C.CL_MEM_OBJECT_IMAGE2D + MemObjectTypeImage3D MemObjectType = C.CL_MEM_OBJECT_IMAGE3D +) + +type MapFlag int + +const ( + // This flag specifies that the region being mapped in the memory object is being mapped for reading. + MapFlagRead MapFlag = C.CL_MAP_READ + MapFlagWrite MapFlag = C.CL_MAP_WRITE + MapFlagWriteInvalidateRegion MapFlag = C.CL_MAP_WRITE_INVALIDATE_REGION +) + +func (mf MapFlag) toCl() C.cl_map_flags { + return C.cl_map_flags(mf) +} + +type ChannelOrder int + +const ( + ChannelOrderR ChannelOrder = C.CL_R + ChannelOrderA ChannelOrder = C.CL_A + ChannelOrderRG ChannelOrder = C.CL_RG + ChannelOrderRA ChannelOrder = C.CL_RA + ChannelOrderRGB ChannelOrder = C.CL_RGB + ChannelOrderRGBA ChannelOrder = C.CL_RGBA + ChannelOrderBGRA ChannelOrder = C.CL_BGRA + ChannelOrderARGB ChannelOrder = C.CL_ARGB + ChannelOrderIntensity ChannelOrder = C.CL_INTENSITY + ChannelOrderLuminance ChannelOrder = C.CL_LUMINANCE + ChannelOrderRx ChannelOrder = C.CL_Rx + ChannelOrderRGx ChannelOrder = C.CL_RGx + ChannelOrderRGBx ChannelOrder = C.CL_RGBx +) + +var channelOrderNameMap = map[ChannelOrder]string{ + ChannelOrderR: "R", + ChannelOrderA: "A", + ChannelOrderRG: "RG", + ChannelOrderRA: "RA", + ChannelOrderRGB: "RGB", + ChannelOrderRGBA: "RGBA", + ChannelOrderBGRA: "BGRA", + ChannelOrderARGB: "ARGB", + ChannelOrderIntensity: "Intensity", + ChannelOrderLuminance: "Luminance", + ChannelOrderRx: "Rx", + ChannelOrderRGx: "RGx", + ChannelOrderRGBx: "RGBx", +} + +func (co ChannelOrder) String() string { + name := channelOrderNameMap[co] + if name == "" { + name = fmt.Sprintf("Unknown(%x)", int(co)) + } + return name +} + +type ChannelDataType int + +const ( + ChannelDataTypeSNormInt8 ChannelDataType = C.CL_SNORM_INT8 + ChannelDataTypeSNormInt16 ChannelDataType = C.CL_SNORM_INT16 + ChannelDataTypeUNormInt8 ChannelDataType = C.CL_UNORM_INT8 + ChannelDataTypeUNormInt16 ChannelDataType = C.CL_UNORM_INT16 + ChannelDataTypeUNormShort565 ChannelDataType = C.CL_UNORM_SHORT_565 + ChannelDataTypeUNormShort555 ChannelDataType = C.CL_UNORM_SHORT_555 + ChannelDataTypeUNormInt101010 ChannelDataType = C.CL_UNORM_INT_101010 + ChannelDataTypeSignedInt8 ChannelDataType = C.CL_SIGNED_INT8 + ChannelDataTypeSignedInt16 ChannelDataType = C.CL_SIGNED_INT16 + ChannelDataTypeSignedInt32 ChannelDataType = C.CL_SIGNED_INT32 + ChannelDataTypeUnsignedInt8 ChannelDataType = C.CL_UNSIGNED_INT8 + ChannelDataTypeUnsignedInt16 ChannelDataType = C.CL_UNSIGNED_INT16 + ChannelDataTypeUnsignedInt32 ChannelDataType = C.CL_UNSIGNED_INT32 + ChannelDataTypeHalfFloat ChannelDataType = C.CL_HALF_FLOAT + ChannelDataTypeFloat ChannelDataType = C.CL_FLOAT +) + +var channelDataTypeNameMap = map[ChannelDataType]string{ + ChannelDataTypeSNormInt8: "SNormInt8", + ChannelDataTypeSNormInt16: "SNormInt16", + ChannelDataTypeUNormInt8: "UNormInt8", + ChannelDataTypeUNormInt16: "UNormInt16", + ChannelDataTypeUNormShort565: "UNormShort565", + ChannelDataTypeUNormShort555: "UNormShort555", + ChannelDataTypeUNormInt101010: "UNormInt101010", + ChannelDataTypeSignedInt8: "SignedInt8", + ChannelDataTypeSignedInt16: "SignedInt16", + ChannelDataTypeSignedInt32: "SignedInt32", + ChannelDataTypeUnsignedInt8: "UnsignedInt8", + ChannelDataTypeUnsignedInt16: "UnsignedInt16", + ChannelDataTypeUnsignedInt32: "UnsignedInt32", + ChannelDataTypeHalfFloat: "HalfFloat", + ChannelDataTypeFloat: "Float", +} + +func (ct ChannelDataType) String() string { + name := channelDataTypeNameMap[ct] + if name == "" { + name = fmt.Sprintf("Unknown(%x)", int(ct)) + } + return name +} + +type ImageFormat struct { + ChannelOrder ChannelOrder + ChannelDataType ChannelDataType +} + +func (f ImageFormat) toCl() C.cl_image_format { + var format C.cl_image_format + format.image_channel_order = C.cl_channel_order(f.ChannelOrder) + format.image_channel_data_type = C.cl_channel_type(f.ChannelDataType) + return format +} + +type ProfilingInfo int + +const ( + // A 64-bit value that describes the current device time counter in + // nanoseconds when the command identified by event is enqueued in + // a command-queue by the host. + ProfilingInfoCommandQueued ProfilingInfo = C.CL_PROFILING_COMMAND_QUEUED + // A 64-bit value that describes the current device time counter in + // nanoseconds when the command identified by event that has been + // enqueued is submitted by the host to the device associated with the command-queue. + ProfilingInfoCommandSubmit ProfilingInfo = C.CL_PROFILING_COMMAND_SUBMIT + // A 64-bit value that describes the current device time counter in + // nanoseconds when the command identified by event starts execution on the device. + ProfilingInfoCommandStart ProfilingInfo = C.CL_PROFILING_COMMAND_START + // A 64-bit value that describes the current device time counter in + // nanoseconds when the command identified by event has finished + // execution on the device. + ProfilingInfoCommandEnd ProfilingInfo = C.CL_PROFILING_COMMAND_END +) + +type CommmandExecStatus int + +const ( + CommmandExecStatusComplete CommmandExecStatus = C.CL_COMPLETE + CommmandExecStatusRunning CommmandExecStatus = C.CL_RUNNING + CommmandExecStatusSubmitted CommmandExecStatus = C.CL_SUBMITTED + CommmandExecStatusQueued CommmandExecStatus = C.CL_QUEUED +) + +type Event struct { + clEvent C.cl_event +} + +func releaseEvent(ev *Event) { + if ev.clEvent != nil { + C.clReleaseEvent(ev.clEvent) + ev.clEvent = nil + } +} + +func (e *Event) Release() { + releaseEvent(e) +} + +func (e *Event) GetEventProfilingInfo(paramName ProfilingInfo) (int64, error) { + var paramValue C.cl_ulong + if err := C.clGetEventProfilingInfo(e.clEvent, C.cl_profiling_info(paramName), C.size_t(unsafe.Sizeof(paramValue)), unsafe.Pointer(¶mValue), nil); err != C.CL_SUCCESS { + return 0, toError(err) + } + return int64(paramValue), nil +} + +// Sets the execution status of a user event object. +// +// `status` specifies the new execution status to be set and +// can be CL_COMPLETE or a negative integer value to indicate +// an error. A negative integer value causes all enqueued commands +// that wait on this user event to be terminated. clSetUserEventStatus +// can only be called once to change the execution status of event. +func (e *Event) SetUserEventStatus(status int) error { + return toError(C.clSetUserEventStatus(e.clEvent, C.cl_int(status))) +} + +// Waits on the host thread for commands identified by event objects in +// events to complete. A command is considered complete if its execution +// status is CL_COMPLETE or a negative value. The events specified in +// event_list act as synchronization points. +// +// If the cl_khr_gl_event extension is enabled, event objects can also be +// used to reflect the status of an OpenGL sync object. The sync object +// in turn refers to a fence command executing in an OpenGL command +// stream. This provides another method of coordinating sharing of buffers +// and images between OpenGL and OpenCL. +func WaitForEvents(events []*Event) error { + return toError(C.clWaitForEvents(C.cl_uint(len(events)), eventListPtr(events))) +} + +func newEvent(clEvent C.cl_event) *Event { + ev := &Event{clEvent: clEvent} + runtime.SetFinalizer(ev, releaseEvent) + return ev +} + +func eventListPtr(el []*Event) *C.cl_event { + if el == nil { + return nil + } + elist := make([]C.cl_event, len(el)) + for i, e := range el { + elist[i] = e.clEvent + } + return (*C.cl_event)(&elist[0]) +} + +func clBool(b bool) C.cl_bool { + if b { + return C.CL_TRUE + } + return C.CL_FALSE +} + +func sizeT3(i3 [3]int) [3]C.size_t { + var val [3]C.size_t + val[0] = C.size_t(i3[0]) + val[1] = C.size_t(i3[1]) + val[2] = C.size_t(i3[2]) + return val +} + +type MappedMemObject struct { + ptr unsafe.Pointer + size int + rowPitch int + slicePitch int +} + +func (mb *MappedMemObject) ByteSlice() []byte { + var byteSlice []byte + sliceHeader := (*reflect.SliceHeader)(unsafe.Pointer(&byteSlice)) + sliceHeader.Cap = mb.size + sliceHeader.Len = mb.size + sliceHeader.Data = uintptr(mb.ptr) + return byteSlice +} + +func (mb *MappedMemObject) Ptr() unsafe.Pointer { + return mb.ptr +} + +func (mb *MappedMemObject) Size() int { + return mb.size +} + +func (mb *MappedMemObject) RowPitch() int { + return mb.rowPitch +} + +func (mb *MappedMemObject) SlicePitch() int { + return mb.slicePitch +} diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/types12.go b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/types12.go new file mode 100644 index 000000000..58023cb60 --- /dev/null +++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/types12.go @@ -0,0 +1,71 @@ +// +build cl12 + +package cl + +// #ifdef __APPLE__ +// #include "OpenCL/opencl.h" +// #else +// #include "cl.h" +// #endif +import "C" + +const ( + ChannelDataTypeUNormInt24 ChannelDataType = C.CL_UNORM_INT24 + ChannelOrderDepth ChannelOrder = C.CL_DEPTH + ChannelOrderDepthStencil ChannelOrder = C.CL_DEPTH_STENCIL + MemHostNoAccess MemFlag = C.CL_MEM_HOST_NO_ACCESS // OpenCL 1.2 + MemHostReadOnly MemFlag = C.CL_MEM_HOST_READ_ONLY // OpenCL 1.2 + MemHostWriteOnly MemFlag = C.CL_MEM_HOST_WRITE_ONLY // OpenCL 1.2 + MemObjectTypeImage1D MemObjectType = C.CL_MEM_OBJECT_IMAGE1D + MemObjectTypeImage1DArray MemObjectType = C.CL_MEM_OBJECT_IMAGE1D_ARRAY + MemObjectTypeImage1DBuffer MemObjectType = C.CL_MEM_OBJECT_IMAGE1D_BUFFER + MemObjectTypeImage2DArray MemObjectType = C.CL_MEM_OBJECT_IMAGE2D_ARRAY + // This flag specifies that the region being mapped in the memory object is being mapped for writing. + // + // The contents of the region being mapped are to be discarded. This is typically the case when the + // region being mapped is overwritten by the host. This flag allows the implementation to no longer + // guarantee that the pointer returned by clEnqueueMapBuffer or clEnqueueMapImage contains the + // latest bits in the region being mapped which can be a significant performance enhancement. + MapFlagWriteInvalidateRegion MapFlag = C.CL_MAP_WRITE_INVALIDATE_REGION +) + +func init() { + errorMap[C.CL_COMPILE_PROGRAM_FAILURE] = ErrCompileProgramFailure + errorMap[C.CL_DEVICE_PARTITION_FAILED] = ErrDevicePartitionFailed + errorMap[C.CL_INVALID_COMPILER_OPTIONS] = ErrInvalidCompilerOptions + errorMap[C.CL_INVALID_DEVICE_PARTITION_COUNT] = ErrInvalidDevicePartitionCount + errorMap[C.CL_INVALID_IMAGE_DESCRIPTOR] = ErrInvalidImageDescriptor + errorMap[C.CL_INVALID_LINKER_OPTIONS] = ErrInvalidLinkerOptions + errorMap[C.CL_KERNEL_ARG_INFO_NOT_AVAILABLE] = ErrKernelArgInfoNotAvailable + errorMap[C.CL_LINK_PROGRAM_FAILURE] = ErrLinkProgramFailure + errorMap[C.CL_LINKER_NOT_AVAILABLE] = ErrLinkerNotAvailable + channelOrderNameMap[ChannelOrderDepth] = "Depth" + channelOrderNameMap[ChannelOrderDepthStencil] = "DepthStencil" + channelDataTypeNameMap[ChannelDataTypeUNormInt24] = "UNormInt24" +} + +type ImageDescription struct { + Type MemObjectType + Width, Height, Depth int + ArraySize, RowPitch, SlicePitch int + NumMipLevels, NumSamples int + Buffer *MemObject +} + +func (d ImageDescription) toCl() C.cl_image_desc { + var desc C.cl_image_desc + desc.image_type = C.cl_mem_object_type(d.Type) + desc.image_width = C.size_t(d.Width) + desc.image_height = C.size_t(d.Height) + desc.image_depth = C.size_t(d.Depth) + desc.image_array_size = C.size_t(d.ArraySize) + desc.image_row_pitch = C.size_t(d.RowPitch) + desc.image_slice_pitch = C.size_t(d.SlicePitch) + desc.num_mip_levels = C.cl_uint(d.NumMipLevels) + desc.num_samples = C.cl_uint(d.NumSamples) + desc.buffer = nil + if d.Buffer != nil { + desc.buffer = d.Buffer.clMem + } + return desc +} diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/types_darwin.go b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/types_darwin.go new file mode 100644 index 000000000..ddcf74906 --- /dev/null +++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/types_darwin.go @@ -0,0 +1,45 @@ +package cl + +// #ifdef __APPLE__ +// #include "OpenCL/opencl.h" +// #else +// #include "cl.h" +// #endif +import "C" + +// Extension: cl_APPLE_fixed_alpha_channel_orders +// +// These selectors may be passed to clCreateImage2D() in the cl_image_format.image_channel_order field. +// They are like CL_BGRA and CL_ARGB except that the alpha channel to be ignored. On calls to read_imagef, +// the alpha will be 0xff (1.0f) if the sample falls in the image and 0 if it does not fall in the image. +// On calls to write_imagef, the alpha value is ignored and 0xff (1.0f) is written. These formats are +// currently only available for the CL_UNORM_INT8 cl_channel_type. They are intended to support legacy +// image formats. +const ( + ChannelOrder1RGBApple ChannelOrder = C.CL_1RGB_APPLE // Introduced in MacOS X.7. + ChannelOrderBGR1Apple ChannelOrder = C.CL_BGR1_APPLE // Introduced in MacOS X.7. +) + +// Extension: cl_APPLE_biased_fixed_point_image_formats +// +// This selector may be passed to clCreateImage2D() in the cl_image_format.image_channel_data_type field. +// It defines a biased signed 1.14 fixed point storage format, with range [-1, 3). The conversion from +// float to this fixed point format is defined as follows: +// +// ushort float_to_sfixed14( float x ){ +// int i = convert_int_sat_rte( x * 0x1.0p14f ); // scale [-1, 3.0) to [-16384, 3*16384), round to nearest integer +// i = add_sat( i, 0x4000 ); // apply bias, to convert to [0, 65535) range +// return convert_ushort_sat(i); // clamp to destination size +// } +// +// The inverse conversion is the reverse process. The formats are currently only available on the CPU with +// the CL_RGBA channel layout. +const ( + ChannelDataTypeSFixed14Apple ChannelDataType = C.CL_SFIXED14_APPLE // Introduced in MacOS X.7. +) + +func init() { + channelOrderNameMap[ChannelOrder1RGBApple] = "1RGBApple" + channelOrderNameMap[ChannelOrderBGR1Apple] = "RGB1Apple" + channelDataTypeNameMap[ChannelDataTypeSFixed14Apple] = "SFixed14Apple" +} diff --git a/Godeps/_workspace/src/github.com/ethereum/ethash/ethash.go b/Godeps/_workspace/src/github.com/ethereum/ethash/ethash.go index d0864da7f..ddb8ba583 100644 --- a/Godeps/_workspace/src/github.com/ethereum/ethash/ethash.go +++ b/Godeps/_workspace/src/github.com/ethereum/ethash/ethash.go @@ -30,8 +30,8 @@ import ( ) var ( - minDifficulty = new(big.Int).Exp(big.NewInt(2), big.NewInt(256), big.NewInt(0)) - sharedLight = new(Light) + maxUint256 = new(big.Int).Exp(big.NewInt(2), big.NewInt(256), big.NewInt(0)) + sharedLight = new(Light) ) const ( @@ -140,7 +140,7 @@ func (l *Light) Verify(block pow.Block) bool { // the finalizer before the call completes. _ = cache // The actual check. - target := new(big.Int).Div(minDifficulty, difficulty) + target := new(big.Int).Div(maxUint256, difficulty) return h256ToHash(ret.result).Big().Cmp(target) <= 0 } @@ -199,7 +199,7 @@ func (d *dag) generate() { if d.dir == "" { d.dir = DefaultDir } - glog.V(logger.Info).Infof("Generating DAG for epoch %d (%x)", d.epoch, seedHash) + glog.V(logger.Info).Infof("Generating DAG for epoch %d (size %d) (%x)", d.epoch, dagSize, seedHash) // Generate a temporary cache. // TODO: this could share the cache with Light cache := C.ethash_light_new_internal(cacheSize, (*C.ethash_h256_t)(unsafe.Pointer(&seedHash[0]))) @@ -220,14 +220,18 @@ func (d *dag) generate() { }) } -func freeDAG(h *dag) { - C.ethash_full_delete(h.ptr) - h.ptr = nil +func freeDAG(d *dag) { + C.ethash_full_delete(d.ptr) + d.ptr = nil +} + +func (d *dag) Ptr() unsafe.Pointer { + return unsafe.Pointer(d.ptr.data) } //export ethashGoCallback func ethashGoCallback(percent C.unsigned) C.int { - glog.V(logger.Info).Infof("Still generating DAG: %d%%", percent) + glog.V(logger.Info).Infof("Generating DAG: %d%%", percent) return 0 } @@ -273,7 +277,7 @@ func (pow *Full) getDAG(blockNum uint64) (d *dag) { return d } -func (pow *Full) Search(block pow.Block, stop <-chan struct{}) (nonce uint64, mixDigest []byte) { +func (pow *Full) Search(block pow.Block, stop <-chan struct{}, index int) (nonce uint64, mixDigest []byte) { dag := pow.getDAG(block.NumberU64()) r := rand.New(rand.NewSource(time.Now().UnixNano())) @@ -286,7 +290,7 @@ func (pow *Full) Search(block pow.Block, stop <-chan struct{}) (nonce uint64, mi nonce = uint64(r.Int63()) hash := hashToH256(block.HashNoNonce()) - target := new(big.Int).Div(minDifficulty, diff) + target := new(big.Int).Div(maxUint256, diff) for { select { case <-stop: diff --git a/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_opencl.go b/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_opencl.go new file mode 100644 index 000000000..332b7f524 --- /dev/null +++ b/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_opencl.go @@ -0,0 +1,629 @@ +// Copyright 2014 The go-ethereum Authors +// This file is part of the go-ethereum library. +// +// The go-ethereum library is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// The go-ethereum library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>. + +// +build opencl + +package ethash + +//#cgo LDFLAGS: -w +//#include <stdint.h> +//#include <string.h> +//#include "src/libethash/internal.h" +import "C" + +import ( + crand "crypto/rand" + "encoding/binary" + "fmt" + "math" + "math/big" + mrand "math/rand" + "strconv" + "strings" + "sync" + "sync/atomic" + "time" + "unsafe" + + "github.com/Gustav-Simonsson/go-opencl/cl" + "github.com/ethereum/go-ethereum/common" + "github.com/ethereum/go-ethereum/pow" +) + +/* + + This code have two main entry points: + + 1. The initCL(...) function configures one or more OpenCL device + (for now only GPU) and loads the Ethash DAG onto device memory + + 2. The Search(...) function loads a Ethash nonce into device(s) memory and + executes the Ethash OpenCL kernel. + + Throughout the code, we refer to "host memory" and "device memory". + For most systems (e.g. regular PC GPU miner) the host memory is RAM and + device memory is the GPU global memory (e.g. GDDR5). + + References mentioned in code comments: + + 1. https://github.com/ethereum/wiki/wiki/Ethash + 2. https://github.com/ethereum/cpp-ethereum/blob/develop/libethash-cl/ethash_cl_miner.cpp + 3. https://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/ + 4. http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2013/12/AMD_OpenCL_Programming_User_Guide.pdf + +*/ + +type OpenCLDevice struct { + deviceId int + device *cl.Device + openCL11 bool // OpenCL version 1.1 and 1.2 are handled a bit different + openCL12 bool + + dagBuf *cl.MemObject // Ethash full DAG in device mem + headerBuf *cl.MemObject // Hash of block-to-mine in device mem + searchBuffers []*cl.MemObject + + searchKernel *cl.Kernel + hashKernel *cl.Kernel + + queue *cl.CommandQueue + ctx *cl.Context + workGroupSize int + + nonceRand *mrand.Rand // seeded by crypto/rand, see comments where it's initialised + result common.Hash +} + +type OpenCLMiner struct { + mu sync.Mutex + + ethash *Ethash // Ethash full DAG & cache in host mem + + deviceIds []int + devices []*OpenCLDevice + + dagSize uint64 + + hashRate int32 // Go atomics & uint64 have some issues; int32 is supported on all platforms +} + +type pendingSearch struct { + bufIndex uint32 + startNonce uint64 +} + +const ( + SIZEOF_UINT32 = 4 + + // See [1] + ethashMixBytesLen = 128 + ethashAccesses = 64 + + // See [4] + workGroupSize = 32 // must be multiple of 8 + maxSearchResults = 63 + searchBufSize = 2 + globalWorkSize = 1024 * 256 +) + +func NewCL(deviceIds []int) *OpenCLMiner { + ids := make([]int, len(deviceIds)) + copy(ids, deviceIds) + return &OpenCLMiner{ + ethash: New(), + dagSize: 0, // to see if we need to update DAG. + deviceIds: ids, + } +} + +func PrintDevices() { + fmt.Println("=============================================") + fmt.Println("============ OpenCL Device Info =============") + fmt.Println("=============================================") + + var found []*cl.Device + + platforms, err := cl.GetPlatforms() + if err != nil { + fmt.Println("Plaform error (check your OpenCL installation): %v", err) + return + } + + for i, p := range platforms { + fmt.Println("Platform id ", i) + fmt.Println("Platform Name ", p.Name()) + fmt.Println("Platform Vendor ", p.Vendor()) + fmt.Println("Platform Version ", p.Version()) + fmt.Println("Platform Extensions ", p.Extensions()) + fmt.Println("Platform Profile ", p.Profile()) + fmt.Println("") + + devices, err := cl.GetDevices(p, cl.DeviceTypeGPU) + if err != nil { + fmt.Println("Device error (check your GPU drivers) :", err) + return + } + + for _, d := range devices { + fmt.Println("Device OpenCL id ", i) + fmt.Println("Device id for mining ", len(found)) + fmt.Println("Device Name ", d.Name()) + fmt.Println("Vendor ", d.Vendor()) + fmt.Println("Version ", d.Version()) + fmt.Println("Driver version ", d.DriverVersion()) + fmt.Println("Address bits ", d.AddressBits()) + fmt.Println("Max clock freq ", d.MaxClockFrequency()) + fmt.Println("Global mem size ", d.GlobalMemSize()) + fmt.Println("Max constant buffer size", d.MaxConstantBufferSize()) + fmt.Println("Max mem alloc size ", d.MaxMemAllocSize()) + fmt.Println("Max compute units ", d.MaxComputeUnits()) + fmt.Println("Max work group size ", d.MaxWorkGroupSize()) + fmt.Println("Max work item sizes ", d.MaxWorkItemSizes()) + fmt.Println("=============================================") + + found = append(found, d) + } + } + if len(found) == 0 { + fmt.Println("Found no GPU(s). Check that your OS can see the GPU(s)") + } else { + var idsFormat string + for i := 0; i < len(found); i++ { + idsFormat += strconv.Itoa(i) + if i != len(found)-1 { + idsFormat += "," + } + } + fmt.Printf("Found %v devices. Benchmark first GPU: geth gpubench 0\n", len(found)) + fmt.Printf("Mine using all GPUs: geth --minegpu %v\n", idsFormat) + } +} + +// See [2]. We basically do the same here, but the Go OpenCL bindings +// are at a slightly higher abtraction level. +func InitCL(blockNum uint64, c *OpenCLMiner) error { + platforms, err := cl.GetPlatforms() + if err != nil { + return fmt.Errorf("Plaform error: %v\nCheck your OpenCL installation and then run geth gpuinfo", err) + } + + var devices []*cl.Device + for _, p := range platforms { + ds, err := cl.GetDevices(p, cl.DeviceTypeGPU) + if err != nil { + return fmt.Errorf("Devices error: %v\nCheck your GPU drivers and then run geth gpuinfo", err) + } + for _, d := range ds { + devices = append(devices, d) + } + } + + pow := New() + _ = pow.getDAG(blockNum) // generates DAG if we don't have it + pow.Light.getCache(blockNum) // and cache + + c.ethash = pow + dagSize := uint64(C.ethash_get_datasize(C.uint64_t(blockNum))) + c.dagSize = dagSize + + for _, id := range c.deviceIds { + if id > len(devices)-1 { + return fmt.Errorf("Device id not found. See available device ids with: geth gpuinfo") + } else { + err := initCLDevice(id, devices[id], c) + if err != nil { + return err + } + } + } + if len(c.devices) == 0 { + return fmt.Errorf("No GPU devices found") + } + return nil +} + +func initCLDevice(deviceId int, device *cl.Device, c *OpenCLMiner) error { + devMaxAlloc := uint64(device.MaxMemAllocSize()) + devGlobalMem := uint64(device.GlobalMemSize()) + + // TODO: more fine grained version logic + if device.Version() == "OpenCL 1.0" { + fmt.Println("Device OpenCL version not supported: ", device.Version()) + return fmt.Errorf("opencl version not supported") + } + + var cl11, cl12 bool + if device.Version() == "OpenCL 1.1" { + cl11 = true + } + if device.Version() == "OpenCL 1.2" { + cl12 = true + } + + // log warnings but carry on; some device drivers report inaccurate values + if c.dagSize > devGlobalMem { + fmt.Printf("WARNING: device memory may be insufficient: %v. DAG size: %v.\n", devGlobalMem, c.dagSize) + } + + if c.dagSize > devMaxAlloc { + fmt.Printf("WARNING: DAG size (%v) larger than device max memory allocation size (%v).\n", c.dagSize, devMaxAlloc) + fmt.Printf("You probably have to export GPU_MAX_ALLOC_PERCENT=95\n") + } + + fmt.Printf("Initialising device %v: %v\n", deviceId, device.Name()) + + context, err := cl.CreateContext([]*cl.Device{device}) + if err != nil { + return fmt.Errorf("failed creating context:", err) + } + + // TODO: test running with CL_QUEUE_PROFILING_ENABLE for profiling? + queue, err := context.CreateCommandQueue(device, 0) + if err != nil { + return fmt.Errorf("command queue err:", err) + } + + // See [4] section 3.2 and [3] "clBuildProgram". + // The OpenCL kernel code is compiled at run-time. + kvs := make(map[string]string, 4) + kvs["GROUP_SIZE"] = strconv.FormatUint(workGroupSize, 10) + kvs["DAG_SIZE"] = strconv.FormatUint(c.dagSize/ethashMixBytesLen, 10) + kvs["ACCESSES"] = strconv.FormatUint(ethashAccesses, 10) + kvs["MAX_OUTPUTS"] = strconv.FormatUint(maxSearchResults, 10) + kernelCode := replaceWords(kernel, kvs) + + program, err := context.CreateProgramWithSource([]string{kernelCode}) + if err != nil { + return fmt.Errorf("program err:", err) + } + + /* if using AMD OpenCL impl, you can set this to debug on x86 CPU device. + see AMD OpenCL programming guide section 4.2 + + export in shell before running: + export AMD_OCL_BUILD_OPTIONS_APPEND="-g -O0" + export CPU_MAX_COMPUTE_UNITS=1 + + buildOpts := "-g -cl-opt-disable" + + */ + buildOpts := "" + err = program.BuildProgram([]*cl.Device{device}, buildOpts) + if err != nil { + return fmt.Errorf("program build err:", err) + } + + var searchKernelName, hashKernelName string + searchKernelName = "ethash_search" + hashKernelName = "ethash_hash" + + searchKernel, err := program.CreateKernel(searchKernelName) + hashKernel, err := program.CreateKernel(hashKernelName) + if err != nil { + return fmt.Errorf("kernel err:", err) + } + + // TODO: when this DAG size appears, patch the Go bindings + // (context.go) to work with uint64 as size_t + if c.dagSize > math.MaxInt32 { + fmt.Println("DAG too large for allocation.") + return fmt.Errorf("DAG too large for alloc") + } + + // TODO: patch up Go bindings to work with size_t, will overflow if > maxint32 + // TODO: fuck. shit's gonna overflow around 2017-06-09 12:17:02 + dagBuf := *(new(*cl.MemObject)) + dagBuf, err = context.CreateEmptyBuffer(cl.MemReadOnly, int(c.dagSize)) + if err != nil { + return fmt.Errorf("allocating dag buf failed: ", err) + } + + // write DAG to device mem + dagPtr := unsafe.Pointer(c.ethash.Full.current.ptr.data) + _, err = queue.EnqueueWriteBuffer(dagBuf, true, 0, int(c.dagSize), dagPtr, nil) + if err != nil { + return fmt.Errorf("writing to dag buf failed: ", err) + } + + searchBuffers := make([]*cl.MemObject, searchBufSize) + for i := 0; i < searchBufSize; i++ { + searchBuff, err := context.CreateEmptyBuffer(cl.MemWriteOnly, (1+maxSearchResults)*SIZEOF_UINT32) + if err != nil { + return fmt.Errorf("search buffer err:", err) + } + searchBuffers[i] = searchBuff + } + + headerBuf, err := context.CreateEmptyBuffer(cl.MemReadOnly, 32) + if err != nil { + return fmt.Errorf("header buffer err:", err) + } + + // Unique, random nonces are crucial for mining efficieny. + // While we do not need cryptographically secure PRNG for nonces, + // we want to have uniform distribution and minimal repetition of nonces. + // We could guarantee strict uniqueness of nonces by generating unique ranges, + // but a int64 seed from crypto/rand should be good enough. + // we then use math/rand for speed and to avoid draining OS entropy pool + seed, err := crand.Int(crand.Reader, big.NewInt(math.MaxInt64)) + if err != nil { + return err + } + nonceRand := mrand.New(mrand.NewSource(seed.Int64())) + + deviceStruct := &OpenCLDevice{ + deviceId: deviceId, + device: device, + openCL11: cl11, + openCL12: cl12, + + dagBuf: dagBuf, + headerBuf: headerBuf, + searchBuffers: searchBuffers, + + searchKernel: searchKernel, + hashKernel: hashKernel, + + queue: queue, + ctx: context, + + workGroupSize: workGroupSize, + + nonceRand: nonceRand, + } + c.devices = append(c.devices, deviceStruct) + + return nil +} + +func (c *OpenCLMiner) Search(block pow.Block, stop <-chan struct{}, index int) (uint64, []byte) { + c.mu.Lock() + newDagSize := uint64(C.ethash_get_datasize(C.uint64_t(block.NumberU64()))) + if newDagSize > c.dagSize { + // TODO: clean up buffers from previous DAG? + err := InitCL(block.NumberU64(), c) + if err != nil { + fmt.Println("OpenCL init error: ", err) + return 0, []byte{0} + } + } + defer c.mu.Unlock() + + // Avoid unneeded OpenCL initialisation if we received stop while running InitCL + select { + case <-stop: + return 0, []byte{0} + default: + } + + headerHash := block.HashNoNonce() + diff := block.Difficulty() + target256 := new(big.Int).Div(maxUint256, diff) + target64 := new(big.Int).Rsh(target256, 192).Uint64() + var zero uint32 = 0 + + d := c.devices[index] + + _, err := d.queue.EnqueueWriteBuffer(d.headerBuf, false, 0, 32, unsafe.Pointer(&headerHash[0]), nil) + if err != nil { + fmt.Println("Error in Search clEnqueueWriterBuffer : ", err) + return 0, []byte{0} + } + + for i := 0; i < searchBufSize; i++ { + _, err := d.queue.EnqueueWriteBuffer(d.searchBuffers[i], false, 0, 4, unsafe.Pointer(&zero), nil) + if err != nil { + fmt.Println("Error in Search clEnqueueWriterBuffer : ", err) + return 0, []byte{0} + } + } + + // wait for all search buffers to complete + err = d.queue.Finish() + if err != nil { + fmt.Println("Error in Search clFinish : ", err) + return 0, []byte{0} + } + + err = d.searchKernel.SetArg(1, d.headerBuf) + if err != nil { + fmt.Println("Error in Search clSetKernelArg : ", err) + return 0, []byte{0} + } + + err = d.searchKernel.SetArg(2, d.dagBuf) + if err != nil { + fmt.Println("Error in Search clSetKernelArg : ", err) + return 0, []byte{0} + } + + err = d.searchKernel.SetArg(4, target64) + if err != nil { + fmt.Println("Error in Search clSetKernelArg : ", err) + return 0, []byte{0} + } + err = d.searchKernel.SetArg(5, uint32(math.MaxUint32)) + if err != nil { + fmt.Println("Error in Search clSetKernelArg : ", err) + return 0, []byte{0} + } + + // wait on this before returning + var preReturnEvent *cl.Event + if d.openCL12 { + preReturnEvent, err = d.ctx.CreateUserEvent() + if err != nil { + fmt.Println("Error in Search create CL user event : ", err) + return 0, []byte{0} + } + } + + pending := make([]pendingSearch, 0, searchBufSize) + var p *pendingSearch + searchBufIndex := uint32(0) + var checkNonce uint64 + loops := int64(0) + prevHashRate := int32(0) + start := time.Now().UnixNano() + // we grab a single random nonce and sets this as argument to the kernel search function + // the device will then add each local threads gid to the nonce, creating a unique nonce + // for each device computing unit executing in parallel + initNonce := uint64(d.nonceRand.Int63()) + for nonce := initNonce; ; nonce += uint64(globalWorkSize) { + select { + case <-stop: + + /* + if d.openCL12 { + err = cl.WaitForEvents([]*cl.Event{preReturnEvent}) + if err != nil { + fmt.Println("Error in Search WaitForEvents: ", err) + } + } + */ + + atomic.AddInt32(&c.hashRate, -prevHashRate) + return 0, []byte{0} + default: + } + + if (loops % (1 << 7)) == 0 { + elapsed := time.Now().UnixNano() - start + // TODO: verify if this is correct hash rate calculation + hashes := (float64(1e9) / float64(elapsed)) * float64(loops*1024*256) + hashrateDiff := int32(hashes) - prevHashRate + prevHashRate = int32(hashes) + atomic.AddInt32(&c.hashRate, hashrateDiff) + } + loops++ + + err = d.searchKernel.SetArg(0, d.searchBuffers[searchBufIndex]) + if err != nil { + fmt.Println("Error in Search clSetKernelArg : ", err) + return 0, []byte{0} + } + err = d.searchKernel.SetArg(3, nonce) + if err != nil { + fmt.Println("Error in Search clSetKernelArg : ", err) + return 0, []byte{0} + } + + // execute kernel + _, err := d.queue.EnqueueNDRangeKernel( + d.searchKernel, + []int{0}, + []int{globalWorkSize}, + []int{d.workGroupSize}, + nil) + if err != nil { + fmt.Println("Error in Search clEnqueueNDRangeKernel : ", err) + return 0, []byte{0} + } + + pending = append(pending, pendingSearch{bufIndex: searchBufIndex, startNonce: nonce}) + searchBufIndex = (searchBufIndex + 1) % searchBufSize + + if len(pending) == searchBufSize { + p = &(pending[searchBufIndex]) + cres, _, err := d.queue.EnqueueMapBuffer(d.searchBuffers[p.bufIndex], true, + cl.MapFlagRead, 0, (1+maxSearchResults)*SIZEOF_UINT32, + nil) + if err != nil { + fmt.Println("Error in Search clEnqueueMapBuffer: ", err) + return 0, []byte{0} + } + + results := cres.ByteSlice() + nfound := binary.LittleEndian.Uint32(results) + nfound = uint32(math.Min(float64(nfound), float64(maxSearchResults))) + // OpenCL returns the offsets from the start nonce + for i := uint32(0); i < nfound; i++ { + lo := (i + 1) * SIZEOF_UINT32 + hi := (i + 2) * SIZEOF_UINT32 + upperNonce := uint64(binary.LittleEndian.Uint32(results[lo:hi])) + checkNonce = p.startNonce + upperNonce + if checkNonce != 0 { + cn := C.uint64_t(checkNonce) + ds := C.uint64_t(c.dagSize) + // We verify that the nonce is indeed a solution by + // executing the Ethash verification function (on the CPU). + ret := C.ethash_light_compute_internal(c.ethash.Light.current.ptr, ds, hashToH256(headerHash), cn) + // TODO: return result first + if ret.success && h256ToHash(ret.result).Big().Cmp(target256) <= 0 { + _, err = d.queue.EnqueueUnmapMemObject(d.searchBuffers[p.bufIndex], cres, nil) + if err != nil { + fmt.Println("Error in Search clEnqueueUnmapMemObject: ", err) + } + if d.openCL12 { + err = cl.WaitForEvents([]*cl.Event{preReturnEvent}) + if err != nil { + fmt.Println("Error in Search WaitForEvents: ", err) + } + } + return checkNonce, C.GoBytes(unsafe.Pointer(&ret.mix_hash), C.int(32)) + } + + _, err := d.queue.EnqueueWriteBuffer(d.searchBuffers[p.bufIndex], false, 0, 4, unsafe.Pointer(&zero), nil) + if err != nil { + fmt.Println("Error in Search cl: EnqueueWriteBuffer", err) + return 0, []byte{0} + } + } + } + _, err = d.queue.EnqueueUnmapMemObject(d.searchBuffers[p.bufIndex], cres, nil) + if err != nil { + fmt.Println("Error in Search clEnqueueUnMapMemObject: ", err) + return 0, []byte{0} + } + pending = append(pending[:searchBufIndex], pending[searchBufIndex+1:]...) + } + } + if d.openCL12 { + err := cl.WaitForEvents([]*cl.Event{preReturnEvent}) + if err != nil { + fmt.Println("Error in Search clWaitForEvents: ", err) + return 0, []byte{0} + } + } + return 0, []byte{0} +} + +func (c *OpenCLMiner) Verify(block pow.Block) bool { + return c.ethash.Light.Verify(block) +} +func (c *OpenCLMiner) GetHashrate() int64 { + return int64(atomic.LoadInt32(&c.hashRate)) +} +func (c *OpenCLMiner) Turbo(on bool) { + // This is GPU mining. Always be turbo. +} + +func replaceWords(text string, kvs map[string]string) string { + for k, v := range kvs { + text = strings.Replace(text, k, v, -1) + } + return text +} + +func logErr(err error) { + if err != nil { + fmt.Println("Error in OpenCL call:", err) + } +} + +func argErr(err error) error { + return fmt.Errorf("arg err: %v", err) +} diff --git a/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_opencl_kernel_go_str.go b/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_opencl_kernel_go_str.go new file mode 100644 index 000000000..695ff1829 --- /dev/null +++ b/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_opencl_kernel_go_str.go @@ -0,0 +1,600 @@ +package ethash + +/* DO NOT EDIT!!! + + This code is version controlled at + https://github.com/ethereum/cpp-ethereum/blob/develop/libethash-cl/ethash_cl_miner_kernel.cl + + If needed change it there first, then copy over here. +*/ + +const kernel = ` +// author Tim Hughes <tim@twistedfury.com> +// Tested on Radeon HD 7850 +// Hashrate: 15940347 hashes/s +// Bandwidth: 124533 MB/s +// search kernel should fit in <= 84 VGPRS (3 wavefronts) + +#define THREADS_PER_HASH (128 / 16) +#define HASHES_PER_LOOP (GROUP_SIZE / THREADS_PER_HASH) + +#define FNV_PRIME 0x01000193 + +__constant uint2 const Keccak_f1600_RC[24] = { + (uint2)(0x00000001, 0x00000000), + (uint2)(0x00008082, 0x00000000), + (uint2)(0x0000808a, 0x80000000), + (uint2)(0x80008000, 0x80000000), + (uint2)(0x0000808b, 0x00000000), + (uint2)(0x80000001, 0x00000000), + (uint2)(0x80008081, 0x80000000), + (uint2)(0x00008009, 0x80000000), + (uint2)(0x0000008a, 0x00000000), + (uint2)(0x00000088, 0x00000000), + (uint2)(0x80008009, 0x00000000), + (uint2)(0x8000000a, 0x00000000), + (uint2)(0x8000808b, 0x00000000), + (uint2)(0x0000008b, 0x80000000), + (uint2)(0x00008089, 0x80000000), + (uint2)(0x00008003, 0x80000000), + (uint2)(0x00008002, 0x80000000), + (uint2)(0x00000080, 0x80000000), + (uint2)(0x0000800a, 0x00000000), + (uint2)(0x8000000a, 0x80000000), + (uint2)(0x80008081, 0x80000000), + (uint2)(0x00008080, 0x80000000), + (uint2)(0x80000001, 0x00000000), + (uint2)(0x80008008, 0x80000000), +}; + +void keccak_f1600_round(uint2* a, uint r, uint out_size) +{ + #if !__ENDIAN_LITTLE__ + for (uint i = 0; i != 25; ++i) + a[i] = a[i].yx; + #endif + + uint2 b[25]; + uint2 t; + + // Theta + b[0] = a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20]; + b[1] = a[1] ^ a[6] ^ a[11] ^ a[16] ^ a[21]; + b[2] = a[2] ^ a[7] ^ a[12] ^ a[17] ^ a[22]; + b[3] = a[3] ^ a[8] ^ a[13] ^ a[18] ^ a[23]; + b[4] = a[4] ^ a[9] ^ a[14] ^ a[19] ^ a[24]; + t = b[4] ^ (uint2)(b[1].x << 1 | b[1].y >> 31, b[1].y << 1 | b[1].x >> 31); + a[0] ^= t; + a[5] ^= t; + a[10] ^= t; + a[15] ^= t; + a[20] ^= t; + t = b[0] ^ (uint2)(b[2].x << 1 | b[2].y >> 31, b[2].y << 1 | b[2].x >> 31); + a[1] ^= t; + a[6] ^= t; + a[11] ^= t; + a[16] ^= t; + a[21] ^= t; + t = b[1] ^ (uint2)(b[3].x << 1 | b[3].y >> 31, b[3].y << 1 | b[3].x >> 31); + a[2] ^= t; + a[7] ^= t; + a[12] ^= t; + a[17] ^= t; + a[22] ^= t; + t = b[2] ^ (uint2)(b[4].x << 1 | b[4].y >> 31, b[4].y << 1 | b[4].x >> 31); + a[3] ^= t; + a[8] ^= t; + a[13] ^= t; + a[18] ^= t; + a[23] ^= t; + t = b[3] ^ (uint2)(b[0].x << 1 | b[0].y >> 31, b[0].y << 1 | b[0].x >> 31); + a[4] ^= t; + a[9] ^= t; + a[14] ^= t; + a[19] ^= t; + a[24] ^= t; + + // Rho Pi + b[0] = a[0]; + b[10] = (uint2)(a[1].x << 1 | a[1].y >> 31, a[1].y << 1 | a[1].x >> 31); + b[7] = (uint2)(a[10].x << 3 | a[10].y >> 29, a[10].y << 3 | a[10].x >> 29); + b[11] = (uint2)(a[7].x << 6 | a[7].y >> 26, a[7].y << 6 | a[7].x >> 26); + b[17] = (uint2)(a[11].x << 10 | a[11].y >> 22, a[11].y << 10 | a[11].x >> 22); + b[18] = (uint2)(a[17].x << 15 | a[17].y >> 17, a[17].y << 15 | a[17].x >> 17); + b[3] = (uint2)(a[18].x << 21 | a[18].y >> 11, a[18].y << 21 | a[18].x >> 11); + b[5] = (uint2)(a[3].x << 28 | a[3].y >> 4, a[3].y << 28 | a[3].x >> 4); + b[16] = (uint2)(a[5].y << 4 | a[5].x >> 28, a[5].x << 4 | a[5].y >> 28); + b[8] = (uint2)(a[16].y << 13 | a[16].x >> 19, a[16].x << 13 | a[16].y >> 19); + b[21] = (uint2)(a[8].y << 23 | a[8].x >> 9, a[8].x << 23 | a[8].y >> 9); + b[24] = (uint2)(a[21].x << 2 | a[21].y >> 30, a[21].y << 2 | a[21].x >> 30); + b[4] = (uint2)(a[24].x << 14 | a[24].y >> 18, a[24].y << 14 | a[24].x >> 18); + b[15] = (uint2)(a[4].x << 27 | a[4].y >> 5, a[4].y << 27 | a[4].x >> 5); + b[23] = (uint2)(a[15].y << 9 | a[15].x >> 23, a[15].x << 9 | a[15].y >> 23); + b[19] = (uint2)(a[23].y << 24 | a[23].x >> 8, a[23].x << 24 | a[23].y >> 8); + b[13] = (uint2)(a[19].x << 8 | a[19].y >> 24, a[19].y << 8 | a[19].x >> 24); + b[12] = (uint2)(a[13].x << 25 | a[13].y >> 7, a[13].y << 25 | a[13].x >> 7); + b[2] = (uint2)(a[12].y << 11 | a[12].x >> 21, a[12].x << 11 | a[12].y >> 21); + b[20] = (uint2)(a[2].y << 30 | a[2].x >> 2, a[2].x << 30 | a[2].y >> 2); + b[14] = (uint2)(a[20].x << 18 | a[20].y >> 14, a[20].y << 18 | a[20].x >> 14); + b[22] = (uint2)(a[14].y << 7 | a[14].x >> 25, a[14].x << 7 | a[14].y >> 25); + b[9] = (uint2)(a[22].y << 29 | a[22].x >> 3, a[22].x << 29 | a[22].y >> 3); + b[6] = (uint2)(a[9].x << 20 | a[9].y >> 12, a[9].y << 20 | a[9].x >> 12); + b[1] = (uint2)(a[6].y << 12 | a[6].x >> 20, a[6].x << 12 | a[6].y >> 20); + + // Chi + a[0] = bitselect(b[0] ^ b[2], b[0], b[1]); + a[1] = bitselect(b[1] ^ b[3], b[1], b[2]); + a[2] = bitselect(b[2] ^ b[4], b[2], b[3]); + a[3] = bitselect(b[3] ^ b[0], b[3], b[4]); + if (out_size >= 4) + { + a[4] = bitselect(b[4] ^ b[1], b[4], b[0]); + a[5] = bitselect(b[5] ^ b[7], b[5], b[6]); + a[6] = bitselect(b[6] ^ b[8], b[6], b[7]); + a[7] = bitselect(b[7] ^ b[9], b[7], b[8]); + a[8] = bitselect(b[8] ^ b[5], b[8], b[9]); + if (out_size >= 8) + { + a[9] = bitselect(b[9] ^ b[6], b[9], b[5]); + a[10] = bitselect(b[10] ^ b[12], b[10], b[11]); + a[11] = bitselect(b[11] ^ b[13], b[11], b[12]); + a[12] = bitselect(b[12] ^ b[14], b[12], b[13]); + a[13] = bitselect(b[13] ^ b[10], b[13], b[14]); + a[14] = bitselect(b[14] ^ b[11], b[14], b[10]); + a[15] = bitselect(b[15] ^ b[17], b[15], b[16]); + a[16] = bitselect(b[16] ^ b[18], b[16], b[17]); + a[17] = bitselect(b[17] ^ b[19], b[17], b[18]); + a[18] = bitselect(b[18] ^ b[15], b[18], b[19]); + a[19] = bitselect(b[19] ^ b[16], b[19], b[15]); + a[20] = bitselect(b[20] ^ b[22], b[20], b[21]); + a[21] = bitselect(b[21] ^ b[23], b[21], b[22]); + a[22] = bitselect(b[22] ^ b[24], b[22], b[23]); + a[23] = bitselect(b[23] ^ b[20], b[23], b[24]); + a[24] = bitselect(b[24] ^ b[21], b[24], b[20]); + } + } + + // Iota + a[0] ^= Keccak_f1600_RC[r]; + + #if !__ENDIAN_LITTLE__ + for (uint i = 0; i != 25; ++i) + a[i] = a[i].yx; + #endif +} + +void keccak_f1600_no_absorb(ulong* a, uint in_size, uint out_size, uint isolate) +{ + for (uint i = in_size; i != 25; ++i) + { + a[i] = 0; + } +#if __ENDIAN_LITTLE__ + a[in_size] ^= 0x0000000000000001; + a[24-out_size*2] ^= 0x8000000000000000; +#else + a[in_size] ^= 0x0100000000000000; + a[24-out_size*2] ^= 0x0000000000000080; +#endif + + // Originally I unrolled the first and last rounds to interface + // better with surrounding code, however I haven't done this + // without causing the AMD compiler to blow up the VGPR usage. + uint r = 0; + do + { + // This dynamic branch stops the AMD compiler unrolling the loop + // and additionally saves about 33% of the VGPRs, enough to gain another + // wavefront. Ideally we'd get 4 in flight, but 3 is the best I can + // massage out of the compiler. It doesn't really seem to matter how + // much we try and help the compiler save VGPRs because it seems to throw + // that information away, hence the implementation of keccak here + // doesn't bother. + if (isolate) + { + keccak_f1600_round((uint2*)a, r++, 25); + } + } + while (r < 23); + + // final round optimised for digest size + keccak_f1600_round((uint2*)a, r++, out_size); +} + +#define copy(dst, src, count) for (uint i = 0; i != count; ++i) { (dst)[i] = (src)[i]; } + +#define countof(x) (sizeof(x) / sizeof(x[0])) + +uint fnv(uint x, uint y) +{ + return x * FNV_PRIME ^ y; +} + +uint4 fnv4(uint4 x, uint4 y) +{ + return x * FNV_PRIME ^ y; +} + +uint fnv_reduce(uint4 v) +{ + return fnv(fnv(fnv(v.x, v.y), v.z), v.w); +} + +typedef union +{ + ulong ulongs[32 / sizeof(ulong)]; + uint uints[32 / sizeof(uint)]; +} hash32_t; + +typedef union +{ + ulong ulongs[64 / sizeof(ulong)]; + uint4 uint4s[64 / sizeof(uint4)]; +} hash64_t; + +typedef union +{ + uint uints[128 / sizeof(uint)]; + uint4 uint4s[128 / sizeof(uint4)]; +} hash128_t; + +hash64_t init_hash(__constant hash32_t const* header, ulong nonce, uint isolate) +{ + hash64_t init; + uint const init_size = countof(init.ulongs); + uint const hash_size = countof(header->ulongs); + + // sha3_512(header .. nonce) + ulong state[25]; + copy(state, header->ulongs, hash_size); + state[hash_size] = nonce; + keccak_f1600_no_absorb(state, hash_size + 1, init_size, isolate); + + copy(init.ulongs, state, init_size); + return init; +} + +uint inner_loop_chunks(uint4 init, uint thread_id, __local uint* share, __global hash128_t const* g_dag, __global hash128_t const* g_dag1, __global hash128_t const* g_dag2, __global hash128_t const* g_dag3, uint isolate) +{ + uint4 mix = init; + + // share init0 + if (thread_id == 0) + *share = mix.x; + barrier(CLK_LOCAL_MEM_FENCE); + uint init0 = *share; + + uint a = 0; + do + { + bool update_share = thread_id == (a/4) % THREADS_PER_HASH; + + #pragma unroll + for (uint i = 0; i != 4; ++i) + { + if (update_share) + { + uint m[4] = { mix.x, mix.y, mix.z, mix.w }; + *share = fnv(init0 ^ (a+i), m[i]) % DAG_SIZE; + } + barrier(CLK_LOCAL_MEM_FENCE); + + mix = fnv4(mix, *share>=3 * DAG_SIZE / 4 ? g_dag3[*share - 3 * DAG_SIZE / 4].uint4s[thread_id] : *share>=DAG_SIZE / 2 ? g_dag2[*share - DAG_SIZE / 2].uint4s[thread_id] : *share>=DAG_SIZE / 4 ? g_dag1[*share - DAG_SIZE / 4].uint4s[thread_id]:g_dag[*share].uint4s[thread_id]); + } + } while ((a += 4) != (ACCESSES & isolate)); + + return fnv_reduce(mix); +} + + + +uint inner_loop(uint4 init, uint thread_id, __local uint* share, __global hash128_t const* g_dag, uint isolate) +{ + uint4 mix = init; + + // share init0 + if (thread_id == 0) + *share = mix.x; + barrier(CLK_LOCAL_MEM_FENCE); + uint init0 = *share; + + uint a = 0; + do + { + bool update_share = thread_id == (a/4) % THREADS_PER_HASH; + + #pragma unroll + for (uint i = 0; i != 4; ++i) + { + if (update_share) + { + uint m[4] = { mix.x, mix.y, mix.z, mix.w }; + *share = fnv(init0 ^ (a+i), m[i]) % DAG_SIZE; + } + barrier(CLK_LOCAL_MEM_FENCE); + + mix = fnv4(mix, g_dag[*share].uint4s[thread_id]); + } + } + while ((a += 4) != (ACCESSES & isolate)); + + return fnv_reduce(mix); +} + + +hash32_t final_hash(hash64_t const* init, hash32_t const* mix, uint isolate) +{ + ulong state[25]; + + hash32_t hash; + uint const hash_size = countof(hash.ulongs); + uint const init_size = countof(init->ulongs); + uint const mix_size = countof(mix->ulongs); + + // keccak_256(keccak_512(header..nonce) .. mix); + copy(state, init->ulongs, init_size); + copy(state + init_size, mix->ulongs, mix_size); + keccak_f1600_no_absorb(state, init_size+mix_size, hash_size, isolate); + + // copy out + copy(hash.ulongs, state, hash_size); + return hash; +} + +hash32_t compute_hash_simple( + __constant hash32_t const* g_header, + __global hash128_t const* g_dag, + ulong nonce, + uint isolate + ) +{ + hash64_t init = init_hash(g_header, nonce, isolate); + + hash128_t mix; + for (uint i = 0; i != countof(mix.uint4s); ++i) + { + mix.uint4s[i] = init.uint4s[i % countof(init.uint4s)]; + } + + uint mix_val = mix.uints[0]; + uint init0 = mix.uints[0]; + uint a = 0; + do + { + uint pi = fnv(init0 ^ a, mix_val) % DAG_SIZE; + uint n = (a+1) % countof(mix.uints); + + #pragma unroll + for (uint i = 0; i != countof(mix.uints); ++i) + { + mix.uints[i] = fnv(mix.uints[i], g_dag[pi].uints[i]); + mix_val = i == n ? mix.uints[i] : mix_val; + } + } + while (++a != (ACCESSES & isolate)); + + // reduce to output + hash32_t fnv_mix; + for (uint i = 0; i != countof(fnv_mix.uints); ++i) + { + fnv_mix.uints[i] = fnv_reduce(mix.uint4s[i]); + } + + return final_hash(&init, &fnv_mix, isolate); +} + +typedef union +{ + struct + { + hash64_t init; + uint pad; // avoid lds bank conflicts + }; + hash32_t mix; +} compute_hash_share; + + +hash32_t compute_hash( + __local compute_hash_share* share, + __constant hash32_t const* g_header, + __global hash128_t const* g_dag, + ulong nonce, + uint isolate + ) +{ + uint const gid = get_global_id(0); + + // Compute one init hash per work item. + hash64_t init = init_hash(g_header, nonce, isolate); + + // Threads work together in this phase in groups of 8. + uint const thread_id = gid % THREADS_PER_HASH; + uint const hash_id = (gid % GROUP_SIZE) / THREADS_PER_HASH; + + hash32_t mix; + uint i = 0; + do + { + // share init with other threads + if (i == thread_id) + share[hash_id].init = init; + barrier(CLK_LOCAL_MEM_FENCE); + + uint4 thread_init = share[hash_id].init.uint4s[thread_id % (64 / sizeof(uint4))]; + barrier(CLK_LOCAL_MEM_FENCE); + + uint thread_mix = inner_loop(thread_init, thread_id, share[hash_id].mix.uints, g_dag, isolate); + + share[hash_id].mix.uints[thread_id] = thread_mix; + barrier(CLK_LOCAL_MEM_FENCE); + + if (i == thread_id) + mix = share[hash_id].mix; + barrier(CLK_LOCAL_MEM_FENCE); + } + while (++i != (THREADS_PER_HASH & isolate)); + + return final_hash(&init, &mix, isolate); +} + + +hash32_t compute_hash_chunks( + __local compute_hash_share* share, + __constant hash32_t const* g_header, + __global hash128_t const* g_dag, + __global hash128_t const* g_dag1, + __global hash128_t const* g_dag2, + __global hash128_t const* g_dag3, + ulong nonce, + uint isolate + ) +{ + uint const gid = get_global_id(0); + + // Compute one init hash per work item. + hash64_t init = init_hash(g_header, nonce, isolate); + + // Threads work together in this phase in groups of 8. + uint const thread_id = gid % THREADS_PER_HASH; + uint const hash_id = (gid % GROUP_SIZE) / THREADS_PER_HASH; + + hash32_t mix; + uint i = 0; + do + { + // share init with other threads + if (i == thread_id) + share[hash_id].init = init; + barrier(CLK_LOCAL_MEM_FENCE); + + uint4 thread_init = share[hash_id].init.uint4s[thread_id % (64 / sizeof(uint4))]; + barrier(CLK_LOCAL_MEM_FENCE); + + uint thread_mix = inner_loop_chunks(thread_init, thread_id, share[hash_id].mix.uints, g_dag, g_dag1, g_dag2, g_dag3, isolate); + + share[hash_id].mix.uints[thread_id] = thread_mix; + barrier(CLK_LOCAL_MEM_FENCE); + + if (i == thread_id) + mix = share[hash_id].mix; + barrier(CLK_LOCAL_MEM_FENCE); + } + while (++i != (THREADS_PER_HASH & isolate)); + + return final_hash(&init, &mix, isolate); +} + +__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) +__kernel void ethash_hash_simple( + __global hash32_t* g_hashes, + __constant hash32_t const* g_header, + __global hash128_t const* g_dag, + ulong start_nonce, + uint isolate + ) +{ + uint const gid = get_global_id(0); + g_hashes[gid] = compute_hash_simple(g_header, g_dag, start_nonce + gid, isolate); +} + +__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) +__kernel void ethash_search_simple( + __global volatile uint* restrict g_output, + __constant hash32_t const* g_header, + __global hash128_t const* g_dag, + ulong start_nonce, + ulong target, + uint isolate + ) +{ + uint const gid = get_global_id(0); + hash32_t hash = compute_hash_simple(g_header, g_dag, start_nonce + gid, isolate); + + if (hash.ulongs[countof(hash.ulongs)-1] < target) + { + uint slot = min(convert_uint(MAX_OUTPUTS), convert_uint(atomic_inc(&g_output[0]) + 1)); + g_output[slot] = gid; + } +} + + +__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) +__kernel void ethash_hash( + __global hash32_t* g_hashes, + __constant hash32_t const* g_header, + __global hash128_t const* g_dag, + ulong start_nonce, + uint isolate + ) +{ + __local compute_hash_share share[HASHES_PER_LOOP]; + + uint const gid = get_global_id(0); + g_hashes[gid] = compute_hash(share, g_header, g_dag, start_nonce + gid, isolate); +} + +__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) +__kernel void ethash_search( + __global volatile uint* restrict g_output, + __constant hash32_t const* g_header, + __global hash128_t const* g_dag, + ulong start_nonce, + ulong target, + uint isolate + ) +{ + __local compute_hash_share share[HASHES_PER_LOOP]; + + uint const gid = get_global_id(0); + hash32_t hash = compute_hash(share, g_header, g_dag, start_nonce + gid, isolate); + + if (as_ulong(as_uchar8(hash.ulongs[0]).s76543210) < target) + { + uint slot = min((uint)MAX_OUTPUTS, atomic_inc(&g_output[0]) + 1); + g_output[slot] = gid; + } +} + +__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) +__kernel void ethash_hash_chunks( + __global hash32_t* g_hashes, + __constant hash32_t const* g_header, + __global hash128_t const* g_dag, + __global hash128_t const* g_dag1, + __global hash128_t const* g_dag2, + __global hash128_t const* g_dag3, + ulong start_nonce, + uint isolate + ) +{ + __local compute_hash_share share[HASHES_PER_LOOP]; + + uint const gid = get_global_id(0); + g_hashes[gid] = compute_hash_chunks(share, g_header, g_dag, g_dag1, g_dag2, g_dag3,start_nonce + gid, isolate); +} + +__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) +__kernel void ethash_search_chunks( + __global volatile uint* restrict g_output, + __constant hash32_t const* g_header, + __global hash128_t const* g_dag, + __global hash128_t const* g_dag1, + __global hash128_t const* g_dag2, + __global hash128_t const* g_dag3, + ulong start_nonce, + ulong target, + uint isolate + ) +{ + __local compute_hash_share share[HASHES_PER_LOOP]; + + uint const gid = get_global_id(0); + hash32_t hash = compute_hash_chunks(share, g_header, g_dag, g_dag1, g_dag2, g_dag3, start_nonce + gid, isolate); + + if (as_ulong(as_uchar8(hash.ulongs[0]).s76543210) < target) + { + uint slot = min(convert_uint(MAX_OUTPUTS), convert_uint(atomic_inc(&g_output[0]) + 1)); + g_output[slot] = gid; + } +} +` diff --git a/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_test.go b/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_test.go index 1e1de989d..c19e45d1d 100644 --- a/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_test.go +++ b/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_test.go @@ -92,7 +92,7 @@ func TestEthashConcurrentVerify(t *testing.T) { defer os.RemoveAll(eth.Full.Dir) block := &testBlock{difficulty: big.NewInt(10)} - nonce, md := eth.Search(block, nil) + nonce, md := eth.Search(block, nil, 0) block.nonce = nonce block.mixDigest = common.BytesToHash(md) @@ -135,7 +135,7 @@ func TestEthashConcurrentSearch(t *testing.T) { // launch n searches concurrently. for i := 0; i < nsearch; i++ { go func() { - nonce, md := eth.Search(block, stop) + nonce, md := eth.Search(block, stop, 0) select { case found <- searchRes{n: nonce, md: md}: case <-stop: @@ -167,7 +167,7 @@ func TestEthashSearchAcrossEpoch(t *testing.T) { for i := epochLength - 40; i < epochLength+40; i++ { block := &testBlock{number: i, difficulty: big.NewInt(90)} rand.Read(block.hashNoNonce[:]) - nonce, md := eth.Search(block, nil) + nonce, md := eth.Search(block, nil, 0) block.nonce = nonce block.mixDigest = common.BytesToHash(md) if !eth.Verify(block) { |