authorJeffrey Wilcke <jeffrey@ethereum.org>2015-10-16 21:25:33 +0800
committerJeffrey Wilcke <jeffrey@ethereum.org>2015-10-16 21:25:33 +0800
commitd5327ddc5fdc2a8b967699ea06ef5b5503657123 (patch)
tree9872db19a62cb21fc68e7c3e27a8bf8b9fcc63bd
parentb74775400906cc582bdbb98bf5067c5258ee491f (diff)
parentec6a548ee3555813d83f86f82bd25694bfd9c303 (diff)
Merge pull request #1869 from Gustav-Simonsson/gpu_miner
all: Add GPU mining, disabled by default
-rw-r--r--Godeps/Godeps.json4
-rw-r--r--Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/cl.go26
-rw-r--r--Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/cl_test.go254
-rw-r--r--Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/context.go161
-rw-r--r--Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/device.go510
-rw-r--r--Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/device12.go51
-rw-r--r--Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl.h1210
-rw-r--r--Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl_ext.h315
-rw-r--r--Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl_gl.h158
-rw-r--r--Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl_gl_ext.h65
-rw-r--r--Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl_platform.h1278
-rw-r--r--Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/opencl.h43
-rw-r--r--Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/image.go83
-rw-r--r--Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/kernel.go127
-rw-r--r--Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/kernel10.go7
-rw-r--r--Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/kernel12.go20
-rw-r--r--Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/platform.go83
-rw-r--r--Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/program.go105
-rw-r--r--Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/queue.go193
-rw-r--r--Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/types.go487
-rw-r--r--Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/types12.go71
-rw-r--r--Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/types_darwin.go45
-rw-r--r--Godeps/_workspace/src/github.com/ethereum/ethash/ethash.go24
-rw-r--r--Godeps/_workspace/src/github.com/ethereum/ethash/ethash_opencl.go629
-rw-r--r--Godeps/_workspace/src/github.com/ethereum/ethash/ethash_opencl_kernel_go_str.go600
-rw-r--r--Godeps/_workspace/src/github.com/ethereum/ethash/ethash_test.go6
-rw-r--r--Makefile6
-rwxr-xr-xbuild/flags.sh (renamed from build/ldflags.sh)4
-rw-r--r--cmd/geth/js_test.go3
-rw-r--r--cmd/geth/main.go45
-rw-r--r--cmd/utils/flags.go6
-rw-r--r--common/natspec/natspec_e2e_test.go2
-rw-r--r--core/chain_makers.go2
-rw-r--r--core/chain_pow_test.go4
-rw-r--r--eth/backend.go12
-rw-r--r--eth/cpu_mining.go54
-rw-r--r--eth/gpu_mining.go103
-rw-r--r--miner/agent.go2
-rw-r--r--pow/ezp/pow.go2
-rw-r--r--pow/pow.go2
-rw-r--r--rpc/api/miner.go2
-rw-r--r--xeth/xeth.go2
42 files changed, 6764 insertions, 42 deletions
diff --git a/Godeps/Godeps.json b/Godeps/Godeps.json
index e4b37a12e..01bda12bf 100644
--- a/Godeps/Godeps.json
+++ b/Godeps/Godeps.json
@@ -16,8 +16,8 @@
},
{
"ImportPath": "github.com/ethereum/ethash",
- "Comment": "v23.1-234-g062e40a",
- "Rev": "062e40a1a1671f5a5102862b56e4c56f68a732f5"
+ "Comment": "v23.1-235-gb39e007",
+ "Rev": "b39e007d393ab5945b4c0748a7415b7e31c5db04"
},
{
"ImportPath": "github.com/fatih/color",
diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/cl.go b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/cl.go
new file mode 100644
index 000000000..3d577b2b6
--- /dev/null
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/cl.go
@@ -0,0 +1,26 @@
+/*
+Package cl provides a binding to the OpenCL API. It's mostly a low-level
+wrapper that avoids adding functionality while still making the interface
+a little more friendly and easy to use.
+
+Resource life-cycle management:
+
+For any CL object that gets created (buffer, queue, kernel, etc.) you should
+call object.Release() when finished with it to free the CL resources. This
+explicitly calls the related clXXXRelease method for the type. However,
+as a fallback there is a finalizer set for every resource item that takes
+care of it (eventually) if Release isn't called. In this way you can have
+better control over the life cycle of resources while having a fall back
+to avoid leaks. This is similar to how file handles and such are handled
+in the Go standard packages.
+*/
+package cl
+
+// #include "headers/1.2/opencl.h"
+// #cgo CFLAGS: -Iheaders/1.2
+// #cgo darwin LDFLAGS: -framework OpenCL
+// #cgo linux LDFLAGS: -lOpenCL
+import "C"
+import "errors"
+
+var ErrUnsupported = errors.New("cl: unsupported")
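
The package comment above describes the Release-plus-finalizer lifecycle: release every CL object explicitly when you are done with it, and let the finalizers act only as a safety net. A minimal sketch of that pattern, assuming the vendored import path from this change and using only calls that appear in the files below (GetPlatforms, GetDevices, CreateContext, CreateEmptyBuffer, Release):

```go
package main

import (
	"log"

	"github.com/Gustav-Simonsson/go-opencl/cl" // vendored path, assumed importable via GOPATH/Godeps
)

func main() {
	platforms, err := cl.GetPlatforms()
	if err != nil || len(platforms) == 0 {
		log.Fatalf("no OpenCL platforms: %v", err)
	}
	devices, err := platforms[0].GetDevices(cl.DeviceTypeAll)
	if err != nil || len(devices) == 0 {
		log.Fatalf("no OpenCL devices: %v", err)
	}

	ctx, err := cl.CreateContext(devices[:1])
	if err != nil {
		log.Fatalf("CreateContext: %v", err)
	}
	// Explicit release; the finalizer set by the package is only a fallback.
	defer ctx.Release()

	buf, err := ctx.CreateEmptyBuffer(cl.MemReadOnly, 1024)
	if err != nil {
		log.Fatalf("CreateEmptyBuffer: %v", err)
	}
	defer buf.Release()

	log.Printf("allocated a 1 KB buffer on %s", devices[0].Name())
}
```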
diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/cl_test.go b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/cl_test.go
new file mode 100644
index 000000000..7659ce14f
--- /dev/null
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/cl_test.go
@@ -0,0 +1,254 @@
+package cl
+
+import (
+ "math/rand"
+ "reflect"
+ "strings"
+ "testing"
+)
+
+var kernelSource = `
+__kernel void square(
+ __global float* input,
+ __global float* output,
+ const unsigned int count)
+{
+ int i = get_global_id(0);
+ if(i < count)
+ output[i] = input[i] * input[i];
+}
+`
+
+func getObjectStrings(object interface{}) map[string]string {
+ v := reflect.ValueOf(object)
+ t := reflect.TypeOf(object)
+
+ strs := make(map[string]string)
+
+ numMethods := t.NumMethod()
+ for i := 0; i < numMethods; i++ {
+ method := t.Method(i)
+ if method.Type.NumIn() == 1 && method.Type.NumOut() == 1 && method.Type.Out(0).Kind() == reflect.String {
+ // this is a string-returning method with (presumably) only a pointer receiver parameter
+ // call it
+ outs := v.Method(i).Call([]reflect.Value{})
+ // put the result in our map
+ strs[method.Name] = (outs[0].Interface()).(string)
+ }
+ }
+
+ return strs
+}
+
+func TestPlatformStringsContainNoNULs(t *testing.T) {
+ platforms, err := GetPlatforms()
+ if err != nil {
+ t.Fatalf("Failed to get platforms: %+v", err)
+ }
+
+ for _, p := range platforms {
+ for key, value := range getObjectStrings(p) {
+ if strings.Contains(value, "\x00") {
+ t.Fatalf("platform string %q = %+q contains NUL", key, value)
+ }
+ }
+ }
+}
+
+func TestDeviceStringsContainNoNULs(t *testing.T) {
+ platforms, err := GetPlatforms()
+ if err != nil {
+ t.Fatalf("Failed to get platforms: %+v", err)
+ }
+
+ for _, p := range platforms {
+ devs, err := p.GetDevices(DeviceTypeAll)
+ if err != nil {
+ t.Fatalf("Failed to get devices for platform %q: %+v", p.Name(), err)
+ }
+
+ for _, d := range devs {
+ for key, value := range getObjectStrings(d) {
+ if strings.Contains(value, "\x00") {
+ t.Fatalf("device string %q = %+q contains NUL", key, value)
+ }
+ }
+ }
+ }
+}
+
+func TestHello(t *testing.T) {
+ var data [1024]float32
+ for i := 0; i < len(data); i++ {
+ data[i] = rand.Float32()
+ }
+
+ platforms, err := GetPlatforms()
+ if err != nil {
+ t.Fatalf("Failed to get platforms: %+v", err)
+ }
+ for i, p := range platforms {
+ t.Logf("Platform %d:", i)
+ t.Logf(" Name: %s", p.Name())
+ t.Logf(" Vendor: %s", p.Vendor())
+ t.Logf(" Profile: %s", p.Profile())
+ t.Logf(" Version: %s", p.Version())
+ t.Logf(" Extensions: %s", p.Extensions())
+ }
+ platform := platforms[0]
+
+ devices, err := platform.GetDevices(DeviceTypeAll)
+ if err != nil {
+ t.Fatalf("Failed to get devices: %+v", err)
+ }
+ if len(devices) == 0 {
+ t.Fatalf("GetDevices returned no devices")
+ }
+ deviceIndex := -1
+ for i, d := range devices {
+ if deviceIndex < 0 && d.Type() == DeviceTypeGPU {
+ deviceIndex = i
+ }
+ t.Logf("Device %d (%s): %s", i, d.Type(), d.Name())
+ t.Logf(" Address Bits: %d", d.AddressBits())
+ t.Logf(" Available: %+v", d.Available())
+ // t.Logf(" Built-In Kernels: %s", d.BuiltInKernels())
+ t.Logf(" Compiler Available: %+v", d.CompilerAvailable())
+ t.Logf(" Double FP Config: %s", d.DoubleFPConfig())
+ t.Logf(" Driver Version: %s", d.DriverVersion())
+ t.Logf(" Error Correction Supported: %+v", d.ErrorCorrectionSupport())
+ t.Logf(" Execution Capabilities: %s", d.ExecutionCapabilities())
+ t.Logf(" Extensions: %s", d.Extensions())
+ t.Logf(" Global Memory Cache Type: %s", d.GlobalMemCacheType())
+ t.Logf(" Global Memory Cacheline Size: %d KB", d.GlobalMemCachelineSize()/1024)
+ t.Logf(" Global Memory Size: %d MB", d.GlobalMemSize()/(1024*1024))
+ t.Logf(" Half FP Config: %s", d.HalfFPConfig())
+ t.Logf(" Host Unified Memory: %+v", d.HostUnifiedMemory())
+ t.Logf(" Image Support: %+v", d.ImageSupport())
+ t.Logf(" Image2D Max Dimensions: %d x %d", d.Image2DMaxWidth(), d.Image2DMaxHeight())
+ t.Logf(" Image3D Max Dimenionns: %d x %d x %d", d.Image3DMaxWidth(), d.Image3DMaxHeight(), d.Image3DMaxDepth())
+ // t.Logf(" Image Max Buffer Size: %d", d.ImageMaxBufferSize())
+ // t.Logf(" Image Max Array Size: %d", d.ImageMaxArraySize())
+ // t.Logf(" Linker Available: %+v", d.LinkerAvailable())
+ t.Logf(" Little Endian: %+v", d.EndianLittle())
+ t.Logf(" Local Mem Size Size: %d KB", d.LocalMemSize()/1024)
+ t.Logf(" Local Mem Type: %s", d.LocalMemType())
+ t.Logf(" Max Clock Frequency: %d", d.MaxClockFrequency())
+ t.Logf(" Max Compute Units: %d", d.MaxComputeUnits())
+ t.Logf(" Max Constant Args: %d", d.MaxConstantArgs())
+ t.Logf(" Max Constant Buffer Size: %d KB", d.MaxConstantBufferSize()/1024)
+ t.Logf(" Max Mem Alloc Size: %d KB", d.MaxMemAllocSize()/1024)
+ t.Logf(" Max Parameter Size: %d", d.MaxParameterSize())
+ t.Logf(" Max Read-Image Args: %d", d.MaxReadImageArgs())
+ t.Logf(" Max Samplers: %d", d.MaxSamplers())
+ t.Logf(" Max Work Group Size: %d", d.MaxWorkGroupSize())
+ t.Logf(" Max Work Item Dimensions: %d", d.MaxWorkItemDimensions())
+ t.Logf(" Max Work Item Sizes: %d", d.MaxWorkItemSizes())
+ t.Logf(" Max Write-Image Args: %d", d.MaxWriteImageArgs())
+ t.Logf(" Memory Base Address Alignment: %d", d.MemBaseAddrAlign())
+ t.Logf(" Native Vector Width Char: %d", d.NativeVectorWidthChar())
+ t.Logf(" Native Vector Width Short: %d", d.NativeVectorWidthShort())
+ t.Logf(" Native Vector Width Int: %d", d.NativeVectorWidthInt())
+ t.Logf(" Native Vector Width Long: %d", d.NativeVectorWidthLong())
+ t.Logf(" Native Vector Width Float: %d", d.NativeVectorWidthFloat())
+ t.Logf(" Native Vector Width Double: %d", d.NativeVectorWidthDouble())
+ t.Logf(" Native Vector Width Half: %d", d.NativeVectorWidthHalf())
+ t.Logf(" OpenCL C Version: %s", d.OpenCLCVersion())
+ // t.Logf(" Parent Device: %+v", d.ParentDevice())
+ t.Logf(" Profile: %s", d.Profile())
+ t.Logf(" Profiling Timer Resolution: %d", d.ProfilingTimerResolution())
+ t.Logf(" Vendor: %s", d.Vendor())
+ t.Logf(" Version: %s", d.Version())
+ }
+ if deviceIndex < 0 {
+ deviceIndex = 0
+ }
+ device := devices[deviceIndex]
+ t.Logf("Using device %d", deviceIndex)
+ context, err := CreateContext([]*Device{device})
+ if err != nil {
+ t.Fatalf("CreateContext failed: %+v", err)
+ }
+ // imageFormats, err := context.GetSupportedImageFormats(0, MemObjectTypeImage2D)
+ // if err != nil {
+ // t.Fatalf("GetSupportedImageFormats failed: %+v", err)
+ // }
+ // t.Logf("Supported image formats: %+v", imageFormats)
+ queue, err := context.CreateCommandQueue(device, 0)
+ if err != nil {
+ t.Fatalf("CreateCommandQueue failed: %+v", err)
+ }
+ program, err := context.CreateProgramWithSource([]string{kernelSource})
+ if err != nil {
+ t.Fatalf("CreateProgramWithSource failed: %+v", err)
+ }
+ if err := program.BuildProgram(nil, ""); err != nil {
+ t.Fatalf("BuildProgram failed: %+v", err)
+ }
+ kernel, err := program.CreateKernel("square")
+ if err != nil {
+ t.Fatalf("CreateKernel failed: %+v", err)
+ }
+ for i := 0; i < 3; i++ {
+ name, err := kernel.ArgName(i)
+ if err == ErrUnsupported {
+ break
+ } else if err != nil {
+ t.Errorf("GetKernelArgInfo for name failed: %+v", err)
+ break
+ } else {
+ t.Logf("Kernel arg %d: %s", i, name)
+ }
+ }
+ input, err := context.CreateEmptyBuffer(MemReadOnly, 4*len(data))
+ if err != nil {
+ t.Fatalf("CreateBuffer failed for input: %+v", err)
+ }
+ output, err := context.CreateEmptyBuffer(MemReadOnly, 4*len(data))
+ if err != nil {
+ t.Fatalf("CreateBuffer failed for output: %+v", err)
+ }
+ if _, err := queue.EnqueueWriteBufferFloat32(input, true, 0, data[:], nil); err != nil {
+ t.Fatalf("EnqueueWriteBufferFloat32 failed: %+v", err)
+ }
+ if err := kernel.SetArgs(input, output, uint32(len(data))); err != nil {
+ t.Fatalf("SetKernelArgs failed: %+v", err)
+ }
+
+ local, err := kernel.WorkGroupSize(device)
+ if err != nil {
+ t.Fatalf("WorkGroupSize failed: %+v", err)
+ }
+ t.Logf("Work group size: %d", local)
+ size, _ := kernel.PreferredWorkGroupSizeMultiple(nil)
+ t.Logf("Preferred Work Group Size Multiple: %d", size)
+
+ global := len(data)
+ d := len(data) % local
+ if d != 0 {
+ global += local - d
+ }
+ if _, err := queue.EnqueueNDRangeKernel(kernel, nil, []int{global}, []int{local}, nil); err != nil {
+ t.Fatalf("EnqueueNDRangeKernel failed: %+v", err)
+ }
+
+ if err := queue.Finish(); err != nil {
+ t.Fatalf("Finish failed: %+v", err)
+ }
+
+ results := make([]float32, len(data))
+ if _, err := queue.EnqueueReadBufferFloat32(output, true, 0, results, nil); err != nil {
+ t.Fatalf("EnqueueReadBufferFloat32 failed: %+v", err)
+ }
+
+ correct := 0
+ for i, v := range data {
+ if results[i] == v*v {
+ correct++
+ }
+ }
+
+ if correct != len(data) {
+ t.Fatalf("%d/%d correct values", correct, len(data))
+ }
+}
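
The global-size rounding near the end of TestHello pads the launch size up to the next multiple of the work-group size; the kernel's `if (i < count)` guard then discards the padding work-items. As a worked example of the same rule: with 1024 elements a group size of 256 leaves the global size at 1024, while a group size of 192 rounds it up to 1152. A standalone sketch of that arithmetic:

```go
package main

import "fmt"

// roundUpToMultiple pads n up to the next multiple of group, mirroring the
// global-size computation in TestHello; the extra work-items are filtered
// out by the kernel's bounds check.
func roundUpToMultiple(n, group int) int {
	if r := n % group; r != 0 {
		return n + group - r
	}
	return n
}

func main() {
	fmt.Println(roundUpToMultiple(1024, 256)) // 1024: already a multiple
	fmt.Println(roundUpToMultiple(1024, 192)) // 1152: padded by 128
}
```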
diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/context.go b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/context.go
new file mode 100644
index 000000000..67441bb5f
--- /dev/null
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/context.go
@@ -0,0 +1,161 @@
+package cl
+
+// #include <stdlib.h>
+// #ifdef __APPLE__
+// #include "OpenCL/opencl.h"
+// #else
+// #include "cl.h"
+// #endif
+import "C"
+
+import (
+ "runtime"
+ "unsafe"
+)
+
+const maxImageFormats = 256
+
+type Context struct {
+ clContext C.cl_context
+ devices []*Device
+}
+
+type MemObject struct {
+ clMem C.cl_mem
+ size int
+}
+
+func releaseContext(c *Context) {
+ if c.clContext != nil {
+ C.clReleaseContext(c.clContext)
+ c.clContext = nil
+ }
+}
+
+func releaseMemObject(b *MemObject) {
+ if b.clMem != nil {
+ C.clReleaseMemObject(b.clMem)
+ b.clMem = nil
+ }
+}
+
+func newMemObject(mo C.cl_mem, size int) *MemObject {
+ memObject := &MemObject{clMem: mo, size: size}
+ runtime.SetFinalizer(memObject, releaseMemObject)
+ return memObject
+}
+
+func (b *MemObject) Release() {
+ releaseMemObject(b)
+}
+
+// TODO: properties
+func CreateContext(devices []*Device) (*Context, error) {
+ deviceIds := buildDeviceIdList(devices)
+ var err C.cl_int
+ clContext := C.clCreateContext(nil, C.cl_uint(len(devices)), &deviceIds[0], nil, nil, &err)
+ if err != C.CL_SUCCESS {
+ return nil, toError(err)
+ }
+ if clContext == nil {
+ return nil, ErrUnknown
+ }
+ context := &Context{clContext: clContext, devices: devices}
+ runtime.SetFinalizer(context, releaseContext)
+ return context, nil
+}
+
+func (ctx *Context) GetSupportedImageFormats(flags MemFlag, imageType MemObjectType) ([]ImageFormat, error) {
+ var formats [maxImageFormats]C.cl_image_format
+ var nFormats C.cl_uint
+ if err := C.clGetSupportedImageFormats(ctx.clContext, C.cl_mem_flags(flags), C.cl_mem_object_type(imageType), maxImageFormats, &formats[0], &nFormats); err != C.CL_SUCCESS {
+ return nil, toError(err)
+ }
+ fmts := make([]ImageFormat, nFormats)
+ for i, f := range formats[:nFormats] {
+ fmts[i] = ImageFormat{
+ ChannelOrder: ChannelOrder(f.image_channel_order),
+ ChannelDataType: ChannelDataType(f.image_channel_data_type),
+ }
+ }
+ return fmts, nil
+}
+
+func (ctx *Context) CreateCommandQueue(device *Device, properties CommandQueueProperty) (*CommandQueue, error) {
+ var err C.cl_int
+ clQueue := C.clCreateCommandQueue(ctx.clContext, device.id, C.cl_command_queue_properties(properties), &err)
+ if err != C.CL_SUCCESS {
+ return nil, toError(err)
+ }
+ if clQueue == nil {
+ return nil, ErrUnknown
+ }
+ commandQueue := &CommandQueue{clQueue: clQueue, device: device}
+ runtime.SetFinalizer(commandQueue, releaseCommandQueue)
+ return commandQueue, nil
+}
+
+func (ctx *Context) CreateProgramWithSource(sources []string) (*Program, error) {
+ cSources := make([]*C.char, len(sources))
+ for i, s := range sources {
+ cs := C.CString(s)
+ cSources[i] = cs
+ defer C.free(unsafe.Pointer(cs))
+ }
+ var err C.cl_int
+ clProgram := C.clCreateProgramWithSource(ctx.clContext, C.cl_uint(len(sources)), &cSources[0], nil, &err)
+ if err != C.CL_SUCCESS {
+ return nil, toError(err)
+ }
+ if clProgram == nil {
+ return nil, ErrUnknown
+ }
+ program := &Program{clProgram: clProgram, devices: ctx.devices}
+ runtime.SetFinalizer(program, releaseProgram)
+ return program, nil
+}
+
+func (ctx *Context) CreateBufferUnsafe(flags MemFlag, size int, dataPtr unsafe.Pointer) (*MemObject, error) {
+ var err C.cl_int
+ clBuffer := C.clCreateBuffer(ctx.clContext, C.cl_mem_flags(flags), C.size_t(size), dataPtr, &err)
+ if err != C.CL_SUCCESS {
+ return nil, toError(err)
+ }
+ if clBuffer == nil {
+ return nil, ErrUnknown
+ }
+ return newMemObject(clBuffer, size), nil
+}
+
+func (ctx *Context) CreateEmptyBuffer(flags MemFlag, size int) (*MemObject, error) {
+ return ctx.CreateBufferUnsafe(flags, size, nil)
+}
+
+func (ctx *Context) CreateEmptyBufferFloat32(flags MemFlag, size int) (*MemObject, error) {
+ return ctx.CreateBufferUnsafe(flags, 4*size, nil)
+}
+
+func (ctx *Context) CreateBuffer(flags MemFlag, data []byte) (*MemObject, error) {
+ return ctx.CreateBufferUnsafe(flags, len(data), unsafe.Pointer(&data[0]))
+}
+
+//float32
+func (ctx *Context) CreateBufferFloat32(flags MemFlag, data []float32) (*MemObject, error) {
+ return ctx.CreateBufferUnsafe(flags, 4*len(data), unsafe.Pointer(&data[0]))
+}
+
+func (ctx *Context) CreateUserEvent() (*Event, error) {
+ var err C.cl_int
+ clEvent := C.clCreateUserEvent(ctx.clContext, &err)
+ if err != C.CL_SUCCESS {
+ return nil, toError(err)
+ }
+ return newEvent(clEvent), nil
+}
+
+func (ctx *Context) Release() {
+ releaseContext(ctx)
+}
+
+// http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clCreateSubBuffer.html
+// func (memObject *MemObject) CreateSubBuffer(flags MemFlag, bufferCreateType BufferCreateType, )
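
context.go is the allocation entry point: a Context owns the queues, programs and buffers it creates, and the *Float32 helpers simply multiply the element count by four. A short hedged sketch combining CreateEmptyBufferFloat32 with the queue's blocking write from cl_test.go; the package name and import path are illustrative, and it assumes a context and command queue have already been created as in the earlier example:

```go
package clutil

import "github.com/Gustav-Simonsson/go-opencl/cl" // vendored path, assumed

// uploadFloats allocates a device buffer sized for len(host) float32 values
// and copies the host slice into it with a blocking write. It uses only calls
// that appear in this change (CreateEmptyBufferFloat32, EnqueueWriteBufferFloat32).
func uploadFloats(ctx *cl.Context, queue *cl.CommandQueue, host []float32) (*cl.MemObject, error) {
	buf, err := ctx.CreateEmptyBufferFloat32(cl.MemReadOnly, len(host)) // size in elements, 4 bytes each
	if err != nil {
		return nil, err
	}
	// Blocking write: returns once the slice has been copied to the device.
	if _, err := queue.EnqueueWriteBufferFloat32(buf, true, 0, host, nil); err != nil {
		buf.Release()
		return nil, err
	}
	return buf, nil
}
```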
diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/device.go b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/device.go
new file mode 100644
index 000000000..d62a6fb71
--- /dev/null
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/device.go
@@ -0,0 +1,510 @@
+package cl
+
+// #ifdef __APPLE__
+// #include "OpenCL/opencl.h"
+// #else
+// #include "cl.h"
+// #include "cl_ext.h"
+// #endif
+import "C"
+
+import (
+ "strings"
+ "unsafe"
+)
+
+const maxDeviceCount = 64
+
+type DeviceType uint
+
+const (
+ DeviceTypeCPU DeviceType = C.CL_DEVICE_TYPE_CPU
+ DeviceTypeGPU DeviceType = C.CL_DEVICE_TYPE_GPU
+ DeviceTypeAccelerator DeviceType = C.CL_DEVICE_TYPE_ACCELERATOR
+ DeviceTypeDefault DeviceType = C.CL_DEVICE_TYPE_DEFAULT
+ DeviceTypeAll DeviceType = C.CL_DEVICE_TYPE_ALL
+)
+
+type FPConfig int
+
+const (
+ FPConfigDenorm FPConfig = C.CL_FP_DENORM // denorms are supported
+ FPConfigInfNaN FPConfig = C.CL_FP_INF_NAN // INF and NaNs are supported
+ FPConfigRoundToNearest FPConfig = C.CL_FP_ROUND_TO_NEAREST // round to nearest even rounding mode supported
+ FPConfigRoundToZero FPConfig = C.CL_FP_ROUND_TO_ZERO // round to zero rounding mode supported
+ FPConfigRoundToInf FPConfig = C.CL_FP_ROUND_TO_INF // round to positive and negative infinity rounding modes supported
+ FPConfigFMA FPConfig = C.CL_FP_FMA // IEEE754-2008 fused multiply-add is supported
+ FPConfigSoftFloat FPConfig = C.CL_FP_SOFT_FLOAT // Basic floating-point operations (such as addition, subtraction, multiplication) are implemented in software
+)
+
+var fpConfigNameMap = map[FPConfig]string{
+ FPConfigDenorm: "Denorm",
+ FPConfigInfNaN: "InfNaN",
+ FPConfigRoundToNearest: "RoundToNearest",
+ FPConfigRoundToZero: "RoundToZero",
+ FPConfigRoundToInf: "RoundToInf",
+ FPConfigFMA: "FMA",
+ FPConfigSoftFloat: "SoftFloat",
+}
+
+func (c FPConfig) String() string {
+ var parts []string
+ for bit, name := range fpConfigNameMap {
+ if c&bit != 0 {
+ parts = append(parts, name)
+ }
+ }
+ if parts == nil {
+ return ""
+ }
+ return strings.Join(parts, "|")
+}
+
+func (dt DeviceType) String() string {
+ var parts []string
+ if dt&DeviceTypeCPU != 0 {
+ parts = append(parts, "CPU")
+ }
+ if dt&DeviceTypeGPU != 0 {
+ parts = append(parts, "GPU")
+ }
+ if dt&DeviceTypeAccelerator != 0 {
+ parts = append(parts, "Accelerator")
+ }
+ if dt&DeviceTypeDefault != 0 {
+ parts = append(parts, "Default")
+ }
+ if parts == nil {
+ parts = append(parts, "None")
+ }
+ return strings.Join(parts, "|")
+}
+
+type Device struct {
+ id C.cl_device_id
+}
+
+func buildDeviceIdList(devices []*Device) []C.cl_device_id {
+ deviceIds := make([]C.cl_device_id, len(devices))
+ for i, d := range devices {
+ deviceIds[i] = d.id
+ }
+ return deviceIds
+}
+
+// Obtain the list of devices available on a platform. 'platform' refers
+// to the platform returned by GetPlatforms or can be nil. If platform
+// is nil, the behavior is implementation-defined.
+func GetDevices(platform *Platform, deviceType DeviceType) ([]*Device, error) {
+ var deviceIds [maxDeviceCount]C.cl_device_id
+ var numDevices C.cl_uint
+ var platformId C.cl_platform_id
+ if platform != nil {
+ platformId = platform.id
+ }
+ if err := C.clGetDeviceIDs(platformId, C.cl_device_type(deviceType), C.cl_uint(maxDeviceCount), &deviceIds[0], &numDevices); err != C.CL_SUCCESS {
+ return nil, toError(err)
+ }
+ if numDevices > maxDeviceCount {
+ numDevices = maxDeviceCount
+ }
+ devices := make([]*Device, numDevices)
+ for i := 0; i < int(numDevices); i++ {
+ devices[i] = &Device{id: deviceIds[i]}
+ }
+ return devices, nil
+}
+
+func (d *Device) nullableId() C.cl_device_id {
+ if d == nil {
+ return nil
+ }
+ return d.id
+}
+
+func (d *Device) GetInfoString(param C.cl_device_info, panicOnError bool) (string, error) {
+ var strC [1024]C.char
+ var strN C.size_t
+ if err := C.clGetDeviceInfo(d.id, param, 1024, unsafe.Pointer(&strC), &strN); err != C.CL_SUCCESS {
+ if panicOnError {
+ panic("Should never fail")
+ }
+ return "", toError(err)
+ }
+
+ // OpenCL strings are NUL-terminated, and the terminator is included in strN
+ // Go strings aren't NUL-terminated, so subtract 1 from the length
+ return C.GoStringN((*C.char)(unsafe.Pointer(&strC)), C.int(strN-1)), nil
+}
+
+func (d *Device) getInfoUint(param C.cl_device_info, panicOnError bool) (uint, error) {
+ var val C.cl_uint
+ if err := C.clGetDeviceInfo(d.id, param, C.size_t(unsafe.Sizeof(val)), unsafe.Pointer(&val), nil); err != C.CL_SUCCESS {
+ if panicOnError {
+ panic("Should never fail")
+ }
+ return 0, toError(err)
+ }
+ return uint(val), nil
+}
+
+func (d *Device) getInfoSize(param C.cl_device_info, panicOnError bool) (int, error) {
+ var val C.size_t
+ if err := C.clGetDeviceInfo(d.id, param, C.size_t(unsafe.Sizeof(val)), unsafe.Pointer(&val), nil); err != C.CL_SUCCESS {
+ if panicOnError {
+ panic("Should never fail")
+ }
+ return 0, toError(err)
+ }
+ return int(val), nil
+}
+
+func (d *Device) getInfoUlong(param C.cl_device_info, panicOnError bool) (int64, error) {
+ var val C.cl_ulong
+ if err := C.clGetDeviceInfo(d.id, param, C.size_t(unsafe.Sizeof(val)), unsafe.Pointer(&val), nil); err != C.CL_SUCCESS {
+ if panicOnError {
+ panic("Should never fail")
+ }
+ return 0, toError(err)
+ }
+ return int64(val), nil
+}
+
+func (d *Device) getInfoBool(param C.cl_device_info, panicOnError bool) (bool, error) {
+ var val C.cl_bool
+ if err := C.clGetDeviceInfo(d.id, param, C.size_t(unsafe.Sizeof(val)), unsafe.Pointer(&val), nil); err != C.CL_SUCCESS {
+ if panicOnError {
+ panic("Should never fail")
+ }
+ return false, toError(err)
+ }
+ return val == C.CL_TRUE, nil
+}
+
+func (d *Device) Name() string {
+ str, _ := d.GetInfoString(C.CL_DEVICE_NAME, true)
+ return str
+}
+
+func (d *Device) Vendor() string {
+ str, _ := d.GetInfoString(C.CL_DEVICE_VENDOR, true)
+ return str
+}
+
+func (d *Device) Extensions() string {
+ str, _ := d.GetInfoString(C.CL_DEVICE_EXTENSIONS, true)
+ return str
+}
+
+func (d *Device) OpenCLCVersion() string {
+ str, _ := d.GetInfoString(C.CL_DEVICE_OPENCL_C_VERSION, true)
+ return str
+}
+
+func (d *Device) Profile() string {
+ str, _ := d.GetInfoString(C.CL_DEVICE_PROFILE, true)
+ return str
+}
+
+func (d *Device) Version() string {
+ str, _ := d.GetInfoString(C.CL_DEVICE_VERSION, true)
+ return str
+}
+
+func (d *Device) DriverVersion() string {
+ str, _ := d.GetInfoString(C.CL_DRIVER_VERSION, true)
+ return str
+}
+
+// The default compute device address space size specified as an
+// unsigned integer value in bits. Currently supported values are 32 or 64 bits.
+func (d *Device) AddressBits() int {
+ val, _ := d.getInfoUint(C.CL_DEVICE_ADDRESS_BITS, true)
+ return int(val)
+}
+
+// Size of global memory cache line in bytes.
+func (d *Device) GlobalMemCachelineSize() int {
+ val, _ := d.getInfoUint(C.CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, true)
+ return int(val)
+}
+
+// Maximum configured clock frequency of the device in MHz.
+func (d *Device) MaxClockFrequency() int {
+ val, _ := d.getInfoUint(C.CL_DEVICE_MAX_CLOCK_FREQUENCY, true)
+ return int(val)
+}
+
+// The number of parallel compute units on the OpenCL device.
+// A work-group executes on a single compute unit. The minimum value is 1.
+func (d *Device) MaxComputeUnits() int {
+ val, _ := d.getInfoUint(C.CL_DEVICE_MAX_COMPUTE_UNITS, true)
+ return int(val)
+}
+
+// Max number of arguments declared with the __constant qualifier in a kernel.
+// The minimum value is 8 for devices that are not of type CL_DEVICE_TYPE_CUSTOM.
+func (d *Device) MaxConstantArgs() int {
+ val, _ := d.getInfoUint(C.CL_DEVICE_MAX_CONSTANT_ARGS, true)
+ return int(val)
+}
+
+// Max number of simultaneous image objects that can be read by a kernel.
+// The minimum value is 128 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE.
+func (d *Device) MaxReadImageArgs() int {
+ val, _ := d.getInfoUint(C.CL_DEVICE_MAX_READ_IMAGE_ARGS, true)
+ return int(val)
+}
+
+// Maximum number of samplers that can be used in a kernel. The minimum
+// value is 16 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE. (Also see sampler_t.)
+func (d *Device) MaxSamplers() int {
+ val, _ := d.getInfoUint(C.CL_DEVICE_MAX_SAMPLERS, true)
+ return int(val)
+}
+
+// Maximum dimensions that specify the global and local work-item IDs used
+// by the data parallel execution model. (Refer to clEnqueueNDRangeKernel).
+// The minimum value is 3 for devices that are not of type CL_DEVICE_TYPE_CUSTOM.
+func (d *Device) MaxWorkItemDimensions() int {
+ val, _ := d.getInfoUint(C.CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, true)
+ return int(val)
+}
+
+// Max number of simultaneous image objects that can be written to by a
+// kernel. The minimum value is 8 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE.
+func (d *Device) MaxWriteImageArgs() int {
+ val, _ := d.getInfoUint(C.CL_DEVICE_MAX_WRITE_IMAGE_ARGS, true)
+ return int(val)
+}
+
+// The minimum value is the size (in bits) of the largest OpenCL built-in
+// data type supported by the device (long16 in FULL profile, long16 or
+// int16 in EMBEDDED profile) for devices that are not of type CL_DEVICE_TYPE_CUSTOM.
+func (d *Device) MemBaseAddrAlign() int {
+ val, _ := d.getInfoUint(C.CL_DEVICE_MEM_BASE_ADDR_ALIGN, true)
+ return int(val)
+}
+
+func (d *Device) NativeVectorWidthChar() int {
+ val, _ := d.getInfoUint(C.CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, true)
+ return int(val)
+}
+
+func (d *Device) NativeVectorWidthShort() int {
+ val, _ := d.getInfoUint(C.CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, true)
+ return int(val)
+}
+
+func (d *Device) NativeVectorWidthInt() int {
+ val, _ := d.getInfoUint(C.CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, true)
+ return int(val)
+}
+
+func (d *Device) NativeVectorWidthLong() int {
+ val, _ := d.getInfoUint(C.CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, true)
+ return int(val)
+}
+
+func (d *Device) NativeVectorWidthFloat() int {
+ val, _ := d.getInfoUint(C.CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, true)
+ return int(val)
+}
+
+func (d *Device) NativeVectorWidthDouble() int {
+ val, _ := d.getInfoUint(C.CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, true)
+ return int(val)
+}
+
+func (d *Device) NativeVectorWidthHalf() int {
+ val, _ := d.getInfoUint(C.CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, true)
+ return int(val)
+}
+
+// Max height of 2D image in pixels. The minimum value is 8192
+// if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE.
+func (d *Device) Image2DMaxHeight() int {
+ val, _ := d.getInfoSize(C.CL_DEVICE_IMAGE2D_MAX_HEIGHT, true)
+ return int(val)
+}
+
+// Max width of 2D image or 1D image not created from a buffer object in
+// pixels. The minimum value is 8192 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE.
+func (d *Device) Image2DMaxWidth() int {
+ val, _ := d.getInfoSize(C.CL_DEVICE_IMAGE2D_MAX_WIDTH, true)
+ return int(val)
+}
+
+// Max depth of 3D image in pixels. The minimum value is 2048 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE.
+func (d *Device) Image3DMaxDepth() int {
+ val, _ := d.getInfoSize(C.CL_DEVICE_IMAGE3D_MAX_DEPTH, true)
+ return int(val)
+}
+
+// Max height of 3D image in pixels. The minimum value is 2048 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE.
+func (d *Device) Image3DMaxHeight() int {
+ val, _ := d.getInfoSize(C.CL_DEVICE_IMAGE3D_MAX_HEIGHT, true)
+ return int(val)
+}
+
+// Max width of 3D image in pixels. The minimum value is 2048 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE.
+func (d *Device) Image3DMaxWidth() int {
+ val, _ := d.getInfoSize(C.CL_DEVICE_IMAGE3D_MAX_WIDTH, true)
+ return int(val)
+}
+
+// Max size in bytes of the arguments that can be passed to a kernel. The
+// minimum value is 1024 for devices that are not of type CL_DEVICE_TYPE_CUSTOM.
+// For this minimum value, only a maximum of 128 arguments can be passed to a kernel.
+func (d *Device) MaxParameterSize() int {
+ val, _ := d.getInfoSize(C.CL_DEVICE_MAX_PARAMETER_SIZE, true)
+ return int(val)
+}
+
+// Maximum number of work-items in a work-group executing a kernel on a
+// single compute unit, using the data parallel execution model. (Refer
+// to clEnqueueNDRangeKernel). The minimum value is 1.
+func (d *Device) MaxWorkGroupSize() int {
+ val, _ := d.getInfoSize(C.CL_DEVICE_MAX_WORK_GROUP_SIZE, true)
+ return int(val)
+}
+
+// Describes the resolution of device timer. This is measured in nanoseconds.
+func (d *Device) ProfilingTimerResolution() int {
+ val, _ := d.getInfoSize(C.CL_DEVICE_PROFILING_TIMER_RESOLUTION, true)
+ return int(val)
+}
+
+// Size of local memory arena in bytes. The minimum value is 32 KB for
+// devices that are not of type CL_DEVICE_TYPE_CUSTOM.
+func (d *Device) LocalMemSize() int64 {
+ val, _ := d.getInfoUlong(C.CL_DEVICE_LOCAL_MEM_SIZE, true)
+ return val
+}
+
+// Max size in bytes of a constant buffer allocation. The minimum value is
+// 64 KB for devices that are not of type CL_DEVICE_TYPE_CUSTOM.
+func (d *Device) MaxConstantBufferSize() int64 {
+ val, _ := d.getInfoUlong(C.CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, true)
+ return val
+}
+
+// Max size of memory object allocation in bytes. The minimum value is max
+// (1/4th of CL_DEVICE_GLOBAL_MEM_SIZE, 128*1024*1024) for devices that are
+// not of type CL_DEVICE_TYPE_CUSTOM.
+func (d *Device) MaxMemAllocSize() int64 {
+ val, _ := d.getInfoUlong(C.CL_DEVICE_MAX_MEM_ALLOC_SIZE, true)
+ return val
+}
+
+// Size of global device memory in bytes.
+func (d *Device) GlobalMemSize() int64 {
+ val, _ := d.getInfoUlong(C.CL_DEVICE_GLOBAL_MEM_SIZE, true)
+ return val
+}
+
+func (d *Device) Available() bool {
+ val, _ := d.getInfoBool(C.CL_DEVICE_AVAILABLE, true)
+ return val
+}
+
+func (d *Device) CompilerAvailable() bool {
+ val, _ := d.getInfoBool(C.CL_DEVICE_COMPILER_AVAILABLE, true)
+ return val
+}
+
+func (d *Device) EndianLittle() bool {
+ val, _ := d.getInfoBool(C.CL_DEVICE_ENDIAN_LITTLE, true)
+ return val
+}
+
+// Is CL_TRUE if the device implements error correction for all
+// accesses to compute device memory (global and constant). Is
+// CL_FALSE if the device does not implement such error correction.
+func (d *Device) ErrorCorrectionSupport() bool {
+ val, _ := d.getInfoBool(C.CL_DEVICE_ERROR_CORRECTION_SUPPORT, true)
+ return val
+}
+
+func (d *Device) HostUnifiedMemory() bool {
+ val, _ := d.getInfoBool(C.CL_DEVICE_HOST_UNIFIED_MEMORY, true)
+ return val
+}
+
+func (d *Device) ImageSupport() bool {
+ val, _ := d.getInfoBool(C.CL_DEVICE_IMAGE_SUPPORT, true)
+ return val
+}
+
+func (d *Device) Type() DeviceType {
+ var deviceType C.cl_device_type
+ if err := C.clGetDeviceInfo(d.id, C.CL_DEVICE_TYPE, C.size_t(unsafe.Sizeof(deviceType)), unsafe.Pointer(&deviceType), nil); err != C.CL_SUCCESS {
+ panic("Failed to get device type")
+ }
+ return DeviceType(deviceType)
+}
+
+// Describes double precision floating-point capability of the OpenCL device
+func (d *Device) DoubleFPConfig() FPConfig {
+ var fpConfig C.cl_device_fp_config
+ if err := C.clGetDeviceInfo(d.id, C.CL_DEVICE_DOUBLE_FP_CONFIG, C.size_t(unsafe.Sizeof(fpConfig)), unsafe.Pointer(&fpConfig), nil); err != C.CL_SUCCESS {
+ panic("Failed to get double FP config")
+ }
+ return FPConfig(fpConfig)
+}
+
+// Describes the OPTIONAL half precision floating-point capability of the OpenCL device
+func (d *Device) HalfFPConfig() FPConfig {
+ var fpConfig C.cl_device_fp_config
+ err := C.clGetDeviceInfo(d.id, C.CL_DEVICE_HALF_FP_CONFIG, C.size_t(unsafe.Sizeof(fpConfig)), unsafe.Pointer(&fpConfig), nil)
+ if err != C.CL_SUCCESS {
+ return FPConfig(0)
+ }
+ return FPConfig(fpConfig)
+}
+
+// Type of local memory supported. This can be set to CL_LOCAL implying dedicated
+// local memory storage such as SRAM, or CL_GLOBAL. For custom devices, CL_NONE
+// can also be returned indicating no local memory support.
+func (d *Device) LocalMemType() LocalMemType {
+ var memType C.cl_device_local_mem_type
+ if err := C.clGetDeviceInfo(d.id, C.CL_DEVICE_LOCAL_MEM_TYPE, C.size_t(unsafe.Sizeof(memType)), unsafe.Pointer(&memType), nil); err != C.CL_SUCCESS {
+ return LocalMemType(C.CL_NONE)
+ }
+ return LocalMemType(memType)
+}
+
+// Describes the execution capabilities of the device. The mandated minimum capability is CL_EXEC_KERNEL.
+func (d *Device) ExecutionCapabilities() ExecCapability {
+ var execCap C.cl_device_exec_capabilities
+ if err := C.clGetDeviceInfo(d.id, C.CL_DEVICE_EXECUTION_CAPABILITIES, C.size_t(unsafe.Sizeof(execCap)), unsafe.Pointer(&execCap), nil); err != C.CL_SUCCESS {
+ panic("Failed to get execution capabilities")
+ }
+ return ExecCapability(execCap)
+}
+
+func (d *Device) GlobalMemCacheType() MemCacheType {
+ var memType C.cl_device_mem_cache_type
+ if err := C.clGetDeviceInfo(d.id, C.CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, C.size_t(unsafe.Sizeof(memType)), unsafe.Pointer(&memType), nil); err != C.CL_SUCCESS {
+ return MemCacheType(C.CL_NONE)
+ }
+ return MemCacheType(memType)
+}
+
+// Maximum number of work-items that can be specified in each dimension of the work-group to clEnqueueNDRangeKernel.
+//
+// Returns n size_t entries, where n is the value returned by the query for CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS.
+//
+// The minimum value is (1, 1, 1) for devices that are not of type CL_DEVICE_TYPE_CUSTOM.
+func (d *Device) MaxWorkItemSizes() []int {
+ dims := d.MaxWorkItemDimensions()
+ sizes := make([]C.size_t, dims)
+ if err := C.clGetDeviceInfo(d.id, C.CL_DEVICE_MAX_WORK_ITEM_SIZES, C.size_t(int(unsafe.Sizeof(sizes[0]))*dims), unsafe.Pointer(&sizes[0]), nil); err != C.CL_SUCCESS {
+ panic("Failed to get max work item sizes")
+ }
+ intSizes := make([]int, dims)
+ for i, s := range sizes {
+ intSizes[i] = int(s)
+ }
+ return intSizes
+}
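
Most of device.go is thin accessors over clGetDeviceInfo. For GPU mining the query that matters is Type(); a small sketch of device selection that mirrors the loop in cl_test.go, using GetPlatforms (from platform.go) and the GetDevices, Type and Available queries defined above — the package name and import path are illustrative:

```go
package clutil

import (
	"errors"

	"github.com/Gustav-Simonsson/go-opencl/cl" // vendored path, assumed
)

// pickGPU returns the first available GPU device on the first platform,
// falling back to whatever device exists (e.g. a CPU implementation).
func pickGPU() (*cl.Device, error) {
	platforms, err := cl.GetPlatforms()
	if err != nil {
		return nil, err
	}
	if len(platforms) == 0 {
		return nil, errors.New("no OpenCL platforms found")
	}
	devices, err := cl.GetDevices(platforms[0], cl.DeviceTypeAll)
	if err != nil {
		return nil, err
	}
	if len(devices) == 0 {
		return nil, errors.New("no OpenCL devices found")
	}
	for _, d := range devices {
		if d.Type() == cl.DeviceTypeGPU && d.Available() {
			return d, nil
		}
	}
	return devices[0], nil
}
```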
diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/device12.go b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/device12.go
new file mode 100644
index 000000000..b4b559a6a
--- /dev/null
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/device12.go
@@ -0,0 +1,51 @@
+// +build cl12
+
+package cl
+
+// #ifdef __APPLE__
+// #include "OpenCL/opencl.h"
+// #else
+// #include "cl.h"
+// #endif
+import "C"
+import "unsafe"
+
+const FPConfigCorrectlyRoundedDivideSqrt FPConfig = C.CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT
+
+func init() {
+ fpConfigNameMap[FPConfigCorrectlyRoundedDivideSqrt] = "CorrectlyRoundedDivideSqrt"
+}
+
+func (d *Device) BuiltInKernels() string {
+ str, _ := d.GetInfoString(C.CL_DEVICE_BUILT_IN_KERNELS, true)
+ return str
+}
+
+// Is CL_FALSE if the implementation does not have a linker available. Is CL_TRUE if the linker is available. This can be CL_FALSE for the embedded platform profile only. This must be CL_TRUE if CL_DEVICE_COMPILER_AVAILABLE is CL_TRUE
+func (d *Device) LinkerAvailable() bool {
+ val, _ := d.getInfoBool(C.CL_DEVICE_LINKER_AVAILABLE, true)
+ return val
+}
+
+func (d *Device) ParentDevice() *Device {
+ var deviceId C.cl_device_id
+ if err := C.clGetDeviceInfo(d.id, C.CL_DEVICE_PARENT_DEVICE, C.size_t(unsafe.Sizeof(deviceId)), unsafe.Pointer(&deviceId), nil); err != C.CL_SUCCESS {
+ panic("ParentDevice failed")
+ }
+ if deviceId == nil {
+ return nil
+ }
+ return &Device{id: deviceId}
+}
+
+// Max number of pixels for a 1D image created from a buffer object. The minimum value is 65536 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE.
+func (d *Device) ImageMaxBufferSize() int {
+ val, _ := d.getInfoSize(C.CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, true)
+ return int(val)
+}
+
+// Max number of images in a 1D or 2D image array. The minimum value is 2048 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE
+func (d *Device) ImageMaxArraySize() int {
+ val, _ := d.getInfoSize(C.CL_DEVICE_IMAGE_MAX_ARRAY_SIZE, true)
+ return int(val)
+}
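
device12.go carries a `// +build cl12` constraint, so the OpenCL 1.2-only queries (BuiltInKernels, LinkerAvailable, ParentDevice, ImageMaxBufferSize, ImageMaxArraySize) are compiled only when that tag is passed, e.g. `go build -tags cl12`. A hedged sketch of a caller that opts in by carrying the same tag; the helper name, package name and import path are assumptions for illustration:

```go
// +build cl12

package clutil

import (
	"fmt"

	"github.com/Gustav-Simonsson/go-opencl/cl" // vendored path, assumed
)

// printCL12Info logs the 1.2-only device properties; this file compiles only
// under `go build -tags cl12`, matching the constraint on device12.go.
func printCL12Info(d *cl.Device) {
	fmt.Println("built-in kernels:   ", d.BuiltInKernels())
	fmt.Println("linker available:   ", d.LinkerAvailable())
	fmt.Println("1D image max buffer:", d.ImageMaxBufferSize())
	fmt.Println("image max array:    ", d.ImageMaxArraySize())
}
```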
diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl.h b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl.h
new file mode 100644
index 000000000..c38aa1eda
--- /dev/null
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl.h
@@ -0,0 +1,1210 @@
+/*******************************************************************************
+ * Copyright (c) 2008 - 2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+#ifndef __OPENCL_CL_H
+#define __OPENCL_CL_H
+
+#include <cl_platform.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/******************************************************************************/
+
+typedef struct _cl_platform_id * cl_platform_id;
+typedef struct _cl_device_id * cl_device_id;
+typedef struct _cl_context * cl_context;
+typedef struct _cl_command_queue * cl_command_queue;
+typedef struct _cl_mem * cl_mem;
+typedef struct _cl_program * cl_program;
+typedef struct _cl_kernel * cl_kernel;
+typedef struct _cl_event * cl_event;
+typedef struct _cl_sampler * cl_sampler;
+
+typedef cl_uint cl_bool; /* WARNING! Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */
+typedef cl_ulong cl_bitfield;
+typedef cl_bitfield cl_device_type;
+typedef cl_uint cl_platform_info;
+typedef cl_uint cl_device_info;
+typedef cl_bitfield cl_device_fp_config;
+typedef cl_uint cl_device_mem_cache_type;
+typedef cl_uint cl_device_local_mem_type;
+typedef cl_bitfield cl_device_exec_capabilities;
+typedef cl_bitfield cl_command_queue_properties;
+typedef intptr_t cl_device_partition_property;
+typedef cl_bitfield cl_device_affinity_domain;
+
+typedef intptr_t cl_context_properties;
+typedef cl_uint cl_context_info;
+typedef cl_uint cl_command_queue_info;
+typedef cl_uint cl_channel_order;
+typedef cl_uint cl_channel_type;
+typedef cl_bitfield cl_mem_flags;
+typedef cl_uint cl_mem_object_type;
+typedef cl_uint cl_mem_info;
+typedef cl_bitfield cl_mem_migration_flags;
+typedef cl_uint cl_image_info;
+typedef cl_uint cl_buffer_create_type;
+typedef cl_uint cl_addressing_mode;
+typedef cl_uint cl_filter_mode;
+typedef cl_uint cl_sampler_info;
+typedef cl_bitfield cl_map_flags;
+typedef cl_uint cl_program_info;
+typedef cl_uint cl_program_build_info;
+typedef cl_uint cl_program_binary_type;
+typedef cl_int cl_build_status;
+typedef cl_uint cl_kernel_info;
+typedef cl_uint cl_kernel_arg_info;
+typedef cl_uint cl_kernel_arg_address_qualifier;
+typedef cl_uint cl_kernel_arg_access_qualifier;
+typedef cl_bitfield cl_kernel_arg_type_qualifier;
+typedef cl_uint cl_kernel_work_group_info;
+typedef cl_uint cl_event_info;
+typedef cl_uint cl_command_type;
+typedef cl_uint cl_profiling_info;
+
+
+typedef struct _cl_image_format {
+ cl_channel_order image_channel_order;
+ cl_channel_type image_channel_data_type;
+} cl_image_format;
+
+typedef struct _cl_image_desc {
+ cl_mem_object_type image_type;
+ size_t image_width;
+ size_t image_height;
+ size_t image_depth;
+ size_t image_array_size;
+ size_t image_row_pitch;
+ size_t image_slice_pitch;
+ cl_uint num_mip_levels;
+ cl_uint num_samples;
+ cl_mem buffer;
+} cl_image_desc;
+
+typedef struct _cl_buffer_region {
+ size_t origin;
+ size_t size;
+} cl_buffer_region;
+
+
+/******************************************************************************/
+
+/* Error Codes */
+#define CL_SUCCESS 0
+#define CL_DEVICE_NOT_FOUND -1
+#define CL_DEVICE_NOT_AVAILABLE -2
+#define CL_COMPILER_NOT_AVAILABLE -3
+#define CL_MEM_OBJECT_ALLOCATION_FAILURE -4
+#define CL_OUT_OF_RESOURCES -5
+#define CL_OUT_OF_HOST_MEMORY -6
+#define CL_PROFILING_INFO_NOT_AVAILABLE -7
+#define CL_MEM_COPY_OVERLAP -8
+#define CL_IMAGE_FORMAT_MISMATCH -9
+#define CL_IMAGE_FORMAT_NOT_SUPPORTED -10
+#define CL_BUILD_PROGRAM_FAILURE -11
+#define CL_MAP_FAILURE -12
+#define CL_MISALIGNED_SUB_BUFFER_OFFSET -13
+#define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14
+#define CL_COMPILE_PROGRAM_FAILURE -15
+#define CL_LINKER_NOT_AVAILABLE -16
+#define CL_LINK_PROGRAM_FAILURE -17
+#define CL_DEVICE_PARTITION_FAILED -18
+#define CL_KERNEL_ARG_INFO_NOT_AVAILABLE -19
+
+#define CL_INVALID_VALUE -30
+#define CL_INVALID_DEVICE_TYPE -31
+#define CL_INVALID_PLATFORM -32
+#define CL_INVALID_DEVICE -33
+#define CL_INVALID_CONTEXT -34
+#define CL_INVALID_QUEUE_PROPERTIES -35
+#define CL_INVALID_COMMAND_QUEUE -36
+#define CL_INVALID_HOST_PTR -37
+#define CL_INVALID_MEM_OBJECT -38
+#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR -39
+#define CL_INVALID_IMAGE_SIZE -40
+#define CL_INVALID_SAMPLER -41
+#define CL_INVALID_BINARY -42
+#define CL_INVALID_BUILD_OPTIONS -43
+#define CL_INVALID_PROGRAM -44
+#define CL_INVALID_PROGRAM_EXECUTABLE -45
+#define CL_INVALID_KERNEL_NAME -46
+#define CL_INVALID_KERNEL_DEFINITION -47
+#define CL_INVALID_KERNEL -48
+#define CL_INVALID_ARG_INDEX -49
+#define CL_INVALID_ARG_VALUE -50
+#define CL_INVALID_ARG_SIZE -51
+#define CL_INVALID_KERNEL_ARGS -52
+#define CL_INVALID_WORK_DIMENSION -53
+#define CL_INVALID_WORK_GROUP_SIZE -54
+#define CL_INVALID_WORK_ITEM_SIZE -55
+#define CL_INVALID_GLOBAL_OFFSET -56
+#define CL_INVALID_EVENT_WAIT_LIST -57
+#define CL_INVALID_EVENT -58
+#define CL_INVALID_OPERATION -59
+#define CL_INVALID_GL_OBJECT -60
+#define CL_INVALID_BUFFER_SIZE -61
+#define CL_INVALID_MIP_LEVEL -62
+#define CL_INVALID_GLOBAL_WORK_SIZE -63
+#define CL_INVALID_PROPERTY -64
+#define CL_INVALID_IMAGE_DESCRIPTOR -65
+#define CL_INVALID_COMPILER_OPTIONS -66
+#define CL_INVALID_LINKER_OPTIONS -67
+#define CL_INVALID_DEVICE_PARTITION_COUNT -68
+
+/* OpenCL Version */
+#define CL_VERSION_1_0 1
+#define CL_VERSION_1_1 1
+#define CL_VERSION_1_2 1
+
+/* cl_bool */
+#define CL_FALSE 0
+#define CL_TRUE 1
+#define CL_BLOCKING CL_TRUE
+#define CL_NON_BLOCKING CL_FALSE
+
+/* cl_platform_info */
+#define CL_PLATFORM_PROFILE 0x0900
+#define CL_PLATFORM_VERSION 0x0901
+#define CL_PLATFORM_NAME 0x0902
+#define CL_PLATFORM_VENDOR 0x0903
+#define CL_PLATFORM_EXTENSIONS 0x0904
+
+/* cl_device_type - bitfield */
+#define CL_DEVICE_TYPE_DEFAULT (1 << 0)
+#define CL_DEVICE_TYPE_CPU (1 << 1)
+#define CL_DEVICE_TYPE_GPU (1 << 2)
+#define CL_DEVICE_TYPE_ACCELERATOR (1 << 3)
+#define CL_DEVICE_TYPE_CUSTOM (1 << 4)
+#define CL_DEVICE_TYPE_ALL 0xFFFFFFFF
+
+/* cl_device_info */
+#define CL_DEVICE_TYPE 0x1000
+#define CL_DEVICE_VENDOR_ID 0x1001
+#define CL_DEVICE_MAX_COMPUTE_UNITS 0x1002
+#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 0x1003
+#define CL_DEVICE_MAX_WORK_GROUP_SIZE 0x1004
+#define CL_DEVICE_MAX_WORK_ITEM_SIZES 0x1005
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR 0x1006
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT 0x1007
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT 0x1008
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG 0x1009
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE 0x100B
+#define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C
+#define CL_DEVICE_ADDRESS_BITS 0x100D
+#define CL_DEVICE_MAX_READ_IMAGE_ARGS 0x100E
+#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS 0x100F
+#define CL_DEVICE_MAX_MEM_ALLOC_SIZE 0x1010
+#define CL_DEVICE_IMAGE2D_MAX_WIDTH 0x1011
+#define CL_DEVICE_IMAGE2D_MAX_HEIGHT 0x1012
+#define CL_DEVICE_IMAGE3D_MAX_WIDTH 0x1013
+#define CL_DEVICE_IMAGE3D_MAX_HEIGHT 0x1014
+#define CL_DEVICE_IMAGE3D_MAX_DEPTH 0x1015
+#define CL_DEVICE_IMAGE_SUPPORT 0x1016
+#define CL_DEVICE_MAX_PARAMETER_SIZE 0x1017
+#define CL_DEVICE_MAX_SAMPLERS 0x1018
+#define CL_DEVICE_MEM_BASE_ADDR_ALIGN 0x1019
+#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE 0x101A
+#define CL_DEVICE_SINGLE_FP_CONFIG 0x101B
+#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE 0x101C
+#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE 0x101D
+#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE 0x101E
+#define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F
+#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE 0x1020
+#define CL_DEVICE_MAX_CONSTANT_ARGS 0x1021
+#define CL_DEVICE_LOCAL_MEM_TYPE 0x1022
+#define CL_DEVICE_LOCAL_MEM_SIZE 0x1023
+#define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024
+#define CL_DEVICE_PROFILING_TIMER_RESOLUTION 0x1025
+#define CL_DEVICE_ENDIAN_LITTLE 0x1026
+#define CL_DEVICE_AVAILABLE 0x1027
+#define CL_DEVICE_COMPILER_AVAILABLE 0x1028
+#define CL_DEVICE_EXECUTION_CAPABILITIES 0x1029
+#define CL_DEVICE_QUEUE_PROPERTIES 0x102A
+#define CL_DEVICE_NAME 0x102B
+#define CL_DEVICE_VENDOR 0x102C
+#define CL_DRIVER_VERSION 0x102D
+#define CL_DEVICE_PROFILE 0x102E
+#define CL_DEVICE_VERSION 0x102F
+#define CL_DEVICE_EXTENSIONS 0x1030
+#define CL_DEVICE_PLATFORM 0x1031
+#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032
+/* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG */
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF 0x1034
+#define CL_DEVICE_HOST_UNIFIED_MEMORY 0x1035
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR 0x1036
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT 0x1037
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT 0x1038
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG 0x1039
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT 0x103A
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE 0x103B
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF 0x103C
+#define CL_DEVICE_OPENCL_C_VERSION 0x103D
+#define CL_DEVICE_LINKER_AVAILABLE 0x103E
+#define CL_DEVICE_BUILT_IN_KERNELS 0x103F
+#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE 0x1040
+#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE 0x1041
+#define CL_DEVICE_PARENT_DEVICE 0x1042
+#define CL_DEVICE_PARTITION_MAX_SUB_DEVICES 0x1043
+#define CL_DEVICE_PARTITION_PROPERTIES 0x1044
+#define CL_DEVICE_PARTITION_AFFINITY_DOMAIN 0x1045
+#define CL_DEVICE_PARTITION_TYPE 0x1046
+#define CL_DEVICE_REFERENCE_COUNT 0x1047
+#define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC 0x1048
+#define CL_DEVICE_PRINTF_BUFFER_SIZE 0x1049
+#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT 0x104A
+#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT 0x104B
+
+/* cl_device_fp_config - bitfield */
+#define CL_FP_DENORM (1 << 0)
+#define CL_FP_INF_NAN (1 << 1)
+#define CL_FP_ROUND_TO_NEAREST (1 << 2)
+#define CL_FP_ROUND_TO_ZERO (1 << 3)
+#define CL_FP_ROUND_TO_INF (1 << 4)
+#define CL_FP_FMA (1 << 5)
+#define CL_FP_SOFT_FLOAT (1 << 6)
+#define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT (1 << 7)
+
+/* cl_device_mem_cache_type */
+#define CL_NONE 0x0
+#define CL_READ_ONLY_CACHE 0x1
+#define CL_READ_WRITE_CACHE 0x2
+
+/* cl_device_local_mem_type */
+#define CL_LOCAL 0x1
+#define CL_GLOBAL 0x2
+
+/* cl_device_exec_capabilities - bitfield */
+#define CL_EXEC_KERNEL (1 << 0)
+#define CL_EXEC_NATIVE_KERNEL (1 << 1)
+
+/* cl_command_queue_properties - bitfield */
+#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE (1 << 0)
+#define CL_QUEUE_PROFILING_ENABLE (1 << 1)
+
+/* cl_context_info */
+#define CL_CONTEXT_REFERENCE_COUNT 0x1080
+#define CL_CONTEXT_DEVICES 0x1081
+#define CL_CONTEXT_PROPERTIES 0x1082
+#define CL_CONTEXT_NUM_DEVICES 0x1083
+
+/* cl_context_properties */
+#define CL_CONTEXT_PLATFORM 0x1084
+#define CL_CONTEXT_INTEROP_USER_SYNC 0x1085
+
+/* cl_device_partition_property */
+#define CL_DEVICE_PARTITION_EQUALLY 0x1086
+#define CL_DEVICE_PARTITION_BY_COUNTS 0x1087
+#define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END 0x0
+#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN 0x1088
+
+/* cl_device_affinity_domain */
+#define CL_DEVICE_AFFINITY_DOMAIN_NUMA (1 << 0)
+#define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE (1 << 1)
+#define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE (1 << 2)
+#define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE (1 << 3)
+#define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE (1 << 4)
+#define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE (1 << 5)
+
+/* cl_command_queue_info */
+#define CL_QUEUE_CONTEXT 0x1090
+#define CL_QUEUE_DEVICE 0x1091
+#define CL_QUEUE_REFERENCE_COUNT 0x1092
+#define CL_QUEUE_PROPERTIES 0x1093
+
+/* cl_mem_flags - bitfield */
+#define CL_MEM_READ_WRITE (1 << 0)
+#define CL_MEM_WRITE_ONLY (1 << 1)
+#define CL_MEM_READ_ONLY (1 << 2)
+#define CL_MEM_USE_HOST_PTR (1 << 3)
+#define CL_MEM_ALLOC_HOST_PTR (1 << 4)
+#define CL_MEM_COPY_HOST_PTR (1 << 5)
+/* reserved (1 << 6) */
+#define CL_MEM_HOST_WRITE_ONLY (1 << 7)
+#define CL_MEM_HOST_READ_ONLY (1 << 8)
+#define CL_MEM_HOST_NO_ACCESS (1 << 9)
+
+/* cl_mem_migration_flags - bitfield */
+#define CL_MIGRATE_MEM_OBJECT_HOST (1 << 0)
+#define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED (1 << 1)
+
+/* cl_channel_order */
+#define CL_R 0x10B0
+#define CL_A 0x10B1
+#define CL_RG 0x10B2
+#define CL_RA 0x10B3
+#define CL_RGB 0x10B4
+#define CL_RGBA 0x10B5
+#define CL_BGRA 0x10B6
+#define CL_ARGB 0x10B7
+#define CL_INTENSITY 0x10B8
+#define CL_LUMINANCE 0x10B9
+#define CL_Rx 0x10BA
+#define CL_RGx 0x10BB
+#define CL_RGBx 0x10BC
+#define CL_DEPTH 0x10BD
+#define CL_DEPTH_STENCIL 0x10BE
+
+/* cl_channel_type */
+#define CL_SNORM_INT8 0x10D0
+#define CL_SNORM_INT16 0x10D1
+#define CL_UNORM_INT8 0x10D2
+#define CL_UNORM_INT16 0x10D3
+#define CL_UNORM_SHORT_565 0x10D4
+#define CL_UNORM_SHORT_555 0x10D5
+#define CL_UNORM_INT_101010 0x10D6
+#define CL_SIGNED_INT8 0x10D7
+#define CL_SIGNED_INT16 0x10D8
+#define CL_SIGNED_INT32 0x10D9
+#define CL_UNSIGNED_INT8 0x10DA
+#define CL_UNSIGNED_INT16 0x10DB
+#define CL_UNSIGNED_INT32 0x10DC
+#define CL_HALF_FLOAT 0x10DD
+#define CL_FLOAT 0x10DE
+#define CL_UNORM_INT24 0x10DF
+
+/* cl_mem_object_type */
+#define CL_MEM_OBJECT_BUFFER 0x10F0
+#define CL_MEM_OBJECT_IMAGE2D 0x10F1
+#define CL_MEM_OBJECT_IMAGE3D 0x10F2
+#define CL_MEM_OBJECT_IMAGE2D_ARRAY 0x10F3
+#define CL_MEM_OBJECT_IMAGE1D 0x10F4
+#define CL_MEM_OBJECT_IMAGE1D_ARRAY 0x10F5
+#define CL_MEM_OBJECT_IMAGE1D_BUFFER 0x10F6
+
+/* cl_mem_info */
+#define CL_MEM_TYPE 0x1100
+#define CL_MEM_FLAGS 0x1101
+#define CL_MEM_SIZE 0x1102
+#define CL_MEM_HOST_PTR 0x1103
+#define CL_MEM_MAP_COUNT 0x1104
+#define CL_MEM_REFERENCE_COUNT 0x1105
+#define CL_MEM_CONTEXT 0x1106
+#define CL_MEM_ASSOCIATED_MEMOBJECT 0x1107
+#define CL_MEM_OFFSET 0x1108
+
+/* cl_image_info */
+#define CL_IMAGE_FORMAT 0x1110
+#define CL_IMAGE_ELEMENT_SIZE 0x1111
+#define CL_IMAGE_ROW_PITCH 0x1112
+#define CL_IMAGE_SLICE_PITCH 0x1113
+#define CL_IMAGE_WIDTH 0x1114
+#define CL_IMAGE_HEIGHT 0x1115
+#define CL_IMAGE_DEPTH 0x1116
+#define CL_IMAGE_ARRAY_SIZE 0x1117
+#define CL_IMAGE_BUFFER 0x1118
+#define CL_IMAGE_NUM_MIP_LEVELS 0x1119
+#define CL_IMAGE_NUM_SAMPLES 0x111A
+
+/* cl_addressing_mode */
+#define CL_ADDRESS_NONE 0x1130
+#define CL_ADDRESS_CLAMP_TO_EDGE 0x1131
+#define CL_ADDRESS_CLAMP 0x1132
+#define CL_ADDRESS_REPEAT 0x1133
+#define CL_ADDRESS_MIRRORED_REPEAT 0x1134
+
+/* cl_filter_mode */
+#define CL_FILTER_NEAREST 0x1140
+#define CL_FILTER_LINEAR 0x1141
+
+/* cl_sampler_info */
+#define CL_SAMPLER_REFERENCE_COUNT 0x1150
+#define CL_SAMPLER_CONTEXT 0x1151
+#define CL_SAMPLER_NORMALIZED_COORDS 0x1152
+#define CL_SAMPLER_ADDRESSING_MODE 0x1153
+#define CL_SAMPLER_FILTER_MODE 0x1154
+
+/* cl_map_flags - bitfield */
+#define CL_MAP_READ (1 << 0)
+#define CL_MAP_WRITE (1 << 1)
+#define CL_MAP_WRITE_INVALIDATE_REGION (1 << 2)
+
+/* cl_program_info */
+#define CL_PROGRAM_REFERENCE_COUNT 0x1160
+#define CL_PROGRAM_CONTEXT 0x1161
+#define CL_PROGRAM_NUM_DEVICES 0x1162
+#define CL_PROGRAM_DEVICES 0x1163
+#define CL_PROGRAM_SOURCE 0x1164
+#define CL_PROGRAM_BINARY_SIZES 0x1165
+#define CL_PROGRAM_BINARIES 0x1166
+#define CL_PROGRAM_NUM_KERNELS 0x1167
+#define CL_PROGRAM_KERNEL_NAMES 0x1168
+
+/* cl_program_build_info */
+#define CL_PROGRAM_BUILD_STATUS 0x1181
+#define CL_PROGRAM_BUILD_OPTIONS 0x1182
+#define CL_PROGRAM_BUILD_LOG 0x1183
+#define CL_PROGRAM_BINARY_TYPE 0x1184
+
+/* cl_program_binary_type */
+#define CL_PROGRAM_BINARY_TYPE_NONE 0x0
+#define CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT 0x1
+#define CL_PROGRAM_BINARY_TYPE_LIBRARY 0x2
+#define CL_PROGRAM_BINARY_TYPE_EXECUTABLE 0x4
+
+/* cl_build_status */
+#define CL_BUILD_SUCCESS 0
+#define CL_BUILD_NONE -1
+#define CL_BUILD_ERROR -2
+#define CL_BUILD_IN_PROGRESS -3
+
+/* cl_kernel_info */
+#define CL_KERNEL_FUNCTION_NAME 0x1190
+#define CL_KERNEL_NUM_ARGS 0x1191
+#define CL_KERNEL_REFERENCE_COUNT 0x1192
+#define CL_KERNEL_CONTEXT 0x1193
+#define CL_KERNEL_PROGRAM 0x1194
+#define CL_KERNEL_ATTRIBUTES 0x1195
+
+/* cl_kernel_arg_info */
+#define CL_KERNEL_ARG_ADDRESS_QUALIFIER 0x1196
+#define CL_KERNEL_ARG_ACCESS_QUALIFIER 0x1197
+#define CL_KERNEL_ARG_TYPE_NAME 0x1198
+#define CL_KERNEL_ARG_TYPE_QUALIFIER 0x1199
+#define CL_KERNEL_ARG_NAME 0x119A
+
+/* cl_kernel_arg_address_qualifier */
+#define CL_KERNEL_ARG_ADDRESS_GLOBAL 0x119B
+#define CL_KERNEL_ARG_ADDRESS_LOCAL 0x119C
+#define CL_KERNEL_ARG_ADDRESS_CONSTANT 0x119D
+#define CL_KERNEL_ARG_ADDRESS_PRIVATE 0x119E
+
+/* cl_kernel_arg_access_qualifier */
+#define CL_KERNEL_ARG_ACCESS_READ_ONLY 0x11A0
+#define CL_KERNEL_ARG_ACCESS_WRITE_ONLY 0x11A1
+#define CL_KERNEL_ARG_ACCESS_READ_WRITE 0x11A2
+#define CL_KERNEL_ARG_ACCESS_NONE 0x11A3
+
+/* cl_kernel_arg_type_qualifier */
+#define CL_KERNEL_ARG_TYPE_NONE 0
+#define CL_KERNEL_ARG_TYPE_CONST (1 << 0)
+#define CL_KERNEL_ARG_TYPE_RESTRICT (1 << 1)
+#define CL_KERNEL_ARG_TYPE_VOLATILE (1 << 2)
+
+/* cl_kernel_work_group_info */
+#define CL_KERNEL_WORK_GROUP_SIZE 0x11B0
+#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE 0x11B1
+#define CL_KERNEL_LOCAL_MEM_SIZE 0x11B2
+#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3
+#define CL_KERNEL_PRIVATE_MEM_SIZE 0x11B4
+#define CL_KERNEL_GLOBAL_WORK_SIZE 0x11B5
+
+/* cl_event_info */
+#define CL_EVENT_COMMAND_QUEUE 0x11D0
+#define CL_EVENT_COMMAND_TYPE 0x11D1
+#define CL_EVENT_REFERENCE_COUNT 0x11D2
+#define CL_EVENT_COMMAND_EXECUTION_STATUS 0x11D3
+#define CL_EVENT_CONTEXT 0x11D4
+
+/* cl_command_type */
+#define CL_COMMAND_NDRANGE_KERNEL 0x11F0
+#define CL_COMMAND_TASK 0x11F1
+#define CL_COMMAND_NATIVE_KERNEL 0x11F2
+#define CL_COMMAND_READ_BUFFER 0x11F3
+#define CL_COMMAND_WRITE_BUFFER 0x11F4
+#define CL_COMMAND_COPY_BUFFER 0x11F5
+#define CL_COMMAND_READ_IMAGE 0x11F6
+#define CL_COMMAND_WRITE_IMAGE 0x11F7
+#define CL_COMMAND_COPY_IMAGE 0x11F8
+#define CL_COMMAND_COPY_IMAGE_TO_BUFFER 0x11F9
+#define CL_COMMAND_COPY_BUFFER_TO_IMAGE 0x11FA
+#define CL_COMMAND_MAP_BUFFER 0x11FB
+#define CL_COMMAND_MAP_IMAGE 0x11FC
+#define CL_COMMAND_UNMAP_MEM_OBJECT 0x11FD
+#define CL_COMMAND_MARKER 0x11FE
+#define CL_COMMAND_ACQUIRE_GL_OBJECTS 0x11FF
+#define CL_COMMAND_RELEASE_GL_OBJECTS 0x1200
+#define CL_COMMAND_READ_BUFFER_RECT 0x1201
+#define CL_COMMAND_WRITE_BUFFER_RECT 0x1202
+#define CL_COMMAND_COPY_BUFFER_RECT 0x1203
+#define CL_COMMAND_USER 0x1204
+#define CL_COMMAND_BARRIER 0x1205
+#define CL_COMMAND_MIGRATE_MEM_OBJECTS 0x1206
+#define CL_COMMAND_FILL_BUFFER 0x1207
+#define CL_COMMAND_FILL_IMAGE 0x1208
+
+/* command execution status */
+#define CL_COMPLETE 0x0
+#define CL_RUNNING 0x1
+#define CL_SUBMITTED 0x2
+#define CL_QUEUED 0x3
+
+/* cl_buffer_create_type */
+#define CL_BUFFER_CREATE_TYPE_REGION 0x1220
+
+/* cl_profiling_info */
+#define CL_PROFILING_COMMAND_QUEUED 0x1280
+#define CL_PROFILING_COMMAND_SUBMIT 0x1281
+#define CL_PROFILING_COMMAND_START 0x1282
+#define CL_PROFILING_COMMAND_END 0x1283
+
+/********************************************************************************************************/
+
+/* Platform API */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetPlatformIDs(cl_uint /* num_entries */,
+ cl_platform_id * /* platforms */,
+ cl_uint * /* num_platforms */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetPlatformInfo(cl_platform_id /* platform */,
+ cl_platform_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Device APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceIDs(cl_platform_id /* platform */,
+ cl_device_type /* device_type */,
+ cl_uint /* num_entries */,
+ cl_device_id * /* devices */,
+ cl_uint * /* num_devices */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceInfo(cl_device_id /* device */,
+ cl_device_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clCreateSubDevices(cl_device_id /* in_device */,
+ const cl_device_partition_property * /* properties */,
+ cl_uint /* num_devices */,
+ cl_device_id * /* out_devices */,
+ cl_uint * /* num_devices_ret */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainDevice(cl_device_id /* device */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseDevice(cl_device_id /* device */) CL_API_SUFFIX__VERSION_1_2;
+
+/* Context APIs */
+extern CL_API_ENTRY cl_context CL_API_CALL
+clCreateContext(const cl_context_properties * /* properties */,
+ cl_uint /* num_devices */,
+ const cl_device_id * /* devices */,
+ void (CL_CALLBACK * /* pfn_notify */)(const char *, const void *, size_t, void *),
+ void * /* user_data */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_context CL_API_CALL
+clCreateContextFromType(const cl_context_properties * /* properties */,
+ cl_device_type /* device_type */,
+ void (CL_CALLBACK * /* pfn_notify*/ )(const char *, const void *, size_t, void *),
+ void * /* user_data */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetContextInfo(cl_context /* context */,
+ cl_context_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Command Queue APIs */
+extern CL_API_ENTRY cl_command_queue CL_API_CALL
+clCreateCommandQueue(cl_context /* context */,
+ cl_device_id /* device */,
+ cl_command_queue_properties /* properties */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetCommandQueueInfo(cl_command_queue /* command_queue */,
+ cl_command_queue_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Memory Object APIs */
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateBuffer(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ size_t /* size */,
+ void * /* host_ptr */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateSubBuffer(cl_mem /* buffer */,
+ cl_mem_flags /* flags */,
+ cl_buffer_create_type /* buffer_create_type */,
+ const void * /* buffer_create_info */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateImage(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ const cl_image_format * /* image_format */,
+ const cl_image_desc * /* image_desc */,
+ void * /* host_ptr */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetSupportedImageFormats(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ cl_mem_object_type /* image_type */,
+ cl_uint /* num_entries */,
+ cl_image_format * /* image_formats */,
+ cl_uint * /* num_image_formats */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetMemObjectInfo(cl_mem /* memobj */,
+ cl_mem_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetImageInfo(cl_mem /* image */,
+ cl_image_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetMemObjectDestructorCallback( cl_mem /* memobj */,
+ void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/),
+ void * /*user_data */ ) CL_API_SUFFIX__VERSION_1_1;
+
+/* Sampler APIs */
+extern CL_API_ENTRY cl_sampler CL_API_CALL
+clCreateSampler(cl_context /* context */,
+ cl_bool /* normalized_coords */,
+ cl_addressing_mode /* addressing_mode */,
+ cl_filter_mode /* filter_mode */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetSamplerInfo(cl_sampler /* sampler */,
+ cl_sampler_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Program Object APIs */
+extern CL_API_ENTRY cl_program CL_API_CALL
+clCreateProgramWithSource(cl_context /* context */,
+ cl_uint /* count */,
+ const char ** /* strings */,
+ const size_t * /* lengths */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_program CL_API_CALL
+clCreateProgramWithBinary(cl_context /* context */,
+ cl_uint /* num_devices */,
+ const cl_device_id * /* device_list */,
+ const size_t * /* lengths */,
+ const unsigned char ** /* binaries */,
+ cl_int * /* binary_status */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_program CL_API_CALL
+clCreateProgramWithBuiltInKernels(cl_context /* context */,
+ cl_uint /* num_devices */,
+ const cl_device_id * /* device_list */,
+ const char * /* kernel_names */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clBuildProgram(cl_program /* program */,
+ cl_uint /* num_devices */,
+ const cl_device_id * /* device_list */,
+ const char * /* options */,
+ void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
+ void * /* user_data */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clCompileProgram(cl_program /* program */,
+ cl_uint /* num_devices */,
+ const cl_device_id * /* device_list */,
+ const char * /* options */,
+ cl_uint /* num_input_headers */,
+ const cl_program * /* input_headers */,
+ const char ** /* header_include_names */,
+ void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
+ void * /* user_data */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_program CL_API_CALL
+clLinkProgram(cl_context /* context */,
+ cl_uint /* num_devices */,
+ const cl_device_id * /* device_list */,
+ const char * /* options */,
+ cl_uint /* num_input_programs */,
+ const cl_program * /* input_programs */,
+ void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
+ void * /* user_data */,
+ cl_int * /* errcode_ret */ ) CL_API_SUFFIX__VERSION_1_2;
+
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clUnloadPlatformCompiler(cl_platform_id /* platform */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetProgramInfo(cl_program /* program */,
+ cl_program_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetProgramBuildInfo(cl_program /* program */,
+ cl_device_id /* device */,
+ cl_program_build_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Kernel Object APIs */
+extern CL_API_ENTRY cl_kernel CL_API_CALL
+clCreateKernel(cl_program /* program */,
+ const char * /* kernel_name */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clCreateKernelsInProgram(cl_program /* program */,
+ cl_uint /* num_kernels */,
+ cl_kernel * /* kernels */,
+ cl_uint * /* num_kernels_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelArg(cl_kernel /* kernel */,
+ cl_uint /* arg_index */,
+ size_t /* arg_size */,
+ const void * /* arg_value */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelInfo(cl_kernel /* kernel */,
+ cl_kernel_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelArgInfo(cl_kernel /* kernel */,
+ cl_uint /* arg_indx */,
+ cl_kernel_arg_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelWorkGroupInfo(cl_kernel /* kernel */,
+ cl_device_id /* device */,
+ cl_kernel_work_group_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Event Object APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clWaitForEvents(cl_uint /* num_events */,
+ const cl_event * /* event_list */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetEventInfo(cl_event /* event */,
+ cl_event_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_event CL_API_CALL
+clCreateUserEvent(cl_context /* context */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetUserEventStatus(cl_event /* event */,
+ cl_int /* execution_status */) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetEventCallback( cl_event /* event */,
+ cl_int /* command_exec_callback_type */,
+ void (CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *),
+ void * /* user_data */) CL_API_SUFFIX__VERSION_1_1;
+
+/* Profiling APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetEventProfilingInfo(cl_event /* event */,
+ cl_profiling_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Flush and Finish APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clFlush(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clFinish(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Enqueued Commands APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReadBuffer(cl_command_queue /* command_queue */,
+ cl_mem /* buffer */,
+ cl_bool /* blocking_read */,
+ size_t /* offset */,
+ size_t /* size */,
+ void * /* ptr */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReadBufferRect(cl_command_queue /* command_queue */,
+ cl_mem /* buffer */,
+ cl_bool /* blocking_read */,
+ const size_t * /* buffer_offset */,
+ const size_t * /* host_offset */,
+ const size_t * /* region */,
+ size_t /* buffer_row_pitch */,
+ size_t /* buffer_slice_pitch */,
+ size_t /* host_row_pitch */,
+ size_t /* host_slice_pitch */,
+ void * /* ptr */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWriteBuffer(cl_command_queue /* command_queue */,
+ cl_mem /* buffer */,
+ cl_bool /* blocking_write */,
+ size_t /* offset */,
+ size_t /* size */,
+ const void * /* ptr */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWriteBufferRect(cl_command_queue /* command_queue */,
+ cl_mem /* buffer */,
+ cl_bool /* blocking_write */,
+ const size_t * /* buffer_offset */,
+ const size_t * /* host_offset */,
+ const size_t * /* region */,
+ size_t /* buffer_row_pitch */,
+ size_t /* buffer_slice_pitch */,
+ size_t /* host_row_pitch */,
+ size_t /* host_slice_pitch */,
+ const void * /* ptr */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueFillBuffer(cl_command_queue /* command_queue */,
+ cl_mem /* buffer */,
+ const void * /* pattern */,
+ size_t /* pattern_size */,
+ size_t /* offset */,
+ size_t /* size */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyBuffer(cl_command_queue /* command_queue */,
+ cl_mem /* src_buffer */,
+ cl_mem /* dst_buffer */,
+ size_t /* src_offset */,
+ size_t /* dst_offset */,
+ size_t /* size */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyBufferRect(cl_command_queue /* command_queue */,
+ cl_mem /* src_buffer */,
+ cl_mem /* dst_buffer */,
+ const size_t * /* src_origin */,
+ const size_t * /* dst_origin */,
+ const size_t * /* region */,
+ size_t /* src_row_pitch */,
+ size_t /* src_slice_pitch */,
+ size_t /* dst_row_pitch */,
+ size_t /* dst_slice_pitch */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReadImage(cl_command_queue /* command_queue */,
+ cl_mem /* image */,
+ cl_bool /* blocking_read */,
+ const size_t * /* origin[3] */,
+ const size_t * /* region[3] */,
+ size_t /* row_pitch */,
+ size_t /* slice_pitch */,
+ void * /* ptr */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWriteImage(cl_command_queue /* command_queue */,
+ cl_mem /* image */,
+ cl_bool /* blocking_write */,
+ const size_t * /* origin[3] */,
+ const size_t * /* region[3] */,
+ size_t /* input_row_pitch */,
+ size_t /* input_slice_pitch */,
+ const void * /* ptr */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueFillImage(cl_command_queue /* command_queue */,
+ cl_mem /* image */,
+ const void * /* fill_color */,
+ const size_t * /* origin[3] */,
+ const size_t * /* region[3] */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyImage(cl_command_queue /* command_queue */,
+ cl_mem /* src_image */,
+ cl_mem /* dst_image */,
+ const size_t * /* src_origin[3] */,
+ const size_t * /* dst_origin[3] */,
+ const size_t * /* region[3] */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyImageToBuffer(cl_command_queue /* command_queue */,
+ cl_mem /* src_image */,
+ cl_mem /* dst_buffer */,
+ const size_t * /* src_origin[3] */,
+ const size_t * /* region[3] */,
+ size_t /* dst_offset */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyBufferToImage(cl_command_queue /* command_queue */,
+ cl_mem /* src_buffer */,
+ cl_mem /* dst_image */,
+ size_t /* src_offset */,
+ const size_t * /* dst_origin[3] */,
+ const size_t * /* region[3] */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY void * CL_API_CALL
+clEnqueueMapBuffer(cl_command_queue /* command_queue */,
+ cl_mem /* buffer */,
+ cl_bool /* blocking_map */,
+ cl_map_flags /* map_flags */,
+ size_t /* offset */,
+ size_t /* size */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY void * CL_API_CALL
+clEnqueueMapImage(cl_command_queue /* command_queue */,
+ cl_mem /* image */,
+ cl_bool /* blocking_map */,
+ cl_map_flags /* map_flags */,
+ const size_t * /* origin[3] */,
+ const size_t * /* region[3] */,
+ size_t * /* image_row_pitch */,
+ size_t * /* image_slice_pitch */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueUnmapMemObject(cl_command_queue /* command_queue */,
+ cl_mem /* memobj */,
+ void * /* mapped_ptr */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueMigrateMemObjects(cl_command_queue /* command_queue */,
+ cl_uint /* num_mem_objects */,
+ const cl_mem * /* mem_objects */,
+ cl_mem_migration_flags /* flags */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueNDRangeKernel(cl_command_queue /* command_queue */,
+ cl_kernel /* kernel */,
+ cl_uint /* work_dim */,
+ const size_t * /* global_work_offset */,
+ const size_t * /* global_work_size */,
+ const size_t * /* local_work_size */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueTask(cl_command_queue /* command_queue */,
+ cl_kernel /* kernel */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueNativeKernel(cl_command_queue /* command_queue */,
+ void (CL_CALLBACK * /*user_func*/)(void *),
+ void * /* args */,
+ size_t /* cb_args */,
+ cl_uint /* num_mem_objects */,
+ const cl_mem * /* mem_list */,
+ const void ** /* args_mem_loc */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueMarkerWithWaitList(cl_command_queue /* command_queue */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueBarrierWithWaitList(cl_command_queue /* command_queue */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2;
+
+
+/* Extension function access
+ *
+ * Returns the extension function address for the given function name,
+ * or NULL if a valid function can not be found. The client must
+ * check to make sure the address is not NULL, before using or
+ * calling the returned function address.
+ */
+extern CL_API_ENTRY void * CL_API_CALL
+clGetExtensionFunctionAddressForPlatform(cl_platform_id /* platform */,
+ const char * /* func_name */) CL_API_SUFFIX__VERSION_1_2;
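
A minimal usage sketch for the declaration above, assuming a standard installation where these headers are included as <CL/cl.h> and <CL/cl_ext.h>; the queried extension name is only an example, and the NULL check mirrors the requirement stated in the comment:

#include <stdio.h>
#include <CL/cl.h>
#include <CL/cl_ext.h>

int main(void)
{
    cl_platform_id platform;
    cl_uint num_platforms = 0;

    /* Pick the first available platform. */
    if (clGetPlatformIDs(1, &platform, &num_platforms) != CL_SUCCESS || num_platforms == 0) {
        fprintf(stderr, "no OpenCL platform found\n");
        return 1;
    }

    /* Look up an extension entry point; the name used here is illustrative. */
    void *fn = clGetExtensionFunctionAddressForPlatform(platform, "clIcdGetPlatformIDsKHR");
    if (fn == NULL) {
        /* The platform does not expose this extension; fn must not be called. */
        fprintf(stderr, "extension entry point not available\n");
        return 0;
    }

    /* clIcdGetPlatformIDsKHR_fn is the pointer type declared in cl_ext.h below. */
    clIcdGetPlatformIDsKHR_fn get_ids = (clIcdGetPlatformIDsKHR_fn)fn;
    cl_uint n = 0;
    get_ids(0, NULL, &n);
    printf("platforms reported via the ICD extension: %u\n", n);
    return 0;
}
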
+
+
+/* Deprecated OpenCL 1.1 APIs */
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+clCreateImage2D(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ const cl_image_format * /* image_format */,
+ size_t /* image_width */,
+ size_t /* image_height */,
+ size_t /* image_row_pitch */,
+ void * /* host_ptr */,
+ cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+clCreateImage3D(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ const cl_image_format * /* image_format */,
+ size_t /* image_width */,
+ size_t /* image_height */,
+ size_t /* image_depth */,
+ size_t /* image_row_pitch */,
+ size_t /* image_slice_pitch */,
+ void * /* host_ptr */,
+ cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
+clEnqueueMarker(cl_command_queue /* command_queue */,
+ cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
+clEnqueueWaitForEvents(cl_command_queue /* command_queue */,
+ cl_uint /* num_events */,
+ const cl_event * /* event_list */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
+clEnqueueBarrier(cl_command_queue /* command_queue */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
+clUnloadCompiler(void) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED void * CL_API_CALL
+clGetExtensionFunctionAddress(const char * /* func_name */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __OPENCL_CL_H */
+
diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl_ext.h b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl_ext.h
new file mode 100644
index 000000000..a998c9c51
--- /dev/null
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl_ext.h
@@ -0,0 +1,315 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2013 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+/* $Revision: 11928 $ on $Date: 2010-07-13 09:04:56 -0700 (Tue, 13 Jul 2010) $ */
+
+/* cl_ext.h contains OpenCL extensions which don't have external */
+/* (OpenGL, D3D) dependencies. */
+
+#ifndef __CL_EXT_H
+#define __CL_EXT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __APPLE__
+#include <AvailabilityMacros.h>
+#endif
+
+#include <cl.h>
+
+/* cl_khr_fp16 extension - no extension #define since it has no functions */
+#define CL_DEVICE_HALF_FP_CONFIG 0x1033
+
+/* Memory object destruction
+ *
+ * Apple extension for use to manage externally allocated buffers used with cl_mem objects with CL_MEM_USE_HOST_PTR
+ *
+ * Registers a user callback function that will be called when the memory object is deleted and its resources
+ * freed. Each call to clSetMemObjectCallbackFn registers the specified user callback function on a callback
+ * stack associated with memobj. The registered user callback functions are called in the reverse order in
+ * which they were registered. The user callback functions are called and then the memory object is deleted
+ * and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be
+ * notified when the memory referenced by host_ptr, specified when the memory object is created and used as
+ * the storage bits for the memory object, can be reused or freed.
+ *
+ * The application may not call CL APIs with the cl_mem object passed to the pfn_notify.
+ *
+ * Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
+ * before using.
+ */
+#define cl_APPLE_SetMemObjectDestructor 1
+cl_int CL_API_ENTRY clSetMemObjectDestructorAPPLE( cl_mem /* memobj */,
+ void (* /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/),
+ void * /*user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
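
A sketch of how the declaration above is meant to be used; the allocation size and names are placeholders, and the call is only valid where the device reports the cl_APPLE_SetMemObjectDestructor extension:

#include <stdlib.h>
#include <CL/cl.h>
#include <CL/cl_ext.h>

/* Called once the cl_mem is deleted; only then may the externally allocated
 * storage passed as user_data be reused or freed. */
static void host_buffer_released(cl_mem memobj, void *user_data)
{
    (void)memobj;
    free(user_data);
}

/* Creates a buffer over externally allocated host memory and registers the
 * Apple destructor callback for it (error handling elided). */
static cl_mem make_tracked_buffer(cl_context ctx, size_t size)
{
    void  *host_mem = malloc(size);
    cl_int err;
    cl_mem buf = clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR, size, host_mem, &err);

    clSetMemObjectDestructorAPPLE(buf, host_buffer_released, host_mem);
    return buf;
}
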
+
+
+/* Context Logging Functions
+ *
+ * The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext().
+ * Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
+ * before using.
+ *
+ * clLogMessagesToSystemLog forwards all log messages to the Apple System Logger
+ */
+#define cl_APPLE_ContextLoggingFunctions 1
+extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE( const char * /* errstr */,
+ const void * /* private_info */,
+ size_t /* cb */,
+ void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
+
+/* clLogMessagesToStdout sends all log messages to the file descriptor stdout */
+extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE( const char * /* errstr */,
+ const void * /* private_info */,
+ size_t /* cb */,
+ void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
+
+/* clLogMessagesToStderr sends all log messages to the file descriptor stderr */
+extern void CL_API_ENTRY clLogMessagesToStderrAPPLE( const char * /* errstr */,
+ const void * /* private_info */,
+ size_t /* cb */,
+ void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
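
A sketch of the intended use of these helpers as the pfn_notify argument of clCreateContext, assuming Apple's OpenCL framework; device selection and error handling are reduced to the minimum:

#include <stdio.h>
#include <CL/cl.h>
#include <CL/cl_ext.h>

int main(void)
{
    cl_platform_id platform;
    cl_device_id   device;
    cl_int         err;

    clGetPlatformIDs(1, &platform, NULL);
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, NULL);

    /* Route context error reports to stderr via the Apple logging helper. */
    cl_context ctx = clCreateContext(NULL, 1, &device,
                                     clLogMessagesToStderrAPPLE, NULL, &err);
    if (err != CL_SUCCESS) {
        fprintf(stderr, "clCreateContext failed: %d\n", err);
        return 1;
    }
    clReleaseContext(ctx);
    return 0;
}
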
+
+
+/************************
+* cl_khr_icd extension *
+************************/
+#define cl_khr_icd 1
+
+/* cl_platform_info */
+#define CL_PLATFORM_ICD_SUFFIX_KHR 0x0920
+
+/* Additional Error Codes */
+#define CL_PLATFORM_NOT_FOUND_KHR -1001
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clIcdGetPlatformIDsKHR(cl_uint /* num_entries */,
+ cl_platform_id * /* platforms */,
+ cl_uint * /* num_platforms */);
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(
+ cl_uint /* num_entries */,
+ cl_platform_id * /* platforms */,
+ cl_uint * /* num_platforms */);
+
+
+/* Extension: cl_khr_image2D_buffer
+ *
+ * This extension allows a 2D image to be created from a cl_mem buffer without a copy.
+ * The type associated with a 2D image created from a buffer in an OpenCL program is image2d_t.
+ * Both the sampler and sampler-less read_image built-in functions are supported for 2D images
+ * and 2D images created from a buffer. Similarly, the write_image built-ins are also supported
+ * for 2D images created from a buffer.
+ *
+ * When the 2D image from buffer is created, the client must specify the width,
+ * height, image format (i.e. channel order and channel data type) and optionally the row pitch
+ *
+ * The pitch specified must be a multiple of CL_DEVICE_IMAGE_PITCH_ALIGNMENT pixels.
+ * The base address of the buffer must be aligned to CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT pixels.
+ */
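
A sketch of creating such an image with clCreateImage over an existing buffer; the format, dimensions and pitch are placeholders and must respect the alignment queries mentioned above:

#include <string.h>
#include <CL/cl.h>

/* Wraps an existing buffer as a 2D image without copying (error handling
 * elided). row_pitch must be a multiple of CL_DEVICE_IMAGE_PITCH_ALIGNMENT
 * pixels for the target device. */
static cl_mem image2d_from_buffer(cl_context ctx, cl_mem buffer,
                                  size_t width, size_t height, size_t row_pitch)
{
    cl_image_format fmt = { CL_RGBA, CL_UNORM_INT8 };
    cl_image_desc   desc;
    cl_int          err;

    memset(&desc, 0, sizeof(desc));
    desc.image_type      = CL_MEM_OBJECT_IMAGE2D;
    desc.image_width     = width;
    desc.image_height    = height;
    desc.image_row_pitch = row_pitch;
    desc.buffer          = buffer;   /* the buffer that supplies the image storage */

    return clCreateImage(ctx, CL_MEM_READ_WRITE, &fmt, &desc, NULL, &err);
}
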
+
+/*************************************
+ * cl_khr_initialize_memory extension *
+ *************************************/
+
+#define CL_CONTEXT_MEMORY_INITIALIZE_KHR 0x200E
+
+
+/**************************************
+ * cl_khr_terminate_context extension *
+ **************************************/
+
+#define CL_DEVICE_TERMINATE_CAPABILITY_KHR 0x200F
+#define CL_CONTEXT_TERMINATE_KHR 0x2010
+
+#define cl_khr_terminate_context 1
+extern CL_API_ENTRY cl_int CL_API_CALL clTerminateContextKHR(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clTerminateContextKHR_fn)(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
+
+
+/*
+ * Extension: cl_khr_spir
+ *
+ * This extension adds support to create an OpenCL program object from a
+ * Standard Portable Intermediate Representation (SPIR) instance
+ */
+
+#define CL_DEVICE_SPIR_VERSIONS 0x40E0
+#define CL_PROGRAM_BINARY_TYPE_INTERMEDIATE 0x40E1
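
A sketch of loading a SPIR module through the existing clCreateProgramWithBinary entry point; the "-x spir" build option is the one defined by this extension, and the binary itself is assumed to come from a SPIR-capable compiler:

#include <CL/cl.h>

/* Builds a program object from a SPIR instance (error handling elided). */
static cl_program program_from_spir(cl_context ctx, cl_device_id dev,
                                    const unsigned char *spir, size_t spir_len)
{
    cl_int     binary_status, err;
    cl_program prog = clCreateProgramWithBinary(ctx, 1, &dev, &spir_len,
                                                &spir, &binary_status, &err);

    /* "-x spir" tells the runtime the binary is SPIR, not a device binary. */
    clBuildProgram(prog, 1, &dev, "-x spir", NULL, NULL);
    return prog;
}
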
+
+
+/******************************************
+* cl_nv_device_attribute_query extension *
+******************************************/
+/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
+#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
+#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
+#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
+#define CL_DEVICE_WARP_SIZE_NV 0x4003
+#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
+#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
+#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
+
+/*********************************
+* cl_amd_device_attribute_query *
+*********************************/
+#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD 0x4036
+
+/*********************************
+* cl_arm_printf extension
+*********************************/
+#define CL_PRINTF_CALLBACK_ARM 0x40B0
+#define CL_PRINTF_BUFFERSIZE_ARM 0x40B1
+
+#ifdef CL_VERSION_1_1
+ /***********************************
+ * cl_ext_device_fission extension *
+ ***********************************/
+ #define cl_ext_device_fission 1
+
+ extern CL_API_ENTRY cl_int CL_API_CALL
+ clReleaseDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+ typedef CL_API_ENTRY cl_int
+ (CL_API_CALL *clReleaseDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+ extern CL_API_ENTRY cl_int CL_API_CALL
+ clRetainDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+ typedef CL_API_ENTRY cl_int
+ (CL_API_CALL *clRetainDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+ typedef cl_ulong cl_device_partition_property_ext;
+ extern CL_API_ENTRY cl_int CL_API_CALL
+ clCreateSubDevicesEXT( cl_device_id /*in_device*/,
+ const cl_device_partition_property_ext * /* properties */,
+ cl_uint /*num_entries*/,
+ cl_device_id * /*out_devices*/,
+ cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+ typedef CL_API_ENTRY cl_int
+ ( CL_API_CALL * clCreateSubDevicesEXT_fn)( cl_device_id /*in_device*/,
+ const cl_device_partition_property_ext * /* properties */,
+ cl_uint /*num_entries*/,
+ cl_device_id * /*out_devices*/,
+ cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+ /* cl_device_partition_property_ext */
+ #define CL_DEVICE_PARTITION_EQUALLY_EXT 0x4050
+ #define CL_DEVICE_PARTITION_BY_COUNTS_EXT 0x4051
+ #define CL_DEVICE_PARTITION_BY_NAMES_EXT 0x4052
+ #define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT 0x4053
+
+ /* clDeviceGetInfo selectors */
+ #define CL_DEVICE_PARENT_DEVICE_EXT 0x4054
+ #define CL_DEVICE_PARTITION_TYPES_EXT 0x4055
+ #define CL_DEVICE_AFFINITY_DOMAINS_EXT 0x4056
+ #define CL_DEVICE_REFERENCE_COUNT_EXT 0x4057
+ #define CL_DEVICE_PARTITION_STYLE_EXT 0x4058
+
+ /* error codes */
+ #define CL_DEVICE_PARTITION_FAILED_EXT -1057
+ #define CL_INVALID_PARTITION_COUNT_EXT -1058
+ #define CL_INVALID_PARTITION_NAME_EXT -1059
+
+ /* CL_AFFINITY_DOMAINs */
+ #define CL_AFFINITY_DOMAIN_L1_CACHE_EXT 0x1
+ #define CL_AFFINITY_DOMAIN_L2_CACHE_EXT 0x2
+ #define CL_AFFINITY_DOMAIN_L3_CACHE_EXT 0x3
+ #define CL_AFFINITY_DOMAIN_L4_CACHE_EXT 0x4
+ #define CL_AFFINITY_DOMAIN_NUMA_EXT 0x10
+ #define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT 0x100
+
+ /* cl_device_partition_property_ext list terminators */
+ #define CL_PROPERTIES_LIST_END_EXT ((cl_device_partition_property_ext) 0)
+ #define CL_PARTITION_BY_COUNTS_LIST_END_EXT ((cl_device_partition_property_ext) 0)
+ #define CL_PARTITION_BY_NAMES_LIST_END_EXT ((cl_device_partition_property_ext) 0 - 1)
+
+/*********************************
+* cl_qcom_ext_host_ptr extension
+*********************************/
+
+#define CL_MEM_EXT_HOST_PTR_QCOM (1 << 29)
+
+#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM 0x40A0
+#define CL_DEVICE_PAGE_SIZE_QCOM 0x40A1
+#define CL_IMAGE_ROW_ALIGNMENT_QCOM 0x40A2
+#define CL_IMAGE_SLICE_ALIGNMENT_QCOM 0x40A3
+#define CL_MEM_HOST_UNCACHED_QCOM 0x40A4
+#define CL_MEM_HOST_WRITEBACK_QCOM 0x40A5
+#define CL_MEM_HOST_WRITETHROUGH_QCOM 0x40A6
+#define CL_MEM_HOST_WRITE_COMBINING_QCOM 0x40A7
+
+typedef cl_uint cl_image_pitch_info_qcom;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceImageInfoQCOM(cl_device_id device,
+ size_t image_width,
+ size_t image_height,
+ const cl_image_format *image_format,
+ cl_image_pitch_info_qcom param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret);
+
+typedef struct _cl_mem_ext_host_ptr
+{
+ /* Type of external memory allocation. */
+ /* Legal values will be defined in layered extensions. */
+ cl_uint allocation_type;
+
+ /* Host cache policy for this external memory allocation. */
+ cl_uint host_cache_policy;
+
+} cl_mem_ext_host_ptr;
+
+/*********************************
+* cl_qcom_ion_host_ptr extension
+*********************************/
+
+#define CL_MEM_ION_HOST_PTR_QCOM 0x40A8
+
+typedef struct _cl_mem_ion_host_ptr
+{
+ /* Type of external memory allocation. */
+ /* Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations. */
+ cl_mem_ext_host_ptr ext_host_ptr;
+
+ /* ION file descriptor */
+ int ion_filedesc;
+
+ /* Host pointer to the ION allocated memory */
+ void* ion_hostptr;
+
+} cl_mem_ion_host_ptr;
+
+#endif /* CL_VERSION_1_1 */
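
A sketch of wrapping an ION allocation with the structures above; obtaining the ION file descriptor and its mapping is platform specific and not shown, and the flag combination is an assumption based on vendor sample code that should be checked against Qualcomm's extension documentation:

#include <string.h>
#include <CL/cl.h>
#include <CL/cl_ext.h>

/* Wraps an ION allocation that is already mapped into the process as a cl_mem
 * (error handling elided). */
static cl_mem buffer_from_ion(cl_context ctx, int ion_fd,
                              void *ion_mapped_ptr, size_t size)
{
    cl_mem_ion_host_ptr ion;
    cl_int err;

    memset(&ion, 0, sizeof(ion));
    ion.ext_host_ptr.allocation_type   = CL_MEM_ION_HOST_PTR_QCOM;
    ion.ext_host_ptr.host_cache_policy = CL_MEM_HOST_UNCACHED_QCOM;
    ion.ion_filedesc = ion_fd;
    ion.ion_hostptr  = ion_mapped_ptr;

    return clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR | CL_MEM_EXT_HOST_PTR_QCOM,
                          size, &ion, &err);
}
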
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* __CL_EXT_H */
diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl_gl.h b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl_gl.h
new file mode 100644
index 000000000..ad914cbd4
--- /dev/null
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl_gl.h
@@ -0,0 +1,158 @@
+/**********************************************************************************
+ * Copyright (c) 2008 - 2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+#ifndef __OPENCL_CL_GL_H
+#define __OPENCL_CL_GL_H
+
+#include <cl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef cl_uint cl_gl_object_type;
+typedef cl_uint cl_gl_texture_info;
+typedef cl_uint cl_gl_platform_info;
+typedef struct __GLsync *cl_GLsync;
+
+/* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken */
+#define CL_GL_OBJECT_BUFFER 0x2000
+#define CL_GL_OBJECT_TEXTURE2D 0x2001
+#define CL_GL_OBJECT_TEXTURE3D 0x2002
+#define CL_GL_OBJECT_RENDERBUFFER 0x2003
+#define CL_GL_OBJECT_TEXTURE2D_ARRAY 0x200E
+#define CL_GL_OBJECT_TEXTURE1D 0x200F
+#define CL_GL_OBJECT_TEXTURE1D_ARRAY 0x2010
+#define CL_GL_OBJECT_TEXTURE_BUFFER 0x2011
+
+/* cl_gl_texture_info */
+#define CL_GL_TEXTURE_TARGET 0x2004
+#define CL_GL_MIPMAP_LEVEL 0x2005
+#define CL_GL_NUM_SAMPLES 0x2012
+
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLBuffer(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ cl_GLuint /* bufobj */,
+ int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLTexture(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ cl_GLenum /* target */,
+ cl_GLint /* miplevel */,
+ cl_GLuint /* texture */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLRenderbuffer(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ cl_GLuint /* renderbuffer */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGLObjectInfo(cl_mem /* memobj */,
+ cl_gl_object_type * /* gl_object_type */,
+ cl_GLuint * /* gl_object_name */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGLTextureInfo(cl_mem /* memobj */,
+ cl_gl_texture_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueAcquireGLObjects(cl_command_queue /* command_queue */,
+ cl_uint /* num_objects */,
+ const cl_mem * /* mem_objects */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReleaseGLObjects(cl_command_queue /* command_queue */,
+ cl_uint /* num_objects */,
+ const cl_mem * /* mem_objects */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+
+/* Deprecated OpenCL 1.1 APIs */
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+clCreateFromGLTexture2D(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ cl_GLenum /* target */,
+ cl_GLint /* miplevel */,
+ cl_GLuint /* texture */,
+ cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+clCreateFromGLTexture3D(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ cl_GLenum /* target */,
+ cl_GLint /* miplevel */,
+ cl_GLuint /* texture */,
+ cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+/* cl_khr_gl_sharing extension */
+
+#define cl_khr_gl_sharing 1
+
+typedef cl_uint cl_gl_context_info;
+
+/* Additional Error Codes */
+#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000
+
+/* cl_gl_context_info */
+#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006
+#define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007
+
+/* Additional cl_context_properties */
+#define CL_GL_CONTEXT_KHR 0x2008
+#define CL_EGL_DISPLAY_KHR 0x2009
+#define CL_GLX_DISPLAY_KHR 0x200A
+#define CL_WGL_HDC_KHR 0x200B
+#define CL_CGL_SHAREGROUP_KHR 0x200C
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGLContextInfoKHR(const cl_context_properties * /* properties */,
+ cl_gl_context_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)(
+ const cl_context_properties * properties,
+ cl_gl_context_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __OPENCL_CL_GL_H */
diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl_gl_ext.h b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl_gl_ext.h
new file mode 100644
index 000000000..0c10fedfa
--- /dev/null
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl_gl_ext.h
@@ -0,0 +1,65 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+
+/* cl_gl_ext.h contains vendor (non-KHR) OpenCL extensions which have */
+/* OpenGL dependencies. */
+
+#ifndef __OPENCL_CL_GL_EXT_H
+#define __OPENCL_CL_GL_EXT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <cl_gl.h>
+
+/*
+ * For each extension, follow this template
+ * cl_VEN_extname extension */
+/* #define cl_VEN_extname 1
+ * ... define new types, if any
+ * ... define new tokens, if any
+ * ... define new APIs, if any
+ *
+ * If you need GLtypes here, mirror them with a cl_GLtype, rather than including a GL header
+ * This allows us to avoid having to decide whether to include GL headers or GLES here.
+ */
+
+/*
+ * cl_khr_gl_event extension
+ * See section 9.9 in the OpenCL 1.1 spec for more information
+ */
+#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D
+
+extern CL_API_ENTRY cl_event CL_API_CALL
+clCreateEventFromGLsyncKHR(cl_context /* context */,
+ cl_GLsync /* cl_GLsync */,
+ cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __OPENCL_CL_GL_EXT_H */
diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl_platform.h b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl_platform.h
new file mode 100644
index 000000000..7f6f5e8a7
--- /dev/null
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl_platform.h
@@ -0,0 +1,1278 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+/* $Revision: 11803 $ on $Date: 2010-06-25 10:02:12 -0700 (Fri, 25 Jun 2010) $ */
+
+#ifndef __CL_PLATFORM_H
+#define __CL_PLATFORM_H
+
+#ifdef __APPLE__
+ /* Contains #defines for AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER below */
+ #include <AvailabilityMacros.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(_WIN32)
+ #define CL_API_ENTRY
+ #define CL_API_CALL __stdcall
+ #define CL_CALLBACK __stdcall
+#else
+ #define CL_API_ENTRY
+ #define CL_API_CALL
+ #define CL_CALLBACK
+#endif
+
+#ifdef __APPLE__
+ #define CL_EXTENSION_WEAK_LINK __attribute__((weak_import))
+ #define CL_API_SUFFIX__VERSION_1_0 AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
+ #define CL_EXT_SUFFIX__VERSION_1_0 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
+ #define CL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+ #define GCL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+ #define CL_EXT_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+ #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7
+
+ #ifdef AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
+ #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
+ #define GCL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
+ #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
+ #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+ #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8
+ #else
+ #warning This path should never happen outside of internal operating system development. AvailabilityMacros do not function correctly here!
+ #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+ #define GCL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+ #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+ #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+ #endif
+#else
+ #define CL_EXTENSION_WEAK_LINK
+ #define CL_API_SUFFIX__VERSION_1_0
+ #define CL_EXT_SUFFIX__VERSION_1_0
+ #define CL_API_SUFFIX__VERSION_1_1
+ #define CL_EXT_SUFFIX__VERSION_1_1
+ #define CL_API_SUFFIX__VERSION_1_2
+ #define CL_EXT_SUFFIX__VERSION_1_2
+
+ #ifdef __GNUC__
+ #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS
+ #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
+ #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
+ #else
+ #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED __attribute__((deprecated))
+ #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
+ #endif
+
+ #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS
+ #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+ #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+ #else
+ #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED __attribute__((deprecated))
+ #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+ #endif
+ #elif _WIN32
+ #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS
+ #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
+ #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
+ #else
+ #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
+ #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED __declspec(deprecated)
+ #endif
+
+ #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS
+ #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+ #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+ #else
+ #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+ #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED __declspec(deprecated)
+ #endif
+ #else
+ #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
+ #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
+
+ #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+ #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+ #endif
+#endif
+
+#if (defined (_WIN32) && defined(_MSC_VER))
+
+/* scalar types */
+typedef signed __int8 cl_char;
+typedef unsigned __int8 cl_uchar;
+typedef signed __int16 cl_short;
+typedef unsigned __int16 cl_ushort;
+typedef signed __int32 cl_int;
+typedef unsigned __int32 cl_uint;
+typedef signed __int64 cl_long;
+typedef unsigned __int64 cl_ulong;
+
+typedef unsigned __int16 cl_half;
+typedef float cl_float;
+typedef double cl_double;
+
+/* Macro names and corresponding values defined by OpenCL */
+#define CL_CHAR_BIT 8
+#define CL_SCHAR_MAX 127
+#define CL_SCHAR_MIN (-127-1)
+#define CL_CHAR_MAX CL_SCHAR_MAX
+#define CL_CHAR_MIN CL_SCHAR_MIN
+#define CL_UCHAR_MAX 255
+#define CL_SHRT_MAX 32767
+#define CL_SHRT_MIN (-32767-1)
+#define CL_USHRT_MAX 65535
+#define CL_INT_MAX 2147483647
+#define CL_INT_MIN (-2147483647-1)
+#define CL_UINT_MAX 0xffffffffU
+#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
+#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)
+#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
+
+#define CL_FLT_DIG 6
+#define CL_FLT_MANT_DIG 24
+#define CL_FLT_MAX_10_EXP +38
+#define CL_FLT_MAX_EXP +128
+#define CL_FLT_MIN_10_EXP -37
+#define CL_FLT_MIN_EXP -125
+#define CL_FLT_RADIX 2
+#define CL_FLT_MAX 340282346638528859811704183484516925440.0f
+#define CL_FLT_MIN 1.175494350822287507969e-38f
+#define CL_FLT_EPSILON 0x1.0p-23f
+
+#define CL_DBL_DIG 15
+#define CL_DBL_MANT_DIG 53
+#define CL_DBL_MAX_10_EXP +308
+#define CL_DBL_MAX_EXP +1024
+#define CL_DBL_MIN_10_EXP -307
+#define CL_DBL_MIN_EXP -1021
+#define CL_DBL_RADIX 2
+#define CL_DBL_MAX 179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0
+#define CL_DBL_MIN 2.225073858507201383090e-308
+#define CL_DBL_EPSILON 2.220446049250313080847e-16
+
+#define CL_M_E 2.718281828459045090796
+#define CL_M_LOG2E 1.442695040888963387005
+#define CL_M_LOG10E 0.434294481903251816668
+#define CL_M_LN2 0.693147180559945286227
+#define CL_M_LN10 2.302585092994045901094
+#define CL_M_PI 3.141592653589793115998
+#define CL_M_PI_2 1.570796326794896557999
+#define CL_M_PI_4 0.785398163397448278999
+#define CL_M_1_PI 0.318309886183790691216
+#define CL_M_2_PI 0.636619772367581382433
+#define CL_M_2_SQRTPI 1.128379167095512558561
+#define CL_M_SQRT2 1.414213562373095145475
+#define CL_M_SQRT1_2 0.707106781186547572737
+
+#define CL_M_E_F 2.71828174591064f
+#define CL_M_LOG2E_F 1.44269502162933f
+#define CL_M_LOG10E_F 0.43429449200630f
+#define CL_M_LN2_F 0.69314718246460f
+#define CL_M_LN10_F 2.30258512496948f
+#define CL_M_PI_F 3.14159274101257f
+#define CL_M_PI_2_F 1.57079637050629f
+#define CL_M_PI_4_F 0.78539818525314f
+#define CL_M_1_PI_F 0.31830987334251f
+#define CL_M_2_PI_F 0.63661974668503f
+#define CL_M_2_SQRTPI_F 1.12837922573090f
+#define CL_M_SQRT2_F 1.41421353816986f
+#define CL_M_SQRT1_2_F 0.70710676908493f
+
+#define CL_NAN (CL_INFINITY - CL_INFINITY)
+#define CL_HUGE_VALF ((cl_float) 1e50)
+#define CL_HUGE_VAL ((cl_double) 1e500)
+#define CL_MAXFLOAT CL_FLT_MAX
+#define CL_INFINITY CL_HUGE_VALF
+
+#else
+
+#include <stdint.h>
+
+/* scalar types */
+typedef int8_t cl_char;
+typedef uint8_t cl_uchar;
+typedef int16_t cl_short __attribute__((aligned(2)));
+typedef uint16_t cl_ushort __attribute__((aligned(2)));
+typedef int32_t cl_int __attribute__((aligned(4)));
+typedef uint32_t cl_uint __attribute__((aligned(4)));
+typedef int64_t cl_long __attribute__((aligned(8)));
+typedef uint64_t cl_ulong __attribute__((aligned(8)));
+
+typedef uint16_t cl_half __attribute__((aligned(2)));
+typedef float cl_float __attribute__((aligned(4)));
+typedef double cl_double __attribute__((aligned(8)));
+
+/* Macro names and corresponding values defined by OpenCL */
+#define CL_CHAR_BIT 8
+#define CL_SCHAR_MAX 127
+#define CL_SCHAR_MIN (-127-1)
+#define CL_CHAR_MAX CL_SCHAR_MAX
+#define CL_CHAR_MIN CL_SCHAR_MIN
+#define CL_UCHAR_MAX 255
+#define CL_SHRT_MAX 32767
+#define CL_SHRT_MIN (-32767-1)
+#define CL_USHRT_MAX 65535
+#define CL_INT_MAX 2147483647
+#define CL_INT_MIN (-2147483647-1)
+#define CL_UINT_MAX 0xffffffffU
+#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
+#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)
+#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
+
+#define CL_FLT_DIG 6
+#define CL_FLT_MANT_DIG 24
+#define CL_FLT_MAX_10_EXP +38
+#define CL_FLT_MAX_EXP +128
+#define CL_FLT_MIN_10_EXP -37
+#define CL_FLT_MIN_EXP -125
+#define CL_FLT_RADIX 2
+#define CL_FLT_MAX 0x1.fffffep127f
+#define CL_FLT_MIN 0x1.0p-126f
+#define CL_FLT_EPSILON 0x1.0p-23f
+
+#define CL_DBL_DIG 15
+#define CL_DBL_MANT_DIG 53
+#define CL_DBL_MAX_10_EXP +308
+#define CL_DBL_MAX_EXP +1024
+#define CL_DBL_MIN_10_EXP -307
+#define CL_DBL_MIN_EXP -1021
+#define CL_DBL_RADIX 2
+#define CL_DBL_MAX 0x1.fffffffffffffp1023
+#define CL_DBL_MIN 0x1.0p-1022
+#define CL_DBL_EPSILON 0x1.0p-52
+
+#define CL_M_E 2.718281828459045090796
+#define CL_M_LOG2E 1.442695040888963387005
+#define CL_M_LOG10E 0.434294481903251816668
+#define CL_M_LN2 0.693147180559945286227
+#define CL_M_LN10 2.302585092994045901094
+#define CL_M_PI 3.141592653589793115998
+#define CL_M_PI_2 1.570796326794896557999
+#define CL_M_PI_4 0.785398163397448278999
+#define CL_M_1_PI 0.318309886183790691216
+#define CL_M_2_PI 0.636619772367581382433
+#define CL_M_2_SQRTPI 1.128379167095512558561
+#define CL_M_SQRT2 1.414213562373095145475
+#define CL_M_SQRT1_2 0.707106781186547572737
+
+#define CL_M_E_F 2.71828174591064f
+#define CL_M_LOG2E_F 1.44269502162933f
+#define CL_M_LOG10E_F 0.43429449200630f
+#define CL_M_LN2_F 0.69314718246460f
+#define CL_M_LN10_F 2.30258512496948f
+#define CL_M_PI_F 3.14159274101257f
+#define CL_M_PI_2_F 1.57079637050629f
+#define CL_M_PI_4_F 0.78539818525314f
+#define CL_M_1_PI_F 0.31830987334251f
+#define CL_M_2_PI_F 0.63661974668503f
+#define CL_M_2_SQRTPI_F 1.12837922573090f
+#define CL_M_SQRT2_F 1.41421353816986f
+#define CL_M_SQRT1_2_F 0.70710676908493f
+
+#if defined( __GNUC__ )
+ #define CL_HUGE_VALF __builtin_huge_valf()
+ #define CL_HUGE_VAL __builtin_huge_val()
+ #define CL_NAN __builtin_nanf( "" )
+#else
+ #define CL_HUGE_VALF ((cl_float) 1e50)
+ #define CL_HUGE_VAL ((cl_double) 1e500)
+ float nanf( const char * );
+ #define CL_NAN nanf( "" )
+#endif
+#define CL_MAXFLOAT CL_FLT_MAX
+#define CL_INFINITY CL_HUGE_VALF
+
+#endif
+
+#include <stddef.h>
+
+/* Mirror types to GL types. Mirror types allow us to avoid deciding which headers to load based on whether we are using GL or GLES here. */
+typedef unsigned int cl_GLuint;
+typedef int cl_GLint;
+typedef unsigned int cl_GLenum;
+
+/*
+ * Vector types
+ *
+ * Note: OpenCL requires that all types be naturally aligned.
+ * This means that vector types must be naturally aligned.
+ * For example, a vector of four floats must be aligned to
+ * a 16 byte boundary (calculated as 4 * the natural 4-byte
+ * alignment of the float). The alignment qualifiers here
+ * will only function properly if your compiler supports them
+ * and if you don't actively work to defeat them. For example,
+ * in order for a cl_float4 to be 16 byte aligned in a struct,
+ * the start of the struct must itself be 16-byte aligned.
+ *
+ * Maintaining proper alignment is the user's responsibility.
+ */
+
+/* Define basic vector types */
+#if defined( __VEC__ )
+ #include <altivec.h> /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. */
+ typedef vector unsigned char __cl_uchar16;
+ typedef vector signed char __cl_char16;
+ typedef vector unsigned short __cl_ushort8;
+ typedef vector signed short __cl_short8;
+ typedef vector unsigned int __cl_uint4;
+ typedef vector signed int __cl_int4;
+ typedef vector float __cl_float4;
+ #define __CL_UCHAR16__ 1
+ #define __CL_CHAR16__ 1
+ #define __CL_USHORT8__ 1
+ #define __CL_SHORT8__ 1
+ #define __CL_UINT4__ 1
+ #define __CL_INT4__ 1
+ #define __CL_FLOAT4__ 1
+#endif
+
+#if defined( __SSE__ )
+ #if defined( __MINGW64__ )
+ #include <intrin.h>
+ #else
+ #include <xmmintrin.h>
+ #endif
+ #if defined( __GNUC__ )
+ typedef float __cl_float4 __attribute__((vector_size(16)));
+ #else
+ typedef __m128 __cl_float4;
+ #endif
+ #define __CL_FLOAT4__ 1
+#endif
+
+#if defined( __SSE2__ )
+ #if defined( __MINGW64__ )
+ #include <intrin.h>
+ #else
+ #include <emmintrin.h>
+ #endif
+ #if defined( __GNUC__ )
+ typedef cl_uchar __cl_uchar16 __attribute__((vector_size(16)));
+ typedef cl_char __cl_char16 __attribute__((vector_size(16)));
+ typedef cl_ushort __cl_ushort8 __attribute__((vector_size(16)));
+ typedef cl_short __cl_short8 __attribute__((vector_size(16)));
+ typedef cl_uint __cl_uint4 __attribute__((vector_size(16)));
+ typedef cl_int __cl_int4 __attribute__((vector_size(16)));
+ typedef cl_ulong __cl_ulong2 __attribute__((vector_size(16)));
+ typedef cl_long __cl_long2 __attribute__((vector_size(16)));
+ typedef cl_double __cl_double2 __attribute__((vector_size(16)));
+ #else
+ typedef __m128i __cl_uchar16;
+ typedef __m128i __cl_char16;
+ typedef __m128i __cl_ushort8;
+ typedef __m128i __cl_short8;
+ typedef __m128i __cl_uint4;
+ typedef __m128i __cl_int4;
+ typedef __m128i __cl_ulong2;
+ typedef __m128i __cl_long2;
+ typedef __m128d __cl_double2;
+ #endif
+ #define __CL_UCHAR16__ 1
+ #define __CL_CHAR16__ 1
+ #define __CL_USHORT8__ 1
+ #define __CL_SHORT8__ 1
+ #define __CL_INT4__ 1
+ #define __CL_UINT4__ 1
+ #define __CL_ULONG2__ 1
+ #define __CL_LONG2__ 1
+ #define __CL_DOUBLE2__ 1
+#endif
+
+#if defined( __MMX__ )
+ #include <mmintrin.h>
+ #if defined( __GNUC__ )
+ typedef cl_uchar __cl_uchar8 __attribute__((vector_size(8)));
+ typedef cl_char __cl_char8 __attribute__((vector_size(8)));
+ typedef cl_ushort __cl_ushort4 __attribute__((vector_size(8)));
+ typedef cl_short __cl_short4 __attribute__((vector_size(8)));
+ typedef cl_uint __cl_uint2 __attribute__((vector_size(8)));
+ typedef cl_int __cl_int2 __attribute__((vector_size(8)));
+ typedef cl_ulong __cl_ulong1 __attribute__((vector_size(8)));
+ typedef cl_long __cl_long1 __attribute__((vector_size(8)));
+ typedef cl_float __cl_float2 __attribute__((vector_size(8)));
+ #else
+ typedef __m64 __cl_uchar8;
+ typedef __m64 __cl_char8;
+ typedef __m64 __cl_ushort4;
+ typedef __m64 __cl_short4;
+ typedef __m64 __cl_uint2;
+ typedef __m64 __cl_int2;
+ typedef __m64 __cl_ulong1;
+ typedef __m64 __cl_long1;
+ typedef __m64 __cl_float2;
+ #endif
+ #define __CL_UCHAR8__ 1
+ #define __CL_CHAR8__ 1
+ #define __CL_USHORT4__ 1
+ #define __CL_SHORT4__ 1
+ #define __CL_INT2__ 1
+ #define __CL_UINT2__ 1
+ #define __CL_ULONG1__ 1
+ #define __CL_LONG1__ 1
+ #define __CL_FLOAT2__ 1
+#endif
+
+#if defined( __AVX__ )
+ #if defined( __MINGW64__ )
+ #include <intrin.h>
+ #else
+ #include <immintrin.h>
+ #endif
+ #if defined( __GNUC__ )
+ typedef cl_float __cl_float8 __attribute__((vector_size(32)));
+ typedef cl_double __cl_double4 __attribute__((vector_size(32)));
+ #else
+ typedef __m256 __cl_float8;
+ typedef __m256d __cl_double4;
+ #endif
+ #define __CL_FLOAT8__ 1
+ #define __CL_DOUBLE4__ 1
+#endif
+
+/* Define capabilities for anonymous struct members. */
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+#define __CL_HAS_ANON_STRUCT__ 1
+#define __CL_ANON_STRUCT__ __extension__
+#elif defined( _WIN32) && (_MSC_VER >= 1500)
+ /* Microsoft Developer Studio 2008 supports anonymous structs, but
+ * complains by default. */
+#define __CL_HAS_ANON_STRUCT__ 1
+#define __CL_ANON_STRUCT__
+ /* Disable warning C4201: nonstandard extension used : nameless
+ * struct/union */
+#pragma warning( push )
+#pragma warning( disable : 4201 )
+#else
+#define __CL_HAS_ANON_STRUCT__ 0
+#define __CL_ANON_STRUCT__
+#endif
+
+/* Define alignment keys */
+#if defined( __GNUC__ )
+ #define CL_ALIGNED(_x) __attribute__ ((aligned(_x)))
+#elif defined( _WIN32) && (_MSC_VER)
+ /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements */
+ /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx */
+ /* #include <crtdefs.h> */
+ /* #define CL_ALIGNED(_x) _CRT_ALIGN(_x) */
+ #define CL_ALIGNED(_x)
+#else
+ #warning Need to implement some method to align data here
+ #define CL_ALIGNED(_x)
+#endif
+
+/* Indicate whether .xyzw, .s0123 and .hi.lo are supported */
+#if __CL_HAS_ANON_STRUCT__
+ /* .xyzw and .s0123...{f|F} are supported */
+ #define CL_HAS_NAMED_VECTOR_FIELDS 1
+ /* .hi and .lo are supported */
+ #define CL_HAS_HI_LO_VECTOR_FIELDS 1
+#endif
+
+/* Define cl_vector types */
+
+/* ---- cl_charn ---- */
+typedef union
+{
+ cl_char CL_ALIGNED(2) s[2];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_char x, y; };
+ __CL_ANON_STRUCT__ struct{ cl_char s0, s1; };
+ __CL_ANON_STRUCT__ struct{ cl_char lo, hi; };
+#endif
+#if defined( __CL_CHAR2__)
+ __cl_char2 v2;
+#endif
+}cl_char2;
+
+typedef union
+{
+ cl_char CL_ALIGNED(4) s[4];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3; };
+ __CL_ANON_STRUCT__ struct{ cl_char2 lo, hi; };
+#endif
+#if defined( __CL_CHAR2__)
+ __cl_char2 v2[2];
+#endif
+#if defined( __CL_CHAR4__)
+ __cl_char4 v4;
+#endif
+}cl_char4;
+
+/* cl_char3 is identical in size, alignment and behavior to cl_char4. See section 6.1.5. */
+typedef cl_char4 cl_char3;
+
+typedef union
+{
+ cl_char CL_ALIGNED(8) s[8];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7; };
+ __CL_ANON_STRUCT__ struct{ cl_char4 lo, hi; };
+#endif
+#if defined( __CL_CHAR2__)
+ __cl_char2 v2[4];
+#endif
+#if defined( __CL_CHAR4__)
+ __cl_char4 v4[2];
+#endif
+#if defined( __CL_CHAR8__ )
+ __cl_char8 v8;
+#endif
+}cl_char8;
+
+typedef union
+{
+ cl_char CL_ALIGNED(16) s[16];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __CL_ANON_STRUCT__ struct{ cl_char8 lo, hi; };
+#endif
+#if defined( __CL_CHAR2__)
+ __cl_char2 v2[8];
+#endif
+#if defined( __CL_CHAR4__)
+ __cl_char4 v4[4];
+#endif
+#if defined( __CL_CHAR8__ )
+ __cl_char8 v8[2];
+#endif
+#if defined( __CL_CHAR16__ )
+ __cl_char16 v16;
+#endif
+}cl_char16;
+
+
+/* ---- cl_ucharn ---- */
+typedef union
+{
+ cl_uchar CL_ALIGNED(2) s[2];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_uchar x, y; };
+ __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1; };
+ __CL_ANON_STRUCT__ struct{ cl_uchar lo, hi; };
+#endif
+#if defined( __cl_uchar2__)
+ __cl_uchar2 v2;
+#endif
+}cl_uchar2;
+
+typedef union
+{
+ cl_uchar CL_ALIGNED(4) s[4];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3; };
+ __CL_ANON_STRUCT__ struct{ cl_uchar2 lo, hi; };
+#endif
+#if defined( __CL_UCHAR2__)
+ __cl_uchar2 v2[2];
+#endif
+#if defined( __CL_UCHAR4__)
+ __cl_uchar4 v4;
+#endif
+}cl_uchar4;
+
+/* cl_uchar3 is identical in size, alignment and behavior to cl_uchar4. See section 6.1.5. */
+typedef cl_uchar4 cl_uchar3;
+
+typedef union
+{
+ cl_uchar CL_ALIGNED(8) s[8];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7; };
+ __CL_ANON_STRUCT__ struct{ cl_uchar4 lo, hi; };
+#endif
+#if defined( __CL_UCHAR2__)
+ __cl_uchar2 v2[4];
+#endif
+#if defined( __CL_UCHAR4__)
+ __cl_uchar4 v4[2];
+#endif
+#if defined( __CL_UCHAR8__ )
+ __cl_uchar8 v8;
+#endif
+}cl_uchar8;
+
+typedef union
+{
+ cl_uchar CL_ALIGNED(16) s[16];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __CL_ANON_STRUCT__ struct{ cl_uchar8 lo, hi; };
+#endif
+#if defined( __CL_UCHAR2__)
+ __cl_uchar2 v2[8];
+#endif
+#if defined( __CL_UCHAR4__)
+ __cl_uchar4 v4[4];
+#endif
+#if defined( __CL_UCHAR8__ )
+ __cl_uchar8 v8[2];
+#endif
+#if defined( __CL_UCHAR16__ )
+ __cl_uchar16 v16;
+#endif
+}cl_uchar16;
+
+
+/* ---- cl_shortn ---- */
+typedef union
+{
+ cl_short CL_ALIGNED(4) s[2];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_short x, y; };
+ __CL_ANON_STRUCT__ struct{ cl_short s0, s1; };
+ __CL_ANON_STRUCT__ struct{ cl_short lo, hi; };
+#endif
+#if defined( __CL_SHORT2__)
+ __cl_short2 v2;
+#endif
+}cl_short2;
+
+typedef union
+{
+ cl_short CL_ALIGNED(8) s[4];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3; };
+ __CL_ANON_STRUCT__ struct{ cl_short2 lo, hi; };
+#endif
+#if defined( __CL_SHORT2__)
+ __cl_short2 v2[2];
+#endif
+#if defined( __CL_SHORT4__)
+ __cl_short4 v4;
+#endif
+}cl_short4;
+
+/* cl_short3 is identical in size, alignment and behavior to cl_short4. See section 6.1.5. */
+typedef cl_short4 cl_short3;
+
+typedef union
+{
+ cl_short CL_ALIGNED(16) s[8];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7; };
+ __CL_ANON_STRUCT__ struct{ cl_short4 lo, hi; };
+#endif
+#if defined( __CL_SHORT2__)
+ __cl_short2 v2[4];
+#endif
+#if defined( __CL_SHORT4__)
+ __cl_short4 v4[2];
+#endif
+#if defined( __CL_SHORT8__ )
+ __cl_short8 v8;
+#endif
+}cl_short8;
+
+typedef union
+{
+ cl_short CL_ALIGNED(32) s[16];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __CL_ANON_STRUCT__ struct{ cl_short8 lo, hi; };
+#endif
+#if defined( __CL_SHORT2__)
+ __cl_short2 v2[8];
+#endif
+#if defined( __CL_SHORT4__)
+ __cl_short4 v4[4];
+#endif
+#if defined( __CL_SHORT8__ )
+ __cl_short8 v8[2];
+#endif
+#if defined( __CL_SHORT16__ )
+ __cl_short16 v16;
+#endif
+}cl_short16;
+
+
+/* ---- cl_ushortn ---- */
+typedef union
+{
+ cl_ushort CL_ALIGNED(4) s[2];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_ushort x, y; };
+ __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1; };
+ __CL_ANON_STRUCT__ struct{ cl_ushort lo, hi; };
+#endif
+#if defined( __CL_USHORT2__)
+ __cl_ushort2 v2;
+#endif
+}cl_ushort2;
+
+typedef union
+{
+ cl_ushort CL_ALIGNED(8) s[4];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3; };
+ __CL_ANON_STRUCT__ struct{ cl_ushort2 lo, hi; };
+#endif
+#if defined( __CL_USHORT2__)
+ __cl_ushort2 v2[2];
+#endif
+#if defined( __CL_USHORT4__)
+ __cl_ushort4 v4;
+#endif
+}cl_ushort4;
+
+/* cl_ushort3 is identical in size, alignment and behavior to cl_ushort4. See section 6.1.5. */
+typedef cl_ushort4 cl_ushort3;
+
+typedef union
+{
+ cl_ushort CL_ALIGNED(16) s[8];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7; };
+ __CL_ANON_STRUCT__ struct{ cl_ushort4 lo, hi; };
+#endif
+#if defined( __CL_USHORT2__)
+ __cl_ushort2 v2[4];
+#endif
+#if defined( __CL_USHORT4__)
+ __cl_ushort4 v4[2];
+#endif
+#if defined( __CL_USHORT8__ )
+ __cl_ushort8 v8;
+#endif
+}cl_ushort8;
+
+typedef union
+{
+ cl_ushort CL_ALIGNED(32) s[16];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __CL_ANON_STRUCT__ struct{ cl_ushort8 lo, hi; };
+#endif
+#if defined( __CL_USHORT2__)
+ __cl_ushort2 v2[8];
+#endif
+#if defined( __CL_USHORT4__)
+ __cl_ushort4 v4[4];
+#endif
+#if defined( __CL_USHORT8__ )
+ __cl_ushort8 v8[2];
+#endif
+#if defined( __CL_USHORT16__ )
+ __cl_ushort16 v16;
+#endif
+}cl_ushort16;
+
+/* ---- cl_intn ---- */
+typedef union
+{
+ cl_int CL_ALIGNED(8) s[2];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_int x, y; };
+ __CL_ANON_STRUCT__ struct{ cl_int s0, s1; };
+ __CL_ANON_STRUCT__ struct{ cl_int lo, hi; };
+#endif
+#if defined( __CL_INT2__)
+ __cl_int2 v2;
+#endif
+}cl_int2;
+
+typedef union
+{
+ cl_int CL_ALIGNED(16) s[4];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3; };
+ __CL_ANON_STRUCT__ struct{ cl_int2 lo, hi; };
+#endif
+#if defined( __CL_INT2__)
+ __cl_int2 v2[2];
+#endif
+#if defined( __CL_INT4__)
+ __cl_int4 v4;
+#endif
+}cl_int4;
+
+/* cl_int3 is identical in size, alignment and behavior to cl_int4. See section 6.1.5. */
+typedef cl_int4 cl_int3;
+
+typedef union
+{
+ cl_int CL_ALIGNED(32) s[8];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7; };
+ __CL_ANON_STRUCT__ struct{ cl_int4 lo, hi; };
+#endif
+#if defined( __CL_INT2__)
+ __cl_int2 v2[4];
+#endif
+#if defined( __CL_INT4__)
+ __cl_int4 v4[2];
+#endif
+#if defined( __CL_INT8__ )
+ __cl_int8 v8;
+#endif
+}cl_int8;
+
+typedef union
+{
+ cl_int CL_ALIGNED(64) s[16];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __CL_ANON_STRUCT__ struct{ cl_int8 lo, hi; };
+#endif
+#if defined( __CL_INT2__)
+ __cl_int2 v2[8];
+#endif
+#if defined( __CL_INT4__)
+ __cl_int4 v4[4];
+#endif
+#if defined( __CL_INT8__ )
+ __cl_int8 v8[2];
+#endif
+#if defined( __CL_INT16__ )
+ __cl_int16 v16;
+#endif
+}cl_int16;
+
+
+/* ---- cl_uintn ---- */
+typedef union
+{
+ cl_uint CL_ALIGNED(8) s[2];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_uint x, y; };
+ __CL_ANON_STRUCT__ struct{ cl_uint s0, s1; };
+ __CL_ANON_STRUCT__ struct{ cl_uint lo, hi; };
+#endif
+#if defined( __CL_UINT2__)
+ __cl_uint2 v2;
+#endif
+}cl_uint2;
+
+typedef union
+{
+ cl_uint CL_ALIGNED(16) s[4];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3; };
+ __CL_ANON_STRUCT__ struct{ cl_uint2 lo, hi; };
+#endif
+#if defined( __CL_UINT2__)
+ __cl_uint2 v2[2];
+#endif
+#if defined( __CL_UINT4__)
+ __cl_uint4 v4;
+#endif
+}cl_uint4;
+
+/* cl_uint3 is identical in size, alignment and behavior to cl_uint4. See section 6.1.5. */
+typedef cl_uint4 cl_uint3;
+
+typedef union
+{
+ cl_uint CL_ALIGNED(32) s[8];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7; };
+ __CL_ANON_STRUCT__ struct{ cl_uint4 lo, hi; };
+#endif
+#if defined( __CL_UINT2__)
+ __cl_uint2 v2[4];
+#endif
+#if defined( __CL_UINT4__)
+ __cl_uint4 v4[2];
+#endif
+#if defined( __CL_UINT8__ )
+ __cl_uint8 v8;
+#endif
+}cl_uint8;
+
+typedef union
+{
+ cl_uint CL_ALIGNED(64) s[16];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __CL_ANON_STRUCT__ struct{ cl_uint8 lo, hi; };
+#endif
+#if defined( __CL_UINT2__)
+ __cl_uint2 v2[8];
+#endif
+#if defined( __CL_UINT4__)
+ __cl_uint4 v4[4];
+#endif
+#if defined( __CL_UINT8__ )
+ __cl_uint8 v8[2];
+#endif
+#if defined( __CL_UINT16__ )
+ __cl_uint16 v16;
+#endif
+}cl_uint16;
+
+/* ---- cl_longn ---- */
+typedef union
+{
+ cl_long CL_ALIGNED(16) s[2];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_long x, y; };
+ __CL_ANON_STRUCT__ struct{ cl_long s0, s1; };
+ __CL_ANON_STRUCT__ struct{ cl_long lo, hi; };
+#endif
+#if defined( __CL_LONG2__)
+ __cl_long2 v2;
+#endif
+}cl_long2;
+
+typedef union
+{
+ cl_long CL_ALIGNED(32) s[4];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3; };
+ __CL_ANON_STRUCT__ struct{ cl_long2 lo, hi; };
+#endif
+#if defined( __CL_LONG2__)
+ __cl_long2 v2[2];
+#endif
+#if defined( __CL_LONG4__)
+ __cl_long4 v4;
+#endif
+}cl_long4;
+
+/* cl_long3 is identical in size, alignment and behavior to cl_long4. See section 6.1.5. */
+typedef cl_long4 cl_long3;
+
+typedef union
+{
+ cl_long CL_ALIGNED(64) s[8];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7; };
+ __CL_ANON_STRUCT__ struct{ cl_long4 lo, hi; };
+#endif
+#if defined( __CL_LONG2__)
+ __cl_long2 v2[4];
+#endif
+#if defined( __CL_LONG4__)
+ __cl_long4 v4[2];
+#endif
+#if defined( __CL_LONG8__ )
+ __cl_long8 v8;
+#endif
+}cl_long8;
+
+typedef union
+{
+ cl_long CL_ALIGNED(128) s[16];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __CL_ANON_STRUCT__ struct{ cl_long8 lo, hi; };
+#endif
+#if defined( __CL_LONG2__)
+ __cl_long2 v2[8];
+#endif
+#if defined( __CL_LONG4__)
+ __cl_long4 v4[4];
+#endif
+#if defined( __CL_LONG8__ )
+ __cl_long8 v8[2];
+#endif
+#if defined( __CL_LONG16__ )
+ __cl_long16 v16;
+#endif
+}cl_long16;
+
+
+/* ---- cl_ulongn ---- */
+typedef union
+{
+ cl_ulong CL_ALIGNED(16) s[2];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_ulong x, y; };
+ __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1; };
+ __CL_ANON_STRUCT__ struct{ cl_ulong lo, hi; };
+#endif
+#if defined( __CL_ULONG2__)
+ __cl_ulong2 v2;
+#endif
+}cl_ulong2;
+
+typedef union
+{
+ cl_ulong CL_ALIGNED(32) s[4];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3; };
+ __CL_ANON_STRUCT__ struct{ cl_ulong2 lo, hi; };
+#endif
+#if defined( __CL_ULONG2__)
+ __cl_ulong2 v2[2];
+#endif
+#if defined( __CL_ULONG4__)
+ __cl_ulong4 v4;
+#endif
+}cl_ulong4;
+
+/* cl_ulong3 is identical in size, alignment and behavior to cl_ulong4. See section 6.1.5. */
+typedef cl_ulong4 cl_ulong3;
+
+typedef union
+{
+ cl_ulong CL_ALIGNED(64) s[8];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7; };
+ __CL_ANON_STRUCT__ struct{ cl_ulong4 lo, hi; };
+#endif
+#if defined( __CL_ULONG2__)
+ __cl_ulong2 v2[4];
+#endif
+#if defined( __CL_ULONG4__)
+ __cl_ulong4 v4[2];
+#endif
+#if defined( __CL_ULONG8__ )
+ __cl_ulong8 v8;
+#endif
+}cl_ulong8;
+
+typedef union
+{
+ cl_ulong CL_ALIGNED(128) s[16];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __CL_ANON_STRUCT__ struct{ cl_ulong8 lo, hi; };
+#endif
+#if defined( __CL_ULONG2__)
+ __cl_ulong2 v2[8];
+#endif
+#if defined( __CL_ULONG4__)
+ __cl_ulong4 v4[4];
+#endif
+#if defined( __CL_ULONG8__ )
+ __cl_ulong8 v8[2];
+#endif
+#if defined( __CL_ULONG16__ )
+ __cl_ulong16 v16;
+#endif
+}cl_ulong16;
+
+
+/* --- cl_floatn ---- */
+
+typedef union
+{
+ cl_float CL_ALIGNED(8) s[2];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_float x, y; };
+ __CL_ANON_STRUCT__ struct{ cl_float s0, s1; };
+ __CL_ANON_STRUCT__ struct{ cl_float lo, hi; };
+#endif
+#if defined( __CL_FLOAT2__)
+ __cl_float2 v2;
+#endif
+}cl_float2;
+
+typedef union
+{
+ cl_float CL_ALIGNED(16) s[4];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3; };
+ __CL_ANON_STRUCT__ struct{ cl_float2 lo, hi; };
+#endif
+#if defined( __CL_FLOAT2__)
+ __cl_float2 v2[2];
+#endif
+#if defined( __CL_FLOAT4__)
+ __cl_float4 v4;
+#endif
+}cl_float4;
+
+/* cl_float3 is identical in size, alignment and behavior to cl_float4. See section 6.1.5. */
+typedef cl_float4 cl_float3;
+
+typedef union
+{
+ cl_float CL_ALIGNED(32) s[8];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7; };
+ __CL_ANON_STRUCT__ struct{ cl_float4 lo, hi; };
+#endif
+#if defined( __CL_FLOAT2__)
+ __cl_float2 v2[4];
+#endif
+#if defined( __CL_FLOAT4__)
+ __cl_float4 v4[2];
+#endif
+#if defined( __CL_FLOAT8__ )
+ __cl_float8 v8;
+#endif
+}cl_float8;
+
+typedef union
+{
+ cl_float CL_ALIGNED(64) s[16];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __CL_ANON_STRUCT__ struct{ cl_float8 lo, hi; };
+#endif
+#if defined( __CL_FLOAT2__)
+ __cl_float2 v2[8];
+#endif
+#if defined( __CL_FLOAT4__)
+ __cl_float4 v4[4];
+#endif
+#if defined( __CL_FLOAT8__ )
+ __cl_float8 v8[2];
+#endif
+#if defined( __CL_FLOAT16__ )
+ __cl_float16 v16;
+#endif
+}cl_float16;
+
+/* --- cl_doublen ---- */
+
+typedef union
+{
+ cl_double CL_ALIGNED(16) s[2];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_double x, y; };
+ __CL_ANON_STRUCT__ struct{ cl_double s0, s1; };
+ __CL_ANON_STRUCT__ struct{ cl_double lo, hi; };
+#endif
+#if defined( __CL_DOUBLE2__)
+ __cl_double2 v2;
+#endif
+}cl_double2;
+
+typedef union
+{
+ cl_double CL_ALIGNED(32) s[4];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3; };
+ __CL_ANON_STRUCT__ struct{ cl_double2 lo, hi; };
+#endif
+#if defined( __CL_DOUBLE2__)
+ __cl_double2 v2[2];
+#endif
+#if defined( __CL_DOUBLE4__)
+ __cl_double4 v4;
+#endif
+}cl_double4;
+
+/* cl_double3 is identical in size, alignment and behavior to cl_double4. See section 6.1.5. */
+typedef cl_double4 cl_double3;
+
+typedef union
+{
+ cl_double CL_ALIGNED(64) s[8];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7; };
+ __CL_ANON_STRUCT__ struct{ cl_double4 lo, hi; };
+#endif
+#if defined( __CL_DOUBLE2__)
+ __cl_double2 v2[4];
+#endif
+#if defined( __CL_DOUBLE4__)
+ __cl_double4 v4[2];
+#endif
+#if defined( __CL_DOUBLE8__ )
+ __cl_double8 v8;
+#endif
+}cl_double8;
+
+typedef union
+{
+ cl_double CL_ALIGNED(128) s[16];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __CL_ANON_STRUCT__ struct{ cl_double8 lo, hi; };
+#endif
+#if defined( __CL_DOUBLE2__)
+ __cl_double2 v2[8];
+#endif
+#if defined( __CL_DOUBLE4__)
+ __cl_double4 v4[4];
+#endif
+#if defined( __CL_DOUBLE8__ )
+ __cl_double8 v8[2];
+#endif
+#if defined( __CL_DOUBLE16__ )
+ __cl_double16 v16;
+#endif
+}cl_double16;
+
+/* Macro to facilitate debugging
+ * Usage:
+ * Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source.
+ * The first line ends with: CL_PROGRAM_STRING_DEBUG_INFO \"
+ * Each line thereafter of OpenCL C source must end with: \n\
+ * The last line ends in ";
+ *
+ * Example:
+ *
+ * const char *my_program = CL_PROGRAM_STRING_DEBUG_INFO "\
+ * kernel void foo( int a, float * b ) \n\
+ * { \n\
+ * // my comment \n\
+ * *b[ get_global_id(0)] = a; \n\
+ * } \n\
+ * ";
+ *
+ * This should correctly set up the line, (column) and file information for your source
+ * string so you can do source level debugging.
+ */
+#define __CL_STRINGIFY( _x ) # _x
+#define _CL_STRINGIFY( _x ) __CL_STRINGIFY( _x )
+#define CL_PROGRAM_STRING_DEBUG_INFO "#line " _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n"
+
+#ifdef __cplusplus
+}
+#endif
+
+#undef __CL_HAS_ANON_STRUCT__
+#undef __CL_ANON_STRUCT__
+#if defined( _WIN32) && (_MSC_VER >= 1500)
+#pragma warning( pop )
+#endif
+
+#endif /* __CL_PLATFORM_H */
diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/opencl.h b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/opencl.h
new file mode 100644
index 000000000..fccacd168
--- /dev/null
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/opencl.h
@@ -0,0 +1,43 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+
+#ifndef __OPENCL_H
+#define __OPENCL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <cl.h>
+#include <cl_gl.h>
+#include <cl_gl_ext.h>
+#include <cl_ext.h>
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __OPENCL_H */
+
diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/image.go b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/image.go
new file mode 100644
index 000000000..d6a996377
--- /dev/null
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/image.go
@@ -0,0 +1,83 @@
+// +build cl12
+
+package cl
+
+// #ifdef __APPLE__
+// #include "OpenCL/opencl.h"
+// #else
+// #include "cl.h"
+// #endif
+import "C"
+import (
+ "image"
+ "unsafe"
+)
+
+func (ctx *Context) CreateImage(flags MemFlag, imageFormat ImageFormat, imageDesc ImageDescription, data []byte) (*MemObject, error) {
+ format := imageFormat.toCl()
+ desc := imageDesc.toCl()
+ var dataPtr unsafe.Pointer
+ if data != nil {
+ dataPtr = unsafe.Pointer(&data[0])
+ }
+ var err C.cl_int
+ clBuffer := C.clCreateImage(ctx.clContext, C.cl_mem_flags(flags), &format, &desc, dataPtr, &err)
+ if err != C.CL_SUCCESS {
+ return nil, toError(err)
+ }
+ if clBuffer == nil {
+ return nil, ErrUnknown
+ }
+ return newMemObject(clBuffer, len(data)), nil
+}
+
+func (ctx *Context) CreateImageSimple(flags MemFlag, width, height int, channelOrder ChannelOrder, channelDataType ChannelDataType, data []byte) (*MemObject, error) {
+ format := ImageFormat{channelOrder, channelDataType}
+ desc := ImageDescription{
+ Type: MemObjectTypeImage2D,
+ Width: width,
+ Height: height,
+ }
+ return ctx.CreateImage(flags, format, desc, data)
+}
+
+func (ctx *Context) CreateImageFromImage(flags MemFlag, img image.Image) (*MemObject, error) {
+ switch m := img.(type) {
+ case *image.Gray:
+ format := ImageFormat{ChannelOrderIntensity, ChannelDataTypeUNormInt8}
+ desc := ImageDescription{
+ Type: MemObjectTypeImage2D,
+ Width: m.Bounds().Dx(),
+ Height: m.Bounds().Dy(),
+ RowPitch: m.Stride,
+ }
+ return ctx.CreateImage(flags, format, desc, m.Pix)
+ case *image.RGBA:
+ format := ImageFormat{ChannelOrderRGBA, ChannelDataTypeUNormInt8}
+ desc := ImageDescription{
+ Type: MemObjectTypeImage2D,
+ Width: m.Bounds().Dx(),
+ Height: m.Bounds().Dy(),
+ RowPitch: m.Stride,
+ }
+ return ctx.CreateImage(flags, format, desc, m.Pix)
+ }
+
+ b := img.Bounds()
+ w := b.Dx()
+ h := b.Dy()
+ data := make([]byte, w*h*4)
+ dataOffset := 0
+ for y := 0; y < h; y++ {
+ for x := 0; x < w; x++ {
+ c := img.At(x+b.Min.X, y+b.Min.Y)
+ r, g, b, a := c.RGBA()
+ data[dataOffset] = uint8(r >> 8)
+ data[dataOffset+1] = uint8(g >> 8)
+ data[dataOffset+2] = uint8(b >> 8)
+ data[dataOffset+3] = uint8(a >> 8)
+ dataOffset += 4
+ }
+ }
+ return ctx.CreateImageSimple(flags, w, h, ChannelOrderRGBA, ChannelDataTypeUNormInt8, data)
+}
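
The image helpers in this file are only compiled when the package is built with the cl12 tag (see the build constraint at the top). A minimal sketch of CreateImageFromImage follows, assuming a *Context and a MemFlag value obtained from the other files of this package (context.go and types.go, which are not part of this file), so no concrete flag constant is named.

    package clexample // hypothetical example, not from the vendored package

    import (
    	"image"

    	"github.com/Gustav-Simonsson/go-opencl/cl"
    )

    // toDeviceImage uploads a 64x64 RGBA image. *image.RGBA takes the fast path
    // in CreateImageFromImage: ChannelOrderRGBA with ChannelDataTypeUNormInt8 and
    // the image stride used as the row pitch, so no per-pixel conversion loop runs.
    func toDeviceImage(ctx *cl.Context, flags cl.MemFlag) (*cl.MemObject, error) {
    	img := image.NewRGBA(image.Rect(0, 0, 64, 64))
    	return ctx.CreateImageFromImage(flags, img)
    }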
diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/kernel.go b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/kernel.go
new file mode 100644
index 000000000..894a775e8
--- /dev/null
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/kernel.go
@@ -0,0 +1,127 @@
+package cl
+
+// #ifdef __APPLE__
+// #include "OpenCL/opencl.h"
+// #else
+// #include "cl.h"
+// #endif
+import "C"
+
+import (
+ "fmt"
+ "unsafe"
+)
+
+type ErrUnsupportedArgumentType struct {
+ Index int
+ Value interface{}
+}
+
+func (e ErrUnsupportedArgumentType) Error() string {
+ return fmt.Sprintf("cl: unsupported argument type for index %d: %+v", e.Index, e.Value)
+}
+
+type Kernel struct {
+ clKernel C.cl_kernel
+ name string
+}
+
+type LocalBuffer int
+
+func releaseKernel(k *Kernel) {
+ if k.clKernel != nil {
+ C.clReleaseKernel(k.clKernel)
+ k.clKernel = nil
+ }
+}
+
+func (k *Kernel) Release() {
+ releaseKernel(k)
+}
+
+func (k *Kernel) SetArgs(args ...interface{}) error {
+ for index, arg := range args {
+ if err := k.SetArg(index, arg); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+func (k *Kernel) SetArg(index int, arg interface{}) error {
+ switch val := arg.(type) {
+ case uint8:
+ return k.SetArgUint8(index, val)
+ case int8:
+ return k.SetArgInt8(index, val)
+ case uint32:
+ return k.SetArgUint32(index, val)
+ case uint64:
+ return k.SetArgUint64(index, val)
+ case int32:
+ return k.SetArgInt32(index, val)
+ case float32:
+ return k.SetArgFloat32(index, val)
+ case *MemObject:
+ return k.SetArgBuffer(index, val)
+ case LocalBuffer:
+ return k.SetArgLocal(index, int(val))
+ default:
+ return ErrUnsupportedArgumentType{Index: index, Value: arg}
+ }
+}
+
+func (k *Kernel) SetArgBuffer(index int, buffer *MemObject) error {
+ return k.SetArgUnsafe(index, int(unsafe.Sizeof(buffer.clMem)), unsafe.Pointer(&buffer.clMem))
+}
+
+func (k *Kernel) SetArgFloat32(index int, val float32) error {
+ return k.SetArgUnsafe(index, int(unsafe.Sizeof(val)), unsafe.Pointer(&val))
+}
+
+func (k *Kernel) SetArgInt8(index int, val int8) error {
+ return k.SetArgUnsafe(index, int(unsafe.Sizeof(val)), unsafe.Pointer(&val))
+}
+
+func (k *Kernel) SetArgUint8(index int, val uint8) error {
+ return k.SetArgUnsafe(index, int(unsafe.Sizeof(val)), unsafe.Pointer(&val))
+}
+
+func (k *Kernel) SetArgInt32(index int, val int32) error {
+ return k.SetArgUnsafe(index, int(unsafe.Sizeof(val)), unsafe.Pointer(&val))
+}
+
+func (k *Kernel) SetArgUint32(index int, val uint32) error {
+ return k.SetArgUnsafe(index, int(unsafe.Sizeof(val)), unsafe.Pointer(&val))
+}
+
+func (k *Kernel) SetArgUint64(index int, val uint64) error {
+ return k.SetArgUnsafe(index, int(unsafe.Sizeof(val)), unsafe.Pointer(&val))
+}
+
+func (k *Kernel) SetArgLocal(index int, size int) error {
+ return k.SetArgUnsafe(index, size, nil)
+}
+
+func (k *Kernel) SetArgUnsafe(index, argSize int, arg unsafe.Pointer) error {
+ //fmt.Println("FUNKY: ", index, argSize)
+ return toError(C.clSetKernelArg(k.clKernel, C.cl_uint(index), C.size_t(argSize), arg))
+}
+
+func (k *Kernel) PreferredWorkGroupSizeMultiple(device *Device) (int, error) {
+ var size C.size_t
+ err := C.clGetKernelWorkGroupInfo(k.clKernel, device.nullableId(), C.CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, C.size_t(unsafe.Sizeof(size)), unsafe.Pointer(&size), nil)
+ return int(size), toError(err)
+}
+
+func (k *Kernel) WorkGroupSize(device *Device) (int, error) {
+ var size C.size_t
+ err := C.clGetKernelWorkGroupInfo(k.clKernel, device.nullableId(), C.CL_KERNEL_WORK_GROUP_SIZE, C.size_t(unsafe.Sizeof(size)), unsafe.Pointer(&size), nil)
+ return int(size), toError(err)
+}
+
+func (k *Kernel) NumArgs() (int, error) {
+ var num C.cl_uint
+ err := C.clGetKernelInfo(k.clKernel, C.CL_KERNEL_NUM_ARGS, C.size_t(unsafe.Sizeof(num)), unsafe.Pointer(&num), nil)
+ return int(num), toError(err)
+}
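
A short, hypothetical sketch of the argument helpers above. The kernel, buffer and device are assumed to come from Program.CreateKernel, the buffer helpers in context.go and the device lookup in device.go respectively; only the kernel-side calls are shown in this file.

    package clexample // hypothetical example, not from the vendored package

    import (
    	"fmt"

    	"github.com/Gustav-Simonsson/go-opencl/cl"
    )

    // configureKernel shows the two ways of binding arguments: SetArgs dispatches
    // on the dynamic type of each value, while SetArg addresses one index directly.
    func configureKernel(k *cl.Kernel, buf *cl.MemObject, dev *cl.Device) error {
    	// args 0..2: a device buffer, a uint64 and a uint32 scalar
    	if err := k.SetArgs(buf, uint64(0), uint32(1024)); err != nil {
    		return err
    	}
    	// arg 3: reserve 256 bytes of __local memory (no host data is copied)
    	if err := k.SetArg(3, cl.LocalBuffer(256)); err != nil {
    		return err
    	}
    	// Ask the driver how large a work-group this kernel can use on dev.
    	if wgs, err := k.WorkGroupSize(dev); err == nil {
    		fmt.Println("max work-group size:", wgs)
    	}
    	return nil
    }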
diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/kernel10.go b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/kernel10.go
new file mode 100644
index 000000000..579946068
--- /dev/null
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/kernel10.go
@@ -0,0 +1,7 @@
+// +build !cl12
+
+package cl
+
+func (k *Kernel) ArgName(index int) (string, error) {
+ return "", ErrUnsupported
+}
diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/kernel12.go b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/kernel12.go
new file mode 100644
index 000000000..d9e6f3546
--- /dev/null
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/kernel12.go
@@ -0,0 +1,20 @@
+// +build cl12
+
+package cl
+
+// #ifdef __APPLE__
+// #include "OpenCL/opencl.h"
+// #else
+// #include "cl.h"
+// #endif
+import "C"
+import "unsafe"
+
+func (k *Kernel) ArgName(index int) (string, error) {
+ var strC [1024]byte
+ var strN C.size_t
+ if err := C.clGetKernelArgInfo(k.clKernel, C.cl_uint(index), C.CL_KERNEL_ARG_NAME, 1024, unsafe.Pointer(&strC[0]), &strN); err != C.CL_SUCCESS {
+ return "", toError(err)
+ }
+ return string(strC[:strN]), nil
+}
diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/platform.go b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/platform.go
new file mode 100644
index 000000000..fd1d162cf
--- /dev/null
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/platform.go
@@ -0,0 +1,83 @@
+package cl
+
+// #ifdef __APPLE__
+// #include "OpenCL/opencl.h"
+// #else
+// #include "cl.h"
+// #endif
+import "C"
+
+import "unsafe"
+
+const maxPlatforms = 32
+
+type Platform struct {
+ id C.cl_platform_id
+}
+
+// Obtain the list of platforms available.
+func GetPlatforms() ([]*Platform, error) {
+ var platformIds [maxPlatforms]C.cl_platform_id
+ var nPlatforms C.cl_uint
+ if err := C.clGetPlatformIDs(C.cl_uint(maxPlatforms), &platformIds[0], &nPlatforms); err != C.CL_SUCCESS {
+ return nil, toError(err)
+ }
+ platforms := make([]*Platform, nPlatforms)
+ for i := 0; i < int(nPlatforms); i++ {
+ platforms[i] = &Platform{id: platformIds[i]}
+ }
+ return platforms, nil
+}
+
+func (p *Platform) GetDevices(deviceType DeviceType) ([]*Device, error) {
+ return GetDevices(p, deviceType)
+}
+
+func (p *Platform) getInfoString(param C.cl_platform_info) (string, error) {
+ var strC [2048]byte
+ var strN C.size_t
+ if err := C.clGetPlatformInfo(p.id, param, 2048, unsafe.Pointer(&strC[0]), &strN); err != C.CL_SUCCESS {
+ return "", toError(err)
+ }
+ return string(strC[:(strN - 1)]), nil
+}
+
+func (p *Platform) Name() string {
+ if str, err := p.getInfoString(C.CL_PLATFORM_NAME); err != nil {
+ panic("Platform.Name() should never fail")
+ } else {
+ return str
+ }
+}
+
+func (p *Platform) Vendor() string {
+ if str, err := p.getInfoString(C.CL_PLATFORM_VENDOR); err != nil {
+ panic("Platform.Vendor() should never fail")
+ } else {
+ return str
+ }
+}
+
+func (p *Platform) Profile() string {
+ if str, err := p.getInfoString(C.CL_PLATFORM_PROFILE); err != nil {
+ panic("Platform.Profile() should never fail")
+ } else {
+ return str
+ }
+}
+
+func (p *Platform) Version() string {
+ if str, err := p.getInfoString(C.CL_PLATFORM_VERSION); err != nil {
+ panic("Platform.Version() should never fail")
+ } else {
+ return str
+ }
+}
+
+func (p *Platform) Extensions() string {
+ if str, err := p.getInfoString(C.CL_PLATFORM_EXTENSIONS); err != nil {
+ panic("Platform.Extensions() should never fail")
+ } else {
+ return str
+ }
+}
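
The platform API above is the natural entry point for the bindings. A small self-contained listing sketch, using only functions defined in this file:

    package main

    import (
    	"fmt"
    	"log"

    	"github.com/Gustav-Simonsson/go-opencl/cl"
    )

    func main() {
    	platforms, err := cl.GetPlatforms()
    	if err != nil {
    		log.Fatal(err)
    	}
    	for _, p := range platforms {
    		// Name/Vendor/Version only panic if clGetPlatformInfo itself fails.
    		fmt.Printf("%s, %s (%s)\n", p.Name(), p.Vendor(), p.Version())
    	}
    }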
diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/program.go b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/program.go
new file mode 100644
index 000000000..e75f7ee0e
--- /dev/null
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/program.go
@@ -0,0 +1,105 @@
+package cl
+
+// #include <stdlib.h>
+// #ifdef __APPLE__
+// #include "OpenCL/opencl.h"
+// #else
+// #include "cl.h"
+// #endif
+import "C"
+
+import (
+ "fmt"
+ "runtime"
+ "unsafe"
+)
+
+type BuildError struct {
+ Message string
+ Device *Device
+}
+
+func (e BuildError) Error() string {
+ if e.Device != nil {
+ return fmt.Sprintf("cl: build error on %q: %s", e.Device.Name(), e.Message)
+ } else {
+ return fmt.Sprintf("cl: build error: %s", e.Message)
+ }
+}
+
+type Program struct {
+ clProgram C.cl_program
+ devices []*Device
+}
+
+func releaseProgram(p *Program) {
+ if p.clProgram != nil {
+ C.clReleaseProgram(p.clProgram)
+ p.clProgram = nil
+ }
+}
+
+func (p *Program) Release() {
+ releaseProgram(p)
+}
+
+func (p *Program) BuildProgram(devices []*Device, options string) error {
+ var cOptions *C.char
+ if options != "" {
+ cOptions = C.CString(options)
+ defer C.free(unsafe.Pointer(cOptions))
+ }
+ var deviceList []C.cl_device_id
+ var deviceListPtr *C.cl_device_id
+ numDevices := C.cl_uint(len(devices))
+ if len(devices) > 0 {
+ deviceList = buildDeviceIdList(devices)
+ deviceListPtr = &deviceList[0]
+ }
+ if err := C.clBuildProgram(p.clProgram, numDevices, deviceListPtr, cOptions, nil, nil); err != C.CL_SUCCESS {
+ buffer := make([]byte, 4096)
+ var bLen C.size_t
+ var err C.cl_int
+
+ for _, dev := range p.devices {
+ for i := 2; i >= 0; i-- {
+ err = C.clGetProgramBuildInfo(p.clProgram, dev.id, C.CL_PROGRAM_BUILD_LOG, C.size_t(len(buffer)), unsafe.Pointer(&buffer[0]), &bLen)
+ if err == C.CL_INVALID_VALUE && i > 0 && bLen < 1024*1024 {
+ // INVALID_VALUE probably means our buffer isn't large enough
+ buffer = make([]byte, bLen)
+ } else {
+ break
+ }
+ }
+ if err != C.CL_SUCCESS {
+ return toError(err)
+ }
+
+ if bLen > 1 {
+ return BuildError{
+ Device: dev,
+ Message: string(buffer[:bLen-1]),
+ }
+ }
+ }
+
+ return BuildError{
+ Device: nil,
+ Message: "build failed and produced no log entries",
+ }
+ }
+ return nil
+}
+
+func (p *Program) CreateKernel(name string) (*Kernel, error) {
+ cName := C.CString(name)
+ defer C.free(unsafe.Pointer(cName))
+ var err C.cl_int
+ clKernel := C.clCreateKernel(p.clProgram, cName, &err)
+ if err != C.CL_SUCCESS {
+ return nil, toError(err)
+ }
+ kernel := &Kernel{clKernel: clKernel, name: name}
+ runtime.SetFinalizer(kernel, releaseKernel)
+ return kernel, nil
+}
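
BuildProgram above surfaces the device compiler log through BuildError, which is worth handling explicitly. A sketch under stated assumptions: the *Program comes from a source helper in context.go (not in this file) and "myKernel" is a placeholder kernel name.

    package clexample // hypothetical example, not from the vendored package

    import (
    	"log"

    	"github.com/Gustav-Simonsson/go-opencl/cl"
    )

    // buildKernel compiles the program for the given devices and extracts one kernel.
    func buildKernel(prog *cl.Program, devices []*cl.Device) (*cl.Kernel, error) {
    	if err := prog.BuildProgram(devices, ""); err != nil {
    		// BuildError carries the per-device build log when one is available.
    		if be, ok := err.(cl.BuildError); ok {
    			log.Printf("OpenCL build log: %s", be.Message)
    		}
    		return nil, err
    	}
    	return prog.CreateKernel("myKernel") // placeholder kernel name
    }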
diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/queue.go b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/queue.go
new file mode 100644
index 000000000..7762746da
--- /dev/null
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/queue.go
@@ -0,0 +1,193 @@
+package cl
+
+// #ifdef __APPLE__
+// #include "OpenCL/opencl.h"
+// #else
+// #include "cl.h"
+// #endif
+import "C"
+
+import "unsafe"
+
+type CommandQueueProperty int
+
+const (
+ CommandQueueOutOfOrderExecModeEnable CommandQueueProperty = C.CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE
+ CommandQueueProfilingEnable CommandQueueProperty = C.CL_QUEUE_PROFILING_ENABLE
+)
+
+type CommandQueue struct {
+ clQueue C.cl_command_queue
+ device *Device
+}
+
+func releaseCommandQueue(q *CommandQueue) {
+ if q.clQueue != nil {
+ C.clReleaseCommandQueue(q.clQueue)
+ q.clQueue = nil
+ }
+}
+
+// Call clReleaseCommandQueue on the CommandQueue. Using the CommandQueue after Release will cause a panic.
+func (q *CommandQueue) Release() {
+ releaseCommandQueue(q)
+}
+
+// Blocks until all previously queued OpenCL commands in a command-queue are issued to the associated device and have completed.
+func (q *CommandQueue) Finish() error {
+ return toError(C.clFinish(q.clQueue))
+}
+
+// Issues all previously queued OpenCL commands in a command-queue to the device associated with the command-queue.
+func (q *CommandQueue) Flush() error {
+ return toError(C.clFlush(q.clQueue))
+}
+
+// Enqueues a command to map a region of the buffer object given by buffer into the host address space and returns a pointer to this mapped region.
+func (q *CommandQueue) EnqueueMapBuffer(buffer *MemObject, blocking bool, flags MapFlag, offset, size int, eventWaitList []*Event) (*MappedMemObject, *Event, error) {
+ var event C.cl_event
+ var err C.cl_int
+ ptr := C.clEnqueueMapBuffer(q.clQueue, buffer.clMem, clBool(blocking), flags.toCl(), C.size_t(offset), C.size_t(size), C.cl_uint(len(eventWaitList)), eventListPtr(eventWaitList), &event, &err)
+ if err != C.CL_SUCCESS {
+ return nil, nil, toError(err)
+ }
+ ev := newEvent(event)
+ if ptr == nil {
+ return nil, ev, ErrUnknown
+ }
+ return &MappedMemObject{ptr: ptr, size: size}, ev, nil
+}
+
+// Enqueues a command to map a region of an image object into the host address space and returns a pointer to this mapped region.
+func (q *CommandQueue) EnqueueMapImage(buffer *MemObject, blocking bool, flags MapFlag, origin, region [3]int, eventWaitList []*Event) (*MappedMemObject, *Event, error) {
+ cOrigin := sizeT3(origin)
+ cRegion := sizeT3(region)
+ var event C.cl_event
+ var err C.cl_int
+ var rowPitch, slicePitch C.size_t
+ ptr := C.clEnqueueMapImage(q.clQueue, buffer.clMem, clBool(blocking), flags.toCl(), &cOrigin[0], &cRegion[0], &rowPitch, &slicePitch, C.cl_uint(len(eventWaitList)), eventListPtr(eventWaitList), &event, &err)
+ if err != C.CL_SUCCESS {
+ return nil, nil, toError(err)
+ }
+ ev := newEvent(event)
+ if ptr == nil {
+ return nil, ev, ErrUnknown
+ }
+ size := 0 // TODO: could calculate this
+ return &MappedMemObject{ptr: ptr, size: size, rowPitch: int(rowPitch), slicePitch: int(slicePitch)}, ev, nil
+}
+
+// Enqueues a command to unmap a previously mapped region of a memory object.
+func (q *CommandQueue) EnqueueUnmapMemObject(buffer *MemObject, mappedObj *MappedMemObject, eventWaitList []*Event) (*Event, error) {
+ var event C.cl_event
+ if err := C.clEnqueueUnmapMemObject(q.clQueue, buffer.clMem, mappedObj.ptr, C.cl_uint(len(eventWaitList)), eventListPtr(eventWaitList), &event); err != C.CL_SUCCESS {
+ return nil, toError(err)
+ }
+ return newEvent(event), nil
+}
+
+// Enqueues a command to copy a buffer object to another buffer object.
+func (q *CommandQueue) EnqueueCopyBuffer(srcBuffer, dstBuffer *MemObject, srcOffset, dstOffset, byteCount int, eventWaitList []*Event) (*Event, error) {
+ var event C.cl_event
+ err := toError(C.clEnqueueCopyBuffer(q.clQueue, srcBuffer.clMem, dstBuffer.clMem, C.size_t(srcOffset), C.size_t(dstOffset), C.size_t(byteCount), C.cl_uint(len(eventWaitList)), eventListPtr(eventWaitList), &event))
+ return newEvent(event), err
+}
+
+// Enqueue commands to write to a buffer object from host memory.
+func (q *CommandQueue) EnqueueWriteBuffer(buffer *MemObject, blocking bool, offset, dataSize int, dataPtr unsafe.Pointer, eventWaitList []*Event) (*Event, error) {
+ var event C.cl_event
+ err := toError(C.clEnqueueWriteBuffer(q.clQueue, buffer.clMem, clBool(blocking), C.size_t(offset), C.size_t(dataSize), dataPtr, C.cl_uint(len(eventWaitList)), eventListPtr(eventWaitList), &event))
+ return newEvent(event), err
+}
+
+func (q *CommandQueue) EnqueueWriteBufferFloat32(buffer *MemObject, blocking bool, offset int, data []float32, eventWaitList []*Event) (*Event, error) {
+ dataPtr := unsafe.Pointer(&data[0])
+ dataSize := int(unsafe.Sizeof(data[0])) * len(data)
+ return q.EnqueueWriteBuffer(buffer, blocking, offset, dataSize, dataPtr, eventWaitList)
+}
+
+// Enqueue commands to read from a buffer object to host memory.
+func (q *CommandQueue) EnqueueReadBuffer(buffer *MemObject, blocking bool, offset, dataSize int, dataPtr unsafe.Pointer, eventWaitList []*Event) (*Event, error) {
+ var event C.cl_event
+ err := toError(C.clEnqueueReadBuffer(q.clQueue, buffer.clMem, clBool(blocking), C.size_t(offset), C.size_t(dataSize), dataPtr, C.cl_uint(len(eventWaitList)), eventListPtr(eventWaitList), &event))
+ return newEvent(event), err
+}
+
+func (q *CommandQueue) EnqueueReadBufferFloat32(buffer *MemObject, blocking bool, offset int, data []float32, eventWaitList []*Event) (*Event, error) {
+ dataPtr := unsafe.Pointer(&data[0])
+ dataSize := int(unsafe.Sizeof(data[0])) * len(data)
+ return q.EnqueueReadBuffer(buffer, blocking, offset, dataSize, dataPtr, eventWaitList)
+}
+
+// Enqueues a command to execute a kernel on a device.
+func (q *CommandQueue) EnqueueNDRangeKernel(kernel *Kernel, globalWorkOffset, globalWorkSize, localWorkSize []int, eventWaitList []*Event) (*Event, error) {
+ workDim := len(globalWorkSize)
+ var globalWorkOffsetList []C.size_t
+ var globalWorkOffsetPtr *C.size_t
+ if globalWorkOffset != nil {
+ globalWorkOffsetList = make([]C.size_t, len(globalWorkOffset))
+ for i, off := range globalWorkOffset {
+ globalWorkOffsetList[i] = C.size_t(off)
+ }
+ globalWorkOffsetPtr = &globalWorkOffsetList[0]
+ }
+ var globalWorkSizeList []C.size_t
+ var globalWorkSizePtr *C.size_t
+ if globalWorkSize != nil {
+ globalWorkSizeList = make([]C.size_t, len(globalWorkSize))
+ for i, off := range globalWorkSize {
+ globalWorkSizeList[i] = C.size_t(off)
+ }
+ globalWorkSizePtr = &globalWorkSizeList[0]
+ }
+ var localWorkSizeList []C.size_t
+ var localWorkSizePtr *C.size_t
+ if localWorkSize != nil {
+ localWorkSizeList = make([]C.size_t, len(localWorkSize))
+ for i, off := range localWorkSize {
+ localWorkSizeList[i] = C.size_t(off)
+ }
+ localWorkSizePtr = &localWorkSizeList[0]
+ }
+ var event C.cl_event
+ err := toError(C.clEnqueueNDRangeKernel(q.clQueue, kernel.clKernel, C.cl_uint(workDim), globalWorkOffsetPtr, globalWorkSizePtr, localWorkSizePtr, C.cl_uint(len(eventWaitList)), eventListPtr(eventWaitList), &event))
+ return newEvent(event), err
+}
+
+// Enqueues a command to read from a 2D or 3D image object to host memory.
+func (q *CommandQueue) EnqueueReadImage(image *MemObject, blocking bool, origin, region [3]int, rowPitch, slicePitch int, data []byte, eventWaitList []*Event) (*Event, error) {
+ cOrigin := sizeT3(origin)
+ cRegion := sizeT3(region)
+ var event C.cl_event
+ err := toError(C.clEnqueueReadImage(q.clQueue, image.clMem, clBool(blocking), &cOrigin[0], &cRegion[0], C.size_t(rowPitch), C.size_t(slicePitch), unsafe.Pointer(&data[0]), C.cl_uint(len(eventWaitList)), eventListPtr(eventWaitList), &event))
+ return newEvent(event), err
+}
+
+// Enqueues a command to write to a 2D or 3D image object from host memory.
+func (q *CommandQueue) EnqueueWriteImage(image *MemObject, blocking bool, origin, region [3]int, rowPitch, slicePitch int, data []byte, eventWaitList []*Event) (*Event, error) {
+ cOrigin := sizeT3(origin)
+ cRegion := sizeT3(region)
+ var event C.cl_event
+ err := toError(C.clEnqueueWriteImage(q.clQueue, image.clMem, clBool(blocking), &cOrigin[0], &cRegion[0], C.size_t(rowPitch), C.size_t(slicePitch), unsafe.Pointer(&data[0]), C.cl_uint(len(eventWaitList)), eventListPtr(eventWaitList), &event))
+ return newEvent(event), err
+}
+
+func (q *CommandQueue) EnqueueFillBuffer(buffer *MemObject, pattern unsafe.Pointer, patternSize, offset, size int, eventWaitList []*Event) (*Event, error) {
+ var event C.cl_event
+ err := toError(C.clEnqueueFillBuffer(q.clQueue, buffer.clMem, pattern, C.size_t(patternSize), C.size_t(offset), C.size_t(size), C.cl_uint(len(eventWaitList)), eventListPtr(eventWaitList), &event))
+ return newEvent(event), err
+}
+
+// A synchronization point that enqueues a barrier operation.
+func (q *CommandQueue) EnqueueBarrierWithWaitList(eventWaitList []*Event) (*Event, error) {
+ var event C.cl_event
+ err := toError(C.clEnqueueBarrierWithWaitList(q.clQueue, C.cl_uint(len(eventWaitList)), eventListPtr(eventWaitList), &event))
+ return newEvent(event), err
+}
+
+// Enqueues a marker command which waits for either a list of events to complete, or all previously enqueued commands to complete.
+func (q *CommandQueue) EnqueueMarkerWithWaitList(eventWaitList []*Event) (*Event, error) {
+ var event C.cl_event
+ err := toError(C.clEnqueueMarkerWithWaitList(q.clQueue, C.cl_uint(len(eventWaitList)), eventListPtr(eventWaitList), &event))
+ return newEvent(event), err
+}
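
The queue methods above compose into the usual write / launch / read pattern. A sketch under stated assumptions: the queue, kernel and the two buffers are created elsewhere (context.go and buffer.go, not shown here), and the kernel's buffer arguments are assumed to be bound already via SetArgBuffer.

    package clexample // hypothetical example, not from the vendored package

    import "github.com/Gustav-Simonsson/go-opencl/cl"

    // runOnce copies data to the device, launches one work-item per element and
    // reads the result back. Passing nil for the local work size lets the driver
    // choose it; the final blocking read doubles as a synchronization point.
    func runOnce(q *cl.CommandQueue, k *cl.Kernel, in, out *cl.MemObject, data []float32) ([]float32, error) {
    	if _, err := q.EnqueueWriteBufferFloat32(in, false, 0, data, nil); err != nil {
    		return nil, err
    	}
    	if _, err := q.EnqueueNDRangeKernel(k, nil, []int{len(data)}, nil, nil); err != nil {
    		return nil, err
    	}
    	res := make([]float32, len(data))
    	if _, err := q.EnqueueReadBufferFloat32(out, true, 0, res, nil); err != nil {
    		return nil, err
    	}
    	return res, q.Finish()
    }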
diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/types.go b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/types.go
new file mode 100644
index 000000000..00c846ce7
--- /dev/null
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/types.go
@@ -0,0 +1,487 @@
+package cl
+
+// #ifdef __APPLE__
+// #include "OpenCL/opencl.h"
+// #else
+// #include "cl.h"
+// #endif
+import "C"
+
+import (
+ "errors"
+ "fmt"
+ "reflect"
+ "runtime"
+ "strings"
+ "unsafe"
+)
+
+var (
+ ErrUnknown = errors.New("cl: unknown error") // Generally an unexpected result from an OpenCL function (e.g. CL_SUCCESS but null pointer)
+)
+
+type ErrOther int
+
+func (e ErrOther) Error() string {
+ return fmt.Sprintf("cl: error %d", int(e))
+}
+
+var (
+ ErrDeviceNotFound = errors.New("cl: Device Not Found")
+ ErrDeviceNotAvailable = errors.New("cl: Device Not Available")
+ ErrCompilerNotAvailable = errors.New("cl: Compiler Not Available")
+ ErrMemObjectAllocationFailure = errors.New("cl: Mem Object Allocation Failure")
+ ErrOutOfResources = errors.New("cl: Out Of Resources")
+ ErrOutOfHostMemory = errors.New("cl: Out Of Host Memory")
+ ErrProfilingInfoNotAvailable = errors.New("cl: Profiling Info Not Available")
+ ErrMemCopyOverlap = errors.New("cl: Mem Copy Overlap")
+ ErrImageFormatMismatch = errors.New("cl: Image Format Mismatch")
+ ErrImageFormatNotSupported = errors.New("cl: Image Format Not Supported")
+ ErrBuildProgramFailure = errors.New("cl: Build Program Failure")
+ ErrMapFailure = errors.New("cl: Map Failure")
+ ErrMisalignedSubBufferOffset = errors.New("cl: Misaligned Sub Buffer Offset")
+ ErrExecStatusErrorForEventsInWaitList = errors.New("cl: Exec Status Error For Events In Wait List")
+ ErrCompileProgramFailure = errors.New("cl: Compile Program Failure")
+ ErrLinkerNotAvailable = errors.New("cl: Linker Not Available")
+ ErrLinkProgramFailure = errors.New("cl: Link Program Failure")
+ ErrDevicePartitionFailed = errors.New("cl: Device Partition Failed")
+ ErrKernelArgInfoNotAvailable = errors.New("cl: Kernel Arg Info Not Available")
+ ErrInvalidValue = errors.New("cl: Invalid Value")
+ ErrInvalidDeviceType = errors.New("cl: Invalid Device Type")
+ ErrInvalidPlatform = errors.New("cl: Invalid Platform")
+ ErrInvalidDevice = errors.New("cl: Invalid Device")
+ ErrInvalidContext = errors.New("cl: Invalid Context")
+ ErrInvalidQueueProperties = errors.New("cl: Invalid Queue Properties")
+ ErrInvalidCommandQueue = errors.New("cl: Invalid Command Queue")
+ ErrInvalidHostPtr = errors.New("cl: Invalid Host Ptr")
+ ErrInvalidMemObject = errors.New("cl: Invalid Mem Object")
+ ErrInvalidImageFormatDescriptor = errors.New("cl: Invalid Image Format Descriptor")
+ ErrInvalidImageSize = errors.New("cl: Invalid Image Size")
+ ErrInvalidSampler = errors.New("cl: Invalid Sampler")
+ ErrInvalidBinary = errors.New("cl: Invalid Binary")
+ ErrInvalidBuildOptions = errors.New("cl: Invalid Build Options")
+ ErrInvalidProgram = errors.New("cl: Invalid Program")
+ ErrInvalidProgramExecutable = errors.New("cl: Invalid Program Executable")
+ ErrInvalidKernelName = errors.New("cl: Invalid Kernel Name")
+ ErrInvalidKernelDefinition = errors.New("cl: Invalid Kernel Definition")
+ ErrInvalidKernel = errors.New("cl: Invalid Kernel")
+ ErrInvalidArgIndex = errors.New("cl: Invalid Arg Index")
+ ErrInvalidArgValue = errors.New("cl: Invalid Arg Value")
+ ErrInvalidArgSize = errors.New("cl: Invalid Arg Size")
+ ErrInvalidKernelArgs = errors.New("cl: Invalid Kernel Args")
+ ErrInvalidWorkDimension = errors.New("cl: Invalid Work Dimension")
+ ErrInvalidWorkGroupSize = errors.New("cl: Invalid Work Group Size")
+ ErrInvalidWorkItemSize = errors.New("cl: Invalid Work Item Size")
+ ErrInvalidGlobalOffset = errors.New("cl: Invalid Global Offset")
+ ErrInvalidEventWaitList = errors.New("cl: Invalid Event Wait List")
+ ErrInvalidEvent = errors.New("cl: Invalid Event")
+ ErrInvalidOperation = errors.New("cl: Invalid Operation")
+ ErrInvalidGlObject = errors.New("cl: Invalid Gl Object")
+ ErrInvalidBufferSize = errors.New("cl: Invalid Buffer Size")
+ ErrInvalidMipLevel = errors.New("cl: Invalid Mip Level")
+ ErrInvalidGlobalWorkSize = errors.New("cl: Invalid Global Work Size")
+ ErrInvalidProperty = errors.New("cl: Invalid Property")
+ ErrInvalidImageDescriptor = errors.New("cl: Invalid Image Descriptor")
+ ErrInvalidCompilerOptions = errors.New("cl: Invalid Compiler Options")
+ ErrInvalidLinkerOptions = errors.New("cl: Invalid Linker Options")
+ ErrInvalidDevicePartitionCount = errors.New("cl: Invalid Device Partition Count")
+)
+var errorMap = map[C.cl_int]error{
+ C.CL_SUCCESS: nil,
+ C.CL_DEVICE_NOT_FOUND: ErrDeviceNotFound,
+ C.CL_DEVICE_NOT_AVAILABLE: ErrDeviceNotAvailable,
+ C.CL_COMPILER_NOT_AVAILABLE: ErrCompilerNotAvailable,
+ C.CL_MEM_OBJECT_ALLOCATION_FAILURE: ErrMemObjectAllocationFailure,
+ C.CL_OUT_OF_RESOURCES: ErrOutOfResources,
+ C.CL_OUT_OF_HOST_MEMORY: ErrOutOfHostMemory,
+ C.CL_PROFILING_INFO_NOT_AVAILABLE: ErrProfilingInfoNotAvailable,
+ C.CL_MEM_COPY_OVERLAP: ErrMemCopyOverlap,
+ C.CL_IMAGE_FORMAT_MISMATCH: ErrImageFormatMismatch,
+ C.CL_IMAGE_FORMAT_NOT_SUPPORTED: ErrImageFormatNotSupported,
+ C.CL_BUILD_PROGRAM_FAILURE: ErrBuildProgramFailure,
+ C.CL_MAP_FAILURE: ErrMapFailure,
+ C.CL_MISALIGNED_SUB_BUFFER_OFFSET: ErrMisalignedSubBufferOffset,
+ C.CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: ErrExecStatusErrorForEventsInWaitList,
+ C.CL_INVALID_VALUE: ErrInvalidValue,
+ C.CL_INVALID_DEVICE_TYPE: ErrInvalidDeviceType,
+ C.CL_INVALID_PLATFORM: ErrInvalidPlatform,
+ C.CL_INVALID_DEVICE: ErrInvalidDevice,
+ C.CL_INVALID_CONTEXT: ErrInvalidContext,
+ C.CL_INVALID_QUEUE_PROPERTIES: ErrInvalidQueueProperties,
+ C.CL_INVALID_COMMAND_QUEUE: ErrInvalidCommandQueue,
+ C.CL_INVALID_HOST_PTR: ErrInvalidHostPtr,
+ C.CL_INVALID_MEM_OBJECT: ErrInvalidMemObject,
+ C.CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: ErrInvalidImageFormatDescriptor,
+ C.CL_INVALID_IMAGE_SIZE: ErrInvalidImageSize,
+ C.CL_INVALID_SAMPLER: ErrInvalidSampler,
+ C.CL_INVALID_BINARY: ErrInvalidBinary,
+ C.CL_INVALID_BUILD_OPTIONS: ErrInvalidBuildOptions,
+ C.CL_INVALID_PROGRAM: ErrInvalidProgram,
+ C.CL_INVALID_PROGRAM_EXECUTABLE: ErrInvalidProgramExecutable,
+ C.CL_INVALID_KERNEL_NAME: ErrInvalidKernelName,
+ C.CL_INVALID_KERNEL_DEFINITION: ErrInvalidKernelDefinition,
+ C.CL_INVALID_KERNEL: ErrInvalidKernel,
+ C.CL_INVALID_ARG_INDEX: ErrInvalidArgIndex,
+ C.CL_INVALID_ARG_VALUE: ErrInvalidArgValue,
+ C.CL_INVALID_ARG_SIZE: ErrInvalidArgSize,
+ C.CL_INVALID_KERNEL_ARGS: ErrInvalidKernelArgs,
+ C.CL_INVALID_WORK_DIMENSION: ErrInvalidWorkDimension,
+ C.CL_INVALID_WORK_GROUP_SIZE: ErrInvalidWorkGroupSize,
+ C.CL_INVALID_WORK_ITEM_SIZE: ErrInvalidWorkItemSize,
+ C.CL_INVALID_GLOBAL_OFFSET: ErrInvalidGlobalOffset,
+ C.CL_INVALID_EVENT_WAIT_LIST: ErrInvalidEventWaitList,
+ C.CL_INVALID_EVENT: ErrInvalidEvent,
+ C.CL_INVALID_OPERATION: ErrInvalidOperation,
+ C.CL_INVALID_GL_OBJECT: ErrInvalidGlObject,
+ C.CL_INVALID_BUFFER_SIZE: ErrInvalidBufferSize,
+ C.CL_INVALID_MIP_LEVEL: ErrInvalidMipLevel,
+ C.CL_INVALID_GLOBAL_WORK_SIZE: ErrInvalidGlobalWorkSize,
+ C.CL_INVALID_PROPERTY: ErrInvalidProperty,
+}
+
+func toError(code C.cl_int) error {
+ if err, ok := errorMap[code]; ok {
+ return err
+ }
+ return ErrOther(code)
+}
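A minimal sketch of the wrapper pattern this map enables (illustrative only, not part of the commit): every binding calls into OpenCL and passes the returned status through toError. It assumes the clMem handle field that MemObject stores (the same field ImageDescription.toCl uses further down); clReleaseMemObject is the standard OpenCL release call.

    func releaseMemObject(b *MemObject) error {
            // clReleaseMemObject returns a cl_int status; toError maps it onto the errors above.
            return toError(C.clReleaseMemObject(b.clMem))
    }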
+
+type LocalMemType int
+
+const (
+ LocalMemTypeNone LocalMemType = C.CL_NONE
+ LocalMemTypeGlobal LocalMemType = C.CL_GLOBAL
+ LocalMemTypeLocal LocalMemType = C.CL_LOCAL
+)
+
+var localMemTypeMap = map[LocalMemType]string{
+ LocalMemTypeNone: "None",
+ LocalMemTypeGlobal: "Global",
+ LocalMemTypeLocal: "Local",
+}
+
+func (t LocalMemType) String() string {
+ name := localMemTypeMap[t]
+ if name == "" {
+ name = "Unknown"
+ }
+ return name
+}
+
+type ExecCapability int
+
+const (
+ ExecCapabilityKernel ExecCapability = C.CL_EXEC_KERNEL // The OpenCL device can execute OpenCL kernels.
+ ExecCapabilityNativeKernel ExecCapability = C.CL_EXEC_NATIVE_KERNEL // The OpenCL device can execute native kernels.
+)
+
+func (ec ExecCapability) String() string {
+ var parts []string
+ if ec&ExecCapabilityKernel != 0 {
+ parts = append(parts, "Kernel")
+ }
+ if ec&ExecCapabilityNativeKernel != 0 {
+ parts = append(parts, "NativeKernel")
+ }
+ if parts == nil {
+ return ""
+ }
+ return strings.Join(parts, "|")
+}
+
+type MemCacheType int
+
+const (
+ MemCacheTypeNone MemCacheType = C.CL_NONE
+ MemCacheTypeReadOnlyCache MemCacheType = C.CL_READ_ONLY_CACHE
+ MemCacheTypeReadWriteCache MemCacheType = C.CL_READ_WRITE_CACHE
+)
+
+func (ct MemCacheType) String() string {
+ switch ct {
+ case MemCacheTypeNone:
+ return "None"
+ case MemCacheTypeReadOnlyCache:
+ return "ReadOnly"
+ case MemCacheTypeReadWriteCache:
+ return "ReadWrite"
+ }
+ return fmt.Sprintf("Unknown(%x)", int(ct))
+}
+
+type MemFlag int
+
+const (
+ MemReadWrite MemFlag = C.CL_MEM_READ_WRITE
+ MemWriteOnly MemFlag = C.CL_MEM_WRITE_ONLY
+ MemReadOnly MemFlag = C.CL_MEM_READ_ONLY
+ MemUseHostPtr MemFlag = C.CL_MEM_USE_HOST_PTR
+ MemAllocHostPtr MemFlag = C.CL_MEM_ALLOC_HOST_PTR
+ MemCopyHostPtr MemFlag = C.CL_MEM_COPY_HOST_PTR
+
+ MemWriteOnlyHost MemFlag = C.CL_MEM_HOST_WRITE_ONLY
+ MemReadOnlyHost MemFlag = C.CL_MEM_HOST_READ_ONLY
+ MemNoAccessHost MemFlag = C.CL_MEM_HOST_NO_ACCESS
+)
+
+type MemObjectType int
+
+const (
+ MemObjectTypeBuffer MemObjectType = C.CL_MEM_OBJECT_BUFFER
+ MemObjectTypeImage2D MemObjectType = C.CL_MEM_OBJECT_IMAGE2D
+ MemObjectTypeImage3D MemObjectType = C.CL_MEM_OBJECT_IMAGE3D
+)
+
+type MapFlag int
+
+const (
+ // This flag specifies that the region being mapped in the memory object is being mapped for reading.
+ MapFlagRead MapFlag = C.CL_MAP_READ
+ MapFlagWrite MapFlag = C.CL_MAP_WRITE
+ MapFlagWriteInvalidateRegion MapFlag = C.CL_MAP_WRITE_INVALIDATE_REGION
+)
+
+func (mf MapFlag) toCl() C.cl_map_flags {
+ return C.cl_map_flags(mf)
+}
+
+type ChannelOrder int
+
+const (
+ ChannelOrderR ChannelOrder = C.CL_R
+ ChannelOrderA ChannelOrder = C.CL_A
+ ChannelOrderRG ChannelOrder = C.CL_RG
+ ChannelOrderRA ChannelOrder = C.CL_RA
+ ChannelOrderRGB ChannelOrder = C.CL_RGB
+ ChannelOrderRGBA ChannelOrder = C.CL_RGBA
+ ChannelOrderBGRA ChannelOrder = C.CL_BGRA
+ ChannelOrderARGB ChannelOrder = C.CL_ARGB
+ ChannelOrderIntensity ChannelOrder = C.CL_INTENSITY
+ ChannelOrderLuminance ChannelOrder = C.CL_LUMINANCE
+ ChannelOrderRx ChannelOrder = C.CL_Rx
+ ChannelOrderRGx ChannelOrder = C.CL_RGx
+ ChannelOrderRGBx ChannelOrder = C.CL_RGBx
+)
+
+var channelOrderNameMap = map[ChannelOrder]string{
+ ChannelOrderR: "R",
+ ChannelOrderA: "A",
+ ChannelOrderRG: "RG",
+ ChannelOrderRA: "RA",
+ ChannelOrderRGB: "RGB",
+ ChannelOrderRGBA: "RGBA",
+ ChannelOrderBGRA: "BGRA",
+ ChannelOrderARGB: "ARGB",
+ ChannelOrderIntensity: "Intensity",
+ ChannelOrderLuminance: "Luminance",
+ ChannelOrderRx: "Rx",
+ ChannelOrderRGx: "RGx",
+ ChannelOrderRGBx: "RGBx",
+}
+
+func (co ChannelOrder) String() string {
+ name := channelOrderNameMap[co]
+ if name == "" {
+ name = fmt.Sprintf("Unknown(%x)", int(co))
+ }
+ return name
+}
+
+type ChannelDataType int
+
+const (
+ ChannelDataTypeSNormInt8 ChannelDataType = C.CL_SNORM_INT8
+ ChannelDataTypeSNormInt16 ChannelDataType = C.CL_SNORM_INT16
+ ChannelDataTypeUNormInt8 ChannelDataType = C.CL_UNORM_INT8
+ ChannelDataTypeUNormInt16 ChannelDataType = C.CL_UNORM_INT16
+ ChannelDataTypeUNormShort565 ChannelDataType = C.CL_UNORM_SHORT_565
+ ChannelDataTypeUNormShort555 ChannelDataType = C.CL_UNORM_SHORT_555
+ ChannelDataTypeUNormInt101010 ChannelDataType = C.CL_UNORM_INT_101010
+ ChannelDataTypeSignedInt8 ChannelDataType = C.CL_SIGNED_INT8
+ ChannelDataTypeSignedInt16 ChannelDataType = C.CL_SIGNED_INT16
+ ChannelDataTypeSignedInt32 ChannelDataType = C.CL_SIGNED_INT32
+ ChannelDataTypeUnsignedInt8 ChannelDataType = C.CL_UNSIGNED_INT8
+ ChannelDataTypeUnsignedInt16 ChannelDataType = C.CL_UNSIGNED_INT16
+ ChannelDataTypeUnsignedInt32 ChannelDataType = C.CL_UNSIGNED_INT32
+ ChannelDataTypeHalfFloat ChannelDataType = C.CL_HALF_FLOAT
+ ChannelDataTypeFloat ChannelDataType = C.CL_FLOAT
+)
+
+var channelDataTypeNameMap = map[ChannelDataType]string{
+ ChannelDataTypeSNormInt8: "SNormInt8",
+ ChannelDataTypeSNormInt16: "SNormInt16",
+ ChannelDataTypeUNormInt8: "UNormInt8",
+ ChannelDataTypeUNormInt16: "UNormInt16",
+ ChannelDataTypeUNormShort565: "UNormShort565",
+ ChannelDataTypeUNormShort555: "UNormShort555",
+ ChannelDataTypeUNormInt101010: "UNormInt101010",
+ ChannelDataTypeSignedInt8: "SignedInt8",
+ ChannelDataTypeSignedInt16: "SignedInt16",
+ ChannelDataTypeSignedInt32: "SignedInt32",
+ ChannelDataTypeUnsignedInt8: "UnsignedInt8",
+ ChannelDataTypeUnsignedInt16: "UnsignedInt16",
+ ChannelDataTypeUnsignedInt32: "UnsignedInt32",
+ ChannelDataTypeHalfFloat: "HalfFloat",
+ ChannelDataTypeFloat: "Float",
+}
+
+func (ct ChannelDataType) String() string {
+ name := channelDataTypeNameMap[ct]
+ if name == "" {
+ name = fmt.Sprintf("Unknown(%x)", int(ct))
+ }
+ return name
+}
+
+type ImageFormat struct {
+ ChannelOrder ChannelOrder
+ ChannelDataType ChannelDataType
+}
+
+func (f ImageFormat) toCl() C.cl_image_format {
+ var format C.cl_image_format
+ format.image_channel_order = C.cl_channel_order(f.ChannelOrder)
+ format.image_channel_data_type = C.cl_channel_type(f.ChannelDataType)
+ return format
+}
+
+type ProfilingInfo int
+
+const (
+ // A 64-bit value that describes the current device time counter in
+ // nanoseconds when the command identified by event is enqueued in
+ // a command-queue by the host.
+ ProfilingInfoCommandQueued ProfilingInfo = C.CL_PROFILING_COMMAND_QUEUED
+ // A 64-bit value that describes the current device time counter in
+ // nanoseconds when the command identified by event that has been
+ // enqueued is submitted by the host to the device associated with the command-queue.
+ ProfilingInfoCommandSubmit ProfilingInfo = C.CL_PROFILING_COMMAND_SUBMIT
+ // A 64-bit value that describes the current device time counter in
+ // nanoseconds when the command identified by event starts execution on the device.
+ ProfilingInfoCommandStart ProfilingInfo = C.CL_PROFILING_COMMAND_START
+ // A 64-bit value that describes the current device time counter in
+ // nanoseconds when the command identified by event has finished
+ // execution on the device.
+ ProfilingInfoCommandEnd ProfilingInfo = C.CL_PROFILING_COMMAND_END
+)
+
+type CommmandExecStatus int
+
+const (
+ CommmandExecStatusComplete CommmandExecStatus = C.CL_COMPLETE
+ CommmandExecStatusRunning CommmandExecStatus = C.CL_RUNNING
+ CommmandExecStatusSubmitted CommmandExecStatus = C.CL_SUBMITTED
+ CommmandExecStatusQueued CommmandExecStatus = C.CL_QUEUED
+)
+
+type Event struct {
+ clEvent C.cl_event
+}
+
+func releaseEvent(ev *Event) {
+ if ev.clEvent != nil {
+ C.clReleaseEvent(ev.clEvent)
+ ev.clEvent = nil
+ }
+}
+
+func (e *Event) Release() {
+ releaseEvent(e)
+}
+
+func (e *Event) GetEventProfilingInfo(paramName ProfilingInfo) (int64, error) {
+ var paramValue C.cl_ulong
+ if err := C.clGetEventProfilingInfo(e.clEvent, C.cl_profiling_info(paramName), C.size_t(unsafe.Sizeof(paramValue)), unsafe.Pointer(&paramValue), nil); err != C.CL_SUCCESS {
+ return 0, toError(err)
+ }
+ return int64(paramValue), nil
+}
+
+// Sets the execution status of a user event object.
+//
+// `status` specifies the new execution status to be set and
+// can be CL_COMPLETE or a negative integer value to indicate
+// an error. A negative integer value causes all enqueued commands
+// that wait on this user event to be terminated. clSetUserEventStatus
+// can only be called once to change the execution status of event.
+func (e *Event) SetUserEventStatus(status int) error {
+ return toError(C.clSetUserEventStatus(e.clEvent, C.cl_int(status)))
+}
+
+// Waits on the host thread for commands identified by event objects in
+// events to complete. A command is considered complete if its execution
+// status is CL_COMPLETE or a negative value. The events specified in
+// event_list act as synchronization points.
+//
+// If the cl_khr_gl_event extension is enabled, event objects can also be
+// used to reflect the status of an OpenGL sync object. The sync object
+// in turn refers to a fence command executing in an OpenGL command
+// stream. This provides another method of coordinating sharing of buffers
+// and images between OpenGL and OpenCL.
+func WaitForEvents(events []*Event) error {
+ return toError(C.clWaitForEvents(C.cl_uint(len(events)), eventListPtr(events)))
+}
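A hedged sketch of how user events tie these helpers together, assuming a *Context named context created earlier and the CreateUserEvent constructor defined in context.go (the miner code further down calls ctx.CreateUserEvent the same way): one goroutine marks the event complete, and WaitForEvents blocks until it does.

    ev, err := context.CreateUserEvent()
    if err != nil {
            return err
    }
    go func() {
            // ... host-side work ...
            ev.SetUserEventStatus(int(CommmandExecStatusComplete)) // CL_COMPLETE unblocks waiters
    }()
    if err := WaitForEvents([]*Event{ev}); err != nil {
            return err
    }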
+
+func newEvent(clEvent C.cl_event) *Event {
+ ev := &Event{clEvent: clEvent}
+ runtime.SetFinalizer(ev, releaseEvent)
+ return ev
+}
+
+func eventListPtr(el []*Event) *C.cl_event {
+ if len(el) == 0 {
+ return nil
+ }
+ elist := make([]C.cl_event, len(el))
+ for i, e := range el {
+ elist[i] = e.clEvent
+ }
+ return (*C.cl_event)(&elist[0])
+}
+
+func clBool(b bool) C.cl_bool {
+ if b {
+ return C.CL_TRUE
+ }
+ return C.CL_FALSE
+}
+
+func sizeT3(i3 [3]int) [3]C.size_t {
+ var val [3]C.size_t
+ val[0] = C.size_t(i3[0])
+ val[1] = C.size_t(i3[1])
+ val[2] = C.size_t(i3[2])
+ return val
+}
+
+type MappedMemObject struct {
+ ptr unsafe.Pointer
+ size int
+ rowPitch int
+ slicePitch int
+}
+
+func (mb *MappedMemObject) ByteSlice() []byte {
+ var byteSlice []byte
+ sliceHeader := (*reflect.SliceHeader)(unsafe.Pointer(&byteSlice))
+ sliceHeader.Cap = mb.size
+ sliceHeader.Len = mb.size
+ sliceHeader.Data = uintptr(mb.ptr)
+ return byteSlice
+}
+
+func (mb *MappedMemObject) Ptr() unsafe.Pointer {
+ return mb.ptr
+}
+
+func (mb *MappedMemObject) Size() int {
+ return mb.size
+}
+
+func (mb *MappedMemObject) RowPitch() int {
+ return mb.rowPitch
+}
+
+func (mb *MappedMemObject) SlicePitch() int {
+ return mb.slicePitch
+}
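ByteSlice builds a []byte view directly over the mapped device pointer, so mapped results can be read without an extra copy. A rough usage sketch under that assumption, mirroring how the miner below reads its search buffers (queue and buf are an existing *CommandQueue and *MemObject; encoding/binary is assumed imported):

    mapped, _, err := queue.EnqueueMapBuffer(buf, true, MapFlagRead, 0, 4, nil)
    if err != nil {
            return err
    }
    count := binary.LittleEndian.Uint32(mapped.ByteSlice()) // first word of the buffer
    _, err = queue.EnqueueUnmapMemObject(buf, mapped, nil)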
diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/types12.go b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/types12.go
new file mode 100644
index 000000000..58023cb60
--- /dev/null
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/types12.go
@@ -0,0 +1,71 @@
+// +build cl12
+
+package cl
+
+// #ifdef __APPLE__
+// #include "OpenCL/opencl.h"
+// #else
+// #include "cl.h"
+// #endif
+import "C"
+
+const (
+ ChannelDataTypeUNormInt24 ChannelDataType = C.CL_UNORM_INT24
+ ChannelOrderDepth ChannelOrder = C.CL_DEPTH
+ ChannelOrderDepthStencil ChannelOrder = C.CL_DEPTH_STENCIL
+ MemHostNoAccess MemFlag = C.CL_MEM_HOST_NO_ACCESS // OpenCL 1.2
+ MemHostReadOnly MemFlag = C.CL_MEM_HOST_READ_ONLY // OpenCL 1.2
+ MemHostWriteOnly MemFlag = C.CL_MEM_HOST_WRITE_ONLY // OpenCL 1.2
+ MemObjectTypeImage1D MemObjectType = C.CL_MEM_OBJECT_IMAGE1D
+ MemObjectTypeImage1DArray MemObjectType = C.CL_MEM_OBJECT_IMAGE1D_ARRAY
+ MemObjectTypeImage1DBuffer MemObjectType = C.CL_MEM_OBJECT_IMAGE1D_BUFFER
+ MemObjectTypeImage2DArray MemObjectType = C.CL_MEM_OBJECT_IMAGE2D_ARRAY
+ // This flag specifies that the region being mapped in the memory object is being mapped for writing.
+ //
+ // The contents of the region being mapped are to be discarded. This is typically the case when the
+ // region being mapped is overwritten by the host. This flag allows the implementation to no longer
+ // guarantee that the pointer returned by clEnqueueMapBuffer or clEnqueueMapImage contains the
+ // latest bits in the region being mapped which can be a significant performance enhancement.
+ MapFlagWriteInvalidateRegion MapFlag = C.CL_MAP_WRITE_INVALIDATE_REGION
+)
+
+func init() {
+ errorMap[C.CL_COMPILE_PROGRAM_FAILURE] = ErrCompileProgramFailure
+ errorMap[C.CL_DEVICE_PARTITION_FAILED] = ErrDevicePartitionFailed
+ errorMap[C.CL_INVALID_COMPILER_OPTIONS] = ErrInvalidCompilerOptions
+ errorMap[C.CL_INVALID_DEVICE_PARTITION_COUNT] = ErrInvalidDevicePartitionCount
+ errorMap[C.CL_INVALID_IMAGE_DESCRIPTOR] = ErrInvalidImageDescriptor
+ errorMap[C.CL_INVALID_LINKER_OPTIONS] = ErrInvalidLinkerOptions
+ errorMap[C.CL_KERNEL_ARG_INFO_NOT_AVAILABLE] = ErrKernelArgInfoNotAvailable
+ errorMap[C.CL_LINK_PROGRAM_FAILURE] = ErrLinkProgramFailure
+ errorMap[C.CL_LINKER_NOT_AVAILABLE] = ErrLinkerNotAvailable
+ channelOrderNameMap[ChannelOrderDepth] = "Depth"
+ channelOrderNameMap[ChannelOrderDepthStencil] = "DepthStencil"
+ channelDataTypeNameMap[ChannelDataTypeUNormInt24] = "UNormInt24"
+}
+
+type ImageDescription struct {
+ Type MemObjectType
+ Width, Height, Depth int
+ ArraySize, RowPitch, SlicePitch int
+ NumMipLevels, NumSamples int
+ Buffer *MemObject
+}
+
+func (d ImageDescription) toCl() C.cl_image_desc {
+ var desc C.cl_image_desc
+ desc.image_type = C.cl_mem_object_type(d.Type)
+ desc.image_width = C.size_t(d.Width)
+ desc.image_height = C.size_t(d.Height)
+ desc.image_depth = C.size_t(d.Depth)
+ desc.image_array_size = C.size_t(d.ArraySize)
+ desc.image_row_pitch = C.size_t(d.RowPitch)
+ desc.image_slice_pitch = C.size_t(d.SlicePitch)
+ desc.num_mip_levels = C.cl_uint(d.NumMipLevels)
+ desc.num_samples = C.cl_uint(d.NumSamples)
+ desc.buffer = nil
+ if d.Buffer != nil {
+ desc.buffer = d.Buffer.clMem
+ }
+ return desc
+}
diff --git a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/types_darwin.go b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/types_darwin.go
new file mode 100644
index 000000000..ddcf74906
--- /dev/null
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/types_darwin.go
@@ -0,0 +1,45 @@
+package cl
+
+// #ifdef __APPLE__
+// #include "OpenCL/opencl.h"
+// #else
+// #include "cl.h"
+// #endif
+import "C"
+
+// Extension: cl_APPLE_fixed_alpha_channel_orders
+//
+// These selectors may be passed to clCreateImage2D() in the cl_image_format.image_channel_order field.
+// They are like CL_BGRA and CL_ARGB except that the alpha channel is ignored. On calls to read_imagef,
+// the alpha will be 0xff (1.0f) if the sample falls in the image and 0 if it does not fall in the image.
+// On calls to write_imagef, the alpha value is ignored and 0xff (1.0f) is written. These formats are
+// currently only available for the CL_UNORM_INT8 cl_channel_type. They are intended to support legacy
+// image formats.
+const (
+ ChannelOrder1RGBApple ChannelOrder = C.CL_1RGB_APPLE // Introduced in MacOS X.7.
+ ChannelOrderBGR1Apple ChannelOrder = C.CL_BGR1_APPLE // Introduced in MacOS X.7.
+)
+
+// Extension: cl_APPLE_biased_fixed_point_image_formats
+//
+// This selector may be passed to clCreateImage2D() in the cl_image_format.image_channel_data_type field.
+// It defines a biased signed 1.14 fixed point storage format, with range [-1, 3). The conversion from
+// float to this fixed point format is defined as follows:
+//
+// ushort float_to_sfixed14( float x ){
+// int i = convert_int_sat_rte( x * 0x1.0p14f ); // scale [-1, 3.0) to [-16384, 3*16384), round to nearest integer
+// i = add_sat( i, 0x4000 ); // apply bias, to convert to [0, 65535) range
+// return convert_ushort_sat(i); // clamp to destination size
+// }
+//
+// The inverse conversion is the reverse process. The formats are currently only available on the CPU with
+// the CL_RGBA channel layout.
+const (
+ ChannelDataTypeSFixed14Apple ChannelDataType = C.CL_SFIXED14_APPLE // Introduced in MacOS X.7.
+)
+
+func init() {
+ channelOrderNameMap[ChannelOrder1RGBApple] = "1RGBApple"
+ channelOrderNameMap[ChannelOrderBGR1Apple] = "BGR1Apple"
+ channelDataTypeNameMap[ChannelDataTypeSFixed14Apple] = "SFixed14Apple"
+}
diff --git a/Godeps/_workspace/src/github.com/ethereum/ethash/ethash.go b/Godeps/_workspace/src/github.com/ethereum/ethash/ethash.go
index d0864da7f..ddb8ba583 100644
--- a/Godeps/_workspace/src/github.com/ethereum/ethash/ethash.go
+++ b/Godeps/_workspace/src/github.com/ethereum/ethash/ethash.go
@@ -30,8 +30,8 @@ import (
)
var (
- minDifficulty = new(big.Int).Exp(big.NewInt(2), big.NewInt(256), big.NewInt(0))
- sharedLight = new(Light)
+ maxUint256 = new(big.Int).Exp(big.NewInt(2), big.NewInt(256), big.NewInt(0))
+ sharedLight = new(Light)
)
const (
@@ -140,7 +140,7 @@ func (l *Light) Verify(block pow.Block) bool {
// the finalizer before the call completes.
_ = cache
// The actual check.
- target := new(big.Int).Div(minDifficulty, difficulty)
+ target := new(big.Int).Div(maxUint256, difficulty)
return h256ToHash(ret.result).Big().Cmp(target) <= 0
}
@@ -199,7 +199,7 @@ func (d *dag) generate() {
if d.dir == "" {
d.dir = DefaultDir
}
- glog.V(logger.Info).Infof("Generating DAG for epoch %d (%x)", d.epoch, seedHash)
+ glog.V(logger.Info).Infof("Generating DAG for epoch %d (size %d) (%x)", d.epoch, dagSize, seedHash)
// Generate a temporary cache.
// TODO: this could share the cache with Light
cache := C.ethash_light_new_internal(cacheSize, (*C.ethash_h256_t)(unsafe.Pointer(&seedHash[0])))
@@ -220,14 +220,18 @@ func (d *dag) generate() {
})
}
-func freeDAG(h *dag) {
- C.ethash_full_delete(h.ptr)
- h.ptr = nil
+func freeDAG(d *dag) {
+ C.ethash_full_delete(d.ptr)
+ d.ptr = nil
+}
+
+func (d *dag) Ptr() unsafe.Pointer {
+ return unsafe.Pointer(d.ptr.data)
}
//export ethashGoCallback
func ethashGoCallback(percent C.unsigned) C.int {
- glog.V(logger.Info).Infof("Still generating DAG: %d%%", percent)
+ glog.V(logger.Info).Infof("Generating DAG: %d%%", percent)
return 0
}
@@ -273,7 +277,7 @@ func (pow *Full) getDAG(blockNum uint64) (d *dag) {
return d
}
-func (pow *Full) Search(block pow.Block, stop <-chan struct{}) (nonce uint64, mixDigest []byte) {
+func (pow *Full) Search(block pow.Block, stop <-chan struct{}, index int) (nonce uint64, mixDigest []byte) {
dag := pow.getDAG(block.NumberU64())
r := rand.New(rand.NewSource(time.Now().UnixNano()))
@@ -286,7 +290,7 @@ func (pow *Full) Search(block pow.Block, stop <-chan struct{}) (nonce uint64, mi
nonce = uint64(r.Int63())
hash := hashToH256(block.HashNoNonce())
- target := new(big.Int).Div(minDifficulty, diff)
+ target := new(big.Int).Div(maxUint256, diff)
for {
select {
case <-stop:
diff --git a/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_opencl.go b/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_opencl.go
new file mode 100644
index 000000000..332b7f524
--- /dev/null
+++ b/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_opencl.go
@@ -0,0 +1,629 @@
+// Copyright 2014 The go-ethereum Authors
+// This file is part of the go-ethereum library.
+//
+// The go-ethereum library is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// The go-ethereum library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
+
+// +build opencl
+
+package ethash
+
+//#cgo LDFLAGS: -w
+//#include <stdint.h>
+//#include <string.h>
+//#include "src/libethash/internal.h"
+import "C"
+
+import (
+ crand "crypto/rand"
+ "encoding/binary"
+ "fmt"
+ "math"
+ "math/big"
+ mrand "math/rand"
+ "strconv"
+ "strings"
+ "sync"
+ "sync/atomic"
+ "time"
+ "unsafe"
+
+ "github.com/Gustav-Simonsson/go-opencl/cl"
+ "github.com/ethereum/go-ethereum/common"
+ "github.com/ethereum/go-ethereum/pow"
+)
+
+/*
+
+  This code has two main entry points:
+
+  1. The initCL(...) function configures one or more OpenCL devices
+     (for now only GPUs) and loads the Ethash DAG into device memory.
+
+  2. The Search(...) function loads an Ethash nonce into device memory and
+     executes the Ethash OpenCL kernel.
+
+  Throughout the code, we refer to "host memory" and "device memory".
+  For most systems (e.g. a regular PC GPU miner) the host memory is RAM and
+ device memory is the GPU global memory (e.g. GDDR5).
+
+ References mentioned in code comments:
+
+ 1. https://github.com/ethereum/wiki/wiki/Ethash
+ 2. https://github.com/ethereum/cpp-ethereum/blob/develop/libethash-cl/ethash_cl_miner.cpp
+ 3. https://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/
+ 4. http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2013/12/AMD_OpenCL_Programming_User_Guide.pdf
+
+*/
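A condensed, hypothetical caller for these two entry points (a sketch only; block, stop and the log package are assumptions: block is any pow.Block and stop is a channel closed to abort the search):

    miner := NewCL([]int{0}) // mine on device id 0
    if err := InitCL(block.NumberU64(), miner); err != nil {
            log.Fatalln("OpenCL init failed:", err) // DAG generation or device setup failed
    }
    nonce, mixDigest := miner.Search(block, stop, 0)
    // nonce and mixDigest are then set on the mined block header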
+
+type OpenCLDevice struct {
+ deviceId int
+ device *cl.Device
+ openCL11 bool // OpenCL versions 1.1 and 1.2 are handled slightly differently
+ openCL12 bool
+
+ dagBuf *cl.MemObject // Ethash full DAG in device mem
+ headerBuf *cl.MemObject // Hash of block-to-mine in device mem
+ searchBuffers []*cl.MemObject
+
+ searchKernel *cl.Kernel
+ hashKernel *cl.Kernel
+
+ queue *cl.CommandQueue
+ ctx *cl.Context
+ workGroupSize int
+
+ nonceRand *mrand.Rand // seeded by crypto/rand, see comments where it's initialised
+ result common.Hash
+}
+
+type OpenCLMiner struct {
+ mu sync.Mutex
+
+ ethash *Ethash // Ethash full DAG & cache in host mem
+
+ deviceIds []int
+ devices []*OpenCLDevice
+
+ dagSize uint64
+
+ hashRate int32 // Go atomics & uint64 have some issues; int32 is supported on all platforms
+}
+
+type pendingSearch struct {
+ bufIndex uint32
+ startNonce uint64
+}
+
+const (
+ SIZEOF_UINT32 = 4
+
+ // See [1]
+ ethashMixBytesLen = 128
+ ethashAccesses = 64
+
+ // See [4]
+ workGroupSize = 32 // must be multiple of 8
+ maxSearchResults = 63
+ searchBufSize = 2
+ globalWorkSize = 1024 * 256
+)
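Each search buffer created in initCLDevice below holds (1+maxSearchResults) uint32 words: word 0 is the number of solutions the kernel reported, and words 1..N are offsets from the start nonce handed to the kernel. A small decoding sketch under that assumption (results being the mapped bytes and startNonce the nonce passed to the kernel; the Search method further down does the same thing):

    nfound := binary.LittleEndian.Uint32(results[:SIZEOF_UINT32]) // word 0: solution count
    for i := uint32(0); i < nfound && i < maxSearchResults; i++ {
            offset := binary.LittleEndian.Uint32(results[(i+1)*SIZEOF_UINT32 : (i+2)*SIZEOF_UINT32])
            candidate := startNonce + uint64(offset) // absolute nonce, verified on the CPU
            _ = candidate
    }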
+
+func NewCL(deviceIds []int) *OpenCLMiner {
+ ids := make([]int, len(deviceIds))
+ copy(ids, deviceIds)
+ return &OpenCLMiner{
+ ethash: New(),
+ dagSize: 0, // to see if we need to update DAG.
+ deviceIds: ids,
+ }
+}
+
+func PrintDevices() {
+ fmt.Println("=============================================")
+ fmt.Println("============ OpenCL Device Info =============")
+ fmt.Println("=============================================")
+
+ var found []*cl.Device
+
+ platforms, err := cl.GetPlatforms()
+ if err != nil {
+ fmt.Printf("Platform error (check your OpenCL installation): %v\n", err)
+ return
+ }
+
+ for i, p := range platforms {
+ fmt.Println("Platform id ", i)
+ fmt.Println("Platform Name ", p.Name())
+ fmt.Println("Platform Vendor ", p.Vendor())
+ fmt.Println("Platform Version ", p.Version())
+ fmt.Println("Platform Extensions ", p.Extensions())
+ fmt.Println("Platform Profile ", p.Profile())
+ fmt.Println("")
+
+ devices, err := cl.GetDevices(p, cl.DeviceTypeGPU)
+ if err != nil {
+ fmt.Println("Device error (check your GPU drivers):", err)
+ return
+ }
+
+ for _, d := range devices {
+ fmt.Println("Device OpenCL id ", i)
+ fmt.Println("Device id for mining ", len(found))
+ fmt.Println("Device Name ", d.Name())
+ fmt.Println("Vendor ", d.Vendor())
+ fmt.Println("Version ", d.Version())
+ fmt.Println("Driver version ", d.DriverVersion())
+ fmt.Println("Address bits ", d.AddressBits())
+ fmt.Println("Max clock freq ", d.MaxClockFrequency())
+ fmt.Println("Global mem size ", d.GlobalMemSize())
+ fmt.Println("Max constant buffer size", d.MaxConstantBufferSize())
+ fmt.Println("Max mem alloc size ", d.MaxMemAllocSize())
+ fmt.Println("Max compute units ", d.MaxComputeUnits())
+ fmt.Println("Max work group size ", d.MaxWorkGroupSize())
+ fmt.Println("Max work item sizes ", d.MaxWorkItemSizes())
+ fmt.Println("=============================================")
+
+ found = append(found, d)
+ }
+ }
+ if len(found) == 0 {
+ fmt.Println("Found no GPU(s). Check that your OS can see the GPU(s)")
+ } else {
+ var idsFormat string
+ for i := 0; i < len(found); i++ {
+ idsFormat += strconv.Itoa(i)
+ if i != len(found)-1 {
+ idsFormat += ","
+ }
+ }
+ fmt.Printf("Found %v devices. Benchmark first GPU: geth gpubench 0\n", len(found))
+ fmt.Printf("Mine using all GPUs: geth --minegpu %v\n", idsFormat)
+ }
+}
+
+// See [2]. We basically do the same here, but the Go OpenCL bindings
+// are at a slightly higher abstraction level.
+func InitCL(blockNum uint64, c *OpenCLMiner) error {
+ platforms, err := cl.GetPlatforms()
+ if err != nil {
+ return fmt.Errorf("Platform error: %v\nCheck your OpenCL installation and then run geth gpuinfo", err)
+ }
+
+ var devices []*cl.Device
+ for _, p := range platforms {
+ ds, err := cl.GetDevices(p, cl.DeviceTypeGPU)
+ if err != nil {
+ return fmt.Errorf("Devices error: %v\nCheck your GPU drivers and then run geth gpuinfo", err)
+ }
+ for _, d := range ds {
+ devices = append(devices, d)
+ }
+ }
+
+ pow := New()
+ _ = pow.getDAG(blockNum) // generates DAG if we don't have it
+ pow.Light.getCache(blockNum) // and cache
+
+ c.ethash = pow
+ dagSize := uint64(C.ethash_get_datasize(C.uint64_t(blockNum)))
+ c.dagSize = dagSize
+
+ for _, id := range c.deviceIds {
+ if id > len(devices)-1 {
+ return fmt.Errorf("Device id not found. See available device ids with: geth gpuinfo")
+ } else {
+ err := initCLDevice(id, devices[id], c)
+ if err != nil {
+ return err
+ }
+ }
+ }
+ if len(c.devices) == 0 {
+ return fmt.Errorf("No GPU devices found")
+ }
+ return nil
+}
+
+func initCLDevice(deviceId int, device *cl.Device, c *OpenCLMiner) error {
+ devMaxAlloc := uint64(device.MaxMemAllocSize())
+ devGlobalMem := uint64(device.GlobalMemSize())
+
+ // TODO: more fine grained version logic
+ if device.Version() == "OpenCL 1.0" {
+ fmt.Println("Device OpenCL version not supported: ", device.Version())
+ return fmt.Errorf("opencl version not supported")
+ }
+
+ var cl11, cl12 bool
+ if device.Version() == "OpenCL 1.1" {
+ cl11 = true
+ }
+ if device.Version() == "OpenCL 1.2" {
+ cl12 = true
+ }
+
+ // log warnings but carry on; some device drivers report inaccurate values
+ if c.dagSize > devGlobalMem {
+ fmt.Printf("WARNING: device memory may be insufficient: %v. DAG size: %v.\n", devGlobalMem, c.dagSize)
+ }
+
+ if c.dagSize > devMaxAlloc {
+ fmt.Printf("WARNING: DAG size (%v) larger than device max memory allocation size (%v).\n", c.dagSize, devMaxAlloc)
+ fmt.Printf("You probably have to export GPU_MAX_ALLOC_PERCENT=95\n")
+ }
+
+ fmt.Printf("Initialising device %v: %v\n", deviceId, device.Name())
+
+ context, err := cl.CreateContext([]*cl.Device{device})
+ if err != nil {
+ return fmt.Errorf("failed creating context: %v", err)
+ }
+
+ // TODO: test running with CL_QUEUE_PROFILING_ENABLE for profiling?
+ queue, err := context.CreateCommandQueue(device, 0)
+ if err != nil {
+ return fmt.Errorf("command queue err: %v", err)
+ }
+
+ // See [4] section 3.2 and [3] "clBuildProgram".
+ // The OpenCL kernel code is compiled at run-time.
+ kvs := make(map[string]string, 4)
+ kvs["GROUP_SIZE"] = strconv.FormatUint(workGroupSize, 10)
+ kvs["DAG_SIZE"] = strconv.FormatUint(c.dagSize/ethashMixBytesLen, 10)
+ kvs["ACCESSES"] = strconv.FormatUint(ethashAccesses, 10)
+ kvs["MAX_OUTPUTS"] = strconv.FormatUint(maxSearchResults, 10)
+ kernelCode := replaceWords(kernel, kvs)
+
+ program, err := context.CreateProgramWithSource([]string{kernelCode})
+ if err != nil {
+ return fmt.Errorf("program err: %v", err)
+ }
+
+ /* if using AMD OpenCL impl, you can set this to debug on x86 CPU device.
+ see AMD OpenCL programming guide section 4.2
+
+ export in shell before running:
+ export AMD_OCL_BUILD_OPTIONS_APPEND="-g -O0"
+ export CPU_MAX_COMPUTE_UNITS=1
+
+ buildOpts := "-g -cl-opt-disable"
+
+ */
+ buildOpts := ""
+ err = program.BuildProgram([]*cl.Device{device}, buildOpts)
+ if err != nil {
+ return fmt.Errorf("program build err: %v", err)
+ }
+
+ var searchKernelName, hashKernelName string
+ searchKernelName = "ethash_search"
+ hashKernelName = "ethash_hash"
+
+ searchKernel, err := program.CreateKernel(searchKernelName)
+ if err != nil {
+ return fmt.Errorf("search kernel err: %v", err)
+ }
+ hashKernel, err := program.CreateKernel(hashKernelName)
+ if err != nil {
+ return fmt.Errorf("hash kernel err: %v", err)
+ }
+
+ // TODO: when this DAG size appears, patch the Go bindings
+ // (context.go) to work with uint64 as size_t
+ if c.dagSize > math.MaxInt32 {
+ fmt.Println("DAG too large for allocation.")
+ return fmt.Errorf("DAG too large for alloc")
+ }
+
+ // TODO: patch up Go bindings to work with size_t; this will overflow if > maxint32
+ // TODO: the int(c.dagSize) conversion will overflow around 2017-06-09 12:17:02
+ dagBuf, err := context.CreateEmptyBuffer(cl.MemReadOnly, int(c.dagSize))
+ if err != nil {
+ return fmt.Errorf("allocating dag buf failed: %v", err)
+ }
+
+ // write DAG to device mem
+ dagPtr := unsafe.Pointer(c.ethash.Full.current.ptr.data)
+ _, err = queue.EnqueueWriteBuffer(dagBuf, true, 0, int(c.dagSize), dagPtr, nil)
+ if err != nil {
+ return fmt.Errorf("writing to dag buf failed: %v", err)
+ }
+
+ searchBuffers := make([]*cl.MemObject, searchBufSize)
+ for i := 0; i < searchBufSize; i++ {
+ searchBuff, err := context.CreateEmptyBuffer(cl.MemWriteOnly, (1+maxSearchResults)*SIZEOF_UINT32)
+ if err != nil {
+ return fmt.Errorf("search buffer err: %v", err)
+ }
+ searchBuffers[i] = searchBuff
+ }
+
+ headerBuf, err := context.CreateEmptyBuffer(cl.MemReadOnly, 32)
+ if err != nil {
+ return fmt.Errorf("header buffer err: %v", err)
+ }
+
+ // Unique, random nonces are crucial for mining efficiency.
+ // While we do not need a cryptographically secure PRNG for nonces,
+ // we want a uniform distribution and minimal repetition of nonces.
+ // We could guarantee strict uniqueness of nonces by generating unique ranges,
+ // but an int64 seed from crypto/rand should be good enough.
+ // We then use math/rand for speed and to avoid draining the OS entropy pool.
+ seed, err := crand.Int(crand.Reader, big.NewInt(math.MaxInt64))
+ if err != nil {
+ return err
+ }
+ nonceRand := mrand.New(mrand.NewSource(seed.Int64()))
+
+ deviceStruct := &OpenCLDevice{
+ deviceId: deviceId,
+ device: device,
+ openCL11: cl11,
+ openCL12: cl12,
+
+ dagBuf: dagBuf,
+ headerBuf: headerBuf,
+ searchBuffers: searchBuffers,
+
+ searchKernel: searchKernel,
+ hashKernel: hashKernel,
+
+ queue: queue,
+ ctx: context,
+
+ workGroupSize: workGroupSize,
+
+ nonceRand: nonceRand,
+ }
+ c.devices = append(c.devices, deviceStruct)
+
+ return nil
+}
+
+func (c *OpenCLMiner) Search(block pow.Block, stop <-chan struct{}, index int) (uint64, []byte) {
+ c.mu.Lock()
+ defer c.mu.Unlock() // unlock even on the early error return below
+
+ newDagSize := uint64(C.ethash_get_datasize(C.uint64_t(block.NumberU64())))
+ if newDagSize > c.dagSize {
+ // TODO: clean up buffers from previous DAG?
+ err := InitCL(block.NumberU64(), c)
+ if err != nil {
+ fmt.Println("OpenCL init error: ", err)
+ return 0, []byte{0}
+ }
+ }
+
+ // Avoid unneeded OpenCL initialisation if we received stop while running InitCL
+ select {
+ case <-stop:
+ return 0, []byte{0}
+ default:
+ }
+
+ headerHash := block.HashNoNonce()
+ diff := block.Difficulty()
+ target256 := new(big.Int).Div(maxUint256, diff)
+ target64 := new(big.Int).Rsh(target256, 192).Uint64()
+ var zero uint32 = 0
+
+ d := c.devices[index]
+
+ _, err := d.queue.EnqueueWriteBuffer(d.headerBuf, false, 0, 32, unsafe.Pointer(&headerHash[0]), nil)
+ if err != nil {
+ fmt.Println("Error in Search clEnqueueWriteBuffer: ", err)
+ return 0, []byte{0}
+ }
+
+ for i := 0; i < searchBufSize; i++ {
+ _, err := d.queue.EnqueueWriteBuffer(d.searchBuffers[i], false, 0, 4, unsafe.Pointer(&zero), nil)
+ if err != nil {
+ fmt.Println("Error in Search clEnqueueWriteBuffer: ", err)
+ return 0, []byte{0}
+ }
+ }
+
+ // wait for all search buffers to complete
+ err = d.queue.Finish()
+ if err != nil {
+ fmt.Println("Error in Search clFinish : ", err)
+ return 0, []byte{0}
+ }
+
+ err = d.searchKernel.SetArg(1, d.headerBuf)
+ if err != nil {
+ fmt.Println("Error in Search clSetKernelArg : ", err)
+ return 0, []byte{0}
+ }
+
+ err = d.searchKernel.SetArg(2, d.dagBuf)
+ if err != nil {
+ fmt.Println("Error in Search clSetKernelArg : ", err)
+ return 0, []byte{0}
+ }
+
+ err = d.searchKernel.SetArg(4, target64)
+ if err != nil {
+ fmt.Println("Error in Search clSetKernelArg : ", err)
+ return 0, []byte{0}
+ }
+ err = d.searchKernel.SetArg(5, uint32(math.MaxUint32))
+ if err != nil {
+ fmt.Println("Error in Search clSetKernelArg : ", err)
+ return 0, []byte{0}
+ }
+
+ // wait on this before returning
+ var preReturnEvent *cl.Event
+ if d.openCL12 {
+ preReturnEvent, err = d.ctx.CreateUserEvent()
+ if err != nil {
+ fmt.Println("Error in Search create CL user event : ", err)
+ return 0, []byte{0}
+ }
+ }
+
+ pending := make([]pendingSearch, 0, searchBufSize)
+ var p *pendingSearch
+ searchBufIndex := uint32(0)
+ var checkNonce uint64
+ loops := int64(0)
+ prevHashRate := int32(0)
+ start := time.Now().UnixNano()
+ // We grab a single random nonce and set it as an argument to the kernel search function.
+ // The device then adds each local thread's gid to the nonce, creating a unique nonce
+ // for each device compute unit executing in parallel.
+ initNonce := uint64(d.nonceRand.Int63())
+ for nonce := initNonce; ; nonce += uint64(globalWorkSize) {
+ select {
+ case <-stop:
+
+ /*
+ if d.openCL12 {
+ err = cl.WaitForEvents([]*cl.Event{preReturnEvent})
+ if err != nil {
+ fmt.Println("Error in Search WaitForEvents: ", err)
+ }
+ }
+ */
+
+ atomic.AddInt32(&c.hashRate, -prevHashRate)
+ return 0, []byte{0}
+ default:
+ }
+
+ if (loops % (1 << 7)) == 0 {
+ elapsed := time.Now().UnixNano() - start
+ // TODO: verify that this hash rate calculation is correct
+ hashes := (float64(1e9) / float64(elapsed)) * float64(loops*globalWorkSize)
+ hashrateDiff := int32(hashes) - prevHashRate
+ prevHashRate = int32(hashes)
+ atomic.AddInt32(&c.hashRate, hashrateDiff)
+ }
+ loops++
+
+ err = d.searchKernel.SetArg(0, d.searchBuffers[searchBufIndex])
+ if err != nil {
+ fmt.Println("Error in Search clSetKernelArg : ", err)
+ return 0, []byte{0}
+ }
+ err = d.searchKernel.SetArg(3, nonce)
+ if err != nil {
+ fmt.Println("Error in Search clSetKernelArg : ", err)
+ return 0, []byte{0}
+ }
+
+ // execute kernel
+ _, err := d.queue.EnqueueNDRangeKernel(
+ d.searchKernel,
+ []int{0},
+ []int{globalWorkSize},
+ []int{d.workGroupSize},
+ nil)
+ if err != nil {
+ fmt.Println("Error in Search clEnqueueNDRangeKernel : ", err)
+ return 0, []byte{0}
+ }
+
+ pending = append(pending, pendingSearch{bufIndex: searchBufIndex, startNonce: nonce})
+ searchBufIndex = (searchBufIndex + 1) % searchBufSize
+
+ if len(pending) == searchBufSize {
+ p = &(pending[searchBufIndex])
+ cres, _, err := d.queue.EnqueueMapBuffer(d.searchBuffers[p.bufIndex], true,
+ cl.MapFlagRead, 0, (1+maxSearchResults)*SIZEOF_UINT32,
+ nil)
+ if err != nil {
+ fmt.Println("Error in Search clEnqueueMapBuffer: ", err)
+ return 0, []byte{0}
+ }
+
+ results := cres.ByteSlice()
+ nfound := binary.LittleEndian.Uint32(results)
+ nfound = uint32(math.Min(float64(nfound), float64(maxSearchResults)))
+ // OpenCL returns the offsets from the start nonce
+ for i := uint32(0); i < nfound; i++ {
+ lo := (i + 1) * SIZEOF_UINT32
+ hi := (i + 2) * SIZEOF_UINT32
+ upperNonce := uint64(binary.LittleEndian.Uint32(results[lo:hi]))
+ checkNonce = p.startNonce + upperNonce
+ if checkNonce != 0 {
+ cn := C.uint64_t(checkNonce)
+ ds := C.uint64_t(c.dagSize)
+ // We verify that the nonce is indeed a solution by
+ // executing the Ethash verification function (on the CPU).
+ ret := C.ethash_light_compute_internal(c.ethash.Light.current.ptr, ds, hashToH256(headerHash), cn)
+ // TODO: return result first
+ if ret.success && h256ToHash(ret.result).Big().Cmp(target256) <= 0 {
+ _, err = d.queue.EnqueueUnmapMemObject(d.searchBuffers[p.bufIndex], cres, nil)
+ if err != nil {
+ fmt.Println("Error in Search clEnqueueUnmapMemObject: ", err)
+ }
+ if d.openCL12 {
+ err = cl.WaitForEvents([]*cl.Event{preReturnEvent})
+ if err != nil {
+ fmt.Println("Error in Search WaitForEvents: ", err)
+ }
+ }
+ return checkNonce, C.GoBytes(unsafe.Pointer(&ret.mix_hash), C.int(32))
+ }
+
+ _, err := d.queue.EnqueueWriteBuffer(d.searchBuffers[p.bufIndex], false, 0, 4, unsafe.Pointer(&zero), nil)
+ if err != nil {
+ fmt.Println("Error in Search clEnqueueWriteBuffer: ", err)
+ return 0, []byte{0}
+ }
+ }
+ }
+ _, err = d.queue.EnqueueUnmapMemObject(d.searchBuffers[p.bufIndex], cres, nil)
+ if err != nil {
+ fmt.Println("Error in Search clEnqueueUnmapMemObject: ", err)
+ return 0, []byte{0}
+ }
+ pending = append(pending[:searchBufIndex], pending[searchBufIndex+1:]...)
+ }
+ }
+ if d.openCL12 {
+ err := cl.WaitForEvents([]*cl.Event{preReturnEvent})
+ if err != nil {
+ fmt.Println("Error in Search clWaitForEvents: ", err)
+ return 0, []byte{0}
+ }
+ }
+ return 0, []byte{0}
+}
+
+func (c *OpenCLMiner) Verify(block pow.Block) bool {
+ return c.ethash.Light.Verify(block)
+}
+func (c *OpenCLMiner) GetHashrate() int64 {
+ return int64(atomic.LoadInt32(&c.hashRate))
+}
+func (c *OpenCLMiner) Turbo(on bool) {
+ // This is GPU mining. Always be turbo.
+}
+
+func replaceWords(text string, kvs map[string]string) string {
+ for k, v := range kvs {
+ text = strings.Replace(text, k, v, -1)
+ }
+ return text
+}
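replaceWords is how the per-device kernel parameters (GROUP_SIZE, DAG_SIZE, ACCESSES, MAX_OUTPUTS) get baked into the OpenCL source before it is compiled at run time; a tiny illustrative call on a made-up snippet:

    kvs := map[string]string{"GROUP_SIZE": "32", "ACCESSES": "64"}
    src := replaceWords("for (uint a = 0; a != ACCESSES; a += GROUP_SIZE)", kvs)
    // src == "for (uint a = 0; a != 64; a += 32)"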
+
+func logErr(err error) {
+ if err != nil {
+ fmt.Println("Error in OpenCL call:", err)
+ }
+}
+
+func argErr(err error) error {
+ return fmt.Errorf("arg err: %v", err)
+}
diff --git a/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_opencl_kernel_go_str.go b/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_opencl_kernel_go_str.go
new file mode 100644
index 000000000..695ff1829
--- /dev/null
+++ b/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_opencl_kernel_go_str.go
@@ -0,0 +1,600 @@
+package ethash
+
+/* DO NOT EDIT!!!
+
+ This code is version controlled at
+ https://github.com/ethereum/cpp-ethereum/blob/develop/libethash-cl/ethash_cl_miner_kernel.cl
+
+ If needed change it there first, then copy over here.
+*/
+
+const kernel = `
+// author Tim Hughes <tim@twistedfury.com>
+// Tested on Radeon HD 7850
+// Hashrate: 15940347 hashes/s
+// Bandwidth: 124533 MB/s
+// search kernel should fit in <= 84 VGPRS (3 wavefronts)
+
+#define THREADS_PER_HASH (128 / 16)
+#define HASHES_PER_LOOP (GROUP_SIZE / THREADS_PER_HASH)
+
+#define FNV_PRIME 0x01000193
+
+__constant uint2 const Keccak_f1600_RC[24] = {
+ (uint2)(0x00000001, 0x00000000),
+ (uint2)(0x00008082, 0x00000000),
+ (uint2)(0x0000808a, 0x80000000),
+ (uint2)(0x80008000, 0x80000000),
+ (uint2)(0x0000808b, 0x00000000),
+ (uint2)(0x80000001, 0x00000000),
+ (uint2)(0x80008081, 0x80000000),
+ (uint2)(0x00008009, 0x80000000),
+ (uint2)(0x0000008a, 0x00000000),
+ (uint2)(0x00000088, 0x00000000),
+ (uint2)(0x80008009, 0x00000000),
+ (uint2)(0x8000000a, 0x00000000),
+ (uint2)(0x8000808b, 0x00000000),
+ (uint2)(0x0000008b, 0x80000000),
+ (uint2)(0x00008089, 0x80000000),
+ (uint2)(0x00008003, 0x80000000),
+ (uint2)(0x00008002, 0x80000000),
+ (uint2)(0x00000080, 0x80000000),
+ (uint2)(0x0000800a, 0x00000000),
+ (uint2)(0x8000000a, 0x80000000),
+ (uint2)(0x80008081, 0x80000000),
+ (uint2)(0x00008080, 0x80000000),
+ (uint2)(0x80000001, 0x00000000),
+ (uint2)(0x80008008, 0x80000000),
+};
+
+void keccak_f1600_round(uint2* a, uint r, uint out_size)
+{
+ #if !__ENDIAN_LITTLE__
+ for (uint i = 0; i != 25; ++i)
+ a[i] = a[i].yx;
+ #endif
+
+ uint2 b[25];
+ uint2 t;
+
+ // Theta
+ b[0] = a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20];
+ b[1] = a[1] ^ a[6] ^ a[11] ^ a[16] ^ a[21];
+ b[2] = a[2] ^ a[7] ^ a[12] ^ a[17] ^ a[22];
+ b[3] = a[3] ^ a[8] ^ a[13] ^ a[18] ^ a[23];
+ b[4] = a[4] ^ a[9] ^ a[14] ^ a[19] ^ a[24];
+ t = b[4] ^ (uint2)(b[1].x << 1 | b[1].y >> 31, b[1].y << 1 | b[1].x >> 31);
+ a[0] ^= t;
+ a[5] ^= t;
+ a[10] ^= t;
+ a[15] ^= t;
+ a[20] ^= t;
+ t = b[0] ^ (uint2)(b[2].x << 1 | b[2].y >> 31, b[2].y << 1 | b[2].x >> 31);
+ a[1] ^= t;
+ a[6] ^= t;
+ a[11] ^= t;
+ a[16] ^= t;
+ a[21] ^= t;
+ t = b[1] ^ (uint2)(b[3].x << 1 | b[3].y >> 31, b[3].y << 1 | b[3].x >> 31);
+ a[2] ^= t;
+ a[7] ^= t;
+ a[12] ^= t;
+ a[17] ^= t;
+ a[22] ^= t;
+ t = b[2] ^ (uint2)(b[4].x << 1 | b[4].y >> 31, b[4].y << 1 | b[4].x >> 31);
+ a[3] ^= t;
+ a[8] ^= t;
+ a[13] ^= t;
+ a[18] ^= t;
+ a[23] ^= t;
+ t = b[3] ^ (uint2)(b[0].x << 1 | b[0].y >> 31, b[0].y << 1 | b[0].x >> 31);
+ a[4] ^= t;
+ a[9] ^= t;
+ a[14] ^= t;
+ a[19] ^= t;
+ a[24] ^= t;
+
+ // Rho Pi
+ b[0] = a[0];
+ b[10] = (uint2)(a[1].x << 1 | a[1].y >> 31, a[1].y << 1 | a[1].x >> 31);
+ b[7] = (uint2)(a[10].x << 3 | a[10].y >> 29, a[10].y << 3 | a[10].x >> 29);
+ b[11] = (uint2)(a[7].x << 6 | a[7].y >> 26, a[7].y << 6 | a[7].x >> 26);
+ b[17] = (uint2)(a[11].x << 10 | a[11].y >> 22, a[11].y << 10 | a[11].x >> 22);
+ b[18] = (uint2)(a[17].x << 15 | a[17].y >> 17, a[17].y << 15 | a[17].x >> 17);
+ b[3] = (uint2)(a[18].x << 21 | a[18].y >> 11, a[18].y << 21 | a[18].x >> 11);
+ b[5] = (uint2)(a[3].x << 28 | a[3].y >> 4, a[3].y << 28 | a[3].x >> 4);
+ b[16] = (uint2)(a[5].y << 4 | a[5].x >> 28, a[5].x << 4 | a[5].y >> 28);
+ b[8] = (uint2)(a[16].y << 13 | a[16].x >> 19, a[16].x << 13 | a[16].y >> 19);
+ b[21] = (uint2)(a[8].y << 23 | a[8].x >> 9, a[8].x << 23 | a[8].y >> 9);
+ b[24] = (uint2)(a[21].x << 2 | a[21].y >> 30, a[21].y << 2 | a[21].x >> 30);
+ b[4] = (uint2)(a[24].x << 14 | a[24].y >> 18, a[24].y << 14 | a[24].x >> 18);
+ b[15] = (uint2)(a[4].x << 27 | a[4].y >> 5, a[4].y << 27 | a[4].x >> 5);
+ b[23] = (uint2)(a[15].y << 9 | a[15].x >> 23, a[15].x << 9 | a[15].y >> 23);
+ b[19] = (uint2)(a[23].y << 24 | a[23].x >> 8, a[23].x << 24 | a[23].y >> 8);
+ b[13] = (uint2)(a[19].x << 8 | a[19].y >> 24, a[19].y << 8 | a[19].x >> 24);
+ b[12] = (uint2)(a[13].x << 25 | a[13].y >> 7, a[13].y << 25 | a[13].x >> 7);
+ b[2] = (uint2)(a[12].y << 11 | a[12].x >> 21, a[12].x << 11 | a[12].y >> 21);
+ b[20] = (uint2)(a[2].y << 30 | a[2].x >> 2, a[2].x << 30 | a[2].y >> 2);
+ b[14] = (uint2)(a[20].x << 18 | a[20].y >> 14, a[20].y << 18 | a[20].x >> 14);
+ b[22] = (uint2)(a[14].y << 7 | a[14].x >> 25, a[14].x << 7 | a[14].y >> 25);
+ b[9] = (uint2)(a[22].y << 29 | a[22].x >> 3, a[22].x << 29 | a[22].y >> 3);
+ b[6] = (uint2)(a[9].x << 20 | a[9].y >> 12, a[9].y << 20 | a[9].x >> 12);
+ b[1] = (uint2)(a[6].y << 12 | a[6].x >> 20, a[6].x << 12 | a[6].y >> 20);
+
+ // Chi
+ a[0] = bitselect(b[0] ^ b[2], b[0], b[1]);
+ a[1] = bitselect(b[1] ^ b[3], b[1], b[2]);
+ a[2] = bitselect(b[2] ^ b[4], b[2], b[3]);
+ a[3] = bitselect(b[3] ^ b[0], b[3], b[4]);
+ if (out_size >= 4)
+ {
+ a[4] = bitselect(b[4] ^ b[1], b[4], b[0]);
+ a[5] = bitselect(b[5] ^ b[7], b[5], b[6]);
+ a[6] = bitselect(b[6] ^ b[8], b[6], b[7]);
+ a[7] = bitselect(b[7] ^ b[9], b[7], b[8]);
+ a[8] = bitselect(b[8] ^ b[5], b[8], b[9]);
+ if (out_size >= 8)
+ {
+ a[9] = bitselect(b[9] ^ b[6], b[9], b[5]);
+ a[10] = bitselect(b[10] ^ b[12], b[10], b[11]);
+ a[11] = bitselect(b[11] ^ b[13], b[11], b[12]);
+ a[12] = bitselect(b[12] ^ b[14], b[12], b[13]);
+ a[13] = bitselect(b[13] ^ b[10], b[13], b[14]);
+ a[14] = bitselect(b[14] ^ b[11], b[14], b[10]);
+ a[15] = bitselect(b[15] ^ b[17], b[15], b[16]);
+ a[16] = bitselect(b[16] ^ b[18], b[16], b[17]);
+ a[17] = bitselect(b[17] ^ b[19], b[17], b[18]);
+ a[18] = bitselect(b[18] ^ b[15], b[18], b[19]);
+ a[19] = bitselect(b[19] ^ b[16], b[19], b[15]);
+ a[20] = bitselect(b[20] ^ b[22], b[20], b[21]);
+ a[21] = bitselect(b[21] ^ b[23], b[21], b[22]);
+ a[22] = bitselect(b[22] ^ b[24], b[22], b[23]);
+ a[23] = bitselect(b[23] ^ b[20], b[23], b[24]);
+ a[24] = bitselect(b[24] ^ b[21], b[24], b[20]);
+ }
+ }
+
+ // Iota
+ a[0] ^= Keccak_f1600_RC[r];
+
+ #if !__ENDIAN_LITTLE__
+ for (uint i = 0; i != 25; ++i)
+ a[i] = a[i].yx;
+ #endif
+}
+
+void keccak_f1600_no_absorb(ulong* a, uint in_size, uint out_size, uint isolate)
+{
+ for (uint i = in_size; i != 25; ++i)
+ {
+ a[i] = 0;
+ }
+#if __ENDIAN_LITTLE__
+ a[in_size] ^= 0x0000000000000001;
+ a[24-out_size*2] ^= 0x8000000000000000;
+#else
+ a[in_size] ^= 0x0100000000000000;
+ a[24-out_size*2] ^= 0x0000000000000080;
+#endif
+
+ // Originally I unrolled the first and last rounds to interface
+ // better with surrounding code, however I haven't done this
+ // without causing the AMD compiler to blow up the VGPR usage.
+ uint r = 0;
+ do
+ {
+ // This dynamic branch stops the AMD compiler unrolling the loop
+ // and additionally saves about 33% of the VGPRs, enough to gain another
+ // wavefront. Ideally we'd get 4 in flight, but 3 is the best I can
+ // massage out of the compiler. It doesn't really seem to matter how
+ // much we try and help the compiler save VGPRs because it seems to throw
+ // that information away, hence the implementation of keccak here
+ // doesn't bother.
+ if (isolate)
+ {
+ keccak_f1600_round((uint2*)a, r++, 25);
+ }
+ }
+ while (r < 23);
+
+ // final round optimised for digest size
+ keccak_f1600_round((uint2*)a, r++, out_size);
+}
+
+#define copy(dst, src, count) for (uint i = 0; i != count; ++i) { (dst)[i] = (src)[i]; }
+
+#define countof(x) (sizeof(x) / sizeof(x[0]))
+
+uint fnv(uint x, uint y)
+{
+ return x * FNV_PRIME ^ y;
+}
+
+uint4 fnv4(uint4 x, uint4 y)
+{
+ return x * FNV_PRIME ^ y;
+}
+
+uint fnv_reduce(uint4 v)
+{
+ return fnv(fnv(fnv(v.x, v.y), v.z), v.w);
+}
+
+typedef union
+{
+ ulong ulongs[32 / sizeof(ulong)];
+ uint uints[32 / sizeof(uint)];
+} hash32_t;
+
+typedef union
+{
+ ulong ulongs[64 / sizeof(ulong)];
+ uint4 uint4s[64 / sizeof(uint4)];
+} hash64_t;
+
+typedef union
+{
+ uint uints[128 / sizeof(uint)];
+ uint4 uint4s[128 / sizeof(uint4)];
+} hash128_t;
+
+hash64_t init_hash(__constant hash32_t const* header, ulong nonce, uint isolate)
+{
+ hash64_t init;
+ uint const init_size = countof(init.ulongs);
+ uint const hash_size = countof(header->ulongs);
+
+ // sha3_512(header .. nonce)
+ ulong state[25];
+ copy(state, header->ulongs, hash_size);
+ state[hash_size] = nonce;
+ keccak_f1600_no_absorb(state, hash_size + 1, init_size, isolate);
+
+ copy(init.ulongs, state, init_size);
+ return init;
+}
+
+uint inner_loop_chunks(uint4 init, uint thread_id, __local uint* share, __global hash128_t const* g_dag, __global hash128_t const* g_dag1, __global hash128_t const* g_dag2, __global hash128_t const* g_dag3, uint isolate)
+{
+ uint4 mix = init;
+
+ // share init0
+ if (thread_id == 0)
+ *share = mix.x;
+ barrier(CLK_LOCAL_MEM_FENCE);
+ uint init0 = *share;
+
+ uint a = 0;
+ do
+ {
+ bool update_share = thread_id == (a/4) % THREADS_PER_HASH;
+
+ #pragma unroll
+ for (uint i = 0; i != 4; ++i)
+ {
+ if (update_share)
+ {
+ uint m[4] = { mix.x, mix.y, mix.z, mix.w };
+ *share = fnv(init0 ^ (a+i), m[i]) % DAG_SIZE;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ mix = fnv4(mix, *share>=3 * DAG_SIZE / 4 ? g_dag3[*share - 3 * DAG_SIZE / 4].uint4s[thread_id] : *share>=DAG_SIZE / 2 ? g_dag2[*share - DAG_SIZE / 2].uint4s[thread_id] : *share>=DAG_SIZE / 4 ? g_dag1[*share - DAG_SIZE / 4].uint4s[thread_id]:g_dag[*share].uint4s[thread_id]);
+ }
+ } while ((a += 4) != (ACCESSES & isolate));
+
+ return fnv_reduce(mix);
+}
+
+
+
+uint inner_loop(uint4 init, uint thread_id, __local uint* share, __global hash128_t const* g_dag, uint isolate)
+{
+ uint4 mix = init;
+
+ // share init0
+ if (thread_id == 0)
+ *share = mix.x;
+ barrier(CLK_LOCAL_MEM_FENCE);
+ uint init0 = *share;
+
+ uint a = 0;
+ do
+ {
+ bool update_share = thread_id == (a/4) % THREADS_PER_HASH;
+
+ #pragma unroll
+ for (uint i = 0; i != 4; ++i)
+ {
+ if (update_share)
+ {
+ uint m[4] = { mix.x, mix.y, mix.z, mix.w };
+ *share = fnv(init0 ^ (a+i), m[i]) % DAG_SIZE;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ mix = fnv4(mix, g_dag[*share].uint4s[thread_id]);
+ }
+ }
+ while ((a += 4) != (ACCESSES & isolate));
+
+ return fnv_reduce(mix);
+}
+
+
+hash32_t final_hash(hash64_t const* init, hash32_t const* mix, uint isolate)
+{
+ ulong state[25];
+
+ hash32_t hash;
+ uint const hash_size = countof(hash.ulongs);
+ uint const init_size = countof(init->ulongs);
+ uint const mix_size = countof(mix->ulongs);
+
+ // keccak_256(keccak_512(header..nonce) .. mix);
+ copy(state, init->ulongs, init_size);
+ copy(state + init_size, mix->ulongs, mix_size);
+ keccak_f1600_no_absorb(state, init_size+mix_size, hash_size, isolate);
+
+ // copy out
+ copy(hash.ulongs, state, hash_size);
+ return hash;
+}
+
+hash32_t compute_hash_simple(
+ __constant hash32_t const* g_header,
+ __global hash128_t const* g_dag,
+ ulong nonce,
+ uint isolate
+ )
+{
+ hash64_t init = init_hash(g_header, nonce, isolate);
+
+ hash128_t mix;
+ for (uint i = 0; i != countof(mix.uint4s); ++i)
+ {
+ mix.uint4s[i] = init.uint4s[i % countof(init.uint4s)];
+ }
+
+ uint mix_val = mix.uints[0];
+ uint init0 = mix.uints[0];
+ uint a = 0;
+ do
+ {
+ uint pi = fnv(init0 ^ a, mix_val) % DAG_SIZE;
+ uint n = (a+1) % countof(mix.uints);
+
+ #pragma unroll
+ for (uint i = 0; i != countof(mix.uints); ++i)
+ {
+ mix.uints[i] = fnv(mix.uints[i], g_dag[pi].uints[i]);
+ mix_val = i == n ? mix.uints[i] : mix_val;
+ }
+ }
+ while (++a != (ACCESSES & isolate));
+
+ // reduce to output
+ hash32_t fnv_mix;
+ for (uint i = 0; i != countof(fnv_mix.uints); ++i)
+ {
+ fnv_mix.uints[i] = fnv_reduce(mix.uint4s[i]);
+ }
+
+ return final_hash(&init, &fnv_mix, isolate);
+}
+
+typedef union
+{
+ struct
+ {
+ hash64_t init;
+ uint pad; // avoid lds bank conflicts
+ };
+ hash32_t mix;
+} compute_hash_share;
+
+
+hash32_t compute_hash(
+ __local compute_hash_share* share,
+ __constant hash32_t const* g_header,
+ __global hash128_t const* g_dag,
+ ulong nonce,
+ uint isolate
+ )
+{
+ uint const gid = get_global_id(0);
+
+ // Compute one init hash per work item.
+ hash64_t init = init_hash(g_header, nonce, isolate);
+
+ // Threads work together in this phase in groups of 8.
+ uint const thread_id = gid % THREADS_PER_HASH;
+ uint const hash_id = (gid % GROUP_SIZE) / THREADS_PER_HASH;
+
+ hash32_t mix;
+ uint i = 0;
+ do
+ {
+ // share init with other threads
+ if (i == thread_id)
+ share[hash_id].init = init;
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ uint4 thread_init = share[hash_id].init.uint4s[thread_id % (64 / sizeof(uint4))];
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ uint thread_mix = inner_loop(thread_init, thread_id, share[hash_id].mix.uints, g_dag, isolate);
+
+ share[hash_id].mix.uints[thread_id] = thread_mix;
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (i == thread_id)
+ mix = share[hash_id].mix;
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+ while (++i != (THREADS_PER_HASH & isolate));
+
+ return final_hash(&init, &mix, isolate);
+}
+
+
+hash32_t compute_hash_chunks(
+ __local compute_hash_share* share,
+ __constant hash32_t const* g_header,
+ __global hash128_t const* g_dag,
+ __global hash128_t const* g_dag1,
+ __global hash128_t const* g_dag2,
+ __global hash128_t const* g_dag3,
+ ulong nonce,
+ uint isolate
+ )
+{
+ uint const gid = get_global_id(0);
+
+ // Compute one init hash per work item.
+ hash64_t init = init_hash(g_header, nonce, isolate);
+
+ // Threads work together in this phase in groups of 8.
+ uint const thread_id = gid % THREADS_PER_HASH;
+ uint const hash_id = (gid % GROUP_SIZE) / THREADS_PER_HASH;
+
+ hash32_t mix;
+ uint i = 0;
+ do
+ {
+ // share init with other threads
+ if (i == thread_id)
+ share[hash_id].init = init;
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ uint4 thread_init = share[hash_id].init.uint4s[thread_id % (64 / sizeof(uint4))];
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ uint thread_mix = inner_loop_chunks(thread_init, thread_id, share[hash_id].mix.uints, g_dag, g_dag1, g_dag2, g_dag3, isolate);
+
+ share[hash_id].mix.uints[thread_id] = thread_mix;
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (i == thread_id)
+ mix = share[hash_id].mix;
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+ while (++i != (THREADS_PER_HASH & isolate));
+
+ return final_hash(&init, &mix, isolate);
+}
+
+__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
+__kernel void ethash_hash_simple(
+ __global hash32_t* g_hashes,
+ __constant hash32_t const* g_header,
+ __global hash128_t const* g_dag,
+ ulong start_nonce,
+ uint isolate
+ )
+{
+ uint const gid = get_global_id(0);
+ g_hashes[gid] = compute_hash_simple(g_header, g_dag, start_nonce + gid, isolate);
+}
+
+__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
+__kernel void ethash_search_simple(
+ __global volatile uint* restrict g_output,
+ __constant hash32_t const* g_header,
+ __global hash128_t const* g_dag,
+ ulong start_nonce,
+ ulong target,
+ uint isolate
+ )
+{
+ uint const gid = get_global_id(0);
+ hash32_t hash = compute_hash_simple(g_header, g_dag, start_nonce + gid, isolate);
+
+ if (hash.ulongs[countof(hash.ulongs)-1] < target)
+ {
+ uint slot = min(convert_uint(MAX_OUTPUTS), convert_uint(atomic_inc(&g_output[0]) + 1));
+ g_output[slot] = gid;
+ }
+}
+
+
+__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
+__kernel void ethash_hash(
+ __global hash32_t* g_hashes,
+ __constant hash32_t const* g_header,
+ __global hash128_t const* g_dag,
+ ulong start_nonce,
+ uint isolate
+ )
+{
+ __local compute_hash_share share[HASHES_PER_LOOP];
+
+ uint const gid = get_global_id(0);
+ g_hashes[gid] = compute_hash(share, g_header, g_dag, start_nonce + gid, isolate);
+}
+
+__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
+__kernel void ethash_search(
+ __global volatile uint* restrict g_output,
+ __constant hash32_t const* g_header,
+ __global hash128_t const* g_dag,
+ ulong start_nonce,
+ ulong target,
+ uint isolate
+ )
+{
+ __local compute_hash_share share[HASHES_PER_LOOP];
+
+ uint const gid = get_global_id(0);
+ hash32_t hash = compute_hash(share, g_header, g_dag, start_nonce + gid, isolate);
+
+ if (as_ulong(as_uchar8(hash.ulongs[0]).s76543210) < target)
+ {
+ uint slot = min((uint)MAX_OUTPUTS, atomic_inc(&g_output[0]) + 1);
+ g_output[slot] = gid;
+ }
+}
+
+__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
+__kernel void ethash_hash_chunks(
+ __global hash32_t* g_hashes,
+ __constant hash32_t const* g_header,
+ __global hash128_t const* g_dag,
+ __global hash128_t const* g_dag1,
+ __global hash128_t const* g_dag2,
+ __global hash128_t const* g_dag3,
+ ulong start_nonce,
+ uint isolate
+ )
+{
+ __local compute_hash_share share[HASHES_PER_LOOP];
+
+ uint const gid = get_global_id(0);
+	g_hashes[gid] = compute_hash_chunks(share, g_header, g_dag, g_dag1, g_dag2, g_dag3, start_nonce + gid, isolate);
+}
+
+__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
+__kernel void ethash_search_chunks(
+ __global volatile uint* restrict g_output,
+ __constant hash32_t const* g_header,
+ __global hash128_t const* g_dag,
+ __global hash128_t const* g_dag1,
+ __global hash128_t const* g_dag2,
+ __global hash128_t const* g_dag3,
+ ulong start_nonce,
+ ulong target,
+ uint isolate
+ )
+{
+ __local compute_hash_share share[HASHES_PER_LOOP];
+
+ uint const gid = get_global_id(0);
+ hash32_t hash = compute_hash_chunks(share, g_header, g_dag, g_dag1, g_dag2, g_dag3, start_nonce + gid, isolate);
+
+ if (as_ulong(as_uchar8(hash.ulongs[0]).s76543210) < target)
+ {
+ uint slot = min(convert_uint(MAX_OUTPUTS), convert_uint(atomic_inc(&g_output[0]) + 1));
+ g_output[slot] = gid;
+ }
+}
+`
diff --git a/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_test.go b/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_test.go
index 1e1de989d..c19e45d1d 100644
--- a/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_test.go
+++ b/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_test.go
@@ -92,7 +92,7 @@ func TestEthashConcurrentVerify(t *testing.T) {
defer os.RemoveAll(eth.Full.Dir)
block := &testBlock{difficulty: big.NewInt(10)}
- nonce, md := eth.Search(block, nil)
+ nonce, md := eth.Search(block, nil, 0)
block.nonce = nonce
block.mixDigest = common.BytesToHash(md)
@@ -135,7 +135,7 @@ func TestEthashConcurrentSearch(t *testing.T) {
// launch n searches concurrently.
for i := 0; i < nsearch; i++ {
go func() {
- nonce, md := eth.Search(block, stop)
+ nonce, md := eth.Search(block, stop, 0)
select {
case found <- searchRes{n: nonce, md: md}:
case <-stop:
@@ -167,7 +167,7 @@ func TestEthashSearchAcrossEpoch(t *testing.T) {
for i := epochLength - 40; i < epochLength+40; i++ {
block := &testBlock{number: i, difficulty: big.NewInt(90)}
rand.Read(block.hashNoNonce[:])
- nonce, md := eth.Search(block, nil)
+ nonce, md := eth.Search(block, nil, 0)
block.nonce = nonce
block.mixDigest = common.BytesToHash(md)
if !eth.Verify(block) {
diff --git a/Makefile b/Makefile
index 8c7b80ea6..56f250a92 100644
--- a/Makefile
+++ b/Makefile
@@ -6,7 +6,7 @@
GOBIN = build/bin
geth:
- build/env.sh go install -v $(shell build/ldflags.sh) ./cmd/geth
+ build/env.sh go install -v $(shell build/flags.sh) ./cmd/geth
@echo "Done building."
@echo "Run \"$(GOBIN)/geth\" to launch geth."
@@ -39,12 +39,12 @@ evm:
@echo "Done building."
@echo "Run \"$(GOBIN)/evm to start the evm."
mist:
- build/env.sh go install -v $(shell build/ldflags.sh) ./cmd/mist
+ build/env.sh go install -v $(shell build/flags.sh) ./cmd/mist
@echo "Done building."
@echo "Run \"$(GOBIN)/mist --asset_path=cmd/mist/assets\" to launch mist."
all:
- build/env.sh go install -v $(shell build/ldflags.sh) ./...
+ build/env.sh go install -v $(shell build/flags.sh) ./...
test: all
build/env.sh go test ./...
diff --git a/build/ldflags.sh b/build/flags.sh
index 3f055d416..e021dbad4 100755
--- a/build/ldflags.sh
+++ b/build/flags.sh
@@ -16,3 +16,7 @@ sep=$(go version | awk '{ if ($3 >= "go1.5" || index($3, "devel")) print "="; el
if [ -f ".git/HEAD" ]; then
echo "-ldflags '-X main.gitCommit$sep$(git rev-parse HEAD)'"
fi
+
+if [ ! -z "$GO_OPENCL" ]; then
+ echo "-tags opencl"
+fi
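
Note: per the flags.sh hunk above, the "-tags opencl" build tag is only emitted when GO_OPENCL is set to any non-empty value, and the Makefile targets pick it up through $(shell build/flags.sh). A minimal sketch of a GPU-enabled build under those assumptions:

    GO_OPENCL=1 make geth    # flags.sh adds "-tags opencl", selecting eth/gpu_mining.go
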
diff --git a/cmd/geth/js_test.go b/cmd/geth/js_test.go
index 2ad3d669c..09cc88519 100644
--- a/cmd/geth/js_test.go
+++ b/cmd/geth/js_test.go
@@ -468,8 +468,7 @@ func processTxs(repl *testjethre, t *testing.T, expTxc int) bool {
t.Errorf("incorrect number of pending transactions, expected %v, got %v", expTxc, txc)
return false
}
-
- err = repl.ethereum.StartMining(runtime.NumCPU())
+ err = repl.ethereum.StartMining(runtime.NumCPU(), "")
if err != nil {
t.Errorf("unexpected error mining: %v", err)
return false
diff --git a/cmd/geth/main.go b/cmd/geth/main.go
index 1eb078201..3422d9500 100644
--- a/cmd/geth/main.go
+++ b/cmd/geth/main.go
@@ -107,6 +107,22 @@ Regular users do not need to execute it.
`,
},
{
+ Action: gpuinfo,
+ Name: "gpuinfo",
+ Usage: "gpuinfo",
+ Description: `
+Prints OpenCL device info for all found GPUs.
+`,
+ },
+ {
+ Action: gpubench,
+ Name: "gpubench",
+ Usage: "benchmark GPU",
+ Description: `
+Runs quick benchmark on first GPU found.
+`,
+ },
+ {
Action: version,
Name: "version",
Usage: "print ethereum version numbers",
@@ -298,6 +314,7 @@ JavaScript API. See https://github.com/ethereum/go-ethereum/wiki/Javascipt-Conso
utils.GasPriceFlag,
utils.MinerThreadsFlag,
utils.MiningEnabledFlag,
+ utils.MiningGPUFlag,
utils.AutoDAGFlag,
utils.NATFlag,
utils.NatspecEnabledFlag,
@@ -586,7 +603,10 @@ func startEth(ctx *cli.Context, eth *eth.Ethereum) {
}
}
if ctx.GlobalBool(utils.MiningEnabledFlag.Name) {
- if err := eth.StartMining(ctx.GlobalInt(utils.MinerThreadsFlag.Name)); err != nil {
+ err := eth.StartMining(
+ ctx.GlobalInt(utils.MinerThreadsFlag.Name),
+ ctx.GlobalString(utils.MiningGPUFlag.Name))
+ if err != nil {
utils.Fatalf("%v", err)
}
}
@@ -740,6 +760,29 @@ func makedag(ctx *cli.Context) {
}
}
+func gpuinfo(ctx *cli.Context) {
+ eth.PrintOpenCLDevices()
+}
+
+func gpubench(ctx *cli.Context) {
+ args := ctx.Args()
+ wrongArgs := func() {
+ utils.Fatalf(`Usage: geth gpubench <gpu number>`)
+ }
+ switch {
+ case len(args) == 1:
+ n, err := strconv.ParseUint(args[0], 0, 64)
+ if err != nil {
+ wrongArgs()
+ }
+ eth.GPUBench(n)
+ case len(args) == 0:
+ eth.GPUBench(0)
+ default:
+ wrongArgs()
+ }
+}
+
func version(c *cli.Context) {
fmt.Println(ClientIdentifier)
fmt.Println("Version:", Version)
diff --git a/cmd/utils/flags.go b/cmd/utils/flags.go
index 53faeefdc..ca9dd76fd 100644
--- a/cmd/utils/flags.go
+++ b/cmd/utils/flags.go
@@ -155,6 +155,12 @@ var (
}
// miner settings
+ // TODO: refactor CPU vs GPU mining flags
+ MiningGPUFlag = cli.StringFlag{
+ Name: "minegpu",
+ Usage: "Mine with given GPUs. '--minegpu 0,1' will mine with the first two GPUs found.",
+ }
+
MinerThreadsFlag = cli.IntFlag{
Name: "minerthreads",
Usage: "Number of miner threads",
diff --git a/common/natspec/natspec_e2e_test.go b/common/natspec/natspec_e2e_test.go
index 57f81f652..02c2014ba 100644
--- a/common/natspec/natspec_e2e_test.go
+++ b/common/natspec/natspec_e2e_test.go
@@ -306,7 +306,7 @@ func processTxs(repl *testFrontend, t *testing.T, expTxc int) bool {
return false
}
- err = repl.ethereum.StartMining(runtime.NumCPU())
+ err = repl.ethereum.StartMining(runtime.NumCPU(), "")
if err != nil {
t.Errorf("unexpected error mining: %v", err)
return false
diff --git a/core/chain_makers.go b/core/chain_makers.go
index 4347d9173..dbe3adea7 100644
--- a/core/chain_makers.go
+++ b/core/chain_makers.go
@@ -32,7 +32,7 @@ import (
// It returns true from Verify for any block.
type FakePow struct{}
-func (f FakePow) Search(block pow.Block, stop <-chan struct{}) (uint64, []byte) {
+func (f FakePow) Search(block pow.Block, stop <-chan struct{}, index int) (uint64, []byte) {
return 0, nil
}
func (f FakePow) Verify(block pow.Block) bool { return true }
diff --git a/core/chain_pow_test.go b/core/chain_pow_test.go
index 80c6a1cc0..5aa8ed8a0 100644
--- a/core/chain_pow_test.go
+++ b/core/chain_pow_test.go
@@ -34,7 +34,7 @@ type failPow struct {
failing uint64
}
-func (pow failPow) Search(pow.Block, <-chan struct{}) (uint64, []byte) {
+func (pow failPow) Search(pow.Block, <-chan struct{}, int) (uint64, []byte) {
return 0, nil
}
func (pow failPow) Verify(block pow.Block) bool { return block.NumberU64() != pow.failing }
@@ -47,7 +47,7 @@ type delayedPow struct {
delay time.Duration
}
-func (pow delayedPow) Search(pow.Block, <-chan struct{}) (uint64, []byte) {
+func (pow delayedPow) Search(pow.Block, <-chan struct{}, int) (uint64, []byte) {
return 0, nil
}
func (pow delayedPow) Verify(block pow.Block) bool { time.Sleep(pow.delay); return true }
diff --git a/eth/backend.go b/eth/backend.go
index 18900c91e..dc16ba4cf 100644
--- a/eth/backend.go
+++ b/eth/backend.go
@@ -498,18 +498,6 @@ func (s *Ethereum) ResetWithGenesisBlock(gb *types.Block) {
s.blockchain.ResetWithGenesisBlock(gb)
}
-func (s *Ethereum) StartMining(threads int) error {
- eb, err := s.Etherbase()
- if err != nil {
- err = fmt.Errorf("Cannot start mining without etherbase address: %v", err)
- glog.V(logger.Error).Infoln(err)
- return err
- }
-
- go s.miner.Start(eb, threads)
- return nil
-}
-
func (s *Ethereum) Etherbase() (eb common.Address, err error) {
eb = s.etherbase
if (eb == common.Address{}) {
diff --git a/eth/cpu_mining.go b/eth/cpu_mining.go
new file mode 100644
index 000000000..f8795fd0c
--- /dev/null
+++ b/eth/cpu_mining.go
@@ -0,0 +1,54 @@
+// Copyright 2014 The go-ethereum Authors
+// This file is part of the go-ethereum library.
+//
+// The go-ethereum library is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// The go-ethereum library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
+
+// +build !opencl
+
+package eth
+
+import (
+ "errors"
+ "fmt"
+
+ "github.com/ethereum/go-ethereum/logger"
+ "github.com/ethereum/go-ethereum/logger/glog"
+)
+
+const disabledInfo = "Set GO_OPENCL and re-build to enable."
+
+func (s *Ethereum) StartMining(threads int, gpus string) error {
+ eb, err := s.Etherbase()
+ if err != nil {
+ err = fmt.Errorf("Cannot start mining without etherbase address: %v", err)
+ glog.V(logger.Error).Infoln(err)
+ return err
+ }
+
+ if gpus != "" {
+ return errors.New("GPU mining disabled. " + disabledInfo)
+ }
+
+ // CPU mining
+ go s.miner.Start(eb, threads)
+ return nil
+}
+
+func GPUBench(gpuid uint64) {
+ fmt.Println("GPU mining disabled. " + disabledInfo)
+}
+
+func PrintOpenCLDevices() {
+ fmt.Println("OpenCL disabled. " + disabledInfo)
+}
diff --git a/eth/gpu_mining.go b/eth/gpu_mining.go
new file mode 100644
index 000000000..c351c2bdd
--- /dev/null
+++ b/eth/gpu_mining.go
@@ -0,0 +1,103 @@
+// Copyright 2014 The go-ethereum Authors
+// This file is part of the go-ethereum library.
+//
+// The go-ethereum library is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// The go-ethereum library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
+
+// +build opencl
+
+package eth
+
+import (
+ "fmt"
+ "math/big"
+ "strconv"
+ "strings"
+ "time"
+
+ "github.com/ethereum/ethash"
+ "github.com/ethereum/go-ethereum/common"
+ "github.com/ethereum/go-ethereum/core/types"
+ "github.com/ethereum/go-ethereum/logger"
+ "github.com/ethereum/go-ethereum/logger/glog"
+ "github.com/ethereum/go-ethereum/miner"
+)
+
+func (s *Ethereum) StartMining(threads int, gpus string) error {
+ eb, err := s.Etherbase()
+ if err != nil {
+ err = fmt.Errorf("Cannot start mining without etherbase address: %v", err)
+ glog.V(logger.Error).Infoln(err)
+ return err
+ }
+
+ // GPU mining
+ if gpus != "" {
+ var ids []int
+ for _, s := range strings.Split(gpus, ",") {
+ i, err := strconv.Atoi(s)
+ if err != nil {
+ return fmt.Errorf("Invalid GPU id(s): %v", err)
+ }
+ if i < 0 {
+ return fmt.Errorf("Invalid GPU id: %v", i)
+ }
+ ids = append(ids, i)
+ }
+
+ // TODO: re-creating miner is a bit ugly
+ cl := ethash.NewCL(ids)
+ s.miner = miner.New(s, s.EventMux(), cl)
+ go s.miner.Start(eb, len(ids))
+ return nil
+ }
+
+ // CPU mining
+ go s.miner.Start(eb, threads)
+ return nil
+}
+
+func GPUBench(gpuid uint64) {
+ e := ethash.NewCL([]int{int(gpuid)})
+
+ var h common.Hash
+ bogoHeader := &types.Header{
+ ParentHash: h,
+ Number: big.NewInt(int64(42)),
+ Difficulty: big.NewInt(int64(999999999999999)),
+ }
+ bogoBlock := types.NewBlock(bogoHeader, nil, nil, nil)
+
+ err := ethash.InitCL(bogoBlock.NumberU64(), e)
+ if err != nil {
+ fmt.Println("OpenCL init error: ", err)
+ return
+ }
+
+ stopChan := make(chan struct{})
+ reportHashRate := func() {
+ for {
+ time.Sleep(3 * time.Second)
+ fmt.Printf("hashes/s : %v\n", e.GetHashrate())
+ }
+ }
+ fmt.Printf("Starting benchmark (%v seconds)\n", 60)
+ go reportHashRate()
+ go e.Search(bogoBlock, stopChan, 0)
+ time.Sleep(60 * time.Second)
+ fmt.Println("OK.")
+}
+
+func PrintOpenCLDevices() {
+ ethash.PrintDevices()
+}
diff --git a/miner/agent.go b/miner/agent.go
index e80b222c8..4a4683bc6 100644
--- a/miner/agent.go
+++ b/miner/agent.go
@@ -118,7 +118,7 @@ func (self *CpuAgent) mine(work *Work, stop <-chan struct{}) {
glog.V(logger.Debug).Infof("(re)started agent[%d]. mining...\n", self.index)
// Mine
- nonce, mixDigest := self.pow.Search(work.Block, stop)
+ nonce, mixDigest := self.pow.Search(work.Block, stop, self.index)
if nonce != 0 {
block := work.Block.WithMiningResult(nonce, common.BytesToHash(mixDigest))
self.returnCh <- &Result{work, block}
diff --git a/pow/ezp/pow.go b/pow/ezp/pow.go
index 03bb3da29..f7ae1cbf1 100644
--- a/pow/ezp/pow.go
+++ b/pow/ezp/pow.go
@@ -48,7 +48,7 @@ func (pow *EasyPow) Turbo(on bool) {
pow.turbo = on
}
-func (pow *EasyPow) Search(block pow.Block, stop <-chan struct{}) (uint64, []byte) {
+func (pow *EasyPow) Search(block pow.Block, stop <-chan struct{}, index int) (uint64, []byte) {
r := rand.New(rand.NewSource(time.Now().UnixNano()))
hash := block.HashNoNonce()
diff := block.Difficulty()
diff --git a/pow/pow.go b/pow/pow.go
index 22daf35e4..592d96475 100644
--- a/pow/pow.go
+++ b/pow/pow.go
@@ -17,7 +17,7 @@
package pow
type PoW interface {
- Search(block Block, stop <-chan struct{}) (uint64, []byte)
+ Search(block Block, stop <-chan struct{}, index int) (uint64, []byte)
Verify(block Block) bool
GetHashrate() int64
Turbo(bool)
diff --git a/rpc/api/miner.go b/rpc/api/miner.go
index 5325a660a..e07855dd2 100644
--- a/rpc/api/miner.go
+++ b/rpc/api/miner.go
@@ -100,7 +100,7 @@ func (self *minerApi) StartMiner(req *shared.Request) (interface{}, error) {
}
self.ethereum.StartAutoDAG()
- err := self.ethereum.StartMining(args.Threads)
+ err := self.ethereum.StartMining(args.Threads, "")
if err == nil {
return true, nil
}
diff --git a/xeth/xeth.go b/xeth/xeth.go
index 3b487fed8..701932f97 100644
--- a/xeth/xeth.go
+++ b/xeth/xeth.go
@@ -474,7 +474,7 @@ func (self *XEth) ClientVersion() string {
func (self *XEth) SetMining(shouldmine bool, threads int) bool {
ismining := self.backend.IsMining()
if shouldmine && !ismining {
- err := self.backend.StartMining(threads)
+ err := self.backend.StartMining(threads, "")
return err == nil
}
if ismining && !shouldmine {