Diffstat (limited to 'Godeps/_workspace/src/github.com/ethereum')
 Godeps/_workspace/src/github.com/ethereum/ethash/ethash.go                      |  24 ++--
 Godeps/_workspace/src/github.com/ethereum/ethash/ethash_opencl.go               | 629 ++++++++++
 Godeps/_workspace/src/github.com/ethereum/ethash/ethash_opencl_kernel_go_str.go | 600 +++++++++
 Godeps/_workspace/src/github.com/ethereum/ethash/ethash_test.go                 |   6 +-
 4 files changed, 1246 insertions(+), 13 deletions(-)
diff --git a/Godeps/_workspace/src/github.com/ethereum/ethash/ethash.go b/Godeps/_workspace/src/github.com/ethereum/ethash/ethash.go
index d0864da7f..ddb8ba583 100644
--- a/Godeps/_workspace/src/github.com/ethereum/ethash/ethash.go
+++ b/Godeps/_workspace/src/github.com/ethereum/ethash/ethash.go
@@ -30,8 +30,8 @@ import (
)
var (
- minDifficulty = new(big.Int).Exp(big.NewInt(2), big.NewInt(256), big.NewInt(0))
- sharedLight = new(Light)
+ maxUint256 = new(big.Int).Exp(big.NewInt(2), big.NewInt(256), big.NewInt(0))
+ sharedLight = new(Light)
)
const (
@@ -140,7 +140,7 @@ func (l *Light) Verify(block pow.Block) bool {
// the finalizer before the call completes.
_ = cache
// The actual check.
- target := new(big.Int).Div(minDifficulty, difficulty)
+ target := new(big.Int).Div(maxUint256, difficulty)
return h256ToHash(ret.result).Big().Cmp(target) <= 0
}
@@ -199,7 +199,7 @@ func (d *dag) generate() {
if d.dir == "" {
d.dir = DefaultDir
}
- glog.V(logger.Info).Infof("Generating DAG for epoch %d (%x)", d.epoch, seedHash)
+ glog.V(logger.Info).Infof("Generating DAG for epoch %d (size %d) (%x)", d.epoch, dagSize, seedHash)
// Generate a temporary cache.
// TODO: this could share the cache with Light
cache := C.ethash_light_new_internal(cacheSize, (*C.ethash_h256_t)(unsafe.Pointer(&seedHash[0])))
@@ -220,14 +220,18 @@ func (d *dag) generate() {
})
}
-func freeDAG(h *dag) {
- C.ethash_full_delete(h.ptr)
- h.ptr = nil
+func freeDAG(d *dag) {
+ C.ethash_full_delete(d.ptr)
+ d.ptr = nil
+}
+
+func (d *dag) Ptr() unsafe.Pointer {
+ return unsafe.Pointer(d.ptr.data)
}
//export ethashGoCallback
func ethashGoCallback(percent C.unsigned) C.int {
- glog.V(logger.Info).Infof("Still generating DAG: %d%%", percent)
+ glog.V(logger.Info).Infof("Generating DAG: %d%%", percent)
return 0
}
@@ -273,7 +277,7 @@ func (pow *Full) getDAG(blockNum uint64) (d *dag) {
return d
}
-func (pow *Full) Search(block pow.Block, stop <-chan struct{}) (nonce uint64, mixDigest []byte) {
+func (pow *Full) Search(block pow.Block, stop <-chan struct{}, index int) (nonce uint64, mixDigest []byte) {
dag := pow.getDAG(block.NumberU64())
r := rand.New(rand.NewSource(time.Now().UnixNano()))
@@ -286,7 +290,7 @@ func (pow *Full) Search(block pow.Block, stop <-chan struct{}) (nonce uint64, mi
nonce = uint64(r.Int63())
hash := hashToH256(block.HashNoNonce())
- target := new(big.Int).Div(minDifficulty, diff)
+ target := new(big.Int).Div(maxUint256, diff)
for {
select {
case <-stop:
diff --git a/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_opencl.go b/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_opencl.go
new file mode 100644
index 000000000..332b7f524
--- /dev/null
+++ b/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_opencl.go
@@ -0,0 +1,629 @@
+// Copyright 2014 The go-ethereum Authors
+// This file is part of the go-ethereum library.
+//
+// The go-ethereum library is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// The go-ethereum library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
+
+// +build opencl
+
+package ethash
+
+//#cgo LDFLAGS: -w
+//#include <stdint.h>
+//#include <string.h>
+//#include "src/libethash/internal.h"
+import "C"
+
+import (
+ crand "crypto/rand"
+ "encoding/binary"
+ "fmt"
+ "math"
+ "math/big"
+ mrand "math/rand"
+ "strconv"
+ "strings"
+ "sync"
+ "sync/atomic"
+ "time"
+ "unsafe"
+
+ "github.com/Gustav-Simonsson/go-opencl/cl"
+ "github.com/ethereum/go-ethereum/common"
+ "github.com/ethereum/go-ethereum/pow"
+)
+
+/*
+
+ This code has two main entry points:
+
+ 1. The InitCL(...) function configures one or more OpenCL devices
+ (for now only GPUs) and loads the Ethash DAG into device memory.
+
+ 2. The Search(...) function loads the block header hash and a start nonce
+ into device memory and executes the Ethash OpenCL search kernel.
+
+ Throughout the code, we refer to "host memory" and "device memory".
+ For most systems (e.g. regular PC GPU miner) the host memory is RAM and
+ device memory is the GPU global memory (e.g. GDDR5).
+
+ References mentioned in code comments:
+
+ 1. https://github.com/ethereum/wiki/wiki/Ethash
+ 2. https://github.com/ethereum/cpp-ethereum/blob/develop/libethash-cl/ethash_cl_miner.cpp
+ 3. https://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/
+ 4. http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2013/12/AMD_OpenCL_Programming_User_Guide.pdf
+
+*/
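+
+// A minimal host-side flow tying these two entry points together (a sketch:
+// error handling is elided, and "block" and "stop" are assumed to be supplied
+// by the caller):
+//
+//	miner := NewCL([]int{0}) // mine on OpenCL device id 0
+//	if err := InitCL(block.NumberU64(), miner); err != nil {
+//		panic(err)
+//	}
+//	nonce, mixDigest := miner.Search(block, stop, 0)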
+
+type OpenCLDevice struct {
+ deviceId int
+ device *cl.Device
+ openCL11 bool // OpenCL versions 1.1 and 1.2 are handled a bit differently
+ openCL12 bool
+
+ dagBuf *cl.MemObject // Ethash full DAG in device mem
+ headerBuf *cl.MemObject // Hash of block-to-mine in device mem
+ searchBuffers []*cl.MemObject
+
+ searchKernel *cl.Kernel
+ hashKernel *cl.Kernel
+
+ queue *cl.CommandQueue
+ ctx *cl.Context
+ workGroupSize int
+
+ nonceRand *mrand.Rand // seeded by crypto/rand, see comments where it's initialised
+ result common.Hash
+}
+
+type OpenCLMiner struct {
+ mu sync.Mutex
+
+ ethash *Ethash // Ethash full DAG & cache in host mem
+
+ deviceIds []int
+ devices []*OpenCLDevice
+
+ dagSize uint64
+
+ hashRate int32 // Go atomics & uint64 have some issues; int32 is supported on all platforms
+}
+
+type pendingSearch struct {
+ bufIndex uint32
+ startNonce uint64
+}
+
+const (
+ SIZEOF_UINT32 = 4
+
+ // See [1]
+ ethashMixBytesLen = 128
+ ethashAccesses = 64
+
+ // See [4]
+ workGroupSize = 32 // must be multiple of 8
+ maxSearchResults = 63
+ searchBufSize = 2
+ globalWorkSize = 1024 * 256
+)
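+
+// For a sense of scale: each kernel launch below covers globalWorkSize nonces,
+// i.e. 1024 * 256 = 262144 hashes per EnqueueNDRangeKernel call, split into
+// globalWorkSize / workGroupSize = 8192 work groups of 32 work items each.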
+
+func NewCL(deviceIds []int) *OpenCLMiner {
+ ids := make([]int, len(deviceIds))
+ copy(ids, deviceIds)
+ return &OpenCLMiner{
+ ethash: New(),
+ dagSize: 0, // to see if we need to update DAG.
+ deviceIds: ids,
+ }
+}
+
+func PrintDevices() {
+ fmt.Println("=============================================")
+ fmt.Println("============ OpenCL Device Info =============")
+ fmt.Println("=============================================")
+
+ var found []*cl.Device
+
+ platforms, err := cl.GetPlatforms()
+ if err != nil {
+ fmt.Printf("Platform error (check your OpenCL installation): %v\n", err)
+ return
+ }
+
+ for i, p := range platforms {
+ fmt.Println("Platform id ", i)
+ fmt.Println("Platform Name ", p.Name())
+ fmt.Println("Platform Vendor ", p.Vendor())
+ fmt.Println("Platform Version ", p.Version())
+ fmt.Println("Platform Extensions ", p.Extensions())
+ fmt.Println("Platform Profile ", p.Profile())
+ fmt.Println("")
+
+ devices, err := cl.GetDevices(p, cl.DeviceTypeGPU)
+ if err != nil {
+ fmt.Println("Device error (check your GPU drivers):", err)
+ return
+ }
+
+ for j, d := range devices {
+ fmt.Println("Device OpenCL id ", j)
+ fmt.Println("Device id for mining ", len(found))
+ fmt.Println("Device Name ", d.Name())
+ fmt.Println("Vendor ", d.Vendor())
+ fmt.Println("Version ", d.Version())
+ fmt.Println("Driver version ", d.DriverVersion())
+ fmt.Println("Address bits ", d.AddressBits())
+ fmt.Println("Max clock freq ", d.MaxClockFrequency())
+ fmt.Println("Global mem size ", d.GlobalMemSize())
+ fmt.Println("Max constant buffer size", d.MaxConstantBufferSize())
+ fmt.Println("Max mem alloc size ", d.MaxMemAllocSize())
+ fmt.Println("Max compute units ", d.MaxComputeUnits())
+ fmt.Println("Max work group size ", d.MaxWorkGroupSize())
+ fmt.Println("Max work item sizes ", d.MaxWorkItemSizes())
+ fmt.Println("=============================================")
+
+ found = append(found, d)
+ }
+ }
+ if len(found) == 0 {
+ fmt.Println("Found no GPU(s). Check that your OS can see the GPU(s)")
+ } else {
+ var idsFormat string
+ for i := 0; i < len(found); i++ {
+ idsFormat += strconv.Itoa(i)
+ if i != len(found)-1 {
+ idsFormat += ","
+ }
+ }
+ fmt.Printf("Found %v devices. Benchmark first GPU: geth gpubench 0\n", len(found))
+ fmt.Printf("Mine using all GPUs: geth --minegpu %v\n", idsFormat)
+ }
+}
+
+// See [2]. We basically do the same here, but the Go OpenCL bindings
+// are at a slightly higher abstraction level.
+func InitCL(blockNum uint64, c *OpenCLMiner) error {
+ platforms, err := cl.GetPlatforms()
+ if err != nil {
+ return fmt.Errorf("Platform error: %v\nCheck your OpenCL installation and then run geth gpuinfo", err)
+ }
+
+ var devices []*cl.Device
+ for _, p := range platforms {
+ ds, err := cl.GetDevices(p, cl.DeviceTypeGPU)
+ if err != nil {
+ return fmt.Errorf("Devices error: %v\nCheck your GPU drivers and then run geth gpuinfo", err)
+ }
+ for _, d := range ds {
+ devices = append(devices, d)
+ }
+ }
+
+ pow := New()
+ _ = pow.getDAG(blockNum) // generates DAG if we don't have it
+ pow.Light.getCache(blockNum) // and cache
+
+ c.ethash = pow
+ dagSize := uint64(C.ethash_get_datasize(C.uint64_t(blockNum)))
+ c.dagSize = dagSize
+
+ for _, id := range c.deviceIds {
+ if id > len(devices)-1 {
+ return fmt.Errorf("Device id not found. See available device ids with: geth gpuinfo")
+ } else {
+ err := initCLDevice(id, devices[id], c)
+ if err != nil {
+ return err
+ }
+ }
+ }
+ if len(c.devices) == 0 {
+ return fmt.Errorf("No GPU devices found")
+ }
+ return nil
+}
+
+func initCLDevice(deviceId int, device *cl.Device, c *OpenCLMiner) error {
+ devMaxAlloc := uint64(device.MaxMemAllocSize())
+ devGlobalMem := uint64(device.GlobalMemSize())
+
+ // TODO: more fine grained version logic
+ if device.Version() == "OpenCL 1.0" {
+ fmt.Println("Device OpenCL version not supported: ", device.Version())
+ return fmt.Errorf("opencl version not supported")
+ }
+
+ var cl11, cl12 bool
+ if device.Version() == "OpenCL 1.1" {
+ cl11 = true
+ }
+ if device.Version() == "OpenCL 1.2" {
+ cl12 = true
+ }
+
+ // log warnings but carry on; some device drivers report inaccurate values
+ if c.dagSize > devGlobalMem {
+ fmt.Printf("WARNING: device memory may be insufficient: %v. DAG size: %v.\n", devGlobalMem, c.dagSize)
+ }
+
+ if c.dagSize > devMaxAlloc {
+ fmt.Printf("WARNING: DAG size (%v) larger than device max memory allocation size (%v).\n", c.dagSize, devMaxAlloc)
+ fmt.Printf("You probably have to export GPU_MAX_ALLOC_PERCENT=95\n")
+ }
+
+ fmt.Printf("Initialising device %v: %v\n", deviceId, device.Name())
+
+ context, err := cl.CreateContext([]*cl.Device{device})
+ if err != nil {
+ return fmt.Errorf("failed creating context: %v", err)
+ }
+
+ // TODO: test running with CL_QUEUE_PROFILING_ENABLE for profiling?
+ queue, err := context.CreateCommandQueue(device, 0)
+ if err != nil {
+ return fmt.Errorf("command queue err: %v", err)
+ }
+
+ // See [4] section 3.2 and [3] "clBuildProgram".
+ // The OpenCL kernel code is compiled at run-time.
+ kvs := make(map[string]string, 4)
+ kvs["GROUP_SIZE"] = strconv.FormatUint(workGroupSize, 10)
+ kvs["DAG_SIZE"] = strconv.FormatUint(c.dagSize/ethashMixBytesLen, 10)
+ kvs["ACCESSES"] = strconv.FormatUint(ethashAccesses, 10)
+ kvs["MAX_OUTPUTS"] = strconv.FormatUint(maxSearchResults, 10)
+ kernelCode := replaceWords(kernel, kvs)
+
+ program, err := context.CreateProgramWithSource([]string{kernelCode})
+ if err != nil {
+ return fmt.Errorf("program err: %v", err)
+ }
+
+ /* If using the AMD OpenCL implementation, you can set buildOpts to debug
+ on an x86 CPU device; see the AMD OpenCL programming guide, section 4.2.
+
+ export in shell before running:
+ export AMD_OCL_BUILD_OPTIONS_APPEND="-g -O0"
+ export CPU_MAX_COMPUTE_UNITS=1
+
+ buildOpts := "-g -cl-opt-disable"
+
+ */
+ buildOpts := ""
+ err = program.BuildProgram([]*cl.Device{device}, buildOpts)
+ if err != nil {
+ return fmt.Errorf("program build err: %v", err)
+ }
+
+ searchKernel, err := program.CreateKernel("ethash_search")
+ if err != nil {
+ return fmt.Errorf("search kernel err: %v", err)
+ }
+ hashKernel, err := program.CreateKernel("ethash_hash")
+ if err != nil {
+ return fmt.Errorf("hash kernel err: %v", err)
+ }
+
+ // TODO: when this DAG size appears, patch the Go bindings
+ // (context.go) to work with uint64 as size_t
+ if c.dagSize > math.MaxInt32 {
+ fmt.Println("DAG too large for allocation.")
+ return fmt.Errorf("DAG too large for alloc")
+ }
+
+ // TODO: patch up Go bindings to work with size_t; this will overflow once
+ // the DAG exceeds maxint32 bytes, around 2017-06-09 12:17:02
+ dagBuf, err := context.CreateEmptyBuffer(cl.MemReadOnly, int(c.dagSize))
+ if err != nil {
+ return fmt.Errorf("allocating dag buf failed: %v", err)
+ }
+
+ // write DAG to device mem
+ dagPtr := unsafe.Pointer(c.ethash.Full.current.ptr.data)
+ _, err = queue.EnqueueWriteBuffer(dagBuf, true, 0, int(c.dagSize), dagPtr, nil)
+ if err != nil {
+ return fmt.Errorf("writing to dag buf failed: %v", err)
+ }
+
+ searchBuffers := make([]*cl.MemObject, searchBufSize)
+ for i := 0; i < searchBufSize; i++ {
+ searchBuff, err := context.CreateEmptyBuffer(cl.MemWriteOnly, (1+maxSearchResults)*SIZEOF_UINT32)
+ if err != nil {
+ return fmt.Errorf("search buffer err: %v", err)
+ }
+ searchBuffers[i] = searchBuff
+ }
+
+ headerBuf, err := context.CreateEmptyBuffer(cl.MemReadOnly, 32)
+ if err != nil {
+ return fmt.Errorf("header buffer err: %v", err)
+ }
+
+ // Unique, random nonces are crucial for mining efficiency.
+ // While we do not need a cryptographically secure PRNG for nonces,
+ // we want uniform distribution and minimal repetition of nonces.
+ // We could guarantee strict uniqueness by generating unique nonce ranges,
+ // but an int64 seed from crypto/rand should be good enough.
+ // We then use math/rand for speed and to avoid draining the OS entropy pool.
+ seed, err := crand.Int(crand.Reader, big.NewInt(math.MaxInt64))
+ if err != nil {
+ return err
+ }
+ nonceRand := mrand.New(mrand.NewSource(seed.Int64()))
+
+ deviceStruct := &OpenCLDevice{
+ deviceId: deviceId,
+ device: device,
+ openCL11: cl11,
+ openCL12: cl12,
+
+ dagBuf: dagBuf,
+ headerBuf: headerBuf,
+ searchBuffers: searchBuffers,
+
+ searchKernel: searchKernel,
+ hashKernel: hashKernel,
+
+ queue: queue,
+ ctx: context,
+
+ workGroupSize: workGroupSize,
+
+ nonceRand: nonceRand,
+ }
+ c.devices = append(c.devices, deviceStruct)
+
+ return nil
+}
+
+func (c *OpenCLMiner) Search(block pow.Block, stop <-chan struct{}, index int) (uint64, []byte) {
+ c.mu.Lock()
+ defer c.mu.Unlock()
+
+ newDagSize := uint64(C.ethash_get_datasize(C.uint64_t(block.NumberU64())))
+ if newDagSize > c.dagSize {
+ // TODO: clean up buffers from previous DAG?
+ err := InitCL(block.NumberU64(), c)
+ if err != nil {
+ fmt.Println("OpenCL init error: ", err)
+ return 0, []byte{0}
+ }
+ }
+
+ // Bail out early if a stop was requested while InitCL was running
+ select {
+ case <-stop:
+ return 0, []byte{0}
+ default:
+ }
+
+ headerHash := block.HashNoNonce()
+ diff := block.Difficulty()
+ target256 := new(big.Int).Div(maxUint256, diff)
+ target64 := new(big.Int).Rsh(target256, 192).Uint64()
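+ // The kernel compares only the top 64 bits of each hash against target64;
+ // candidate nonces are re-verified on the CPU against the full 256-bit
+ // target256 below. E.g. for difficulty 2, target256 = 2^255 and
+ // target64 = 2^255 >> 192 = 2^63.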
+ var zero uint32 = 0
+
+ d := c.devices[index]
+
+ _, err := d.queue.EnqueueWriteBuffer(d.headerBuf, false, 0, 32, unsafe.Pointer(&headerHash[0]), nil)
+ if err != nil {
+ fmt.Println("Error in Search clEnqueueWriteBuffer : ", err)
+ return 0, []byte{0}
+ }
+
+ for i := 0; i < searchBufSize; i++ {
+ _, err := d.queue.EnqueueWriteBuffer(d.searchBuffers[i], false, 0, 4, unsafe.Pointer(&zero), nil)
+ if err != nil {
+ fmt.Println("Error in Search clEnqueueWriteBuffer : ", err)
+ return 0, []byte{0}
+ }
+ }
+
+ // wait for all search buffers to complete
+ err = d.queue.Finish()
+ if err != nil {
+ fmt.Println("Error in Search clFinish : ", err)
+ return 0, []byte{0}
+ }
+
+ err = d.searchKernel.SetArg(1, d.headerBuf)
+ if err != nil {
+ fmt.Println("Error in Search clSetKernelArg : ", err)
+ return 0, []byte{0}
+ }
+
+ err = d.searchKernel.SetArg(2, d.dagBuf)
+ if err != nil {
+ fmt.Println("Error in Search clSetKernelArg : ", err)
+ return 0, []byte{0}
+ }
+
+ err = d.searchKernel.SetArg(4, target64)
+ if err != nil {
+ fmt.Println("Error in Search clSetKernelArg : ", err)
+ return 0, []byte{0}
+ }
+ err = d.searchKernel.SetArg(5, uint32(math.MaxUint32))
+ if err != nil {
+ fmt.Println("Error in Search clSetKernelArg : ", err)
+ return 0, []byte{0}
+ }
+
+ // wait on this before returning
+ var preReturnEvent *cl.Event
+ if d.openCL12 {
+ preReturnEvent, err = d.ctx.CreateUserEvent()
+ if err != nil {
+ fmt.Println("Error in Search create CL user event : ", err)
+ return 0, []byte{0}
+ }
+ }
+
+ pending := make([]pendingSearch, 0, searchBufSize)
+ var p *pendingSearch
+ searchBufIndex := uint32(0)
+ var checkNonce uint64
+ loops := int64(0)
+ prevHashRate := int32(0)
+ start := time.Now().UnixNano()
+ // We grab a single random nonce and set it as an argument to the kernel
+ // search function; the device then adds each work item's global id to it,
+ // creating a unique nonce per work item executing in parallel.
+ initNonce := uint64(d.nonceRand.Int63())
+ for nonce := initNonce; ; nonce += uint64(globalWorkSize) {
+ select {
+ case <-stop:
+
+ /*
+ if d.openCL12 {
+ err = cl.WaitForEvents([]*cl.Event{preReturnEvent})
+ if err != nil {
+ fmt.Println("Error in Search WaitForEvents: ", err)
+ }
+ }
+ */
+
+ atomic.AddInt32(&c.hashRate, -prevHashRate)
+ return 0, []byte{0}
+ default:
+ }
+
+ if (loops % (1 << 7)) == 0 {
+ elapsed := time.Now().UnixNano() - start
+ // TODO: verify if this is correct hash rate calculation
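+ // Each loop iteration enqueues one kernel launch of globalWorkSize nonces,
+ // so this computes hashes/s = loops * globalWorkSize / (elapsed / 1e9).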
+ hashes := (float64(1e9) / float64(elapsed)) * float64(loops*globalWorkSize)
+ hashrateDiff := int32(hashes) - prevHashRate
+ prevHashRate = int32(hashes)
+ atomic.AddInt32(&c.hashRate, hashrateDiff)
+ }
+ loops++
+
+ err = d.searchKernel.SetArg(0, d.searchBuffers[searchBufIndex])
+ if err != nil {
+ fmt.Println("Error in Search clSetKernelArg : ", err)
+ return 0, []byte{0}
+ }
+ err = d.searchKernel.SetArg(3, nonce)
+ if err != nil {
+ fmt.Println("Error in Search clSetKernelArg : ", err)
+ return 0, []byte{0}
+ }
+
+ // execute kernel
+ _, err := d.queue.EnqueueNDRangeKernel(
+ d.searchKernel,
+ []int{0},
+ []int{globalWorkSize},
+ []int{d.workGroupSize},
+ nil)
+ if err != nil {
+ fmt.Println("Error in Search clEnqueueNDRangeKernel : ", err)
+ return 0, []byte{0}
+ }
+
+ pending = append(pending, pendingSearch{bufIndex: searchBufIndex, startNonce: nonce})
+ searchBufIndex = (searchBufIndex + 1) % searchBufSize
+
+ if len(pending) == searchBufSize {
+ p = &(pending[searchBufIndex])
+ cres, _, err := d.queue.EnqueueMapBuffer(d.searchBuffers[p.bufIndex], true,
+ cl.MapFlagRead, 0, (1+maxSearchResults)*SIZEOF_UINT32,
+ nil)
+ if err != nil {
+ fmt.Println("Error in Search clEnqueueMapBuffer: ", err)
+ return 0, []byte{0}
+ }
+
+ results := cres.ByteSlice()
+ nfound := binary.LittleEndian.Uint32(results)
+ nfound = uint32(math.Min(float64(nfound), float64(maxSearchResults)))
+ // OpenCL returns the offsets from the start nonce
+ for i := uint32(0); i < nfound; i++ {
+ lo := (i + 1) * SIZEOF_UINT32
+ hi := (i + 2) * SIZEOF_UINT32
+ upperNonce := uint64(binary.LittleEndian.Uint32(results[lo:hi]))
+ checkNonce = p.startNonce + upperNonce
+ if checkNonce != 0 {
+ cn := C.uint64_t(checkNonce)
+ ds := C.uint64_t(c.dagSize)
+ // We verify that the nonce is indeed a solution by
+ // executing the Ethash verification function (on the CPU).
+ ret := C.ethash_light_compute_internal(c.ethash.Light.current.ptr, ds, hashToH256(headerHash), cn)
+ // TODO: return result first
+ if ret.success && h256ToHash(ret.result).Big().Cmp(target256) <= 0 {
+ _, err = d.queue.EnqueueUnmapMemObject(d.searchBuffers[p.bufIndex], cres, nil)
+ if err != nil {
+ fmt.Println("Error in Search clEnqueueUnmapMemObject: ", err)
+ }
+ if d.openCL12 {
+ err = cl.WaitForEvents([]*cl.Event{preReturnEvent})
+ if err != nil {
+ fmt.Println("Error in Search WaitForEvents: ", err)
+ }
+ }
+ return checkNonce, C.GoBytes(unsafe.Pointer(&ret.mix_hash), C.int(32))
+ }
+
+ _, err := d.queue.EnqueueWriteBuffer(d.searchBuffers[p.bufIndex], false, 0, 4, unsafe.Pointer(&zero), nil)
+ if err != nil {
+ fmt.Println("Error in Search cl: EnqueueWriteBuffer", err)
+ return 0, []byte{0}
+ }
+ }
+ }
+ _, err = d.queue.EnqueueUnmapMemObject(d.searchBuffers[p.bufIndex], cres, nil)
+ if err != nil {
+ fmt.Println("Error in Search clEnqueueUnmapMemObject: ", err)
+ return 0, []byte{0}
+ }
+ pending = append(pending[:searchBufIndex], pending[searchBufIndex+1:]...)
+ }
+ }
+ if d.openCL12 {
+ err := cl.WaitForEvents([]*cl.Event{preReturnEvent})
+ if err != nil {
+ fmt.Println("Error in Search clWaitForEvents: ", err)
+ return 0, []byte{0}
+ }
+ }
+ return 0, []byte{0}
+}
+
+func (c *OpenCLMiner) Verify(block pow.Block) bool {
+ return c.ethash.Light.Verify(block)
+}
+func (c *OpenCLMiner) GetHashrate() int64 {
+ return int64(atomic.LoadInt32(&c.hashRate))
+}
+func (c *OpenCLMiner) Turbo(on bool) {
+ // This is GPU mining. Always be turbo.
+}
+
+func replaceWords(text string, kvs map[string]string) string {
+ for k, v := range kvs {
+ text = strings.Replace(text, k, v, -1)
+ }
+ return text
+}
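+
+// For example, with the constants above replaceWords turns the kernel source
+// fragment "GROUP_SIZE / THREADS_PER_HASH" into "32 / THREADS_PER_HASH":
+//
+//	kvs := map[string]string{"GROUP_SIZE": "32"}
+//	fmt.Println(replaceWords("GROUP_SIZE / THREADS_PER_HASH", kvs))
+//	// Output: 32 / THREADS_PER_HASH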
+
+func logErr(err error) {
+ if err != nil {
+ fmt.Println("Error in OpenCL call:", err)
+ }
+}
+
+func argErr(err error) error {
+ return fmt.Errorf("arg err: %v", err)
+}
diff --git a/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_opencl_kernel_go_str.go b/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_opencl_kernel_go_str.go
new file mode 100644
index 000000000..695ff1829
--- /dev/null
+++ b/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_opencl_kernel_go_str.go
@@ -0,0 +1,600 @@
+package ethash
+
+/* DO NOT EDIT!!!
+
+ This code is version controlled at
+ https://github.com/ethereum/cpp-ethereum/blob/develop/libethash-cl/ethash_cl_miner_kernel.cl
+
+ If needed change it there first, then copy over here.
+*/
+
+const kernel = `
+// author Tim Hughes <tim@twistedfury.com>
+// Tested on Radeon HD 7850
+// Hashrate: 15940347 hashes/s
+// Bandwidth: 124533 MB/s
+// search kernel should fit in <= 84 VGPRS (3 wavefronts)
+
+#define THREADS_PER_HASH (128 / 16)
+#define HASHES_PER_LOOP (GROUP_SIZE / THREADS_PER_HASH)
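+
+// With GROUP_SIZE = 32 (substituted in by the Go host code), THREADS_PER_HASH
+// = 128/16 = 8 work items cooperate on each hash, one 16-byte uint4 lane of
+// the 128-byte mix each, giving HASHES_PER_LOOP = 32/8 = 4 hashes per group.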
+
+#define FNV_PRIME 0x01000193
+
+__constant uint2 const Keccak_f1600_RC[24] = {
+ (uint2)(0x00000001, 0x00000000),
+ (uint2)(0x00008082, 0x00000000),
+ (uint2)(0x0000808a, 0x80000000),
+ (uint2)(0x80008000, 0x80000000),
+ (uint2)(0x0000808b, 0x00000000),
+ (uint2)(0x80000001, 0x00000000),
+ (uint2)(0x80008081, 0x80000000),
+ (uint2)(0x00008009, 0x80000000),
+ (uint2)(0x0000008a, 0x00000000),
+ (uint2)(0x00000088, 0x00000000),
+ (uint2)(0x80008009, 0x00000000),
+ (uint2)(0x8000000a, 0x00000000),
+ (uint2)(0x8000808b, 0x00000000),
+ (uint2)(0x0000008b, 0x80000000),
+ (uint2)(0x00008089, 0x80000000),
+ (uint2)(0x00008003, 0x80000000),
+ (uint2)(0x00008002, 0x80000000),
+ (uint2)(0x00000080, 0x80000000),
+ (uint2)(0x0000800a, 0x00000000),
+ (uint2)(0x8000000a, 0x80000000),
+ (uint2)(0x80008081, 0x80000000),
+ (uint2)(0x00008080, 0x80000000),
+ (uint2)(0x80000001, 0x00000000),
+ (uint2)(0x80008008, 0x80000000),
+};
+
+void keccak_f1600_round(uint2* a, uint r, uint out_size)
+{
+ #if !__ENDIAN_LITTLE__
+ for (uint i = 0; i != 25; ++i)
+ a[i] = a[i].yx;
+ #endif
+
+ uint2 b[25];
+ uint2 t;
+
+ // Theta
+ b[0] = a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20];
+ b[1] = a[1] ^ a[6] ^ a[11] ^ a[16] ^ a[21];
+ b[2] = a[2] ^ a[7] ^ a[12] ^ a[17] ^ a[22];
+ b[3] = a[3] ^ a[8] ^ a[13] ^ a[18] ^ a[23];
+ b[4] = a[4] ^ a[9] ^ a[14] ^ a[19] ^ a[24];
+ t = b[4] ^ (uint2)(b[1].x << 1 | b[1].y >> 31, b[1].y << 1 | b[1].x >> 31);
+ a[0] ^= t;
+ a[5] ^= t;
+ a[10] ^= t;
+ a[15] ^= t;
+ a[20] ^= t;
+ t = b[0] ^ (uint2)(b[2].x << 1 | b[2].y >> 31, b[2].y << 1 | b[2].x >> 31);
+ a[1] ^= t;
+ a[6] ^= t;
+ a[11] ^= t;
+ a[16] ^= t;
+ a[21] ^= t;
+ t = b[1] ^ (uint2)(b[3].x << 1 | b[3].y >> 31, b[3].y << 1 | b[3].x >> 31);
+ a[2] ^= t;
+ a[7] ^= t;
+ a[12] ^= t;
+ a[17] ^= t;
+ a[22] ^= t;
+ t = b[2] ^ (uint2)(b[4].x << 1 | b[4].y >> 31, b[4].y << 1 | b[4].x >> 31);
+ a[3] ^= t;
+ a[8] ^= t;
+ a[13] ^= t;
+ a[18] ^= t;
+ a[23] ^= t;
+ t = b[3] ^ (uint2)(b[0].x << 1 | b[0].y >> 31, b[0].y << 1 | b[0].x >> 31);
+ a[4] ^= t;
+ a[9] ^= t;
+ a[14] ^= t;
+ a[19] ^= t;
+ a[24] ^= t;
+
+ // Rho Pi
+ b[0] = a[0];
+ b[10] = (uint2)(a[1].x << 1 | a[1].y >> 31, a[1].y << 1 | a[1].x >> 31);
+ b[7] = (uint2)(a[10].x << 3 | a[10].y >> 29, a[10].y << 3 | a[10].x >> 29);
+ b[11] = (uint2)(a[7].x << 6 | a[7].y >> 26, a[7].y << 6 | a[7].x >> 26);
+ b[17] = (uint2)(a[11].x << 10 | a[11].y >> 22, a[11].y << 10 | a[11].x >> 22);
+ b[18] = (uint2)(a[17].x << 15 | a[17].y >> 17, a[17].y << 15 | a[17].x >> 17);
+ b[3] = (uint2)(a[18].x << 21 | a[18].y >> 11, a[18].y << 21 | a[18].x >> 11);
+ b[5] = (uint2)(a[3].x << 28 | a[3].y >> 4, a[3].y << 28 | a[3].x >> 4);
+ b[16] = (uint2)(a[5].y << 4 | a[5].x >> 28, a[5].x << 4 | a[5].y >> 28);
+ b[8] = (uint2)(a[16].y << 13 | a[16].x >> 19, a[16].x << 13 | a[16].y >> 19);
+ b[21] = (uint2)(a[8].y << 23 | a[8].x >> 9, a[8].x << 23 | a[8].y >> 9);
+ b[24] = (uint2)(a[21].x << 2 | a[21].y >> 30, a[21].y << 2 | a[21].x >> 30);
+ b[4] = (uint2)(a[24].x << 14 | a[24].y >> 18, a[24].y << 14 | a[24].x >> 18);
+ b[15] = (uint2)(a[4].x << 27 | a[4].y >> 5, a[4].y << 27 | a[4].x >> 5);
+ b[23] = (uint2)(a[15].y << 9 | a[15].x >> 23, a[15].x << 9 | a[15].y >> 23);
+ b[19] = (uint2)(a[23].y << 24 | a[23].x >> 8, a[23].x << 24 | a[23].y >> 8);
+ b[13] = (uint2)(a[19].x << 8 | a[19].y >> 24, a[19].y << 8 | a[19].x >> 24);
+ b[12] = (uint2)(a[13].x << 25 | a[13].y >> 7, a[13].y << 25 | a[13].x >> 7);
+ b[2] = (uint2)(a[12].y << 11 | a[12].x >> 21, a[12].x << 11 | a[12].y >> 21);
+ b[20] = (uint2)(a[2].y << 30 | a[2].x >> 2, a[2].x << 30 | a[2].y >> 2);
+ b[14] = (uint2)(a[20].x << 18 | a[20].y >> 14, a[20].y << 18 | a[20].x >> 14);
+ b[22] = (uint2)(a[14].y << 7 | a[14].x >> 25, a[14].x << 7 | a[14].y >> 25);
+ b[9] = (uint2)(a[22].y << 29 | a[22].x >> 3, a[22].x << 29 | a[22].y >> 3);
+ b[6] = (uint2)(a[9].x << 20 | a[9].y >> 12, a[9].y << 20 | a[9].x >> 12);
+ b[1] = (uint2)(a[6].y << 12 | a[6].x >> 20, a[6].x << 12 | a[6].y >> 20);
+
+ // Chi
+ a[0] = bitselect(b[0] ^ b[2], b[0], b[1]);
+ a[1] = bitselect(b[1] ^ b[3], b[1], b[2]);
+ a[2] = bitselect(b[2] ^ b[4], b[2], b[3]);
+ a[3] = bitselect(b[3] ^ b[0], b[3], b[4]);
+ if (out_size >= 4)
+ {
+ a[4] = bitselect(b[4] ^ b[1], b[4], b[0]);
+ a[5] = bitselect(b[5] ^ b[7], b[5], b[6]);
+ a[6] = bitselect(b[6] ^ b[8], b[6], b[7]);
+ a[7] = bitselect(b[7] ^ b[9], b[7], b[8]);
+ a[8] = bitselect(b[8] ^ b[5], b[8], b[9]);
+ if (out_size >= 8)
+ {
+ a[9] = bitselect(b[9] ^ b[6], b[9], b[5]);
+ a[10] = bitselect(b[10] ^ b[12], b[10], b[11]);
+ a[11] = bitselect(b[11] ^ b[13], b[11], b[12]);
+ a[12] = bitselect(b[12] ^ b[14], b[12], b[13]);
+ a[13] = bitselect(b[13] ^ b[10], b[13], b[14]);
+ a[14] = bitselect(b[14] ^ b[11], b[14], b[10]);
+ a[15] = bitselect(b[15] ^ b[17], b[15], b[16]);
+ a[16] = bitselect(b[16] ^ b[18], b[16], b[17]);
+ a[17] = bitselect(b[17] ^ b[19], b[17], b[18]);
+ a[18] = bitselect(b[18] ^ b[15], b[18], b[19]);
+ a[19] = bitselect(b[19] ^ b[16], b[19], b[15]);
+ a[20] = bitselect(b[20] ^ b[22], b[20], b[21]);
+ a[21] = bitselect(b[21] ^ b[23], b[21], b[22]);
+ a[22] = bitselect(b[22] ^ b[24], b[22], b[23]);
+ a[23] = bitselect(b[23] ^ b[20], b[23], b[24]);
+ a[24] = bitselect(b[24] ^ b[21], b[24], b[20]);
+ }
+ }
+
+ // Iota
+ a[0] ^= Keccak_f1600_RC[r];
+
+ #if !__ENDIAN_LITTLE__
+ for (uint i = 0; i != 25; ++i)
+ a[i] = a[i].yx;
+ #endif
+}
+
+void keccak_f1600_no_absorb(ulong* a, uint in_size, uint out_size, uint isolate)
+{
+ for (uint i = in_size; i != 25; ++i)
+ {
+ a[i] = 0;
+ }
+#if __ENDIAN_LITTLE__
+ a[in_size] ^= 0x0000000000000001;
+ a[24-out_size*2] ^= 0x8000000000000000;
+#else
+ a[in_size] ^= 0x0100000000000000;
+ a[24-out_size*2] ^= 0x0000000000000080;
+#endif
+
+ // Originally I unrolled the first and last rounds to interface
+ // better with surrounding code, however I haven't done this
+ // without causing the AMD compiler to blow up the VGPR usage.
+ uint r = 0;
+ do
+ {
+ // This dynamic branch stops the AMD compiler unrolling the loop
+ // and additionally saves about 33% of the VGPRs, enough to gain another
+ // wavefront. Ideally we'd get 4 in flight, but 3 is the best I can
+ // massage out of the compiler. It doesn't really seem to matter how
+ // much we try and help the compiler save VGPRs because it seems to throw
+ // that information away, hence the implementation of keccak here
+ // doesn't bother.
+ if (isolate)
+ {
+ keccak_f1600_round((uint2*)a, r++, 25);
+ }
+ }
+ while (r < 23);
+
+ // final round optimised for digest size
+ keccak_f1600_round((uint2*)a, r++, out_size);
+}
+
+#define copy(dst, src, count) for (uint i = 0; i != count; ++i) { (dst)[i] = (src)[i]; }
+
+#define countof(x) (sizeof(x) / sizeof(x[0]))
+
+uint fnv(uint x, uint y)
+{
+ return x * FNV_PRIME ^ y;
+}
+
+uint4 fnv4(uint4 x, uint4 y)
+{
+ return x * FNV_PRIME ^ y;
+}
+
+uint fnv_reduce(uint4 v)
+{
+ return fnv(fnv(fnv(v.x, v.y), v.z), v.w);
+}
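+
+// This is the modified FNV used by ethash: the multiply-then-XOR ordering of
+// FNV-1, applied to whole 32-bit words rather than bytes. fnv_reduce folds a
+// uint4 lane by lane: fnv_reduce(v) == fnv(fnv(fnv(v.x, v.y), v.z), v.w).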
+
+typedef union
+{
+ ulong ulongs[32 / sizeof(ulong)];
+ uint uints[32 / sizeof(uint)];
+} hash32_t;
+
+typedef union
+{
+ ulong ulongs[64 / sizeof(ulong)];
+ uint4 uint4s[64 / sizeof(uint4)];
+} hash64_t;
+
+typedef union
+{
+ uint uints[128 / sizeof(uint)];
+ uint4 uint4s[128 / sizeof(uint4)];
+} hash128_t;
+
+hash64_t init_hash(__constant hash32_t const* header, ulong nonce, uint isolate)
+{
+ hash64_t init;
+ uint const init_size = countof(init.ulongs);
+ uint const hash_size = countof(header->ulongs);
+
+ // sha3_512(header .. nonce)
+ ulong state[25];
+ copy(state, header->ulongs, hash_size);
+ state[hash_size] = nonce;
+ keccak_f1600_no_absorb(state, hash_size + 1, init_size, isolate);
+
+ copy(init.ulongs, state, init_size);
+ return init;
+}
+
+uint inner_loop_chunks(uint4 init, uint thread_id, __local uint* share, __global hash128_t const* g_dag, __global hash128_t const* g_dag1, __global hash128_t const* g_dag2, __global hash128_t const* g_dag3, uint isolate)
+{
+ uint4 mix = init;
+
+ // share init0
+ if (thread_id == 0)
+ *share = mix.x;
+ barrier(CLK_LOCAL_MEM_FENCE);
+ uint init0 = *share;
+
+ uint a = 0;
+ do
+ {
+ bool update_share = thread_id == (a/4) % THREADS_PER_HASH;
+
+ #pragma unroll
+ for (uint i = 0; i != 4; ++i)
+ {
+ if (update_share)
+ {
+ uint m[4] = { mix.x, mix.y, mix.z, mix.w };
+ *share = fnv(init0 ^ (a+i), m[i]) % DAG_SIZE;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ mix = fnv4(mix, *share>=3 * DAG_SIZE / 4 ? g_dag3[*share - 3 * DAG_SIZE / 4].uint4s[thread_id] : *share>=DAG_SIZE / 2 ? g_dag2[*share - DAG_SIZE / 2].uint4s[thread_id] : *share>=DAG_SIZE / 4 ? g_dag1[*share - DAG_SIZE / 4].uint4s[thread_id]:g_dag[*share].uint4s[thread_id]);
+ }
+ } while ((a += 4) != (ACCESSES & isolate));
+
+ return fnv_reduce(mix);
+}
+
+
+
+uint inner_loop(uint4 init, uint thread_id, __local uint* share, __global hash128_t const* g_dag, uint isolate)
+{
+ uint4 mix = init;
+
+ // share init0
+ if (thread_id == 0)
+ *share = mix.x;
+ barrier(CLK_LOCAL_MEM_FENCE);
+ uint init0 = *share;
+
+ uint a = 0;
+ do
+ {
+ bool update_share = thread_id == (a/4) % THREADS_PER_HASH;
+
+ #pragma unroll
+ for (uint i = 0; i != 4; ++i)
+ {
+ if (update_share)
+ {
+ uint m[4] = { mix.x, mix.y, mix.z, mix.w };
+ *share = fnv(init0 ^ (a+i), m[i]) % DAG_SIZE;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ mix = fnv4(mix, g_dag[*share].uint4s[thread_id]);
+ }
+ }
+ while ((a += 4) != (ACCESSES & isolate));
+
+ return fnv_reduce(mix);
+}
+
+
+hash32_t final_hash(hash64_t const* init, hash32_t const* mix, uint isolate)
+{
+ ulong state[25];
+
+ hash32_t hash;
+ uint const hash_size = countof(hash.ulongs);
+ uint const init_size = countof(init->ulongs);
+ uint const mix_size = countof(mix->ulongs);
+
+ // keccak_256(keccak_512(header..nonce) .. mix);
+ copy(state, init->ulongs, init_size);
+ copy(state + init_size, mix->ulongs, mix_size);
+ keccak_f1600_no_absorb(state, init_size+mix_size, hash_size, isolate);
+
+ // copy out
+ copy(hash.ulongs, state, hash_size);
+ return hash;
+}
+
+hash32_t compute_hash_simple(
+ __constant hash32_t const* g_header,
+ __global hash128_t const* g_dag,
+ ulong nonce,
+ uint isolate
+ )
+{
+ hash64_t init = init_hash(g_header, nonce, isolate);
+
+ hash128_t mix;
+ for (uint i = 0; i != countof(mix.uint4s); ++i)
+ {
+ mix.uint4s[i] = init.uint4s[i % countof(init.uint4s)];
+ }
+
+ uint mix_val = mix.uints[0];
+ uint init0 = mix.uints[0];
+ uint a = 0;
+ do
+ {
+ uint pi = fnv(init0 ^ a, mix_val) % DAG_SIZE;
+ uint n = (a+1) % countof(mix.uints);
+
+ #pragma unroll
+ for (uint i = 0; i != countof(mix.uints); ++i)
+ {
+ mix.uints[i] = fnv(mix.uints[i], g_dag[pi].uints[i]);
+ mix_val = i == n ? mix.uints[i] : mix_val;
+ }
+ }
+ while (++a != (ACCESSES & isolate));
+
+ // reduce to output
+ hash32_t fnv_mix;
+ for (uint i = 0; i != countof(fnv_mix.uints); ++i)
+ {
+ fnv_mix.uints[i] = fnv_reduce(mix.uint4s[i]);
+ }
+
+ return final_hash(&init, &fnv_mix, isolate);
+}
+
+typedef union
+{
+ struct
+ {
+ hash64_t init;
+ uint pad; // avoid lds bank conflicts
+ };
+ hash32_t mix;
+} compute_hash_share;
+
+
+hash32_t compute_hash(
+ __local compute_hash_share* share,
+ __constant hash32_t const* g_header,
+ __global hash128_t const* g_dag,
+ ulong nonce,
+ uint isolate
+ )
+{
+ uint const gid = get_global_id(0);
+
+ // Compute one init hash per work item.
+ hash64_t init = init_hash(g_header, nonce, isolate);
+
+ // Threads work together in this phase in groups of 8.
+ uint const thread_id = gid % THREADS_PER_HASH;
+ uint const hash_id = (gid % GROUP_SIZE) / THREADS_PER_HASH;
+
+ hash32_t mix;
+ uint i = 0;
+ do
+ {
+ // share init with other threads
+ if (i == thread_id)
+ share[hash_id].init = init;
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ uint4 thread_init = share[hash_id].init.uint4s[thread_id % (64 / sizeof(uint4))];
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ uint thread_mix = inner_loop(thread_init, thread_id, share[hash_id].mix.uints, g_dag, isolate);
+
+ share[hash_id].mix.uints[thread_id] = thread_mix;
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (i == thread_id)
+ mix = share[hash_id].mix;
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+ while (++i != (THREADS_PER_HASH & isolate));
+
+ return final_hash(&init, &mix, isolate);
+}
+
+
+hash32_t compute_hash_chunks(
+ __local compute_hash_share* share,
+ __constant hash32_t const* g_header,
+ __global hash128_t const* g_dag,
+ __global hash128_t const* g_dag1,
+ __global hash128_t const* g_dag2,
+ __global hash128_t const* g_dag3,
+ ulong nonce,
+ uint isolate
+ )
+{
+ uint const gid = get_global_id(0);
+
+ // Compute one init hash per work item.
+ hash64_t init = init_hash(g_header, nonce, isolate);
+
+ // Threads work together in this phase in groups of 8.
+ uint const thread_id = gid % THREADS_PER_HASH;
+ uint const hash_id = (gid % GROUP_SIZE) / THREADS_PER_HASH;
+
+ hash32_t mix;
+ uint i = 0;
+ do
+ {
+ // share init with other threads
+ if (i == thread_id)
+ share[hash_id].init = init;
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ uint4 thread_init = share[hash_id].init.uint4s[thread_id % (64 / sizeof(uint4))];
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ uint thread_mix = inner_loop_chunks(thread_init, thread_id, share[hash_id].mix.uints, g_dag, g_dag1, g_dag2, g_dag3, isolate);
+
+ share[hash_id].mix.uints[thread_id] = thread_mix;
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (i == thread_id)
+ mix = share[hash_id].mix;
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+ while (++i != (THREADS_PER_HASH & isolate));
+
+ return final_hash(&init, &mix, isolate);
+}
+
+__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
+__kernel void ethash_hash_simple(
+ __global hash32_t* g_hashes,
+ __constant hash32_t const* g_header,
+ __global hash128_t const* g_dag,
+ ulong start_nonce,
+ uint isolate
+ )
+{
+ uint const gid = get_global_id(0);
+ g_hashes[gid] = compute_hash_simple(g_header, g_dag, start_nonce + gid, isolate);
+}
+
+__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
+__kernel void ethash_search_simple(
+ __global volatile uint* restrict g_output,
+ __constant hash32_t const* g_header,
+ __global hash128_t const* g_dag,
+ ulong start_nonce,
+ ulong target,
+ uint isolate
+ )
+{
+ uint const gid = get_global_id(0);
+ hash32_t hash = compute_hash_simple(g_header, g_dag, start_nonce + gid, isolate);
+
+ if (hash.ulongs[countof(hash.ulongs)-1] < target)
+ {
+ uint slot = min(convert_uint(MAX_OUTPUTS), convert_uint(atomic_inc(&g_output[0]) + 1));
+ g_output[slot] = gid;
+ }
+}
+
+
+__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
+__kernel void ethash_hash(
+ __global hash32_t* g_hashes,
+ __constant hash32_t const* g_header,
+ __global hash128_t const* g_dag,
+ ulong start_nonce,
+ uint isolate
+ )
+{
+ __local compute_hash_share share[HASHES_PER_LOOP];
+
+ uint const gid = get_global_id(0);
+ g_hashes[gid] = compute_hash(share, g_header, g_dag, start_nonce + gid, isolate);
+}
+
+__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
+__kernel void ethash_search(
+ __global volatile uint* restrict g_output,
+ __constant hash32_t const* g_header,
+ __global hash128_t const* g_dag,
+ ulong start_nonce,
+ ulong target,
+ uint isolate
+ )
+{
+ __local compute_hash_share share[HASHES_PER_LOOP];
+
+ uint const gid = get_global_id(0);
+ hash32_t hash = compute_hash(share, g_header, g_dag, start_nonce + gid, isolate);
+
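+ // as_uchar8(...).s76543210 reverses the byte order of the hash's first 8
+ // bytes, i.e. its big-endian top 64 bits; this matches the host-side
+ // target64 = target256 >> 192 computed in Search.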
+ if (as_ulong(as_uchar8(hash.ulongs[0]).s76543210) < target)
+ {
+ uint slot = min((uint)MAX_OUTPUTS, atomic_inc(&g_output[0]) + 1);
+ g_output[slot] = gid;
+ }
+}
+
+__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
+__kernel void ethash_hash_chunks(
+ __global hash32_t* g_hashes,
+ __constant hash32_t const* g_header,
+ __global hash128_t const* g_dag,
+ __global hash128_t const* g_dag1,
+ __global hash128_t const* g_dag2,
+ __global hash128_t const* g_dag3,
+ ulong start_nonce,
+ uint isolate
+ )
+{
+ __local compute_hash_share share[HASHES_PER_LOOP];
+
+ uint const gid = get_global_id(0);
+ g_hashes[gid] = compute_hash_chunks(share, g_header, g_dag, g_dag1, g_dag2, g_dag3,start_nonce + gid, isolate);
+}
+
+__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
+__kernel void ethash_search_chunks(
+ __global volatile uint* restrict g_output,
+ __constant hash32_t const* g_header,
+ __global hash128_t const* g_dag,
+ __global hash128_t const* g_dag1,
+ __global hash128_t const* g_dag2,
+ __global hash128_t const* g_dag3,
+ ulong start_nonce,
+ ulong target,
+ uint isolate
+ )
+{
+ __local compute_hash_share share[HASHES_PER_LOOP];
+
+ uint const gid = get_global_id(0);
+ hash32_t hash = compute_hash_chunks(share, g_header, g_dag, g_dag1, g_dag2, g_dag3, start_nonce + gid, isolate);
+
+ if (as_ulong(as_uchar8(hash.ulongs[0]).s76543210) < target)
+ {
+ uint slot = min(convert_uint(MAX_OUTPUTS), convert_uint(atomic_inc(&g_output[0]) + 1));
+ g_output[slot] = gid;
+ }
+}
+`
diff --git a/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_test.go b/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_test.go
index 1e1de989d..c19e45d1d 100644
--- a/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_test.go
+++ b/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_test.go
@@ -92,7 +92,7 @@ func TestEthashConcurrentVerify(t *testing.T) {
defer os.RemoveAll(eth.Full.Dir)
block := &testBlock{difficulty: big.NewInt(10)}
- nonce, md := eth.Search(block, nil)
+ nonce, md := eth.Search(block, nil, 0)
block.nonce = nonce
block.mixDigest = common.BytesToHash(md)
@@ -135,7 +135,7 @@ func TestEthashConcurrentSearch(t *testing.T) {
// launch n searches concurrently.
for i := 0; i < nsearch; i++ {
go func() {
- nonce, md := eth.Search(block, stop)
+ nonce, md := eth.Search(block, stop, 0)
select {
case found <- searchRes{n: nonce, md: md}:
case <-stop:
@@ -167,7 +167,7 @@ func TestEthashSearchAcrossEpoch(t *testing.T) {
for i := epochLength - 40; i < epochLength+40; i++ {
block := &testBlock{number: i, difficulty: big.NewInt(90)}
rand.Read(block.hashNoNonce[:])
- nonce, md := eth.Search(block, nil)
+ nonce, md := eth.Search(block, nil, 0)
block.nonce = nonce
block.mixDigest = common.BytesToHash(md)
if !eth.Verify(block) {