1 files changed, 629 insertions, 0 deletions
diff --git a/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_opencl.go b/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_opencl.go
new file mode 100644
index 000000000..332b7f524
--- /dev/null
+++ b/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_opencl.go
@@ -0,0 +1,629 @@
+// Copyright 2014 The go-ethereum Authors
+// This file is part of the go-ethereum library.
+//
+// The go-ethereum library is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// The go-ethereum library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
+
+// +build opencl
+
+package ethash
+
+//#cgo LDFLAGS: -w
+//#include <stdint.h>
+//#include <string.h>
+//#include "src/libethash/internal.h"
+import "C"
+
+import (
+	crand "crypto/rand"
+	"encoding/binary"
+	"fmt"
+	"math"
+	"math/big"
+	mrand "math/rand"
+	"strconv"
+	"strings"
+	"sync"
+	"sync/atomic"
+	"time"
+	"unsafe"
+
+	"github.com/Gustav-Simonsson/go-opencl/cl"
+	"github.com/ethereum/go-ethereum/common"
+	"github.com/ethereum/go-ethereum/pow"
+)
+
+/*
+
+  This code have two main entry points:
+
+  1. The initCL(...)  function configures one or more OpenCL device
+     (for now only GPU) and loads the Ethash DAG onto device memory
+
+  2. The Search(...) function loads a Ethash nonce into device(s) memory and
+     executes the Ethash OpenCL kernel.
+
+  Throughout the code, we refer to "host memory" and "device memory".
+  For most systems (e.g. regular PC GPU miner) the host memory is RAM and
+  device memory is the GPU global memory (e.g. GDDR5).
+
+  References mentioned in code comments:
+
+  1. https://github.com/ethereum/wiki/wiki/Ethash
+  2. https://github.com/ethereum/cpp-ethereum/blob/develop/libethash-cl/ethash_cl_miner.cpp
+  3. https://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/
+  4. http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2013/12/AMD_OpenCL_Programming_User_Guide.pdf
+
+*/
+
+type OpenCLDevice struct {
+	deviceId int
+	device   *cl.Device
+	openCL11 bool // OpenCL version 1.1 and 1.2 are handled a bit different
+	openCL12 bool
+
+	dagBuf        *cl.MemObject // Ethash full DAG in device mem
+	headerBuf     *cl.MemObject // Hash of block-to-mine in device mem
+	searchBuffers []*cl.MemObject
+
+	searchKernel *cl.Kernel
+	hashKernel   *cl.Kernel
+
+	queue         *cl.CommandQueue
+	ctx           *cl.Context
+	workGroupSize int
+
+	nonceRand *mrand.Rand // seeded by crypto/rand, see comments where it's initialised
+	result    common.Hash
+}
+
+type OpenCLMiner struct {
+	mu sync.Mutex
+
+	ethash *Ethash // Ethash full DAG & cache in host mem
+
+	deviceIds []int
+	devices   []*OpenCLDevice
+
+	dagSize uint64
+
+	hashRate int32 // Go atomics & uint64 have some issues; int32 is supported on all platforms
+}
+
+type pendingSearch struct {
+	bufIndex   uint32
+	startNonce uint64
+}
+
+const (
+	SIZEOF_UINT32 = 4
+
+	// See [1]
+	ethashMixBytesLen = 128
+	ethashAccesses    = 64
+
+	// See [4]
+	workGroupSize    = 32 // must be multiple of 8
+	maxSearchResults = 63
+	searchBufSize    = 2
+	globalWorkSize   = 1024 * 256
+)
+
+func NewCL(deviceIds []int) *OpenCLMiner {
+	ids := make([]int, len(deviceIds))
+	copy(ids, deviceIds)
+	return &OpenCLMiner{
+		ethash:    New(),
+		dagSize:   0, // to see if we need to update DAG.
+		deviceIds: ids,
+	}
+}
+
+func PrintDevices() {
+	fmt.Println("=============================================")
+	fmt.Println("============ OpenCL Device Info =============")
+	fmt.Println("=============================================")
+
+	var found []*cl.Device
+
+	platforms, err := cl.GetPlatforms()
+	if err != nil {
+		fmt.Println("Plaform error (check your OpenCL installation): %v", err)
+		return
+	}
+
+	for i, p := range platforms {
+		fmt.Println("Platform id             ", i)
+		fmt.Println("Platform Name           ", p.Name())
+		fmt.Println("Platform Vendor         ", p.Vendor())
+		fmt.Println("Platform Version        ", p.Version())
+		fmt.Println("Platform Extensions     ", p.Extensions())
+		fmt.Println("Platform Profile        ", p.Profile())
+		fmt.Println("")
+
+		devices, err := cl.GetDevices(p, cl.DeviceTypeGPU)
+		if err != nil {
+			fmt.Println("Device error (check your GPU drivers) :", err)
+			return
+		}
+
+		for _, d := range devices {
+			fmt.Println("Device OpenCL id        ", i)
+			fmt.Println("Device id for mining    ", len(found))
+			fmt.Println("Device Name             ", d.Name())
+			fmt.Println("Vendor                  ", d.Vendor())
+			fmt.Println("Version                 ", d.Version())
+			fmt.Println("Driver version          ", d.DriverVersion())
+			fmt.Println("Address bits            ", d.AddressBits())
+			fmt.Println("Max clock freq          ", d.MaxClockFrequency())
+			fmt.Println("Global mem size         ", d.GlobalMemSize())
+			fmt.Println("Max constant buffer size", d.MaxConstantBufferSize())
+			fmt.Println("Max mem alloc size      ", d.MaxMemAllocSize())
+			fmt.Println("Max compute units       ", d.MaxComputeUnits())
+			fmt.Println("Max work group size     ", d.MaxWorkGroupSize())
+			fmt.Println("Max work item sizes     ", d.MaxWorkItemSizes())
+			fmt.Println("=============================================")
+
+			found = append(found, d)
+		}
+	}
+	if len(found) == 0 {
+		fmt.Println("Found no GPU(s). Check that your OS can see the GPU(s)")
+	} else {
+		var idsFormat string
+		for i := 0; i < len(found); i++ {
+			idsFormat += strconv.Itoa(i)
+			if i != len(found)-1 {
+				idsFormat += ","
+			}
+		}
+		fmt.Printf("Found %v devices. Benchmark first GPU:       geth gpubench 0\n", len(found))
+		fmt.Printf("Mine using all GPUs:                        geth --minegpu %v\n", idsFormat)
+	}
+}
+
+// See [2]. We basically do the same here, but the Go OpenCL bindings
+// are at a slightly higher abtraction level.
+func InitCL(blockNum uint64, c *OpenCLMiner) error {
+	platforms, err := cl.GetPlatforms()
+	if err != nil {
+		return fmt.Errorf("Plaform error: %v\nCheck your OpenCL installation and then run geth gpuinfo", err)
+	}
+
+	var devices []*cl.Device
+	for _, p := range platforms {
+		ds, err := cl.GetDevices(p, cl.DeviceTypeGPU)
+		if err != nil {
+			return fmt.Errorf("Devices error: %v\nCheck your GPU drivers and then run geth gpuinfo", err)
+		}
+		for _, d := range ds {
+			devices = append(devices, d)
+		}
+	}
+
+	pow := New()
+	_ = pow.getDAG(blockNum)     // generates DAG if we don't have it
+	pow.Light.getCache(blockNum) // and cache
+
+	c.ethash = pow
+	dagSize := uint64(C.ethash_get_datasize(C.uint64_t(blockNum)))
+	c.dagSize = dagSize
+
+	for _, id := range c.deviceIds {
+		if id > len(devices)-1 {
+			return fmt.Errorf("Device id not found. See available device ids with: geth gpuinfo")
+		} else {
+			err := initCLDevice(id, devices[id], c)
+			if err != nil {
+				return err
+			}
+		}
+	}
+	if len(c.devices) == 0 {
+		return fmt.Errorf("No GPU devices found")
+	}
+	return nil
+}
+
+func initCLDevice(deviceId int, device *cl.Device, c *OpenCLMiner) error {
+	devMaxAlloc := uint64(device.MaxMemAllocSize())
+	devGlobalMem := uint64(device.GlobalMemSize())
+
+	// TODO: more fine grained version logic
+	if device.Version() == "OpenCL 1.0" {
+		fmt.Println("Device OpenCL version not supported: ", device.Version())
+		return fmt.Errorf("opencl version not supported")
+	}
+
+	var cl11, cl12 bool
+	if device.Version() == "OpenCL 1.1" {
+		cl11 = true
+	}
+	if device.Version() == "OpenCL 1.2" {
+		cl12 = true
+	}
+
+	// log warnings but carry on; some device drivers report inaccurate values
+	if c.dagSize > devGlobalMem {
+		fmt.Printf("WARNING: device memory may be insufficient: %v. DAG size: %v.\n", devGlobalMem, c.dagSize)
+	}
+
+	if c.dagSize > devMaxAlloc {
+		fmt.Printf("WARNING: DAG size (%v) larger than device max memory allocation size (%v).\n", c.dagSize, devMaxAlloc)
+		fmt.Printf("You probably have to export GPU_MAX_ALLOC_PERCENT=95\n")
+	}
+
+	fmt.Printf("Initialising device %v: %v\n", deviceId, device.Name())
+
+	context, err := cl.CreateContext([]*cl.Device{device})
+	if err != nil {
+		return fmt.Errorf("failed creating context:", err)
+	}
+
+	// TODO: test running with CL_QUEUE_PROFILING_ENABLE for profiling?
+	queue, err := context.CreateCommandQueue(device, 0)
+	if err != nil {
+		return fmt.Errorf("command queue err:", err)
+	}
+
+	// See [4] section 3.2 and [3] "clBuildProgram".
+	// The OpenCL kernel code is compiled at run-time.
+	kvs := make(map[string]string, 4)
+	kvs["GROUP_SIZE"] = strconv.FormatUint(workGroupSize, 10)
+	kvs["DAG_SIZE"] = strconv.FormatUint(c.dagSize/ethashMixBytesLen, 10)
+	kvs["ACCESSES"] = strconv.FormatUint(ethashAccesses, 10)
+	kvs["MAX_OUTPUTS"] = strconv.FormatUint(maxSearchResults, 10)
+	kernelCode := replaceWords(kernel, kvs)
+
+	program, err := context.CreateProgramWithSource([]string{kernelCode})
+	if err != nil {
+		return fmt.Errorf("program err:", err)
+	}
+
+	/* if using AMD OpenCL impl, you can set this to debug on x86 CPU device.
+	   see AMD OpenCL programming guide section 4.2
+
+	   export in shell before running:
+	   export AMD_OCL_BUILD_OPTIONS_APPEND="-g -O0"
+	   export CPU_MAX_COMPUTE_UNITS=1
+
+	buildOpts := "-g -cl-opt-disable"
+
+	*/
+	buildOpts := ""
+	err = program.BuildProgram([]*cl.Device{device}, buildOpts)
+	if err != nil {
+		return fmt.Errorf("program build err:", err)
+	}
+
+	var searchKernelName, hashKernelName string
+	searchKernelName = "ethash_search"
+	hashKernelName = "ethash_hash"
+
+	searchKernel, err := program.CreateKernel(searchKernelName)
+	hashKernel, err := program.CreateKernel(hashKernelName)
+	if err != nil {
+		return fmt.Errorf("kernel err:", err)
+	}
+
+	// TODO: when this DAG size appears, patch the Go bindings
+	// (context.go) to work with uint64 as size_t
+	if c.dagSize > math.MaxInt32 {
+		fmt.Println("DAG too large for allocation.")
+		return fmt.Errorf("DAG too large for alloc")
+	}
+
+	// TODO: patch up Go bindings to work with size_t, will overflow if > maxint32
+	// TODO: fuck. shit's gonna overflow around 2017-06-09 12:17:02
+	dagBuf := *(new(*cl.MemObject))
+	dagBuf, err = context.CreateEmptyBuffer(cl.MemReadOnly, int(c.dagSize))
+	if err != nil {
+		return fmt.Errorf("allocating dag buf failed: ", err)
+	}
+
+	// write DAG to device mem
+	dagPtr := unsafe.Pointer(c.ethash.Full.current.ptr.data)
+	_, err = queue.EnqueueWriteBuffer(dagBuf, true, 0, int(c.dagSize), dagPtr, nil)
+	if err != nil {
+		return fmt.Errorf("writing to dag buf failed: ", err)
+	}
+
+	searchBuffers := make([]*cl.MemObject, searchBufSize)
+	for i := 0; i < searchBufSize; i++ {
+		searchBuff, err := context.CreateEmptyBuffer(cl.MemWriteOnly, (1+maxSearchResults)*SIZEOF_UINT32)
+		if err != nil {
+			return fmt.Errorf("search buffer err:", err)
+		}
+		searchBuffers[i] = searchBuff
+	}
+
+	headerBuf, err := context.CreateEmptyBuffer(cl.MemReadOnly, 32)
+	if err != nil {
+		return fmt.Errorf("header buffer err:", err)
+	}
+
+	// Unique, random nonces are crucial for mining efficieny.
+	// While we do not need cryptographically secure PRNG for nonces,
+	// we want to have uniform distribution and minimal repetition of nonces.
+	// We could guarantee strict uniqueness of nonces by generating unique ranges,
+	// but a int64 seed from crypto/rand should be good enough.
+	// we then use math/rand for speed and to avoid draining OS entropy pool
+	seed, err := crand.Int(crand.Reader, big.NewInt(math.MaxInt64))
+	if err != nil {
+		return err
+	}
+	nonceRand := mrand.New(mrand.NewSource(seed.Int64()))
+
+	deviceStruct := &OpenCLDevice{
+		deviceId: deviceId,
+		device:   device,
+		openCL11: cl11,
+		openCL12: cl12,
+
+		dagBuf:        dagBuf,
+		headerBuf:     headerBuf,
+		searchBuffers: searchBuffers,
+
+		searchKernel: searchKernel,
+		hashKernel:   hashKernel,
+
+		queue: queue,
+		ctx:   context,
+
+		workGroupSize: workGroupSize,
+
+		nonceRand: nonceRand,
+	}
+	c.devices = append(c.devices, deviceStruct)
+
+	return nil
+}
+
+func (c *OpenCLMiner) Search(block pow.Block, stop <-chan struct{}, index int) (uint64, []byte) {
+	c.mu.Lock()
+	newDagSize := uint64(C.ethash_get_datasize(C.uint64_t(block.NumberU64())))
+	if newDagSize > c.dagSize {
+		// TODO: clean up buffers from previous DAG?
+		err := InitCL(block.NumberU64(), c)
+		if err != nil {
+			fmt.Println("OpenCL init error: ", err)
+			return 0, []byte{0}
+		}
+	}
+	defer c.mu.Unlock()
+
+	// Avoid unneeded OpenCL initialisation if we received stop while running InitCL
+	select {
+	case <-stop:
+		return 0, []byte{0}
+	default:
+	}
+
+	headerHash := block.HashNoNonce()
+	diff := block.Difficulty()
+	target256 := new(big.Int).Div(maxUint256, diff)
+	target64 := new(big.Int).Rsh(target256, 192).Uint64()
+	var zero uint32 = 0
+
+	d := c.devices[index]
+
+	_, err := d.queue.EnqueueWriteBuffer(d.headerBuf, false, 0, 32, unsafe.Pointer(&headerHash[0]), nil)
+	if err != nil {
+		fmt.Println("Error in Search clEnqueueWriterBuffer : ", err)
+		return 0, []byte{0}
+	}
+
+	for i := 0; i < searchBufSize; i++ {
+		_, err := d.queue.EnqueueWriteBuffer(d.searchBuffers[i], false, 0, 4, unsafe.Pointer(&zero), nil)
+		if err != nil {
+			fmt.Println("Error in Search clEnqueueWriterBuffer : ", err)
+			return 0, []byte{0}
+		}
+	}
+
+	// wait for all search buffers to complete
+	err = d.queue.Finish()
+	if err != nil {
+		fmt.Println("Error in Search clFinish : ", err)
+		return 0, []byte{0}
+	}
+
+	err = d.searchKernel.SetArg(1, d.headerBuf)
+	if err != nil {
+		fmt.Println("Error in Search clSetKernelArg : ", err)
+		return 0, []byte{0}
+	}
+
+	err = d.searchKernel.SetArg(2, d.dagBuf)
+	if err != nil {
+		fmt.Println("Error in Search clSetKernelArg : ", err)
+		return 0, []byte{0}
+	}
+
+	err = d.searchKernel.SetArg(4, target64)
+	if err != nil {
+		fmt.Println("Error in Search clSetKernelArg : ", err)
+		return 0, []byte{0}
+	}
+	err = d.searchKernel.SetArg(5, uint32(math.MaxUint32))
+	if err != nil {
+		fmt.Println("Error in Search clSetKernelArg : ", err)
+		return 0, []byte{0}
+	}
+
+	// wait on this before returning
+	var preReturnEvent *cl.Event
+	if d.openCL12 {
+		preReturnEvent, err = d.ctx.CreateUserEvent()
+		if err != nil {
+			fmt.Println("Error in Search create CL user event : ", err)
+			return 0, []byte{0}
+		}
+	}
+
+	pending := make([]pendingSearch, 0, searchBufSize)
+	var p *pendingSearch
+	searchBufIndex := uint32(0)
+	var checkNonce uint64
+	loops := int64(0)
+	prevHashRate := int32(0)
+	start := time.Now().UnixNano()
+	// we grab a single random nonce and sets this as argument to the kernel search function
+	// the device will then add each local threads gid to the nonce, creating a unique nonce
+	// for each device computing unit executing in parallel
+	initNonce := uint64(d.nonceRand.Int63())
+	for nonce := initNonce; ; nonce += uint64(globalWorkSize) {
+		select {
+		case <-stop:
+
+			/*
+				if d.openCL12 {
+					err = cl.WaitForEvents([]*cl.Event{preReturnEvent})
+					if err != nil {
+						fmt.Println("Error in Search WaitForEvents: ", err)
+					}
+				}
+			*/
+
+			atomic.AddInt32(&c.hashRate, -prevHashRate)
+			return 0, []byte{0}
+		default:
+		}
+
+		if (loops % (1 << 7)) == 0 {
+			elapsed := time.Now().UnixNano() - start
+			// TODO: verify if this is correct hash rate calculation
+			hashes := (float64(1e9) / float64(elapsed)) * float64(loops*1024*256)
+			hashrateDiff := int32(hashes) - prevHashRate
+			prevHashRate = int32(hashes)
+			atomic.AddInt32(&c.hashRate, hashrateDiff)
+		}
+		loops++
+
+		err = d.searchKernel.SetArg(0, d.searchBuffers[searchBufIndex])
+		if err != nil {
+			fmt.Println("Error in Search clSetKernelArg : ", err)
+			return 0, []byte{0}
+		}
+		err = d.searchKernel.SetArg(3, nonce)
+		if err != nil {
+			fmt.Println("Error in Search clSetKernelArg : ", err)
+			return 0, []byte{0}
+		}
+
+		// execute kernel
+		_, err := d.queue.EnqueueNDRangeKernel(
+			d.searchKernel,
+			[]int{0},
+			[]int{globalWorkSize},
+			[]int{d.workGroupSize},
+			nil)
+		if err != nil {
+			fmt.Println("Error in Search clEnqueueNDRangeKernel : ", err)
+			return 0, []byte{0}
+		}
+
+		pending = append(pending, pendingSearch{bufIndex: searchBufIndex, startNonce: nonce})
+		searchBufIndex = (searchBufIndex + 1) % searchBufSize
+
+		if len(pending) == searchBufSize {
+			p = &(pending[searchBufIndex])
+			cres, _, err := d.queue.EnqueueMapBuffer(d.searchBuffers[p.bufIndex], true,
+				cl.MapFlagRead, 0, (1+maxSearchResults)*SIZEOF_UINT32,
+				nil)
+			if err != nil {
+				fmt.Println("Error in Search clEnqueueMapBuffer: ", err)
+				return 0, []byte{0}
+			}
+
+			results := cres.ByteSlice()
+			nfound := binary.LittleEndian.Uint32(results)
+			nfound = uint32(math.Min(float64(nfound), float64(maxSearchResults)))
+			// OpenCL returns the offsets from the start nonce
+			for i := uint32(0); i < nfound; i++ {
+				lo := (i + 1) * SIZEOF_UINT32
+				hi := (i + 2) * SIZEOF_UINT32
+				upperNonce := uint64(binary.LittleEndian.Uint32(results[lo:hi]))
+				checkNonce = p.startNonce + upperNonce
+				if checkNonce != 0 {
+					cn := C.uint64_t(checkNonce)
+					ds := C.uint64_t(c.dagSize)
+					// We verify that the nonce is indeed a solution by
+					// executing the Ethash verification function (on the CPU).
+					ret := C.ethash_light_compute_internal(c.ethash.Light.current.ptr, ds, hashToH256(headerHash), cn)
+					// TODO: return result first
+					if ret.success && h256ToHash(ret.result).Big().Cmp(target256) <= 0 {
+						_, err = d.queue.EnqueueUnmapMemObject(d.searchBuffers[p.bufIndex], cres, nil)
+						if err != nil {
+							fmt.Println("Error in Search clEnqueueUnmapMemObject: ", err)
+						}
+						if d.openCL12 {
+							err = cl.WaitForEvents([]*cl.Event{preReturnEvent})
+							if err != nil {
+								fmt.Println("Error in Search WaitForEvents: ", err)
+							}
+						}
+						return checkNonce, C.GoBytes(unsafe.Pointer(&ret.mix_hash), C.int(32))
+					}
+
+					_, err := d.queue.EnqueueWriteBuffer(d.searchBuffers[p.bufIndex], false, 0, 4, unsafe.Pointer(&zero), nil)
+					if err != nil {
+						fmt.Println("Error in Search cl: EnqueueWriteBuffer", err)
+						return 0, []byte{0}
+					}
+				}
+			}
+			_, err = d.queue.EnqueueUnmapMemObject(d.searchBuffers[p.bufIndex], cres, nil)
+			if err != nil {
+				fmt.Println("Error in Search clEnqueueUnMapMemObject: ", err)
+				return 0, []byte{0}
+			}
+			pending = append(pending[:searchBufIndex], pending[searchBufIndex+1:]...)
+		}
+	}
+	if d.openCL12 {
+		err := cl.WaitForEvents([]*cl.Event{preReturnEvent})
+		if err != nil {
+			fmt.Println("Error in Search clWaitForEvents: ", err)
+			return 0, []byte{0}
+		}
+	}
+	return 0, []byte{0}
+}
+
+func (c *OpenCLMiner) Verify(block pow.Block) bool {
+	return c.ethash.Light.Verify(block)
+}
+func (c *OpenCLMiner) GetHashrate() int64 {
+	return int64(atomic.LoadInt32(&c.hashRate))
+}
+func (c *OpenCLMiner) Turbo(on bool) {
+	// This is GPU mining. Always be turbo.
+}
+
+func replaceWords(text string, kvs map[string]string) string {
+	for k, v := range kvs {
+		text = strings.Replace(text, k, v, -1)
+	}
+	return text
+}
+
+func logErr(err error) {
+	if err != nil {
+		fmt.Println("Error in OpenCL call:", err)
+	}
+}
+
+func argErr(err error) error {
+	return fmt.Errorf("arg err: %v", err)
+}