diff options
Diffstat (limited to 'Godeps/_workspace/src/github.com/ethereum/ethash/ethash_opencl.go')
-rw-r--r-- | Godeps/_workspace/src/github.com/ethereum/ethash/ethash_opencl.go | 629 |
1 files changed, 629 insertions, 0 deletions
diff --git a/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_opencl.go b/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_opencl.go new file mode 100644 index 000000000..332b7f524 --- /dev/null +++ b/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_opencl.go @@ -0,0 +1,629 @@ +// Copyright 2014 The go-ethereum Authors +// This file is part of the go-ethereum library. +// +// The go-ethereum library is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// The go-ethereum library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>. + +// +build opencl + +package ethash + +//#cgo LDFLAGS: -w +//#include <stdint.h> +//#include <string.h> +//#include "src/libethash/internal.h" +import "C" + +import ( + crand "crypto/rand" + "encoding/binary" + "fmt" + "math" + "math/big" + mrand "math/rand" + "strconv" + "strings" + "sync" + "sync/atomic" + "time" + "unsafe" + + "github.com/Gustav-Simonsson/go-opencl/cl" + "github.com/ethereum/go-ethereum/common" + "github.com/ethereum/go-ethereum/pow" +) + +/* + + This code have two main entry points: + + 1. The initCL(...) function configures one or more OpenCL device + (for now only GPU) and loads the Ethash DAG onto device memory + + 2. The Search(...) function loads a Ethash nonce into device(s) memory and + executes the Ethash OpenCL kernel. + + Throughout the code, we refer to "host memory" and "device memory". + For most systems (e.g. regular PC GPU miner) the host memory is RAM and + device memory is the GPU global memory (e.g. GDDR5). + + References mentioned in code comments: + + 1. https://github.com/ethereum/wiki/wiki/Ethash + 2. https://github.com/ethereum/cpp-ethereum/blob/develop/libethash-cl/ethash_cl_miner.cpp + 3. https://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/ + 4. http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2013/12/AMD_OpenCL_Programming_User_Guide.pdf + +*/ + +type OpenCLDevice struct { + deviceId int + device *cl.Device + openCL11 bool // OpenCL version 1.1 and 1.2 are handled a bit different + openCL12 bool + + dagBuf *cl.MemObject // Ethash full DAG in device mem + headerBuf *cl.MemObject // Hash of block-to-mine in device mem + searchBuffers []*cl.MemObject + + searchKernel *cl.Kernel + hashKernel *cl.Kernel + + queue *cl.CommandQueue + ctx *cl.Context + workGroupSize int + + nonceRand *mrand.Rand // seeded by crypto/rand, see comments where it's initialised + result common.Hash +} + +type OpenCLMiner struct { + mu sync.Mutex + + ethash *Ethash // Ethash full DAG & cache in host mem + + deviceIds []int + devices []*OpenCLDevice + + dagSize uint64 + + hashRate int32 // Go atomics & uint64 have some issues; int32 is supported on all platforms +} + +type pendingSearch struct { + bufIndex uint32 + startNonce uint64 +} + +const ( + SIZEOF_UINT32 = 4 + + // See [1] + ethashMixBytesLen = 128 + ethashAccesses = 64 + + // See [4] + workGroupSize = 32 // must be multiple of 8 + maxSearchResults = 63 + searchBufSize = 2 + globalWorkSize = 1024 * 256 +) + +func NewCL(deviceIds []int) *OpenCLMiner { + ids := make([]int, len(deviceIds)) + copy(ids, deviceIds) + return &OpenCLMiner{ + ethash: New(), + dagSize: 0, // to see if we need to update DAG. + deviceIds: ids, + } +} + +func PrintDevices() { + fmt.Println("=============================================") + fmt.Println("============ OpenCL Device Info =============") + fmt.Println("=============================================") + + var found []*cl.Device + + platforms, err := cl.GetPlatforms() + if err != nil { + fmt.Println("Plaform error (check your OpenCL installation): %v", err) + return + } + + for i, p := range platforms { + fmt.Println("Platform id ", i) + fmt.Println("Platform Name ", p.Name()) + fmt.Println("Platform Vendor ", p.Vendor()) + fmt.Println("Platform Version ", p.Version()) + fmt.Println("Platform Extensions ", p.Extensions()) + fmt.Println("Platform Profile ", p.Profile()) + fmt.Println("") + + devices, err := cl.GetDevices(p, cl.DeviceTypeGPU) + if err != nil { + fmt.Println("Device error (check your GPU drivers) :", err) + return + } + + for _, d := range devices { + fmt.Println("Device OpenCL id ", i) + fmt.Println("Device id for mining ", len(found)) + fmt.Println("Device Name ", d.Name()) + fmt.Println("Vendor ", d.Vendor()) + fmt.Println("Version ", d.Version()) + fmt.Println("Driver version ", d.DriverVersion()) + fmt.Println("Address bits ", d.AddressBits()) + fmt.Println("Max clock freq ", d.MaxClockFrequency()) + fmt.Println("Global mem size ", d.GlobalMemSize()) + fmt.Println("Max constant buffer size", d.MaxConstantBufferSize()) + fmt.Println("Max mem alloc size ", d.MaxMemAllocSize()) + fmt.Println("Max compute units ", d.MaxComputeUnits()) + fmt.Println("Max work group size ", d.MaxWorkGroupSize()) + fmt.Println("Max work item sizes ", d.MaxWorkItemSizes()) + fmt.Println("=============================================") + + found = append(found, d) + } + } + if len(found) == 0 { + fmt.Println("Found no GPU(s). Check that your OS can see the GPU(s)") + } else { + var idsFormat string + for i := 0; i < len(found); i++ { + idsFormat += strconv.Itoa(i) + if i != len(found)-1 { + idsFormat += "," + } + } + fmt.Printf("Found %v devices. Benchmark first GPU: geth gpubench 0\n", len(found)) + fmt.Printf("Mine using all GPUs: geth --minegpu %v\n", idsFormat) + } +} + +// See [2]. We basically do the same here, but the Go OpenCL bindings +// are at a slightly higher abtraction level. +func InitCL(blockNum uint64, c *OpenCLMiner) error { + platforms, err := cl.GetPlatforms() + if err != nil { + return fmt.Errorf("Plaform error: %v\nCheck your OpenCL installation and then run geth gpuinfo", err) + } + + var devices []*cl.Device + for _, p := range platforms { + ds, err := cl.GetDevices(p, cl.DeviceTypeGPU) + if err != nil { + return fmt.Errorf("Devices error: %v\nCheck your GPU drivers and then run geth gpuinfo", err) + } + for _, d := range ds { + devices = append(devices, d) + } + } + + pow := New() + _ = pow.getDAG(blockNum) // generates DAG if we don't have it + pow.Light.getCache(blockNum) // and cache + + c.ethash = pow + dagSize := uint64(C.ethash_get_datasize(C.uint64_t(blockNum))) + c.dagSize = dagSize + + for _, id := range c.deviceIds { + if id > len(devices)-1 { + return fmt.Errorf("Device id not found. See available device ids with: geth gpuinfo") + } else { + err := initCLDevice(id, devices[id], c) + if err != nil { + return err + } + } + } + if len(c.devices) == 0 { + return fmt.Errorf("No GPU devices found") + } + return nil +} + +func initCLDevice(deviceId int, device *cl.Device, c *OpenCLMiner) error { + devMaxAlloc := uint64(device.MaxMemAllocSize()) + devGlobalMem := uint64(device.GlobalMemSize()) + + // TODO: more fine grained version logic + if device.Version() == "OpenCL 1.0" { + fmt.Println("Device OpenCL version not supported: ", device.Version()) + return fmt.Errorf("opencl version not supported") + } + + var cl11, cl12 bool + if device.Version() == "OpenCL 1.1" { + cl11 = true + } + if device.Version() == "OpenCL 1.2" { + cl12 = true + } + + // log warnings but carry on; some device drivers report inaccurate values + if c.dagSize > devGlobalMem { + fmt.Printf("WARNING: device memory may be insufficient: %v. DAG size: %v.\n", devGlobalMem, c.dagSize) + } + + if c.dagSize > devMaxAlloc { + fmt.Printf("WARNING: DAG size (%v) larger than device max memory allocation size (%v).\n", c.dagSize, devMaxAlloc) + fmt.Printf("You probably have to export GPU_MAX_ALLOC_PERCENT=95\n") + } + + fmt.Printf("Initialising device %v: %v\n", deviceId, device.Name()) + + context, err := cl.CreateContext([]*cl.Device{device}) + if err != nil { + return fmt.Errorf("failed creating context:", err) + } + + // TODO: test running with CL_QUEUE_PROFILING_ENABLE for profiling? + queue, err := context.CreateCommandQueue(device, 0) + if err != nil { + return fmt.Errorf("command queue err:", err) + } + + // See [4] section 3.2 and [3] "clBuildProgram". + // The OpenCL kernel code is compiled at run-time. + kvs := make(map[string]string, 4) + kvs["GROUP_SIZE"] = strconv.FormatUint(workGroupSize, 10) + kvs["DAG_SIZE"] = strconv.FormatUint(c.dagSize/ethashMixBytesLen, 10) + kvs["ACCESSES"] = strconv.FormatUint(ethashAccesses, 10) + kvs["MAX_OUTPUTS"] = strconv.FormatUint(maxSearchResults, 10) + kernelCode := replaceWords(kernel, kvs) + + program, err := context.CreateProgramWithSource([]string{kernelCode}) + if err != nil { + return fmt.Errorf("program err:", err) + } + + /* if using AMD OpenCL impl, you can set this to debug on x86 CPU device. + see AMD OpenCL programming guide section 4.2 + + export in shell before running: + export AMD_OCL_BUILD_OPTIONS_APPEND="-g -O0" + export CPU_MAX_COMPUTE_UNITS=1 + + buildOpts := "-g -cl-opt-disable" + + */ + buildOpts := "" + err = program.BuildProgram([]*cl.Device{device}, buildOpts) + if err != nil { + return fmt.Errorf("program build err:", err) + } + + var searchKernelName, hashKernelName string + searchKernelName = "ethash_search" + hashKernelName = "ethash_hash" + + searchKernel, err := program.CreateKernel(searchKernelName) + hashKernel, err := program.CreateKernel(hashKernelName) + if err != nil { + return fmt.Errorf("kernel err:", err) + } + + // TODO: when this DAG size appears, patch the Go bindings + // (context.go) to work with uint64 as size_t + if c.dagSize > math.MaxInt32 { + fmt.Println("DAG too large for allocation.") + return fmt.Errorf("DAG too large for alloc") + } + + // TODO: patch up Go bindings to work with size_t, will overflow if > maxint32 + // TODO: fuck. shit's gonna overflow around 2017-06-09 12:17:02 + dagBuf := *(new(*cl.MemObject)) + dagBuf, err = context.CreateEmptyBuffer(cl.MemReadOnly, int(c.dagSize)) + if err != nil { + return fmt.Errorf("allocating dag buf failed: ", err) + } + + // write DAG to device mem + dagPtr := unsafe.Pointer(c.ethash.Full.current.ptr.data) + _, err = queue.EnqueueWriteBuffer(dagBuf, true, 0, int(c.dagSize), dagPtr, nil) + if err != nil { + return fmt.Errorf("writing to dag buf failed: ", err) + } + + searchBuffers := make([]*cl.MemObject, searchBufSize) + for i := 0; i < searchBufSize; i++ { + searchBuff, err := context.CreateEmptyBuffer(cl.MemWriteOnly, (1+maxSearchResults)*SIZEOF_UINT32) + if err != nil { + return fmt.Errorf("search buffer err:", err) + } + searchBuffers[i] = searchBuff + } + + headerBuf, err := context.CreateEmptyBuffer(cl.MemReadOnly, 32) + if err != nil { + return fmt.Errorf("header buffer err:", err) + } + + // Unique, random nonces are crucial for mining efficieny. + // While we do not need cryptographically secure PRNG for nonces, + // we want to have uniform distribution and minimal repetition of nonces. + // We could guarantee strict uniqueness of nonces by generating unique ranges, + // but a int64 seed from crypto/rand should be good enough. + // we then use math/rand for speed and to avoid draining OS entropy pool + seed, err := crand.Int(crand.Reader, big.NewInt(math.MaxInt64)) + if err != nil { + return err + } + nonceRand := mrand.New(mrand.NewSource(seed.Int64())) + + deviceStruct := &OpenCLDevice{ + deviceId: deviceId, + device: device, + openCL11: cl11, + openCL12: cl12, + + dagBuf: dagBuf, + headerBuf: headerBuf, + searchBuffers: searchBuffers, + + searchKernel: searchKernel, + hashKernel: hashKernel, + + queue: queue, + ctx: context, + + workGroupSize: workGroupSize, + + nonceRand: nonceRand, + } + c.devices = append(c.devices, deviceStruct) + + return nil +} + +func (c *OpenCLMiner) Search(block pow.Block, stop <-chan struct{}, index int) (uint64, []byte) { + c.mu.Lock() + newDagSize := uint64(C.ethash_get_datasize(C.uint64_t(block.NumberU64()))) + if newDagSize > c.dagSize { + // TODO: clean up buffers from previous DAG? + err := InitCL(block.NumberU64(), c) + if err != nil { + fmt.Println("OpenCL init error: ", err) + return 0, []byte{0} + } + } + defer c.mu.Unlock() + + // Avoid unneeded OpenCL initialisation if we received stop while running InitCL + select { + case <-stop: + return 0, []byte{0} + default: + } + + headerHash := block.HashNoNonce() + diff := block.Difficulty() + target256 := new(big.Int).Div(maxUint256, diff) + target64 := new(big.Int).Rsh(target256, 192).Uint64() + var zero uint32 = 0 + + d := c.devices[index] + + _, err := d.queue.EnqueueWriteBuffer(d.headerBuf, false, 0, 32, unsafe.Pointer(&headerHash[0]), nil) + if err != nil { + fmt.Println("Error in Search clEnqueueWriterBuffer : ", err) + return 0, []byte{0} + } + + for i := 0; i < searchBufSize; i++ { + _, err := d.queue.EnqueueWriteBuffer(d.searchBuffers[i], false, 0, 4, unsafe.Pointer(&zero), nil) + if err != nil { + fmt.Println("Error in Search clEnqueueWriterBuffer : ", err) + return 0, []byte{0} + } + } + + // wait for all search buffers to complete + err = d.queue.Finish() + if err != nil { + fmt.Println("Error in Search clFinish : ", err) + return 0, []byte{0} + } + + err = d.searchKernel.SetArg(1, d.headerBuf) + if err != nil { + fmt.Println("Error in Search clSetKernelArg : ", err) + return 0, []byte{0} + } + + err = d.searchKernel.SetArg(2, d.dagBuf) + if err != nil { + fmt.Println("Error in Search clSetKernelArg : ", err) + return 0, []byte{0} + } + + err = d.searchKernel.SetArg(4, target64) + if err != nil { + fmt.Println("Error in Search clSetKernelArg : ", err) + return 0, []byte{0} + } + err = d.searchKernel.SetArg(5, uint32(math.MaxUint32)) + if err != nil { + fmt.Println("Error in Search clSetKernelArg : ", err) + return 0, []byte{0} + } + + // wait on this before returning + var preReturnEvent *cl.Event + if d.openCL12 { + preReturnEvent, err = d.ctx.CreateUserEvent() + if err != nil { + fmt.Println("Error in Search create CL user event : ", err) + return 0, []byte{0} + } + } + + pending := make([]pendingSearch, 0, searchBufSize) + var p *pendingSearch + searchBufIndex := uint32(0) + var checkNonce uint64 + loops := int64(0) + prevHashRate := int32(0) + start := time.Now().UnixNano() + // we grab a single random nonce and sets this as argument to the kernel search function + // the device will then add each local threads gid to the nonce, creating a unique nonce + // for each device computing unit executing in parallel + initNonce := uint64(d.nonceRand.Int63()) + for nonce := initNonce; ; nonce += uint64(globalWorkSize) { + select { + case <-stop: + + /* + if d.openCL12 { + err = cl.WaitForEvents([]*cl.Event{preReturnEvent}) + if err != nil { + fmt.Println("Error in Search WaitForEvents: ", err) + } + } + */ + + atomic.AddInt32(&c.hashRate, -prevHashRate) + return 0, []byte{0} + default: + } + + if (loops % (1 << 7)) == 0 { + elapsed := time.Now().UnixNano() - start + // TODO: verify if this is correct hash rate calculation + hashes := (float64(1e9) / float64(elapsed)) * float64(loops*1024*256) + hashrateDiff := int32(hashes) - prevHashRate + prevHashRate = int32(hashes) + atomic.AddInt32(&c.hashRate, hashrateDiff) + } + loops++ + + err = d.searchKernel.SetArg(0, d.searchBuffers[searchBufIndex]) + if err != nil { + fmt.Println("Error in Search clSetKernelArg : ", err) + return 0, []byte{0} + } + err = d.searchKernel.SetArg(3, nonce) + if err != nil { + fmt.Println("Error in Search clSetKernelArg : ", err) + return 0, []byte{0} + } + + // execute kernel + _, err := d.queue.EnqueueNDRangeKernel( + d.searchKernel, + []int{0}, + []int{globalWorkSize}, + []int{d.workGroupSize}, + nil) + if err != nil { + fmt.Println("Error in Search clEnqueueNDRangeKernel : ", err) + return 0, []byte{0} + } + + pending = append(pending, pendingSearch{bufIndex: searchBufIndex, startNonce: nonce}) + searchBufIndex = (searchBufIndex + 1) % searchBufSize + + if len(pending) == searchBufSize { + p = &(pending[searchBufIndex]) + cres, _, err := d.queue.EnqueueMapBuffer(d.searchBuffers[p.bufIndex], true, + cl.MapFlagRead, 0, (1+maxSearchResults)*SIZEOF_UINT32, + nil) + if err != nil { + fmt.Println("Error in Search clEnqueueMapBuffer: ", err) + return 0, []byte{0} + } + + results := cres.ByteSlice() + nfound := binary.LittleEndian.Uint32(results) + nfound = uint32(math.Min(float64(nfound), float64(maxSearchResults))) + // OpenCL returns the offsets from the start nonce + for i := uint32(0); i < nfound; i++ { + lo := (i + 1) * SIZEOF_UINT32 + hi := (i + 2) * SIZEOF_UINT32 + upperNonce := uint64(binary.LittleEndian.Uint32(results[lo:hi])) + checkNonce = p.startNonce + upperNonce + if checkNonce != 0 { + cn := C.uint64_t(checkNonce) + ds := C.uint64_t(c.dagSize) + // We verify that the nonce is indeed a solution by + // executing the Ethash verification function (on the CPU). + ret := C.ethash_light_compute_internal(c.ethash.Light.current.ptr, ds, hashToH256(headerHash), cn) + // TODO: return result first + if ret.success && h256ToHash(ret.result).Big().Cmp(target256) <= 0 { + _, err = d.queue.EnqueueUnmapMemObject(d.searchBuffers[p.bufIndex], cres, nil) + if err != nil { + fmt.Println("Error in Search clEnqueueUnmapMemObject: ", err) + } + if d.openCL12 { + err = cl.WaitForEvents([]*cl.Event{preReturnEvent}) + if err != nil { + fmt.Println("Error in Search WaitForEvents: ", err) + } + } + return checkNonce, C.GoBytes(unsafe.Pointer(&ret.mix_hash), C.int(32)) + } + + _, err := d.queue.EnqueueWriteBuffer(d.searchBuffers[p.bufIndex], false, 0, 4, unsafe.Pointer(&zero), nil) + if err != nil { + fmt.Println("Error in Search cl: EnqueueWriteBuffer", err) + return 0, []byte{0} + } + } + } + _, err = d.queue.EnqueueUnmapMemObject(d.searchBuffers[p.bufIndex], cres, nil) + if err != nil { + fmt.Println("Error in Search clEnqueueUnMapMemObject: ", err) + return 0, []byte{0} + } + pending = append(pending[:searchBufIndex], pending[searchBufIndex+1:]...) + } + } + if d.openCL12 { + err := cl.WaitForEvents([]*cl.Event{preReturnEvent}) + if err != nil { + fmt.Println("Error in Search clWaitForEvents: ", err) + return 0, []byte{0} + } + } + return 0, []byte{0} +} + +func (c *OpenCLMiner) Verify(block pow.Block) bool { + return c.ethash.Light.Verify(block) +} +func (c *OpenCLMiner) GetHashrate() int64 { + return int64(atomic.LoadInt32(&c.hashRate)) +} +func (c *OpenCLMiner) Turbo(on bool) { + // This is GPU mining. Always be turbo. +} + +func replaceWords(text string, kvs map[string]string) string { + for k, v := range kvs { + text = strings.Replace(text, k, v, -1) + } + return text +} + +func logErr(err error) { + if err != nil { + fmt.Println("Error in OpenCL call:", err) + } +} + +func argErr(err error) error { + return fmt.Errorf("arg err: %v", err) +} |