aboutsummaryrefslogtreecommitdiffstats
path: root/vendor/github.com/tangerine-network/mcl/src/xbyak/xbyak_util.h
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/tangerine-network/mcl/src/xbyak/xbyak_util.h')
-rw-r--r--vendor/github.com/tangerine-network/mcl/src/xbyak/xbyak_util.h653
1 files changed, 653 insertions, 0 deletions
diff --git a/vendor/github.com/tangerine-network/mcl/src/xbyak/xbyak_util.h b/vendor/github.com/tangerine-network/mcl/src/xbyak/xbyak_util.h
new file mode 100644
index 000000000..01544501d
--- /dev/null
+++ b/vendor/github.com/tangerine-network/mcl/src/xbyak/xbyak_util.h
@@ -0,0 +1,653 @@
+#ifndef XBYAK_XBYAK_UTIL_H_
+#define XBYAK_XBYAK_UTIL_H_
+
+/**
+ utility class and functions for Xbyak
+ Xbyak::util::Clock ; rdtsc timer
+ Xbyak::util::Cpu ; detect CPU
+ @note this header is UNDER CONSTRUCTION!
+*/
+#include "xbyak.h"
+
+#ifdef _MSC_VER
+ #if (_MSC_VER < 1400) && defined(XBYAK32)
+ static inline __declspec(naked) void __cpuid(int[4], int)
+ {
+ __asm {
+ push ebx
+ push esi
+ mov eax, dword ptr [esp + 4 * 2 + 8] // eaxIn
+ cpuid
+ mov esi, dword ptr [esp + 4 * 2 + 4] // data
+ mov dword ptr [esi], eax
+ mov dword ptr [esi + 4], ebx
+ mov dword ptr [esi + 8], ecx
+ mov dword ptr [esi + 12], edx
+ pop esi
+ pop ebx
+ ret
+ }
+ }
+ #else
+ #include <intrin.h> // for __cpuid
+ #endif
+#else
+ #ifndef __GNUC_PREREQ
+ #define __GNUC_PREREQ(major, minor) ((((__GNUC__) << 16) + (__GNUC_MINOR__)) >= (((major) << 16) + (minor)))
+ #endif
+ #if __GNUC_PREREQ(4, 3) && !defined(__APPLE__)
+ #include <cpuid.h>
+ #else
+ #if defined(__APPLE__) && defined(XBYAK32) // avoid err : can't find a register in class `BREG' while reloading `asm'
+ #define __cpuid(eaxIn, a, b, c, d) __asm__ __volatile__("pushl %%ebx\ncpuid\nmovl %%ebp, %%esi\npopl %%ebx" : "=a"(a), "=S"(b), "=c"(c), "=d"(d) : "0"(eaxIn))
+ #define __cpuid_count(eaxIn, ecxIn, a, b, c, d) __asm__ __volatile__("pushl %%ebx\ncpuid\nmovl %%ebp, %%esi\npopl %%ebx" : "=a"(a), "=S"(b), "=c"(c), "=d"(d) : "0"(eaxIn), "2"(ecxIn))
+ #else
+ #define __cpuid(eaxIn, a, b, c, d) __asm__ __volatile__("cpuid\n" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(eaxIn))
+ #define __cpuid_count(eaxIn, ecxIn, a, b, c, d) __asm__ __volatile__("cpuid\n" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(eaxIn), "2"(ecxIn))
+ #endif
+ #endif
+#endif
+
+namespace Xbyak { namespace util {
+
+/**
+ CPU detection class
+*/
+class Cpu {
+ uint64 type_;
+ unsigned int get32bitAsBE(const char *x) const
+ {
+ return x[0] | (x[1] << 8) | (x[2] << 16) | (x[3] << 24);
+ }
+ unsigned int mask(int n) const
+ {
+ return (1U << n) - 1;
+ }
+ void setFamily()
+ {
+ unsigned int data[4];
+ getCpuid(1, data);
+ stepping = data[0] & mask(4);
+ model = (data[0] >> 4) & mask(4);
+ family = (data[0] >> 8) & mask(4);
+ // type = (data[0] >> 12) & mask(2);
+ extModel = (data[0] >> 16) & mask(4);
+ extFamily = (data[0] >> 20) & mask(8);
+ if (family == 0x0f) {
+ displayFamily = family + extFamily;
+ } else {
+ displayFamily = family;
+ }
+ if (family == 6 || family == 0x0f) {
+ displayModel = (extModel << 4) + model;
+ } else {
+ displayModel = model;
+ }
+ }
+ unsigned int extractBit(unsigned int val, unsigned int base, unsigned int end)
+ {
+ return (val >> base) & ((1u << (end - base)) - 1);
+ }
+ void setCacheHierarchy()
+ {
+ if ((type_ & tINTEL) == 0) return;
+ const unsigned int NO_CACHE = 0;
+ const unsigned int DATA_CACHE = 1;
+// const unsigned int INSTRUCTION_CACHE = 2;
+ const unsigned int UNIFIED_CACHE = 3;
+ unsigned int smt_width = 0;
+ unsigned int n_cores = 0;
+ unsigned int data[4];
+
+ /*
+ if leaf 11 exists, we use it to get the number of smt cores and cores on socket
+ If x2APIC is supported, these are the only correct numbers.
+
+ leaf 0xB can be zeroed-out by a hypervisor
+ */
+ getCpuidEx(0x0, 0, data);
+ if (data[0] >= 0xB) {
+ getCpuidEx(0xB, 0, data); // CPUID for SMT Level
+ smt_width = data[1] & 0x7FFF;
+ getCpuidEx(0xB, 1, data); // CPUID for CORE Level
+ n_cores = data[1] & 0x7FFF;
+ }
+
+ /*
+ Assumptions:
+ the first level of data cache is not shared (which is the
+ case for every existing architecture) and use this to
+ determine the SMT width for arch not supporting leaf 11.
+ when leaf 4 reports a number of core less than n_cores
+ on socket reported by leaf 11, then it is a correct number
+ of cores not an upperbound.
+ */
+ for (int i = 0; data_cache_levels < maxNumberCacheLevels; i++) {
+ getCpuidEx(0x4, i, data);
+ unsigned int cacheType = extractBit(data[0], 0, 4);
+ if (cacheType == NO_CACHE) break;
+ if (cacheType == DATA_CACHE || cacheType == UNIFIED_CACHE) {
+ unsigned int nb_logical_cores = extractBit(data[0], 14, 25) + 1;
+ if (n_cores != 0) { // true only if leaf 0xB is supported and valid
+ nb_logical_cores = (std::min)(nb_logical_cores, n_cores);
+ }
+ assert(nb_logical_cores != 0);
+ data_cache_size[data_cache_levels] =
+ (extractBit(data[1], 22, 31) + 1)
+ * (extractBit(data[1], 12, 21) + 1)
+ * (extractBit(data[1], 0, 11) + 1)
+ * (data[2] + 1);
+ if (cacheType == DATA_CACHE && smt_width == 0) smt_width = nb_logical_cores;
+ assert(smt_width != 0);
+ cores_sharing_data_cache[data_cache_levels] = (std::max)(nb_logical_cores / smt_width, 1u);
+ data_cache_levels++;
+ }
+ }
+ }
+
+public:
+ int model;
+ int family;
+ int stepping;
+ int extModel;
+ int extFamily;
+ int displayFamily; // family + extFamily
+ int displayModel; // model + extModel
+
+ // may I move these members into private?
+ static const unsigned int maxNumberCacheLevels = 10;
+ unsigned int data_cache_size[maxNumberCacheLevels];
+ unsigned int cores_sharing_data_cache[maxNumberCacheLevels];
+ unsigned int data_cache_levels;
+
+ unsigned int getDataCacheLevels() const { return data_cache_levels; }
+ unsigned int getCoresSharingDataCache(unsigned int i) const
+ {
+ if (i >= data_cache_levels) throw Error(ERR_BAD_PARAMETER);
+ return cores_sharing_data_cache[i];
+ }
+ unsigned int getDataCacheSize(unsigned int i) const
+ {
+ if (i >= data_cache_levels) throw Error(ERR_BAD_PARAMETER);
+ return data_cache_size[i];
+ }
+
+ /*
+ data[] = { eax, ebx, ecx, edx }
+ */
+ static inline void getCpuid(unsigned int eaxIn, unsigned int data[4])
+ {
+#ifdef _MSC_VER
+ __cpuid(reinterpret_cast<int*>(data), eaxIn);
+#else
+ __cpuid(eaxIn, data[0], data[1], data[2], data[3]);
+#endif
+ }
+ static inline void getCpuidEx(unsigned int eaxIn, unsigned int ecxIn, unsigned int data[4])
+ {
+#ifdef _MSC_VER
+ __cpuidex(reinterpret_cast<int*>(data), eaxIn, ecxIn);
+#else
+ __cpuid_count(eaxIn, ecxIn, data[0], data[1], data[2], data[3]);
+#endif
+ }
+ static inline uint64 getXfeature()
+ {
+#ifdef _MSC_VER
+ return _xgetbv(0);
+#else
+ unsigned int eax, edx;
+ // xgetvb is not support on gcc 4.2
+// __asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));
+ __asm__ volatile(".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(0));
+ return ((uint64)edx << 32) | eax;
+#endif
+ }
+ typedef uint64 Type;
+
+ static const Type NONE = 0;
+ static const Type tMMX = 1 << 0;
+ static const Type tMMX2 = 1 << 1;
+ static const Type tCMOV = 1 << 2;
+ static const Type tSSE = 1 << 3;
+ static const Type tSSE2 = 1 << 4;
+ static const Type tSSE3 = 1 << 5;
+ static const Type tSSSE3 = 1 << 6;
+ static const Type tSSE41 = 1 << 7;
+ static const Type tSSE42 = 1 << 8;
+ static const Type tPOPCNT = 1 << 9;
+ static const Type tAESNI = 1 << 10;
+ static const Type tSSE5 = 1 << 11;
+ static const Type tOSXSAVE = 1 << 12;
+ static const Type tPCLMULQDQ = 1 << 13;
+ static const Type tAVX = 1 << 14;
+ static const Type tFMA = 1 << 15;
+
+ static const Type t3DN = 1 << 16;
+ static const Type tE3DN = 1 << 17;
+ static const Type tSSE4a = 1 << 18;
+ static const Type tRDTSCP = 1 << 19;
+ static const Type tAVX2 = 1 << 20;
+ static const Type tBMI1 = 1 << 21; // andn, bextr, blsi, blsmsk, blsr, tzcnt
+ static const Type tBMI2 = 1 << 22; // bzhi, mulx, pdep, pext, rorx, sarx, shlx, shrx
+ static const Type tLZCNT = 1 << 23;
+
+ static const Type tINTEL = 1 << 24;
+ static const Type tAMD = 1 << 25;
+
+ static const Type tENHANCED_REP = 1 << 26; // enhanced rep movsb/stosb
+ static const Type tRDRAND = 1 << 27;
+ static const Type tADX = 1 << 28; // adcx, adox
+ static const Type tRDSEED = 1 << 29; // rdseed
+ static const Type tSMAP = 1 << 30; // stac
+ static const Type tHLE = uint64(1) << 31; // xacquire, xrelease, xtest
+ static const Type tRTM = uint64(1) << 32; // xbegin, xend, xabort
+ static const Type tF16C = uint64(1) << 33; // vcvtph2ps, vcvtps2ph
+ static const Type tMOVBE = uint64(1) << 34; // mobve
+ static const Type tAVX512F = uint64(1) << 35;
+ static const Type tAVX512DQ = uint64(1) << 36;
+ static const Type tAVX512_IFMA = uint64(1) << 37;
+ static const Type tAVX512IFMA = tAVX512_IFMA;
+ static const Type tAVX512PF = uint64(1) << 38;
+ static const Type tAVX512ER = uint64(1) << 39;
+ static const Type tAVX512CD = uint64(1) << 40;
+ static const Type tAVX512BW = uint64(1) << 41;
+ static const Type tAVX512VL = uint64(1) << 42;
+ static const Type tAVX512_VBMI = uint64(1) << 43;
+ static const Type tAVX512VBMI = tAVX512_VBMI; // changed by Intel's manual
+ static const Type tAVX512_4VNNIW = uint64(1) << 44;
+ static const Type tAVX512_4FMAPS = uint64(1) << 45;
+ static const Type tPREFETCHWT1 = uint64(1) << 46;
+ static const Type tPREFETCHW = uint64(1) << 47;
+ static const Type tSHA = uint64(1) << 48;
+ static const Type tMPX = uint64(1) << 49;
+ static const Type tAVX512_VBMI2 = uint64(1) << 50;
+ static const Type tGFNI = uint64(1) << 51;
+ static const Type tVAES = uint64(1) << 52;
+ static const Type tVPCLMULQDQ = uint64(1) << 53;
+ static const Type tAVX512_VNNI = uint64(1) << 54;
+ static const Type tAVX512_BITALG = uint64(1) << 55;
+ static const Type tAVX512_VPOPCNTDQ = uint64(1) << 56;
+
+ Cpu()
+ : type_(NONE)
+ , data_cache_levels(0)
+ {
+ unsigned int data[4];
+ const unsigned int& EAX = data[0];
+ const unsigned int& EBX = data[1];
+ const unsigned int& ECX = data[2];
+ const unsigned int& EDX = data[3];
+ getCpuid(0, data);
+ const unsigned int maxNum = EAX;
+ static const char intel[] = "ntel";
+ static const char amd[] = "cAMD";
+ if (ECX == get32bitAsBE(amd)) {
+ type_ |= tAMD;
+ getCpuid(0x80000001, data);
+ if (EDX & (1U << 31)) type_ |= t3DN;
+ if (EDX & (1U << 15)) type_ |= tCMOV;
+ if (EDX & (1U << 30)) type_ |= tE3DN;
+ if (EDX & (1U << 22)) type_ |= tMMX2;
+ if (EDX & (1U << 27)) type_ |= tRDTSCP;
+ }
+ if (ECX == get32bitAsBE(intel)) {
+ type_ |= tINTEL;
+ getCpuid(0x80000001, data);
+ if (EDX & (1U << 27)) type_ |= tRDTSCP;
+ if (ECX & (1U << 5)) type_ |= tLZCNT;
+ if (ECX & (1U << 8)) type_ |= tPREFETCHW;
+ }
+ getCpuid(1, data);
+ if (ECX & (1U << 0)) type_ |= tSSE3;
+ if (ECX & (1U << 9)) type_ |= tSSSE3;
+ if (ECX & (1U << 19)) type_ |= tSSE41;
+ if (ECX & (1U << 20)) type_ |= tSSE42;
+ if (ECX & (1U << 22)) type_ |= tMOVBE;
+ if (ECX & (1U << 23)) type_ |= tPOPCNT;
+ if (ECX & (1U << 25)) type_ |= tAESNI;
+ if (ECX & (1U << 1)) type_ |= tPCLMULQDQ;
+ if (ECX & (1U << 27)) type_ |= tOSXSAVE;
+ if (ECX & (1U << 30)) type_ |= tRDRAND;
+ if (ECX & (1U << 29)) type_ |= tF16C;
+
+ if (EDX & (1U << 15)) type_ |= tCMOV;
+ if (EDX & (1U << 23)) type_ |= tMMX;
+ if (EDX & (1U << 25)) type_ |= tMMX2 | tSSE;
+ if (EDX & (1U << 26)) type_ |= tSSE2;
+
+ if (type_ & tOSXSAVE) {
+ // check XFEATURE_ENABLED_MASK[2:1] = '11b'
+ uint64 bv = getXfeature();
+ if ((bv & 6) == 6) {
+ if (ECX & (1U << 28)) type_ |= tAVX;
+ if (ECX & (1U << 12)) type_ |= tFMA;
+ if (((bv >> 5) & 7) == 7) {
+ getCpuidEx(7, 0, data);
+ if (EBX & (1U << 16)) type_ |= tAVX512F;
+ if (type_ & tAVX512F) {
+ if (EBX & (1U << 17)) type_ |= tAVX512DQ;
+ if (EBX & (1U << 21)) type_ |= tAVX512_IFMA;
+ if (EBX & (1U << 26)) type_ |= tAVX512PF;
+ if (EBX & (1U << 27)) type_ |= tAVX512ER;
+ if (EBX & (1U << 28)) type_ |= tAVX512CD;
+ if (EBX & (1U << 30)) type_ |= tAVX512BW;
+ if (EBX & (1U << 31)) type_ |= tAVX512VL;
+ if (ECX & (1U << 1)) type_ |= tAVX512_VBMI;
+ if (ECX & (1U << 6)) type_ |= tAVX512_VBMI2;
+ if (ECX & (1U << 8)) type_ |= tGFNI;
+ if (ECX & (1U << 9)) type_ |= tVAES;
+ if (ECX & (1U << 10)) type_ |= tVPCLMULQDQ;
+ if (ECX & (1U << 11)) type_ |= tAVX512_VNNI;
+ if (ECX & (1U << 12)) type_ |= tAVX512_BITALG;
+ if (ECX & (1U << 14)) type_ |= tAVX512_VPOPCNTDQ;
+ if (EDX & (1U << 2)) type_ |= tAVX512_4VNNIW;
+ if (EDX & (1U << 3)) type_ |= tAVX512_4FMAPS;
+ }
+ }
+ }
+ }
+ if (maxNum >= 7) {
+ getCpuidEx(7, 0, data);
+ if (type_ & tAVX && (EBX & (1U << 5))) type_ |= tAVX2;
+ if (EBX & (1U << 3)) type_ |= tBMI1;
+ if (EBX & (1U << 8)) type_ |= tBMI2;
+ if (EBX & (1U << 9)) type_ |= tENHANCED_REP;
+ if (EBX & (1U << 18)) type_ |= tRDSEED;
+ if (EBX & (1U << 19)) type_ |= tADX;
+ if (EBX & (1U << 20)) type_ |= tSMAP;
+ if (EBX & (1U << 4)) type_ |= tHLE;
+ if (EBX & (1U << 11)) type_ |= tRTM;
+ if (EBX & (1U << 14)) type_ |= tMPX;
+ if (EBX & (1U << 29)) type_ |= tSHA;
+ if (ECX & (1U << 0)) type_ |= tPREFETCHWT1;
+ }
+ setFamily();
+ setCacheHierarchy();
+ }
+ void putFamily() const
+ {
+ printf("family=%d, model=%X, stepping=%d, extFamily=%d, extModel=%X\n",
+ family, model, stepping, extFamily, extModel);
+ printf("display:family=%X, model=%X\n", displayFamily, displayModel);
+ }
+ bool has(Type type) const
+ {
+ return (type & type_) != 0;
+ }
+};
+
+class Clock {
+public:
+ static inline uint64 getRdtsc()
+ {
+#ifdef _MSC_VER
+ return __rdtsc();
+#else
+ unsigned int eax, edx;
+ __asm__ volatile("rdtsc" : "=a"(eax), "=d"(edx));
+ return ((uint64)edx << 32) | eax;
+#endif
+ }
+ Clock()
+ : clock_(0)
+ , count_(0)
+ {
+ }
+ void begin()
+ {
+ clock_ -= getRdtsc();
+ }
+ void end()
+ {
+ clock_ += getRdtsc();
+ count_++;
+ }
+ int getCount() const { return count_; }
+ uint64 getClock() const { return clock_; }
+ void clear() { count_ = 0; clock_ = 0; }
+private:
+ uint64 clock_;
+ int count_;
+};
+
+#ifdef XBYAK64
+const int UseRCX = 1 << 6;
+const int UseRDX = 1 << 7;
+
+class Pack {
+ static const size_t maxTblNum = 15;
+ const Xbyak::Reg64 *tbl_[maxTblNum];
+ size_t n_;
+public:
+ Pack() : tbl_(), n_(0) {}
+ Pack(const Xbyak::Reg64 *tbl, size_t n) { init(tbl, n); }
+ Pack(const Pack& rhs)
+ : n_(rhs.n_)
+ {
+ for (size_t i = 0; i < n_; i++) tbl_[i] = rhs.tbl_[i];
+ }
+ Pack& operator=(const Pack& rhs)
+ {
+ n_ = rhs.n_;
+ for (size_t i = 0; i < n_; i++) tbl_[i] = rhs.tbl_[i];
+ return *this;
+ }
+ Pack(const Xbyak::Reg64& t0)
+ { n_ = 1; tbl_[0] = &t0; }
+ Pack(const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
+ { n_ = 2; tbl_[0] = &t0; tbl_[1] = &t1; }
+ Pack(const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
+ { n_ = 3; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; }
+ Pack(const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
+ { n_ = 4; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; }
+ Pack(const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
+ { n_ = 5; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; }
+ Pack(const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
+ { n_ = 6; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; }
+ Pack(const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
+ { n_ = 7; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; }
+ Pack(const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
+ { n_ = 8; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; }
+ Pack(const Xbyak::Reg64& t8, const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
+ { n_ = 9; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; tbl_[8] = &t8; }
+ Pack(const Xbyak::Reg64& t9, const Xbyak::Reg64& t8, const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
+ { n_ = 10; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; tbl_[8] = &t8; tbl_[9] = &t9; }
+ Pack& append(const Xbyak::Reg64& t)
+ {
+ if (n_ == maxTblNum) {
+ fprintf(stderr, "ERR Pack::can't append\n");
+ throw Error(ERR_BAD_PARAMETER);
+ }
+ tbl_[n_++] = &t;
+ return *this;
+ }
+ void init(const Xbyak::Reg64 *tbl, size_t n)
+ {
+ if (n > maxTblNum) {
+ fprintf(stderr, "ERR Pack::init bad n=%d\n", (int)n);
+ throw Error(ERR_BAD_PARAMETER);
+ }
+ n_ = n;
+ for (size_t i = 0; i < n; i++) {
+ tbl_[i] = &tbl[i];
+ }
+ }
+ const Xbyak::Reg64& operator[](size_t n) const
+ {
+ if (n >= n_) {
+ fprintf(stderr, "ERR Pack bad n=%d(%d)\n", (int)n, (int)n_);
+ throw Error(ERR_BAD_PARAMETER);
+ }
+ return *tbl_[n];
+ }
+ size_t size() const { return n_; }
+ /*
+ get tbl[pos, pos + num)
+ */
+ Pack sub(size_t pos, size_t num = size_t(-1)) const
+ {
+ if (num == size_t(-1)) num = n_ - pos;
+ if (pos + num > n_) {
+ fprintf(stderr, "ERR Pack::sub bad pos=%d, num=%d\n", (int)pos, (int)num);
+ throw Error(ERR_BAD_PARAMETER);
+ }
+ Pack pack;
+ pack.n_ = num;
+ for (size_t i = 0; i < num; i++) {
+ pack.tbl_[i] = tbl_[pos + i];
+ }
+ return pack;
+ }
+ void put() const
+ {
+ for (size_t i = 0; i < n_; i++) {
+ printf("%s ", tbl_[i]->toString());
+ }
+ printf("\n");
+ }
+};
+
+class StackFrame {
+#ifdef XBYAK64_WIN
+ static const int noSaveNum = 6;
+ static const int rcxPos = 0;
+ static const int rdxPos = 1;
+#else
+ static const int noSaveNum = 8;
+ static const int rcxPos = 3;
+ static const int rdxPos = 2;
+#endif
+ static const int maxRegNum = 14; // maxRegNum = 16 - rsp - rax
+ Xbyak::CodeGenerator *code_;
+ int pNum_;
+ int tNum_;
+ bool useRcx_;
+ bool useRdx_;
+ int saveNum_;
+ int P_;
+ bool makeEpilog_;
+ Xbyak::Reg64 pTbl_[4];
+ Xbyak::Reg64 tTbl_[maxRegNum];
+ Pack p_;
+ Pack t_;
+ StackFrame(const StackFrame&);
+ void operator=(const StackFrame&);
+public:
+ const Pack& p;
+ const Pack& t;
+ /*
+ make stack frame
+ @param sf [in] this
+ @param pNum [in] num of function parameter(0 <= pNum <= 4)
+ @param tNum [in] num of temporary register(0 <= tNum, with UseRCX, UseRDX) #{pNum + tNum [+rcx] + [rdx]} <= 14
+ @param stackSizeByte [in] local stack size
+ @param makeEpilog [in] automatically call close() if true
+
+ you can use
+ rax
+ gp0, ..., gp(pNum - 1)
+ gt0, ..., gt(tNum-1)
+ rcx if tNum & UseRCX
+ rdx if tNum & UseRDX
+ rsp[0..stackSizeByte - 1]
+ */
+ StackFrame(Xbyak::CodeGenerator *code, int pNum, int tNum = 0, int stackSizeByte = 0, bool makeEpilog = true)
+ : code_(code)
+ , pNum_(pNum)
+ , tNum_(tNum & ~(UseRCX | UseRDX))
+ , useRcx_((tNum & UseRCX) != 0)
+ , useRdx_((tNum & UseRDX) != 0)
+ , saveNum_(0)
+ , P_(0)
+ , makeEpilog_(makeEpilog)
+ , p(p_)
+ , t(t_)
+ {
+ using namespace Xbyak;
+ if (pNum < 0 || pNum > 4) throw Error(ERR_BAD_PNUM);
+ const int allRegNum = pNum + tNum_ + (useRcx_ ? 1 : 0) + (useRdx_ ? 1 : 0);
+ if (tNum_ < 0 || allRegNum > maxRegNum) throw Error(ERR_BAD_TNUM);
+ const Reg64& _rsp = code->rsp;
+ saveNum_ = (std::max)(0, allRegNum - noSaveNum);
+ const int *tbl = getOrderTbl() + noSaveNum;
+ for (int i = 0; i < saveNum_; i++) {
+ code->push(Reg64(tbl[i]));
+ }
+ P_ = (stackSizeByte + 7) / 8;
+ if (P_ > 0 && (P_ & 1) == (saveNum_ & 1)) P_++; // (rsp % 16) == 8, then increment P_ for 16 byte alignment
+ P_ *= 8;
+ if (P_ > 0) code->sub(_rsp, P_);
+ int pos = 0;
+ for (int i = 0; i < pNum; i++) {
+ pTbl_[i] = Xbyak::Reg64(getRegIdx(pos));
+ }
+ for (int i = 0; i < tNum_; i++) {
+ tTbl_[i] = Xbyak::Reg64(getRegIdx(pos));
+ }
+ if (useRcx_ && rcxPos < pNum) code_->mov(code_->r10, code_->rcx);
+ if (useRdx_ && rdxPos < pNum) code_->mov(code_->r11, code_->rdx);
+ p_.init(pTbl_, pNum);
+ t_.init(tTbl_, tNum_);
+ }
+ /*
+ make epilog manually
+ @param callRet [in] call ret() if true
+ */
+ void close(bool callRet = true)
+ {
+ using namespace Xbyak;
+ const Reg64& _rsp = code_->rsp;
+ const int *tbl = getOrderTbl() + noSaveNum;
+ if (P_ > 0) code_->add(_rsp, P_);
+ for (int i = 0; i < saveNum_; i++) {
+ code_->pop(Reg64(tbl[saveNum_ - 1 - i]));
+ }
+
+ if (callRet) code_->ret();
+ }
+ ~StackFrame()
+ {
+ if (!makeEpilog_) return;
+ try {
+ close();
+ } catch (std::exception& e) {
+ printf("ERR:StackFrame %s\n", e.what());
+ exit(1);
+ }
+ }
+private:
+ const int *getOrderTbl() const
+ {
+ using namespace Xbyak;
+ static const int tbl[] = {
+#ifdef XBYAK64_WIN
+ Operand::RCX, Operand::RDX, Operand::R8, Operand::R9, Operand::R10, Operand::R11, Operand::RDI, Operand::RSI,
+#else
+ Operand::RDI, Operand::RSI, Operand::RDX, Operand::RCX, Operand::R8, Operand::R9, Operand::R10, Operand::R11,
+#endif
+ Operand::RBX, Operand::RBP, Operand::R12, Operand::R13, Operand::R14, Operand::R15
+ };
+ return &tbl[0];
+ }
+ int getRegIdx(int& pos) const
+ {
+ assert(pos < maxRegNum);
+ using namespace Xbyak;
+ const int *tbl = getOrderTbl();
+ int r = tbl[pos++];
+ if (useRcx_) {
+ if (r == Operand::RCX) { return Operand::R10; }
+ if (r == Operand::R10) { r = tbl[pos++]; }
+ }
+ if (useRdx_) {
+ if (r == Operand::RDX) { return Operand::R11; }
+ if (r == Operand::R11) { return tbl[pos++]; }
+ }
+ return r;
+ }
+};
+#endif
+
+} } // end of util
+#endif