aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/t_x86-64_emit.c
diff options
context:
space:
mode:
authorlemon <lsof@mailbox.org>2026-03-17 13:22:00 +0100
committerlemon <lsof@mailbox.org>2026-03-17 13:22:00 +0100
commita8d6f8bf30c07edb775e56889f568ca20240bedf (patch)
treeb5a452b2675b2400f15013617291fe6061180bbf /src/t_x86-64_emit.c
parent24f14b7ad1af08d872971d72ce089a529911f657 (diff)
REFACTOR: move sources to src/
Diffstat (limited to 'src/t_x86-64_emit.c')
-rw-r--r--src/t_x86-64_emit.c1422
1 files changed, 1422 insertions, 0 deletions
diff --git a/src/t_x86-64_emit.c b/src/t_x86-64_emit.c
new file mode 100644
index 0000000..d3a466b
--- /dev/null
+++ b/src/t_x86-64_emit.c
@@ -0,0 +1,1422 @@
+#include "all.h"
+#include "../obj/obj.h"
+#include "../endian.h"
+
+/** Instruction operands **
+ *
+ * Can be a register, a 32-bit immediate,
+ * a memory reference [base + index * scale + disp],
+ * or a relocatable reference to some symbol plus a displacement and maybe index*scale
+ */
+/* kinds of instruction operand; ONONE marks an absent operand */
+enum operkind { ONONE, OREG, OIMM, OMEM, OSYM, OSYMGOT };
+/* sentinel register numbers for "no base"/"no index"; fit the 6-bit cindex field */
+enum { NOBASE = 63, NOINDEX = 63 };
+/* one instruction operand; 't' selects which union members are valid */
+struct oper {
+	uchar t; /* enum operkind */
+	union {
+		struct { uchar base; }; /* OMEM */
+		struct { uchar cindex : 6, cshift : 2; }; /* OSYM: optional index reg and scale */
+	};
+	union {
+		struct { uchar index, shift; }; /* OMEM: index reg and scale (0..3) */
+		ushort con; /* OSYM: contab index of the symbol */
+	};
+	union {
+		uchar reg; /* OREG */
+		int disp; /* OMEM, OSYM */
+		int imm; /* OIMM */
+	};
+};
+/* designated-initializer constructors for operands */
+#define mkoper(t, ...) ((struct oper){(t), __VA_ARGS__})
+#define reg2oper(R) (assert((uint)(R) <= XMM15), mkoper(OREG, .reg = (R)))
+
+static struct oper mkmemoper(union ref);
+
+static struct oper
+ioper(int i)
+{
+	/* operand for instruction i's assigned register; instrtab stores reg+1
+	 * so that 0 means "no register assigned" */
+	int r = instrtab[i].reg - 1;
+	if (r < 0)
+		return mkoper(ONONE,);
+	return reg2oper(r);
+}
+
+/* Lower an IR reference to a machine operand.
+ * Integer constants become OIMM (64-bit ones must fit in 32 bits here);
+ * class-less constants are symbols and become OSYM; addresses go through
+ * mkmemoper. */
+static struct oper
+ref2oper(union ref r)
+{
+	switch (r.t) {
+	case RTMP: return ioper(r.i);
+	case RREG: return reg2oper(r.i);
+	case RICON: return mkoper(OIMM, .imm = r.i);
+	case RXCON:
+		if (contab.p[r.i].cls == KI32)
+			return mkoper(OIMM, .imm = contab.p[r.i].i);
+		else if (contab.p[r.i].cls == KI64) {
+			vlong i = contab.p[r.i].i;
+			/* caller must ensure the value is sign-representable in 32 bits */
+			assert(i == (int)i);
+			return mkoper(OIMM, .imm = i);
+		} else if (!contab.p[r.i].cls) {
+			/* no class -> symbolic constant */
+			return mkoper(OSYM, .con = r.i, .cindex = NOINDEX);
+		}
+		assert(0);
+	case RADDR: return mkmemoper(r);
+	default: assert(0);
+	}
+}
+
+static void
+addmemoper(struct oper *mem, struct oper add)
+{
+	/* Fold 'add' into an OMEM operand: immediates accumulate into the
+	 * displacement; registers fill the base slot first, then the index.
+	 * Other operand kinds are ignored. */
+	assert(mem->t == OMEM);
+	switch (add.t) {
+	case OIMM:
+		mem->disp += add.imm;
+		break;
+	case OREG:
+		if (mem->base != NOBASE && mem->index != NOINDEX)
+			assert(0); /* both slots already taken */
+		if (mem->base == NOBASE)
+			mem->base = add.reg;
+		else
+			mem->index = add.reg;
+		break;
+	}
+}
+
+/* helpers to convert a reference to an operand of a specific kind,
+ * with assertions to make sure nothing went wrong */
+
+static inline struct oper
+mkregoper(union ref r)
+{
+	/* reference must denote a register (directly, or a temp allocated to one) */
+	if (r.t == RREG)
+		return reg2oper(r.i);
+	assert(r.t == RTMP && ioper(r.i).t == OREG);
+	return ioper(r.i);
+}
+
+static inline struct oper
+mkimmoper(union ref r)
+{
+	/* reference must be a 32-bit integer constant */
+	assert(iscon(r));
+	assert(concls(r) == KI32);
+	return mkoper(OIMM, .imm = intconval(r));
+}
+
+/* is ref a temp spilled to memory? / does ref live in a register? */
+#define ismemref(ref) ((ref).t == RTMP && ioper((ref).i).t == OMEM)
+#define isregref(ref) ((ref).t == RREG || ((ref).t == RTMP && ioper((ref).i).t == OREG))
+
+static inline struct oper
+mkimmregoper(union ref r)
+{
+	/* reference must be a register or a 32-bit integer constant */
+	if (!isregref(r))
+		assert(iscon(r) && concls(r) == KI32);
+	return ref2oper(r);
+}
+
+static inline struct oper
+mkdatregoper(union ref r)
+{
+	/* reference must be a register or a dereferenceable symbolic constant */
+	if (!isregref(r))
+		assert(r.t == RXCON && contab.p[r.i].deref);
+	return ref2oper(r);
+}
+
+static inline struct oper
+mkimmdatregoper(union ref r)
+{
+	/* reference must be a register, a small integer constant, a 32-bit
+	 * constant, or a dereferenceable symbolic constant */
+	if (!isregref(r) && r.t != RICON)
+		assert(r.t == RXCON && (contab.p[r.i].cls == KI32 || contab.p[r.i].deref));
+	return ref2oper(r);
+}
+
+/* Lower a reference to a memory operand (OMEM/OSYM).
+ * RTMP: use its spill slot, or build [reg] from its register.
+ * RADDR: lower base/index/shift/disp; a symbolic base (or index) turns the
+ * whole thing into an OSYM with an optional scaled register index.
+ * RXCON: bare symbol. Anything else: absolute [imm] or [reg]. */
+static struct oper
+mkmemoper(union ref r)
+{
+	if (r.t == RTMP) {
+		struct oper wop = ioper(r.i);
+		if (wop.t == OMEM) return wop;
+		assert(wop.t == OREG);
+		return mkoper(OMEM, .base = wop.reg, .index = NOINDEX);
+	} else if (r.t == RADDR) {
+		const struct addr *addr = &addrtab.p[r.i];
+		assert(addr->shift <= 3); /* x86 SIB scale is 1,2,4,8 */
+		if (isaddrcon(addr->base,0)) {
+			/* sym + index*2^shift + disp */
+			return mkoper(OSYM, .con = addr->base.i,
+			              .cindex = addr->index.bits ? mkregoper(addr->index).reg : NOINDEX,
+			              .cshift = addr->shift,
+			              .disp = addr->disp);
+		} else if (isintcon(addr->base)) {
+			/* absolute address as 32-bit displacement */
+			assert(!addr->disp);
+			return mkoper(OMEM, .base = NOBASE,
+			              .index = addr->index.bits ? mkregoper(addr->index).reg : NOINDEX,
+			              .disp = intconval(addr->base),
+			              .shift = addr->shift);
+		} else if (isaddrcon(addr->index,0)) {
+			/* symbol in the index slot: treat the base register as index */
+			assert(!addr->shift);
+			return mkoper(OSYM, .con = addr->index.i,
+			              .cindex = addr->base.bits ? mkregoper(addr->base).reg : NOINDEX,
+			              .disp = addr->disp);
+		}
+		return mkoper(OMEM, .base = addr->base.bits ? mkregoper(addr->base).reg : NOBASE,
+		              .index = addr->index.bits ? mkregoper(addr->index).reg : NOINDEX,
+		              .disp = addr->disp,
+		              .shift = addr->shift);
+	} else if (r.t == RXCON) {
+		assert(!contab.p[r.i].cls); /* must be symbolic, not numeric */
+		return mkoper(OSYM, .con = r.i, .cindex = NOINDEX);
+	} else {
+		return mkoper(OMEM, .base = isregref(r) ? ref2oper(r).reg : NOBASE,
+		              .index = NOINDEX,
+		              .disp = isregref(r) ? 0 : mkimmoper(r).imm);
+	}
+}
+
+/** Instruction description tables **
+ *
+ * Each instruction is a list of descs, and the first one that matches
+ * is emitted. Each entry has a size pattern field, which is a bitset
+ * of the sizes (in bytes) that the entry matches, and 2 operand patterns,
+ * which describe the operands that can match (for example, PRAX matches
+ * a RAX register operand, PGPR matches any integer register, I8 matches
+ * an immediate operand between [-128,127]) The rest of the fields describe
+ * the instruction's encoding.
+ * (reference: https://www.felixcloutier.com/x86/ & https://wiki.osdev.org/X86-64_Instruction_Encoding )
+ */
+
+/* operand patterns matched by opermatch() below */
+enum operpat {
+	PNONE, /* operand absent */
+	PRAX,  /* exactly RAX */
+	PRCX,  /* exactly RCX */
+	PGPR,  /* any integer register */
+	PFPR,  /* any xmm register */
+	P1, /* imm = 1 */
+	PN1, /* imm = -1 */
+	PI8,   /* imm fits in signed 8 bits */
+	PU8,   /* imm fits in unsigned 8 bits */
+	PI16,  /* imm fits in signed 16 bits */
+	PU16,  /* imm fits in unsigned 16 bits */
+	PI32,  /* any immediate */
+	PU32,  /* non-negative immediate (zero-extends cleanly) */
+	PMEM,  /* memory or symbol operand */
+	PSYM,  /* symbol operand (possibly through the GOT) */
+};
+/* how the matched operands are encoded into the instruction bytes */
+enum operenc {
+	EN_R = 1, /* reg with /r */
+	EN_RR, /* reg, reg with /r */
+	EN_RRX, /* reg, reg with /r (inverted) */
+	EN_MR, /* mem, reg with /r */
+	EN_RM, /* reg, mem with /r */
+	EN_M, /* mem */
+	EN_RI8, /* reg, imm8 with /0 */
+	EN_RI32, /* reg, imm32 with /0 */
+	EN_MI8, /* mem, imm8 with /x */
+	EN_MI16, /* mem, imm16 with /x */
+	EN_MI32, /* mem, imm32 with /x */
+	EN_O, /* reg with op + reg */
+	EN_OI, /* reg, imm32 with op + reg */
+	EN_I8, /* imm8 */
+	EN_I32, /* imm32 */
+	EN_R32, /* rel32 */
+	NOPERENC,
+};
+/* one encoding alternative of an instruction; tables are scanned in order
+ * and the first matching entry wins */
+struct desc {
+	uchar psiz; /* subset of {1,2,4,8} (as a bitset; -1 matches all) */
+	uchar ptd, pts; /* bitsets of enum operpat */
+	uchar nopc; /* countof opc */
+	const char opc[8]; /* opcode bytes */
+	uchar operenc; /* enum operenc */
+	uchar ext; /* ModR/M.reg opc extension */
+	bool r8; /* uses 8bit register */
+	bool norexw; /* do not use REX.W even if size is 64 bits */
+};
+
+/* match operand against pattern */
+static inline bool
+opermatch(enum operpat pat, struct oper oper)
+{
+	switch (pat) {
+	case PNONE: return !oper.t;
+	case PRAX: return oper.t == OREG && oper.reg == RAX;
+	case PRCX: return oper.t == OREG && oper.reg == RCX;
+	case PGPR: return oper.t == OREG && oper.reg <= R15;
+	case PFPR: return oper.t == OREG && oper.reg >= XMM0;
+	case P1: return oper.t == OIMM && oper.imm == 1;
+	case PN1: return oper.t == OIMM && oper.imm == -1;
+	/* the narrowing casts check that the value round-trips through N bits */
+	case PI8: return oper.t == OIMM && (schar)oper.imm == oper.imm;
+	case PU8: return oper.t == OIMM && (uchar)oper.imm == oper.imm;
+	case PI16: return oper.t == OIMM && (short)oper.imm == oper.imm;
+	case PU16: return oper.t == OIMM && (ushort)oper.imm == oper.imm;
+	case PI32: return oper.t == OIMM;
+	case PU32: return oper.t == OIMM && oper.imm >= 0;
+	case PMEM: return in_range(oper.t, OMEM, OSYMGOT);
+	case PSYM: return oper.t == OSYM || oper.t == OSYMGOT;
+	}
+	assert(0);
+}
+
+/* code output helpers; all expand 'pcode', the uchar** output cursor in scope */
+#define B(b) (*(*pcode)++ = (b)) /* emit one byte */
+#define D(xs, N) (memcpy(*pcode, (xs), (N)), (*pcode) += (N)) /* emit N raw bytes */
+#define I16(w) (wr16le(*pcode, (w)), *pcode += 2) /* emit 16-bit little-endian */
+#define I32(w) (wr32le(*pcode, (w)), *pcode += 4) /* emit 32-bit little-endian */
+#define DS(S) D(S, sizeof S - 1) /* emit a string literal of bytes */
+
+/* per-function emitter state */
+static bool usebp; /* use RBP? (frame pointer set up for this function) */
+static int rbpoff; /* added to non-positive RBP displacements in encode() */
+static internstr curfnsym; /* symbol of the function being emitted (self-call shortcut) */
+static uchar *fnstart; /* code address of the current function's entry */
+
+/* Given an instruction description table, find the first entry that matches
+ * the operands (where dst, src are the operands in intel syntax order) and encode it */
+static void
+encode(uchar **pcode, const struct desc *tab, int ntab, enum irclass k, struct oper dst, struct oper src)
+{
+	const uchar *opc;
+	int nopc;
+	struct oper mem;
+	enum reg reg;
+	const struct desc *en = NULL;
+	/* first entry whose size bitset and both operand patterns match wins */
+	for (int i = 0; i < ntab; ++i) {
+		if ((tab[i].psiz & cls2siz[k]) && opermatch(tab[i].ptd, dst) && opermatch(tab[i].pts, src)) {
+			en = &tab[i];
+			break;
+		}
+	}
+	assert(en && "no match for instr");
+
+	/* xmm registers are numbered above the GPRs; mask down to the hardware
+	 * register number for encoding */
+	if (en->ptd == PFPR) dst.reg &= 15;
+	if (en->pts == PFPR) src.reg &= 15;
+	opc = (uchar *)en->opc;
+	nopc = en->nopc;
+	/* mandatory prefixes go before REX */
+	if (*opc == 0x66 || *opc == 0xF2 || *opc == 0xF3)
+		B(*opc++), --nopc;
+	int rex = in_range(k, KI64, KPTR) << 3; /* REX.W */
+	if (en->norexw) rex = 0;
+	switch (en->operenc) {
+	case EN_RR: /* mod = 11; reg = dst; rm = src */
+		rex |= (dst.reg >> 3) << 2; /* REX.R */
+		rex |= (src.reg >> 3) << 0; /* REX.B */
+		if (rex) B(0x40 | rex);
+		else if (en->r8 && in_range(src.reg, RSP, RDI)) {
+			/* /r8 needs REX to encode SP,BP,SI,DI (otherwise -> AH..BH) */
+			B(0x40);
+		}
+		D(opc, nopc);
+		B(0300 | (dst.reg & 7) << 3 | (src.reg & 7));
+		break;
+	case EN_RRX: /* mod = 11; reg = src; rm = dst */
+		rex |= (src.reg >> 3) << 2; /* REX.R */
+		rex |= (dst.reg >> 3) << 0; /* REX.B */
+		if (rex) B(0x40 | rex);
+		else if (en->r8 && in_range(dst.reg, RSP, RDI)) {
+			/* /r8 needs REX to encode SP,BP,SI,DI (otherwise -> AH..BH) */
+			B(0x40);
+		}
+		D(opc, nopc);
+		B(0300 | (src.reg & 7) << 3 | (dst.reg & 7));
+		break;
+	case EN_MR:
+		mem = dst;
+		reg = src.reg;
+		goto Mem;
+	case EN_RM:
+		mem = src;
+		reg = dst.reg;
+		goto Mem;
+	case EN_M: case EN_MI8: case EN_MI16: case EN_MI32:
+		mem = dst;
+		reg = en->ext; /* ModR/M.reg carries the opcode extension */
+	Mem:	/* shared path: memory/symbol operand in ModR/M.rm */
+		if (mem.t == OMEM) {
+			if (mem.base != NOBASE) rex |= mem.base >> 3; /* REX.B */
+			if (mem.index != NOINDEX) rex |= mem.index >> 3 << 1; /* REX.X */
+		} else {
+			if (mem.cindex != NOINDEX) rex |= mem.cindex >> 3 << 1; /* REX.X */
+		}
+		if (en->operenc != EN_M)
+			rex |= (reg >> 3) << 2; /* REX.R */
+		if (rex) B(0x40 | rex);
+		else if (en->r8 && in_range(reg, RSP, RDI)) B(0x40);
+
+		if (mem.t == OSYM || mem.t == OSYMGOT) {
+			D(opc, nopc);
+			if (mem.cindex == NOINDEX) {
+				/* %rip(var); the rel32 is measured from the end of the
+				 * instruction, so account for any trailing immediate */
+				static uchar offs[NOPERENC] = { [EN_MI8] = 1, [EN_MI16] = 2, [EN_MI32] = 4 };
+				uint addr;
+				int disp = mem.disp - 4 - offs[en->operenc];
+				internstr sym = xcon2sym(mem.con);
+				B(/*mod 0*/ (reg & 7) << 3 | RBP);
+				if (objhassym(sym, &addr) == Stext && mem.t != OSYMGOT) {
+					/* target already emitted: resolve the rel32 now */
+					I32(addr - (*pcode - objout.textbegin) + disp);
+				} else {
+					enum relockind r = REL_PCREL32;
+					if (mem.t == OSYMGOT) r = rex ? REL_GOTPCRELX_REX : REL_GOTPCRELX;
+					objreloc(xcon2sym(mem.con), r, Stext, *pcode - objout.textbegin, disp);
+					I32(0);
+				}
+			} else {
+				/* var(,%reg,shift) */
+				assert(!ccopt.pic && !ccopt.pie && "cannot encode [RIP-rel + REG] for position independent");
+				B(/*mod 0*/ (reg & 7) << 3 | RSP);
+				B(mem.cshift << 6 | ((mem.cindex & 7) << 3) | RBP); /* SIB [index*s + disp32] */
+				objreloc(xcon2sym(mem.con), REL_ABS32S, Stext, *pcode - objout.textbegin, mem.disp);
+				I32(0);
+			}
+		} else {
+			int mod;
+			bool sib = 0;
+			if (mem.base == RBP) {
+				if (!usebp) {
+					/* if RBP isn't being set up (leaf functions with no stack allocations),
+					 * access thru RSP (function arguments in the stack) */
+					mem.base = RSP;
+					mem.disp -= 8;
+				} else if (mem.disp <= 0) {
+					mem.disp += rbpoff;
+				}
+			}
+			if (mem.base != NOBASE) {
+				if (mem.index == NOINDEX && mem.shift == 0) sib = 0;
+				else sib = 1;
+				mod = !mem.disp ? 0 /* disp = 0 -> mod = 00 */
+				    : (schar)mem.disp == mem.disp ? 1 /* disp8 -> mod = 01 */
+				    : 2; /* disp32 -> mod = 10 */
+				/* rm=RBP/R13 with mod=00 would mean RIP-relative; force disp8 */
+				if (mod == 0 && (mem.base == RBP || mem.base == R13)) mod = 1;
+				/* rm=RSP/R12 always needs a SIB byte */
+				if (mem.base == RSP || mem.base == R12) sib = 1;
+			} else {
+				/* [disp + (index*s)] */
+				sib = 1;
+				mem.base = RBP; /* SIB base=RBP with mod=00 -> no base, disp32 */
+				mod = 0;
+				assert(mem.index != RSP); /* RSP cannot be an index */
+			}
+			D(opc, nopc);
+			B(mod << 6 | (reg & 7) << 3 | (sib ? 4 : (mem.base & 7)));
+			if (sib) {
+				if (mem.index == NOINDEX) mem.index = RSP; /* SIB index=RSP means none */
+				B(mem.shift << 6 | (mem.index & 7) << 3 | (mem.base & 7));
+			}
+			if (mod == 1) B(mem.disp);
+			else if (mod == 2 || (mod == 0 && mem.base == RBP/*RIP-rel*/) || (mod == 0 && sib && mem.base == RBP/*absolute*/)) {
+				I32(mem.disp);
+			}
+		}
+		if (en->operenc == EN_MI8) B(src.imm);
+		if (en->operenc == EN_MI16) I16(src.imm);
+		if (en->operenc == EN_MI32) I32(src.imm);
+		break;
+	case EN_R: case EN_RI32: case EN_RI8:
+		rex |= (dst.reg >> 3) << 0; /* REX.B */
+		if (rex) B(0x40 | rex);
+		else if (en->r8 && in_range(dst.reg, RSP, RDI)) {
+			/* /r8 needs REX to encode SP,BP,SI,DI (otherwise -> AH..BH) */
+			B(0x40);
+		}
+		D(opc, nopc);
+		B(0300 | en->ext << 3 | (dst.reg & 7));
+		if (en->operenc == EN_RI32)
+			I32(src.imm);
+		else if (en->operenc == EN_RI8)
+			B(src.imm);
+		break;
+	case EN_O: case EN_OI:
+		/* register number is added to the last opcode byte */
+		rex |= (dst.reg >> 3) << 0; /* REX.B */
+		if (rex) B(0x40 | rex);
+		D(opc, nopc - 1);
+		B(opc[nopc-1] + (dst.reg & 7));
+		if (en->operenc == EN_OI) I32(src.imm);
+		break;
+	case EN_I8:
+		if (rex) B(0x40 | rex);
+		D(opc, nopc);
+		B(src.imm);
+		break;
+	case EN_I32:
+		if (rex) B(0x40 | rex);
+		D(opc, nopc);
+		I32(src.imm);
+		break;
+	case EN_R32:
+		/* call/jump to a symbol: resolve the rel32 now if the target is
+		 * already in .text, otherwise emit a relocation */
+		if (rex) B(0x40 | rex);
+		D(opc, nopc);
+		assert(dst.t == OSYM);
+		internstr sym = xcon2sym(dst.con);
+		uint addr;
+		if (sym == curfnsym) {
+			I32(fnstart - *pcode - 4); /* self-call */
+		} else if (objhassym(sym, &addr) == Stext) {
+			I32(addr - (*pcode - objout.textbegin) - 4);
+		} else {
+			enum relockind r = (ccopt.pie|ccopt.pic) ? REL_PLT32 : REL_PCREL32;
+			objreloc(sym, r, Stext, *pcode - objout.textbegin, -4);
+			I32(0);
+		}
+		break;
+	}
+}
+
+/* define a 1-operand / 2-operand instruction emitter named X from a list of
+ * struct desc encoding alternatives */
+#define DEFINSTR1(X, ...) \
+	static void \
+	X(uchar **pcode, enum irclass k, struct oper oper) \
+	{ \
+		static const struct desc tab[] = { __VA_ARGS__ }; \
+		encode(pcode, tab, countof(tab), k, oper, mkoper(0,)); \
+	}
+
+#define DEFINSTR2(X, ...) \
+	static void \
+	X(uchar **pcode, enum irclass k, struct oper dst, struct oper src) \
+	{ \
+		static const struct desc tab[] = { __VA_ARGS__ }; \
+		encode(pcode, tab, countof(tab), k, dst, src); \
+	}
+
+#define O(s) (sizeof s)-1,s /* expands to the nopc, opc initializers */
+/* 8/16-bit stores; size pattern -1 matches any size */
+DEFINSTR2(Xmovb,
+	{-1, PMEM, PGPR, O("\x88"), EN_MR, .r8=1}, /* MOV m8, r8 */
+	{-1, PMEM, PI32, O("\xC6"), EN_MI8, .r8=1}, /* MOV m8, imm8 */
+)
+DEFINSTR2(Xmovw,
+	{-1, PMEM, PGPR, O("\x66\x89"), EN_MR}, /* MOV m16, r16 */
+	{-1, PMEM, PI32, O("\x66\xC7"), EN_MI16}, /* MOV m16, imm16 */
+)
+/* General-purpose move. One shared table holds the integer and float
+ * alternatives; k2off skips ahead so a float class can never match an
+ * integer-only entry (the last two rows are shared by both float sizes). */
+static void Xmov(uchar **pcode, enum irclass k, struct oper dst, struct oper src)
+{
+	static const struct desc all[] = {
+		{4 , PGPR, PI32, O("\xB8"), EN_OI}, /* MOV r32, imm */
+		{4|8, PGPR, PGPR, O("\x8B"), EN_RR}, /* MOV r32/64, r32/64 */
+		{4|8, PMEM, PGPR, O("\x89"), EN_MR}, /* MOV m32/64, r32/64 */
+		{4|8, PGPR, PMEM, O("\x8B"), EN_RM}, /* MOV r32/64, m32/64 */
+		{4|8, PMEM, PI32, O("\xC7"), EN_MI32}, /* MOV m32/64, imm */
+		{ 8, PGPR, PU32, O("\xB8"), EN_OI, .norexw=1}, /* MOV r64, uimm (zero-extends) */
+		{ 8, PGPR, PI32, O("\xC7"), EN_RI32}, /* MOV r64, imm */
+		{4 , PFPR, PFPR, O("\x0F\x28"), EN_RR}, /* MOVAPS xmm, xmm */
+		{4 , PFPR, PMEM, O("\xF3\x0F\x10"), EN_RM}, /* MOVSS xmm, m32 */
+		{4 , PMEM, PFPR, O("\xF3\x0F\x11"), EN_MR}, /* MOVSS m32, xmm */
+		{8 , PFPR, PFPR, O("\x0F\x28"), EN_RR}, /* MOVAPS xmm, xmm */
+		{8 , PFPR, PMEM, O("\xF2\x0F\x10"), EN_RM}, /* MOVSD xmm, m64 */
+		{8 , PMEM, PFPR, O("\xF2\x0F\x11"), EN_MR}, /* MOVSD m64, xmm */
+		{4|8, PFPR, PGPR, O("\x66\x0F\x6E"), EN_RR}, /* MOVD/Q xmm, r64/32 */
+		{4|8, PGPR, PFPR, O("\x66\x0F\x7E"), EN_RRX}, /* MOVD/Q r64/32, xmm */
+	};
+	static const uchar k2off[] = {
+		[KI32] = 0,
+		[KI64] = 1, [KPTR] = 1,
+		[KF32] = 7,
+		[KF64] = 10,
+	};
+	if (kisflt(k) && src.t == OIMM && src.imm == 0) {
+		/* special case for storing zero float : use integer instruction with zero immediate */
+		k = KI32 + (k - KF32);
+	}
+	encode(pcode, all + k2off[k], countof(all) - k2off[k], k, dst, src);
+}
+/* sign/zero extending loads and moves */
+DEFINSTR2(Xmovsxl,
+	{8, PGPR, PMEM, O("\x63"), EN_RM}, /* MOVSXD r64, m32 */
+	{8, PGPR, PGPR, O("\x63"), EN_RR}, /* MOVSXD r64, r32 */
+	{4, PGPR, PMEM, O("\x8B"), EN_RM}, /* MOV r32, m32 */
+	{4, PGPR, PGPR, O("\x8B"), EN_RR}, /* MOV r32, r32 */
+)
+DEFINSTR2(Xmovsxw,
+	{4|8, PGPR, PMEM, O("\x0F\xBF"), EN_RM}, /* MOVSX r64, m16 */
+	{4|8, PGPR, PGPR, O("\x0F\xBF"), EN_RR}, /* MOVSX r64, r16 */
+)
+DEFINSTR2(Xmovsxb,
+	{4|8, PGPR, PMEM, O("\x0F\xBE"), EN_RM}, /* MOVSX r64, m8 */
+	{4|8, PGPR, PGPR, O("\x0F\xBE"), EN_RR, .r8=1}, /* MOVSX r64, r8 */
+)
+DEFINSTR2(Xmovzxw,
+	{4|8, PGPR, PMEM, O("\x0F\xB7"), EN_RM}, /* MOVZX r64, m16 */
+	{4|8, PGPR, PGPR, O("\x0F\xB7"), EN_RR}, /* MOVZX r64, r16 */
+)
+DEFINSTR2(Xmovzxb,
+	{4|8, PGPR, PMEM, O("\x0F\xB6"), EN_RM}, /* MOVZX r64, m8 */
+	{4|8, PGPR, PGPR, O("\x0F\xB6"), EN_RR, .r8=1}, /* MOVZX r64, r8 */
+)
+DEFINSTR2(Xmovaps,
+	{-1, PMEM, PFPR, O("\x0F\x29"), EN_MR}, /* MOVAPS mem, xmm */
+)
+DEFINSTR2(Xxchg,
+	{4|8, PGPR, PGPR, O("\x87"), EN_RR}, /* XCHG r32/64, r32/64 */
+	{4|8, PGPR, PMEM, O("\x87"), EN_RM}, /* XCHG r32/64, m32/64 */
+	{4|8, PMEM, PGPR, O("\x87"), EN_MR}, /* XCHG m32/64, r32/64 */
+)
+DEFINSTR2(Xlea,
+	{4|8, PGPR, PMEM, O("\x8D"), EN_RM}, /* LEA r32/64,m32/64 */
+	{ 8, PGPR, PSYM, O("\x8D"), EN_RM}, /* LEA r64,rel32 */
+)
+/* integer add/sub (with INC/DEC shortcuts for +-1) and scalar float arithmetic */
+DEFINSTR2(Xadd,
+	{4|8, PGPR, PGPR, O("\x03"), EN_RR}, /* ADD r32/64, r32/64 */
+	{4|8, PGPR, P1, O("\xFF"), EN_R, .ext=0}, /* INC r32/64 */
+	{4|8, PGPR, PN1, O("\xFF"), EN_R, .ext=1}, /* DEC r32/64 */
+	{4|8, PGPR, PI8, O("\x83"), EN_RI8}, /* ADD r32/64, imm8 */
+	{4|8, PRAX, PI32, O("\x05"), EN_I32}, /* ADD eax/rax, imm */
+	{4|8, PGPR, PI32, O("\x81"), EN_RI32}, /* ADD r32/64, imm */
+	{ 8, PGPR, PMEM, O("\x03"), EN_RM}, /* ADD r64, m64 */
+)
+DEFINSTR2(Xaddf,
+	{4, PFPR, PFPR, O("\xF3\x0F\x58"), EN_RR}, /* ADDSS xmm, xmm */
+	{8, PFPR, PFPR, O("\xF2\x0F\x58"), EN_RR}, /* ADDSD xmm, xmm */
+	{4, PFPR, PMEM, O("\xF3\x0F\x58"), EN_RM}, /* ADDSS xmm, m32 */
+	{8, PFPR, PMEM, O("\xF2\x0F\x58"), EN_RM}, /* ADDSD xmm, m64 */
+)
+DEFINSTR2(Xsub,
+	{4|8, PGPR, PGPR, O("\x2B"), EN_RR}, /* SUB r32/64, r32/64 */
+	{4|8, PGPR, P1, O("\xFF"), EN_R, .ext=1}, /* DEC r32/64 */
+	{4|8, PGPR, PN1, O("\xFF"), EN_R, .ext=0}, /* INC r32/64 */
+	{4|8, PGPR, PI8, O("\x83"), EN_RI8, .ext=5}, /* SUB r32/64, imm8 */
+	{4|8, PRAX, PI32, O("\x2D"), EN_I32}, /* SUB eax/rax, imm */
+	{4|8, PGPR, PI32, O("\x81"), EN_RI32, .ext=5}, /* SUB r32/64, imm */
+	{ 8, PGPR, PMEM, O("\x2B"), EN_RM}, /* SUB r64, m64 */
+)
+DEFINSTR2(Xsubf,
+	{4, PFPR, PFPR, O("\xF3\x0F\x5C"), EN_RR}, /* SUBSS xmm, xmm */
+	{8, PFPR, PFPR, O("\xF2\x0F\x5C"), EN_RR}, /* SUBSD xmm, xmm */
+	{4, PFPR, PMEM, O("\xF3\x0F\x5C"), EN_RM}, /* SUBSS xmm, m32 */
+	{8, PFPR, PMEM, O("\xF2\x0F\x5C"), EN_RM}, /* SUBSD xmm, m64 */
+)
+DEFINSTR2(Xmulf,
+	{4, PFPR, PFPR, O("\xF3\x0F\x59"), EN_RR}, /* MULSS xmm, xmm */
+	{8, PFPR, PFPR, O("\xF2\x0F\x59"), EN_RR}, /* MULSD xmm, xmm */
+	{4, PFPR, PMEM, O("\xF3\x0F\x59"), EN_RM}, /* MULSS xmm, m32 */
+	{8, PFPR, PMEM, O("\xF2\x0F\x59"), EN_RM}, /* MULSD xmm, m64 */
+)
+DEFINSTR2(Xdivf,
+	{4, PFPR, PFPR, O("\xF3\x0F\x5E"), EN_RR}, /* DIVSS xmm, xmm */
+	{8, PFPR, PFPR, O("\xF2\x0F\x5E"), EN_RR}, /* DIVSD xmm, xmm */
+	{4, PFPR, PMEM, O("\xF3\x0F\x5E"), EN_RM}, /* DIVSS xmm, m32 */
+	{8, PFPR, PMEM, O("\xF2\x0F\x5E"), EN_RM}, /* DIVSD xmm, m64 */
+)
+/* bitwise ops; OR/XOR also have xmm alternatives (ORPS/XORPS) */
+DEFINSTR2(Xand,
+	{4|8, PGPR, PGPR, O("\x23"), EN_RR}, /* AND r32/64, r32/64 */
+	{4|8, PGPR, PI8, O("\x83"), EN_RI8, .ext=4}, /* AND r32/64, imm8 */
+	{4|8, PRAX, PI32, O("\x25"), EN_I32}, /* AND eax/rax, imm */
+	{4|8, PGPR, PI32, O("\x81"), EN_RI32, .ext=4}, /* AND r32/64, imm */
+	{ 8, PGPR, PMEM, O("\x23"), EN_RM}, /* AND r64, m64 */
+)
+DEFINSTR2(Xior,
+	{4|8, PGPR, PGPR, O("\x0B"), EN_RR}, /* OR r32/64, r32/64 */
+	{4|8, PGPR, PI8, O("\x83"), EN_RI8, .ext=1}, /* OR r32/64, imm8 */
+	{4|8, PRAX, PI32, O("\x0D"), EN_I32}, /* OR eax/rax, imm */
+	{4|8, PGPR, PI32, O("\x81"), EN_RI32, .ext=1}, /* OR r32/64, imm */
+	{ 8, PGPR, PMEM, O("\x0B"), EN_RM}, /* OR r64, m64 */
+	{4|8, PFPR, PFPR, O("\x0F\x57"), EN_RR}, /* ORPS xmm, xmm */
+)
+DEFINSTR2(Xxor,
+	{4|8, PGPR, PGPR, O("\x33"), EN_RR}, /* XOR r32/64, r32/64 */
+	{4|8, PGPR, PI8, O("\x83"), EN_RI8, .ext=6}, /* XOR r32/64, imm8 */
+	{4|8, PRAX, PI32, O("\x35"), EN_I32}, /* XOR eax/rax, imm */
+	{4|8, PGPR, PI32, O("\x81"), EN_RI32, .ext=6}, /* XOR r32/64, imm */
+	{ 8, PGPR, PMEM, O("\x33"), EN_RM}, /* XOR r64, m64 */
+	{4|8, PFPR, PFPR, O("\x0F\x57"), EN_RR}, /* XORPS xmm, xmm */
+	{4|8, PFPR, PMEM, O("\x0F\x57"), EN_RM}, /* XORPS xmm, m128 */
+)
+/* shifts; count is 1, an immediate, or CL */
+DEFINSTR2(Xshl,
+	{4|8, PGPR, P1, O("\xD1"), EN_R, .ext=4}, /* SHL r32/64, 1 */
+	{4|8, PGPR, PI32, O("\xC1"), EN_RI8, .ext=4}, /* SHL r32/64, imm */
+	{4|8, PGPR, PRCX, O("\xD3"), EN_R, .ext=4}, /* SHL r32/64, CL */
+)
+DEFINSTR2(Xsar,
+	{4|8, PGPR, P1, O("\xD1"), EN_R, .ext=7}, /* SAR r32/64, 1 */
+	{4|8, PGPR, PI32, O("\xC1"), EN_RI8, .ext=7}, /* SAR r32/64, imm */
+	{4|8, PGPR, PRCX, O("\xD3"), EN_R, .ext=7}, /* SAR r32/64, CL */
+)
+DEFINSTR2(Xrolw,
+	{-1, PGPR, PI8, O("\x66\xC1"), EN_RI8}, /* ROL r16, imm */
+)
+DEFINSTR2(Xshr,
+	{4|8, PGPR, P1, O("\xD1"), EN_R, .ext=5}, /* SHR r32/64, 1 */
+	{4|8, PGPR, PI32, O("\xC1"), EN_RI8, .ext=5}, /* SHR r32/64, imm */
+	{4|8, PGPR, PRCX, O("\xD3"), EN_R, .ext=5}, /* SHR r32/64, CL */
+)
+/* float <-> float and int <-> float conversions */
+DEFINSTR2(Xcvtss2sd,
+	{-1, PFPR, PFPR, O("\xF3\x0F\x5A"), EN_RR}, /* CVTSS2SD xmm, xmm */
+	{-1, PFPR, PMEM, O("\xF3\x0F\x5A"), EN_RM}, /* CVTSS2SD xmm, m32/64 */
+)
+DEFINSTR2(Xcvtsd2ss,
+	{-1, PFPR, PFPR, O("\xF2\x0F\x5A"), EN_RR}, /* CVTSD2SS xmm, xmm */
+	{-1, PFPR, PMEM, O("\xF2\x0F\x5A"), EN_RM}, /* CVTSD2SS xmm, m32/64 */
+)
+DEFINSTR2(Xcvtsi2ss,
+	{-1, PFPR, PGPR, O("\xF3\x0F\x2A"), EN_RR}, /* CVTSI2SS xmm, r32/64 */
+	{-1, PFPR, PMEM, O("\xF3\x0F\x2A"), EN_RM}, /* CVTSI2SS xmm, m32/64 */
+)
+DEFINSTR2(Xcvtsi2sd,
+	{-1, PFPR, PGPR, O("\xF2\x0F\x2A"), EN_RR}, /* CVTSI2SD xmm, r32/64 */
+	{-1, PFPR, PMEM, O("\xF2\x0F\x2A"), EN_RM}, /* CVTSI2SD xmm, m32/64 */
+)
+DEFINSTR2(Xcvttss2si,
+	{-1, PGPR, PFPR, O("\xF3\x0F\x2C"), EN_RR}, /* CVTTSS2SI r32/64, xmm */
+	{-1, PGPR, PMEM, O("\xF3\x0F\x2C"), EN_RM}, /* CVTTSS2SI r32/64, m32 */
+)
+DEFINSTR2(Xcvttsd2si,
+	{-1, PGPR, PFPR, O("\xF2\x0F\x2C"), EN_RR}, /* CVTTSD2SI r32/64, xmm */
+	{-1, PGPR, PMEM, O("\xF2\x0F\x2C"), EN_RM}, /* CVTTSD2SI r32/64, m32 */
+)
+/* one-operand group: /ext selects the operation within opcode F7/FF */
+DEFINSTR1(Xneg,
+	{4|8, PGPR, 0, O("\xF7"), EN_R, .ext=3} /* NEG r32/64 */
+)
+DEFINSTR1(Xnot,
+	{4|8, PGPR, 0, O("\xF7"), EN_R, .ext=2} /* NOT r32/64 */
+)
+DEFINSTR1(Xidiv,
+	{4|8, PGPR, 0, O("\xF7"), EN_R, .ext=7}, /* IDIV r32/64 */
+	{4|8, PMEM, 0, O("\xF7"), EN_M, .ext=7}, /* IDIV m32/64 */
+)
+DEFINSTR1(Xdiv,
+	{4|8, PGPR, 0, O("\xF7"), EN_R, .ext=6}, /* DIV r32/64 */
+	{4|8, PMEM, 0, O("\xF7"), EN_M, .ext=6}, /* DIV m32/64 */
+)
+DEFINSTR1(Xbswap,
+	{4|8, PGPR, 0, O("\x0F\xC8"), EN_O}, /* BSWAP r32/64 */
+)
+DEFINSTR1(Xcall,
+	{-1, PSYM, 0, O("\xE8"), EN_R32, .norexw=1}, /* CALL rel32 */
+	{-1, PGPR, 0, O("\xFF"), EN_R, .ext=2, .norexw=1}, /* CALL r64 */
+	{-1, PMEM, 0, O("\xFF"), EN_M, .ext=2, .norexw=1}, /* CALL m64 */
+)
+/* comparisons; float sizes map to UCOMISS/UCOMISD */
+DEFINSTR2(Xcmp,
+	{4|8, PGPR, PGPR, O("\x3B"), EN_RR}, /* CMP r32/64, r32/64 */
+	{4|8, PGPR, PI8, O("\x83"), EN_RI8, .ext=7}, /* CMP r32/64, imm8 */
+	{4|8, PRAX, PI32, O("\x3D"), EN_I32}, /* CMP eax/rax, imm */
+	{4|8, PGPR, PI32, O("\x81"), EN_RI32, .ext=7}, /* CMP r32/64, imm */
+	{ 8, PGPR, PMEM, O("\x3B"), EN_RM}, /* CMP r64, m64 */
+	{4 , PFPR, PFPR, O("\x0F\x2E"), EN_RR}, /* UCOMISS xmm, xmm */
+	{4 , PFPR, PMEM, O("\x0F\x2E"), EN_RM}, /* UCOMISS xmm, m32 */
+	{ 8, PFPR, PFPR, O("\x66\x0F\x2E"), EN_RR}, /* UCOMISD xmm, xmm */
+	{ 8, PFPR, PMEM, O("\x66\x0F\x2E"), EN_RM}, /* UCOMISD xmm, m64 */
+)
+DEFINSTR2(Xtest,
+	{4|8, PRAX, PI8, O("\xA8"), EN_I8, .norexw=1}, /* TEST AL, imm8 */
+	{4, PRAX, PI32, O("\xA9"), EN_I32}, /* TEST EAX, imm32 */
+	{ 8, PRAX, PU32, O("\xA9"), EN_I32, .norexw=1}, /* TEST EAX, imm32 (upper bits irrelevant) */
+	{ 8, PRAX, PI32, O("\xA9"), EN_I32}, /* TEST RAX, imm32 */
+	{4|8, PGPR, PI8, O("\xF6"), EN_RI8, .r8=1,.norexw=1}, /* TEST r8, imm8 */
+	{4|8, PGPR, PI32, O("\xF7"), EN_RI32, .ext=0}, /* TEST r32/64, imm32 */
+	{4|8, PGPR, PGPR, O("\x85"), EN_RR}, /* TEST r32/64, r32/64 */
+	{4|8, PGPR, PMEM, O("\x85"), EN_RM}, /* TEST r32/64, m32/64 */
+)
+
+/* 2-operand IMUL; the 3-operand immediate forms below are driven by Ximul() */
+DEFINSTR2(Ximul2,
+	{4|8, PGPR, PGPR, O("\x0F\xAF"), EN_RR}, /* IMUL r32/64, r32/64 */
+	{4|8, PGPR, PMEM, O("\x0F\xAF"), EN_RM}, /* IMUL r32/64, m32/64 */
+)
+static const struct desc imul3_imm8tab[] = {
+	{4|8, PGPR, PGPR, O("\x6B"), EN_RR}, /* IMUL r32/64, r32/64, (imm8) */
+	{4|8, PGPR, PMEM, O("\x6B"), EN_RM}, /* IMUL r32/64, m32/64, (imm8) */
+}, imul3_imm32tab[] = {
+	{4|8, PGPR, PGPR, O("\x69"), EN_RR}, /* IMUL r32/64, r32/64, (imm32) */
+	{4|8, PGPR, PMEM, O("\x69"), EN_RM}, /* IMUL r32/64, m32/64, (imm32) */
+};
+#undef O
+static void
+Ximul(uchar **pcode, enum irclass k, struct oper dst, struct oper s1, struct oper s2)
+{
+	/* dst = s1 * s2. When dst aliases s1 and the multiplier is not an
+	 * immediate use the 2-operand form; otherwise one of the 3-operand
+	 * immediate forms (the immediate is appended by hand after encode). */
+	if (s2.t != OIMM && memcmp(&dst, &s1, sizeof dst) == 0) {
+		Ximul2(pcode, k, dst, s2);
+		return;
+	}
+	assert(s2.t == OIMM);
+	if ((schar)s2.imm == s2.imm) { /* fits in a signed byte? */
+		encode(pcode, imul3_imm8tab, countof(imul3_imm8tab), k, dst, s1);
+		B(s2.imm);
+	} else {
+		encode(pcode, imul3_imm32tab, countof(imul3_imm32tab), k, dst, s1);
+		I32(s2.imm);
+	}
+}
+
+/* x86 condition codes (low nibble of Jcc/SETcc opcodes) */
+enum cc {
+	CCO = 0x0, /* OF = 1 */
+	CCNO = 0x1, /* OF = 0 */
+	CCB = 0x2, CCC = 0x2, CCNAE = 0x2, /* below; CF = 1; not above or equal */
+	CCAE = 0x3, CCNB = 0x3, CCNC = 0x3, /* above or equal; not below; CF = 0 */
+	CCE = 0x4, CCZ = 0x4, /* equal; ZF = 1 */
+	CCNE = 0x5, CCNZ = 0x5, /* not equal; ZF = 0 */
+	CCBE = 0x6, CCNA = 0x6, /* below or equal; not above; CF=1 or ZF=1 */
+	CCA = 0x7, CCNBE = 0x7, /* above; not below or equal; CF=0 and ZF=0 */
+	CCS = 0x8, /* SF = 1; negative */
+	CCNS = 0x9, /* SF = 0; non-negative */
+	CCP = 0xA, CCPE = 0xA, /* PF = 1; parity even */
+	CCNP = 0xB, CCPO = 0xB, /* PF = 0; parity odd */
+	CCL = 0xC, CCNGE = 0xC, /* lower; not greater or equal; SF != OF */
+	CCGE = 0xD, CCNL = 0xD, /* greater or equal; not lower; SF == OF */
+	CCLE = 0xE, CCNG = 0xE, /* less or equal; not greater; ZF=1 or SF != OF */
+	CCG = 0xF, CCNLE = 0xF, /* greater; not less or equal; ZF=0 and SF == OF */
+	ALWAYS, /* unconditional jump (only meaningful to Xjcc) */
+};
+
+/* maps blk -> address when resolved; or to linked list of jump displacement
+ * relocations (the list is threaded through the rel32 fields of the emitted
+ * jumps themselves; see Xjcc) */
+static struct blkaddr {
+	bool resolved;
+	union {
+		uint addr; /* resolved: code offset of the block */
+		uint relreloc; /* unresolved: head of the backpatch chain */
+	};
+} *blkaddr;
+
+/* Emit a (conditional) jump to dst.
+ * Backward jumps to resolved blocks pick rel8 vs rel32 by distance.
+ * Forward jumps always use rel32 and thread this site into the block's
+ * backpatch chain: the previous chain head is written into the rel32 field
+ * and the head is updated to point at this displacement. */
+static void
+Xjcc(uchar **pcode, enum cc cc, struct block *dst)
+{
+	int disp, insaddr = *pcode - objout.textbegin;
+	bool rel8 = 0;
+
+	if (blkaddr[dst->id].resolved) {
+		disp = blkaddr[dst->id].addr - (insaddr + 2); /* rel8 form is 2 bytes */
+		if ((uint)(disp + 128) < 256) /* can use 1-byte displacement? */
+			rel8 = 1;
+		else { /* otherwise 4-byte displacement */
+			disp -= 3; /* 'JMP rel32' is 3 bytes longer than 'JMP rel8' */
+			disp -= cc != ALWAYS; /* 'Jcc rel32' has 2 opcode bytes */
+		}
+	} else {
+		disp = blkaddr[dst->id].relreloc;
+		blkaddr[dst->id].relreloc = insaddr + 1 + (cc != ALWAYS);
+	}
+	if (cc == ALWAYS) {
+		B(rel8 ? 0xEB : 0xE9); /* JMP rel8/rel32 */
+	} else {
+		assert(in_range(cc, 0, 0xF));
+		if (rel8) B(0x70 + cc); /* Jcc rel8 */
+		else B(0x0F), B(0x80 + cc); /* Jcc rel32 */
+	}
+	if (rel8) B(disp); else I32(disp);
+}
+
+static void
+Xsetcc(uchar **pcode, enum cc cc, enum reg reg)
+{
+	/* emit SETcc r8 */
+	assert(in_range(cc, 0x0, 0xF));
+	assert(in_range(reg, RAX, R15));
+
+	int rexbits = reg >> 3; /* REX.B */
+	/* SP,BP,SI,DI need an (empty) REX so the r8 means SPL..DIL, not AH..BH */
+	if (rexbits || in_range(reg, RSP, RDI))
+		B(0x40 | rexbits);
+	B(0x0F);
+	B(0x90 + cc); /* SETcc */
+	B(0xC0 + (reg & 7)); /* ModR/M with mod=11, rm=reg */
+}
+
+static void
+Xpush(uchar **pcode, enum reg reg)
+{
+	/* GPRs use the 1-byte PUSH; xmm registers have no PUSH, so open a
+	 * stack slot and store into it */
+	if (!in_range(reg, RAX, R15)) {
+		assert(in_range(reg, XMM0, XMM15));
+		DS("\x48\x8d\x64\x24\xF8"); /* LEA RSP, [RSP-8] */
+		Xmov(pcode, KF64, mkoper(OMEM, .base = RSP, .index = NOINDEX), reg2oper(reg)); /* store xmm to [rsp] */
+		return;
+	}
+	if (reg & 8)
+		B(0x41); /* REX.B for R8..R15 */
+	B(0x50 + (reg & 7)); /* PUSH reg */
+}
+
+static void
+Xpop(uchar **pcode, enum reg reg)
+{
+	/* GPRs use the 1-byte POP; xmm registers load from the slot and then
+	 * release it (mirror of Xpush) */
+	if (!in_range(reg, RAX, R15)) {
+		assert(in_range(reg, XMM0, XMM15));
+		Xmov(pcode, KF64, reg2oper(reg), mkoper(OMEM, .base = RSP, .index = NOINDEX)); /* load xmm from [rsp] */
+		DS("\x48\x8d\x64\x24\x08"); /* LEA RSP, [RSP+8] */
+		return;
+	}
+	if (reg & 8)
+		B(0x41); /* REX.B for R8..R15 */
+	B(0x58 + (reg & 7)); /* POP reg */
+}
+
+/* are flags live at given instruction? */
+static bool
+flagslivep(struct block *blk, int curi)
+{
+	/* flags matter only if the block ends in a conditional branch that
+	 * references a comparison instruction */
+	if (blk->jmp.t != Jb || !blk->jmp.arg[0].bits)
+		return 0;
+	assert(blk->jmp.arg[0].t == RTMP);
+	int cmpi = blk->jmp.arg[0].i;
+	/* if the comparison is emitted after curi, the flags it produces are
+	 * not yet live at curi */
+	for (int i = curi + 1; i < blk->ins.n; ++i) {
+		if (blk->ins.p[i] == cmpi)
+			return 0;
+	}
+	/* flags defined before given instruction, live here */
+	return 1;
+}
+
+/* Copy dst = val, with some peephole optimizations:
+ * - LEA forms whose base or index equals dst are lowered to ADD/SHL/MOV
+ *   (only when flags are dead, since those clobber flags)
+ * - zeroing uses XOR dst,dst when flags may be clobbered
+ * - external/PIC symbols load their address through the GOT
+ * - 64-bit constants that don't fit in 32 bits use MOVABS */
+static void
+gencopy(uchar **pcode, enum irclass cls, struct block *blk, int curi, struct oper dst, union ref val)
+{
+	assert(dst.t == OREG);
+	if (val.bits == UNDREF.bits) {
+		/* can be generated by ssa construction, since value is undefined no move is needed */
+		return;
+	}
+	if (val.t == RADDR) {
+		/* this is a LEA, but maybe it can be lowered to a 2-address instruction,
+		 * which may clobber flags */
+		const struct addr *addr = &addrtab.p[val.i];
+		if (flagslivep(blk, curi)) goto Lea;
+		if (addr->base.t != RREG) goto Lea;
+		if (addr->base.bits && dst.reg == mkregoper(addr->base).reg) { /* base = dst */
+			if (addr->index.bits && !addr->disp && !addr->shift){
+				/* lea Rx, [Rx + Ry] -> add Rx, Ry */
+				Xadd(pcode, cls, dst, mkregoper(addr->index));
+				return;
+			} else if (!addr->index.bits) {
+				if (!addr->disp) /* lea Rx, [Rx] -> mov Rx, Rx */
+					Xmov(pcode, cls, dst, dst);
+				else /* lea Rx, [Rx + Imm] -> add Rx, Imm */
+					Xadd(pcode, cls, dst, mkoper(OIMM, .imm = addr->disp));
+				return;
+			}
+		} else if (addr->index.bits && dst.reg == mkregoper(addr->index).reg) { /* index = dst */
+			if (addr->base.bits && !addr->disp && !addr->shift) {
+				/* lea Rx, [Ry + Rx] -> add Rx, Ry */
+				Xadd(pcode, cls, dst, mkregoper(addr->base));
+				return;
+			} else if (!addr->base.bits) {
+				if (!addr->disp && !addr->shift) /* lea Rx, [Rx] -> mov Rx, Rx */
+					Xmov(pcode, cls, dst, dst);
+				else if (!addr->shift) /* lea Rx, [Rx + Imm] -> add Rx, Imm */
+					Xadd(pcode, cls, dst, mkoper(OIMM, .imm = addr->disp));
+				else if (!addr->disp) /* lea Rx, [Rx LSL s] -> shl Rx, s */
+					Xshl(pcode, cls, dst, mkoper(OIMM, .imm = addr->shift));
+				else
+					goto Lea;
+				return;
+			}
+		}
+		/* normal (not 2-address) case */
+	Lea:
+		if (isaddrcon(addr->base,0) && (ccopt.pic || (contab.p[addr->base.i].flag & SFUNC))
+		    && !(contab.p[addr->base.i].flag & SLOCAL)) {
+			/* non-local symbol under PIC (or a function): address must
+			 * come from the GOT, not a plain LEA */
+			assert(!addr->disp && !addr->index.bits);
+			val = addr->base;
+			goto GOTLoad;
+		}
+		Xlea(pcode, cls, dst, ref2oper(val));
+	} else if (val.bits == ZEROREF.bits && dst.t == OREG && (kisflt(cls) || !flagslivep(blk, curi))) {
+		/* dst = 0 -> xor dst, dst; but only if it is ok to clobber flags */
+		Xxor(pcode, kisint(cls) ? KI32 : cls, dst, dst);
+	} else if (isaddrcon(val,0)) {
+		if ((ccopt.pic || (contab.p[val.i].flag & SFUNC)) && (contab.p[val.i].flag & (SLOCAL|SFUNC)) != (SLOCAL|SFUNC)) {
+		GOTLoad:
+			/* for mov reg, [rip(sym@GOTPCREL)] */
+			Xmov(pcode, cls, dst, mkoper(OSYMGOT, .con = val.i, .cindex = NOINDEX));
+		} else {
+			/* for lea reg, [rip(sym)] */
+			Xlea(pcode, cls, dst, mkoper(OSYM, .con = val.i, .cindex = NOINDEX));
+		}
+	} else if (val.t == RXCON && in_range(concls(val), KI64, KPTR)) {
+		/* movabs */
+		assert(dst.t == OREG && in_range(dst.reg, RAX, R15));
+		B(0x48 | (dst.reg >> 3)); /* REX.W (+ REX.B) */
+		B(0xB8 + (dst.reg & 0x7)); /* MOVABS r64, */
+		wr64le(*pcode, intconval(val)); /* imm64 */
+		*pcode += 8;
+	} else {
+		struct oper src = mkimmdatregoper(val);
+		/* skip no-op moves; an int->float register copy is done as a
+		 * 64-bit MOVQ regardless of cls */
+		if (memcmp(&dst, &src, sizeof dst) != 0)
+			Xmov(pcode, cls == KF64 && src.t == OREG && src.reg < XMM0 ? KI64 : cls, dst, src);
+	}
+}
+
static void
Xvaprologue(uchar **pcode, struct function *fn, struct oper sav)
{
   /* Emit the variadic-function prologue: spill the argument registers
    * not consumed by named parameters into the register save area at
    * [rbp + sav.disp] (six 8-byte GPR slots followed by eight 16-byte
    * XMM slots), so va_arg can later read them from memory. */
   uint gpr0 = 0, fpr0 = 0, jmpaddr;
   /* count how many GPR/XMM argument slots the named arguments used */
   for (int i = 0; i < fn->nabiarg; ++i) {
      struct abiarg abi = fn->abiarg[i];
      if (!abi.isstk) {
         if (abi.reg < XMM0) ++gpr0;
         else ++fpr0;
      }
   }
   assert(sav.t == OMEM && sav.base == RBP);
   /* save GPRS (integer argument registers, in ABI order) */
   for (int r = 0; r < 6; ++r) {
      static const char reg[] = {RDI,RSI,RDX,RCX,R8,R9};
      if (r >= gpr0)
         Xmov(pcode, KI64, sav, reg2oper(reg[r]));
      sav.disp += 8;
   }

   /* save FPRs, but only if al is non zero (AL presumably carries the
    * caller's count of vector registers used, per the SysV convention —
    * TODO confirm); the JE displacement is a placeholder patched below */
   if (fpr0 < 8) {
      DS("\x84\xC0"); /* TEST al,al */
      jmpaddr = *pcode - objout.textbegin;
      DS("\x74\xFE"); /* JE rel8 */
   }
   for (int r = 0; r < 8; ++r) {
      if (r >= fpr0)
         Xmovaps(pcode, KF64, sav, reg2oper(XMM0 + r));
      sav.disp += 16;
   }
   if (fpr0 < 8) {/* patch relative jump: rel8 is measured from the end of the 2-byte JE */
      int off = (*pcode - objout.textbegin) - jmpaddr - 2;
      objout.textbegin[jmpaddr+1] = off;
   }
}
+
/* condition code for CMP: maps an IR comparison op to the x86 condition
 * that holds after "cmp l, r".  The float table uses the unsigned-style
 * codes (B/A/BE/AE) because SSE compares report through CF/ZF.  The
 * Oand/Osub rows serve TEST/SUB results consumed as truth values. */
static const uchar icmpop2cc[] = {
   [Oequ] = CCE, [Oneq] = CCNE,
   [Olth] = CCL, [Ogth] = CCG, [Olte] = CCLE, [Ogte] = CCGE,
   [Oulth] = CCB, [Ougth] = CCA, [Oulte] = CCBE, [Ougte] = CCAE,
   [Oand] = CCNE, [Osub] = CCNE,
}, fcmpop2cc[] = {
   [Oequ] = CCE, [Oneq] = CCNE,
   [Olth] = CCB, [Ogth] = CCA, [Olte] = CCBE, [Ogte] = CCAE,
};
/* condition code for TEST reg,reg (compare with zero); TEST clears
 * CF and OF, so signed orderings reduce to sign/zero checks.  The
 * unsigned "< 0" / ">= 0" rows can never/always be true. */
static const uchar icmpzero2cc[] = {
   [Oequ] = CCE, [Oulte] = CCE,
   [Oneq] = CCNE, [Ougth] = CCNE,
   [Olth] = CCS, [Ogte] = CCNS,
   [Olte] = CCLE, [Ogth] = CCG,
   [Oulth] = CCB, [Ougte] = CCAE, /* actually constants */
};
+
/*
 * Emit the machine code for one IR instruction into *pcode.
 * ins->reg is the allocated output register plus one (0 = no output);
 * ins->l / ins->r are the operands.  blk and curi locate the
 * instruction so gencopy can query flag liveness at this point.
 * The emission is two-address aware: three-address forms fall back to
 * LEA tricks or assert the allocator tied the output to an input.
 */
static void
emitinstr(uchar **pcode, struct function *fn, struct block *blk, int curi, struct instr *ins)
{
   struct oper dst, src;
   bool regzeroed;
   enum irclass cls = ins->cls;
   /* X / X1 select a shared two- resp. one-operand emitter for the
    * grouped cases below */
   void (*X)(uchar **, enum irclass, struct oper, struct oper) = NULL;
   void (*X1)(uchar **, enum irclass, struct oper) = NULL;

   switch (ins->op) {
   default:
      fatal(NULL, "x86_64: in %y; unimplemented instr '%s'", fn->name, opnames[ins->op]);
   case Onop: break;
   case Omove:
      /* move to an explicit destination operand */
      dst = ref2oper(ins->l);
      gencopy(pcode, cls, blk, curi, dst, ins->r);
      break;
   case Ocopy:
      /* copy into the allocated output register */
      dst = reg2oper(ins->reg-1);
      gencopy(pcode, cls, blk, curi, dst, ins->l);
      break;
   /* stores: pick the width-specific MOV, then share one emit path */
   case Ostorei8: cls = KI32, X = Xmovb; goto Store;
   case Ostorei16: cls = KI32, X = Xmovw; goto Store;
   case Ostorei32: cls = KI32, X = Xmov; goto Store;
   case Ostorei64: cls = KI64, X = Xmov; goto Store;
   case Ostoref32: cls = KF32, X = Xmov; goto Store;
   case Ostoref64: cls = KF64, X = Xmov; goto Store;
   Store:
      src = mkimmregoper(ins->r);
      X(pcode, cls, mkmemoper(ins->l), src);
      break;
   /* extensions read a register, loads read memory; both funnel into
    * the same MOVSX/MOVZX emitters via the labels below */
   case Oexts8: src = mkregoper(ins->l); goto Movsxb;
   case Oextu8: src = mkregoper(ins->l); goto Movzxb;
   case Oexts16: src = mkregoper(ins->l); goto Movsxw;
   case Oextu16: src = mkregoper(ins->l); goto Movzxw;
   case Oexts32: src = mkregoper(ins->l); goto Movsxl;
   case Oextu32: src = mkregoper(ins->l); goto Movzxl;
   case Oloads8: src = mkmemoper(ins->l); Movsxb: Xmovsxb(pcode, cls, reg2oper(ins->reg-1), src); break;
   case Oloadu8: src = mkmemoper(ins->l); Movzxb: Xmovzxb(pcode, cls, reg2oper(ins->reg-1), src); break;
   case Oloads16: src = mkmemoper(ins->l); Movsxw: Xmovsxw(pcode, cls, reg2oper(ins->reg-1), src); break;
   case Oloadu16: src = mkmemoper(ins->l); Movzxw: Xmovzxw(pcode, cls, reg2oper(ins->reg-1), src); break;
   case Oloads32: src = mkmemoper(ins->l); Movsxl: Xmovsxl(pcode, cls, reg2oper(ins->reg-1), src); break;
   /* a 32-bit MOV implicitly zero-extends to 64 bits, so no MOVZX */
   case Oloadu32: src = mkmemoper(ins->l); Movzxl: Xmov(pcode, KI32, reg2oper(ins->reg-1), src); break;
   case Oloadf32: case Oloadf64: Xmov(pcode, cls, reg2oper(ins->reg-1), mkmemoper(ins->l)); break;
   case Oloadi64: Xmov(pcode, KI64, reg2oper(ins->reg-1), mkmemoper(ins->l)); break;
   /* conversions; for int -> float, cls is reset to the *source* class
    * so the emitter knows the integer operand width */
   case Ocvtf32f64: X = Xcvtss2sd; goto FloatsCvt;
   case Ocvtf64f32: X = Xcvtsd2ss; goto FloatsCvt;
   case Ocvtf32s: X = Xcvttss2si; goto FloatsCvt;
   case Ocvtf64s: X = Xcvttsd2si; goto FloatsCvt;
   case Ocvts32f: X = cls == KF32 ? Xcvtsi2ss : Xcvtsi2sd; cls = KI32; goto FloatsCvt;
   case Ocvts64f: X = cls == KF32 ? Xcvtsi2ss : Xcvtsi2sd; cls = KI64; goto FloatsCvt;
   FloatsCvt:
      X(pcode, cls, reg2oper(ins->reg-1), mkdatregoper(ins->l));
      break;
   case Oadd:
      dst = mkregoper(ins->l);
      if (kisflt(cls)) {
         Xaddf(pcode, cls, dst, mkimmdatregoper(ins->r));
      } else if (ins->reg-1 == dst.reg) { /* two-address add */
         src = ref2oper(ins->r);
         if (src.t == OIMM && src.imm < 0) /* ADD -imm -> SUB imm, for niceness */
            Xsub(pcode, cls, dst, (src.imm = -(uint)src.imm, src));
         else
            Xadd(pcode, cls, dst, src);
      } else if (isregref(ins->r) && ins->reg-1 == mkregoper(ins->r).reg) {
         /* also two-address after swapping operands */
         Xadd(pcode, cls, reg2oper(ins->reg-1), mkimmdatregoper(ins->l));
      } else { /* three-address add (lea) */
         struct oper mem = { OMEM, .base = NOBASE, .index = NOINDEX };
         dst = reg2oper(ins->reg-1);
         addmemoper(&mem, ref2oper(ins->l));
         addmemoper(&mem, ref2oper(ins->r));
         Xlea(pcode, cls, dst, mem);
      }
      break;
   case Osub:
      dst = mkregoper(ins->l);
      if (kisflt(cls)) {
         Xsubf(pcode, cls, dst, mkimmdatregoper(ins->r));
      } else if (!ins->reg) {
         /* no output register: subtract only for its flags -> CMP */
         Xcmp(pcode, cls, mkregoper(ins->l), mkimmdatregoper(ins->r));
      } else if (ins->reg-1 == dst.reg) { /* two-address */
         Xsub(pcode, cls, dst, ref2oper(ins->r));
      } else {
         /* three-address subtract of a constant -> LEA with -imm disp */
         assert(isintcon(ins->r));
         Xlea(pcode, cls, reg2oper(ins->reg-1),
            mkoper(OMEM, .base = mkregoper(ins->l).reg, .index = NOINDEX, .disp = -intconval(ins->r)));
      }
      break;
   case Oshl:
      dst = reg2oper(ins->reg-1);
      src = mkregoper(ins->l);
      if (dst.reg == src.reg)
         Xshl(pcode, cls, dst, mkimmdatregoper(ins->r));
      else {
         /* three-address shift only exists via LEA scaling (shift <= 3) */
         uint sh = ins->r.i;
         assert(ins->r.t == RICON && sh <= 3);
         if (sh == 1) /* shl x, 1 -> lea [x + x] */
            Xlea(pcode, cls, dst, mkoper(OMEM, .base = src.reg, .index = src.reg));
         else /* shl x, n -> lea [x*(1<<n)+0x0] */
            Xlea(pcode, cls, dst, mkoper(OMEM, .base = NOBASE, .index = src.reg, .shift = sh));
      }
      break;
   case Osar: X = Xsar; goto ALU2;
   case Oslr: X = Xshr; goto ALU2;
   case Oand:
      if (!ins->reg) {
         /* no output register: AND only for its flags -> TEST */
         Xtest(pcode, cls, mkregoper(ins->l), mkimmdatregoper(ins->r));
         break;
      }
      X = Xand;
      goto ALU2;
   case Oxor: X = Xxor; goto ALU2;
   case Oior: X = Xior; goto ALU2;
   ALU2:
      /* generic two-address ALU op: output must equal the left input */
      dst = mkregoper(ins->l);
      assert(ins->reg-1 == dst.reg);
      X(pcode, cls, dst, mkimmdatregoper(ins->r));
      break;
   case Oneg: X1 = Xneg; goto ALU1;
   case Onot: X1 = Xnot; goto ALU1;
   ALU1:
      /* generic one-operand ALU op, in place on the left input */
      dst = mkregoper(ins->l);
      assert(ins->reg-1 == dst.reg);
      X1(pcode, cls, dst);
      break;
   case Obswap16:
      dst = mkregoper(ins->l);
      assert(ins->reg-1 == dst.reg);
      if (dst.reg < 4) { /* AX,BX,CX,DX: only these have high-byte registers */
         /* XCHG rH, rL */
         B(0x86), B(0xC4 | dst.reg | (dst.reg)<<3);
      } else {
         /* ROL r16,8 */
         Xrolw(pcode, KI32, dst, mkoper(OIMM, .imm = 8));
      }
      break;
   case Obswap32: case Obswap64: X1 = Xbswap; goto ALU1;
   case Omul:
      if (kisint(cls))
         Ximul(pcode, cls, reg2oper(ins->reg-1), ref2oper(ins->l), ref2oper(ins->r));
      else
         Xmulf(pcode, cls, reg2oper(ins->reg-1), ref2oper(ins->r));
      break;
   case Odiv:
      /* signed divide: sign-extend RAX into RDX (CDQ/CQO), then IDIV */
      switch (cls) {
      default: assert(0);
      case KPTR:
      case KI64: B(0x48); /* REX.W */ /* fallthrough */
      case KI32: B(0x99); /* CDQ/CQO */
         assert(mkregoper(ins->l).reg == RAX);
         Xidiv(pcode, cls, mkdatregoper(ins->r));
         break;
      case KF32: case KF64:
         Xdivf(pcode, cls, reg2oper(ins->reg-1), mkdatregoper(ins->r));
         break;
      }
      break;
   case Oudiv:
      /* unsigned divide: zero RDX instead of sign-extending */
      DS("\x31\xD2"); /* XOR EDX,EDX */
      assert(mkregoper(ins->l).reg == RAX);
      Xdiv(pcode, cls, mkdatregoper(ins->r));
      break;
   case Oequ: case Oneq:
   case Olth: case Ogth: case Olte: case Ogte:
   case Oulth: case Ougth: case Oulte: case Ougte:
      dst = mkregoper(ins->l);
      src = ref2oper(ins->r);
      regzeroed = 0;
      if (ins->reg && dst.reg != ins->reg-1 && (src.t != OREG || src.reg != ins->reg-1)) {
         /* can zero output reg before test instruction (differs from both inputs) */
         /* XXX this doesn't check if a source operand is an addr containing the register */
         struct oper dst = reg2oper(ins->reg-1);
         Xxor(pcode, KI32, dst, dst);
         regzeroed = 1;
      }
      if (kisint(ins->cls) && ins->r.bits == ZEROREF.bits)
         Xtest(pcode, cls, dst, dst);
      else
         Xcmp(pcode, cls, dst, src);
      if (ins->reg) {
         /* materialize the comparison result as 0/1 in the output reg */
         enum cc cc;
         dst = reg2oper(ins->reg-1);
         if (ins->r.bits != ZEROREF.bits) { /* CMP */
            cc = (kisint(ins->cls) ? icmpop2cc : fcmpop2cc)[ins->op];
         } else { /* TEST r,r (CMP r, 0) */
            assert(kisint(ins->cls));
            cc = icmpzero2cc[ins->op];
         }
         if (kisflt(ins->cls)) { /* handle float unordered result (signalled via PF) */
            int unordres = ins->op == Oneq ? 1 : 0; /* unordered makes only != true */
            int rex = 0;
            /* byte access to SPL..DIL needs a REX prefix */
            if (in_range(dst.reg, RSP, RDI)) rex = 0x40;
            rex |= (dst.reg >> 3); /* REX.B */
            int jpoff = 3 + (rex != 0); /* size of the SETcc being jumped over */
            if (regzeroed && unordres == 0) {
               /* if cmp unordered, just jump over the SETcc; result reg was already zeroed */
               B(0x7A), B(jpoff); /* JP <off> */
            } else {
               /* JNP .a
                * MOV r8, 0/1
                * JMP .b
                * .a: SETcc r8
                * .b: MOVZX r, r8
                */
               B(0x7B), B(jpoff+1); /* JNP <off> */
               if (rex) B(rex | 0x40);
               B(0xB0 + (dst.reg & 7)), B(unordres); /* MOV r8, 0/1 */
               B(0xEB), B(jpoff); /* JMP <off> */
            }
         }
         Xsetcc(pcode, cc, dst.reg);
         if (!regzeroed)
            Xmovzxb(pcode, KI32, dst, dst);
      }
      break;
   case Oswap:
      if (kisint(cls))
         Xxchg(pcode, cls, ref2oper(ins->l), mkregoper(ins->r));
      else {
         /* no XCHG for SSE registers: swap via three XORs */
         struct oper l = mkregoper(ins->l), r = mkregoper(ins->r);
         Xxor(pcode, cls, l, r);
         Xxor(pcode, cls, r, l);
         Xxor(pcode, cls, l, r);
      }
      break;
   case Ocall:
      Xcall(pcode, KPTR, ref2oper(ins->l));
      break;
   case Oxvaprologue:
      /* spill unused argument registers for va_arg; ins->l is the save area */
      Xvaprologue(pcode, fn, mkmemoper(ins->l));
      break;
   }
}
+
+static void
+emitbranch(uchar **pcode, struct block *blk)
+{
+ enum cc cc = ALWAYS;
+ assert(blk->s1);
+ if (blk->s2) {
+ /* conditional branch.. */
+ union ref arg = blk->jmp.arg[0];
+ struct block *unord = NULL;
+ assert(arg.t == RTMP);
+ struct instr *ins = &instrtab[arg.i];
+ if ((oiscmp(ins->op) || ins->op == Oand || ins->op == Osub)) {
+ if (ins->r.bits != ZEROREF.bits) {
+ /* for CMP instr */
+ cc = (kisint(ins->cls) ? icmpop2cc : fcmpop2cc)[ins->op];
+ unord = ins->op == Oneq ? blk->s1 : blk->s2;
+ } else {
+ assert(kisint(ins->cls));
+ /* for TEST instr, which modifies ZF and SF and sets CF = OF = 0 */
+ cc = icmpzero2cc[ins->op];
+ }
+ } else {
+ /* implicit by ZF */
+ cc = CCNZ;
+ }
+ if (kisflt(ins->cls)) {
+ /* handle float unordered result */
+ Xjcc(pcode, CCP, unord);
+ }
+ if (blk->s1 == blk->lnext) {
+ /* if s1 is next adjacent block, swap s1,s2 and flip condition to emit a
+ * single jump */
+ struct block *tmp = blk->s1;
+ blk->s1 = blk->s2;
+ blk->s2 = tmp;
+ cc ^= 1;
+ }
+ }
+ /* make sure to fallthru if jumping to next adjacent block */
+ if (blk->s2 || blk->s1 != blk->lnext)
+ Xjcc(pcode, cc, blk->s1);
+ if (blk->s2 && blk->s2 != blk->lnext)
+ Xjcc(pcode, ALWAYS, blk->s2);
+}
+
+static bool
+calleesave(int *npush, uchar **pcode, struct function *fn)
+{
+ bool any = 0;
+ if (rstest(fn->regusage, RBX)) {
+ Xpush(pcode, RBX);
+ ++*npush;
+ any = 1;
+ }
+ for (int r = R12; r <= R15; ++r)
+ if (rstest(fn->regusage, r)) {
+ Xpush(pcode, r);
+ ++*npush;
+ any = 1;
+ }
+ return any;
+}
+
+static void
+calleerestore(uchar **pcode, struct function *fn)
+{
+ for (int r = R15; r >= R12; --r)
+ if (rstest(fn->regusage, r))
+ Xpop(pcode, r);
+ if (rstest(fn->regusage, RBX)) Xpop(pcode, RBX);
+}
+
/* align code using NOPs: pad the text section with the recommended
 * multi-byte NOP encodings until *pcode is 'align'-byte aligned.
 * align must be a power of two <= 16 (the switch covers gaps of 1..15);
 * each pass emits one NOP of up to 9 bytes and the loop re-checks. */
static void
nops(uchar **pcode, int align)
{
   int rem;
   while ((rem = (*pcode - objout.textbegin) & (align - 1)) != 0) {
      switch (align - rem) {
      case 15: case 14: case 13: case 12: case 11: case 10:
      case 9: B(0x66); /* fallthrough: operand-size prefix + 8-byte NOP */
      case 8: DS("\x0f\x1f\x84\x00\x00\x00\x00\x00"); break; /* NOP [rax+rax*1+disp32] */
      case 7: DS("\x0f\x1f\x80\x00\x00\x00\x00"); break;     /* NOP [rax+disp32] */
      case 6: B(0x66); /* fallthrough: prefix + 5-byte NOP */
      case 5: DS("\x0f\x1f\x44\x00\x00"); break;             /* NOP [rax+rax*1+disp8] */
      case 4: DS("\x0f\x1f\x40\x00"); break;                 /* NOP [rax+disp8] */
      case 3: DS("\x0f\x1f\00"); break;                      /* NOP [rax] */
      case 2: B(0x66); /* fallthrough: prefix + 1-byte NOP */
      case 1: B(0x90); break;
      }
   }
}
+
/* Emit one function's machine code: prologue, each block's instructions
 * and terminator, and the epilogue(s); resolves intra-function branch
 * fixups threaded through blkaddr and registers the symbol at the end. */
static void
emitbin(struct function *fn)
{
   struct block *blk;
   uchar **pcode = &objout.code;
   int npush = 0;   /* number of callee-saved registers pushed */
   bool saverestore;

   /* 16-byte align the entry point */
   nops(pcode, 16);
   fnstart = *pcode;
   curfnsym = fn->name;

   /** prologue **/

   /* only use frame pointer in non-leaf functions and functions that use the stack */
   usebp = 0;
   if (!fn->isleaf || fn->stksiz) {
      usebp = 1;
      /* push rbp; mov rbp, rsp */
      DS("\x55\x48\x89\xE5");
   }
   saverestore = calleesave(&npush, pcode, fn);
   /* locals start below the callee-saved register area */
   if (usebp) rbpoff = -npush*8;

   /* ensure stack is 16-byte aligned for function calls: after push rbp,
    * rsp is 16-byte aligned, so pushes plus the local frame must sum to
    * a multiple of 16.  Both branches also leave rbpoff 16-byte aligned.
    * NOTE(review): the +24 vs +8 growth is asymmetric relative to the
    * rbpoff shift — looks like extra slack for the shifted locals;
    * verify against the stack-slot allocator. */
   if (!fn->isleaf && ((fn->stksiz + npush*8) & 0xF) != 0) {
      assert(usebp);
      if ((rbpoff & 0xF) == 0) {
         rbpoff -= 16;
         fn->stksiz += 24;
      } else {
         rbpoff -= 8;
         fn->stksiz += 8;
      }
   }

   if (fn->stksiz != 0) {
      /* sub rsp, <stack size> */
      if (fn->stksiz < 128)
         DS("\x48\x83\xEC"), B(fn->stksiz);
      else if (fn->stksiz == 128)
         DS("\x48\x83\xC4\x80"); /* add rsp, -128: -128 still fits imm8 where +128 would not */
      else
         DS("\x48\x81\xEC"), I32(fn->stksiz);
   }

   if (*pcode - fnstart > 6) {
      /* large prologue -> large epilogue: rewrite every returning block
       * to jump to a single shared exit so the epilogue is emitted once */
      struct block *exit = NULL;
      blk = fn->entry->lprev;
      do {
         if (blk->jmp.t == Jret) {
            if (!exit) {
               if (blk->ins.n == 0) {
                  /* an empty returning block can itself be the exit */
                  exit = blk;
                  continue;
               } else {
                  /* otherwise splice a fresh exit block in after it */
                  exit = newblk(fn);
                  exit->lnext = blk->lnext;
                  exit->lprev = blk;
                  blk->lnext = exit;
                  exit->lnext->lprev = exit;
                  exit->id = fn->nblk++;
                  exit->jmp.t = Jret;
               }
            }
            blk->jmp.t = Jb;
            memset(blk->jmp.arg, 0, sizeof blk->jmp.arg);
            blk->s1 = exit;
         } else if (exit) {
            /* thread jumps to the exit block */
            if (blk->s1 && !blk->s1->ins.n && blk->s1->s1 == exit && !blk->s1->s2) blk->s1 = exit;
            if (blk->s2 && !blk->s2->ins.n && blk->s2->s1 == exit && !blk->s2->s2) blk->s2 = exit;
         }
      } while ((blk = blk->lprev) != fn->entry);
   }

   blkaddr = allocz(fn->passarena, fn->nblk * sizeof *blkaddr, 0);

   blk = fn->entry;
   do {
      struct blkaddr *bb = &blkaddr[blk->id];
      uint bbaddr = *pcode - objout.textbegin;
      assert(!bb->resolved);
      /* resolve forward branches to this block: relreloc heads a list of
       * 4-byte displacement slots, each temporarily holding the offset
       * of the next slot until patched here */
      while (bb->relreloc) {
         uint next;
         memcpy(&next, objout.textbegin + bb->relreloc, 4);
         int disp = bbaddr - bb->relreloc - 4;
         wr32le(objout.textbegin + bb->relreloc, disp);
         bb->relreloc = next;
      }
      bb->resolved = 1;
      bb->addr = bbaddr;

      for (int i = 0; i < blk->ins.n; ++i)
         emitinstr(pcode, fn, blk, i, &instrtab[blk->ins.p[i]]);

      if (blk->jmp.t == Jret) {
         if (blk->lnext != fn->entry && blk->lnext->jmp.t == Jret && blk->lnext->ins.n == 0)
            continue; /* fallthru to next blk's RET */
         /* epilogue: rsp must be moved back over the locals before the
          * pops; with a frame pointer and nothing pushed, LEAVE alone
          * restores rsp from rbp */
         if (fn->stksiz && (saverestore || !usebp))
            Xadd(pcode, KPTR, mkoper(OREG, .reg = RSP), mkoper(OIMM, .imm = fn->stksiz));
         if (saverestore)
            calleerestore(pcode, fn);
         if (usebp) B(0xC9); /* leave */
         B(0xC3); /* ret */
      } else if (blk->jmp.t == Jtrap) {
         DS("\x0F\x0B"); /* UD2 */
      } else emitbranch(pcode, blk);
   } while ((blk = blk->lnext) != fn->entry);
   objdeffunc(fn->name, fn->globl, fnstart - objout.textbegin, *pcode - fnstart);
}
+
+void
+x86_64_emit(struct function *fn)
+{
+ fn->stksiz = alignup(fn->stksiz, 8);
+ if (fn->stksiz > 1<<24) error(NULL, "'%s' stack frame too big", fn->name);
+ emitbin(fn);
+}
+
+/* vim:set ts=3 sw=3 expandtab: */