aboutsummaryrefslogtreecommitdiffhomepage
path: root/x86_64
diff options
context:
space:
mode:
authorlemon <lsof@mailbox.org>2025-12-12 17:40:35 +0100
committerlemon <lsof@mailbox.org>2025-12-12 17:40:35 +0100
commit24bcc929477751b056e81e7772dc2bb3d11ce4a5 (patch)
treef83eb0c32df505f25c828d0a62f17806dc2736b1 /x86_64
parent3cd8e39ff61217a37b41cee47f2682f5291317d6 (diff)
s/amd64/x86_64/
Diffstat (limited to 'x86_64')
-rw-r--r--x86_64/all.h18
-rw-r--r--x86_64/emit.c1388
-rw-r--r--x86_64/isel.c660
-rw-r--r--x86_64/sysv.c313
4 files changed, 2379 insertions, 0 deletions
diff --git a/x86_64/all.h b/x86_64/all.h
new file mode 100644
index 0000000..c0c38ff
--- /dev/null
+++ b/x86_64/all.h
@@ -0,0 +1,18 @@
#include "../ir/ir.h"

/* X-macro list of all x86-64 registers, in hardware encoding order:
 * the 16 general-purpose registers first (RAX = 0 .. R15 = 15), then the
 * 16 SSE registers (XMM0 = 16 .. XMM15 = 31).  The emitter relies on this
 * numbering: reg & 7 is the ModR/M field, reg >> 3 the REX extension bit,
 * and reg >= XMM0 distinguishes floating-point registers. */
#define LIST_REGS(_) \
   _(RAX) _(RCX) _(RDX) _(RBX) _(RSP) _(RBP) _(RSI) _(RDI) \
   _(R8) _(R9) _(R10) _(R11) _(R12) _(R13) _(R14) _(R15) \
   _(XMM0) _(XMM1) _(XMM2) _(XMM3) _(XMM4) _(XMM5) _(XMM6) _(XMM7) \
   _(XMM8) _(XMM9) _(XMM10) _(XMM11) _(XMM12) _(XMM13) _(XMM14) _(XMM15)

/* one enumerator per register, numbered 0..31 in the order listed above */
enum reg {
#define R(r) r,
   LIST_REGS(R)
#undef R
};

void x86_64_isel(struct function *);   /* instruction selection pass */
void x86_64_emit(struct function *);   /* machine-code emission pass */

/* vim:set ts=3 sw=3 expandtab: */
diff --git a/x86_64/emit.c b/x86_64/emit.c
new file mode 100644
index 0000000..4a7d287
--- /dev/null
+++ b/x86_64/emit.c
@@ -0,0 +1,1388 @@
+#include "all.h"
+#include "../obj/obj.h"
+#include "../endian.h"
+
/** Instruction operands **
 *
 * Can be a register, a 32-bit immediate,
 * a memory reference [base + index * scale + disp],
 * or a relocatable reference to some symbol plus a displacement and maybe index*scale
 */
enum operkind { ONONE, OREG, OIMM, OMEM, OSYM };
enum { NOBASE = 63, NOINDEX = 63 };   /* sentinel register numbers meaning "absent" */
struct oper {
   uchar t;   /* enum operkind */
   union {
      struct { uchar base; };                   /* OMEM: base register, or NOBASE */
      struct { uchar cindex : 6, cshift : 2; }; /* OSYM: index register (or NOINDEX) and scale */
   };
   union {
      struct { uchar index, shift; }; /* OMEM: index register (or NOINDEX) and scale */
      ushort con;                     /* OSYM: handle into conht for the symbol */
   };
   union {
      uchar reg; /* OREG */
      int disp;  /* OMEM, OSYM */
      int imm;   /* OIMM */
   };
};
/* build a struct oper with designated initializers, e.g. mkoper(OIMM, .imm = 3) */
#define mkoper(t, ...) ((struct oper){(t), __VA_ARGS__})
/* wrap a register number into an OREG operand, with a range check */
#define reg2oper(R) (assert((uint)(R) <= XMM15), mkoper(OREG, .reg = (R)))
+
+static struct oper mkmemoper(union ref);
+
+static struct oper
+ioper(int i)
+{
+ int reg = instrtab[i].reg - 1;
+ return reg < 0 ? mkoper(ONONE,) : reg2oper(reg);
+}
+
/* Convert an IR reference into an instruction operand:
 * RTMP  -> whatever register/slot that instruction was assigned,
 * RREG  -> that register,
 * RICON -> a small immediate,
 * RXCON -> an immediate when the constant fits in 32 bits (64-bit
 *          constants must fit too, see the assert), or a symbol reference
 *          when the constant has no class,
 * RADDR -> a full memory operand. */
static struct oper
ref2oper(union ref r)
{
   switch (r.t) {
   case RTMP: return ioper(r.i);
   case RREG: return reg2oper(r.i);
   case RICON: return mkoper(OIMM, .imm = r.i);
   case RXCON:
      if (conht[r.i].cls == KI32)
         return mkoper(OIMM, .imm = conht[r.i].i);
      else if (conht[r.i].cls == KI64) {
         vlong i = conht[r.i].i;
         assert(i == (int)i);   /* must be representable as an imm32 here */
         return mkoper(OIMM, .imm = i);
      } else if (!conht[r.i].cls) {
         /* classless constant: a symbol */
         return mkoper(OSYM, .con = r.i, .cindex = NOINDEX);
      }
      assert(0);
   case RADDR: return mkmemoper(r);
   default: assert(0);
   }
}
+
+static void
+addmemoper(struct oper *mem, struct oper add)
+{
+ assert(mem->t == OMEM);
+ if (add.t == OIMM) {
+ mem->disp += add.imm;
+ } else if (add.t == OREG) {
+ if (mem->base == NOBASE)
+ mem->base = add.reg;
+ else if (mem->index == NOINDEX)
+ mem->index = add.reg;
+ else
+ assert(0);
+ }
+}
+
+/* helpers to convert a reference to an operand of a specific kind,
+ * with assertions to make sure nothing went wrong */
+
+static inline struct oper
+mkregoper(union ref r)
+{
+ assert(r.t == RREG || (r.t == RTMP && ioper(r.i).t == OREG));
+ return r.t == RREG ? reg2oper(r.i) : ioper(r.i);
+}
+
+static inline struct oper
+mkimmoper(union ref r)
+{
+ assert(iscon(r) && concls(r) == KI32);
+ return mkoper(OIMM, .imm = intconval(r));
+}
+
+#define ismemref(ref) ((ref).t == RTMP && ioper((ref).i).t == OMEM)
+#define isregref(ref) ((ref).t == RREG || ((ref).t == RTMP && ioper((ref).i).t == OREG))
+
+static inline struct oper
+mkimmregoper(union ref r)
+{
+ assert(isregref(r) || (iscon(r) && concls(r) == KI32));
+ return ref2oper(r);
+}
+
+static inline struct oper
+mkdatregoper(union ref r)
+{
+ assert(isregref(r) || (r.t == RXCON && conht[r.i].deref));
+ return ref2oper(r);
+}
+
+static inline struct oper
+mkimmdatregoper(union ref r)
+{
+ assert(isregref(r) || r.t == RICON || (r.t == RXCON && (conht[r.i].cls == KI32 || conht[r.i].deref)));
+ return ref2oper(r);
+}
+
/* extra displacement applied to non-positive RBP-relative offsets
 * when RBP is in use (see the OMEM path in encode()) */
static int rbpoff;

/* Build a memory (OMEM) or symbolic (OSYM) operand from a reference.
 * Handles plain registers/temporaries used as addresses, full RADDR
 * addresses [base + index*scale + disp] (including symbolic and absolute
 * bases), classless RXCON symbols, and integer-constant addresses. */
static struct oper
mkmemoper(union ref r)
{
   if (r.t == RTMP) {
      struct oper wop = ioper(r.i);
      if (wop.t == OMEM) return wop;   /* temporary lives in a stack slot */
      assert(wop.t == OREG);
      return mkoper(OMEM, .base = wop.reg, .index = NOINDEX);
   } else if (r.t == RADDR) {
      const struct addr *addr = &addrht[r.i];
      struct oper mem;

      assert(addr->shift <= 3);   /* SIB scale is 1, 2, 4 or 8 */
      if (addr->base.t == RTMP && ioper(addr->base.i).t == OMEM) {
         /* base is itself a stack slot: fold index and disp into that slot's operand */
         mem = ioper(addr->base.i);
         if (addr->index.bits) addmemoper(&mem, mkregoper(addr->index));
         assert(!mem.shift);
         mem.shift = addr->shift;
         addmemoper(&mem, mkoper(OIMM, .imm = addr->disp));
         return mem;
      }
      if (isaddrcon(addr->base,0)) {
         /* sym + index*scale + disp */
         return mkoper(OSYM, .con = addr->base.i,
               .cindex = addr->index.bits ? mkregoper(addr->index).reg : NOINDEX,
               .cshift = addr->shift,
               .disp = addr->disp);
      } else if (isintcon(addr->base)) {
         /* absolute address + index*scale */
         assert(!addr->disp);
         return mkoper(OMEM, .base = NOBASE,
               .index = addr->index.bits ? mkregoper(addr->index).reg : NOINDEX,
               .disp = intconval(addr->base),
               .shift = addr->shift);
      } else if (isaddrcon(addr->index,0)) {
         /* base register + sym + disp (symbol sits in the index slot) */
         assert(!addr->shift);
         return mkoper(OSYM, .con = addr->index.i,
               .cindex = addr->base.bits ? mkregoper(addr->base).reg : NOINDEX,
               .disp = addr->disp);
      }
      /* generic [base + index*scale + disp] */
      return mkoper(OMEM, .base = addr->base.bits ? mkregoper(addr->base).reg : NOBASE,
            .index = addr->index.bits ? mkregoper(addr->index).reg : NOINDEX,
            .disp = addr->disp,
            .shift = addr->shift);
   } else if (r.t == RXCON) {
      assert(!conht[r.i].cls);   /* classless constant: a symbol */
      return mkoper(OSYM, .con = r.i, .cindex = NOINDEX);
   } else {
      /* bare register (as [reg]) or integer constant (as [disp]) */
      return mkoper(OMEM, .base = isregref(r) ? ref2oper(r).reg : NOBASE,
            .index = NOINDEX,
            .disp = isregref(r) ? 0 : mkimmoper(r).imm);
   }
}
+
+/** Instruction description tables **
+ *
+ * Each instruction is a list of descs, and the first one that matches
+ * is emitted. Each entry has a size pattern field, which is a bitset
+ * of the sizes (in bytes) that the entry matches, and 2 operand patterns,
+ * which describe the operands that can match (for example, PRAX matches
+ * a RAX register operand, PGPR matches any integer register, I8 matches
+ * an immediate operand between [-128,127]) The rest of the fields describe
+ * the instruction's encoding.
+ * (reference: https://www.felixcloutier.com/x86/ & https://wiki.osdev.org/X86-64_Instruction_Encoding )
+ */
+
/* operand patterns matched by opermatch() */
enum operpat {
   PNONE,
   PRAX,   /* exactly RAX */
   PRCX,   /* exactly RCX (shift counts in CL) */
   PGPR,   /* any general-purpose register RAX..R15 */
   PFPR,   /* any SSE register XMM0..XMM15 */
   P1, /* imm = 1 */
   PN1, /* imm = -1 */
   PI8,    /* imm in [-128,127] */
   PU8,    /* imm in [0,255] */
   PI16,
   PU16,
   PI32,   /* any immediate */
   PU32,   /* non-negative imm (sign-extension == zero-extension) */
   PMEM,   /* OMEM or OSYM */
   PSYM,   /* OSYM only */
};
/* how the matched operands are laid out in the instruction encoding */
enum operenc {
   EN_R = 1, /* reg with /r */
   EN_RR, /* reg, reg with /r */
   EN_RRX, /* reg, reg with /r (inverted) */
   EN_MR, /* mem, reg with /r */
   EN_RM, /* reg, mem with /r */
   EN_M, /* mem */
   EN_RI8, /* reg, imm8 with /0 */
   EN_RI32, /* reg, imm32 with /0 */
   EN_MI8, /* mem, imm8 with /x */
   EN_MI16, /* mem, imm16 with /x */
   EN_MI32, /* mem, imm32 with /x */
   EN_OI, /* reg, imm32 with op + reg */
   EN_I8, /* imm8 */
   EN_I32, /* imm32 */
   EN_R32, /* rel32 */
   NOPERENC,
};
/* one encoding alternative of an instruction; tables of these are
 * scanned in order by encode() and the first match wins */
struct desc {
   uchar psiz; /* subset of {1,2,4,8}; tables use -1 (0xFF) for "any size" */
   uchar ptd, pts; /* operand patterns for dst and src (enum operpat) */
   uchar nopc; /* countof opc */
   const char opc[8]; /* opcode bytes (incl. mandatory 66/F2/F3 prefix, if any) */
   uchar operenc; /* enum operenc */
   uchar ext; /* ModR/M.reg opc extension */
   bool r8; /* uses 8bit register */
   bool norexw; /* do not use REX.W even if size is 64 bits */
};
+
+/* match operand against pattern */
+static inline bool
+opermatch(enum operpat pat, struct oper oper)
+{
+ switch (pat) {
+ case PNONE: return !oper.t;
+ case PRAX: return oper.t == OREG && oper.reg == RAX;
+ case PRCX: return oper.t == OREG && oper.reg == RCX;
+ case PGPR: return oper.t == OREG && oper.reg <= R15;
+ case PFPR: return oper.t == OREG && oper.reg >= XMM0;
+ case P1: return oper.t == OIMM && oper.imm == 1;
+ case PN1: return oper.t == OIMM && oper.imm == -1;
+ case PI8: return oper.t == OIMM && (schar)oper.imm == oper.imm;
+ case PU8: return oper.t == OIMM && (uchar)oper.imm == oper.imm;
+ case PI16: return oper.t == OIMM && (short)oper.imm == oper.imm;
+ case PU16: return oper.t == OIMM && (ushort)oper.imm == oper.imm;
+ case PI32: return oper.t == OIMM;
+ case PU32: return oper.t == OIMM && oper.imm >= 0;
+ case PMEM: return in_range(oper.t, OMEM, OSYM);
+ case PSYM: return oper.t == OSYM;
+ }
+ assert(0);
+}
+
/* code output helpers: every emitter below writes through `uchar **pcode` */
#define B(b) (*(*pcode)++ = (b))                              /* one byte */
#define D(xs, N) (memcpy(*pcode, (xs), (N)), (*pcode) += (N)) /* N raw bytes */
#define I16(w) (wr16le(*pcode, (w)), *pcode += 2)             /* 16-bit little-endian */
#define I32(w) (wr32le(*pcode, (w)), *pcode += 4)             /* 32-bit little-endian */
#define DS(S) D(S, sizeof S - 1)                              /* string literal, minus the NUL */

static bool usebp; /* use RBP? */
static const char *curfnsym;   /* symbol of the function being emitted (for self-calls) */
static uchar *fnstart;         /* code address where that function starts */
+
/* Given an instruction description table, find the first entry that matches
 * the operands (where dst, src are the operands in intel syntax order) and encode it */
static void
encode(uchar **pcode, const struct desc *tab, int ntab, enum irclass k, struct oper dst, struct oper src)
{
   const uchar *opc;
   int nopc;
   struct oper mem;
   enum reg reg;
   const struct desc *en = NULL;
   /* first entry whose size and operand patterns match wins */
   for (int i = 0; i < ntab; ++i) {
      if ((tab[i].psiz & cls2siz[k]) && opermatch(tab[i].ptd, dst) && opermatch(tab[i].pts, src)) {
         en = &tab[i];
         break;
      }
   }
   assert(en && "no match for instr");

   /* XMM registers are 16..31 internally but 0..15 in the encoding */
   if (en->ptd == PFPR) dst.reg &= 15;
   if (en->pts == PFPR) src.reg &= 15;
   opc = (uchar *)en->opc;
   nopc = en->nopc;
   /* mandatory prefixes go before REX */
   if (*opc == 0x66 || *opc == 0xF2 || *opc == 0xF3)
      B(*opc++), --nopc;
   int rex = in_range(k, KI64, KPTR) << 3; /* REX.W */
   if (en->norexw) rex = 0;
   switch (en->operenc) {
   case EN_RR: /* mod = 11; reg = dst; rm = src */
      rex |= (dst.reg >> 3) << 2; /* REX.R */
      rex |= (src.reg >> 3) << 0; /* REX.B */
      if (rex) B(0x40 | rex);
      else if (en->r8 && in_range(src.reg, RSP, RDI)) {
         /* /r8 needs REX to encode SP,BP,SI,DI (otherwise -> AH..BH) */
         B(0x40);
      }
      D(opc, nopc);
      B(0300 | (dst.reg & 7) << 3 | (src.reg & 7));
      break;
   case EN_RRX: /* mod = 11; reg = src; rm = dst */
      rex |= (src.reg >> 3) << 2; /* REX.R */
      rex |= (dst.reg >> 3) << 0; /* REX.B */
      if (rex) B(0x40 | rex);
      else if (en->r8 && in_range(dst.reg, RSP, RDI)) {
         /* /r8 needs REX to encode SP,BP,SI,DI (otherwise -> AH..BH) */
         B(0x40);
      }
      D(opc, nopc);
      B(0300 | (src.reg & 7) << 3 | (dst.reg & 7));
      break;
   case EN_MR:
      mem = dst;
      reg = src.reg;
      goto Mem;
   case EN_RM:
      mem = src;
      reg = dst.reg;
      goto Mem;
   case EN_M: case EN_MI8: case EN_MI16: case EN_MI32:
      mem = dst;
      reg = en->ext;   /* /x opcode extension goes in the reg field */
   Mem:
      if (mem.t == OMEM) {
         if (mem.base != NOBASE) rex |= mem.base >> 3; /* REX.B */
         if (mem.index != NOINDEX) rex |= mem.index >> 3 << 1; /* REX.X */
      } else {
         if (mem.cindex != NOINDEX) rex |= mem.cindex >> 3 << 1; /* REX.X */
      }
      if (en->operenc != EN_M)
         rex |= (reg >> 3) << 2; /* REX.R */
      if (rex) B(0x40 | rex);
      else if (en->r8 && in_range(reg, RSP, RDI)) B(0x40);

      if (mem.t == OSYM) {
         D(opc, nopc);
         if (mem.cindex == NOINDEX) {
            /* %rip(var) */
            /* trailing immediate bytes sit between the disp32 and the end of
             * the instruction, shifting the rel32 anchor by that much */
            static uchar offs[NOPERENC] = { [EN_MI8] = 1, [EN_MI16] = 2, [EN_MI32] = 4 };
            enum relockind r =
               (!conht[mem.con].deref && ccopt.pic) ? (rex ? REL_GOTPCRELX : REL_GOTPCRELX_REX)
               : REL_PCREL32;
            int off = -4 - offs[en->operenc];
            B(/*mod 0*/ (reg & 7) << 3 | RBP);   /* mod=00 rm=101 -> RIP-relative */
            objreloc(xcon2sym(mem.con), r, Stext, *pcode - objout.textbegin, mem.disp + off);
         } else {
            /* var(,%reg,shift) */
            assert(!ccopt.pic && !ccopt.pie && "cannot encode [RIP-rel + REG] for position independent");
            B(/*mod 0*/ (reg & 7) << 3 | RSP);   /* mod=00 rm=100 -> SIB follows */
            B(mem.cshift << 6 | mem.cindex << 3 | RBP); /* SIB [index*s + disp32] */
            objreloc(xcon2sym(mem.con), REL_ABS32S, Stext, *pcode - objout.textbegin, mem.disp);
         }
         I32(0);   /* disp32 placeholder filled by the relocation */
      } else {
         int mod;
         bool sib = 0;
         if (mem.base == RBP) {
            if (!usebp) {
               /* if RBP isn't being set up (leaf functions with no stack allocations),
                * access thru RSP (function arguments in the stack) */
               mem.base = RSP;
               mem.disp -= 8;
            } else if (mem.disp <= 0) {
               /* rebase locals below the frame pointer */
               mem.disp += rbpoff;
            }
         }
         if (mem.base != NOBASE) {
            if (mem.index == NOINDEX && mem.shift == 0) sib = 0;
            else sib = 1;
            mod = !mem.disp ? 0 /* disp = 0 -> mod = 00 */
               : (uint)(mem.disp + 128) < 256 ? 1 /* disp8 -> mod = 01 */
               : 2; /* disp32 -> mod = 10 */
            /* (RBP|R13, mod=00) would mean RIP-relative: force a disp8 */
            if (mod == 0 && (mem.base == RBP || mem.base == R13)) mod = 1;
            /* (RSP|R12) in the rm field selects a SIB byte */
            if (mem.base == RSP || mem.base == R12) sib = 1;
         } else {
            /* [disp + (index*s)] */
            sib = 1;
            mem.base = RBP;   /* SIB base=101 with mod=00 -> no base, disp32 */
            mod = 0;
            assert(mem.index != RSP);   /* RSP in SIB.index means "no index" */
         }
         D(opc, nopc);
         B(mod << 6 | (reg & 7) << 3 | (sib ? 4 : (mem.base & 7)));
         if (sib) {
            if (mem.index == NOINDEX) mem.index = RSP;   /* "no index" encoding */
            B(mem.shift << 6 | (mem.index & 7) << 3 | (mem.base & 7));
         }
         if (mod == 1) B(mem.disp);
         /* note: the second disjunct already covers the third (kept for clarity) */
         else if (mod == 2 || (mod == 0 && mem.base == RBP/*RIP-rel*/) || (mod == 0 && sib && mem.base == RBP/*absolute*/)) {
            I32(mem.disp);
         }
      }
      if (en->operenc == EN_MI8) B(src.imm);
      if (en->operenc == EN_MI16) I16(src.imm);
      if (en->operenc == EN_MI32) I32(src.imm);
      break;
   case EN_R: case EN_RI32: case EN_RI8:
      rex |= (dst.reg >> 3) << 0; /* REX.B */
      if (rex) B(0x40 | rex);
      else if (en->r8 && in_range(dst.reg, RSP, RDI)) {
         /* /r8 needs REX to encode SP,BP,SI,DI (otherwise -> AH..BH) */
         B(0x40);
      }
      D(opc, nopc);
      B(0300 | en->ext << 3 | (dst.reg & 7));
      if (en->operenc == EN_RI32)
         I32(src.imm);
      else if (en->operenc == EN_RI8)
         B(src.imm);
      break;
   case EN_OI: /* register encoded in the last opcode byte */
      rex |= (dst.reg >> 3) << 0; /* REX.B */
      if (rex) B(0x40 | rex);
      B(*opc++ + (dst.reg & 7));
      D(opc, nopc - 1);
      I32(src.imm);
      break;
   case EN_I8:
      if (rex) B(0x40 | rex);
      D(opc, nopc);
      B(src.imm);
      break;
   case EN_I32:
      if (rex) B(0x40 | rex);
      D(opc, nopc);
      I32(src.imm);
      break;
   case EN_R32: /* PC-relative call/jump target */
      if (rex) B(0x40 | rex);
      D(opc, nopc);
      assert(dst.t == OSYM);
      const char *sym = xcon2sym(dst.con);
      if (sym != curfnsym) {
         enum relockind r = (ccopt.pie|ccopt.pic) ? REL_PLT32 : REL_PCREL32;
         objreloc(sym, r, Stext, *pcode - objout.textbegin, -4);
         I32(0);
      } else {
         /* self-recursive call */
         I32(fnstart - *pcode - 4);
      }
      break;
   }
}
+
/* Define an emitter function X taking one (DEFINSTR1) or two (DEFINSTR2)
 * operands; the variadic arguments form its struct desc encoding table,
 * which encode() scans in order. */
#define DEFINSTR1(X, ...) \
   static void \
   X(uchar **pcode, enum irclass k, struct oper oper) \
   { \
      static const struct desc tab[] = { __VA_ARGS__ }; \
      encode(pcode, tab, countof(tab), k, oper, mkoper(0,)); \
   }

#define DEFINSTR2(X, ...) \
   static void \
   X(uchar **pcode, enum irclass k, struct oper dst, struct oper src) \
   { \
      static const struct desc tab[] = { __VA_ARGS__ }; \
      encode(pcode, tab, countof(tab), k, dst, src); \
   }

/* initialize the adjacent .nopc and .opc fields from a byte-string literal */
#define O(s) (sizeof s)-1,s
/* 8- and 16-bit stores (psiz -1 matches every size class; the operand
 * width is implied by the mnemonic, not by k) */
DEFINSTR2(Xmovb,
   {-1, PMEM, PGPR, O("\x88"), EN_MR, .r8=1}, /* MOV m8, r8 */
   {-1, PMEM, PI8, O("\xC6"), EN_MI8, .r8=1}, /* MOV m8, imm8 */
   {-1, PMEM, PU8, O("\xC6"), EN_MI8, .r8=1}, /* MOV m8, imm8 */
)
DEFINSTR2(Xmovw,
   {-1, PMEM, PGPR, O("\x66\x89"), EN_MR}, /* MOV m16, r16 */
   {-1, PMEM, PI16, O("\x66\xC7"), EN_MI16}, /* MOV m16, imm16 */
   {-1, PMEM, PU16, O("\x66\xC7"), EN_MI16}, /* MOV m16, imm16 */
)
/* General 32/64-bit move (integer and SSE).  The table is shared; k2off
 * skips the entries that cannot apply to the given class. */
static void Xmov(uchar **pcode, enum irclass k, struct oper dst, struct oper src)
{
   static const struct desc all[] = {
      {4 , PGPR, PI32, O("\xB8"), EN_OI}, /* MOV r32, imm */
      {4|8, PGPR, PGPR, O("\x8B"), EN_RR}, /* MOV r32/64, r32/64 */
      {4|8, PMEM, PGPR, O("\x89"), EN_MR}, /* MOV m32/64, r32/64 */
      {4|8, PGPR, PMEM, O("\x8B"), EN_RM}, /* MOV r32/64, m32/64 */
      {4|8, PMEM, PI32, O("\xC7"), EN_MI32}, /* MOV m32/64, imm */
      { 8, PGPR, PU32, O("\xB8"), EN_OI, .norexw=1}, /* MOV r64, uimm (r32 form zero-extends) */
      { 8, PGPR, PI32, O("\xC7"), EN_RI32}, /* MOV r64, imm */
      {4 , PFPR, PFPR, O("\x0F\x28"), EN_RR}, /* MOVAPS xmm, xmm */
      {4 , PFPR, PMEM, O("\xF3\x0F\x10"), EN_RM}, /* MOVSS xmm, m32 */
      {4 , PMEM, PFPR, O("\xF3\x0F\x11"), EN_MR}, /* MOVSS m32, xmm */
      {8 , PFPR, PFPR, O("\x0F\x28"), EN_RR}, /* MOVAPS xmm, xmm */
      {8 , PFPR, PMEM, O("\xF2\x0F\x10"), EN_RM}, /* MOVSD xmm, m64 */
      {8 , PMEM, PFPR, O("\xF2\x0F\x11"), EN_MR}, /* MOVSD m64, xmm */
      {4|8, PFPR, PGPR, O("\x66\x0F\x6E"), EN_RR}, /* MOVD/Q xmm, r64/32 */
      {4|8, PGPR, PFPR, O("\x66\x0F\x7E"), EN_RRX}, /* MOVD/Q r64/32, xmm */
   };
   /* per-class first table index */
   static const uchar k2off[] = {
      [KI32] = 0,
      [KI64] = 1, [KPTR] = 1,
      [KF32] = 7,
      [KF64] = 10,
   };
   encode(pcode, all + k2off[k], countof(all) - k2off[k], k, dst, src);
}
/* widening loads/moves; for 32->32 a plain MOV suffices since writing
 * a 32-bit register clears the upper half */
DEFINSTR2(Xmovsxl,
   {8, PGPR, PMEM, O("\x63"), EN_RM}, /* MOVSXD r64, m32 */
   {8, PGPR, PGPR, O("\x63"), EN_RR}, /* MOVSXD r64, r32 */
   {4, PGPR, PMEM, O("\x8B"), EN_RM}, /* MOV r32, m32 */
   {4, PGPR, PGPR, O("\x8B"), EN_RR}, /* MOV r32, r32 */
)
DEFINSTR2(Xmovsxw,
   {4|8, PGPR, PMEM, O("\x0F\xBF"), EN_RM}, /* MOVSX r64, m16 */
   {4|8, PGPR, PGPR, O("\x0F\xBF"), EN_RR}, /* MOVSX r64, r16 */
)
DEFINSTR2(Xmovsxb,
   {4|8, PGPR, PMEM, O("\x0F\xBE"), EN_RM}, /* MOVSX r64, m8 */
   {4|8, PGPR, PGPR, O("\x0F\xBE"), EN_RR, .r8=1}, /* MOVSX r64, r8 */
)
DEFINSTR2(Xmovzxw,
   {4|8, PGPR, PMEM, O("\x0F\xB7"), EN_RM}, /* MOVZX r64, m16 */
   {4|8, PGPR, PGPR, O("\x0F\xB7"), EN_RR}, /* MOVZX r64, r16 */
)
DEFINSTR2(Xmovzxb,
   {4|8, PGPR, PMEM, O("\x0F\xB6"), EN_RM}, /* MOVZX r64, m8 */
   {4|8, PGPR, PGPR, O("\x0F\xB6"), EN_RR, .r8=1}, /* MOVZX r64, r8 */
)
DEFINSTR2(Xmovaps,
   {-1, PMEM, PFPR, O("\x0F\x29"), EN_MR}, /* MOVAPS mem, xmm (16-byte aligned store) */
)
DEFINSTR2(Xxchg,
   {4|8, PGPR, PGPR, O("\x87"), EN_RR}, /* XCHG r32/64, r32/64 */
   {4|8, PGPR, PMEM, O("\x87"), EN_RM}, /* XCHG r32/64, m32/64 */
   {4|8, PMEM, PGPR, O("\x87"), EN_MR}, /* XCHG m32/64, r32/64 */
)
DEFINSTR2(Xlea,
   {4|8, PGPR, PMEM, O("\x8D"), EN_RM}, /* LEA r32/64,m32/64 */
   { 8, PGPR, PSYM, O("\x8D"), EN_RM}, /* LEA rel32 */
)
/* integer add; +1/-1 become INC/DEC */
DEFINSTR2(Xadd,
   {4|8, PGPR, PGPR, O("\x03"), EN_RR}, /* ADD r32/64, r32/64 */
   {4|8, PGPR, P1, O("\xFF"), EN_R, .ext=0}, /* INC r32/64 */
   {4|8, PGPR, PN1, O("\xFF"), EN_R, .ext=1}, /* DEC r32/64 */
   {4|8, PGPR, PI8, O("\x83"), EN_RI8}, /* ADD r32/64, imm8 */
   {4|8, PRAX, PI32, O("\x05"), EN_I32}, /* ADD eax/rax, imm */
   {4|8, PGPR, PI32, O("\x81"), EN_RI32}, /* ADD r32/64, imm */
   { 8, PGPR, PMEM, O("\x03"), EN_RM}, /* ADD r64, m64 */
)
DEFINSTR2(Xaddf,
   {4, PFPR, PFPR, O("\xF3\x0F\x58"), EN_RR}, /* ADDSS xmm, xmm */
   {8, PFPR, PFPR, O("\xF2\x0F\x58"), EN_RR}, /* ADDSD xmm, xmm */
   {4, PFPR, PMEM, O("\xF3\x0F\x58"), EN_RM}, /* ADDSS xmm, m32 */
   {8, PFPR, PMEM, O("\xF2\x0F\x58"), EN_RM}, /* ADDSD xmm, m64 */
)
/* integer subtract; +1/-1 become DEC/INC */
DEFINSTR2(Xsub,
   {4|8, PGPR, PGPR, O("\x2B"), EN_RR}, /* SUB r32/64, r32/64 */
   {4|8, PGPR, P1, O("\xFF"), EN_R, .ext=1}, /* DEC r32/64 */
   {4|8, PGPR, PN1, O("\xFF"), EN_R, .ext=0}, /* INC r32/64 */
   {4|8, PGPR, PI8, O("\x83"), EN_RI8, .ext=5}, /* SUB r32/64, imm8 */
   {4|8, PRAX, PI32, O("\x2D"), EN_I32}, /* SUB eax/rax, imm */
   {4|8, PGPR, PI32, O("\x81"), EN_RI32, .ext=5}, /* SUB r32/64, imm */
   { 8, PGPR, PMEM, O("\x2B"), EN_RM}, /* SUB r64, m64 */
)
DEFINSTR2(Xsubf,
   {4, PFPR, PFPR, O("\xF3\x0F\x5C"), EN_RR}, /* SUBSS xmm, xmm */
   {8, PFPR, PFPR, O("\xF2\x0F\x5C"), EN_RR}, /* SUBSD xmm, xmm */
   {4, PFPR, PMEM, O("\xF3\x0F\x5C"), EN_RM}, /* SUBSS xmm, m32 */
   {8, PFPR, PMEM, O("\xF2\x0F\x5C"), EN_RM}, /* SUBSD xmm, m64 */
)
DEFINSTR2(Xmulf,
   {4, PFPR, PFPR, O("\xF3\x0F\x59"), EN_RR}, /* MULSS xmm, xmm */
   {8, PFPR, PFPR, O("\xF2\x0F\x59"), EN_RR}, /* MULSD xmm, xmm */
   {4, PFPR, PMEM, O("\xF3\x0F\x59"), EN_RM}, /* MULSS xmm, m32 */
   {8, PFPR, PMEM, O("\xF2\x0F\x59"), EN_RM}, /* MULSD xmm, m64 */
)
DEFINSTR2(Xdivf,
   {4, PFPR, PFPR, O("\xF3\x0F\x5E"), EN_RR}, /* DIVSS xmm, xmm */
   {8, PFPR, PFPR, O("\xF2\x0F\x5E"), EN_RR}, /* DIVSD xmm, xmm */
   {4, PFPR, PMEM, O("\xF3\x0F\x5E"), EN_RM}, /* DIVSS xmm, m32 */
   {8, PFPR, PMEM, O("\xF2\x0F\x5E"), EN_RM}, /* DIVSD xmm, m64 */
)
/* bitwise ops; the FPR entries (ORPS/XORPS) operate on SSE registers */
DEFINSTR2(Xand,
   {4|8, PGPR, PGPR, O("\x23"), EN_RR}, /* AND r32/64, r32/64 */
   {4|8, PGPR, PI8, O("\x83"), EN_RI8, .ext=4}, /* AND r32/64, imm8 */
   {4|8, PRAX, PI32, O("\x25"), EN_I32}, /* AND eax/rax, imm */
   {4|8, PGPR, PI32, O("\x81"), EN_RI32, .ext=4}, /* AND r32/64, imm */
   { 8, PGPR, PMEM, O("\x23"), EN_RM}, /* AND r64, m64 */
)
DEFINSTR2(Xior,
   {4|8, PGPR, PGPR, O("\x0B"), EN_RR}, /* OR r32/64, r32/64 */
   {4|8, PGPR, PI8, O("\x83"), EN_RI8, .ext=1}, /* OR r32/64, imm8 */
   {4|8, PRAX, PI32, O("\x0D"), EN_I32}, /* OR eax/rax, imm */
   {4|8, PGPR, PI32, O("\x81"), EN_RI32, .ext=1}, /* OR r32/64, imm */
   { 8, PGPR, PMEM, O("\x0B"), EN_RM}, /* OR r64, m64 */
   {4|8, PFPR, PFPR, O("\x0F\x57"), EN_RR}, /* ORPS xmm, xmm */
)
DEFINSTR2(Xxor,
   {4|8, PGPR, PGPR, O("\x33"), EN_RR}, /* XOR r32/64, r32/64 */
   {4|8, PGPR, PI8, O("\x83"), EN_RI8, .ext=6}, /* XOR r32/64, imm8 */
   {4|8, PRAX, PI32, O("\x35"), EN_I32}, /* XOR eax/rax, imm */
   {4|8, PGPR, PI32, O("\x81"), EN_RI32, .ext=6}, /* XOR r32/64, imm */
   { 8, PGPR, PMEM, O("\x33"), EN_RM}, /* XOR r64, m64 */
   {4|8, PFPR, PFPR, O("\x0F\x57"), EN_RR}, /* XORPS xmm, xmm */
   {4|8, PFPR, PMEM, O("\x0F\x57"), EN_RM}, /* XORPS xmm, m128 */
)
/* shifts: by 1, by immediate, or by CL */
DEFINSTR2(Xshl,
   {4|8, PGPR, P1, O("\xD1"), EN_R, .ext=4}, /* SHL r32/64, 1 */
   {4|8, PGPR, PI32, O("\xC1"), EN_RI8, .ext=4}, /* SHL r32/64, imm */
   {4|8, PGPR, PRCX, O("\xD3"), EN_R, .ext=4}, /* SHL r32/64, CL */
)
DEFINSTR2(Xsar,
   {4|8, PGPR, P1, O("\xD1"), EN_R, .ext=7}, /* SAR r32/64, 1 */
   {4|8, PGPR, PI32, O("\xC1"), EN_RI8, .ext=7}, /* SAR r32/64, imm */
   {4|8, PGPR, PRCX, O("\xD3"), EN_R, .ext=7}, /* SAR r32/64, CL */
)
DEFINSTR2(Xshr,
   {4|8, PGPR, P1, O("\xD1"), EN_R, .ext=5}, /* SHR r32/64, 1 */
   {4|8, PGPR, PI32, O("\xC1"), EN_RI8, .ext=5}, /* SHR r32/64, imm */
   {4|8, PGPR, PRCX, O("\xD3"), EN_R, .ext=5}, /* SHR r32/64, CL */
)
+)
/* float <-> float and int <-> float conversions */
DEFINSTR2(Xcvtss2sd,
   {-1, PFPR, PFPR, O("\xF3\x0F\x5A"), EN_RR}, /* CVTSS2SD xmm, xmm */
   {-1, PFPR, PMEM, O("\xF3\x0F\x5A"), EN_RM}, /* CVTSS2SD xmm, m32/64 */
)
DEFINSTR2(Xcvtsd2ss,
   {-1, PFPR, PFPR, O("\xF2\x0F\x5A"), EN_RR}, /* CVTSD2SS xmm, xmm */
   {-1, PFPR, PMEM, O("\xF2\x0F\x5A"), EN_RM}, /* CVTSD2SS xmm, m32/64 */
)
DEFINSTR2(Xcvtsi2ss,
   {-1, PFPR, PGPR, O("\xF3\x0F\x2A"), EN_RR}, /* CVTSI2SS xmm, r32/64 */
   {-1, PFPR, PMEM, O("\xF3\x0F\x2A"), EN_RM}, /* CVTSI2SS xmm, m32/64 */
)
DEFINSTR2(Xcvtsi2sd,
   {-1, PFPR, PGPR, O("\xF2\x0F\x2A"), EN_RR}, /* CVTSI2SD xmm, r32/64 */
   {-1, PFPR, PMEM, O("\xF2\x0F\x2A"), EN_RM}, /* CVTSI2SD xmm, m32/64 */
)
DEFINSTR2(Xcvttss2si,
   {-1, PGPR, PFPR, O("\xF3\x0F\x2C"), EN_RR}, /* CVTTSS2SI r32/64, xmm */
   {-1, PGPR, PMEM, O("\xF3\x0F\x2C"), EN_RM}, /* CVTTSS2SI r32/64, m32 */
)
DEFINSTR2(Xcvttsd2si,
   {-1, PGPR, PFPR, O("\xF2\x0F\x2C"), EN_RR}, /* CVTTSD2SI r32/64, xmm */
   {-1, PGPR, PMEM, O("\xF2\x0F\x2C"), EN_RM}, /* CVTTSD2SI r32/64, m32 */
)
DEFINSTR1(Xneg,
   {4|8, PGPR, 0, O("\xF7"), EN_R, .ext=3} /* NEG r32/64 */
)
DEFINSTR1(Xnot,
   {4|8, PGPR, 0, O("\xF7"), EN_R, .ext=2} /* NOT r32/64 */
)
/* divide RDX:RAX (or EDX:EAX) by the operand */
DEFINSTR1(Xidiv,
   {4|8, PGPR, 0, O("\xF7"), EN_R, .ext=7}, /* IDIV r32/64 */
   {4|8, PMEM, 0, O("\xF7"), EN_M, .ext=7}, /* IDIV m32/64 */
)
DEFINSTR1(Xdiv,
   {4|8, PGPR, 0, O("\xF7"), EN_R, .ext=6}, /* DIV r32/64 */
   {4|8, PMEM, 0, O("\xF7"), EN_M, .ext=6}, /* DIV m32/64 */
)
DEFINSTR1(Xcall,
   {-1, PSYM, 0, O("\xE8"), EN_R32, .norexw=1}, /* CALL rel32 */
   {-1, PGPR, 0, O("\xFF"), EN_R, .ext=2, .norexw=1}, /* CALL r64 */
   {-1, PMEM, 0, O("\xFF"), EN_M, .ext=2, .norexw=1}, /* CALL m64 */
)
/* compare: integer CMP, or unordered float compare (UCOMISS/UCOMISD) */
DEFINSTR2(Xcmp,
   {4|8, PGPR, PGPR, O("\x3B"), EN_RR}, /* CMP r32/64, r32/64 */
   {4|8, PGPR, PI8, O("\x83"), EN_RI8, .ext=7}, /* CMP r32/64, imm8 */
   {4|8, PRAX, PI32, O("\x3D"), EN_I32}, /* CMP eax/rax, imm */
   {4|8, PGPR, PI32, O("\x81"), EN_RI32, .ext=7}, /* CMP r32/64, imm */
   { 8, PGPR, PMEM, O("\x3B"), EN_RM}, /* CMP r64, m64 */
   {4 , PFPR, PFPR, O("\x0F\x2E"), EN_RR}, /* UCOMISS xmm, xmm */
   {4 , PFPR, PMEM, O("\x0F\x2E"), EN_RM}, /* UCOMISS xmm, m32 */
   { 8, PFPR, PFPR, O("\x66\x0F\x2E"), EN_RR}, /* UCOMISD xmm, xmm */
   { 8, PFPR, PMEM, O("\x66\x0F\x2E"), EN_RM}, /* UCOMISD xmm, m64 */
)
/* NOTE(review): the AL/r8 imm8 forms test only the low 8 bits of the
 * register — selection presumably only uses them with masks confined to
 * the low byte; confirm against isel */
DEFINSTR2(Xtest,
   {4|8, PRAX, PI8, O("\xA8"), EN_I8}, /* TEST AL, imm8 */
   {4, PRAX, PI32, O("\xA9"), EN_I32}, /* TEST EAX, imm32 */
   { 8, PRAX, PU32, O("\xA9"), EN_I32}, /* TEST RAX, uimm32 (sign-ext == zero-ext) */
   { 8, PRAX, PI32, O("\xA9"), EN_I32}, /* TEST RAX, imm32 */
   {4|8, PGPR, PI8, O("\xF6"), EN_RI8, .r8=1,.norexw=1}, /* TEST r8, imm8 */
   {4|8, PGPR, PI32, O("\xF7"), EN_RI32, .ext=0}, /* TEST r32/64, imm32 */
   {4|8, PGPR, PGPR, O("\x85"), EN_RR}, /* TEST r32/64, r32/64 */
   {4|8, PGPR, PMEM, O("\x85"), EN_RM}, /* TEST r32/64, m32/64 */
)
+
DEFINSTR2(Ximul2,
   {4|8, PGPR, PGPR, O("\x0F\xAF"), EN_RR}, /* IMUL r32/64, r32/64 */
   {4|8, PGPR, PMEM, O("\x0F\xAF"), EN_RM}, /* IMUL r32/64, m32/64 */
)
/* 3-operand IMUL tables; the trailing immediate is appended by Ximul() */
static const struct desc imul3_imm8tab[] = {
   {4|8, PGPR, PGPR, O("\x6B"), EN_RR}, /* IMUL r32/64, r32/64, (imm8) */
   {4|8, PGPR, PMEM, O("\x6B"), EN_RM}, /* IMUL r32/64, m32/64, (imm8) */
}, imul3_imm32tab[] = {
   {4|8, PGPR, PGPR, O("\x69"), EN_RR}, /* IMUL r32/64, r32/64, (imm32) */
   {4|8, PGPR, PMEM, O("\x69"), EN_RM}, /* IMUL r32/64, m32/64, (imm32) */
};
#undef O
/* IMUL dst = s1 * s2: uses the 2-operand form when dst == s1 and s2 is
 * not an immediate, otherwise the 3-operand immediate form */
static void
Ximul(uchar **pcode, enum irclass k, struct oper dst, struct oper s1, struct oper s2)
{
   if (!memcmp(&dst, &s1, sizeof dst) && s2.t != OIMM) {
      Ximul2(pcode, k, dst, s2);
      return;
   }
   assert(s2.t == OIMM);
   if ((uint)(s2.imm + 128) < 256) {   /* fits in imm8? */
      encode(pcode, imul3_imm8tab, countof(imul3_imm8tab), k, dst, s1);
      B(s2.imm);
   } else {
      encode(pcode, imul3_imm32tab, countof(imul3_imm32tab), k, dst, s1);
      I32(s2.imm);
   }
}
+
/* x86 condition codes, numbered as in the Jcc/SETcc opcode low nibble */
enum cc {
   CCO = 0x0, /* OF = 1 */
   CCNO = 0x1, /* OF = 0 */
   CCB = 0x2, CCC = 0x2, CCNAE = 0x2, /* below; CF = 1; not above or equal */
   CCAE = 0x3, CCNB = 0x3, CCNC = 0x3, /* above or equal; not below; CF = 0 */
   CCE = 0x4, CCZ = 0x4, /* equal; ZF = 1 */
   CCNE = 0x5, CCNZ = 0x5, /* not equal; ZF = 0 */
   CCBE = 0x6, CCNA = 0x6, /* below or equal; not above; CF=1 or ZF=1 */
   CCA = 0x7, CCNBE = 0x7, /* above; not below or equal; CF=0 and ZF=0 */
   CCS = 0x8, /* SF = 1; negative */
   CCNS = 0x9, /* SF = 0; non-negative */
   CCP = 0xA, CCPE = 0xA, /* PF = 1; parity even */
   CCNP = 0xB, CCPO = 0xB, /* PF = 0; parity odd */
   CCL = 0xC, CCNGE = 0xC, /* lower; not greater or equal; SF != OF */
   CCGE = 0xD, CCNL = 0xD, /* greater or equal; not lower; SF == OF */
   CCLE = 0xE, CCNG = 0xE, /* less or equal; not greater; ZF=1 or SF != OF */
   CCG = 0xF, CCNLE = 0xF, /* greater; not less or equal; ZF=0 and SF == OF */
   ALWAYS,   /* unconditional (JMP) */
};
+
/* maps blk -> address when resolved; or to linked list of jump displacement
 * relocations (the list is threaded through the placeholder disp32 fields
 * of already-emitted forward jumps; see Xjcc) */
static struct blkaddr {
   bool resolved;
   union {
      uint addr;      /* resolved: text-section offset of the block */
      uint relreloc;  /* unresolved: offset of the most recent patch site */
   };
} *blkaddr;
static uint nblkaddr;
+
/* Emit a jump to block dst, conditional on cc (or unconditional for
 * ALWAYS).  Jumps to an already-resolved (backward) target use the short
 * rel8 form when the displacement fits; unresolved (forward) targets
 * always get the rel32 form, and the displacement field is pushed onto
 * the target's patch list to be fixed up once its address is known. */
static void
Xjcc(uchar **pcode, enum cc cc, struct block *dst)
{
   int disp, insaddr = *pcode - objout.textbegin;
   bool rel8 = 0;

   if (blkaddr[dst->id].resolved) {
      /* disp is first computed assuming the 2-byte short form */
      disp = blkaddr[dst->id].addr - (insaddr + 2);
      if ((uint)(disp + 128) < 256) /* can use 1-byte displacement? */
         rel8 = 1;
      else { /* otherwise 4-byte displacement */
         disp -= 3;   /* 'JMP rel32' is 5 bytes, not 2 */
         disp -= cc != ALWAYS; /* 'Jcc rel32' has 2 opcode bytes */
      }
   } else {
      /* thread this site onto the target's patch list: the emitted disp32
       * temporarily stores the previous list head */
      disp = blkaddr[dst->id].relreloc;
      blkaddr[dst->id].relreloc = insaddr + 1 + (cc != ALWAYS);
   }
   if (cc == ALWAYS) {
      B(rel8 ? 0xEB : 0xE9); /* JMP rel8/rel32 */
   } else {
      assert(in_range(cc, 0, 0xF));
      if (rel8) B(0x70 + cc); /* Jcc rel8 */
      else B(0x0F), B(0x80 + cc); /* Jcc rel32 */
   }
   if (rel8) B(disp); else I32(disp);
}
+
+static void
+Xsetcc(uchar **pcode, enum cc cc, enum reg reg)
+{
+ int rex = 0;
+ assert(in_range(cc, 0x0, 0xF));
+ assert(in_range(reg, RAX, R15));
+
+ if (in_range(reg, RSP, RDI)) rex = 0x40;
+ rex |= (reg >> 3); /* REX.B */
+ if (rex) B(rex | 0x40);
+ B(0x0F), B(0x90+cc); /* SETcc */
+ B(0xC0 + (reg & 7)); /* ModR/M with mod=11, rm=reg */
+}
+
+static void
+Xpush(uchar **pcode, enum reg reg)
+{
+ if (in_range(reg, RAX, R15)) {
+ if (reg >> 3) B(0x41); /* REX.B */
+ B(0x50 + (reg & 7)); /* PUSH reg */
+ } else {
+ assert(in_range(reg, XMM0, XMM15));
+ DS("\x48\x8d\x64\x24\xF8"); /* LEA RSP, [RSP-8] */
+ Xmov(pcode, KF64, mkoper(OMEM, .base = RSP, .index = NOINDEX), reg2oper(reg)); /* MOVD [rsp],xmm0 */
+ }
+}
+
+static void
+Xpop(uchar **pcode, enum reg reg)
+{
+ if (in_range(reg, RAX, R15)) {
+ if (reg >> 3) B(0x41); /* REX.B */
+ B(0x58 + (reg & 7)); /* POP reg */
+ } else {
+ assert(in_range(reg, XMM0, XMM15));
+ Xmov(pcode, KF64, reg2oper(reg), mkoper(OMEM, .base = RSP, .index = NOINDEX)); /* MOVD xmm0,[rsp] */
+ DS("\x48\x8d\x64\x24\x08"); /* LEA RSP, [RSP+8] */
+ }
+}
+
+/* are flags live at given instruction? */
+static bool
+flagslivep(struct block *blk, int curi)
+{
+ int cmpi;
+ /* conditional branch that references a previous comparison instruction? */
+ if (blk->jmp.t != Jb || !blk->jmp.arg[0].bits)
+ return 0;
+ assert(blk->jmp.arg[0].t == RTMP);
+ cmpi = blk->jmp.arg[1].i;
+ for (int i = blk->ins.n - 1; i > curi; --i) {
+ if (blk->ins.p[i] == cmpi)
+ /* flags defined after given instruction, dead here */
+ return 0;
+ }
+ /* flags defined before given instruction, live here */
+ return 1;
+}
+
/* Copy dst = val, with some peephole optimizations:
 * - undefined values emit nothing
 * - LEA whose destination equals its base/index is lowered to ADD/SHL/MOV
 *   (only when the flags are dead, since those clobber them)
 * - zeroing uses XOR when the flags are dead
 * - symbols load via GOT (PIC) or RIP-relative LEA
 * - 64-bit constants use MOVABS */
static void
gencopy(uchar **pcode, enum irclass cls, struct block *blk, int curi, struct oper dst, union ref val)
{
   assert(dst.t == OREG);
   if (val.bits == UNDREF.bits) {
      /* can be generated by ssa construction, since value is undefined no move is needed */
      return;
   }
   if (val.t == RADDR) {
      /* this is a LEA, but maybe it can be lowered to a 2-address instruction,
       * which may clobber flags */
      const struct addr *addr = &addrht[val.i];
      if (flagslivep(blk, curi)) goto Lea;
      if (addr->base.t != RREG) goto Lea;
      if (addr->base.bits && dst.reg == mkregoper(addr->base).reg) { /* base = dst */
         if (addr->index.bits && !addr->disp && !addr->shift){
            /* lea Rx, [Rx + Ry] -> add Rx, Ry */
            Xadd(pcode, cls, dst, mkregoper(addr->index));
            return;
         } else if (!addr->index.bits) {
            if (!addr->disp) /* lea Rx, [Rx] -> mov Rx, Rx */
               Xmov(pcode, cls, dst, dst);
            else /* lea Rx, [Rx + Imm] -> add Rx, Imm */
               Xadd(pcode, cls, dst, mkoper(OIMM, .imm = addr->disp));
            return;
         }
      } else if (addr->index.bits && dst.reg == mkregoper(addr->index).reg) { /* index = dst */
         if (addr->base.bits && !addr->disp && !addr->shift) {
            /* lea Rx, [Ry + Rx] -> add Rx, Ry */
            Xadd(pcode, cls, dst, mkregoper(addr->base));
            return;
         } else if (!addr->base.bits) {
            if (!addr->disp && !addr->shift) /* lea Rx, [Rx] -> mov Rx, Rx */
               Xmov(pcode, cls, dst, dst);
            else if (!addr->shift) /* lea Rx, [Rx + Imm] -> add Rx, Imm */
               Xadd(pcode, cls, dst, mkoper(OIMM, .imm = addr->disp));
            else if (!addr->disp) /* lea Rx, [Rx LSL s] -> shl Rx, s */
               Xshl(pcode, cls, dst, mkoper(OIMM, .imm = addr->shift));
            else
               goto Lea;
            return;
         }
      }
      /* normal (not 2-address) case */
   Lea:
      if (isaddrcon(addr->base,0) && ccopt.pic) {
         /* bare symbol address under PIC: must go through the GOT */
         assert(!addr->disp && !addr->index.bits);
         val = addr->base;
         goto GOTLoad;
      }
      Xlea(pcode, cls, dst, ref2oper(val));
   } else if (val.bits == ZEROREF.bits && dst.t == OREG && (kisflt(cls) || !flagslivep(blk, curi))) {
      /* dst = 0 -> xor dst, dst; but only if it is ok to clobber flags */
      /* (32-bit form suffices for integers: it zero-extends) */
      Xxor(pcode, kisint(cls) ? KI32 : cls, dst, dst);
   } else if (isaddrcon(val,0)) {
      /* note: GOTLoad is jumped to from the RADDR case above */
      if (ccopt.pic) GOTLoad:
         /* for mov reg, [rip(sym@GOTPCREL)] */
         Xmov(pcode, cls, dst, mkoper(OSYM, .con = val.i, .cindex = NOINDEX));
      else
         /* for lea reg, [rip(sym)] */
         Xlea(pcode, cls, dst, mkoper(OSYM, .con = val.i, .cindex = NOINDEX));
   } else if (val.t == RXCON && in_range(concls(val), KI64, KPTR)) {
      /* movabs */
      assert(dst.t == OREG && in_range(dst.reg, RAX, R15));
      B(0x48 | (dst.reg >> 3)); /* REX.W (+ REX.B) */
      B(0xB8 + (dst.reg & 0x7)); /* MOVABS r64, */
      wr64le(*pcode, intconval(val)); /* imm64 */
      *pcode += 8;
   } else {
      /* plain register/immediate copy; skip it when src == dst already */
      struct oper src = mkimmdatregoper(val);
      if (memcmp(&dst, &src, sizeof dst) != 0)
         Xmov(pcode, cls == KF64 && src.t == OREG && src.reg < XMM0 ? KI64 : cls, dst, src);
   }
}
+
+/* Emit the variadic-prologue spill: dump the integer argument registers
+ * (rdi,rsi,rdx,rcx,r8,r9) not consumed by named parameters, and - guarded
+ * by a runtime TEST of %al (number of SSE registers the caller used) -
+ * the eight xmm argument registers, into the rbp-relative save area 'sav'. */
+static void
+Xvaprologue(uchar **pcode, struct function *fn, struct oper sav)
+{
+   uint gpr0 = 0, fpr0 = 0, jmpaddr;
+   /* count GP/SSE registers taken by named parameters; only the rest can
+    * carry variadic arguments and need saving */
+   for (int i = 0; i < fn->nabiarg; ++i) {
+      struct abiarg abi = fn->abiarg[i];
+      if (!abi.isstk) {
+         if (abi.reg < XMM0) ++gpr0;
+         else ++fpr0;
+      }
+   }
+   assert(sav.t == OMEM && sav.base == RBP);
+   /* save GPRS */
+   for (int r = 0; r < 6; ++r) {
+      static const char reg[] = {RDI,RSI,RDX,RCX,R8,R9};
+      if (r >= gpr0)
+         Xmov(pcode, KI64, sav, reg2oper(reg[r]));
+      sav.disp += 8;
+   }
+
+   /* save FPRs, but only if al is non zero */
+   if (fpr0 < 8) {
+      DS("\x84\xC0"); /* TEST al,al */
+      jmpaddr = *pcode - objout.textbegin;
+      DS("\x74\xFE"); /* JE rel8; displacement patched below */
+   }
+   for (int r = 0; r < 8; ++r) {
+      if (r >= fpr0)
+         Xmovaps(pcode, KF64, sav, reg2oper(XMM0 + r));
+      sav.disp += 16;
+   }
+   if (fpr0 < 8) {/* patch relative jump */
+      int off = (*pcode - objout.textbegin) - jmpaddr - 2;
+      objout.textbegin[jmpaddr+1] = off;
+   }
+}
+
+/* condition code for CMP, indexed by IR compare op; the Oand/Osub entries
+ * serve branches keyed directly off a TEST/SUB flag result */
+static const uchar icmpop2cc[] = {
+   [Oequ] = CCE, [Oneq] = CCNE,
+   [Olth] = CCL, [Ogth] = CCG, [Olte] = CCLE, [Ogte] = CCGE,
+   [Oulth] = CCB, [Ougth] = CCA, [Oulte] = CCBE, [Ougte] = CCAE,
+   [Oand] = CCNE, [Osub] = CCNE,
+}, fcmpop2cc[] = { /* float compares use the below/above family of codes */
+   [Oequ] = CCE, [Oneq] = CCNE,
+   [Olth] = CCB, [Ogth] = CCA, [Olte] = CCBE, [Ogte] = CCAE,
+};
+/* condition code for TEST reg,reg (compare with zero) */
+static const uchar icmpzero2cc[] = {
+   [Oequ] = CCE, [Oulte] = CCE,
+   [Oneq] = CCNE, [Ougth] = CCNE,
+   [Olth] = CCS, [Ogte] = CCNS,
+   [Olte] = CCLE, [Ogth] = CCG,
+   [Oulth] = CCB, [Ougte] = CCAE, /* actually constants */
+};
+
+/* Emit the machine encoding of one selected instruction. 'ins->reg' holds
+ * the allocated result register biased by one (0 = no register result). */
+static void
+emitinstr(uchar **pcode, struct function *fn, struct block *blk, int curi, struct instr *ins)
+{
+   struct oper dst, src;
+   bool regzeroed;
+   enum irclass cls = ins->cls;
+   void (*X)(uchar **, enum irclass, struct oper, struct oper) = NULL;
+   void (*X1)(uchar **, enum irclass, struct oper) = NULL;
+
+   switch (ins->op) {
+   default:
+      fatal(NULL, "x86_64: in %y; unimplemented instr '%s'", fn->name, opnames[ins->op]);
+   case Onop: break;
+   case Ostore8: cls = KI32, X = Xmovb; goto Store;
+   case Ostore16: cls = KI32, X = Xmovw; goto Store;
+   case Ostore32: cls = KI32, X = Xmov; goto Store;
+   case Ostore64: cls = KI64, X = Xmov;
+   Store:
+      src = mkimmregoper(ins->r);
+      /* an SSE source register turns the store into a float store of equal width */
+      if (cls == KI32 && src.t == OREG && src.reg >= XMM0) cls = KF32;
+      if (cls == KI64 && src.t == OREG && src.reg >= XMM0) cls = KF64;
+      X(pcode, cls, mkmemoper(ins->l), src);
+      break;
+   case Oexts8: src = mkregoper(ins->l); goto Movsxb;
+   case Oextu8: src = mkregoper(ins->l); goto Movzxb;
+   case Oexts16: src = mkregoper(ins->l); goto Movsxw;
+   case Oextu16: src = mkregoper(ins->l); goto Movzxw;
+   case Oexts32: src = mkregoper(ins->l); goto Movsxl;
+   case Oextu32: src = mkregoper(ins->l); goto Movzxl;
+   case Oloads8: src = mkmemoper(ins->l); Movsxb: Xmovsxb(pcode, cls, reg2oper(ins->reg-1), src); break;
+   case Oloadu8: src = mkmemoper(ins->l); Movzxb: Xmovzxb(pcode, cls, reg2oper(ins->reg-1), src); break;
+   case Oloads16: src = mkmemoper(ins->l); Movsxw: Xmovsxw(pcode, cls, reg2oper(ins->reg-1), src); break;
+   case Oloadu16: src = mkmemoper(ins->l); Movzxw: Xmovzxw(pcode, cls, reg2oper(ins->reg-1), src); break;
+   case Oloads32: src = mkmemoper(ins->l); Movsxl: Xmovsxl(pcode, cls, reg2oper(ins->reg-1), src); break;
+   case Oloadu32: src = mkmemoper(ins->l); Movzxl: Xmov(pcode, KI32, reg2oper(ins->reg-1), src); break; /* 32-bit mov zero-extends */
+   case Oloadf32: case Oloadf64: Xmov(pcode, cls, reg2oper(ins->reg-1), mkmemoper(ins->l)); break;
+   case Oloadi64: Xmov(pcode, KI64, reg2oper(ins->reg-1), mkmemoper(ins->l)); break;
+   case Ocvtf32f64: X = Xcvtss2sd; goto FloatsCvt;
+   case Ocvtf64f32: X = Xcvtsd2ss; goto FloatsCvt;
+   case Ocvtf32s: X = Xcvttss2si; goto FloatsCvt;
+   case Ocvtf64s: X = Xcvttsd2si; goto FloatsCvt;
+   case Ocvts32f: X = cls == KF32 ? Xcvtsi2ss : Xcvtsi2sd; cls = KI32; goto FloatsCvt;
+   case Ocvts64f: X = cls == KF32 ? Xcvtsi2ss : Xcvtsi2sd; cls = KI64; goto FloatsCvt;
+   FloatsCvt:
+      X(pcode, cls, reg2oper(ins->reg-1), mkdatregoper(ins->l));
+      break;
+   case Oadd:
+      dst = mkregoper(ins->l);
+      if (kisflt(cls)) {
+         Xaddf(pcode, cls, dst, mkimmdatregoper(ins->r));
+      } else if (ins->reg-1 == dst.reg) { /* two-address add */
+         src = ref2oper(ins->r);
+         if (src.t == OIMM && src.imm < 0) /* ADD -imm -> SUB imm, for niceness */
+            Xsub(pcode, cls, dst, (src.imm = -src.imm, src));
+         else
+            Xadd(pcode, cls, dst, src);
+      } else if (isregref(ins->r) && ins->reg-1 == mkregoper(ins->r).reg) {
+         /* also two-address after swapping operands */
+         Xadd(pcode, cls, reg2oper(ins->reg-1), mkimmdatregoper(ins->l));
+      } else { /* three-address add (lea) */
+         struct oper mem = { OMEM, .base = NOBASE, .index = NOINDEX };
+         dst = reg2oper(ins->reg-1);
+         addmemoper(&mem, ref2oper(ins->l));
+         addmemoper(&mem, ref2oper(ins->r));
+         Xlea(pcode, cls, dst, mem);
+      }
+      break;
+   case Osub:
+      dst = mkregoper(ins->l);
+      if (kisflt(cls)) {
+         Xsubf(pcode, cls, dst, mkimmdatregoper(ins->r));
+      } else if (ins->reg-1 == dst.reg) { /* two-address */
+         Xsub(pcode, cls, dst, ref2oper(ins->r));
+      } else {
+         /* three-address sub of a constant: lea dst, [lhs - imm] */
+         assert(isintcon(ins->r));
+         Xlea(pcode, cls, reg2oper(ins->reg-1),
+            mkoper(OMEM, .base = mkregoper(ins->l).reg, .index = NOINDEX, .disp = -intconval(ins->r)));
+      }
+      break;
+   case Oshl: X = Xshl; goto ALU2;
+   case Osar: X = Xsar; goto ALU2;
+   case Oslr: X = Xshr; goto ALU2;
+   case Oand:
+      if (!ins->reg) {
+         /* result unused, only the flags matter: TEST is enough */
+         Xtest(pcode, cls, mkregoper(ins->l), mkimmdatregoper(ins->r));
+         break;
+      }
+      X = Xand;
+      goto ALU2;
+   case Oxor: X = Xxor; goto ALU2;
+   case Oior: X = Xior; goto ALU2;
+   ALU2: /* two-address binary ALU op: dst must be the allocated register */
+      dst = mkregoper(ins->l);
+      assert(ins->reg-1 == dst.reg);
+      X(pcode, cls, dst, mkimmdatregoper(ins->r));
+      break;
+   case Oneg: X1 = Xneg; goto ALU1;
+   case Onot: X1 = Xnot; goto ALU1;
+   ALU1: /* unary in-place ALU op */
+      dst = mkregoper(ins->l);
+      assert(ins->reg-1 == dst.reg);
+      X1(pcode, cls, dst);
+      break;
+   case Omul:
+      if (kisint(cls))
+         Ximul(pcode, cls, reg2oper(ins->reg-1), ref2oper(ins->l), ref2oper(ins->r));
+      else
+         Xmulf(pcode, cls, reg2oper(ins->reg-1), ref2oper(ins->r));
+      break;
+   case Odiv:
+      switch (cls) {
+      default: assert(0);
+      case KPTR:
+      case KI64: B(0x48); /* REX.W */
+         /* fallthrough: same opcode byte, REX.W turns CDQ into CQO */
+      case KI32: B(0x99); /* CDQ/CQO */
+         assert(mkregoper(ins->l).reg == RAX);
+         Xidiv(pcode, cls, mkdatregoper(ins->r));
+         break;
+      case KF32: case KF64:
+         Xdivf(pcode, cls, reg2oper(ins->reg-1), mkdatregoper(ins->r));
+         break;
+      }
+      break;
+   case Oudiv:
+      DS("\x31\xD2"); /* XOR EDX,EDX */
+      assert(mkregoper(ins->l).reg == RAX);
+      Xdiv(pcode, cls, mkdatregoper(ins->r));
+      break;
+   case Oequ: case Oneq:
+   case Olth: case Ogth: case Olte: case Ogte:
+   case Oulth: case Ougth: case Oulte: case Ougte:
+      dst = mkregoper(ins->l);
+      src = ref2oper(ins->r);
+      regzeroed = 0;
+      if (ins->reg && dst.reg != ins->reg-1 && (src.t != OREG || src.reg != ins->reg-1)) {
+         /* can zero output reg before test instruction (differs from both inputs) */
+         /* XXX this doesn't check if a source operand is an addr containing the register */
+         struct oper dst = reg2oper(ins->reg-1);
+         Xxor(pcode, KI32, dst, dst);
+         regzeroed = 1;
+      }
+      if (kisint(ins->cls) && ins->r.bits == ZEROREF.bits)
+         Xtest(pcode, cls, dst, dst);
+      else
+         Xcmp(pcode, cls, dst, src);
+      if (ins->reg) {
+         enum cc cc;
+         dst = reg2oper(ins->reg-1);
+         if (ins->r.bits != ZEROREF.bits) { /* CMP */
+            cc = (kisint(ins->cls) ? icmpop2cc : fcmpop2cc)[ins->op];
+         } else { /* TEST r,r (CMP r, 0) */
+            assert(kisint(ins->cls));
+            cc = icmpzero2cc[ins->op];
+         }
+         if (kisflt(ins->cls)) { /* handle float unordered result */
+            int unordres = ins->op == Oneq ? 1 : 0;
+            int rex = 0;
+            /* spl/bpl/sil/dil need an (empty) REX prefix to be addressable as bytes */
+            if (in_range(dst.reg, RSP, RDI)) rex = 0x40;
+            rex |= (dst.reg >> 3); /* REX.B */
+            int jpoff = 3 + (rex != 0); /* size of the SETcc we jump over */
+            if (regzeroed && unordres == 0) {
+               /* if cmp unordered, just jump over the SETcc; result reg was already zeroed */
+               B(0x7A), B(jpoff); /* JP <off> */
+            } else {
+               /* JNP .a
+                * MOV r8, 0/1
+                * JMP .b
+                * .a: SETcc r8
+                * .b: MOVZX r, r8
+                */
+               B(0x7B), B(jpoff+1); /* JNP <off> */
+               if (rex) B(rex | 0x40);
+               B(0xB0 + (dst.reg & 7)), B(unordres); /* MOV r8, 0/1 */
+               B(0xEB), B(jpoff); /* JMP <off> */
+            }
+         }
+         Xsetcc(pcode, cc, dst.reg);
+         if (!regzeroed)
+            Xmovzxb(pcode, KI32, dst, dst);
+      }
+      break;
+   case Omove:
+      dst = ref2oper(ins->l);
+      gencopy(pcode, cls, blk, curi, dst, ins->r);
+      break;
+   case Ocopy:
+      dst = reg2oper(ins->reg-1);
+      gencopy(pcode, cls, blk, curi, dst, ins->l);
+      break;
+   case Oswap:
+      if (kisint(cls))
+         Xxchg(pcode, cls, ref2oper(ins->l), mkregoper(ins->r));
+      else {
+         /* no float xchg: swap the two SSE registers with three xors */
+         struct oper l = mkregoper(ins->l), r = mkregoper(ins->r);
+         Xxor(pcode, cls, l, r);
+         Xxor(pcode, cls, r, l);
+         Xxor(pcode, cls, l, r);
+      }
+      break;
+   case Oxsave:
+      Xpush(pcode, mkregoper(ins->l).reg);
+      break;
+   case Oxrestore:
+      Xpop(pcode, mkregoper(ins->l).reg);
+      break;
+   case Ocall:
+      if (calltab.p[ins->r.i].vararg >= 0) {
+         struct call *call = &calltab.p[ins->r.i];
+         /* variadic functions need the caller to write num of args in sse regs to %al */
+         int n = 0;
+         for (int i = 0; i < call->narg; ++i)
+            if (!call->abiarg[i].isstk && call->abiarg[i].reg >= XMM0)
+               ++n;
+         if (!n) DS("\x31\xC0"); /* XOR EAX, EAX */
+         else B(0xB0), B(n); /* MOV AL, n */
+      }
+      Xcall(pcode, KPTR, ref2oper(ins->l));
+      break;
+   case Oxvaprologue:
+      Xvaprologue(pcode, fn, mkmemoper(ins->l));
+      break;
+   }
+}
+
+/* Emit a block's terminating jump(s): derive the condition code from the
+ * flag-setting instruction feeding the branch, add the extra JP needed by
+ * unordered float compares, and fall through to the adjacent block when
+ * possible instead of jumping. */
+static void
+emitbranch(uchar **pcode, struct block *blk)
+{
+   enum cc cc = ALWAYS;
+   assert(blk->s1);
+   if (blk->s2) {
+      /* conditional branch.. */
+      union ref arg = blk->jmp.arg[0];
+      struct block *unord = NULL;
+      assert(arg.t == RTMP);
+      struct instr *ins = &instrtab[arg.i];
+      if ((oiscmp(ins->op) || ins->op == Oand || ins->op == Osub)) {
+         if (ins->r.bits != ZEROREF.bits) {
+            /* for CMP instr */
+            cc = (kisint(ins->cls) ? icmpop2cc : fcmpop2cc)[ins->op];
+            /* an unordered float compare must branch to the != side */
+            unord = ins->op == Oneq ? blk->s1 : blk->s2;
+         } else {
+            assert(kisint(ins->cls));
+            /* for TEST instr, which modifies ZF and SF and sets CF = OF = 0 */
+            cc = icmpzero2cc[ins->op];
+         }
+      } else {
+         /* implicit by ZF */
+         cc = CCNZ;
+      }
+      if (kisflt(ins->cls)) {
+         /* handle float unordered result */
+         Xjcc(pcode, CCP, unord);
+      }
+      if (blk->s1 == blk->lnext) {
+         /* if s1 is next adjacent block, swap s1,s2 and flip condition to emit a
+          * single jump */
+         struct block *tmp = blk->s1;
+         blk->s1 = blk->s2;
+         blk->s2 = tmp;
+         cc ^= 1; /* x86 condition codes pair up: toggling bit 0 inverts the test */
+      }
+   }
+   /* make sure to fallthru if jumping to next adjacent block */
+   if (blk->s2 || blk->s1 != blk->lnext)
+      Xjcc(pcode, cc, blk->s1);
+   if (blk->s2 && blk->s2 != blk->lnext)
+      Xjcc(pcode, ALWAYS, blk->s2);
+}
+
+/* Push every used callee-saved register (rbx first, then r12..r15) in the
+ * prologue. Bumps *npush once per push; returns whether anything was saved. */
+static bool
+calleesave(int *npush, uchar **pcode, struct function *fn)
+{
+   static const uchar saved[] = {RBX, R12, R13, R14, R15};
+   int before = *npush;
+
+   for (uint i = 0; i < sizeof saved / sizeof saved[0]; ++i) {
+      if (!rstest(fn->regusage, saved[i]))
+         continue;
+      Xpush(pcode, saved[i]);
+      ++*npush;
+   }
+   return *npush != before;
+}
+
+/* Pop the callee-saved registers pushed by calleesave(), in reverse order
+ * (r15 down to r12, then rbx). */
+static void
+calleerestore(uchar **pcode, struct function *fn)
+{
+   static const uchar saved[] = {R15, R14, R13, R12, RBX};
+
+   for (uint i = 0; i < sizeof saved / sizeof saved[0]; ++i)
+      if (rstest(fn->regusage, saved[i]))
+         Xpop(pcode, saved[i]);
+}
+
+/* align code using NOPs */
+static void
+nops(uchar **pcode, int align)
+{
+   int rem;
+   /* pad with the longest applicable multi-byte NOP; a 0x66 prefix extends
+    * the following NOP encoding by one byte (intentional fallthroughs) */
+   while ((rem = (*pcode - objout.textbegin) & (align - 1)) != 0) {
+      switch (align - rem) {
+      case 15: case 14: case 13: case 12: case 11: case 10:
+      case 9: B(0x66); /* fallthrough */
+      case 8: DS("\x0f\x1f\x84\x00\x00\x00\x00\x00"); break;
+      case 7: DS("\x0f\x1f\x80\x00\x00\x00\x00"); break;
+      case 6: B(0x66); /* fallthrough */
+      case 5: DS("\x0f\x1f\x44\x00\x00"); break;
+      case 4: DS("\x0f\x1f\x40\x00"); break;
+      case 3: DS("\x0f\x1f\00"); break;
+      case 2: B(0x66); /* fallthrough */
+      case 1: B(0x90); break;
+      }
+   }
+}
+
+/* Emit one whole function: prologue (frame pointer, callee saves, stack
+ * allocation and 16-byte alignment), each block's instructions and
+ * terminator, shared epilogue, and resolution of intra-function forward
+ * jumps recorded in blkaddr. */
+static void
+emitbin(struct function *fn)
+{
+   struct block *blk;
+   uchar **pcode = &objout.code;
+   int npush = 0;
+   uint epilogueaddr = 0;
+   bool saverestore;
+
+   if (nblkaddr < fn->nblk) {
+      blkaddr = xrealloc(blkaddr, fn->nblk * sizeof *blkaddr);
+      nblkaddr = fn->nblk;
+   }
+   memset(blkaddr, 0, nblkaddr * sizeof *blkaddr);
+
+   nops(pcode, 16);
+   fnstart = *pcode;
+   curfnsym = fn->name;
+
+   /** prologue **/
+
+   /* only use frame pointer in non-leaf functions and functions that use the stack */
+   usebp = 0;
+   if (!fn->isleaf || fn->stksiz) {
+      usebp = 1;
+      /* push rbp; mov rbp, rsp */
+      DS("\x55\x48\x89\xE5");
+   }
+   saverestore = calleesave(&npush, pcode, fn);
+   if (usebp) rbpoff = -npush*8;
+
+   /* ensure stack is 16-byte aligned for function calls */
+   if (!fn->isleaf && ((fn->stksiz + npush*8) & 0xF) != 0) {
+      assert(usebp);
+      if ((rbpoff & 0xF) == 0) {
+         rbpoff -= 16;
+         fn->stksiz += 24;
+      } else {
+         rbpoff -= 8;
+         fn->stksiz += 8;
+      }
+   }
+
+   if (fn->stksiz != 0) {
+      /* sub rsp, <stack size> */
+      if (fn->stksiz < 128)
+         DS("\x48\x83\xEC"), B(fn->stksiz);
+      else if (fn->stksiz == 128)
+         DS("\x48\x83\xC4\x80"); /* add rsp, -128 (-128 fits imm8, +128 would not) */
+      else
+         DS("\x48\x81\xEC"), I32(fn->stksiz);
+   }
+
+   blk = fn->entry;
+   do {
+      struct blkaddr *bb = &blkaddr[blk->id];
+      uint bbaddr = *pcode - objout.textbegin;
+      assert(!bb->resolved);
+      /* patch the chain of forward jumps to this block: each patch site
+       * temporarily stores the offset of the next site in the chain */
+      while (bb->relreloc) {
+         uint next;
+         int disp = bbaddr - bb->relreloc - 4;
+
+         memcpy(&next, objout.textbegin + bb->relreloc, 4);
+         wr32le(objout.textbegin + bb->relreloc, disp);
+         bb->relreloc = next;
+      }
+      bb->resolved = 1;
+      bb->addr = bbaddr;
+
+      for (int i = 0; i < blk->ins.n; ++i) {
+         emitinstr(pcode, fn, blk, i, &instrtab[blk->ins.p[i]]);
+      }
+      if (blk->jmp.t == Jret) {
+         /* epilogue */
+         uint here = *pcode - fnstart;
+         if (epilogueaddr) {
+            /* reuse the epilogue already emitted for an earlier return */
+            int disp = epilogueaddr - (here + 2);
+            if ((uint)(disp + 128) < 256) {/* can use 1-byte displacement? */
+               B(0xEB), B(disp); /* JMP rel8 */
+            } else {
+               B(0xE9), I32(disp - 3); /* JMP rel32 (3 bytes longer than the rel8 form) */
+            }
+         } else {
+            if (fn->stksiz && (saverestore || !usebp))
+               Xadd(pcode, KPTR, mkoper(OREG, .reg = RSP), mkoper(OIMM, .imm = fn->stksiz));
+            if (saverestore) {
+               epilogueaddr = here;
+               calleerestore(pcode, fn);
+            }
+            if (usebp) B(0xC9); /* leave */
+            B(0xC3); /* ret */
+         }
+      } else if (blk->jmp.t == Jtrap) {
+         DS("\x0F\x0B"); /* UD2 */
+      } else emitbranch(pcode, blk);
+   } while ((blk = blk->lnext) != fn->entry);
+   objdeffunc(fn->name, fn->globl, fnstart - objout.textbegin, *pcode - fnstart);
+}
+
+/* Backend entry point for code emission: round the frame up to 8 bytes,
+ * reject absurdly large frames, and emit the function body. */
+void
+x86_64_emit(struct function *fn)
+{
+   fn->stksiz = alignup(fn->stksiz, 8);
+   if (fn->stksiz > 1<<24)
+      error(NULL, "'%s' stack frame too big", fn->name);
+   emitbin(fn);
+}
+
+/* vim:set ts=3 sw=3 expandtab: */
diff --git a/x86_64/isel.c b/x86_64/isel.c
new file mode 100644
index 0000000..5d373f3
--- /dev/null
+++ b/x86_64/isel.c
@@ -0,0 +1,660 @@
+#include "all.h"
+#include "../endian.h"
+
+/* x86 status-flag bits tracked during instruction selection so a later
+ * branch can reuse flags an earlier ALU instruction already produced */
+enum flag {
+   ZF = 1 << 0,    /* zero flag */
+   SF = 1 << 1,    /* sign flag */
+   CF = 1 << 2,    /* carry flag */
+   OF = 1 << 3,    /* overflow flag */
+   CLOBF = 1 << 4, /* clobbers the flags without a usable result */
+};
+
+/* flags modified by each integer op; ops with no entry are assumed to
+ * leave the flags untouched (they neither set nor clobber iflagsrc) */
+static const uchar opflags[NOPER] = {
+   [Oneg] = ZF|CLOBF,
+   [Oadd] = ZF|CLOBF,
+   [Osub] = ZF|CLOBF,
+   [Omul] = CLOBF,
+   [Odiv] = CLOBF,
+   [Oudiv] = CLOBF,
+   [Orem] = CLOBF,
+   [Ourem] = CLOBF,
+   [Oand] = ZF|CLOBF,
+   [Oior] = ZF|CLOBF,
+   [Oxor] = ZF|CLOBF,
+   [Oshl] = ZF|CLOBF,
+   [Osar] = ZF|CLOBF,
+   [Oslr] = ZF|CLOBF,
+   [Oequ] = ZF|CLOBF,
+   [Oneq] = ZF|CLOBF,
+   [Olth] = ZF|CLOBF,
+   [Ogth] = ZF|CLOBF,
+   [Olte] = ZF|CLOBF,
+   [Ogte] = ZF|CLOBF,
+   [Oulth] = ZF|CLOBF,
+   [Ougth] = ZF|CLOBF,
+   [Oulte] = ZF|CLOBF,
+   [Ougte] = ZF|CLOBF,
+   [Ocall] = CLOBF,
+};
+
+/* instrtab index of the most recent instruction in the current block whose
+ * ZF result is still live; -1 once the flags have been clobbered */
+static int iflagsrc = -1;
+
+/* Under -fpic a symbol-address constant cannot be used directly as an
+ * operand; replace such a ref with a temporary holding a copy of the
+ * address (loaded via the GOT by the emitter). */
+static void
+picfixsym(union ref *r, struct block *blk, int *curi)
+{
+   if (ccopt.pic && isaddrcon(*r,0))
+      *r = insertinstr(blk, (*curi)++, mkinstr(Ocopy, KPTR, .l = *r));
+}
+
+/* map alloca tmp -> stack frame displacement (0 if not alloca) */
+static ushort *stkslots;
+static uint nstkslots;
+
+/* ref is a temporary defined by an alloca that was folded into a fixed frame slot */
+#define isstkslot(r) ((r).t == RTMP && (r).i < nstkslots && stkslots[(r).i])
+
+/* Legalize one operand of 'ins' (ins == NULL for a jump argument):
+ * materialize constants x86 cannot encode inline (64-bit and float
+ * immediates become data refs or copies, symbol addresses become copies),
+ * normalize shift and division immediates, rewrite stack-slot temporaries
+ * as rbp-relative adds, and apply the PIC symbol fix. May insert copy
+ * instructions before *curi, advancing it. */
+static void
+fixarg(union ref *r, struct instr *ins, struct block *blk, int *curi)
+{
+   int sh;
+   enum op op = ins ? ins->op : 0;
+
+   if (r->t == RXCON) {
+      struct xcon *con = &conht[r->i];
+      if (in_range(op, Oshl, Oslr) && r == &ins->r) {
+         sh = con->i;
+         goto ShiftImm;
+      } else if (in_range(op, Oadd, Osub) && con->i == 2147483648 && r == &ins->r) {
+         /* add X, INT32MAX+1 -> sub X, INT32MIN */
+         ins->op = Oadd + (op == Oadd);
+         *r = mkintcon(KI32, -2147483648);
+      } else if (kisflt(con->cls) && con->i == 0) {
+         /* copy of positive float zero -> regular zero, that emit() will turn into xor x,x */
+         if (in_range(op, Ocopy, Omove) || op == Ophi)
+            *r = ZEROREF;
+         else
+            *r = insertinstr(blk, (*curi)++, mkinstr(Ocopy, con->cls, ZEROREF));
+      } else if (con->cls >= KI64) {
+         /* float immediates & 64bit immediates are loaded from memory */
+         uchar data[8];
+         uint ksiz = cls2siz[con->cls];
+         union type ctype;
+         /* can't use memory arg in rhs if lhs is memory */
+         bool docopy = &ins->l != r && (oisstore(ins->op) || ins->l.t == RADDR);
+         if (con->cls <= KPTR && in_range(ins->op, Ocopy, Omove)) /* in this case we can use movabs */
+            return;
+         else if (!docopy || con->cls >= KF32) {
+            if (con->cls != KF32) {
+               wr64le(data, con->i);
+               ctype = mktype(con->cls == KF64 ? TYDOUBLE : TYVLONG);
+            } else {
+               /* reinterpret the float's bits without aliasing UB */
+               union { float f; int i; } pun = { con->f };
+               wr32le(data, pun.i);
+               ctype = mktype(TYFLOAT);
+            }
+            *r = mkdatref(NULL, ctype, ksiz, /*align*/ksiz, data, ksiz, /*deref*/1);
+         }
+         if (docopy)
+            *r = insertinstr(blk, (*curi)++, mkinstr(Ocopy, con->cls, *r));
+      } else if (ins->op != Omove && con->issym && r == &ins->r) {
+         /* symbol address on the rhs: load it through a temporary */
+         *r = insertinstr(blk, (*curi)++, mkinstr(Ocopy, KPTR, mkaddr((struct addr){*r})));
+      } else if (in_range(op, Odiv, Ourem) && kisint(ins->cls))
+         goto DivImm;
+   } else if (r->t == RICON && in_range(op, Odiv, Ourem) && kisint(ins->cls) && r == &ins->r) {
+      DivImm: /* there is no division by immediate, must be copied to a register */
+      *r = insertinstr(blk, (*curi)++, mkinstr(Ocopy, ins->cls, *r));
+   } else if (r->t == RICON && in_range(op, Oshl, Oslr) && r == &ins->r) {
+      sh = r->i;
+      ShiftImm: /* shift immediate is always 8bit */
+      *r = mkref(RICON, sh & 255);
+   } else if (isstkslot(*r)) {
+      /* fixed alloca slot: rewrite as rbp - displacement */
+      struct instr adr = mkinstr(Oadd, KPTR, mkref(RREG, RBP), mkintcon(KI32, -stkslots[r->i]));
+      if (in_range(op, Ocopy, Omove))
+         *ins = adr;
+      else
+         *r = insertinstr(blk, (*curi)++, adr);
+   }
+   picfixsym(r, blk, curi);
+}
+
+/* constant of 32-bit class, encodable as an imm32 operand */
+#define isimm32(r) (iscon(r) && concls(r) == KI32)
+
+/* Lower a call: rewrite each Oarg into either a move to its ABI register
+ * or a store to its outgoing stack slot, bracket stack arguments with
+ * rsp adjustment instructions, legalize the callee operand, reserve AL
+ * for variadic calls, and copy results out of the ABI return registers. */
+static void
+selcall(struct function *fn, struct instr *ins, struct block *blk, int *curi)
+{
+   const struct call *call = &calltab.p[ins->r.i];
+   int iarg = *curi - 1;
+   enum irclass cls;
+   uint argstksiz = alignup(call->argstksiz, 16);
+
+   /* walk backwards pairing each abiarg with its Oarg instruction */
+   for (int i = call->narg - 1; i >= 0; --i) {
+      struct abiarg abi = call->abiarg[i];
+      struct instr *arg;
+      for (;; --iarg) {
+         assert(iarg >= 0 && i >= 0 && "arg?");
+         if ((arg = &instrtab[blk->ins.p[iarg]])->op == Oarg)
+            break;
+      }
+
+      if (!abi.isstk) {
+         assert(!abi.ty.isagg);
+         *arg = mkinstr(Omove, call->abiarg[i].ty.cls, mkref(RREG, abi.reg), arg->r);
+      } else {
+         union ref adr = mkaddr((struct addr){mkref(RREG, RSP), .disp = abi.stk});
+         int iargsave = iarg;
+         if (!abi.ty.isagg) { /* scalar arg in stack */
+            *arg = mkinstr(Ostore8+ilog2(cls2siz[abi.ty.cls]), 0, adr, arg->r);
+            if (isaddrcon(arg->r,1) || arg->r.t == RADDR)
+               arg->r = insertinstr(blk, iarg++, mkinstr(Ocopy, abi.ty.cls, arg->r));
+            else
+               /* NOTE(review): this legalizes the call's own ins->r (the
+                * calltab index), not the argument's value; it looks like it
+                * was meant to be fixarg(&arg->r, arg, blk, &iarg) - confirm */
+               fixarg(&ins->r, ins, blk, &iarg);
+         } else { /* aggregate arg in stack, callee stack frame destination address */
+            *arg = mkinstr(Ocopy, KPTR, adr);
+         }
+         *curi += iarg - iargsave;
+      }
+   }
+   if (call->argstksiz) {
+      /* sub rsp before the args, add rsp after the call */
+      union ref disp = mkref(RICON, argstksiz);
+      insertinstr(blk, iarg--, (struct instr){Osub, KPTR, .keep=1, .reg = RSP+1, .l=mkref(RREG,RSP), disp});
+      ++*curi;
+      insertinstr(blk, *curi+1, (struct instr){Oadd, KPTR, .keep=1, .reg = RSP+1, .l=mkref(RREG,RSP), disp});
+   }
+   if (isimm32(ins->l))
+      ins->l = mkaddr((struct addr){.base = ins->l});
+   else if (isintcon(ins->l))
+      ins->l = insertinstr(blk, (*curi)++, mkinstr(Ocopy, KPTR, ins->l));
+
+   if (call->vararg >= 0 && ins->l.t == RTMP) {
+      /* variadic calls write number of sse regs used to AL, so mark it as clobbered such that
+       * the function pointer of an indirect calls does not get allocated to RAX by regalloc */
+      insertinstr(blk, (*curi)++, mkinstr(Omove, KPTR, mkref(RREG, RAX), mkref(RREG, RAX)));
+   }
+   cls = ins->cls;
+   ins->cls = 0;
+   if (cls) {
+      /* duplicate to reuse same TMP ref */
+      insertinstr(blk, (*curi)++, *ins);
+      *ins = mkinstr(Ocopy, cls, mkref(RREG, call->abiret[0].reg));
+      /* a second return value, if any, appears as a nearby Ocall2r */
+      for (int i = 1; i <= 2; ++i) {
+         if (*curi + i >= blk->ins.n) break;
+         if (instrtab[blk->ins.p[*curi + i]].op == Ocall2r) {
+            ins = &instrtab[blk->ins.p[*curi += i]];
+            *ins = mkinstr(Ocopy, ins->cls, mkref(RREG, call->abiret[1].reg));
+            break;
+         }
+      }
+   }
+}
+
+/* Add 'disp' to the address displacement; fail (leaving the address
+ * untouched) when the sum no longer fits a signed 32-bit displacement. */
+static bool
+aimm(struct addr *addr, int disp)
+{
+   vlong sum = (vlong)addr->disp + disp;
+
+   if ((int)sum != sum)
+      return 0;
+   addr->disp = sum;
+   return 1;
+}
+
+/* Fold the integer constant 'r' into the displacement; fail when the
+ * result overflows a signed 32-bit displacement. */
+static bool
+acon(struct addr *addr, union ref r)
+{
+   vlong sum;
+
+   assert(isintcon(r));
+   sum = (vlong)addr->disp + intconval(r);
+   if ((int)sum != sum)
+      return 0;
+   addr->disp = sum;
+   return 1;
+}
+
+/* Try to install 'a << b' as the scaled-index component of the address
+ * (scale must be a constant 0..3 and the index slot must be free). When
+ * 'a' is the temporary 'add %x, imm', the immediate is factored out into
+ * the displacement as imm << b and %x becomes the index. */
+static bool
+ascale(struct addr *addr, union ref a, union ref b)
+{
+   if (b.t != RICON) return 0;
+   if (addr->index.bits) return 0;
+   if ((unsigned)b.i > 3) return 0;
+   if (a.t == RREG) {
+      Scaled:
+      addr->index = a;
+      addr->shift = b.i;
+      return 1;
+   } else if (a.t == RTMP) {
+      struct instr *ins = &instrtab[a.i];
+      /* factor out shifted immediate from 'shl {add %x, imm}, s' */
+      /* XXX maybe we shouldn't do this here because it should be done by a generic
+       * arithemetic optimization pass ? */
+      if (ins->op == Oadd && (ins->l.t == RREG || ins->l.t == RTMP) && isintcon(ins->r)) {
+         /* only the folded immediate is scaled; the displacement already
+          * accumulated in the address must NOT be multiplied by the scale
+          * (and multiplying avoids UB from shifting a negative value) */
+         vlong a = (vlong) addr->disp + intconval(ins->r) * (1 << b.i);
+         if (a != (int) a) return 0;
+         addr->disp = a;
+         addr->index = ins->l;
+         addr->shift = b.i;
+         return 1;
+      } else {
+         goto Scaled;
+      }
+   }
+   return 0;
+}
+
+/* Fold ref 'r' into the address being built: as displacement (constants,
+ * stack slots), scaled index (shl temporaries), or base/index register.
+ * Recurses through add/copy temporaries, marking instructions it fully
+ * absorbs with 'skip'. Returns 0 when 'r' cannot be absorbed (the address
+ * may then be partially updated). */
+static bool
+aadd(struct addr *addr, struct block *blk, int *curi, union ref r)
+{
+   if (isstkslot(r)) {
+      if (addr->base.bits || !aimm(addr, -stkslots[r.i])) goto Ref;
+      addr->base = mkref(RREG, RBP);
+   } else if (r.t == RTMP) {
+      struct instr *ins = &instrtab[r.i];
+      if (ins->op == Oadd) {
+         if (!aadd(addr, blk, curi, ins->l)) goto Ref;
+         if (!aadd(addr, blk, curi, ins->r)) goto Ref;
+         ins->skip = 1;
+      } else if (ins->op == Oshl) {
+         if (!ascale(addr, ins->l, ins->r)) goto Ref;
+         ins->skip = 1;
+      } else if (ins->op == Ocopy && ins->l.t == RADDR) {
+         /* merge a whole address ref; restore on partial failure */
+         struct addr save = *addr, *addr2 = &addrht[ins->l.i];
+         if ((!addr2->base.bits || aadd(addr, blk, curi, addr2->base))
+            && aimm(addr, addr2->disp)
+            && (!addr2->index.bits || ascale(addr, addr2->index, mkref(RICON, addr2->shift))))
+         {
+            ins->skip = 1;
+         } else {
+            *addr = save;
+            goto Ref;
+         }
+      } else if (ins->op == Ocopy) {
+         if (!aadd(addr, blk, curi, ins->l)) goto Ref;
+         ins->skip = 1;
+      } else goto Ref;
+   } else if (isnumcon(r)) {
+      return acon(addr, r);
+   } else if (isaddrcon(r,1)) {
+      if (!addr->base.bits && !isaddrcon(addr->index,1)) addr->base = r;
+      else return 0;
+   } else if (r.t == RREG) {
+      /* temporaries are single assignment, but register aren't, so they can't be *
+       * safely hoisted into an address value, unless they have global lifetime */
+      if (!rstest(mctarg->rglob, r.i)) return 0;
+      Ref: /* use 'r' itself as a base or index register operand */
+      if (isstkslot(r) && (addr->base.bits || addr->index.bits)) {
+         r = insertinstr(blk, (*curi)++, mkinstr(Oadd, KPTR, mkref(RREG, RBP), mkref(RICON, -stkslots[r.i])));
+      }
+      if (!addr->base.bits) addr->base = r;
+      else if (!addr->index.bits) addr->index = r;
+      else return 0;
+   } else return 0;
+   return 1;
+}
+
+/* Try to collapse the address arithmetic feeding *r into a single x86
+ * addressing mode (base + index*scale + disp), honouring PIC/PIE
+ * restrictions on RIP-relative operands. Returns whether *r is now (or
+ * already was) usable as an address; on failure *r is left untouched. */
+static bool
+fuseaddr(union ref *r, struct block *blk, int *curi)
+{
+   struct addr addr = { 0 };
+
+   if (isaddrcon(*r,1)) return 1;
+   if (r->t == RADDR) {
+      const struct addr *a0 = &addrht[r->i];
+      /* re-fuse an existing address ref. The index guard must test the
+       * SOURCE address (a0), matching the analogous code in aadd();
+       * guarding on the address being built would silently drop a0's
+       * index*scale component whenever the index slot is still free. */
+      if (aadd(&addr, blk, curi, a0->base)
+         && (!a0->index.bits || ascale(&addr, a0->index, mkref(RICON, a0->shift)))
+         && aadd(&addr, blk, curi, mkintcon(KPTR, a0->disp))) {
+         *r = mkaddr(addr);
+      }
+      return 1;
+   }
+   if (r->t != RTMP) return 0;
+   if (!aadd(&addr, blk, curi, *r)) return 0;
+
+   if (isaddrcon(addr.base,0) && (ccopt.pic || (ccopt.pie && addr.index.bits))) {
+      /* pic needs to load from GOT */
+      /* pie cannot encode RIP-relative address with index register */
+      /* first load symbol address into a temp register */
+      union ref temp = mkaddr((struct addr){.base = addr.base, .disp = ccopt.pic ? 0 : addr.disp});
+      addr.base = insertinstr(blk, (*curi)++, mkinstr(Ocopy, KPTR, .l = temp));
+      if (!ccopt.pic) addr.disp = 0;
+   }
+
+   if (!addr.base.bits) {
+      /* absolute int address in disp */
+      if (addr.index.bits) return 0;
+      addr.base = mkintcon(KPTR, addr.disp);
+      addr.disp = 0;
+   }
+
+   *r = mkaddr(addr);
+   return 1;
+}
+
+/* Is an add with this argument a candidate for folding into an effective
+ * address? True for symbol/data refs, stack slots, and temporaries
+ * produced by shl, add, or a copy of an address ref. */
+static bool
+addarg4addrp(union ref r)
+{
+   const struct instr *def;
+
+   switch (r.t) {
+   case RXCON:
+      return !conht[r.i].cls && !conht[r.i].deref; /* sym or dat ref */
+   case RTMP:
+      if (isstkslot(r))
+         return 1;
+      def = &instrtab[r.i];
+      if (def->op == Oadd || def->op == Oshl)
+         return 1;
+      return def->op == Ocopy && def->l.t == RADDR;
+   default:
+      return 0;
+   }
+}
+
+/* Legalize a load/store address operand: wrap small integer constants as
+ * absolute addresses, fix PIC symbol refs, fuse address arithmetic on
+ * suitable temporaries, and force anything else (except a register) into
+ * a register via a copy. */
+static void
+loadstoreaddr(struct block *blk, union ref *r, int *curi)
+{
+   if (isimm32(*r)) {
+      *r = mkaddr((struct addr){.base = *r});
+      return;
+   }
+   if (isaddrcon(*r, 0)) {
+      picfixsym(r, blk, curi);
+      return;
+   }
+   if (r->t == RTMP) {
+      if (addarg4addrp(*r))
+         fuseaddr(r, blk, curi);
+      return;
+   }
+   if (r->t != RREG)
+      *r = insertinstr(blk, (*curi)++, mkinstr(Ocopy, KPTR, *r));
+}
+
+/* Constant-fold an arithmetic instruction whose operands are all numeric
+ * constants, rewriting it into a copy of the folded value. Returns whether
+ * folding took place. */
+static bool
+arithfold(struct instr *ins)
+{
+   union ref folded;
+   bool ok;
+
+   if (!isnumcon(ins->l))
+      return 0;
+   if (ins->r.t && !isnumcon(ins->r))
+      return 0;
+   ok = ins->r.t
+      ? foldbinop(&folded, ins->op, ins->cls, ins->l, ins->r)
+      : foldunop(&folded, ins->op, ins->cls, ins->l);
+   assert(ok && "fold?");
+   *ins = mkinstr(Ocopy, insrescls(*ins), folded);
+   return 1;
+}
+
+/* Select/legalize one IR instruction for x86-64: fold constants, rewrite
+ * three-address ops into two-address x86 shapes (setting 'inplace'), pin
+ * shift/div operands to their architectural registers, fuse address
+ * arithmetic into lea, and turn fixed-size allocas into frame slots. */
+static void
+sel(struct function *fn, struct instr *ins, struct block *blk, int *curi)
+{
+   uint siz, alignlog2;
+   int t = ins - instrtab;
+   struct instr temp = {0};
+   enum op op = ins->op;
+
+   if (oisarith(ins->op) && arithfold(ins)) {
+      fixarg(&ins->l, ins, blk, curi);
+      return;
+   }
+
+   switch (op) {
+   default: assert(0);
+   case Onop: break;
+   case Oalloca1: case Oalloca2: case Oalloca4: case Oalloca8: case Oalloca16:
+      alignlog2 = ins->op - Oalloca1;
+      assert(ins->l.i > 0);
+      siz = ins->l.i << alignlog2;
+      fn->stksiz += siz;
+      fn->stksiz = alignup(fn->stksiz, 1 << alignlog2);
+      if (fn->stksiz > (1<<16)-1) error(NULL, "'%s' stack frame too big", fn->name);
+      stkslots[t] = fn->stksiz;
+      *ins = mkinstr(Onop,0,);
+      break;
+   case Oparam:
+      assert(ins->l.t == RICON && ins->l.i < fn->nabiarg);
+      if (!fn->abiarg[ins->l.i].isstk)
+         *ins = mkinstr(Ocopy, ins->cls, mkref(RREG, fn->abiarg[ins->l.i].reg));
+      else /* stack: 16 bytes skip the saved rbp and return address */
+         *ins = mkinstr(Oadd, KPTR, mkref(RREG, RBP), mkref(RICON, 16+fn->abiarg[ins->l.i].stk));
+      break;
+   case Oarg:
+      fixarg(&ins->r, ins, blk, curi);
+      break;
+   case Ocall:
+      selcall(fn, ins, blk, curi);
+      break;
+   case Ocall2r: assert(0); /* consumed by selcall */
+   case Ointrin:
+      break;
+   case Oshl: case Osar: case Oslr:
+      if (!iscon(ins->r)) {
+         /* shift amount register is always CL */
+         insertinstr(blk, (*curi)++, mkinstr(Omove, KI32, mkref(RREG, RCX), ins->r));
+         ins->r = mkref(RREG, RCX);
+      }
+      goto ALU;
+   case Oequ: case Oneq:
+   case Olth: case Ogth: case Olte: case Ogte:
+   case Oulth: case Ougth: case Oulte: case Ougte:
+      if (iscon(ins->l)) {
+         /* lth imm, x -> gth x, imm */
+         if (!in_range(ins->op, Oequ, Oneq))
+            ins->op = ((op - Olth) ^ 1) + Olth;
+         rswap(ins->l, ins->r);
+      }
+      if (ins->l.t != RTMP && ins->l.t != RREG)
+         ins->l = insertinstr(blk, (*curi)++, mkinstr(Ocopy, ins->cls, ins->l));
+      else
+         fixarg(&ins->l, ins, blk, curi);
+      fixarg(&ins->r, ins, blk, curi);
+      break;
+   case Odiv: case Oudiv: case Orem: case Ourem:
+      if (kisflt(ins->cls)) goto ALU;
+      /* TODO fuse div/rem pair */
+
+      /* (I)DIV dividend is always in RDX:RAX, output also in those regs */
+      insertinstr(blk, (*curi)++, mkinstr(Omove, ins->cls, mkref(RREG, RAX), ins->l));
+      /* mark RDX as clobbered. sign/zero-extending RAX into RDX is handled in emit() */
+      insertinstr(blk, (*curi)++, mkinstr(Omove, ins->cls, mkref(RREG, RDX), mkref(RREG, RDX)));
+      fixarg(&ins->r, ins, blk, curi); /* make sure rhs is memory or reg */
+      ins->l = mkref(RREG, RAX);
+      ins->keep = 1;
+      if (op == Orem) ins->op = Odiv;
+      else if (op == Ourem) ins->op = Oudiv;
+      insertinstr(blk, (*curi)++, *ins); /* duplicate ins to reuse tmp ref */
+      *ins = mkinstr(Ocopy, ins->cls, mkref(RREG, op < Orem ? RAX : RDX)); /* get output */
+      temp = mkinstr(Ocopy, ins->cls, mkref(RREG, op < Orem ? RDX : RAX)); /* clobber other reg*/
+      insertinstr(blk, ++(*curi), temp);
+      /* swap instrs so that clobber goes first */
+      t = blk->ins.p[*curi - 1];
+      blk->ins.p[*curi - 1] = blk->ins.p[*curi - 0];
+      blk->ins.p[*curi - 0] = t;
+      break;
+   case Osub:
+      if (isintcon(ins->l)) {
+         /* sub imm, x -> sub x, imm; neg x */
+         fixarg(&ins->l, ins, blk, curi);
+         ins->inplace = 1;
+         struct instr sub = *ins;
+         rswap(sub.l, sub.r);
+         ins->op = op = Oneg;
+         ins->l = insertinstr(blk, (*curi)++, sub);
+         ins->r = NOREF;
+         goto ALU;
+      } else if (kisint(ins->cls) && isintcon(ins->r)) {
+         /* sub x, imm -> add x, -imm so it can fuse into an address/lea */
+         ins->op = op = Oadd;
+         ins->r = mkintcon(concls(ins->r), -intconval(ins->r));
+      } else {
+         goto ALU;
+      }
+      /* fallthru */
+   case Oadd:
+      if (kisint(ins->cls)) {
+         if ((addarg4addrp(ins->l) || addarg4addrp(ins->r))) {
+            /* try to turn the whole add tree into an address copy (lea) */
+            temp.op = Ocopy;
+            temp.cls = ins->cls;
+            temp.l = mkref(RTMP, t);
+            if (fuseaddr(&temp.l, blk, curi)) {
+               *ins = temp;
+               break;
+            }
+         }
+      }
+      /* fallthru */
+   case Omul:
+   case Oand: case Oxor: case Oior:
+      /* commutative ops */
+      if (iscon(ins->l))
+         rswap(ins->l, ins->r);
+      goto ALU;
+   case Oneg:
+      if (kisflt(ins->cls)) {
+         /* flip sign bit with XORPS/D: every 32-bit lane of the mask must
+          * be the IEEE-754 sign bit 0x80000000 (two lanes were previously
+          * the decimal literal 80000000 due to a missing 0x prefix) */
+         static const uvlong sd[2] = {0x8000000000000000,0x8000000000000000};
+         static const uint sf[4] = {0x80000000,0x80000000,0x80000000,0x80000000};
+         ins->op = Oxor;
+         ins->r = mkdatref(NULL, mktype(ins->cls == KF32 ? TYFLOAT : TYDOUBLE), /*siz*/16,
+            /*align*/16, ins->cls == KF32 ? (void *)sf : sd, /*siz*/16, /*deref*/1);
+      }
+      /* fallthru */
+   case Onot:
+   ALU:
+      if (!(op == Oadd && kisint(ins->cls))) /* 3-address add is lea */
+      if (!(op == Omul && kisint(ins->cls) && isimm32(ins->r))) /* for (I)MUL r,r/m,imm */
+         ins->inplace = 1;
+      if (iscon(ins->l)) {
+         fixarg(&ins->l, ins, blk, curi);
+         ins->l = insertinstr(blk, (*curi)++, mkinstr(Ocopy, ins->cls, ins->l));
+      }
+      if (ins->r.bits)
+   case Omove: /* Omove enters here: only the rhs needs legalizing */
+         fixarg(&ins->r, ins, blk, curi);
+      if (op == Oadd && isaddrcon(ins->r,1)) /* no 3-address add if rhs is mem */
+         ins->inplace = 1;
+      break;
+   case Oloads8: case Oloadu8: case Oloads16: case Oloadu16:
+   case Oloads32: case Oloadu32: case Oloadi64: case Oloadf32: case Oloadf64:
+      loadstoreaddr(blk, &ins->l, curi);
+      break;
+   case Ostore8: case Ostore16: case Ostore32: case Ostore64:
+      loadstoreaddr(blk, &ins->l, curi);
+      if (isaddrcon(ins->r,1) || ins->r.t == RADDR)
+         ins->r = insertinstr(blk, (*curi)++, mkinstr(Ocopy, KPTR, ins->r));
+      else
+         fixarg(&ins->r, ins, blk, curi);
+      break;
+   case Ocvtu32f:
+      /* u32 -> float: zero-extend to 64 bits, then signed convert */
+      fixarg(&ins->l, ins, blk, curi);
+      ins->l = insertinstr(blk, (*curi)++, mkinstr(Oextu32, KI64, ins->l));
+      ins->op = Ocvts64f;
+      break;
+   case Ocvtf32u: case Ocvtf64u:
+      fixarg(&ins->l, ins, blk, curi);
+      if (ins->cls == KI32) {
+         /* float -> u32: signed convert to 64 bits, then truncate */
+         ins->l = insertinstr(blk, (*curi)++, mkinstr(ins->op == Ocvtf32u ? Ocvtf32s : Ocvtf64s, KI64, ins->l));
+         ins->op = Oextu32;
+      } else assert(!"nyi flt -> u64");
+      break;
+   case Ocvtf32f64: case Ocvtf64f32: case Ocvtf32s: case Ocvtf64s: case Ocvts32f: case Ocvts64f:
+   case Ocvtu64f:
+   case Oexts8: case Oextu8: case Oexts16: case Oextu16: case Oexts32: case Oextu32:
+      if (isnumcon(ins->l)) {
+         union ref it;
+         bool ok = foldunop(&it, ins->op, ins->cls, ins->l);
+         assert(ok);
+         ins->op = Ocopy;
+         ins->l = it;
+         break;
+      }
+      /* fallthru */
+   case Ocopy:
+      fixarg(&ins->l, ins, blk, curi);
+      break;
+   case Oxvaprologue:
+      fuseaddr(&ins->l, blk, curi);
+      assert(ins->l.t == RADDR);
+      /* !this must be the first instruction */
+      assert(*curi == 1);
+      assert(blk == fn->entry);
+      t = blk->ins.p[0];
+      blk->ins.p[0] = blk->ins.p[1];
+      blk->ins.p[1] = t;
+      break;
+   }
+}
+
+/* Legalize a block's terminator. Conditional branches either reuse flags
+ * still live from the argument's defining instruction (iflagsrc) or get an
+ * explicit compare against zero appended; returns move their values into
+ * the ABI return registers. */
+static void
+seljmp(struct function *fn, struct block *blk)
+{
+   if (blk->jmp.t == Jb && blk->jmp.arg[0].bits) {
+      int curi = blk->ins.n;
+      fixarg(&blk->jmp.arg[0], NULL, blk, &curi);
+      union ref c = blk->jmp.arg[0];
+      if (c.t != RTMP) {
+         /* materialize a non-temporary condition so it can set flags */
+         enum irclass cls = c.t == RICON ? KI32 : c.t == RXCON && conht[c.i].cls ? conht[c.i].cls : KPTR;
+         int curi = blk->ins.n;
+
+         c = insertinstr(blk, blk->ins.n, mkinstr(Ocopy, cls, c));
+         sel(fn, &instrtab[c.i], blk, &curi);
+      }
+      if (iflagsrc == c.i /* test cmp */
+         && (oiscmp(instrtab[c.i].op) || instrtab[c.i].op == Oand || instrtab[c.i].op == Osub)) {
+         /* the compare's flags are still live at the branch; keep it */
+         instrtab[c.i].keep = 1;
+      } else {
+         if (!(opflags[instrtab[c.i].op] & ZF) || blk->ins.n == 0 || c.i != blk->ins.p[blk->ins.n - 1]) {
+            /* no usable flags: append an explicit != 0 test */
+            struct instr *ins;
+            int curi = blk->ins.n;
+            blk->jmp.arg[0] = insertinstr(blk, blk->ins.n, mkinstr(Oneq, instrtab[c.i].cls, c, ZEROREF));
+            ins = &instrtab[blk->jmp.arg[0].i];
+            if (kisflt(ins->cls)) {
+               ins->r = insertinstr(blk, curi, mkinstr(Ocopy, ins->cls, ZEROREF));
+            }
+            ins->keep = 1;
+         } else if (instrtab[c.i].op == Oadd) {
+            /* prevent a 3-address add whose flag results are used from becoming a LEA */
+            instrtab[c.i].inplace = 1;
+         }
+      }
+   } else if (blk->jmp.t == Jret) {
+      if (blk->jmp.arg[0].bits) {
+         int curi;
+         union ref r = mkref(RREG, fn->abiret[0].reg);
+         struct instr *ins = &instrtab[insertinstr(blk, blk->ins.n, mkinstr(Omove, fn->abiret[0].ty.cls, r , blk->jmp.arg[0])).i];
+         curi = blk->ins.n;
+         fixarg(&ins->r, ins, blk, &curi);
+         blk->jmp.arg[0] = r;
+         if (blk->jmp.arg[1].bits) {
+            /* second return value goes in the second ABI return register */
+            r = mkref(RREG, fn->abiret[1].reg);
+            ins = &instrtab[insertinstr(blk, blk->ins.n, mkinstr(Omove, fn->abiret[1].ty.cls, r, blk->jmp.arg[1])).i];
+            curi = blk->ins.n;
+            fixarg(&ins->r, ins, blk, &curi);
+            blk->jmp.arg[1] = r;
+         }
+      }
+   }
+}
+
+/* Instruction selection pass entry point: legalize phi arguments in each
+ * predecessor, select every instruction, track which instruction last set
+ * a usable zero flag (iflagsrc) within the block, and legalize the
+ * terminators. Also assigns alloca temporaries to fixed frame slots. */
+void
+x86_64_isel(struct function *fn)
+{
+   extern int ninstr;
+   struct block *blk = fn->entry;
+
+   fn->stksiz = 0;
+   stkslots = xcalloc((nstkslots = ninstr) * sizeof *stkslots);
+   do {
+      int i;
+      for (i = 0; i < blk->phi.n; ++i) {
+         struct instr *ins = &instrtab[blk->phi.p[i]];
+         union ref *phi = phitab.p[ins->l.i];
+         /* phi arguments are legalized at the end of each predecessor */
+         for (int i = 0; i < blk->npred; ++i) {
+            int curi = blkpred(blk, i)->ins.n;
+            fixarg(&phi[i], ins, blkpred(blk, i), &curi);
+         }
+      }
+      iflagsrc = -1; /* flag tracking restarts at every block */
+      for (i = 0; i < blk->ins.n; ++i) {
+         struct instr *ins = &instrtab[blk->ins.p[i]];
+         sel(fn, ins, blk, &i);
+         if (ins->op < countof(opflags) && kisint(insrescls(*ins))) {
+            if (opflags[ins->op] & ZF) iflagsrc = ins - instrtab;
+            else if (opflags[ins->op] & CLOBF) iflagsrc = -1;
+         }
+      }
+      seljmp(fn, blk);
+   } while ((blk = blk->lnext) != fn->entry);
+   free(stkslots);
+
+   if (ccopt.dbg.i) {
+      bfmt(ccopt.dbgout, "<< After isel >>\n");
+      irdump(fn);
+   }
+
+   fn->prop = 0;
+}
+
+/* vim:set ts=3 sw=3 expandtab: */
diff --git a/x86_64/sysv.c b/x86_64/sysv.c
new file mode 100644
index 0000000..32cc9e5
--- /dev/null
+++ b/x86_64/sysv.c
@@ -0,0 +1,313 @@
+#include "all.h"
+
+static int classify(uchar cls[2], const struct typedata *td, uint off);
+
+static void
+clsscalar(uchar cls[2], uint off, union type ty)
+{
+ enum irclass k = type2cls[scalartypet(ty)];
+ uchar *fcls = &cls[off/8];
+ if (isflt(ty)) { /* SSE */
+ if (!*fcls || (*fcls == KF32 && k > *fcls))
+ *fcls = k;
+ } else { /* INTEGER */
+ assert(isint(ty) || ty.t == TYPTR);
+ if (cls2siz[*fcls] < cls2siz[k])
+ *fcls = k == KPTR ? KI64 : k;
+ }
+ if (off % 8 >= 4 && cls2siz[*fcls] < 8)
+ *fcls = kisint(*fcls) ? KI64 : KF64;
+}
+
+static int
+classifyarr(uchar cls[2], union type ty, uint off)
+{
+ union type chld = typechild(ty);
+ uint n = typearrlen(ty), siz = typesize(chld);
+ assert(n > 0);
+ for (uint i = 0; i < n; ++i) {
+ uint offx = off + i * siz;
+ if (isagg(chld)) {
+ if (!classify(cls, &typedata[chld.dat], offx))
+ return cls[0] = cls[1] = 0;
+ } else if (chld.t == TYARRAY) {
+ if (!classifyarr(cls, chld, offx))
+ return cls[0] = cls[1] = 0;
+ } else {
+ clsscalar(cls, offx, chld);
+ }
+ }
+ return !!cls[0] + !!cls[1];
+}
+
+static int
+classify(uchar cls[2], const struct typedata *td, uint off)
+{
+ uint siz = alignup(td->siz, 4);
+ if (siz > 16) /* MEMORY */
+ return 0;
+ for (int i = 0; i < td->nmemb; ++i) {
+ struct fielddata *fld = &td->fld[i].f;
+ uint align = typealign(fld->t);
+ if (alignup(fld->off, align) != fld->off) /* unaligned field -> MEMORY */
+ return cls[0] = cls[1] = 0;
+ if (isagg(fld->t)) {
+ if (!classify(cls, &typedata[fld->t.dat], off + fld->off))
+ return cls[0] = cls[1] = 0;
+ } else if (fld->t.t == TYARRAY) {
+ if (isincomplete(fld->t)) continue;
+ if (!classifyarr(cls, fld->t, off + fld->off))
+ return cls[0] = cls[1] = 0;
+ } else {
+ clsscalar(cls, fld->off + off, fld->t);
+ }
+ }
+ return !!cls[0] + !!cls[1];
+}
+
/* Assign a location to one function argument per the SysV x86_64 ABI.
 * r[0..1]:   output registers; for MEMORY, r[0] is the stack offset.
 * cls[0..1]: eightbyte classes of the argument.
 * *r2off:    set to 8 when a second eightbyte is used.
 * *ni, *nf:  running counts of integer / SSE registers consumed.
 * *ns:       running size of the stack-argument area.
 * Returns the number of registers used (1 or 2), or 0 for MEMORY. */
static int
abiarg(short r[2], uchar cls[2], uchar *r2off, int *ni, int *nf, int *ns, union irtype typ)
{
   static const uchar intregs[] = { RDI, RSI, RDX, RCX, R8, R9 };
   enum { NINT = countof(intregs), NFLT = 8 };
   int ret, ni_save, nf_save;

   if (!typ.isagg) {
      /* scalar: one register, or one 8-byte stack slot when the
       * corresponding register class is exhausted */
      if (kisflt(cls[0] = typ.cls) && *nf < NFLT) {
         r[0] = XMM0 + (*nf)++;
      } else if (kisint(cls[0]) && *ni < NINT) {
         r[0] = intregs[(*ni)++];
      } else {
         r[0] = *ns;
         *ns += 8;
         return 0; /* MEMORY */
      }
      return 1;
   }
   cls[0] = cls[1] = 0;
   ret = classify(cls, &typedata[typ.dat], 0);
   if (!ret) { /* MEMORY: too large or classify() refused it */
      r[0] = *ns;
      *ns = alignup(*ns + typedata[typ.dat].siz, 8);
      return 0;
   }
   assert(ret <= 2);
   /* either every eightbyte gets a register or none does: save the
    * counters so a partial assignment can be rolled back */
   ni_save = *ni, nf_save = *nf;
   *r2off = 8;
   for (int i = 0; i < ret; ++i) {
      assert(cls[i]);
      if (kisflt(cls[i]) && *nf < NFLT)
         r[i] = XMM0 + (*nf)++;
      else if (kisint(cls[i]) && *ni < NINT)
         r[i] = intregs[(*ni)++];
      else { /* MEMORY: ran out of registers mid-aggregate */
         *ni = ni_save, *nf = nf_save;
         r[0] = *ns;
         *ns = alignup(*ns + typedata[typ.dat].siz, 8);
         r[1] = -1;
         return cls[0] = cls[1] = 0;
      }
   }
   return ret;
}
+
+static int
+abiret(short r[2], uchar cls[2], uchar *r2off, int *ni, union irtype typ)
+{
+ int ret;
+
+ if (!typ.isagg) {
+ r[0] = kisflt(cls[0] = typ.cls) ? XMM0 : RAX;
+ return 1;
+ }
+
+ cls[0] = cls[1] = 0;
+ ret = classify(cls, &typedata[typ.dat], 0);
+ if (!ret) { /* MEMORY */
+ assert(*ni == 0);
+ r[0] = RAX; /* on return should contain result location address */
+ r[1] = RDI; /* register for caller-owned result location argument */
+ ++*ni;
+ return 0;
+ }
+ assert(ret <= 2);
+ *r2off = 8;
+ for (int i = 0, ni = 0, nf = 0; i < ret; ++i) {
+ assert(cls[i]);
+ if (kisflt(cls[i])) /* SSE (XMM0, XMM1) */
+ r[i] = XMM0 + nf++;
+ else if (kisint(cls[i])) /* INTEGER (RAX, RDX) */
+ r[i] = ni++ == 0 ? RAX : RDX;
+ else assert(0);
+ }
+ return ret;
+}
+
+/* Layout of va_list:
+ * struct {
+ * ( 0) unsigned int gp_offset;
+ * ( 4) unsigned int fp_offset;
+ * ( 8) void *overflow_arg_area;
+ * (16) void *reg_save_area;
+ * }
+ * Layout of register save area (align 16):
+ * reg off
+ * rdi 0
+ * rsi 8
+ * rdx 16
+ * rcx 24
+ * r8 32
+ * r9 40
+ * xmm0 48
+ * xmm1 64
+ * ...
 * In x86_64/emit, xvaprologue generates the code that saves these registers to a stack slot.
 * Only one xvaprologue is needed, no matter how many vastart instrs exist, and it has to be
 * at the beginning of the function (before IR generated by regalloc can touch any registers);
 * vastart can then initialize va_list.reg_save_area with a pointer to that slot.
+ */
+
/* Lower an Ovastart instruction.
 * Guarantees a single 192-byte, 16-aligned register save area exists,
 * filled by an Oxvaprologue placed right after the entry block's
 * leading alloca, then rewrites the Ovastart into the stores that
 * initialize the four va_list fields (see the layout comment above).
 * *curi is advanced past the inserted instructions. */
static void
vastart(struct function *fn, struct block *blk, int *curi)
{
   union ref rsave; /* register save area */
   int gpr0 = 0, fpr0 = 0, stk0 = 0;
   struct instr *ins = &instrtab[blk->ins.p[*curi]];
   union ref ap = ins->l, src, dst;
   assert(ins->op == Ovastart);
   /* add xvaprologue if not there yet, which must be the first
    * real instruction in the function (following alloca) */
   if (fn->entry->ins.n > 1 && instrtab[fn->entry->ins.p[1]].op == Oxvaprologue) {
      rsave = mkref(RTMP, fn->entry->ins.p[0]); /* alloca instruction */
      assert(instrtab[rsave.i].op == Oalloca16);
   } else {
      rsave = insertinstr(fn->entry, 0, mkalloca(192, 16));
      insertinstr(fn->entry, 1, mkinstr(Oxvaprologue, 0, rsave, .keep=1));
   }
   /* find first unnamed gpr and fpr by counting the named register
    * arguments; stk0 ends up past the last named stack argument */
   for (int i = 0; i < fn->nabiarg; ++i) {
      struct abiarg abi = fn->abiarg[i];
      if (!abi.isstk){
         if (abi.reg < XMM0) ++gpr0;
         else ++fpr0;
      } else {
         stk0 = abi.stk+8; /* NOTE(review): assumes abi.stk is the slot offset of an 8-byte stack arg -- confirm */
      }
   }
   /* set ap->reg_save_area (field at offset 16) */
   *ins = mkinstr(Oadd, KPTR, ap, mkref(RICON, 16));
   dst = mkref(RTMP, ins - instrtab);
   int i = *curi + 1;
   insertinstr(blk, i++, mkinstr(Ostore64, 0, dst, rsave));
   /* set ap->overflow_arg_area: RBP+16+stk0, i.e. past the saved frame
    * pointer, return address, and the named stack arguments */
   src = insertinstr(blk, i++, mkinstr(Oadd, KPTR, mkref(RREG, RBP), mkref(RICON, 16+stk0)));
   dst = insertinstr(blk, i++, mkinstr(Oadd, KPTR, ap, mkref(RICON, 8)));
   insertinstr(blk, i++, mkinstr(Ostore64, 0, dst, src));
   /* set ap->gp_offset: 8 bytes per named gp register */
   insertinstr(blk, i++, mkinstr(Ostore32, 0, ap, mkref(RICON, gpr0*8)));
   /* set ap->fp_offset: fp slots start at 6*8 and are 16 bytes each */
   dst = insertinstr(blk, i++, mkinstr(Oadd, KPTR, ap, mkref(RICON, 4)));
   insertinstr(blk, i++, mkinstr(Ostore32, 0, dst, mkref(RICON, 6*8 + fpr0*16)));
   *curi = i-1; /* resume after the last inserted store */
}
+
/* Lower an Ovaarg instruction into the SysV va_arg algorithm:
 * compare the va_list's gp/fp offset against the register save area
 * limit, branch to fetch the value address either from the register
 * save area or from the overflow area, and merge both addresses with
 * a phi in a new merge block.  Splits the current block and clears
 * FNUSE since the CFG/use info is invalidated.
 * Only the single-eightbyte case (ret == 1) is implemented. */
static void
vaarg(struct function *fn, struct block *blk, int *curi)
{
   short r[2];
   uchar cls[2];
   union ref tmp;
   int ni = 0, nf = 0, ns = 0;
   uchar r2off;
   int var = blk->ins.p[*curi];
   union ref ap = instrtab[var].l;
   union irtype ty = ref2type(instrtab[var].r);

   assert(instrtab[var].op == Ovaarg);
   /* the slot is replaced by a nop; instr 'var' is rebuilt in the
    * merge block below so existing uses keep referring to it */
   blk->ins.p[*curi] = newinstr(blk, (struct instr){Onop});

   /* classify the requested type as if it were a register argument */
   int ret = abiarg(r, cls, &r2off, &ni, &nf, &ns, ty);

   if (ret == 2) assert(!"nyi");
   else if (ret == 1) {
      struct block *merge;
      union ref phi, phiargs[2];
      /* int: l->gp_offset < 48 - num_gp * 8 */
      /* sse: l->fp_offset < 304 - num_gp * 16 (why 304? ... 176) */
      /* ni != 0 means the value came from the INTEGER class (abiarg
       * consumed a gp reg); otherwise it is SSE and fp_offset (at +4)
       * is tested instead of gp_offset (at +0) */
      tmp = ni ? ap : insertinstr(blk, (*curi)++, mkinstr(Oadd, KPTR, ap, mkref(RICON, 4)));
      tmp = insertinstr(blk, (*curi)++, mkinstr(Oloadu32, KI32, tmp));
      tmp = insertinstr(blk, (*curi)++, mkinstr(Oulte, KI32, tmp, mkref(RICON, ni ? 48 - ni*8 : 176 - nf*16)));
      merge = blksplitafter(fn, blk, *curi);
      blk->jmp.t = 0;
      useblk(fn, blk);
      putcondbranch(fn, tmp, newblk(fn), newblk(fn));
      useblk(fn, blk->s1);
      {
         /* phi0: &l->reg_save_area[l->gp/fp_offset] */
         union ref sav = addinstr(fn, mkinstr(Oloadi64, KPTR, irbinop(fn, Oadd, KPTR, ap, mkref(RICON, 16))));
         union ref roff = addinstr(fn, mkinstr(Oloadu32, KI32, irbinop(fn, Oadd, KPTR, ap, mkref(RICON, ni ? 0 : 4))));
         phiargs[0] = irbinop(fn, Oadd, KPTR, sav, roff);
         /* l->gp/fp_offset += num_gp/fp * 8(16) */
         roff = irbinop(fn, Oadd, KI32, roff, mkref(RICON, ni ? ni * 8 : nf * 16));
         addinstr(fn, mkinstr(Ostore32, 0, irbinop(fn, Oadd, KPTR, ap, mkref(RICON, ni ? 0 : 4)), roff));
         assert(merge->npred == 1);
         blkpred(merge, 0) = blk->s1;
         blk->s1->jmp.t = Jb;
         blk->s1->s1 = merge;
      }
      useblk(fn, blk->s2);
      {
         /* phi1: l->overflow_arg_area */
         union ref adr = irbinop(fn, Oadd, KPTR, ap, mkref(RICON, 8));
         union ref ovf = addinstr(fn, mkinstr(Oloadi64, KPTR, adr));
         /* align no-op */

         phiargs[1] = ovf;
         /* update l->overflow_arg_area += size */
         int siz = 8;
         addinstr(fn, mkinstr(Ostore64, 0, adr, irbinop(fn, Oadd, KPTR, ovf, mkref(RICON, siz))));
         putbranch(fn, merge);
      }
      assert(merge->npred == 2);
      /* make room at the front of the merge block so instr 'var'
       * becomes its first instruction */
      vpush(&merge->ins, 0);
      memmove(merge->ins.p+1, merge->ins.p, (merge->ins.n-1)*sizeof *merge->ins.p);
      merge->ins.p[0] = var;
      phi = insertphi(merge, KPTR);
      memcpy(phitab.p[instrtab[phi.i].l.i], phiargs, sizeof phiargs);
      if (!ty.isagg) {
         /* scalar: 'var' becomes a load through the merged address */
         instrtab[var] = mkinstr(cls[0] == KI32 ? Oloads32 : Oloadi64, cls[0], phi);
      } else {
         /* small aggregate: copy the eightbyte into a fresh slot */
         instrtab[var] = mkalloca(8, 8);
         tmp = insertinstr(merge, 1, mkinstr(Oloadi64, KI64, phi));
         insertinstr(merge, 2, mkinstr(Ostore64, 0, mkref(RTMP, var), tmp));
      }
      fn->prop &= ~FNUSE;
   } else {
      assert(!"nyi");
   }
}
+
/* Register name strings indexed by enum reg; generated from LIST_REGS
 * so the table stays in sync with the enum.  Width 6 fits the longest
 * name ("XMM15") plus the NUL terminator. */
static const char x86_64_rnames[][6] = {
#define R(r) #r,
   LIST_REGS(R)
#undef R
};
+
/* Machine-target descriptor for x86_64 System V, consumed by the
 * target-independent parts of the compiler. */
const struct mctarg t_x86_64_sysv = {
   .gpr0 = RAX, .ngpr = R15 - RAX + 1,      /* 16 integer registers, RAX..R15 */
   .bpr = RBP,                              /* base/frame pointer */
   .gprscratch = R11, .fprscratch = XMM15,  /* NOTE(review): presumably late scratch regs for emit -- confirm against emit.c */
   .fpr0 = XMM0, .nfpr = XMM15 - XMM0 + 1,  /* 16 SSE registers */
   .rcallee = 1<<RBX | 1<<R12 | 1<<R13 | 1<<R14 | 1<<R15, /* callee-saved set (RBP covered by rglob) */
   .rglob = 1<<RSP | 1<<RBP,                /* globally reserved, never allocated */
   .rnames = x86_64_rnames,
   .objkind = OBJELF,
   .abiret = abiret,
   .abiarg = abiarg,
   .vastart = vastart,
   .vaarg = vaarg,
   .isel = x86_64_isel,
   .emit = x86_64_emit
};
+
+/* vim:set ts=3 sw=3 expandtab: */