2 files changed, 61 insertions, 18 deletions
diff --git a/ir/builder.c b/ir/builder.c
index 36f955c..8fb626c 100644
--- a/ir/builder.c
+++ b/ir/builder.c
@@ -16,7 +16,8 @@ irbinop(struct function *fn, enum op op, enum irclass k, union ref l, union ref
       if (l.bits == ZEROREF.bits) return r; /* 0 + x ==> x */
       /* fallthru */
    case Osub:
-      if (r.bits == ZEROREF.bits) return l; /* x +/- 0 ==> x */
+      if (r.bits == ZEROREF.bits) return l; /* x +/- 0 ==> x (the add case falls through to here) */
+      if (op == Osub && kisint(k) && l.bits == r.bits) return ZEROREF; /* x - x ==> 0; must not fire for x + x */
       break;
    case Omul:
       if (isnumcon(l)) rswap(l, r); /* put const in rhs */
@@ -28,7 +29,7 @@ irbinop(struct function *fn, enum op op, enum irclass k, union ref l, union ref
       if (isintcon(r) && ispo2(iv = intconval(r))) {
          /* x * 2^y ==> x << y */
          op = Oshl;
-         r = mkintcon(k, ilog2(iv));
+         r = mkref(RICON, ilog2(iv)); /* shift count y as an RICON ref instead of mkintcon(k, y) */
       }
       break;
    case Odiv:
@@ -38,7 +39,7 @@ irbinop(struct function *fn, enum op op, enum irclass k, union ref l, union ref
       if (isintcon(r) && ispo2(iv = intconval(r))) {
          /* x / 2^y ==> x >> y */
          op = Oslr;
-         r = mkintcon(k, ilog2(iv));
+         r = mkref(RICON, ilog2(iv)); /* shift count y as an RICON ref instead of mkintcon(k, y) */
       }
       break;
    case Orem:
@@ -98,7 +99,7 @@ irbinop(struct function *fn, enum op op, enum irclass k, union ref l, union ref
    default:
       assert(!"binop?");
    }
-   return addinstr(fn, mkinstr(op, k, l, r));
+   return fn ? addinstr(fn, mkinstr(op, k, l, r)) : NOREF; /* fn == NULL: emit nothing, yield NOREF */
 }
 
 /* implements f32/f64 -> u64 conversion */
@@ -175,7 +176,7 @@ irunop(struct function *fn, enum op op, enum irclass k, union ref a)
    case Ocvts64f:
       break;
    case Ocvtf32u: case Ocvtf64u:
-      if (k == KI64) {
+      if (k == KI64 && fn) { /* no fn to emit into: skip the lowering (presumably it adds instrs - confirm) */
          /* XXX some architectures like arm64 do have these instructions natively
           * this should probably be handled in a separate "arithmetic-lowering" pass, earlier than isel
           */
@@ -184,14 +185,15 @@ irunop(struct function *fn, enum op op, enum irclass k, union ref a)
       break;
    case Ocvtu64f:
       /* XXX see above */
-      return cvtu64f(fn, k, a);
+      if (fn)
+         return cvtu64f(fn, k, a); /* fallthru - no fn, nothing to emit */
    case Oexts8: case Oextu8: case Oexts16: case Oextu16:
    case Oexts32: case Oextu32:
    case Ocopy:
       break;
    default: assert(!"unop?");
    }
-   return addinstr(fn, mkinstr(op, k, a));
+   return fn ? addinstr(fn, mkinstr(op, k, a)) : NOREF; /* fn == NULL: emit nothing, yield NOREF */
 }
 
 int allocinstr(void);
diff --git a/ir/simpl.c b/ir/simpl.c
index 7e6cfd0..0dc32ed 100644
--- a/ir/simpl.c
+++ b/ir/simpl.c
@@ -1,22 +1,62 @@
 #include "ir.h"
 
 static int
-ins(struct function *fn, struct instr *ins, struct block *blk, int *curi)
+mulk(struct instr *ins, struct block *blk, int *curi)
 {
-   int narg = opnarg[ins->op];
-   if ((oisarith(ins->op) || oiscmp(ins->op)) && isnumcon(ins->l) && (narg == 1 || isnumcon(ins->r))) {
-      bool ok;
-      if (narg == 1) ok = foldunop(&ins->l, ins->op, ins->cls, ins->l);
-      else ok = foldbinop(&ins->l, ins->op, ins->cls, ins->l, ins->r);
-      if (!ok) return 0; /* could be div/0 */
-      ins->op = Ocopy;
-      ins->cls = insrescls(*ins);
-      ins->r = NOREF;
+   vlong iv = intconval(ins->r);
+   enum irclass cls = ins->cls;
+   assert(iv != 0 && iv != 1 && "trivial mul not handled by irbinop() ?");
+   if (iv < 0) return 0; /* negative multiplier: none of the forms below apply, keep the mul */
+   /* Rewrites x * iv in place; returns 1 if changed, else 0. This could be generalized
+    * to any sequence of shifts and adds/subtracts, but whether that's worth it depends
+    * on the number of them and the microarchitecture..  clang seems to stop after two
+    * shifts. Estimate instr cost to decide? For now just handle the po2 (+/- 1) case */
+   if (ispo2(iv)) {
+      /* x * 2^y ==> x << y */
+      ins->op = Oshl;
+      ins->r = mkref(RICON, ilog2(iv));
+      return 1;
+   } else if (ispo2(iv-1)) {
+      /* x * 5 ==> (x << 2) + x */
+      ins->op = Oadd;
+      ins->r = ins->l;
+      ins->l = insertinstr(blk, (*curi)++, mkinstr(Oshl, cls, ins->l, mkref(RICON, ilog2(iv-1)))); /* *curi is bumped past the new shift */
+      return 1;
+   } else if (ispo2(iv+1)) {
+      /* x * 7 ==> (x << 3) - x */
+      ins->op = Osub;
+      ins->r = ins->l;
+      ins->l = insertinstr(blk, (*curi)++, mkinstr(Oshl, cls, ins->l, mkref(RICON, ilog2(iv+1)))); /* *curi is bumped past the new shift */
       return 1;
    }
    return 0;
 }
 
+static int
+ins(struct instr *ins, struct block *blk, int *curi) /* simplify one instr in place; returns 1 if it changed, else 0 */
+{
+   int narg = opnarg[ins->op];
+   if (oisarith(ins->op)) {
+      union ref r = narg == 1 ? irunop(NULL, ins->op, ins->cls, ins->l) /* fn == NULL: query only, nothing is emitted */
+                              : irbinop(NULL, ins->op, ins->cls, ins->l, ins->r);
+      if (r.bits) { /* got a ref back (assumes NOREF.bits == 0 - confirm): turn the instr into a copy of it */
+         ins->op = Ocopy;
+         ins->cls = insrescls(*ins);
+         ins->l = r;
+         ins->r = NOREF;
+         return 1;
+      }
+   }
+   enum irclass k = ins->cls;
+   switch (ins->op) {
+   case Omul:
+      if (kisflt(k)) break; /* mulk() is only tried for non-float classes */
+      if (isnumcon(ins->l)) rswap(ins->l, ins->r); /* put const in rhs */
+      if (isintcon(ins->r)) return mulk(ins, blk, curi);
+   }
+   return 0;
+}
+
 static void
 jmpfind(struct block **final, struct block **pblk)
 {
@@ -73,12 +113,13 @@ simpl(struct function *fn)
    struct block **jmpfinal = allocz(fn->passarena, fn->nblk * sizeof *jmpfinal, 0);
    struct block *blk = fn->entry;
 
+   fn->curblk = NULL; /* no current block during this pass (TODO confirm why it must be cleared) */
    do {
       for (int i = 0; i < blk->phi.n; ++i) {
 
       }
       for (int i = 0; i < blk->ins.n; ++i) {
-         inschange += ins(fn, &instrtab[blk->ins.p[i]], blk, &i);
+         inschange += ins(&instrtab[blk->ins.p[i]], blk, &i); /* ins() may insert an instr at i and advance i past it */
       }
 
       /* merge blocks: