From 2e0160015d756702a799be95db237b7a91bae400 Mon Sep 17 00:00:00 2001
From: lemon <lsof@mailbox.org>
Date: Thu, 1 Sep 2022 12:09:22 +0200
Subject: check for duplicate fields faster using bloom filter

---
 src/parse.cff | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 54 insertions(+), 6 deletions(-)

(limited to 'src')
diff --git a/src/parse.cff b/src/parse.cff
index 14f7082..7afec96 100644
--- a/src/parse.cff
+++ b/src/parse.cff
@@ -725,6 +725,46 @@ fn isdecltokt(t TokT) bool {
    return #f;
 }
 
+// A simple bloom filter for quick duplicates definitions checking for aggregates
+// At least for reasonably sized ones..
+struct BloomFilt {
+   bits [4]u32,
+
+   fn _hash(x u32) u32 {
+      // https://nullprogram.com/blog/2018/07/31/
+      x ^= x >> 16;
+      x *= 0x7feb352dU;
+      x ^= x >> 15;
+      x *= 0x846ca68bU;
+      x ^= x >> 16;
+      return x;
+   }
+
+   def NHASH = 4;
+   def NBITS = 8 * sizeof BloomFilt;
+
+   fn add(this *BloomFilt, s *const u8) void {
+      let h u32 = as(uptrint)s;
+      for let i = 0; i < NHASH; ++i {
+         h = _hash(h);
+         let idx = h % NBITS;
+         this.bits[idx/32] |= 1u32 << (idx%32);
+      }
+   }
+
+   fn matches(this *const BloomFilt, s *const u8) bool {
+      let h u32 = as(uptrint)s;
+      for let i = 0; i < NHASH; ++i {
+         h = _hash(h);
+         let idx = h % NBITS;
+         if this.bits[idx/32] & (1u64 << (idx%32)) == 0 {
+            return #f;
+         }
+      }
+      return #t;
+   }
+}
+
 fn parseagg(P *Parser, loc Loc, kind AggKind, name *const u8, retdecl **Decl) *const Type {
    let tok Tok #?;
    static id int = 0;
@@ -758,6 +798,7 @@ fn parseagg(P *Parser, loc Loc, kind AggKind, name *const u8, retdecl **Decl) *c
       lexexpect(P, '{');
       let size = 0z, align = 1z;
       let flds Vec<AggField> = {};
+      let bfilt BloomFilt = {};
       let havedecls = #f;
       while !lexmatch(P, &tok, '}') {
          if isdecltokt(lexpeek(P).t) {
@@ -780,11 +821,14 @@ fn parseagg(P *Parser, loc Loc, kind AggKind, name *const u8, retdecl **Decl) *c
             size = kind == :Struct ? off + type.size
                                    : MAX(size, type.size);
          }
-         vec_each (fld, _, flds) {
-            if fld.name == name {
-               err(P, tok.loc, "duplicate field %qT", &tok);
+         if bfilt->matches(name) {
+            vec_each (fld, _, flds) {
+               if fld.name == name {
+                  err(P, tok.loc, "duplicate field %qT", &tok);
+               }
             }
          }
+         bfilt->add(name);
          flds->push({ name, type, off });
 
          if !lexmatch(P, #null, ',') {
@@ -877,6 +921,7 @@ fn parsebitfield(P *Parser, name *const u8) *const Type {
    lexexpect(P, '{');
    static id int = {};
    bitf.id = id++;
+   let bfilt BloomFilt = {};
    let flds Vec<BitFField> = {};
    let iota = 0u16;
    while !lexmatch(P, &tok, '}') {
@@ -884,11 +929,14 @@ fn parsebitfield(P *Parser, name *const u8) *const Type {
       let off = 0u, size = 0u;
       let ty = bitf.intty;
 
-      vec_each (fld, _, flds) {
-         if fld.name == name {
-            err(P, tok.loc, "duplicate field %qT", &tok);
+      if bfilt->matches(name) {
+         vec_each (fld, _, flds) {
+            if fld.name == name {
+               err(P, tok.loc, "duplicate field %qT", &tok);
+            }
          }
       }
+      bfilt->add(name);
 
       if lexmatch(P, &tok, '(') {
          // `(offset, size)`
-- 
cgit v1.2.3