From 640d5e95560d3c9099a58e09a93831197b863a2b Mon Sep 17 00:00:00 2001 From: lemon Date: Sat, 20 Dec 2025 12:11:16 +0100 Subject: optimize lexer a bit more --- c/lex.c | 146 ++++++++++++++++++++++++++++++++++++++-------------------------- c/lex.h | 4 +- 2 files changed, 88 insertions(+), 62 deletions(-) (limited to 'c') diff --git a/c/lex.c b/c/lex.c index 4ae24da..7bd6ecc 100644 --- a/c/lex.c +++ b/c/lex.c @@ -7,7 +7,7 @@ static void fillchrbuf(struct lexer *lx) { const uchar *p = lx->dat + lx->idx; - int i = lx->chrbuf0, idx = lx->idx, c; + int i = lx->chrbuf0, idx = lx->idx; int rem = countof(lx->chrbuf) - i; assert(rem >= 0); if (rem > 0) { @@ -20,15 +20,21 @@ fillchrbuf(struct lexer *lx) i = rem; for (; i < countof(lx->chrbuf); ++i) { - int n; - /* skip backslash-newline */ - while ((n = 2, (p[0] == '\\') & (p[1] == '\n')) || (ccopt.trigraph && !memcmp(p, "\?\?/\n", n = 4))) { - idx += n; - p += n; + uchar c; + /* skip backslash-newline* */ + for (;;) { + if (p[0] == '\\' && p[1] == '\n') { + idx += 2; + p += 2; + } else if (ccopt.trigraph && !memcmp(p, "\?\?/\n", 4)) { + idx += 4; + p += 4; + } else break; addfileline(lx->fileid, idx); } + if (idx >= lx->ndat) { - c = TKEOF; + c = 0; } else if (ccopt.trigraph && ((p[0] == '?') & (p[1] == '?'))) { switch (p[2]) { case '=': c = '#'; break; @@ -65,7 +71,7 @@ next(struct lexer *lx) fillchrbuf(lx); lx->chridx = lx->chridxbuf[lx->chrbuf0]; c = lx->chrbuf[lx->chrbuf0]; - lx->eof = c == TKEOF; + lx->eof = lx->chridx >= lx->ndat; ++lx->chrbuf0; return c; } @@ -385,51 +391,66 @@ isppnum(char prev, char c) /* special mode to parse header path for #include */ static bool lexingheadername = 0; +enum { MAXLITLEN = 256 }; /* maximum length of num literals and identifiers */ static int lex0(struct lexer *lx, struct token *tk) { - int idx, c, q; + int idx,q; +Begin: + idx = lx->chridx; + if (lx->chrbuf0+4 >= countof(lx->chrbuf)) + fillchrbuf(lx); + lx->chridx = lx->chridxbuf[lx->chrbuf0]; + uchar *p = &lx->chrbuf[lx->chrbuf0++], + c = p[0]; + switch (c) { #define RET(t_) do { tk->t = (t_); goto End; } while (0) +#define TK2(c2,t) if (p[1] == c2) { \ + lx->chridx = lx->chridxbuf[lx->chrbuf0]; \ + ++lx->chrbuf0; \ + RET(t); \ + } +#define TK3(c2,c3,t) if (p[1] == c2 && p[2] == c3) { \ + lx->chridx = lx->chridxbuf[++lx->chrbuf0]; \ + ++lx->chrbuf0; \ + RET(t); \ + } -Begin: - idx = lx->chridx; - switch (c = next(lx)) { case ' ': case '\t': case '\f': case '\v': case '\r': goto Begin; break; case '(': case ')': case ',': case ':': case ';': case '?': case '[': case ']': case '{': case '}': case '~': case '$': - case '@': case '`': case '\\': case TKEOF: case '\n': + case '@': case '`': case '\\': case '\n': RET(c); case '!': - if (match(lx, '=')) RET(TKNEQ); + TK2('=', TKNEQ); RET(c); case '#': - if (match(lx, '#')) RET(TKPPCAT); + TK2('#', TKPPCAT); RET(c); case '+': - if (match(lx, '+')) RET(TKINC); - if (match(lx, '=')) RET(TKSETADD); + TK2('+', TKINC); + TK2('=', TKSETADD); RET(c); case '-': - if (match(lx, '-')) RET(TKDEC); - if (match(lx, '=')) RET(TKSETSUB); - if (match(lx, '>')) RET(TKARROW); + TK2('-', TKDEC); + TK2('=', TKSETSUB); + TK2('>', TKARROW); RET(c); case '*': - if (match(lx, '=')) RET(TKSETMUL); + TK2('=', TKSETMUL); RET(c); case '/': - if (match(lx, '=')) RET(TKSETDIV); + TK2('=', TKSETDIV); if (match(lx, '/')) { /* // comment */ while (!lx->eof && peek(lx, 0) != '\n') next(lx); goto Begin; - } - if (match(lx, '*')) { + } else if (match(lx, '*')) { /* comment */ while (!(peek(lx, 0) == '*' && peek(lx, 1) == '/')) { if (next(lx) == TKEOF) { @@ -442,13 +463,13 @@ Begin: } RET(c); case '%': - if (match(lx, '=')) RET(TKSETREM); + TK2('=', TKSETREM); RET(c); case '^': - if (match(lx, '=')) RET(TKSETXOR); + TK2('=', TKSETXOR); RET(c); case '=': - if (match(lx, '=')) RET(TKEQU); + TK2('=', TKEQU); RET(c); case '<': if (lexingheadername) { @@ -456,20 +477,22 @@ Begin: lexingheadername = 0; goto End; } - if (match(lx, '=')) RET(TKLTE); - if (match(lx, '<')) RET(match(lx, '=') ? TKSETSHL : TKSHL); + TK2('=', TKLTE); + TK3('<','=', TKSETSHL) + TK2('<', TKSHL); RET(c); case '>': - if (match(lx, '=')) RET(TKGTE); - if (match(lx, '>')) RET(match(lx, '=') ? TKSETSHR : TKSHR); + TK2('=', TKGTE); + TK3('>','=', TKSETSHR) + TK2('>', TKSHR); RET(c); case '&': - if (match(lx, '&')) RET(TKLOGAND); - if (match(lx, '=')) RET(TKSETAND); + TK2('&', TKLOGAND); + TK2('=', TKSETAND); RET(c); case '|': - if (match(lx, '|')) RET(TKLOGIOR); - if (match(lx, '=')) RET(TKSETIOR); + TK2('|', TKLOGIOR); + TK2('=', TKSETIOR); RET(c); case '"': if (lexingheadername) { @@ -482,12 +505,8 @@ Begin: } goto End; case '.': - if (peek(lx, 0) == '.' && peek(lx, 1) == '.') { - next(lx), next(lx); - RET(TKDOTS); - } else if (aisdigit(peek(lx, 0))) { - goto Numlit; - } + TK3('.','.',TKDOTS) + if (aisdigit(p[1])) goto Numlit; RET(c); case 'L': if (match(lx, (q = '\'')) || match(lx, (q = '"'))) { @@ -498,39 +517,46 @@ Begin: /* fallthru */ default: if (aisdigit(c)) Numlit: { - char tmp[200]; - int n = 0; - tmp[n++] = c; - while (isppnum(tmp[n-1], peek(lx, 0))) { - assert(n < countof(tmp)-1 && "too big"); - tmp[n++] = next(lx); + --lx->chrbuf0; + if (lx->chrbuf0 + MAXLITLEN >= countof(lx->chrbuf)) + fillchrbuf(lx); + uchar *p = &lx->chrbuf[lx->chrbuf0]; + int n = 1; + for (; isppnum(p[n-1], p[n]); ++n) { + if (n >= MAXLITLEN) TooLong: { + lx->chridx = lx->chridxbuf[lx->chrbuf0+n-1]; + fatal(&(struct span) {{ idx, lx->chridx - idx, lx->fileid }}, + "token is too long"); + } } - tmp[n] = 0; tk->len = n; + lx->chridx = lx->chridxbuf[(lx->chrbuf0 += n) - 1]; if (n == lx->chridx - idx) { tk->litlit = 1; tk->s = (char *)&lx->dat[idx]; } else { tk->litlit = 0; - tk->s = alloccopy(lx->tmparena, tmp, n, 1); + tk->s = alloccopy(lx->tmparena, p, n, 1); } RET(TKNUMLIT); } else if (c == '_' || aisalpha(c)) { - char tmp[200]; - int n = 0; - tmp[n++] = c; - while (!aissep(c = peek(lx, 0))) { - assert(n < countof(tmp)-1 && "too big"); - tmp[n++] = next(lx); + --lx->chrbuf0; + if (lx->chrbuf0 + MAXLITLEN >= countof(lx->chrbuf)) + fillchrbuf(lx); + uchar *p = &lx->chrbuf[lx->chrbuf0]; + int n = 1; + for (; !aissep(p[n]); ++n) { + if (n >= MAXLITLEN) goto TooLong; } - tmp[n] = 0; - tk->t = TKIDENT; tk->blue = 0; tk->len = n; - tk->name = intern(tmp); - goto End; + tk->name = intern_((char *)p, n); + lx->chridx = lx->chridxbuf[(lx->chrbuf0 += n) - 1]; + RET(TKIDENT); } + /* fallthru */ case 0: if (lx->idx >= lx->ndat) RET(TKEOF); +#undef TK2 } fatal(&(struct span) {{ idx, lx->chridx - idx, lx->fileid }}, "unexpected character %'c at %d (%d)", c, idx, lx->idx); @@ -1320,7 +1346,7 @@ Unary: } if (!prec) { /* not a sub expr */ if (elex(lx, &tk) != '\n' && tk.t != TKEOF) { - error(&tk.span, "garbage after preprocessor expression"); + error(&tk.span, "extra tokens after preprocessor expression"); ppskipline(lx); } } diff --git a/c/lex.h b/c/lex.h index a850445..21519f7 100644 --- a/c/lex.h +++ b/c/lex.h @@ -95,8 +95,6 @@ struct lexer { const uchar *dat; uint ndat; uint idx, chridx; - short chrbuf[1<<10]; - uint chridxbuf[1<<10]; ushort chrbuf0; struct macrostack *macstk; struct token peektok; @@ -105,6 +103,8 @@ struct lexer { bool firstdirective; ushort nppcnd0; internstr inclguard; + uchar chrbuf[1<<10]; + uint chridxbuf[1<<10]; }; enum initlexer { -- cgit v1.2.3