From 104330a399f405b83328525bb2be55b360109b16 Mon Sep 17 00:00:00 2001 From: lemon Date: Sun, 28 May 2023 19:29:10 +0200 Subject: improve struct token --- io.c | 16 +++-- lex.c | 213 ++++++++++++++++++++++++++++++++++++++-------------------------- parse.c | 22 ++++--- parse.h | 75 +++++++++++++---------- test.c | 8 ++- type.c | 6 +- 6 files changed, 200 insertions(+), 140 deletions(-) diff --git a/io.c b/io.c index d51e28d..cc9f672 100644 --- a/io.c +++ b/io.c @@ -437,17 +437,21 @@ vbfmt(struct wbuf *out, const char *fmt, va_list ap) n += bwriteS(buf, "\?\?\?"); break; case TKNUMLIT: - s = (const char *)(getfile(tok->span.sl.file)->p + tok->span.sl.off); if (quote) n += bputc(buf, '`'); - for (i = tok->span.sl.len; i--; ++s) - if (*s != '\\' && *s != '\n') n += bputc(buf, *s); + n += bfmt(buf, "%S", tok->s, tok->len); if (quote) n += bputc(buf, '\''); break; + case TKCHRLIT: + n += bputc(buf, '\''); + for (int i = 0; i < tok->len; ++i) + n += putquoted(buf, tok->s[i], '\'', i < tok->len - 1 ? tok->s[i+1] : -1); + n += bputc(buf, '\''); + break; case TKSTRLIT: - n += bfmt(buf, "%'S", tok->s.p, tok->s.n-1); + n += bfmt(buf, "%'S", tok->s, tok->len); break; case TKIDENT: - n += bfmt(buf, "`%s'", tok->ident); + n += bfmt(buf, "`%s'", tok->s); break; case TKEOF: n += bwriteS(buf, ""); @@ -478,7 +482,7 @@ vbfmt(struct wbuf *out, const char *fmt, va_list ap) default: if (quote) n += bputc(buf, '`'); if (in_range(tok->t, TKWBEGIN_, TKWEND_)) { - n += bfmt(buf, "%s", tok->ident); + n += bfmt(buf, "%s", tok->s); } else if (aisprint(tok->t)) { n += bputc(buf, tok->t); } else { diff --git a/lex.c b/lex.c index 7ffe8e0..f82f255 100644 --- a/lex.c +++ b/lex.c @@ -43,13 +43,13 @@ identkeyword(struct token *tk, const char *s, int len) else if (cmp > 0) h = i - 1; else if (kwtab[i].cstd <= ccopt.cstd) { tk->t = kwtab[i].t; - tk->ident = kwtab[i].s; + tk->s = kwtab[i].s; return; } else break; } ident: tk->t = TKIDENT; - tk->ident = intern(s); + tk->s = intern(s); } static int @@ -141,61 +141,90 @@ aissep(int c) { return 0; } -static void -strtonum(struct token *tk, const char *s) + +enum typetag +parsenumlit(uvlong *outi, double *outf, const struct token *tk, bool ispp) { - extern uvlong strtoull(const char *, char **, int); - extern double strtod(const char *, char **); - char *sx; /*suffix*/ - - tk->ty = TYXXX; - if (strchr(s, '.')) { /* float literal */ - Float: - tk->f = strtod(s, &sx); - if (sx == s) - return; - if (!*sx) - tk->ty = TYDOUBLE; - else if ((sx[0]|0x20) == 'f' && !sx[1]) { - tk->ty = TYFLOAT; - tk->f = (float) tk->f; - } else tk->ty = TYXXX; + if (tk->t == TKCHRLIT) { + uvlong n = 0; + for (int i = 0; i < tk->len; ++i) + n = n << 8 | (uchar)tk->s[i]; + if (outi) *outi = n; + return TYINT; + } else if (memchr(tk->s, '.', tk->len)) { + extern double strtod(const char *, char **); + double f; + char buf[80], *suffix; + Float: /* float literal */ + assert(tk->len < sizeof buf - 1 && "numlit too big"); + memcpy(buf, tk->s, tk->len); + buf[tk->len] = 0; + f = strtod(buf, &suffix); + if (suffix == buf) + return 0; + if (!*suffix) { + if (outf) *outf = f; + return TYDOUBLE; + } else if ((suffix[0]|0x20) == 'f' && !suffix[1]) { + if (outf) *outf = f; + return TYFLOAT; + } + return 0; } else { /* int literal */ static uvlong max4typ[TYUVLONG-TYINT+1]; - enum typetag t; - bool u = 0, dec = s[0] != '0'; - bool c99 = ccopt.cstd >= STDC99; - - tk->u = strtoull(s, &sx, 0); - if (sx == s) - return; + uvlong n = 0; + int base = 10, nsx; + bool dec, u = 0, c99 = ccopt.cstd >= STDC99; + enum typetag ty = 0; + const char *sx; /*suffix*/ + char c; if (!max4typ[0]) - for (t = TYINT; t <= TYUVLONG; ++t) - max4typ[t-TYINT] = ((1ull << (8*targ_primsizes[t]-1))-1) << isunsignedt(t) | 1; + for (ty = TYINT; ty <= TYUVLONG; ++ty) + max4typ[ty-TYINT] = ((1ull << (8*targ_primsizes[ty]-1))-1) << isunsignedt(ty) | 1; + + sx = tk->s; + if (tk->len > 2 && sx[0] == '0') { + if ((sx[1]|32) == 'x') sx += 2, base = 16; /* 0x.. */ + else if ((sx[1]|32) == 'b') sx += 2, base = 2; /* 0b.. */ + else base = 8; /* 0.. */ + } + for (; sx < tk->s + tk->len; ++sx) { + if (base < 16) { + if (!in_range(c = *sx, '0', '0'+base-1)) break; + n = n * base + c - '0'; + } else { + n *= base; + if (in_range(c = *sx, '0', '9')) n += c - '0'; + else if (in_range(c|32, 'a', 'f')) n += 0xa + (c|32) - 'a'; + else break; + } + } + dec = base == 10; + nsx = tk->len - (sx - tk->s); - if (!*sx) /* '' */ {} - else if ((sx[0]|0x20) == 'u') { + if (nsx == 0) /* '' */ {} + else if ((sx[0]|32) == 'u') { u = 1; - if (!sx[1]) /* 'u' */ {} - else if ((sx[1]|0x20) == 'l') { - if (!sx[2]) /* 'ul' */ goto L; - if (c99 && sx[1] == sx[2] && !sx[3]) /* 'ull' */ goto LL; - return; - } else return; - } else if ((sx[0]|0x20) == 'l') { - if (!sx[1]) /* 'l' */ goto L; - if ((sx[1]|0x20) == 'u' && !sx[2]) /* 'lu' */ { u=1; goto L; } + if (nsx == 1) /* 'u' */ {} + else if ((sx[1]|32) == 'l') { + if (nsx == 2) /* 'ul' */ goto L; + if (c99 && sx[1] == sx[2] && nsx == 3) /* 'ull' */ goto LL; + return 0; + } else return 0; + } else if ((sx[0]|32) == 'l') { + if (nsx == 1) /* 'l' */ goto L; + if ((sx[1]|32) == 'u' && nsx == 2) /* 'lu' */ { u=1; goto L; } if (c99 && sx[1] == sx[0]) { - if (!sx[2]) /* 'll' */ goto LL; - if ((sx[2]|0x20) == 'u' && !sx[3]) /* 'llu' */ { u=1; goto LL; } + if (nsx == 2) /* 'll' */ goto LL; + if ((sx[2]|32) == 'u' && nsx == 3) /* 'llu' */ { u=1; goto LL; } } - return; - } else if ((sx[0]|0x20) == 'e' || (sx[0]|0x20) == 'p') + return 0; + } else if ((sx[0]|32) == 'e' || (sx[0]|32) == 'p') goto Float; - else return; + else return 0; -#define I(T) if (tk->u <= max4typ[T - TYINT]) { t = T; goto Ok; } +#define I(T) if (n <= max4typ[T - TYINT]) { ty = T; goto Ok; } I(TYINT) if (u || !dec) I(TYUINT) L: @@ -206,12 +235,19 @@ strtonum(struct token *tk, const char *s) I(TYVLONG) if (u || !dec) I(TYUVLONG) } + if (ispp) { ty = TYUVLONG; goto Ok; } #undef I /* too big */ - return; + if (outi) *outi = n; + return 0; Ok: - if (u && issignedt(t)) ++t; /* make unsigned */ - tk->ty = t; + if (u && issignedt(ty)) ++ty; /* make unsigned */ + if (outi) *outi = n; + if (ispp) { + if (u) return TYUVLONG; + else if (n <= max4typ[TYVLONG-TYINT]) return TYVLONG; + } + return ty; } } @@ -222,7 +258,8 @@ readstrchrlit(struct parser *pr, struct token *tk, char delim) uchar tmp[80]; vec_of(uchar) b = VINIT(tmp, sizeof tmp); struct span span = {0}; - uint n, idx = pr->chridx; + uint n, beginoff, idx; + beginoff = idx = pr->chridx; while ((c = next(pr)) != delim) { if (c == '\n' || c == TKEOF) { @@ -269,7 +306,7 @@ readstrchrlit(struct parser *pr, struct token *tk, char delim) } if (n > 0377) { span.sl.len = pr->chridx - span.sl.off; - error(&span, "hex escape sequence out of range"); + error(&span, "octal escape sequence out of range"); } c = n; break; @@ -283,10 +320,17 @@ readstrchrlit(struct parser *pr, struct token *tk, char delim) idx = pr->chridx;; } if (delim == '"') { - vpush(&b, 0); tk->t = TKSTRLIT; - tk->s.p = alloc(&pr->exarena, b.n, 1); - memcpy(tk->s.p, b.p, tk->s.n = b.n-1); + tk->len = b.n; + if (pr->chridx - beginoff == tk->len + 1) { + tk->litlit = 1; + tk->s = (char *)&pr->dat[beginoff]; + } else { + tk->litlit = 0; + vpush(&b, 0); + tk->s = alloc(&pr->exarena, b.n, 1); + memcpy((char *)tk->s, b.p, b.n); + } } else { if (b.n == 0) { span.sl = (struct span0) { idx, pr->chridx - idx, pr->fileid }; @@ -295,11 +339,16 @@ readstrchrlit(struct parser *pr, struct token *tk, char delim) span.sl = (struct span0) { idx, pr->chridx - idx, pr->fileid }; error(&span, "multicharacter literal too long"); } - tk->t = TKNUMLIT; - tk->ty = TYINT; - tk->u = 0; - for (i = 0; i < b.n; ++i) - tk->u = tk->u<<8 | b.p[i]; + tk->t = TKCHRLIT; + tk->len = b.n; + if (pr->chridx - beginoff == tk->len + 1) { + tk->litlit = 1; + tk->s = (char *)&pr->dat[beginoff]; + } else { + tk->litlit = 0; + tk->s = alloc(&pr->exarena, tk->len, 1); + memcpy((char *)tk->s, b.p, tk->len); + } } vfree(&b); } @@ -414,7 +463,12 @@ Begin: tmp[n++] = next(pr); } tmp[n] = 0; - strtonum(tk, tmp); + tk->len = n; + if (n == pr->chridx - idx) tk->s = (char *)&pr->dat[idx]; + else { + tk->s = alloc(&pr->exarena, n, 1); + memcpy((char *)tk->s, tmp, n); + } RET(TKNUMLIT); } else if (c == '_' || aisalpha(c)) { char tmp[70]; @@ -435,9 +489,7 @@ End: tk->span.sl.file = pr->fileid; tk->span.sl.off = idx; tk->span.sl.len = pr->chridx - idx; - tk->span.ex.file = pr->fileid; - tk->span.ex.off = idx; - tk->span.ex.len = pr->chridx - idx; + tk->span.ex = tk->span.sl; return tk->t; #undef RET } @@ -478,23 +530,12 @@ freemac(struct macro *mac) static bool tokequ(const struct token *a, const struct token *b) { - char tmpbuf[100]; - struct wbuf tmp = MEMBUF(tmpbuf, sizeof tmpbuf); if (a->t != b->t) return 0; - if (a->t == TKNUMLIT) { - const char *s1 = tmp.buf, *s2; - int n1, n2; - - if (a->ty != b->ty) return 0; - n1 = bfmt(&tmp, "%tk", a); - s2 = tmp.buf + tmp.len; - n2 = bfmt(&tmp, "%tk", b); - if (tmp.err) return 0; - return n1 == n2 && !memcmp(s1, s2, n1); + if (a->t == TKNUMLIT || a->t == TKSTRLIT || a->t == TKCHRLIT) { + if (a->len != b->len) return 0; + return !memcmp(a->s, b->s, a->len); } else if (a->t == TKIDENT) { - return a->ident == b->ident; - } else if (a->t == TKSTRLIT) { - return a->s.n == b->s.n && !memcmp(a->s.p, b->s.p, a->s.n); + return a->s == b->s; } return 1; } @@ -577,7 +618,7 @@ ppdefine(struct parser *pr) ppskipline(pr); return; } - mac.name = tk0.ident; + mac.name = tk0.s; mac.span = tk0.span.sl; if (peek(pr, 0) == '(') { @@ -641,6 +682,7 @@ expr(struct parser *pr, bool *pu, int prec) { vlong x, y; struct token tk; + enum typetag ty; int opprec; char unops[16]; int nunop = 0; @@ -664,15 +706,16 @@ Unary: } break; case TKNUMLIT: - if (!tk.ty) { + case TKCHRLIT: + ty = parsenumlit((uvlong *)&x, NULL, &tk, 1); + if (!ty) { error(&tk.span, "bad number literal"); goto Err; - } else if (isfltt(tk.ty)) { + } else if (isfltt(ty)) { error(&tk.span, "float literal in preprocessor expresion"); goto Err; } - x = tk.i; - xu = isunsignedt(tk.ty); + xu = isunsignedt(ty); break; default: if (in_range(tk.t, TKWBEGIN_, TKWEND_)) { @@ -850,7 +893,7 @@ tryexpand(struct parser *pr, const struct token *tk) struct macrostack *l; int macidx, i; - if (!isppident(*tk) || !(mac = findmac(tk->ident))) + if (!isppident(*tk) || !(mac = findmac(tk->s))) return 0; if (!inimstk) { @@ -937,7 +980,7 @@ findppcmd(const struct token *tk) "warning", }; int l = 0, h = arraylength(tab) - 1, i, cmp; - const char *s = tk->ident; + const char *s = tk->s; if (tk->t == TKWif) return PPIF; if (tk->t == TKWelse) return PPELSE; diff --git a/parse.c b/parse.c index b39e299..0de9346 100644 --- a/parse.c +++ b/parse.c @@ -36,7 +36,7 @@ isdecltok(struct parser *pr) case TKWdouble: return 1; case TKIDENT: - return (decl = finddecl(pr, tk.ident)) && decl->scls == SCTYPEDEF; + return (decl = finddecl(pr, tk.s)) && decl->scls == SCTYPEDEF; } return 0; } @@ -665,16 +665,18 @@ Unary: /* base exprs */ case TKNUMLIT: - if (!tk.ty) - error(&tk.span, "invalid number literal %'tk", &tk); - ex = mkexpr(ENUMLIT, tk.span, mktype(tk.ty ? tk.ty : TYINT), .u = tk.u); + case TKCHRLIT: + ex = mkexpr(ENUMLIT, tk.span, mktype(0), ); + if (!(ty.t = parsenumlit(&ex.u, &ex.f, &tk, 0))) + error(&tk.span, "bad number literal %'tk", &tk); + ex.ty.t = ty.t ? ty.t : TYINT; break; case TKSTRLIT: - ++tk.s.n; - ex = mkexpr(ESTRLIT, tk.span, mkarrtype(mktype(TYCHAR), 0, tk.s.n), .s = tk.s); + ex = mkexpr(ESTRLIT, tk.span, + mkarrtype(mktype(TYCHAR), 0, tk.len+1), .s = (uchar *)tk.s); break; case TKIDENT: - decl = finddecl(pr, tk.ident); + decl = finddecl(pr, tk.s); if (!decl) { error(&tk.span, "undeclared identifier %'tk", &tk); ex = mkexpr(ESYM, tk.span, mktype(TYINT), .sym = NULL); @@ -1605,7 +1607,7 @@ tagtype(struct parser *pr, enum toktag kind) const char *tag = NULL; if (match(pr, &tk, TKIDENT)) - tag = tk.ident; + tag = tk.s; span = tk.span; if (!match(pr, NULL, '{')) { if (!tag) { @@ -1727,7 +1729,7 @@ declspec(struct declstate *st, struct parser *pr) joinspan(&span.ex, tk.span.ex); goto End; case TKIDENT: - if (!st->base.t && !arith && (decl = finddecl(pr, tk.ident)) + if (!st->base.t && !arith && (decl = finddecl(pr, tk.s)) && decl->scls == SCTYPEDEF) { st->base = decl->ty; break; @@ -1897,7 +1899,7 @@ decltypes(struct parser *pr, struct decllist *list, const char **name, struct sp if (!name) error(&tk.span, "unexpected identifier in type name"); else { - *name = tk.ident; + *name = tk.s; *span = tk.span; } lex(pr, &tk); diff --git a/parse.h b/parse.h index 520a76e..2a9b076 100644 --- a/parse.h +++ b/parse.h @@ -17,30 +17,31 @@ enum toktag { /* single-character tokens' tag value is the character itself */ TKEOF = -1, TKXXX, TKNUMLIT, + TKCHRLIT, TKSTRLIT, - TKEQU = '@', - TKNEQ, - TKLTE, - TKGTE, - TKSHR, - TKSHL, - TKINC, - TKDEC, - TKDOTS, - TKARROW, - TKPPCAT, - TKLOGAND, - TKLOGIOR, - TKSETADD, - TKSETSUB, - TKSETMUL, - TKSETDIV, - TKSETREM, - TKSETIOR, - TKSETXOR, - TKSETAND, - TKSETSHL, - TKSETSHR, + TKEQU = '@', /* == */ + TKNEQ, /* != */ + TKLTE, /* <= */ + TKGTE, /* >= */ + TKSHR, /* >> */ + TKSHL, /* << */ + TKINC, /* ++ */ + TKDEC, /* -- */ + TKDOTS, /* ... */ + TKARROW, /* -> */ + TKPPCAT, /* ## */ + TKLOGAND, /* && */ + TKLOGIOR, /* || */ + TKSETADD, /* += */ + TKSETSUB, /* -= */ + TKSETMUL, /* *= */ + TKSETDIV, /* /= */ + TKSETREM, /* %= */ + TKSETIOR, /* |= */ + TKSETXOR, /* ^= */ + TKSETAND, /* &= */ + TKSETSHL, /* <<= */ + TKSETSHR, /* >>= */ TKIDENT = 0x80, #define _(kw, stdc) TKW##kw, #include "keywords.def" @@ -48,20 +49,27 @@ enum toktag { /* single-character tokens' tag value is the character itself */ }; struct token { - enum toktag t; - uchar ty; /* type tag for num lits */ + short t; /* toktag */ + bool litlit; + uint len; struct span span; - union { - uvlong u; - vlong i; - double f; - const char *ident; - struct bytes s; - }; + const char *s; + /* for (multi-)character tokens s & len are unused + * for keywords, s is constant cstring, len = strlen(s) + * for idents, s is interned cstring, len = strlen(s) + * for strlit and chrlit: + * when litlit : s points to start of string within file buffer (after the ") + * len == span.sl.len - 2 (string data appears literally in source code) + * otherwise s is heap allocated buffer of len bytes + * for numlit: + * when litlit : s points to start of token within file buffer (normal case) + * len == span.sl.len (number literal appears literally in source code) + * otherwise s is heap allocated buffer of len bytes + */ }; struct macro { - const char *name; /* interned from tk->ident */ + const char *name; /* interned */ const char **param; struct span0 span; uchar nparam; @@ -101,6 +109,7 @@ struct parser { const char *intern(const char *); int lex(struct parser *, struct token *); int lexpeek(struct parser *, struct token *); +enum typetag parsenumlit(uvlong *, double *, const struct token *, bool ispp); void initparser(struct parser *, const char *file); void parse(struct parser *); diff --git a/test.c b/test.c index cb81b9c..1d6c01f 100644 --- a/test.c +++ b/test.c @@ -1,6 +1,6 @@ /* coment */ -#if 1+1 < (-2*2) +#if 1+1 < (-2*'a') wawa #elif 9<<1 #define wow 3 @@ -16,13 +16,15 @@ int add (int x, int y) { struct foo {struct foo *foo;}; int abs(int x){ - return (x ^ x >> 31) - (x >> 31); + return (x ^ x >> 3\ +1) - (x >> 31); } int popcnt(unsigned x) { int n = 0; while (x) x >>= 1, n++; - return n + sizeof &"รก"[0]; + return n + sizeof "ab\r\ +c"; } struct f { diff --git a/type.c b/type.c index 3aeb8a1..5f984d6 100644 --- a/type.c +++ b/type.c @@ -80,7 +80,7 @@ interntd(const struct typedata *td) if (!slot->t) { uint nmemb; static struct arena *datarena; - if (!datarena) { + if (!datarena) { enum { N = 1<<12 }; static union { char m[sizeof(struct arena) + N]; struct arena *_align; } amem; datarena = (void *)amem.m, datarena->cap = N; @@ -270,9 +270,9 @@ cvtarith(union type a, union type b) if (issigned(a) == issigned(b)) { /* when both are integers with same signage, choose type with greatest rank */ return a.t > b.t ? a : b; - } + } /* if the signed type can represent all values of the unsigned type, - * choose it, otherwise choose its corresponding unsigned type */ + * choose it, otherwise choose its corresponding unsigned type */ /* so long long + unsigned = long long; * but long long + unsigned long = unsigned long long */ if (issigned(a)) { -- cgit v1.2.3