From 1eb17cda6780476b166b55d0fedc3ad355969e87 Mon Sep 17 00:00:00 2001
From: lemon
Date: Fri, 12 Aug 2022 16:43:06 +0200
Subject: selfhosted lexer

---
 src/all.hff   |   5 +++
 src/fmt.cff   | 113 +++++++++++++++++++++++++++++++++-------------
 src/libc.hff  |   3 ++
 src/parse.cff | 142 +++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 src/util.cff  |  54 ++++++++++++++++++++++
 5 files changed, 281 insertions(+), 36 deletions(-)

(limited to 'src')

diff --git a/src/all.hff b/src/all.hff
index a35694b..e090570 100644
--- a/src/all.hff
+++ b/src/all.hff
@@ -81,6 +81,7 @@ struct Tok {
         flo f64,
         bool bool,
         str [#]const u8,
+        ident *const u8, // spelling of an identifier or keyword token (set by lex)
     },
 }
 
@@ -104,15 +105,19 @@ extern fn parse(*Parser) [#]Decl;
 
 // util.cff
 extern fn xmalloc(n usize) *void;
+extern fn xcalloc(n usize, m usize) *void;
 extern fn xrealloc(p *void, n usize) *void;
+extern fn xstrdup(str *const u8) *u8;
 def FNV1A_INI u32 = 0x811c9dc5;
 extern fn fnv1a(h u32, [#]const u8) u32;
 extern fn fnv1a_s(h u32, *const u8) u32;
 extern fn addfilepath(*const u8) int;
 extern fn fatal(*Parser, Loc, fmt *const u8, ...) void;
+extern fn internstr(*const u8) *const u8;
 
 // fmt.cff
 extern fn vpfmt(proc *fn(u8, *void) void, parg *void, fmt *const u8, va_list) void;
+extern fn pfmt(proc *fn(u8, *void) void, parg *void, fmt *const u8, ...) void;
 extern fn vefmt(fmt *const u8, ap va_list) void;
 extern fn efmt(fmt *const u8, ...) void;
diff --git a/src/fmt.cff b/src/fmt.cff
index 885ea50..ecf7d74 100644
--- a/src/fmt.cff
+++ b/src/fmt.cff
@@ -3,41 +3,83 @@ import "all.hff";
 extern fn vpfmt(proc *fn(u8, *void) void, parg *void, fmt *const u8, ap va_list) void {
     defmacro p(x) [ proc(x, parg) ]
     defmacro ps(s) [
-        for let $i = 0; (s)[$i] != 0; ++$i {
-            p(s[$i]);
+        for let $s *const u8 = (s); *$s != 0; ++$s {
+            p(*$s);
         }
     ]
-    let buf [100]u8 = {};
+    // pch(ch): emit ch as-is when isprint accepts it, otherwise as a backslash plus three octal digits, most significant first.
+    defmacro pch(ch) [ {
+        extern fn isprint(int) int;
+        if isprint((ch)) != 0 {
+            p((ch));
+        } else {
+            p('\\');
+            p('0' + (((ch) / 8 / 8) % 8));
+            p('0' + (((ch) / 8) % 8));
+            p('0' + ((ch) % 8));
+        }
+    } ]
+
+    static buf [100]u8 = {};
+    // pritok: print one token through proc; `quote` wraps identifiers, keywords and operator tokens in `...'.
+    fn pritok(proc typeof(proc), parg *void, quote bool, tok *const Tok) void {
+        switch tok.t {
+        case :int;
+            sprintf(buf, "%lld", tok.u.int);
+            ps(buf);
+        case :str;
+            pfmt(proc, parg, "%S", tok.u.str);
+        case :ident;
+            if quote { p('`'); }
+            ps(tok.u.ident);
+            if quote { p('\''); }
+        case else
+            if tok.t >= 0 and tok.t < NUM_KEYWORDS { // keyword: lex stored its spelling in u.ident
+                if quote { p('`'); }
+                ps(tok.u.ident);
+                if quote { p('\''); }
+            } else if tok.t > 0 { // operator: the token type packs its characters, printed after a byte swap
+                if quote { p('`'); }
+                let t = bswap32(tok.t);
+                let i = 0;
+                while t != 0 {
+                    if t & 0xFF != 0 {
+                        p(t);
+                    }
+                    t >>= 8;
+                }
+                buf[i] = '\0';
+                if quote { p('\''); }
+            }
+        }
+    }
+
     for let c u8 = *fmt; c != 0; c = *++fmt {
         assert(c != 0, "?");
         if c != '%' {
             p(c);
             if fmt[1] == 0 { break; }
-            continue;
+            continue;
         }
         let quote = #f;
-        #'fmt do {
+        for ;; { // 'q' sets quote and continues to the next specifier character; the break after the switch ends the loop
             switch (c = *++fmt) {
-            case 'i';
-                sprintf(buf, "%d", ap->arg(int));
-                ps(buf);
             case 'q';
                 quote = #t;
-                continue #'fmt;
+                continue;
+            case 'i', 'd'; // %i / %d: int via sprintf
+                sprintf(buf, "%d", ap->arg(int));
+                ps(buf);
+            case 'p'; // %p: pointer via sprintf
+                sprintf(buf, "%p", ap->arg(*void));
+                ps(buf);
             case 'c';
                 let ch u32 = ap->arg(int);
                 if quote {
-                    extern fn isprint(int) int;
                     p('\'');
                     for ch = bswap32(ch); ch != 0; ch >>= 8 {
                         if ch & 0xFF != 0 {
-                            if isprint(ch) != 0 { p(ch); }
-                            else {
-                                p('\\');
-                                p('0' + (ch % 8));
-                                p('0' + ((ch / 8) % 8));
-                                p('0' + ((ch / 8 / 8) % 8));
-                            }
+                            pch(ch & 0xFF); // classify and print only the current byte, not the whole shifted word
                         }
                     }
                     p('\'');
@@ -57,38 +99,47 @@ extern fn vpfmt(proc *fn(u8, *void) void, parg *void, fmt *const u8, ap va_list)
                     extern fn isprint(int) int;
                     p('\"');
                     for let c u8 #?; (c = *s++) != 0; {
-                        if isprint(c) != 0 {
-                            p(c);
-                        } else {
-                            p('\\');
-                            p('0' + (c % 8));
-                            p('0' + ((c / 8) % 8));
-                            p('0' + ((c / 8 / 8) % 8));
-                        }
+                        pch(c);
                     }
                     p('\"');
                 } else {
                     ps(s);
                 }
+            case 'S'; // %S: counted string ([#]const u8), always double-quoted and escaped through pch
+                let str = ap->arg([#]const u8);
+                p('"');
+                foreach(c, i, str,
+                    pch(c);
+                )
+                p('"');
+            case 'T'; // %T: a Tok passed by value, printed by pritok (honours the 'q' modifier)
+                let tok = ap->arg(Tok);
+                pritok(proc, parg, quote, &tok);
             case else
-                // assert(#f, "bad fmt '%c' @ %d", c, i);
+                assert(#f, "bad fmt '%c'", c);
             }
-        } while #f;
+            break;
+        }
     }
 }
 
+extern fn pfmt(proc *fn(u8, *void) void, parg *void, fmt *const u8, ...) void { // variadic front end: forwards its arguments to vpfmt
+    let ap va_list #?;
+    ap->start(fmt);
+    vpfmt(proc, parg, fmt, ap);
+    ap->end();
+}
+
 extern fn vefmt(fmt *const u8, ap va_list) void {
-    fn epri(c u8, *void) void {
+    fn eputc(c u8, *void) void {
         fputc(c, stderr);
     }
-
-    vpfmt(&epri, #null, fmt, ap);
+    vpfmt(&eputc, #null, fmt, ap);
 }
 
 extern fn efmt(fmt *const u8, ...) void {
     let ap va_list #?;
     ap->start(fmt);
     vefmt(fmt, ap);
     ap->end();
 }
diff --git a/src/libc.hff b/src/libc.hff
index a231614..488a495 100644
--- a/src/libc.hff
+++ b/src/libc.hff
@@ -18,12 +18,15 @@ extern fn abort() void;
 extern fn exit(c int) void;
 extern fn perror(s *const u8) void;
 extern fn malloc(n usize) *void;
+extern fn calloc(n usize, m usize) *void;
 extern fn realloc(p *void, n usize) *void;
 extern fn free(p *void) void;
 
 // string.h
 extern fn strlen(s *const u8) usize;
 extern fn strcmp(a *const u8, b *const u8) int;
+extern fn memcpy(*void, *const void, usize) *void;
+extern fn strcpy(*u8, *const u8) *u8;
 
 //ctype.h
 extern fn tolower(int) int;
diff --git a/src/parse.cff b/src/parse.cff
index d672173..bcb5763 100644
--- a/src/parse.cff
+++ b/src/parse.cff
@@ -1,3 +1,4 @@
+import "vec.hff";
 import "all.hff";
 
 ///////////
@@ -31,6 +32,9 @@ fn chrpeek(P *Parser) int {
     case Some c; return c;
     }
     let c = fgetc(P.fp);
+    if c == EOF { // remember end of input so parse() can stop its token loop
+        P.eof = #t;
+    }
     P.peekchr = :Some c;
     return c;
 }
@@ -63,7 +67,7 @@ fn isxdigit(c u8) bool {
 
 fn isalpha(c u8) bool {
     return (c >= 'a' and c <= 'z')
-        or (c >= 'A' and c <= 'z');
+        or (c >= 'A' and c <= 'Z');
 }
 
 fn issep(c u8) bool {
@@ -153,7 +157,7 @@ fn readnumber(s *const u8) Option {
             continue;
         }
         if nused > 0 and c == '_' { continue; }
-        if (base == 16 and not isdigit(c))
+        if (base == 16 and not isxdigit(c))
         or (base != 16 and (c < '0' or c > ('0' + base) - 1)) {
             suffix = s + i;
         }
@@ -197,7 +201,7 @@ fn lex(P *Parser) Tok {
     if isdigit(c = chrpeek(P)) {
         let s [80]u8 = {};
         if readtilsep(P, s[0::], #t) < 0 {
-            // fatal
+            fatal(P, tok.loc, "bad number literal");
         }
         switch readnumber(s) {
         case None;
@@ -207,7 +211,131 @@ fn lex(P *Parser) Tok {
             return tok;
         }
     }
-    if c == EOF {
+    if isalpha(c) or c == '_' { // identifier or keyword
+        let s [120]u8 = {}; // zero-filled, like the number buffer above
+        if readtilsep(P, s[0::], #f) < 0 {
+            fatal(P, tok.loc, "identifier too long");
+        }
+        let kw = str2keyword(s);
+        if kw >= 0 {
+            tok.t = kw;
+            tok.u.ident = keyword2str[kw];
+        } else {
+            tok.t = :ident;
+            tok.u.ident = internstr(s);
+        }
+        return tok;
+    }
+    if c == '"' or c == '\'' { // string or character literal; both currently produce a :str token
+        chr(P);
+        let delim = c;
+        let str Vec = {}; // NOTE(review): element type looks missing here (cf. `Vec<*const u8>` in internstr) - confirm
+        let c u8 #?;
+        let i = 0z;
+        while (c = chr(P)) != delim {
+            if c == 0 or c == '\n' {
+                fatal(P, P.tokloc, "unterminated %s literal",
+                      delim == '"' ? "string" : "character");
+            }
+            if c != '\\' {
+                str->push(c);
+                continue;
+            }
+            switch ((c = chr(P))) { // escape sequence after a backslash
+            case 0, '\n';
+                fatal(P, P.tokloc, "unterminated %s literal",
+                      delim == '"' ? "string" : "character");
+            case '\''; str->push('\''); case '\\'; str->push('\\');
+            case '"'; str->push('"'); case 'n'; str->push('\n');
+            case 'r'; str->push('\r'); case 't'; str->push('\t');
+            case 'v'; str->push('\v'); case 'f'; str->push('\f');
+            case '0'; str->push('\0');
+            case else
+                fatal(P, P.tokloc, "unknown escape sequence '\\%c'", c);
+            }
+        }
+
+        tok.t = :str;
+        tok.u.str = str->compact();
+        return tok;
+    }
+    switch c = chr(P) { // punctuation and operators; multi-character operators use multi-character literals as token types
+    case '(', ')', '[', ']', '{',
+         '}', ',', ';', '?', '~';
+        tok.t = c;
+        return tok;
+    case '.';
+        if chrmatch(P, '.') {
+            if chrmatch(P, '.') { tok.t = '...'; }
+            else { tok.t = '..'; }
+        } else { tok.t = '.'; }
+        return tok;
+    case '*';
+        if chrmatch(P, '=') { tok.t = '*='; }
+        else { tok.t = '*'; }
+        return tok;
+    case '/';
+        if chrmatch(P, '=') { tok.t = '/='; }
+        else if chrmatch(P, '/') { // line comment: skip to end of line, then lex the next token
+            while (c = chr(P)) != 0 and c != '\n' { }
+            return lex(P);
+        }
+        else { tok.t = '/'; }
+        return tok;
+    case '%';
+        if chrmatch(P, '=') { tok.t = '%='; }
+        else { tok.t = '%'; }
+        return tok;
+    case '+';
+        if chrmatch(P, '=') { tok.t = '+='; }
+        else if chrmatch(P, '+') { tok.t = '++'; }
+        else { tok.t = '+'; }
+        return tok;
+    case '-';
+        if chrmatch(P, '=') { tok.t = '-='; }
+        else if chrmatch(P, '-') { tok.t = '--'; }
+        else if chrmatch(P, '>') { tok.t = '->'; }
+        else { tok.t = '-'; }
+        return tok;
+    case '&';
+        if chrmatch(P, '=') { tok.t = '&='; }
+        else { tok.t = '&'; }
+        return tok;
+    case '|';
+        if chrmatch(P, '=') { tok.t = '|='; }
+        else { tok.t = '|'; }
+        return tok;
+    case '^';
+        if chrmatch(P, '=') { tok.t = '^='; }
+        else { tok.t = '^'; }
+        return tok;
+    case ':';
+        if chrmatch(P, ':') { tok.t = '::'; }
+        else { tok.t = ':'; }
+        return tok;
+    case '=';
+        if chrmatch(P, '=') { tok.t = '=='; }
+        else { tok.t = '='; }
+        return tok;
+    case '!';
+        if chrmatch(P, '=') { tok.t = '!='; }
+        else { tok.t = '!'; }
+        return tok;
+    case '<';
+        if chrmatch(P, '=') { tok.t = '<='; }
+        else if chrmatch(P, '<') {
+            if chrmatch(P, '=') { tok.t = '<<='; }
+            else { tok.t = '<<'; }
+        } else { tok.t = '<'; }
+        return tok;
+    case '>';
+        if chrmatch(P, '=') { tok.t = '>='; }
+        else if chrmatch(P, '>') {
+            if chrmatch(P, '=') { tok.t = '>>='; }
+            else { tok.t = '>>'; }
+        } else { tok.t = '>'; }
+        return tok;
+    case EOF, 0;
         tok.t = :eof;
         return tok;
     }
@@ -215,7 +343,11 @@ fn lex(P *Parser) Tok {
 }
 
 extern fn parse(P *Parser) [#]Decl {
-    let tok = lex(P);
+    while not P.eof { // NOTE(review): only dumps tokens to stderr and returns nothing yet - confirm this is intended WIP
+        let tok = lex(P);
+        if tok.t == :eof { break; }
+        efmt("* tok: %qT\n", tok);
+    }
 }
 
 extern fn parser_init(P *Parser, path *const u8) void {
diff --git a/src/util.cff b/src/util.cff
index c3a08cb..d1c9dc5 100644
--- a/src/util.cff
+++ b/src/util.cff
@@ -1,3 +1,4 @@
+import "vec.hff";
 import "all.hff";
 
 extern fn xmalloc(n usize) *void {
@@ -6,12 +7,23 @@ extern fn xmalloc(n usize) *void {
     return p;
 }
 
+extern fn xcalloc(n usize, m usize) *void { // zero-filled array of n elements of m bytes each; asserts instead of returning null
+    let p = calloc(n, m);
+    assert(p != #null, "calloc");
+    return p;
+}
+
 extern fn xrealloc(p *void, n usize) *void {
     let p = realloc(p, n);
     assert(p != #null, "realloc");
     return p;
 }
 
+extern fn xstrdup(str *const u8) *u8 { // copy of str (terminator included) in memory obtained from xmalloc
+    let p = xmalloc(strlen(str) + 1);
+    strcpy(p, str);
+    return p;
+}
 
 extern fn fnv1a(h u32, d [#]const u8) u32 {
     foreach(i, x, d,
@@ -65,3 +77,45 @@ extern fn fatal(P *Parser, loc Loc, fmt *const u8, ...) void {
     ap->end();
     exit(1);
 }
+// internstr: return one shared copy of s. Equal strings get the same pointer; a new string is copied with xstrdup and remembered.
+extern fn internstr(s *const u8) *const u8 {
+    static buf Vec<*const u8> = {}; // every interned copy, in insertion order (re-walked when the table grows)
+    static set **const u8 = {}; // open-addressing table of N pointer slots, linear probing, #null = empty
+    static N int = {}; // slot count; masked with N - 1, so it stays a power of two (16, then doubled)
+    static count int = {}; // number of occupied slots
+
+    if set == #null {
+        set = xcalloc(N = 16, sizeof *const u8); // slots hold pointers, so size them as pointers
+    }
+
+    if count == N / 2 { // half full: double the table and re-insert every interned string
+        free(set);
+        set = xcalloc(N *= 2, sizeof *const u8);
+        vec_each(s, i, buf,
+            let i = fnv1a_s(FNV1A_INI, s) & (N - 1);
+            for ;; {
+                if set[i] == #null {
+                    set[i] = s;
+                    break;
+                }
+                i = (i + 1) & (N - 1);
+            }
+        )
+    }
+
+    let i0 = fnv1a_s(FNV1A_INI, s) & (N - 1);
+    let i int = i0;
+    do {
+        if set[i] == #null { // miss: intern a private copy in the first free slot of the probe sequence
+            ++count;
+            buf->push(xstrdup(s));
+            set[i] = buf.dat[buf.len - 1];
+            return set[i];
+        } else if streq(set[i], s) { // hit: hand back the copy stored earlier
+            return set[i];
+        }
+        i = (i + 1) & (N - 1);
+    } while i != i0;
+    assert(#f, "unreachable");
+
+}
-- 
cgit v1.2.3