diff options
| author | 2025-10-16 17:25:02 +0200 | |
|---|---|---|
| committer | 2025-10-16 17:25:02 +0200 | |
| commit | 77b13b42643991fc8c2b8942ca167eb7bf156908 (patch) | |
| tree | f65a402832af6111c623af02cf946f7de928e223 | |
| parent | c19b3e277399a513c5e3a02d126ba666847566df (diff) | |
wide str and char literals
| -rw-r--r-- | c.c | 58 | ||||
| -rw-r--r-- | common.h | 8 | ||||
| -rw-r--r-- | embedfilesdir.c | 46 | ||||
| -rw-r--r-- | io.c | 60 | ||||
| -rw-r--r-- | ir.c | 3 | ||||
| -rw-r--r-- | lex.c | 101 | ||||
| -rw-r--r-- | lex.h | 8 | ||||
| -rw-r--r-- | targ.c | 4 | ||||
| -rw-r--r-- | test/pp.c | 9 |
9 files changed, 225 insertions, 72 deletions
@@ -21,23 +21,38 @@ struct comp { static int lexc(struct comp *cm, struct token *tk) { + struct token tk2; int t = lex(&cm->lx, tk); - if (t == TKSTRLIT && peek(cm, NULL) == TKSTRLIT) { + if (t == TKSTRLIT && peek(cm, &tk2) == TKSTRLIT && tk2.wide == tk->wide) { /* 5.1.1.2 Translation phase 6: concatenate adjacent string literal tokens */ static char buf[200]; - struct token tk2; vec_of(char) rest = VINIT(buf, sizeof buf); do { - lex(&cm->lx, &tk2); + lex(&cm->lx, NULL); if (tk) { joinspan(&tk->span.ex, tk2.span.ex); - vpushn(&rest, tk2.s, tk2.len); + if (!tk->wide) + vpushn(&rest, tk2.s, tk2.len); + else if (tk->wide && targ_primsizes[targ_wchartype] == 2) + vpushn(&rest, tk2.ws16, tk2.len*2); + else + vpushn(&rest, tk2.ws32, tk2.len*4); } - } while (peek(cm, NULL) == TKSTRLIT); + } while (peek(cm, &tk2) == TKSTRLIT && tk2.wide == tk->wide); if (tk) { - tk->s = memcpy(alloc(&cm->exarena, tk->len + rest.n, 0), tk->s, tk->len); - memcpy((char *)tk->s + tk->len, rest.p, rest.n); - tk->len += rest.n; + if (!tk->wide) { + tk->s = memcpy(alloc(&cm->exarena, tk->len + rest.n, 1), tk->s, tk->len); + memcpy((char *)tk->s + tk->len, rest.p, rest.n); + tk->len += rest.n; + } else if (tk->wide && targ_primsizes[targ_wchartype] == 2) { + tk->ws16 = memcpy(alloc(&cm->exarena, tk->len + rest.n*2, 2), tk->ws16, tk->len*2); + memcpy((short *)tk->s + tk->len, rest.p, rest.n); + tk->len += rest.n * 2; + } else { + tk->ws32 = memcpy(alloc(&cm->exarena, tk->len + rest.n*4, 4), tk->ws32, tk->len*4); + memcpy((int *)tk->s + tk->len, rest.p, rest.n); + tk->len += rest.n * 4; + } } vfree(&rest); } @@ -769,7 +784,8 @@ Unary: ex.ty.t = ty.t ? ty.t : TYINT; break; case TKSTRLIT: - ex = mkexpr(ESTRLIT, tk.span, mkarrtype(mktype(TYCHAR), 0, tk.len+1), .s = { (uchar *)tk.s, tk.len }); + ty = mktype(((const char []){TYCHAR, TYSHORT, TYINT})[tk.wide]); + ex = mkexpr(ESTRLIT, tk.span, mkarrtype(ty, 0, tk.len+1), .s = { (void *)tk.s, tk.len }); break; case TKIDENT: Ident: @@ -1069,9 +1085,10 @@ objectp(union type ty) } static bool -chararrayp(union type ty) +chrarrayof(union type ty, union type chld) { - return ty.t == TYARRAY && in_range(typechild(ty).t, TYCHAR, TYUCHAR); + assert(isint(chld)); + return ty.t == TYARRAY && isint(typechild(ty)) && typesize(typechild(ty)) == typesize(chld); } static union type @@ -1239,8 +1256,9 @@ iniwrite(struct comp *cm, struct initparser *ip, uint off, union type ty, struct case 8: isint(ty) ? wr64targ(p, e->u) : wrf64targ(p, e->f); break; } } else if (ty.t == TYARRAY && ex->t == ESTRLIT) { - uint n = siz < ex->s.n ? siz : ex->s.n; - //efmt("%s wrs %'S at %u\n", dat->name, ex->s.p, n, off); + uint n = ex->s.n * typesize(typechild(ty)); + if (siz < n) n = siz; + /* XXX endian for wide strs */ memcpy(p, ex->s.p, n); } else { union ref sym; @@ -1320,9 +1338,9 @@ inistrlit(struct comp *cm, struct expr *ex, union type *ty) { if (isincomplete(*ty)) { *ty = mkarrtype(typechild(*ty), ty->flag & TFCHLDQUAL, ex->s.n + 1); - } else if (typesize(*ty) < ex->s.n) { + } else if (typearrlen(*ty) < ex->s.n) { warn(&ex->span, "string literal in initializer is truncated from %u to %u bytes", - ex->s.n+1, typesize(*ty)); + (ex->s.n+1)*typesize(typechild(*ty)), typesize(*ty)); } ex->ty = *ty; } @@ -1344,15 +1362,14 @@ Retry: ++ip->sub->idx; return; } - if (ex.t == ESTRLIT && chararrayp(targ)) { + if (ex.t == ESTRLIT && chrarrayof(targ, typechild(ex.ty))) { assert(!isincomplete(targ)); inistrlit(cm, &ex, &targ); iniwrite(cm, ip, ip->sub->off + off, targ, &ex); ++ip->sub->idx; return; - } else if (ex.t == ESTRLIT && ip->sub->idx == 0 && chararrayp(ip->sub->ty)) { + } else if (ex.t == ESTRLIT && ip->sub->idx == 0 && chrarrayof(ip->sub->ty, typechild(ex.ty))) { /* handle e.g. (char []){"foo"} */ - assert(in_range(targ.t, TYCHAR, TYUCHAR)); assert(off == 0); targ = ip->sub->ty; inistrlit(cm, &ex, &targ); @@ -1534,7 +1551,7 @@ initializer(struct comp *cm, union type *ty, enum evalmode ev, bool globl, if (!match(cm, &tk, '{')) { struct expr ex = expr(cm); - if (ex.t == ESTRLIT && chararrayp(*ty)) { + if (ex.t == ESTRLIT && chrarrayof(*ty, typechild(ex.ty))) { inistrlit(cm, &ex, ty); iniwrite(cm, ip, 0, *ty, &ex); if (ip->dyn) @@ -2542,7 +2559,8 @@ expraddr(struct function *fn, const struct expr *ex) } break; case ESTRLIT: - return mkdatref(NULL, ex->s.n+1, /*align*/ 1, ex->s.p, ex->s.n, /*deref*/0); + /* XXX endian for wide strs */ + return mkdatref(NULL, typesize(ex->ty), typealign(ex->ty), ex->s.p, ex->s.n * typesize(typechild(ex->ty)), /*deref*/0); case EDEREF: return exprvalue(fn, ex->sub); case EGETF: @@ -321,7 +321,9 @@ typechild(union type t) static inline enum typetag scalartypet(union type t) { - return t.t == TYENUM ? t.backing : t.t; + if (t.t == TYENUM) return t.backing; + if (isptrcvt(t)) return TYPTR; + return t.t; } static inline uint typearrlen(union type t) @@ -335,7 +337,7 @@ typearrlen(union type t) extern uchar targ_primsizes[]; extern uchar targ_primalign[]; -extern enum typetag targ_sizetype, targ_ptrdifftype; +extern enum typetag targ_sizetype, targ_ptrdifftype, targ_wchartype; extern bool targ_charsigned, targ_bigendian, targ_64bit; extern const struct mctarg *mctarg; void targ_init(const char *); @@ -574,6 +576,8 @@ void fatal(const struct span *, const char *, ...); void error(const struct span *, const char *, ...); void warn(const struct span *, const char *, ...); void note(const struct span *, const char *, ...); +ushort *utf8to16(uint *ulen, struct arena **, const uchar *s, size_t len); +uint *utf8to32(uint *ulen, struct arena **, const uchar *s, size_t len); #endif /* COMMON_H_ */ diff --git a/embedfilesdir.c b/embedfilesdir.c index 5d58c60..237ea62 100644 --- a/embedfilesdir.c +++ b/embedfilesdir.c @@ -12,8 +12,7 @@ struct embedfile embedfilesdir[] = { {"stddef.h", S("\ typedef __typeof__((char*)0 - (char*)0) ptrdiff_t;\n\ typedef __typeof__(sizeof 0) size_t;\n\ -/*typedef __typeof__(L'a') wchar_t;*/\n\ -typedef int wchar_t;\n\ +typedef __typeof__(L'a') wchar_t;\n\ #define NULL ((void *)0)\n\ #define offsetof(type, memb) ((size_t)((char *)&((type *)0)->memb - (char *)0))\n\ ")}, @@ -37,6 +36,49 @@ typedef __builtin_va_list __gnuc_va_list;\n\ #define __bool_true_false_are_defined 1\n\ ")}, +{"float.h", S("\ +#define FLT_ROUNDS (-1)\n\ +#define FLT_EVAL_METHOD (-1)\n\ +#define FLT_HAS_SUBNORM (-1)\n\ +#define DBL_HAS_SUBNORM (-1)\n\ +#define LDBL_HAS_SUBNORM (-1)\n\ +#define FLT_RADIX 2\n\ +#define FLT_MANT_DIG 24\n\ +#define DBL_MANT_DIG 53\n\ +#define LDBL_MANT_DIG 53\n\ +#define FLT_DECIMAL_DIG 9\n\ +#define DBL_DECIMAL_DIG 17\n\ +#define LDBL_DECIMAL_DIG 17\n\ +#define DECIMAL_DIG 17\n\ +#define FLT_DIG 6\n\ +#define DBL_DIG 15\n\ +#define LDBL_DIG 15\n\ +#define FLT_MIN_EXP (-125)\n\ +#define DBL_MIN_EXP (-1021)\n\ +#define LDBL_MIN_EXP (-1021)\n\ +#define FLT_MIN_10_EXP (-37)\n\ +#define DBL_MIN_10_EXP (-307)\n\ +#define LDBL_MIN_10_EXP (-307)\n\ +#define FLT_MAX_EXP 128\n\ +#define DBL_MAX_EXP 1024\n\ +#define LDBL_MAX_EXP 1024\n\ +#define FLT_MAX_10_EXP 38\n\ +#define DBL_MAX_10_EXP 308\n\ +#define LDBL_MAX_10_EXP 308\n\ +#define FLT_MAX 3.40282e+38\n\ +#define DBL_MAX 1.79769e+308\n\ +#define LDBL_MAX 1.79769e+308\n\ +#define FLT_EPSILON 1.19209e-07\n\ +#define DBL_EPSILON 2.22045e-16\n\ +#define LDBL_EPSILON 2.22045e-16\n\ +#define FLT_MIN 1.17549e-38\n\ +#define DBL_MIN 2.22507e-308\n\ +#define LDBL_MIN 2.22507e-308\n\ +#define FLT_TRUE_MIN 1.4013e-45\n\ +#define DBL_TRUE_MIN 4.94066e-324\n\ +#define LDBL_TRUE_MIN 4.94066e-324\n\ +")}, + {NULL} }; @@ -919,4 +919,64 @@ note(const struct span *span, const char *fmt, ...) va_end(ap); } +/*** UTF util ***/ + +ushort * +utf8to16(uint *ulen, struct arena **arena, const uchar *s, size_t len) +{ + assert(0 && "nyi"); +} + +uint * +utf8to32(uint *ulen, struct arena **arena, const uchar *s, size_t len) +{ + uint *ret, *w; + const uchar *p, *end; + size_t n = 0; + bool istrunc; + + if (!len) return NULL; + + for (p = s; p < s + len; ++n) { + end = p; + if ((*p & 0xF8) == 0xF0) /* 11110xxx */ + p += 4; + else if ((*p & 0xF0) == 0xE0) /* 1110xxxx */ + p += 3; + else if ((*p & 0xE0) == 0xC0) /* 110xxxxx */ + p += 2; + else p += 1; + } + istrunc = p > s+len; + if (!istrunc) end += 1; + + ret = allocz(arena, n * sizeof *ret, sizeof *ret); + for (w = ret, p = s; p < end; ++w) { + if ((*p & 0xF8) == 0xF0) { /* 11110xxx */ + *w = (uint)(p[0] & 0x07) << 18 + | (uint)(p[1] & 0x3F) << 12 + | (uint)(p[2] & 0x3F) << 6 + | (uint)(p[3] & 0x3F); + p += 4; + } else if ((*p & 0xF0) == 0xE0) { /* 1110xxxx */ + *w = (uint)(p[0] & 0x07) << 12 + | (uint)(p[1] & 0x3F) << 6 + | (uint)(p[2] & 0x3F); + p += 3; + } else if ((*p & 0xE0) == 0xC0) { /* 110xxxxx */ + *w = (uint)(p[0] & 0x07) << 6 + | (uint)(p[1] & 0x3F); + p += 2; + } else { + *w = *p; + p += 1; + } + } + if (istrunc) *w++ = 0xFFFD; + *ulen = n; + + return ret; +} + + /* vim:set ts=3 sw=3 expandtab: */ @@ -492,7 +492,8 @@ fillblkids(struct function *fn) void useblk(struct function *fn, struct block *blk) { - if (fn->curblk) assert(fn->curblk->jmp.t && "never finished block"); + extern int nerror; + if (fn->curblk && nerror == 0) assert(fn->curblk->jmp.t && "never finished block"); if (blk) assert(!blk->jmp.t && "reusing built block"); if (!blk->lprev) { /* initialize */ blk->lnext = fn->entry; @@ -157,8 +157,15 @@ parsenumlit(uvlong *outi, double *outf, const struct token *tk, bool ispp) { if (tk->t == TKCHRLIT) { uvlong n = 0; - for (int i = 0; i < tk->len; ++i) - n = n << 8 | (uchar)tk->s[i]; + if (!tk->wide) { + for (int i = 0; i < tk->len; ++i) + n = n << 8 | (uchar)tk->s[i]; + } else if (tk->wide == 1) { + n = tk->ws16[0]; + } else { + assert(tk->wide == 2); + n = tk->ws32[0]; + } if (outi) *outi = n; return TYINT; } else if (memchr(tk->s, '.', tk->len)) { @@ -267,7 +274,7 @@ parsenumlit(uvlong *outi, double *outf, const struct token *tk, bool ispp) } static void -readstrchrlit(struct lexer *lx, struct token *tk, char delim) +readstrchrlit(struct lexer *lx, struct token *tk, char delim, int wide) { int c, i; uchar tmp[80]; @@ -337,7 +344,13 @@ readstrchrlit(struct lexer *lx, struct token *tk, char delim) if (delim == '"') { tk->t = TKSTRLIT; tk->len = b.n; - if (lx->chridx - beginoff == tk->len + 1) { + if ((tk->wide = wide)) { + tk->litlit = 0; + if (wide == 1) + tk->ws16 = utf8to16(&tk->len, lx->tmparena, b.p, b.n); + else + tk->ws32 = utf8to32(&tk->len, lx->tmparena, b.p, b.n); + } else if (lx->chridx - beginoff == tk->len + 1) { tk->litlit = 1; tk->s = (char *)&lx->dat[beginoff]; } else { @@ -355,7 +368,13 @@ readstrchrlit(struct lexer *lx, struct token *tk, char delim) } tk->t = TKCHRLIT; tk->len = b.n; - if (lx->chridx - beginoff == tk->len + 1) { + if ((tk->wide = wide)) { + tk->litlit = 0; + if (wide == 1) + tk->ws16 = utf8to16(&tk->len, lx->tmparena, b.p, b.n); + else + tk->ws32 = utf8to32(&tk->len, lx->tmparena, b.p, b.n); + } else if (lx->chridx - beginoff == tk->len + 1) { tk->litlit = 1; tk->s = (char *)&lx->dat[beginoff]; } else { @@ -416,7 +435,7 @@ static bool lexingheadername = 0; static int lex0(struct lexer *lx, struct token *tk) { - int idx, c; + int idx, c, q; #define RET(t_) do { tk->t = (t_); goto End; } while (0) @@ -505,7 +524,7 @@ Begin: lexingheadername = 0; } else { case '\'': - readstrchrlit(lx, tk, c); + readstrchrlit(lx, tk, c, 0); } goto End; case '.': @@ -516,6 +535,12 @@ Begin: goto Numlit; } RET(c); + case 'L': + if (match(lx, (q = '\'')) || match(lx, (q = '"'))) { + readstrchrlit(lx, tk, q, /* wide */ targ_primsizes[targ_wchartype] == 2 ? 1 : 2); + goto End; + } + /* fallthru */ default: if (aisdigit(c)) Numlit: { char tmp[70]; @@ -671,6 +696,7 @@ putmac(struct macro *mac) return slot; } else if (!slot->name) { /* was tomb */ *slot = *mac; + return slot; } assert(--n && "macro limit"); } @@ -690,7 +716,7 @@ delmac(const char *name) return; } else if ((slot = ¯os.p[macroht[i]-1])->name == name) { freemac(slot); - slot->name = NULL; + memset(slot, 0, sizeof *slot); return; } } @@ -1113,20 +1139,9 @@ expandfnmacro(struct lexer *lx, struct span *span, struct macro *mac) vpush(&rlist2, new); } } - while (l->idx < l->rlist.n) { - tk = l->rlist.tk[l->idx++]; - /* expand argument only once */ - if (tk.s != mac->name && tryexpand(lx, &tk)) { - assert(l != lx->macstk); - while (lx->macstk->idx < lx->macstk->rlist.n) { - vpush(&rlist2, lx->macstk->rlist.tk[lx->macstk->idx++]); - } - popmac(lx); - } else { - vpush(&rlist2, tk); - } - assert(lx->macstk == l); - } + while (lex(lx, &tk) != TKEOF) + vpush(&rlist2, tk); + assert(lx->macstk == l); popmac(lx); if (lhsargpaste) lhsargforpaste = argsbuf.p[arg->idx + arg->n-1]; @@ -1174,10 +1189,30 @@ expandfnmacro(struct lexer *lx, struct span *span, struct macro *mac) vfree(&argsbuf); } +static bool +advancemacro(struct lexer *lx, struct token *tk) +{ + struct rlist rl; + assert(lx->macstk); + rl = lx->macstk->rlist; + if (lx->macstk->idx == rl.n) { + if (lx->macstk->stop) return tk->t = TKEOF; + popmac(lx); + return 0; + } + *tk = rl.tk[lx->macstk->idx++]; + assert(tk->t); + tk->span.ex = lx->macstk->exspan; + if (tryexpand(lx, tk)) + return 0; + return tk->t; +} + static struct token epeektk; static int elex(struct lexer *lx, struct token *tk) { + assert(tk); if (epeektk.t) { int tt = epeektk.t; if (tk) *tk = epeektk; @@ -1185,15 +1220,7 @@ elex(struct lexer *lx, struct token *tk) return tt; } if (lx->macstk) { - const struct rlist rl = lx->macstk->rlist; - if (lx->macstk->idx == rl.n) { - popmac(lx); - return elex(lx, tk); - } - *tk = rl.tk[lx->macstk->idx++]; - assert(tk->t); - tk->span.ex = lx->macstk->exspan; - if (tryexpand(lx, tk)) + if (!advancemacro(lx, tk)) return elex(lx, tk); return tk->t; } @@ -1557,6 +1584,7 @@ ppinclude(struct lexer *lx, const struct span *span0) if (tryinclude(lx, &span, path)) return; } /* try embedded files pseudo-path */ + xbgrow(&path, tk.len + 3); path[0] = '@', path[1] = ':'; memcpy(path+2, tk.s, tk.len); path[tk.len+2] = 0; @@ -1641,16 +1669,7 @@ lex(struct lexer *lx, struct token *tk_) } if (lx->macstk) { - const struct rlist rl = lx->macstk->rlist; - if (lx->macstk->idx == rl.n) { - if (lx->macstk->stop) return tk->t = TKEOF; - popmac(lx); - return lex(lx, tk_); - } - *tk = rl.tk[lx->macstk->idx++]; - assert(tk->t); - tk->span.ex = lx->macstk->exspan; - if (tryexpand(lx, tk)) + if (!advancemacro(lx, tk)) return lex(lx, tk_); return tk->t; } @@ -51,12 +51,17 @@ enum toktag { /* single-character tokens' tag value is the character itself */ struct token { short t; /* toktag */ bool litlit; + uchar wide : 2; /* for CHRLIT & STRLIT; 1 -> 16bit, 2 -> 32bit */ union { uint len; struct { ushort macidx, argidx; }; }; struct span span; - const char *s; + union { + const char *s; + const ushort *ws16; + const uint *ws32; + }; /* for (multi-)character tokens s & len are unused * for keywords, s is constant cstring, len = strlen(s) * for idents, s is interned cstring, len = strlen(s) @@ -64,6 +69,7 @@ struct token { * when litlit : s points to start of string within file buffer (after the ") * len == span.sl.len - 2 (string data appears literally in source code) * otherwise s is heap allocated buffer of len bytes + * when wide, litlit = 0 and use ws16/ws32 * for numlit: * when litlit : s points to start of token within file buffer (normal case) * len == span.sl.len (number literal appears literally in source code) @@ -9,8 +9,8 @@ static const struct targ { uchar sizetype, ptrdifftype, wchartype; const struct mctarg *mctarg; } targs[] = { - { "amd64-sysv", {8, 8, 8, 24}, {8, 8, 8, 8}, 1, TYULONG, TYLONG, TYUINT, &t_amd64_sysv }, - { "i686-sysv", {4, 8, 4, 8}, {4, 4, 4, 4}, 1, TYUINT, TYINT, TYUINT } + { "amd64-sysv", {8, 8, 8, 24}, {8, 8, 8, 8}, 1, TYULONG, TYLONG, TYINT, &t_amd64_sysv }, + { "i686-sysv", {4, 8, 4, 8}, {4, 4, 4, 4}, 1, TYUINT, TYINT, TYINT } }; uchar targ_primsizes[TYPTR+1]; @@ -3,8 +3,8 @@ #include "pp.h" #include <stddef.h> #include <stdio.h> -#include <assert.h> -#include <limits.h> +#include <wchar.h> +#include <locale.h> // #define CATl(a) a##bar #define CATr(a) foo##a @@ -39,6 +39,10 @@ main(void) "%s %g\n", str(Foo,5), xstr(Foo), CAT(1.5,e3f) + CAT(7,)-CAT(,1)); printf("join: \"%s\"\n", p); + setlocale(LC_ALL, "en_US.utf8"); + + printf("wide\t L\"%ls\", U+%x\n", L"abc123 猫,€á💫", L'🦋'); + PUT\ S\ ("Output ends here\\ @@ -47,6 +51,5 @@ S\ */ ); - CAT(ret,urn) 0; } |