/* Copyright (c) Günter Woigk 2018 - 2020 mailto:kio@little-bat.de This file is free software. Permission to use, copy, modify, distribute, and sell this software and its documentation for any purpose is hereby granted without fee, provided that the above copyright notice appears in all copies and that both that copyright notice, this permission notice and the following disclaimer appear in supporting documentation. THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT ANY WARRANTY, NOT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, AND IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DAMAGES ARISING FROM THE USE OF THIS SOFTWARE, TO THE EXTENT PERMITTED BY APPLICABLE LAW. */ #define LOGLEVEL 1 #define SAFETY 3 #include "kio/kio.h" #include "cstrings/utf8.h" #include "Tokenizer.h" #include "Error.h" #include "Word.h" #include "Parser/Names.h" //#include "Ival.h" #include "Type.h" //#include "backends/TargetDescr.h" namespace vpp { // tokenize_line(…) tokenisiert idR. eine Zeile bis Nullbyte oder Newline. // tokenize() tokenisiert alle Zeilen bis zum Nullbyte. // Der Text muss mit einem Nullbyte abgeschlossen sein. (c-style string) // Als Zeilenende wird nur Newline '\n' erkannt. // Der Text muss UTF8-kodiert sein oder reines 7-Bit Ascii. // // tokenize() überspringt '//…' Zeilenkommentare und '/*…*/' Blockkommentare. // tokenize() tokenisiert u.U. mehr als eine Zeile, wenn ein Blockkommentar oder ein Langstring «…» gefunden wird // oder wenn die Zeile mit einem Backslash endet. (Ausnahme: Zeilenkommentar) // Langstrings und Blockkommentare könnengeschachtelt werden. // // Macros werden nicht expandiert. // Preprozessor-Direktiven werden nicht ausgeführt. // // Die abgetrennten Token werden mit den Methoden des TokenizeInterface übergeben // und von diesem z.B. in Worte gewandelt und an einen Array gehängt. // Dabei werden Zeiger in den Originaltext übergeben, bei Strings inkl. Quotes und Escape-Zeichen. // // tokenize() wirft TokenizeError, wenn ein Blockkommentar oder ein String nicht terminiert ist. // Bei Kurzstrings zeigt die enthaltene pos auf das unerwartete nl oder null, // bei Blockkommentar und Langstring auf den Blockstarter. // // tokenize() erkennt // • Strings: ".." '..' `..` und «..» // • Bezeichner: ['_' ] ['_' 0-9]* // • Zahlen: // [0-9] [0-9 a-z A-Z]* // [0-9]+ '.' [0-9]* [a-z A-Z]* // [0-9]+ '.' [0-9]* [eE] [+-]? [0-9]+ [a-z A-Z]* // • Operatoren und Sonderzeichen: // ^!$%&/()=?+*#-.:,;<>@{}[]| // • Mehrzeichenoperatoren: // ++ -- == != >= <= ≤ ≥ << >> && || -> /% := += -= *= /= %= >>= <<= &&= ||= // // Ketten von Sonderzeichen werden von links nach rechts gebrochen, "+++" wird zu "++" plus "+". // // Zahlen werden nicht ausgewertet. Es wird nicht garantiert, dass sie syntaktisch korrekt sind! // Vorzeichen werden separat als Operator '+' bzw. '-' gespeichert. static uint32 normalizeLinebreaks (ptr q) noexcept { // replace non-unix linebreaks with simple '\n' // returns number of bytes removed for DOS linebreaks static const char nl = '\n'; static const char cr = '\r'; ptr z = strchr(q,cr); if(z == nullptr) return 0; if (z>q) z--; // prev char might be nl for (q=z;;) { char c; while (uchar(c = *z++ = *q++) > 13) {} if (c==0) return uint32(q-z); if (c==nl || c==cr) { if (c + *q == cr+nl) q++; *(z-1) = nl; } } } static cptr skip_linecomment (cptr q) noexcept { // skip to end of line // stop at endofline or null // also works with DOS line ends while(*q && *q != '\n') q++; return q; } #ifdef DEBUG ON_INIT([]{ static cstr s1 = "12 \\t4€ Af3'456"; static cstr s2 = "/2 \\t4€ A+3\n//456'"; static cstr s3 = "//\t\\t4€ A*3\\\n456'"; static cstr s4 = "12 \\t4€ //3\r\n//456'"; assert(skip_linecomment(s1) == s1+17); assert(skip_linecomment(s2) == s2+13); assert(skip_linecomment(s3) == s3+14); assert(skip_linecomment(s4) == s4+14); }); #endif static cptr skip_string (cptr q) throws // cstr { // skip beyond next delim // throw at nl or null // note: '\' before line end works with "\r\n" DOS line ends: // caller must also handle this when unescaping the string! char delim = *q++; assert(delim=='"' || delim=='`' || delim=='\''); if(*q==delim && *(q+1)==delim) return q+2; // accept ''' and """ as 1-char strings for(char c; (c=*q++) && c!='\n'; ) { if(c == delim) return q; if(c == '\\') { if(*q=='\r' && *(q+1)=='\n') q++; if(*q) q++; } } throw "unterminated string literal"; } #ifdef DEBUG ON_INIT([]{ static cstr s1 = "'12 \\t4€ Af3'456"; static cstr s2 = "'12 \t4€ A+3\n456'"; static cstr s3 = "'12 \"t\\'€ A*3"; assert(skip_string(s1) == s1+15); try{ skip_string(s2); IERR(); } catch(cstr) {} try{ skip_string(s3); IERR(); } catch(cstr) {} }); #endif static cptr skip_longstring (cptr q) throws // cstr { // skip beyond next '»' // skip over nl // throw at null // skip recursively over '«' .. '»' (must be balanced) // note: "\r\n" DOS line ends are preserved // and caller must handle these when unescaping the string! // note: '«' = $C2,$AB // '»' = $C2,$BB assert(*q==char(0xC2) && *(q+1)==char(0xAB)); q += 2; while(char c = *q++) { if(c > 0) { if(c == '\\') { if(*q) q++; } } else if(c == char(0xC2)) { c = *q++; if(c == char(0xBB)) return q; // '»' if(c == char(0xAB)) { q = skip_longstring(q-2); continue; } } } throw "unterminated string literal"; } #ifdef DEBUG ON_INIT([]{ static cstr s1 = "«12 \\t4€ A¢3'456»xx"; static cstr s2 = "«12 \t4€ A+3\n456'»»"; static cstr s3 = "«12 \"t4«€» ««»»A*3»\n"; static cstr s4 = "«12 \"t4«€» A*3\n"; static cstr s5 = "«12 \"t4«» A*3\""; static cstr s6 = "«123\\"; assert(skip_longstring(s1) == s1+22); assert(skip_longstring(s2) == s2+21); assert(skip_longstring(s3) == s3+29); try{skip_longstring(s4); IERR();}catch(cstr){} try{skip_longstring(s5); IERR();}catch(cstr){} try{skip_longstring(s6); IERR();}catch(cstr){} }); #endif static cptr skip_blockcomment (cptr q) throws // cstr { // skip beyond next "*/" // skip over nl // throw at null // nested block comments are detected and skipped. "/*" .. "*/" must be balanced. // long strings are detected and skipped. '«' .. '»' must be balanced. spurious '»' are ignored. // line comments "//" are detected and skipped. // contained short strings should be delimited. // '*/' at the end of lines with unbalanced short string delimiters are recognized! // -> natural text with apostrophs should work // while commented-out source is expected to contain only balanced strings. assert(*q=='/' && *(q+1)=='*'); q += 2; while(char c = *q++) { if(c > '/') continue; switch(c) { case '*': if(*q == '/') return q+1; // closing '*/' found continue; case '/': if(*q == '*') q = skip_blockcomment(q-1); else if(*q == '/') q = skip_linecomment(q); continue; case '\'': case '"': case '`': try { q = skip_string(q-1); } // skip string (may contain '*/') catch(cstr) {} // else just skip unbalanced quote continue; default: if(c == char(0xC2) && *q == char(0xAB)) q = skip_longstring(q-1); continue; } } throw "unterminated block comment"; } #ifdef DEBUG ON_INIT([]{ {static cstr s = "/*123 \t \nxx**//zz"; assert(skip_blockcomment(s) == s+14);} {static cstr s = "/*12'*/'; // xyz\n\t*/ "; assert(skip_blockcomment(s) == s+20);} {static cstr s = "/*/*foo*/«/*\n»*/;"; assert(skip_blockcomment(s) == s+18);} {static cstr s = "/*\n//foo*/\n//«\n*/x"; assert(skip_blockcomment(s) == s+18);} {static cstr s = "/*it's a lie! */\n"; assert(skip_blockcomment(s) == s+16);} {static cstr s = "/*foo\n//*/"; try{skip_blockcomment(s);IERR();}catch(cstr){}} {static cstr s = "/*foo «*/"; try{skip_blockcomment(s);IERR();}catch(cstr){}} {static cstr s = "/*foo \\"; try{skip_blockcomment(s);IERR();}catch(cstr){}} }); #endif static cptr skip_spaces (cptr q) noexcept { // skip spaces // skip backslash + endofline // stop at endofline, nonspace or null // works with "\r\n" DOS line ends for(char c; (c = *q++); ) { if(uchar(c) <= ' ') { if(c != '\n') continue; else break; } if(c == '\\' && *q == '\n') { q++; continue; } if(c == '\\' && *q == '\r' && *(q+1) == '\n') { q+=2; continue; } if(c == '/' && *q == '/') { return skip_linecomment(q+1); } else break; } return q-1; } #ifdef DEBUG ON_INIT([]{ static cstr s1 = "12 \\t4€ Af3"; static cstr s2 = "\t2"; static cstr s3 = " \t\\\n\t \\\r\n \t \\\r\n \r\nx"; static cstr s4 = "\\\t 12\nx"; static cstr s5 = "\t"; static cstr s6 = "\t// foobar «....x\nxx"; assert(skip_spaces(s1) == s1); assert(skip_spaces(s2) == s2+1); assert(skip_spaces(s3) == s3+19); assert(skip_spaces(s4) == s4); assert(skip_spaces(s5) == s5+1); assert(skip_spaces(s6) == s6+18); }); #endif static cptr skip_identifier (cptr q) noexcept { // Identifier: ['_' ] ['_' 0-9]* assert(utf8::is_letter(q) || *q == '_'); do { q = utf8::nextchar(q); } while( utf8::is_letter(q) || *q=='_' || utf8::is_dec_digit(q) ); return q; } #ifdef DEBUG ON_INIT([]{ {static cstr s = "L23L+"; assert(skip_identifier(s) == s+4);} {static cstr s = "_a65qh.f"; assert(skip_identifier(s) == s+6);} {static cstr s = "é1Ä€"; assert(skip_identifier(s) == s+5);} {static cstr s = "кирлица+"; assert(skip_identifier(s) == s+14);} {static cstr s = "an_0n«»"; assert(skip_identifier(s) == s+5);} {static cstr s = "L23L\n"; assert(skip_identifier(s) == s+4);} {static cstr s = "L23L"; assert(skip_identifier(s) == s+4);} }); #endif inline cptr skip_decimals (cptr p) noexcept { while (is_dec_digit(*p)) { p++; } return p; } static cptr skip_number (cptr q) noexcept { // Number: // '0x' [0-9,a-f,A-F]+ // '0b' [01]+ // [0-9]+ // [0-9]+ '.' [0-9]+ // [0-9]+ '.' [0-9]+ [eE] [+-]? [0-9]+ // [0-9]+ [eE] [+-]? [0-9]+ if (*q == '0') { if ((*(q+1)|0x20)=='x' && is_hex_digit(*(q+2))) // hex number { q += 3; while(is_hex_digit(*q)) { q++; } return q; } if ((*(q+1)|0x20)=='b' && is_bin_digit(*(q+2))) // bin number { q += 3; while(is_bin_digit(*q)) { q++; } return q; } } if (*q == '+' || *q == '-') q++; // skip sign assert(is_dec_digit(*q)); q = skip_decimals(q+1); // skip manissa or integer number if (*q=='.' && is_dec_digit(*(q+1))) // decimal dot: fractional part of floating point number { // test is_dec_digit() wg. member functions, e.g. 123.lo() q = skip_decimals(q+2); } if ((*q|0x20)=='e') // exponent { cptr q0 = q++; if (*q=='+'||*q=='-') q++; if (no_dec_digit(*q)) return q0; q = skip_decimals(q); } if (*q == 's' || *q == 'l') q++; // size specifier return q; } #ifdef DEBUG ON_INIT([]{ {static cstr s = "123L+"; assert(skip_number(s) == s+3);} {static cstr s = "0xA23L+"; assert(skip_number(s) == s+5);} {static cstr s = "0A23h0"; assert(skip_number(s) == s+1);} {static cstr s = "1e65qh.f"; assert(skip_number(s) == s+4);} {static cstr s = "12.34e+5s."; assert(skip_number(s) == s+9);} {static cstr s = "123L+"; assert(skip_number(s) == s+3);} {static cstr s = "123L\n"; assert(skip_number(s) == s+3);} {static cstr s = "123L"; assert(skip_number(s) == s+3);} {static cstr s = "12.L"; assert(skip_number(s) == s+2);} {static cstr s = "12e34"; assert(skip_number(s) == s+5);} {static cstr s = "12.0e"; assert(skip_number(s) == s+4);} }); #endif static cptr skip_operator (cptr q) noexcept { // • Operatoren und Sonderzeichen: // ^!$%&/()=?+*#-.:,;<>@{}[]| // • Mehrzeichenoperatoren: // ≤ ≥ ++ -- == != >= <= << >> && || -> /% := += -= *= /= %= >>= <<= &&= ||= char c1 = *q; if(strchr("+-*/%><:=!&|^",c1)) // potential 2- and 3-char operators { char c2 = *(q+1); static char o[] = "<<=>>=&&=||=++ -- == != >= <= !! -> /% := += -= *= /= %= &= |= ^= "; for(uint i=0; i tulong->bits) throw "literal size exceeds sizeof ulong"; } else if (*a=='0' && (*(a+1)|0x20)=='b') // 0b0101.. { uint64 value = strtoull(a+2,&z,2); *ptr(e) = c; uint bits = uint(e - a) -2; cType* type = bits <= 8 ? tushort : bits <= 16 ? tuint16 : bits <= 32 ? tuint32 : tulong; append_word(type,value); if (s|l) throw "suffix s|l not allowed: size is based on number of digits"; if (bits > tulong->bits) throw "literal size exceeds sizeof ulong"; } else if (skip_decimals(a) == e) // 1234.. { uint64 value = strtoull(a,&z,10); *ptr(e) = c; cType* type = l ? tulong : s ? tushort : value <= tushort->max ? tushort : value <= tuint16->max ? tuint16 : value <= tuint32->max ? tuint32 : tulong; append_word(type,value); if (s && type->bits > tushort->bits) throw "value exceeds sizeof ushort"; if (errno==ERANGE || type->bits > tulong->bits) throw "value exceeds sizeof ulong"; } else if ((*a=='+' || *a=='-') && skip_decimals(a+1) == e) // ±1234.. { int64 value = strtoll(a,&z,10); *ptr(e) = c; cType* type = l ? tlong : s ? tshort : value >= tshort->min && value <= tshort->max ? tshort : value >= tint16->min && value <= tint16->max ? tint16 : value >= tint32->min && value <= tint32->max ? tint32 : tlong; append_word(type,value); if (s && type->bits > tshort->bits) throw "value exceeds sizeof short"; if (errno==ERANGE || type->bits > tlong->bits) throw "value exceeds sizeof long"; } else // float { float128 value = std::strtold(a,&z); *ptr(e) = c; cType* type = s ? tsfloat : l ? tlfloat : tfloat; if (type == nullptr) throw "the target system does not support floating point numbers"; append_word(type,value); if (errno==ERANGE || !type->canRepresent(value)) { cstr sfx = names[type->name]; if ((errno&&value!=0.0l) || type->isInfinity(value)) throw usingstr("%s: value reached infinity", sfx); if ((errno&&value==0.0l) || type->isRoundToZero(value)) throw usingstr("%s: value rounded to null", sfx); if (type->isDenormalized(value)) { if (type == tlfloat || type->numLostBits(value) < 3) logline("%s %s: %i bits lost due to denormalization", sfx, substr(a,e), type->numLostBits(value)); else throw usingstr("%s: %i bits lost due to denormalization", sfx, type->numLostBits(value)); } } } if(errno) throw strerror(errno); // EINVAL, ERANGE assert(z==e); assert(*e==c); } void Tokenizer::parse_base256_number(cptr a, cptr e) throws // cstr { assert (*a=='\''); // base-256 number errno = noerror; cstr s = utf8::unescapedstr(substr(a+1,e-1)); // preserves escaped char(0) as 2 byte code //if (errno) throw errorstr(errno); uint len = utf8::charcount(s); if (len==1) // single character: may be any unicode value in range of target's char type { uint32 n; utf8::utf8_to_ucs4(s,&n); append_word(tchar,n); if (errno) throw errorstr(errno); if (n > tchar->max) throw "character code exceeds char"; } else // multi char: ucs1 only { ucs1char us[8] = {0,0,0,0,0,0,0,0}; utf8::utf8_to_ucs1(s,us+8-minmax(1u,len,8u)); uint64 n = peek8Z(us); auto type = len <= 2 ? tuint16 : len <= 4 ? tuint32 : tuint64; append_word(type,n); if (len < 1) throw "base-256 literal: min. 1 character required"; if (len > 8) throw "base-256 literal: max. 8 characters allowed"; if (errno) throw errorstr(errno); //if (errno) throw "base-256 number: character code exceeds ucs1 (latin-1)"; if (type->bits > tulong->bits) throw "literal size exceeds ulong"; } } void Tokenizer::parse_string(cptr a, cptr e) throws // cstr { // parse string literal and return Value object // cstring is unquoted, unescaped and utf8-encoded // DOS linebreaks in long strings are normalized to '\n' errno = noerror; if (*a=='"') // short string { cstr s = utf8::unescapedstr(substr(a+1,e-1)); // sets errno append_word(s); // sets errno } else if(*a==char(0xC2)) // long string { str s = substr(a+2,e-2); normalizeLinebreaks(s); s = utf8::unescapedstr(s); // sets errno append_word(s); // sets errno } else IERR(); if (errno) throw errorstr(errno); } Words Tokenizer::tokenize (cstr source, uint32 offs) throws // AnyError { // tokenize source[offs++] --> words[] // skips: initial BOM and SHEBANG // white space, line and block comments // stores: identifiers, operators etc., numbers and text values in words[] // linebreaks (except if inside string or block comment) // stores: errors in errors[] // throws: if too many errors assert(words.count() == 0); this->source = source; cptr a, q = source + offs; // skip BOM: if (*q==char(0xEF) && *(q+1)==char(0xBB) && *(q+2)==char(0xBF)) q += 3; // skip SHEBANG: if (*q=='#' && *(q+1)=='!') { q = strchr(q,'\n'); if(!q) return std::move(words); } // DOIT: while (errors.count() <= max_errors) { try { for (a = q = skip_spaces(q); char c = *a; a = q = skip_spaces(q)) { spos = uint32(a-source); if(utf8::is_letter(a) || c=='_') { q = skip_identifier(a); parse_identifier(a,q); } else if(is_dec_digit(c)) { num: q = skip_number(a); if (*q=='s' || *q=='l') q++; parse_number(a,q); // throws } else if(c=='"') { q = skip_string(a); // throws parse_string(a,q); // throws } else if(c=='\'') { q = skip_string(a); // throws parse_base256_number(a,q); // throws } else if(c==char(0xC2) && *(a+1)==char(0xAB)) // '«' { q = skip_longstring(a); // throws parse_string(a,q); } else if(c == '/' && *(a+1) == '*') { q = skip_blockcomment(a); // throws continue; } else if((c=='+' || c=='-') && is_dec_digit(*(a+1))) { // try to use '+' and '-' as numeric sign: // this is important to handle numeric signs before unsigned values // which grow in size when they become signed // e.g. 40000 = uint16 --> +40000 = int32 // because Value::operator+(int) will not grow size beyond size of dflt. int // but truncate the value instead, to mimic what compiled code does. //char d = *(q+1); //char e = *(q+2) | 0x20; //if(d=='0' && (e=='x' || e=='b')) goto op; // test for number sign: // pass-through bei nl // nicht nach ++ -- // nach operator // nach ({[,;~!?: // nicht nach string, number or other idf // nicht nach )}] note: ')' könnte cast sein uint i = words.count(); NameID idf; do { idf = words[--i].idf; } while(idf == tNL); if(idf==tIVAL) goto op; // after literal number or string if(idf==tINCR || idf==tDECR) goto op; // must be postfixes: ++ival is not possible if(idf<=tEKauf) goto num; // after operator ( { [ , ; //if(idf==tRKauf || idf==tGKauf || idf==tEKauf) goto num; //if(idf==tKOMMA || idf==tSEMIK) goto num; goto op; } else { op: q = skip_operator(a); parse_identifier(a,q); } } return std::move(words); // ok } catch (cstr msg) { errors.append(SyntaxError(spos, "%s", msg)); } } // too many errors: throw AnyError("too many errors"); } Tokenizer::Tokenizer (Names& names, Errors& errors) : names(names), errors(errors) {} } // namespace