1 module dud.sdlang.lexer; 2 3 import std.ascii; 4 import std.array : appender, empty, front, popFront, popBack; 5 import std.algorithm.searching : startsWith; 6 import std.base64; 7 import std.exception : enforce; 8 import std.conv : to; 9 import std.experimental.logger; 10 import std.format : format; 11 import std.typecons : Flag; 12 import std.stdio; 13 14 import dud.sdlang.tokenmodule; 15 import dud.sdlang.value; 16 17 struct Lexer { 18 string input; 19 20 size_t line; 21 size_t column; 22 23 Token cur; 24 25 this(string input) @safe pure { 26 this.input = input; 27 this.line = 1; 28 this.column = 1; 29 this.buildToken(); 30 } 31 32 private bool eatComment() @safe pure { 33 if(this.input.startsWith("#") || this.input.startsWith("--") 34 || this.input.startsWith("//")) 35 { 36 while(!this.input.empty 37 && (!this.input.startsWith('\n') 38 && !this.input.startsWith('\r'))) 39 { 40 ++this.column; 41 this.input.popFront(); 42 } 43 return true; 44 } else if(this.input.startsWith("/*")) { 45 while(!this.input.empty && !this.input.startsWith("*/")) { 46 if(this.input.startsWith('\n') || this.input.startsWith('\r')) { 47 ++this.line; 48 this.column = 1; 49 } else { 50 ++this.column; 51 } 52 this.input.popFront(); 53 } 54 enforce(!this.input.empty, 55 "No more input while parsing a C comment"); 56 this.input = this.input[2 .. $]; 57 return true; 58 } 59 return false; 60 } 61 62 private void eatWhitespace() @safe pure { 63 while(!this.input.empty) { 64 if(this.eatComment()) { 65 continue; 66 } else if(this.input.front == ' ') { 67 ++this.column; 68 } else if(this.input.front == '\t') { 69 ++this.column; 70 } else { 71 break; 72 } 73 this.input.popFront(); 74 } 75 } 76 77 private void singleCharToken(TokenType tt) @safe pure { 78 this.cur = Token(tt, this.line, this.column); 79 ++this.column; 80 this.input.popFront(); 81 } 82 83 private void buildToken() @safe pure { 84 this.eatWhitespace(); 85 86 if(this.input.empty) { 87 this.cur = this.cur.type == TokenType.eof 88 ? Token(TokenType.undefined, this.line, this.column) 89 : Token(TokenType.eof, this.line, this.column); 90 return; 91 } 92 93 if(this.input.front == '{') { 94 this.singleCharToken(TokenType.lcurly); 95 return; 96 } else if(this.input.front == '}') { 97 this.singleCharToken(TokenType.rcurly); 98 return; 99 } else if(this.input.front == '\r') { 100 this.singleCharToken(TokenType.eol); 101 ++this.line; 102 this.column = 1; 103 return; 104 } else if(this.input.front == '\n') { 105 this.singleCharToken(TokenType.eol); 106 ++this.line; 107 this.column = 1; 108 return; 109 } else if(this.input.front == '=') { 110 this.singleCharToken(TokenType.assign); 111 return; 112 } else if(this.input.front == ':') { 113 this.singleCharToken(TokenType.colon); 114 return; 115 } else if(this.input.front == '\\') { 116 ++this.column; 117 this.input.popFront(); 118 while(this.input.front != '\n') { 119 this.input.popFront(); 120 ++this.column; 121 } 122 this.column = 1; 123 ++this.line; 124 this.input.popFront(); 125 this.buildToken(); 126 return; 127 } else if(this.input.front == ';') { 128 this.singleCharToken(TokenType.semicolon); 129 return; 130 } else if(this.input.front == '[') { 131 size_t l = this.line; 132 size_t c = this.column; 133 ++this.column; 134 this.input.popFront(); 135 136 size_t rbrack; 137 while(rbrack < this.input.length && this.input[rbrack] != ']') { 138 ++rbrack; 139 ++this.column; 140 } 141 142 ++this.column; 143 144 string theData = this.input[0 .. rbrack]; 145 ubyte[] data = Base64.decode(theData); 146 this.input = this.input[rbrack + 1 .. $]; 147 this.cur = Token(TokenType.value, Value(data), theData, l, c); 148 return; 149 } else if(this.input.startsWith("`")) { 150 size_t l = this.line; 151 size_t c = this.column; 152 ++this.column; 153 this.input.popFront(); 154 155 auto app = appender!string(); 156 157 while(this.input.front != '`') { 158 app.put(this.input.front); 159 if(this.input.front == '\n') { 160 ++this.line; 161 this.column = 1; 162 } else { 163 ++this.column; 164 } 165 this.input.popFront(); 166 } 167 168 assert(this.input.front == '`', this.input); 169 this.input.popFront(); 170 this.cur = Token(TokenType.value, Value(app.data), app.data, l, c); 171 return; 172 } else if(this.input.front == '"') { 173 size_t l = this.line; 174 size_t c = this.column; 175 ++this.column; 176 this.input.popFront(); 177 178 auto app = appender!string(); 179 180 while(!this.input.startsWith('"')) { 181 if(this.input.startsWith("\\\"")) { 182 app.put('"'); 183 this.input = this.input[2 .. $]; 184 this.column += 2; 185 } else if(this.input.startsWith("\\\\")) { 186 app.put('\\'); 187 this.input = this.input[2 .. $]; 188 this.column += 2; 189 } else if(this.input.startsWith("\\t")) { 190 app.put('\t'); 191 this.input = this.input[2 .. $]; 192 this.column += 2; 193 } else if(this.input.startsWith("\\n")) { 194 app.put('\n'); 195 this.input = this.input[2 .. $]; 196 this.column += 2; 197 } else if(this.input.length > 1 && this.input.front == '\\') { 198 this.input.popFront(); 199 while(this.input.front.isWhite()) { 200 if(this.input.front == ' ') { 201 ++this.column; 202 } else if(this.input.front == '\t') { 203 ++this.column; 204 } else if(this.input.front == '\n') { 205 ++this.line; 206 this.column = 1; 207 } 208 this.input.popFront(); 209 } 210 } else { 211 app.put(this.input.front); 212 ++this.column; 213 this.input.popFront(); 214 } 215 } 216 assert(this.input.front == '"', this.input); 217 this.input.popFront(); 218 ++this.column; 219 220 this.cur = Token(TokenType.value, Value(app.data), app.data, l, c); 221 return; 222 } else if(this.input.front == '-' || isDigit(this.input.front)) { 223 size_t l = this.line; 224 size_t c = this.column; 225 226 size_t idx; 227 if(this.input.front == '-') { 228 ++idx; 229 } 230 231 while(idx < this.input.length && isDigit(this.input[idx])) { 232 ++idx; 233 ++this.column; 234 } 235 236 string tmp = this.input[idx .. $]; 237 238 if(tmp.empty || isWhite(tmp.front) || tmp.front == '.' 239 || tmp.front == 'l' || tmp.front == 'L' 240 || tmp.front == 'f' || tmp.front == 'F') 241 { 242 parseNumber(idx, l, c); 243 return; 244 } else if(tmp.front == 'd' || tmp.front == 'D' 245 || tmp.front == ':') 246 { 247 parseDuration(idx, l, c); 248 return; 249 } else if(tmp.front == '/') { 250 parseDate(idx, l, c); 251 return; 252 } else { 253 assert(false, this.input); 254 } 255 } else if(isAlpha(this.input.front)) { 256 size_t e; 257 while(e < this.input.length && 258 ( isAlphaNum(this.input[e]) || this.input[e] == '_' 259 || this.input[e] == '-' || this.input[e] == '.' 260 || this.input[e] == '$' 261 ) 262 ) 263 { 264 ++e; 265 } 266 string str = this.input[0 .. e]; 267 switch(str) { 268 case "null": 269 this.cur = Token(TokenType.value, Value.init, 270 str, this.line, this.column); 271 break; 272 case "on": 273 goto case; 274 case "true": 275 this.cur = Token(TokenType.value, Value(true), 276 str, this.line, this.column); 277 break; 278 case "off": 279 goto case; 280 case "false": 281 this.cur = Token(TokenType.value, Value(false), 282 str, this.line, this.column); 283 break; 284 default: 285 this.cur = Token(TokenType.ident, Value(str), str, 286 this.line, this.column); 287 break; 288 } 289 this.column += e; 290 this.input = this.input[e .. $]; 291 return; 292 } 293 throw new Exception(format( 294 "Unexpected input: '%s' ascii: %d at Line:%d Column:%d", 295 this.input, this.input[0], this.line, this.column)); 296 } 297 298 void parseNumber(size_t idx, size_t l, size_t c) @safe pure { 299 string prefix = this.input[0 .. idx]; 300 string tmp = this.input[idx .. $]; 301 if(tmp.empty) { 302 this.cur = Token(TokenType.value, Value(to!int(prefix)), prefix, 303 l, c); 304 this.input = tmp; 305 } else if(tmp.startsWith('L') 306 || tmp.startsWith('l')) 307 { 308 this.cur = Token(TokenType.value, Value(to!long(prefix)), prefix, 309 l, c); 310 this.input = tmp; 311 this.input.popFront(); 312 ++this.column; 313 } else if(tmp.startsWith('F') 314 || tmp.startsWith('f')) 315 { 316 this.cur = Token(TokenType.value, Value(to!float(prefix)), prefix, 317 l, c); 318 this.input = tmp; 319 this.input.popFront(); 320 ++this.column; 321 } else if(tmp.startsWith('D') || tmp.startsWith('d')) 322 { 323 this.cur = Token(TokenType.value, Value(to!double(prefix)), prefix, 324 l, c); 325 this.input = tmp; 326 this.input.popFront(); 327 ++this.column; 328 } else if(tmp.startsWith("bd") 329 || tmp.startsWith("BD") 330 || tmp.startsWith("bD") 331 || tmp.startsWith("Bd")) 332 { 333 this.cur = Token(TokenType.value, Value(to!real(prefix)), prefix, 334 l, c); 335 this.input = tmp; 336 this.input.popFront(); 337 this.input.popFront(); 338 this.column += 2; 339 } else if(tmp.startsWith('.')) { 340 tmp.popFront(); 341 ++this.column; 342 while(!tmp.empty && isDigit(tmp.front)) { 343 ++idx; 344 ++this.column; 345 tmp.popFront(); 346 } 347 ++idx; 348 string theNum = this.input[0 .. idx]; 349 this.input = this.input[idx .. $]; 350 if(this.input.empty) { 351 this.cur = Token(TokenType.value, Value(to!double(theNum)), 352 theNum, l, c); 353 } else if(this.input.startsWith('F') 354 || this.input.startsWith('f')) 355 { 356 this.input.popFront(); 357 this.cur = Token(TokenType.value, Value(to!float(theNum)), 358 theNum, l, c); 359 } else if(this.input.startsWith("BD") 360 || this.input.startsWith("bd") 361 || this.input.startsWith("Bd") 362 || this.input.startsWith("bD")) 363 { 364 this.input.popFront(); 365 this.input.popFront(); 366 this.cur = Token(TokenType.value, Value(to!real(theNum)), 367 theNum, l, c); 368 } else { 369 this.cur = Token(TokenType.value, Value(to!double(prefix)), 370 prefix, l, c); 371 this.input = tmp; 372 } 373 } else { 374 this.cur = Token(TokenType.value, Value(to!int(prefix)), prefix, 375 l, c); 376 this.input = tmp; 377 } 378 } 379 380 void parseDuration(size_t idx, size_t l, size_t c) @safe pure { 381 } 382 383 void parseDate(size_t idx, size_t l, size_t c) @safe pure { 384 } 385 386 @property bool empty() const @safe pure { 387 return this.input.empty 388 && this.cur.type == TokenType.undefined; 389 } 390 391 Token front() @property @safe pure { 392 return this.cur; 393 } 394 395 @property Token front() const @safe @nogc pure { 396 return this.cur; 397 } 398 399 void popFront() @safe pure { 400 this.buildToken(); 401 } 402 403 string getRestOfInput() const @safe pure { 404 return this.input; 405 } 406 } 407 408 @safe pure: 409 410 void test(ref Lexer lex, TokenType tt) { 411 assert(!lex.empty); 412 assert(lex.front.type == tt, 413 format("\nexp: %s\ngot: %s", tt, lex.front.type)); 414 lex.popFront(); 415 } 416 417 void test(T)(ref Lexer lex, TokenType tt, ValueType vt, T value) { 418 import std.traits : isFloatingPoint; 419 import std.math : isClose; 420 421 import dud.utils : floatToStringPure; 422 assert(!lex.empty); 423 assert(lex.front.type == tt, 424 format("\nexp: %s\ngot: %s", tt, lex.front.type)); 425 assert(lex.front.value.type == vt, 426 format("\nexp: %s\ngot: %s", vt, lex.front.value.type)); 427 428 T tValue = lex.front.value.get!T(); 429 static if(isFloatingPoint!T) { 430 assert(isClose(value, tValue), 431 format("\nexp: %s\ngot: %s", floatToStringPure(value), 432 floatToStringPure(tValue))); 433 } else { 434 assert(value == tValue, 435 format("\nexp: %s\ngot: %s", value, tValue)); 436 } 437 438 lex.popFront(); 439 } 440 441 unittest { 442 auto l = Lexer("1337"); 443 test(l, TokenType.value, ValueType.int32, 1337); 444 test(l, TokenType.eof); 445 assert(l.empty); 446 } 447 448 unittest { 449 auto l = Lexer("1337l"); 450 test(l, TokenType.value, ValueType.int64, 1337); 451 test(l, TokenType.eof); 452 assert(l.empty); 453 } 454 455 unittest { 456 auto l = Lexer("1337.0"); 457 test(l, TokenType.value, ValueType.float64, 1337.0); 458 test(l, TokenType.eof); 459 assert(l.empty); 460 } 461 462 unittest { 463 auto l = Lexer("1337.0BD"); 464 test(l, TokenType.value, ValueType.float128, 1337.0); 465 test(l, TokenType.eof); 466 assert(l.empty); 467 } 468 469 unittest { 470 auto l = Lexer("1337.0f"); 471 test(l, TokenType.value, ValueType.float32, 1337.0f); 472 test(l, TokenType.eof); 473 assert(l.empty); 474 } 475 476 unittest { 477 auto l = Lexer(`"Hello World"`); 478 test(l, TokenType.value, ValueType.str, "Hello World"); 479 test(l, TokenType.eof); 480 assert(l.empty); 481 } 482 483 unittest { 484 string input; 485 version(Windows) { 486 input = "`Hello\n World`"; 487 } else { 488 input = q{`Hello 489 World`}; 490 } 491 auto l = Lexer(input); 492 test(l, TokenType.value, ValueType.str, "Hello\n World"); 493 test(l, TokenType.eof); 494 assert(l.empty); 495 } 496 497 unittest { 498 auto l = Lexer(`Hello "World"`); 499 test(l, TokenType.ident, ValueType.str, "Hello"); 500 test(l, TokenType.value, ValueType.str, "World"); 501 test(l, TokenType.eof); 502 assert(l.empty); 503 } 504 505 unittest { 506 auto l = Lexer(`Hello "World"1337`); 507 test(l, TokenType.ident, ValueType.str, "Hello"); 508 test(l, TokenType.value, ValueType.str, "World"); 509 test(l, TokenType.value, ValueType.int32, 1337); 510 test(l, TokenType.eof); 511 assert(l.empty); 512 } 513 514 unittest { 515 auto l = Lexer(`H`); 516 test(l, TokenType.ident, ValueType.str, "H"); 517 test(l, TokenType.eof); 518 assert(l.empty); 519 }