1 module dud.sdlang.lexer;
2 
3 import std.ascii;
4 import std.array : appender, empty, front, popFront, popBack;
5 import std.algorithm.searching : startsWith;
6 import std.base64;
7 import std.exception : enforce;
8 import std.conv : to;
9 import std.experimental.logger;
10 import std.format : format;
11 import std.typecons : Flag;
12 import std.stdio;
13 
14 import dud.sdlang.tokenmodule;
15 import dud.sdlang.value;
16 
/**
 * Hand-written lexer for SDLang text.
 *
 * The lexer exposes the input-range primitives `empty`, `front` and
 * `popFront`. The constructor lexes the first token right away. After the
 * last real token an `eof` token is produced; the range only reports `empty`
 * once the token following `eof` (of type `undefined`) has been requested.
 *
 * Malformed input (unterminated literals, unexpected characters, not yet
 * supported literal kinds) is reported by throwing an `Exception`.
 */
struct Lexer {
	/// The part of the source text that has not been consumed yet.
	string input;

	/// Line of the next unread character, starting at 1.
	size_t line;
	/// Column of the next unread character, starting at 1.
	size_t column;

	/// The token most recently produced by `buildToken`.
	Token cur;

	/// Creates a lexer over `input` and lexes the first token.
	this(string input) @safe pure {
		this.input = input;
		this.line = 1;
		this.column = 1;
		this.buildToken();
	}

	/**
	 * Skips one comment if the input starts with one.
	 *
	 * Line comments (`#`, `--`, `//`) are consumed up to, but not including,
	 * the line break so that the break is still lexed as an `eol` token.
	 * Block comments (`/* ... *` `/`) are consumed including their closing
	 * delimiter.
	 *
	 * Returns: `true` if a comment was consumed, `false` otherwise.
	 * Throws: `Exception` if a block comment is not closed.
	 */
	private bool eatComment() @safe pure {
		if(this.input.startsWith("#") || this.input.startsWith("--")
				|| this.input.startsWith("//"))
		{
			while(!this.input.empty
					&& (!this.input.startsWith('\n')
						&& !this.input.startsWith('\r')))
			{
				++this.column;
				this.input.popFront();
			}
			return true;
		} else if(this.input.startsWith("/*")) {
			// Skip the opener first, otherwise "/*/" would be mistaken for a
			// complete comment because its tail looks like the closer.
			this.input = this.input[2 .. $];
			this.column += 2;

			while(!this.input.empty && !this.input.startsWith("*/")) {
				if(this.input.startsWith("\r\n")) {
					// CR of a CRLF pair: the LF that follows advances the line
					++this.column;
				} else if(this.input.startsWith('\n')
						|| this.input.startsWith('\r'))
				{
					++this.line;
					this.column = 1;
				} else {
					++this.column;
				}
				this.input.popFront();
			}
			enforce(!this.input.empty,
				"No more input while parsing a C comment");
			this.input = this.input[2 .. $];
			this.column += 2;
			return true;
		}
		return false;
	}

	/// Skips spaces, tabs and comments; line breaks are left in place.
	private void eatWhitespace() @safe pure {
		while(!this.input.empty) {
			if(this.eatComment()) {
				// eatComment already consumed its characters
				continue;
			} else if(this.input.front == ' ') {
				++this.column;
			} else if(this.input.front == '\t') {
				++this.column;
			} else {
				break;
			}
			this.input.popFront();
		}
	}

	/// Stores a token of type `tt` at the current position and consumes
	/// exactly one character.
	private void singleCharToken(TokenType tt) @safe pure {
		this.cur = Token(tt, this.line, this.column);
		++this.column;
		this.input.popFront();
	}

	/**
	 * Lexes the next token from `input` and stores it in `cur`.
	 *
	 * Throws: `Exception` on unexpected or unterminated input; conversion
	 * and Base64 decoding errors of the used standard library functions
	 * propagate unchanged.
	 */
	private void buildToken() @safe pure {
		this.eatWhitespace();

		if(this.input.empty) {
			// first call at the end yields eof, the next one undefined,
			// which is what `empty` checks for
			this.cur = this.cur.type == TokenType.eof
				? Token(TokenType.undefined, this.line, this.column)
				: Token(TokenType.eof, this.line, this.column);
			return;
		}

		if(this.input.front == '{') {
			this.singleCharToken(TokenType.lcurly);
			return;
		} else if(this.input.front == '}') {
			this.singleCharToken(TokenType.rcurly);
			return;
		} else if(this.input.front == '\r') {
			this.singleCharToken(TokenType.eol);
			// The CR of a CRLF pair must not count as a line of its own;
			// the LF is lexed as a further eol token and advances the line.
			if(!this.input.startsWith('\n')) {
				++this.line;
				this.column = 1;
			}
			return;
		} else if(this.input.front == '\n') {
			this.singleCharToken(TokenType.eol);
			++this.line;
			this.column = 1;
			return;
		} else if(this.input.front == '=') {
			this.singleCharToken(TokenType.assign);
			return;
		} else if(this.input.front == ':') {
			this.singleCharToken(TokenType.colon);
			return;
		} else if(this.input.front == '\\') {
			// Line continuation: drop everything up to and including the
			// next line feed and continue lexing on the following line.
			++this.column;
			this.input.popFront();
			while(!this.input.empty && this.input.front != '\n') {
				this.input.popFront();
				++this.column;
			}
			// A backslash on the very last line simply ends the input.
			if(!this.input.empty) {
				this.column = 1;
				++this.line;
				this.input.popFront();
			}
			this.buildToken();
			return;
		} else if(this.input.front == ';') {
			this.singleCharToken(TokenType.semicolon);
			return;
		} else if(this.input.front == '[') {
			// Base64 encoded binary data: [ ... ]
			size_t l = this.line;
			size_t c = this.column;
			++this.column;
			this.input.popFront();

			size_t rbrack;
			while(rbrack < this.input.length && this.input[rbrack] != ']') {
				++rbrack;
				++this.column;
			}
			enforce(rbrack < this.input.length, format(
				"Unterminated binary literal starting at Line:%d Column:%d",
				l, c));

			// account for the closing bracket
			++this.column;

			string theData = this.input[0 .. rbrack];
			ubyte[] data = Base64.decode(theData);
			this.input = this.input[rbrack + 1 .. $];
			this.cur = Token(TokenType.value, Value(data), theData, l, c);
			return;
		} else if(this.input.startsWith("`")) {
			// Raw string: no escapes, may span several lines
			size_t l = this.line;
			size_t c = this.column;
			++this.column;
			this.input.popFront();

			auto app = appender!string();

			while(!this.input.empty && this.input.front != '`') {
				app.put(this.input.front);
				if(this.input.front == '\n') {
					++this.line;
					this.column = 1;
				} else {
					++this.column;
				}
				this.input.popFront();
			}
			enforce(!this.input.empty, format(
				"Unterminated raw string starting at Line:%d Column:%d",
				l, c));

			// closing backtick
			this.input.popFront();
			++this.column;
			this.cur = Token(TokenType.value, Value(app.data), app.data, l, c);
			return;
		} else if(this.input.front == '"') {
			// Double quoted string with escape sequences
			size_t l = this.line;
			size_t c = this.column;
			++this.column;
			this.input.popFront();

			auto app = appender!string();

			while(!this.input.empty && !this.input.startsWith('"')) {
				if(this.input.startsWith("\\\"")) {
					app.put('"');
					this.input = this.input[2 .. $];
					this.column += 2;
				} else if(this.input.startsWith("\\\\")) {
					app.put('\\');
					this.input = this.input[2 .. $];
					this.column += 2;
				} else if(this.input.startsWith("\\t")) {
					app.put('\t');
					this.input = this.input[2 .. $];
					this.column += 2;
				} else if(this.input.startsWith("\\n")) {
					app.put('\n');
					this.input = this.input[2 .. $];
					this.column += 2;
				} else if(this.input.length > 1 && this.input.front == '\\') {
					// Any other backslash is dropped together with the
					// whitespace (including line breaks) that follows it.
					this.input.popFront();
					++this.column;
					while(!this.input.empty && this.input.front.isWhite()) {
						if(this.input.front == ' ') {
							++this.column;
						} else if(this.input.front == '\t') {
							++this.column;
						} else if(this.input.front == '\n') {
							++this.line;
							this.column = 1;
						}
						this.input.popFront();
					}
				} else {
					app.put(this.input.front);
					++this.column;
					this.input.popFront();
				}
			}
			enforce(!this.input.empty, format(
				"Unterminated string starting at Line:%d Column:%d", l, c));

			// closing quote
			this.input.popFront();
			++this.column;

			this.cur = Token(TokenType.value, Value(app.data), app.data, l, c);
			return;
		} else if(this.input.front == '-' || isDigit(this.input.front)) {
			// Number, duration or date; all start with an optional sign
			// followed by digits. "--" never gets here, it is a comment.
			size_t l = this.line;
			size_t c = this.column;

			size_t idx;
			if(this.input.front == '-') {
				++idx;
				++this.column;
			}

			while(idx < this.input.length && isDigit(this.input[idx])) {
				++idx;
				++this.column;
			}

			string tmp = this.input[idx .. $];

			if(tmp.startsWith('d') || tmp.startsWith('D')
					|| tmp.startsWith(':'))
			{
				parseDuration(idx, l, c);
			} else if(tmp.startsWith('/') && !tmp.startsWith("//")
					&& !tmp.startsWith("/*"))
			{
				// a slash that does not open a comment separates date parts
				parseDate(idx, l, c);
			} else if(!tmp.empty && isAlpha(tmp.front)
					&& tmp.front != 'l' && tmp.front != 'L'
					&& tmp.front != 'f' && tmp.front != 'F'
					&& tmp.front != 'b' && tmp.front != 'B')
			{
				throw new Exception(format(
					"Unexpected character '%s' after number at Line:%d Column:%d",
					tmp.front, this.line, this.column));
			} else {
				// end of input, whitespace, '.', a type suffix or
				// punctuation such as ';' or '}' that ends the number
				parseNumber(idx, l, c);
			}
			return;
		} else if(isAlpha(this.input.front)) {
			// Identifier or keyword literal
			size_t e;
			while(e < this.input.length &&
					( isAlphaNum(this.input[e]) || this.input[e] == '_'
					|| this.input[e] == '-' || this.input[e] == '.'
					|| this.input[e] == '$'
					)
				)
			{
				++e;
			}
			string str = this.input[0 .. e];
			switch(str) {
				case "null":
					this.cur = Token(TokenType.value, Value.init,
							str, this.line, this.column);
					break;
				case "on":
					goto case;
				case "true":
					this.cur = Token(TokenType.value, Value(true),
							str, this.line, this.column);
					break;
				case "off":
					goto case;
				case "false":
					this.cur = Token(TokenType.value, Value(false),
							str, this.line, this.column);
					break;
				default:
					this.cur = Token(TokenType.ident, Value(str), str,
							this.line, this.column);
					break;
			}
			this.column += e;
			this.input = this.input[e .. $];
			return;
		}
		throw new Exception(format(
			"Unexpected input: '%s' ascii: %d at Line:%d Column:%d",
			this.input, this.input[0], this.line, this.column));
	}

	/**
	 * Turns the numeric literal at the start of `input` into a value token.
	 *
	 * Without a suffix an integer literal becomes an `int` and a literal
	 * with a fractional part a `double`. The suffixes `l`/`L`, `f`/`F`,
	 * `d`/`D` and `bd` (any case) select `long`, `float`, `double` and
	 * `real`.
	 *
	 * Params:
	 *   idx = number of characters (optional sign plus digits) of `input`
	 *         that form the integer part; `column` is expected to have been
	 *         advanced past them already
	 *   l = line stored in the created token
	 *   c = column stored in the created token
	 *
	 * Throws: whatever `std.conv.to` throws if the text is not convertible,
	 * for example on overflow.
	 */
	void parseNumber(size_t idx, size_t l, size_t c) @safe pure {
		string prefix = this.input[0 .. idx];
		string tmp = this.input[idx .. $];
		if(tmp.empty) {
			this.cur = Token(TokenType.value, Value(to!int(prefix)), prefix,
				l, c);
			this.input = tmp;
		} else if(tmp.startsWith('L')
				|| tmp.startsWith('l'))
		{
			this.cur = Token(TokenType.value, Value(to!long(prefix)), prefix,
				l, c);
			this.input = tmp;
			this.input.popFront();
			++this.column;
		} else if(tmp.startsWith('F')
				|| tmp.startsWith('f'))
		{
			this.cur = Token(TokenType.value, Value(to!float(prefix)), prefix,
				l, c);
			this.input = tmp;
			this.input.popFront();
			++this.column;
		} else if(tmp.startsWith('D') || tmp.startsWith('d'))
		{
			this.cur = Token(TokenType.value, Value(to!double(prefix)), prefix,
				l, c);
			this.input = tmp;
			this.input.popFront();
			++this.column;
		} else if(tmp.startsWith("bd")
				|| tmp.startsWith("BD")
				|| tmp.startsWith("bD")
				|| tmp.startsWith("Bd"))
		{
			this.cur = Token(TokenType.value, Value(to!real(prefix)), prefix,
				l, c);
			this.input = tmp[2 .. $];
			this.column += 2;
		} else if(tmp.startsWith('.')) {
			// fractional part: consume the dot and all digits after it
			tmp.popFront();
			++this.column;
			while(!tmp.empty && isDigit(tmp.front)) {
				++idx;
				++this.column;
				tmp.popFront();
			}
			// the dot itself
			++idx;
			string theNum = this.input[0 .. idx];
			this.input = this.input[idx .. $];

			if(this.input.startsWith('F')
					|| this.input.startsWith('f'))
			{
				this.input.popFront();
				++this.column;
				this.cur = Token(TokenType.value, Value(to!float(theNum)),
					theNum, l, c);
			} else if(this.input.startsWith('D')
					|| this.input.startsWith('d'))
			{
				this.input.popFront();
				++this.column;
				this.cur = Token(TokenType.value, Value(to!double(theNum)),
					theNum, l, c);
			} else if(this.input.startsWith("BD")
					|| this.input.startsWith("bd")
					|| this.input.startsWith("Bd")
					|| this.input.startsWith("bD"))
			{
				this.input = this.input[2 .. $];
				this.column += 2;
				this.cur = Token(TokenType.value, Value(to!real(theNum)),
					theNum, l, c);
			} else {
				// No suffix (end of input or any other character): the
				// whole literal, including its fraction, is a double.
				this.cur = Token(TokenType.value, Value(to!double(theNum)),
					theNum, l, c);
			}
		} else {
			this.cur = Token(TokenType.value, Value(to!int(prefix)), prefix,
				l, c);
			this.input = tmp;
		}
	}

	/**
	 * Placeholder for duration literals, which are not implemented.
	 *
	 * Throws: `Exception` always. Returning without consuming input would
	 * make every following `popFront` lex the same text again.
	 */
	void parseDuration(size_t idx, size_t l, size_t c) @safe pure {
		throw new Exception(format(
			"Duration literals are not supported, at Line:%d Column:%d",
			l, c));
	}

	/**
	 * Placeholder for date literals, which are not implemented.
	 *
	 * Throws: `Exception` always. Returning without consuming input would
	 * make every following `popFront` lex the same text again.
	 */
	void parseDate(size_t idx, size_t l, size_t c) @safe pure {
		throw new Exception(format(
			"Date literals are not supported, at Line:%d Column:%d",
			l, c));
	}

	/// `true` once the input is used up and the token after `eof` has been
	/// requested.
	@property bool empty() const @safe pure {
		return this.input.empty
			&& this.cur.type == TokenType.undefined;
	}

	/// The current token.
	Token front() @property @safe pure {
		return this.cur;
	}

	/// ditto
	@property Token front() const @safe @nogc pure {
		return this.cur;
	}

	/// Advances to the next token.
	void popFront() @safe pure {
		this.buildToken();
	}

	/// Returns the text that has not been lexed yet.
	string getRestOfInput() const @safe pure {
		return this.input;
	}
}
407 
408 @safe pure:
409 
/// Asserts that the current token of `lex` has type `tt`, then advances
/// the lexer by one token.
void test(ref Lexer lex, TokenType tt) {
	assert(!lex.empty);

	const actualType = lex.front.type;
	assert(actualType == tt,
		format("\nexp: %s\ngot: %s", tt, actualType));

	lex.popFront();
}
416 
/// Asserts that the current token of `lex` has token type `tt`, value type
/// `vt` and carries `value`, then advances the lexer by one token.
/// Floating point values are compared with `isClose`, everything else
/// with `==`.
void test(T)(ref Lexer lex, TokenType tt, ValueType vt, T value) {
	import std.math : isClose;
	import std.traits : isFloatingPoint;

	import dud.utils : floatToStringPure;

	assert(!lex.empty);

	auto token = lex.front;
	assert(token.type == tt,
		format("\nexp: %s\ngot: %s", tt, token.type));
	assert(token.value.type == vt,
		format("\nexp: %s\ngot: %s", vt, token.value.type));

	T actual = token.value.get!T();
	static if(!isFloatingPoint!T) {
		assert(value == actual,
			format("\nexp: %s\ngot: %s", value, actual));
	} else {
		assert(isClose(value, actual),
			format("\nexp: %s\ngot: %s", floatToStringPure(value),
				floatToStringPure(actual)));
	}

	lex.popFront();
}
440 
// plain integer literal is lexed as int32
unittest {
	auto lexer = Lexer("1337");
	lexer.test(TokenType.value, ValueType.int32, 1337);
	lexer.test(TokenType.eof);
	assert(lexer.empty);
}
447 
// 'l' suffix is lexed as int64
unittest {
	auto lexer = Lexer("1337l");
	lexer.test(TokenType.value, ValueType.int64, 1337);
	lexer.test(TokenType.eof);
	assert(lexer.empty);
}
454 
// literal with a fractional part and no suffix is lexed as float64
unittest {
	auto lexer = Lexer("1337.0");
	lexer.test(TokenType.value, ValueType.float64, 1337.0);
	lexer.test(TokenType.eof);
	assert(lexer.empty);
}
461 
// 'BD' suffix is lexed as float128
unittest {
	auto lexer = Lexer("1337.0BD");
	lexer.test(TokenType.value, ValueType.float128, 1337.0);
	lexer.test(TokenType.eof);
	assert(lexer.empty);
}
468 
// 'f' suffix is lexed as float32
unittest {
	auto lexer = Lexer("1337.0f");
	lexer.test(TokenType.value, ValueType.float32, 1337.0f);
	lexer.test(TokenType.eof);
	assert(lexer.empty);
}
475 
// double quoted string is lexed as a str value without the quotes
unittest {
	auto lexer = Lexer(`"Hello World"`);
	lexer.test(TokenType.value, ValueType.str, "Hello World");
	lexer.test(TokenType.eof);
	assert(lexer.empty);
}
482 
// backtick string keeps an embedded line break verbatim
unittest {
	// NOTE(review): the Windows variant spells the line break as an escape,
	// presumably so the expected value does not depend on the line endings
	// of this source file - confirm.
	string input;
	version(Windows) {
		input = "`Hello\n World`";
	} else {
		input = q{`Hello
 World`};
	}
	auto lexer = Lexer(input);
	lexer.test(TokenType.value, ValueType.str, "Hello\n World");
	lexer.test(TokenType.eof);
	assert(lexer.empty);
}
496 
// identifier followed by a quoted string
unittest {
	auto lexer = Lexer(`Hello "World"`);
	lexer.test(TokenType.ident, ValueType.str, "Hello");
	lexer.test(TokenType.value, ValueType.str, "World");
	lexer.test(TokenType.eof);
	assert(lexer.empty);
}
504 
// a number directly after a closing quote is a separate token
unittest {
	auto lexer = Lexer(`Hello "World"1337`);
	lexer.test(TokenType.ident, ValueType.str, "Hello");
	lexer.test(TokenType.value, ValueType.str, "World");
	lexer.test(TokenType.value, ValueType.int32, 1337);
	lexer.test(TokenType.eof);
	assert(lexer.empty);
}
513 
// single character identifier
unittest {
	auto lexer = Lexer(`H`);
	lexer.test(TokenType.ident, ValueType.str, "H");
	lexer.test(TokenType.eof);
	assert(lexer.empty);
}