#include <soulng/lexer/Lexer.hpp>
#include <soulng/lexer/ParsingException.hpp>
#include <soulng/util/Unicode.hpp>
#include <string>
#include <algorithm>

namespace soulng { namespace lexer {

using namespace soulng::unicode;

Lexer::Lexer(const std::u32string& content_, const std::string& fileName_, int fileIndex_) :
    content(content_), fileName(fileName_), fileIndex(fileIndex_), line(1), keywordMap(nullptr), start(content.c_str()), end(content.c_str() + content.length()), pos(start), current(tokens.end()),
    log(nullptr), countLines(true), separatorChar('\0'), flags(), commentTokenId(-1)
{
}

Lexer::Lexer(const char32_t* start_, const char32_t* end_, const std::string& fileName_, int fileIndex_) :
    content(), fileName(fileName_), fileIndex(fileIndex_), line(1), keywordMap(nullptr), start(start_), end(end_), pos(start), current(tokens.end()),
    log(nullptr), countLines(true), separatorChar('\0'), flags(), commentTokenId(-1)
{
}

Lexer::~Lexer()
{
}

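// Advances to the next token. Tokens are produced lazily: when the cached token
// sequence is exhausted, NextToken() scans the next token from the input.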
void Lexer::operator++()
{
    if (current != tokens.end())
    {
        ++current;
    }
    if (current == tokens.end())
    {
        NextToken();
    }
    else
    {
        line = current->line;
    }
}

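// GetPos/SetPos pack the current source line into the high 32 bits and the index of the
// current token into the low 32 bits of a single int64_t, so a parser can save and
// restore the lexer position as one value.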
int64_t Lexer::GetPos() const
{
    int32_t p = static_cast<int32_t>(current - tokens.begin());
    return (static_cast<int64_t>(line) << 32) | static_cast<int64_t>(p);
}

void Lexer::SetPos(int64_t pos)
{
    current = tokens.begin() + static_cast<int32_t>(pos);
    line = static_cast<int32_t>(pos >> 32);
}

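// Scans the next token by running the DFA from state 0 and growing the current lexeme
// until NextState() reports no transition (-1). A CONTINUE_TOKEN match (skipped input
// such as whitespace or comments) restarts the scan, an INVALID_TOKEN match raises an
// error, and at end of input an END_TOKEN sentinel is appended to the token stream.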
void Lexer::NextToken()
{
    int state = 0;
    while (true)
    {
        char32_t c = separatorChar;
        if (pos != end)
        {
            c = *pos;
        }
        else if (c == '\0')
        {
            break;
        }
        if (state == 0)
        {
            lexeme.begin = pos;
            token.id = INVALID_TOKEN;
            token.line = line;
        }
        if (pos == end)
        {
            lexeme.end = end;
        }
        else
        {
            lexeme.end = pos + 1;
        }
        state = NextState(state, c);
        if (state == -1)
        {
            if (token.id == CONTINUE_TOKEN)
            {
                if (pos == end)
                {
                    break;
                }
                else
                {
                    pos = token.match.end;
                }
                state = 0;
                continue;
            }
            else if (token.id == INVALID_TOKEN)
            {
                if (pos == end)
                {
                    break;
                }
                else
                {
                    throw std::runtime_error("soulng::lexer::Lexer::NextToken(): error: invalid character '" + ToUtf8(std::u32string(1, c)) + "' in file '" + fileName + "' at line " + std::to_string(line));
                }
            }
            else
            {
                tokens.push_back(token);
                current = tokens.end() - 1;
                pos = token.match.end;
                return;
            }
        }
        if (c == '\n' && countLines)
        {
            ++line;
        }
        ++pos;
    }
    token.id = INVALID_TOKEN;
    state = NextState(state, '\0');
    int64_t p = -1;
    if (token.id != INVALID_TOKEN && token.id != CONTINUE_TOKEN)
    {
        tokens.push_back(token);
        current = tokens.end() - 1;
        p = GetPos();
    }
    Token endToken(END_TOKEN);
    endToken.match.begin = end;
    endToken.match.end = end;
    tokens.push_back(endToken);
    if (p == -1)
    {
        current = tokens.end() - 1;
        p = GetPos();
    }
    SetPos(p);
}

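// Base implementation has no transitions and rejects every input; derived lexers
// override this with their generated DFA transition function.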
int Lexer::NextState(int state, char32_t c)
{
    return -1;
}

int Lexer::GetKeywordToken(const Lexeme& lexeme) const
{
    if (keywordMap)
    {
        return keywordMap->GetKeywordToken(lexeme);
    }
    else
    {
        return INVALID_TOKEN;
    }
}

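// Converts a span whose start and end are token indexes into character offsets measured
// from the beginning of the content.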
void Lexer::ConvertExternal(Span& span)
{
    Token startToken = GetToken(span.start);
    span.start = static_cast<int>(startToken.match.begin - start);
    Token endToken = GetToken(span.end);
    span.end = static_cast<int>(endToken.match.end - start);
}

Token Lexer::GetToken(int64_t pos) const
{
    int32_t tokenIndex = static_cast<int32_t>(pos);
    if (tokenIndex >= 0 && tokenIndex < tokens.size())
    {
        return tokens[tokenIndex];
    }
    else
    {
        throw std::runtime_error("invalid token index");
    }
}

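// Replaces the token stream with an externally produced token sequence: the front token
// (or an END_TOKEN sentinel when the sequence is empty) is pushed as a leading entry, the
// full sequence follows, a trailing END_TOKEN sentinel closes the stream, and the current
// position is reset to the beginning.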
void Lexer::SetTokens(const std::vector<Token>& tokens_)
{
    if (!tokens_.empty())
    {
        tokens.push_back(tokens_.front());
    }
    else
    {
        tokens.push_back(Token(END_TOKEN, Lexeme(end, end), 1));
    }
    for (const Token& token : tokens_)
    {
        tokens.push_back(token);
    }
    tokens.push_back(Token(END_TOKEN, Lexeme(end, end), 1));
    current = tokens.begin();
}

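// Reconstructs the matched source text for a span of tokens, padding the gaps between
// consecutive token matches with spaces.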
std::u32string Lexer::GetMatch(const Span& span) const
{
    std::u32string match;
    Token startToken = GetToken(span.start);
    match.append(startToken.match.ToString());
    const char32_t* e = startToken.match.end;
    for (int i = span.start + 1; i <= span.end; ++i)
    {
        Token token = GetToken(i);
        match.append(std::u32string(token.match.begin - e, ' '));
        match.append(token.match.ToString());
        e = token.match.end;
    }
    return match;
}

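// LineStart and LineEnd scan backward and forward from p to the boundaries of the line
// containing p.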
const char32_t* LineStart(const char32_t* start, const char32_t* p)
{
    while (p != start && *p != '\n' && *p != '\r')
    {
        --p;
    }
    if (p != start)
    {
        ++p;
    }
    return p;
}

const char32_t* LineEnd(const char32_t* end, const char32_t* p)
{
    while (p != end && *p != '\n' && *p != '\r')
    {
        ++p;
    }
    return p;
}

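// Renders the source line containing the span together with a caret line ('^') marking
// the span's columns. Returns an empty string when the span's start falls outside the
// content.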
std::u32string GetErrorLines(const char32_t* start, const char32_t* end, const Span& externalSpan)
{
    const char32_t* startPos = start + externalSpan.start;
    if (startPos < start || startPos >= end)
    {
        return std::u32string();
    }
    const char32_t* lineStart = LineStart(start, startPos);
    int cols = static_cast<int>(startPos - lineStart);
    if (cols < 0)
    {
        cols = 0;
    }
    const char32_t* lineEnd = LineEnd(end, startPos);
    if (lineEnd < lineStart)
    {
        lineEnd = lineStart;
    }
    int lineLength = static_cast<int>(lineEnd - lineStart);
    std::u32string lines(lineStart, lineEnd);
    int spanCols = std::max(static_cast<int>(1), std::min(externalSpan.end - externalSpan.start, lineLength - cols));
    lines.append(1, '\n').append(std::u32string(cols, ' ')).append(spanCols, '^');
    return lines;
}

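// Computes the 1-based start and end columns of the span within its source line; both are
// left at 0 when the span's start falls outside the content.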
void GetColumns(const char32_t* start, const char32_t* end, const Span& externalSpan, int32_t& startCol, int32_t& endCol)
{
    startCol = 0;
    endCol = 0;
    const char32_t* startPos = start + externalSpan.start;
    if (startPos < start || startPos >= end)
    {
        return;
    }
    const char32_t* lineStart = LineStart(start, startPos);
    int cols = static_cast<int>(startPos - lineStart);
    if (cols < 0)
    {
        cols = 0;
    }
    startCol = cols + 1;
    const char32_t* lineEnd = LineEnd(end, startPos);
    if (lineEnd < lineStart)
    {
        lineEnd = lineStart;
    }
    int lineLength = static_cast<int>(lineEnd - lineStart);
    int spanCols = std::max(static_cast<int>(1), std::min(externalSpan.end - externalSpan.start, lineLength - cols));
    endCol = startCol + spanCols;
}

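// The ErrorLines overloads render the source line(s) covering a token or a token span,
// followed by a caret line marking the matched text.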
std::u32string Lexer::ErrorLines(const Token& token) const
{
    std::u32string lines;
    const char32_t* lineStart = LineStart(start, token.match.begin);
    const char32_t* lineEnd = LineEnd(end, token.match.end);
    lines.append(std::u32string(lineStart, token.match.begin));
    lines.append(token.match.ToString());
    lines.append(std::u32string(token.match.end, lineEnd));
    lines.append(1, '\n');
    lines.append(token.match.begin - lineStart, ' ');
    lines.append(std::max(static_cast<int64_t>(1), token.match.end - token.match.begin), '^');
    lines.append(lineEnd - token.match.end, ' ');
    lines.append(1, '\n');
    return lines;
}

std::u32string Lexer::ErrorLines(const Span& span) const
{
    std::u32string lines;
    Token startToken = GetToken(span.start);
    Token endToken = startToken;
    const char32_t* lineStart = LineStart(start, startToken.match.begin);
    if (span.end != span.start)
    {
        endToken = GetToken(span.end);
    }
    const char32_t* lineEnd = LineEnd(end, endToken.match.end);
    lines.append(std::u32string(lineStart, startToken.match.begin));
    lines.append(startToken.match.ToString());
    const char32_t* s = startToken.match.begin;
    const char32_t* e = startToken.match.end;
    for (int i = span.start + 1; i <= span.end; ++i)
    {
        Token token = GetToken(i);
        lines.append(std::u32string(token.match.begin - e, ' '));
        lines.append(token.match.ToString());
        e = token.match.end;
    }
    lines.append(std::u32string(e, lineEnd));
    lines.append(1, '\n');
    lines.append(s - lineStart, ' ');
    lines.append(std::max(static_cast<int64_t>(1), e - s), '^');
    lines.append(lineEnd - e, ' ');
    lines.append(1, '\n');
    return lines;
}

void Lexer::GetColumns(const Span& span, int32_t& startCol, int32_t& endCol) const
{
    Token startToken = GetToken(span.start);
    Token endToken = startToken;
    const char32_t* lineStart = LineStart(start, startToken.match.begin);
    if (span.end != span.start)
    {
        endToken = GetToken(span.end);
    }
    int cols = static_cast<int>(startToken.match.begin - lineStart);
    if (cols < 0)
    {
        cols = 0;
    }
    startCol = cols + 1;
    const char32_t* lineEnd = LineEnd(end, endToken.match.end);
    if (lineEnd < lineStart)
    {
        lineEnd = lineStart;
    }
    int lineLength = static_cast<int>(lineEnd - lineStart);
    int spanCols = std::max(static_cast<int>(1), std::min(static_cast<int>(endToken.match.end - startToken.match.begin), lineLength - cols));
    endCol = startCol + spanCols;
}

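// Error reporting: ThrowExpectationFailure raises a ParsingException immediately, while
// AddError records the exception for later retrieval unless the lexer is currently
// synchronizing after a previous error.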
void Lexer::ThrowExpectationFailure(const Span& span, const std::u32string& name)
{
    Token token = GetToken(span.start);
    throw ParsingException("parsing error in '" + fileName + ":" + std::to_string(token.line) + "': " + ToUtf8(name) + " expected:\n" + ToUtf8(ErrorLines(span)), fileName, span);
}

void Lexer::AddError(const Span& span, const std::u32string& name)
{
    if (GetFlag(LexerFlags::synchronize) && GetFlag(LexerFlags::synchronized))
    {
        SetFlag(LexerFlags::synchronizedAtLeastOnce);
    }
    else
    {
        Token token = GetToken(span.start);
        ParsingException* error(new ParsingException("parsing error in '" + fileName + ":" + std::to_string(token.line) + "': " + ToUtf8(name) + " expected:\n" + ToUtf8(ErrorLines(span)), fileName, span));
        errors.push_back(std::unique_ptr<std::exception>(error));
    }
}

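// Returns the text from the current token to the end of the current source line,
// truncated to maxLineLength characters when maxLineLength is nonzero.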
std::u32string Lexer::RestOfLine(int maxLineLength)
{
    std::u32string restOfLine(current->match.ToString() + std::u32string(current->match.end, pos) + std::u32string(pos, LineEnd(end, pos)));
    if (maxLineLength != 0)
    {
        restOfLine = restOfLine.substr(0, maxLineLength);
    }
    return restOfLine;
}

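// Tokenizes a single line of text starting from the given DFA state, for line-oriented
// callers such as editors. The resulting TokenLine records the end state so that
// multi-line constructs (for example block comments) can continue on the following line.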
TokenLine Lexer::TokenizeLine(const std::u32string& line, int lineNumber, int startState)
{
    pos = line.c_str();
    end = line.c_str() + line.length();
    TokenLine tokenLine;
    tokenLine.startState = startState;
    lexeme.begin = pos;
    lexeme.end = end;
    token.match = lexeme;
    token.id = INVALID_TOKEN;
    token.line = lineNumber;
    int state = startState;
    int prevState = 0;
    int prevPrevState = 0;
    bool cont = false;
    while (pos != end)
    {
        char32_t c = *pos;
        if (state == 0)
        {
            lexeme.begin = pos;
            token.id = INVALID_TOKEN;
            token.line = lineNumber;
        }
        lexeme.end = pos + 1;
        prevPrevState = prevState;
        prevState = state;
        state = NextState(state, c);
        if (state == -1)
        {
            if (prevState == 0)
            {
                break;
            }
            state = 0;
            pos = token.match.end;
            tokenLine.tokens.push_back(token);
            if (pos + 1 < end && *pos == '\"' && *(pos + 1) == '\\' && prevPrevState == 13 && prevState == 71)
            {
                Token tok;
                tok.match.begin = pos;
                tok.match.end = pos + 2;
                tokenLine.tokens.push_back(tok);
                pos += 2;
            }
            lexeme.begin = lexeme.end;
        }
        else
        {
            ++pos;
        }
    }
    if (state != 0 && state != -1)
    {
        state = NextState(state, '\r');
    }
    if (state != 0 && state != -1)
    {
        state = NextState(state, '\n');
    }
    if (state != 0 && state != -1)
    {
        if (blockCommentStates.find(state) != blockCommentStates.cend())
        {
            token.id = commentTokenId;
            token.match.end = end;
            tokenLine.tokens.push_back(token);
            tokenLine.endState = state;
            return tokenLine;
        }
    }
    if (lexeme.begin != lexeme.end)
    {
        token.match = lexeme;
        tokenLine.tokens.push_back(token);
    }
    if (state == -1)
    {
        state = 0;
    }
    tokenLine.endState = state;
    return tokenLine;
}

void Lexer::SetSyncTokens(const std::vector<int>& syncTokens_)
{
    syncTokens = syncTokens_;
}

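// Error recovery: when the synchronize flag is set and the lexer is not already
// synchronized, skips tokens until one of the registered sync tokens is reached.
// Returns true if a sync token was found before the end of input.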
bool Lexer::Synchronize()
{
    if (GetFlag(LexerFlags::synchronize))
    {
        if (GetFlag(LexerFlags::synchronized)) return false;
        SetFlag(LexerFlags::synchronized);
        while (pos != end)
        {
            int curToken = token.id;
            for (int syncToken : syncTokens)
            {
                if (curToken == syncToken)
                {
                    return true;
                }
            }
            ++*this;
        }
    }
    return false;
}

void Lexer::SetBlockCommentStates(const std::set<int>& blockCommentStates_)
{
    blockCommentStates = blockCommentStates_;
}

const std::set<int>& Lexer::BlockCommentStates() const
{
    return blockCommentStates;
}

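// Helpers used by generated parsers to trace rule entry, success and failure to the
// lexer's parsing log.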
void WriteBeginRuleToLog(Lexer& lexer, const std::u32string& ruleName)
{
    lexer.Log()->WriteBeginRule(ruleName);
    lexer.Log()->IncIndent();
    lexer.Log()->WriteTry(lexer.RestOfLine(lexer.Log()->MaxLineLength()));
    lexer.Log()->IncIndent();
}

void WriteSuccessToLog(Lexer& lexer, const Span& matchSpan, const std::u32string& ruleName)
{
    lexer.Log()->DecIndent();
    lexer.Log()->WriteSuccess(lexer.GetMatch(matchSpan));
    lexer.Log()->DecIndent();
    lexer.Log()->WriteEndRule(ruleName);
}

void WriteFailureToLog(Lexer& lexer, const std::u32string& ruleName)
{
    lexer.Log()->DecIndent();
    lexer.Log()->WriteFail();
    lexer.Log()->DecIndent();
    lexer.Log()->WriteEndRule(ruleName);
}

} }