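// System.Lex lexer infrastructure: a DFA-driven Lexer base class together with
// line/position utilities, parsing-log helpers and a rule-context guard.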
using System;
using System.Collections;
using System.IO;
using System.Security;

namespace System.Lex
{
    [nodiscard]
    public Result<int*> GetClassMap(const string& classMapName)
    {
        int errorId = 0;
        int* classMap = RtmGetClassMap(classMapName.Chars(), errorId);
        if (errorId > 0)
        {
            return Result<int*>(ErrorId(errorId));
        }
        return Result<int*>(classMap);
    }
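
    // Minimal usage sketch for GetClassMap above ("MyLexerClassMap" is a
    // placeholder; a generated lexer passes its own class map name):
    //
    //     auto classMapResult = GetClassMap("MyLexerClassMap");
    //     if (classMapResult.Error()) { /* propagate classMapResult.GetErrorId() */ }
    //     int* classMap = classMapResult.Value();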

    public class Lexer : IOBase
    {
        public Lexer(const uchar* start_, const uchar* end_, const string& fileName_, const string& classMapName_) :
            content(), fileName(fileName_), line(1), keywordMap(null), start(start_), end(end_), pos(start),
            current(tokens.End()), log(null), countLines(true), separatorChar('\0'),
            commentTokenId(-1), farthestPos(GetPos()), classMapName(classMapName_), classMap(null), fileIndex(-1)
        {
            ComputeLineStarts();
        }
        suppress Lexer(const Lexer&);
        suppress void operator=(const Lexer&);
        public inline const string& ClassMapName()
        {
            return classMapName;
        }
        public void SetBlockCommentStates(const Set<int>& blockCommentStates_)
        {
            blockCommentStates = blockCommentStates_;
        }
        public const Set<int>& BlockCommentStates() const
        {
            return blockCommentStates;
        }
        public void SetCommentTokenId(int commentTokenId_)
        {
            commentTokenId = commentTokenId_;
        }
        protected virtual int GetCommentTokenId() const
        {
            return -1;
        }
        public inline long operator*() const
        {
            return current->id;
        }
        public inline void SetKeywordMap(KeywordMap* keywordMap_)
        {
            keywordMap = keywordMap_;
        }
        public inline KeywordMap* GetKeywordMap()
        {
            return keywordMap;
        }
        public void Retract()
        {
            token.match.end = pos;
        }
        public const string& FileName() const
        {
            return fileName;
        }
        public Span GetSpan(long pos) const
        {
            Token token = GetToken(pos);
            return Span(cast<int>(token.match.begin - start), cast<int>(token.match.Length()));
        }
        public inline int FileIndex() const
        {
            return fileIndex;
        }
        public void SetFileIndex(int fileIndex_)
        {
            fileIndex = fileIndex_;
        }
        public void SetLine(int line_)
        {
            line = line_;
        }
        public void SetCountLines(bool countLines_)
        {
            countLines = countLines_;
        }
        public Token token;
        public inline const uchar* Start() const
        {
            return start;
        }
        public inline const uchar* End() const
        {
            return end;
        }
        public inline const uchar* Pos() const
        {
            return pos;
        }
        public void SetLog(ParsingLog* log_)
        {
            log = log_;
        }
        public ParsingLog* Log() const
        {
            return log;
        }
        public void SetSeparatorChar(uchar separatorChar_)
        {
            separatorChar = separatorChar_;
        }
        [nodiscard]
        public Result<bool> operator++()
        {
            if (Error())
            {
                return Result<bool>(ErrorId(GetErrorId()));
            }
            if (current != tokens.End())
            {
                ++current;
            }
            if (current == tokens.End())
            {
                auto nextTokenResult = NextToken();
                if (nextTokenResult.Error())
                {
                    return nextTokenResult;
                }
            }
            else
            {
                line = current->line;
            }
            long p = GetPos();
            if (p > farthestPos)
            {
                farthestPos = p;
                farthestRuleContext = ruleContext;
            }
            return Result<bool>(true);
        }
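        // GetPos() packs the current line number into the high 32 bits and the
        // index of 'current' in 'tokens' into the low 32 bits of one long, so a
        // parser can save and restore its position as a single value; SetPos()
        // reverses the packing.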
        public inline long GetPos() const
        {
            int p = cast<int>(current - tokens.Begin());
            return (cast<long>(line) << 32) | cast<long>(p);
        }
        public inline void SetPos(long pos)
        {
            current = tokens.Begin() + cast<int>(pos);
            line = cast<int>(pos >> 32);
        }
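        // Runs the state machine (NextState) over the input until it rejects;
        // the token recorded in 'token' (presumably by the generated NextState
        // override on accepting states) is then appended to 'tokens'.
        // CONTINUE_TOKEN matches restart the scan, INVALID_TOKEN at rejection
        // is reported as an error, and end of input appends END_TOKEN.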
        [nodiscard]
        public Result<bool> NextToken()
        {
            if (Error())
            {
                return Result<bool>(ErrorId(GetErrorId()));
            }
            int state = 0;
            while (true)
            {
                uchar c = separatorChar;
                if (pos != end)
                {
                    c = *pos;
                }
                else if (c == '\0')
                {
                    break;
                }
                if (state == 0)
                {
                    lexeme.begin = pos;
                    token.id = INVALID_TOKEN;
                    token.line = line;
                }
                if (pos == end)
                {
                    lexeme.end = end;
                }
                else
                {
                    lexeme.end = pos + 1;
                }
                int prevState = state;
                state = NextState(state, c);
                if (state == -1)
                {
                    if (token.id == CONTINUE_TOKEN)
                    {
                        if (pos == end)
                        {
                            break;
                        }
                        else
                        {
                            pos = token.match.end;
                        }
                        state = 0;
                        continue;
                    }
                    else if (token.id == INVALID_TOKEN)
                    {
                        if (pos == end)
                        {
                            break;
                        }
                        else
                        {
                            auto utf8 = ToUtf8(ustring(c, 1));
                            if (utf8.Error())
                            {
                                SetErrorId(utf8.GetErrorId());
                                return Result<bool>(ErrorId(utf8.GetErrorId()));
                            }
                            string errorMessage = "System.Lex.Lexer.NextToken(): error: invalid character \'" + utf8.Value() + "\' in file \'" + fileName + "\' at line " + ToString(line) +
                                ", state=" + ToString(prevState) + ", class=" + ToString(GetClass(c));
                            int errorId = AllocateError(errorMessage);
                            SetErrorId(errorId);
                            return Result<bool>(ErrorId(errorId));
                        }
                    }
                    else
                    {
                        tokens.Add(token);
                        current = tokens.End() - 1;
                        pos = token.match.end;
                        return Result<bool>(true);
                    }
                }
                if (c == '\n' && countLines)
                {
                    ++line;
                }
                ++pos;
            }
            token.id = INVALID_TOKEN;
            state = NextState(state, '\0');
            long p = -1;
            if (token.id != INVALID_TOKEN && token.id != CONTINUE_TOKEN)
            {
                tokens.Add(token);
                current = tokens.End() - 1;
                p = GetPos();
            }
            Token endToken(END_TOKEN);
            endToken.match.begin = end;
            endToken.match.end = end;
            tokens.Add(endToken);
            if (p == -1)
            {
                current = tokens.End() - 1;
                p = GetPos();
            }
            SetPos(p);
            return Result<bool>(true);
        }
        public long GetKeywordToken(const Lexeme& lexeme) const
        {
            if (keywordMap != null)
            {
                return keywordMap->GetKeywordToken(lexeme);
            }
            else
            {
                return INVALID_TOKEN;
            }
        }
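        // The token accessors below take the packed position produced by
        // GetPos(); only its low 32 bits (the token index) select the token.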
        public Token GetToken(long pos)
        {
            int tokenIndex = cast<int>(pos);
            #assert(tokenIndex >= 0 && tokenIndex < tokens.Count());
            return tokens[tokenIndex];
        }
        public ustring GetMatch(long pos)
        {
            Token token = GetToken(pos);
            return token.match.ToString();
        }
        public char GetChar(long pos)
        {
            Token token = GetToken(pos);
            return cast<char>(*token.match.begin);
        }
        public wchar GetWChar(long pos)
        {
            Token token = GetToken(pos);
            return cast<wchar>(*token.match.begin);
        }
        public uchar GetUChar(long pos)
        {
            Token token = GetToken(pos);
            return *token.match.begin;
        }
        [nodiscard]
        public Result<int> GetInt(long pos)
        {
            Token token = GetToken(pos);
            auto utf8 = ToUtf8(token.match.ToString());
            if (utf8.Error())
            {
                SetErrorId(utf8.GetErrorId());
                return Result<int>(ErrorId(utf8.GetErrorId()));
            }
            return ParseInt(utf8.Value());
        }
        [nodiscard]
        public Result<double> GetDouble(long pos)
        {
            Token token = GetToken(pos);
            auto utf8 = ToUtf8(token.match.ToString());
            if (utf8.Error())
            {
                SetErrorId(utf8.GetErrorId());
                return Result<double>(ErrorId(utf8.GetErrorId()));
            }
            return ParseDouble(utf8.Value());
        }
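        // Note: the first input token (or END_TOKEN for an empty list) is added
        // twice, apparently on purpose: 'current' starts on the duplicate, so
        // the first operator++() lands on the real first token.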
        public void SetTokens(const List<Token>& tokens_)
        {
            if (!tokens_.IsEmpty())
            {
                tokens.Add(tokens_.Front());
            }
            else
            {
                tokens.Add(Token(END_TOKEN, Lexeme(end, end), 1));
            }
            for (const Token& token : tokens_)
            {
                tokens.Add(token);
            }
            tokens.Add(Token(END_TOKEN, Lexeme(end, end), 1));
            current = tokens.Begin();
        }
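        // Renders the source line containing the token at 'pos' followed by a
        // caret line marking the token's extent, e.g.:
        //
        //     int x = @;
        //             ^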
        public ustring ErrorLines(long pos)
        {
            Token token = GetToken(pos);
            ustring lines;
            const uchar* lineStart = LineStart(start, token.match.begin);
            const uchar* lineEnd = LineEnd(end, token.match.end);
            lines.Append(ustring(lineStart, token.match.begin));
            lines.Append(token.match.ToString());
            lines.Append(ustring(token.match.end, lineEnd));
            lines.Append('\n', 1);
            lines.Append(' ', token.match.begin - lineStart);
            lines.Append('^', Max(cast<long>(1), token.match.end - token.match.begin));
            lines.Append(' ', lineEnd - token.match.end);
            lines.Append('\n', 1);
            return lines;
        }
        public ErrorId GetFarthestError()
        {
            ustring errorLines = ErrorLines(farthestPos);
            auto utf8 = ToUtf8(errorLines);
            if (utf8.Error())
            {
                SetErrorId(utf8.GetErrorId());
                return ErrorId(utf8.GetErrorId());
            }
            Console.Out() << utf8.Value() << endl();
            string parserStateStr = GetParserStateStr();
            string errorMessage = "parsing error at \'" + fileName + ":" + ToString(token.line) + "\':\n" + utf8.Value() + parserStateStr;
            int errorId = AllocateError(errorMessage);
            return ErrorId(errorId);
        }
        public ustring RestOfLine(int maxLineLength)
        {
            ustring restOfLine(current->match.ToString() + ustring(current->match.end, pos) + ustring(pos, LineEnd(end, pos)));
            if (maxLineLength != 0)
            {
                restOfLine = restOfLine.Substring(0, maxLineLength);
            }
            return restOfLine;
        }
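        // Generated lexers override NextState with their DFA transition
        // function; this base implementation rejects every input (-1).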
        public virtual int NextState(int state, uchar c)
        {
            return -1;
        }
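        // Tokenizes a single line starting from 'startState' and reports the
        // DFA state at the end of the line; the blockCommentStates/endState
        // pair lets constructs such as block comments continue across line
        // boundaries, which suits line-oriented rescanning (e.g. syntax
        // highlighting).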
        public TokenLine TokenizeLine(const ustring& line, int lineNumber, int startState)
        {
            pos = line.Chars();
            end = line.Chars() + line.Length();
            TokenLine tokenLine;
            tokenLine.startState = startState;
            lexeme.begin = pos;
            lexeme.end = end;
            token.match = lexeme;
            token.id = INVALID_TOKEN;
            token.line = lineNumber;
            int state = startState;
            while (pos != end)
            {
                uchar c = *pos;
                if (state == 0)
                {
                    lexeme.begin = pos;
                    token.id = INVALID_TOKEN;
                    token.line = lineNumber;
                }
                lexeme.end = pos + 1;
                int prevState = state;
                state = NextState(state, c);
                if (state == -1)
                {
                    if (prevState == 0)
                    {
                        break;
                    }
                    state = 0;
                    pos = token.match.end;
                    tokenLine.tokens.Add(token);
                    lexeme.begin = lexeme.end;
                }
                else
                {
                    ++pos;
                }
            }
            if (state != 0 && state != -1)
            {
                state = NextState(state, '\r');
            }
            if (state != 0 && state != -1)
            {
                state = NextState(state, '\n');
            }
            if (state != 0 && state != -1)
            {
                if (blockCommentStates.CFind(state) != blockCommentStates.CEnd())
                {
                    token.id = commentTokenId;
                    token.match.end = end;
                    tokenLine.tokens.Add(token);
                    tokenLine.endState = state;
                    return tokenLine;
                }
            }
            if (lexeme.begin != lexeme.end)
            {
                token.match = lexeme;
                tokenLine.tokens.Add(token);
            }
            if (state == -1)
            {
                state = 0;
            }
            tokenLine.endState = state;
            return tokenLine;
        }
        public const List<long>& RuleContext() const
        {
            return ruleContext;
        }
        public const List<long>& FarthestRuleContext() const
        {
            return farthestRuleContext;
        }
        public void SetRuleNameMapPtr(Map<long, string>* ruleNameMapPtr_)
        {
            ruleNameMapPtr = ruleNameMapPtr_;
        }
        public string GetParserStateStr() const
        {
            string parserStateStr;
            long n = farthestRuleContext.Count();
            if (ruleNameMapPtr != null && n > 0)
            {
                parserStateStr.Append("\nParser state:\n");
                for (long i = 0; i < n; ++i;)
                {
                    long ruleId = farthestRuleContext[i];
                    auto it = ruleNameMapPtr->CFind(ruleId);
                    if (it != ruleNameMapPtr->CEnd())
                    {
                        string ruleName = it->second;
                        parserStateStr.Append(ruleName).Append("\n");
                    }
                }
            }
            return parserStateStr;
        }
        public void PushRule(long ruleId)
        {
            ruleContext.Add(ruleId);
        }
        public void PopRule()
        {
            ruleContext.RemoveLast();
        }
        public List<int> GetLineStartIndeces() const
        {
            List<int> lineStartIndeces;
            for (long i = 0; i < lineStarts.Count(); ++i;)
            {
                lineStartIndeces.Add(cast<int>(lineStarts[i] - start));
            }
            return lineStartIndeces;
        }
        public void SetClassMap(int* classMap_)
        {
            classMap = classMap_;
        }
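        // Maps a character to its DFA input class; 1114112 is 0x110000, the
        // number of Unicode code points, so out-of-range input (or a missing
        // class map) yields class -1.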
        public int GetClass(uchar c) const
        {
            if (classMap != null)
            {
                int i = cast<int>(c);
                if (i < 1114112)
                {
                    return classMap[i];
                }
            }
            return -1;
        }
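        // The first line start is recorded twice, which appears intentional:
        // it makes lineStarts[line] the start of 1-based line 'line'.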
        private void ComputeLineStarts()
        {
            lineStarts.Add(pos);
            const uchar* p = pos;
            bool startOfLine = true;
            while (p != end)
            {
                if (startOfLine)
                {
                    lineStarts.Add(p);
                }
                startOfLine = *p == '\n';
                ++p;
            }
            lineStarts.Add(end);
        }
        protected Lexeme lexeme;
        protected int line;
        private ustring content;
        private string fileName;
        private KeywordMap* keywordMap;
        private const uchar* start;
        private const uchar* end;
        private const uchar* pos;
        private List<Token> tokens;
        private List<Token>.Iterator current;
        private ParsingLog* log;
        private bool countLines;
        private uchar separatorChar;
        private Set<int> blockCommentStates;
        private int commentTokenId;
        private long farthestPos;
        private List<long> ruleContext;
        private List<long> farthestRuleContext;
        private Map<long, string>* ruleNameMapPtr;
        private List<const uchar*> lineStarts;
        private string classMapName;
        private int* classMap;
        private int fileIndex;
    }

    public const uchar* LineStart(const uchar* start, const uchar* p)
    {
        while (p != start && *p != '\n' && *p != '\r')
        {
            --p;
        }
        if (p != start)
        {
            ++p;
        }
        return p;
    }

    public const uchar* LineEnd(const uchar* end, const uchar* p)
    {
        while (p != end && *p != '\n' && *p != '\r')
        {
            ++p;
        }
        return p;
    }

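    // The logging helpers below dereference lexer.Log() without a null check,
    // so they assume logging has been enabled with SetLog().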
    [nodiscard]
    public Result<bool> WriteBeginRuleToLog(Lexer& lexer, const ustring& ruleName)
    {
        auto result0 = lexer.Log()->WriteBeginRule(ruleName);
        if (result0.Error())
        {
            return result0;
        }
        lexer.Log()->IncIndent();
        auto result1 = lexer.Log()->WriteTry(lexer.RestOfLine(lexer.Log()->MaxLineLength()));
        if (result1.Error())
        {
            return result1;
        }
        lexer.Log()->IncIndent();
        return Result<bool>(true);
    }

    [nodiscard]
    public Result<bool> WriteSuccessToLog(Lexer& lexer, long pos, const ustring& ruleName)
    {
        lexer.Log()->DecIndent();
        ustring match = lexer.GetMatch(pos);
        auto result0 = lexer.Log()->WriteSuccess(match);
        if (result0.Error())
        {
            return Result<bool>(ErrorId(result0.GetErrorId()));
        }
        lexer.Log()->DecIndent();
        return lexer.Log()->WriteEndRule(ruleName);
    }

    [nodiscard]
    public Result<bool> WriteFailureToLog(Lexer& lexer, const ustring& ruleName)
    {
        lexer.Log()->DecIndent();
        auto result0 = lexer.Log()->WriteFail();
        if (result0.Error())
        {
            return Result<bool>(ErrorId(result0.GetErrorId()));
        }
        lexer.Log()->DecIndent();
        return lexer.Log()->WriteEndRule(ruleName);
    }

    public class RuleGuard<LexerT>
    {
        public RuleGuard(LexerT& lexer_, long ruleId_) : lexer(lexer_)
        {
            lexer.PushRule(ruleId_);
        }
        public ~RuleGuard()
        {
            lexer.PopRule();
        }
        private LexerT& lexer;
    }
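
    // Usage sketch for RuleGuard (the rule id 42 is a placeholder; generated
    // parsers supply real rule ids): the constructor pushes the rule onto the
    // lexer's rule context and the destructor pops it, so the context stays
    // balanced on every exit path:
    //
    //     RuleGuard<Lexer> guard(lexer, 42);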
} // namespace System.Lex