using System;
using System.Collections;

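// Lexer base class and helper functions for DFA-driven lexing: token stream management,
// source spans, error message formatting, and parsing-log support.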
namespace System.Lex
{
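    // Bit flags that record the lexer's error-recovery and diagnostic state.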
    public enum LexerFlags : sbyte
    {
        none = 0,
        synchronize = 1 << 0,
        synchronized = 1 << 1,
        synchronizedAtLeastOnce = 1 << 2,
        cursorSeen = 1 << 3,
        farthestError = 1 << 4
    }

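    // Base class for generated lexers. A derived class supplies the DFA by overriding
    // NextState(); this class drives it over [start, end), materializes tokens on demand
    // into the 'tokens' list, and provides error reporting, source-span tracking and
    // parser-log support on top of the token stream.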
    public class Lexer
    {
        public Lexer(const ustring& content_, const string& fileName_, int fileIndex_) :
            content(content_), fileName(fileName_), fileIndex(fileIndex_), line(1), keywordMap(null), start(content.Chars()), end(content.Chars() + content.Length()),
            pos(start), current(tokens.End()), log(null), countLines(true), separatorChar('\0'), commentTokenId(-1), farthestPos(GetPos())
        {
        }
        public Lexer(const uchar* start_, const uchar* end_, const string& fileName_, int fileIndex_) :
            content(), fileName(fileName_), fileIndex(fileIndex_), line(1), keywordMap(null), start(start_), end(end_),
            pos(start), current(tokens.End()), log(null), countLines(true), separatorChar('\0'), commentTokenId(-1), farthestPos(GetPos())
        {
        }
        suppress Lexer(const Lexer&);
        suppress void operator=(const Lexer&);
        public void SetBlockCommentStates(const Set<int>& blockCommentStates_)
        {
            blockCommentStates = blockCommentStates_;
        }
        public nothrow const Set<int>& BlockCommentStates() const
        {
            return blockCommentStates;
        }
        public nothrow void SetCommentTokenId(int commentTokenId_)
        {
            commentTokenId = commentTokenId_;
        }
        protected virtual nothrow int GetCommentTokenId() const
        {
            return -1;
        }
        public virtual ~Lexer()
        {
        }
        public int operator*() const
        {
            return current->id;
        }
        public void SetKeywordMap(KeywordMap* keywordMap_)
        {
            keywordMap = keywordMap_;
        }
        public KeywordMap* GetKeywordMap()
        {
            return keywordMap;
        }
        public void Retract()
        {
            token.match.end = pos;
        }
        public const string& FileName() const
        {
            return fileName;
        }
        public Span GetSpan() const
        {
            return Span(fileIndex, line, cast<int>(GetPos()));
        }
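        // Spans under construction are kept on a stack: PushSpan() saves the current span
        // and starts a fresh one, SetSpan() records its start on the first call and extends
        // its end on later calls, and PopSpan() restores the saved span and returns the
        // finished one.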
        public void PushSpan()
        {
            spanStack.Push(currentSpan);
            currentSpan = Span(fileIndex, line, -1, -1);
        }
        public Span PopSpan()
        {
            Span s = currentSpan;
            currentSpan = spanStack.Pop();
            return s;
        }
        public void SetSpan(long pos)
        {
            if (currentSpan.start == -1)
            {
                currentSpan.line = line;
                currentSpan.start = cast<int>(pos);
            }
            else
            {
                currentSpan.end = cast<int>(pos);
            }
        }
        public inline nothrow Span GetCurrentSpan() const
        {
            return currentSpan;
        }
        public void SetLine(int line_)
        {
            line = line_;
        }
        public void SetCountLines(bool countLines_)
        {
            countLines = countLines_;
        }
        public Token token;
        public const uchar* Start() const
        {
            return start;
        }
        public const uchar* End() const
        {
            return end;
        }
        public const uchar* Pos() const
        {
            return pos;
        }
        public void SetLog(ParsingLog* log_)
        {
            log = log_;
        }
        public ParsingLog* Log() const
        {
            return log;
        }
        public void SetSeparatorChar(uchar separatorChar_)
        {
            separatorChar = separatorChar_;
        }
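        // Advances to the next token, lexing more input on demand when the cursor reaches
        // the end of the materialized token list. With the farthestError flag set, the
        // position and rule context of the farthest point reached are recorded for
        // error reporting.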
        public void operator++()
        {
            if (current != tokens.End())
            {
                ++current;
            }
            if (current == tokens.End())
            {
                NextToken();
            }
            else
            {
                line = current->line;
            }
            if (GetFlag(LexerFlags.farthestError))
            {
                long p = GetPos();
                if (p > farthestPos)
                {
                    farthestPos = p;
                    farthestRuleContext = ruleContext;
                }
            }
        }
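        // A lexer position packs the current line number into the high 32 bits and the
        // index of the current token into the low 32 bits, so a position can be saved,
        // compared and restored as a single long value.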
        public long GetPos() const
        {
            int p = cast<int>(current - tokens.Begin());
            return (cast<long>(line) << 32) | cast<long>(p);
        }
        public void SetPos(long pos)
        {
            current = tokens.Begin() + cast<int>(pos);
            line = cast<int>(pos >> 32);
        }
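        // Runs the DFA from state 0 over the input until it rejects (NextState() returns -1),
        // then acts on the token recognized so far: CONTINUE_TOKEN restarts the DFA after the
        // matched input (typically skipped whitespace or comments), INVALID_TOKEN raises an
        // error, and any other id is appended to the token list. At end of input an END_TOKEN
        // sentinel is appended.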
        public void NextToken()
        {
            int state = 0;
            while (true)
            {
                uchar c = separatorChar;
                if (pos != end)
                {
                    c = *pos;
                }
                else if (c == '\0')
                {
                    break;
                }
                if (state == 0)
                {
                    lexeme.begin = pos;
                    token.id = INVALID_TOKEN;
                    token.line = line;
                }
                if (pos == end)
                {
                    lexeme.end = end;
                }
                else
                {
                    lexeme.end = pos + 1;
                }
                state = NextState(state, c);
                if (state == -1)
                {
                    if (token.id == CONTINUE_TOKEN)
                    {
                        if (pos == end)
                        {
                            break;
                        }
                        else
                        {
                            pos = token.match.end;
                        }
                        state = 0;
                        continue;
                    }
                    else if (token.id == INVALID_TOKEN)
                    {
                        if (pos == end)
                        {
                            break;
                        }
                        else
                        {
                            throw Exception("System.Lex.Lexer.NextToken(): error: invalid character \'" + ToUtf8(ustring(c, 1)) + "\' in file \'" + fileName + "\' at line " + ToString(line));
                        }
                    }
                    else
                    {
                        tokens.Add(token);
                        current = tokens.End() - 1;
                        pos = token.match.end;
                        return;
                    }
                }
                if (c == '\n' && countLines)
                {
                    ++line;
                }
                ++pos;
            }
            token.id = INVALID_TOKEN;
            state = NextState(state, '\0');
            long p = -1;
            if (token.id != INVALID_TOKEN && token.id != CONTINUE_TOKEN)
            {
                tokens.Add(token);
                current = tokens.End() - 1;
                p = GetPos();
            }
            Token endToken(END_TOKEN);
            endToken.match.begin = end;
            endToken.match.end = end;
            tokens.Add(endToken);
            if (p == -1)
            {
                current = tokens.End() - 1;
                p = GetPos();
            }
            SetPos(p);
        }
        public int GetKeywordToken(const Lexeme& lexeme) const
        {
            if (keywordMap != null)
            {
                return keywordMap->GetKeywordToken(lexeme);
            }
            else
            {
                return INVALID_TOKEN;
            }
        }
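        // Converts a span whose start and end are token indices into a span of character
        // offsets from the beginning of the input, for consumers that work on raw text.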
        public void ConvertExternal(Span& span)
        {
            Token startToken = GetToken(span.start);
            span.start = cast<int>(startToken.match.begin - start);
            Token endToken = startToken;
            if (span.end != -1)
            {
                endToken = GetToken(span.end);
            }
            span.end = cast<int>(endToken.match.end - start);
        }
        public Token GetToken(long pos) const
        {
            int tokenIndex = cast<int>(pos);
            if (tokenIndex >= 0 && tokenIndex < tokens.Count())
            {
                return tokens[tokenIndex];
            }
            else
            {
                throw Exception("invalid token index");
            }
        }
        public char GetChar(long pos) const
        {
            Token t = GetToken(pos);
            return cast<char>(*t.match.begin);
        }
        public wchar GetWChar(long pos) const
        {
            Token t = GetToken(pos);
            return cast<wchar>(*t.match.begin);
        }
        public uchar GetUChar(long pos) const
        {
            Token t = GetToken(pos);
            return *t.match.begin;
        }
        public int GetInt(long pos) const
        {
            Token t = GetToken(pos);
            return ParseInt(ToUtf8(t.match.ToString()));
        }
        public double GetDouble(long pos) const
        {
            Token t = GetToken(pos);
            return ParseDouble(ToUtf8(t.match.ToString()));
        }
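        // Installs a pregenerated token list. The first token is deliberately added twice:
        // since the cursor starts at Begin(), the extra copy acts as a starting sentinel and
        // the first operator++() call moves onto the first real token. An END_TOKEN sentinel
        // closes the list.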
        public void SetTokens(const List<Token>& tokens_)
        {
            if (!tokens_.IsEmpty())
            {
                tokens.Add(tokens_.Front());
            }
            else
            {
                tokens.Add(Token(END_TOKEN, Lexeme(end, end), 1));
            }
            for (const Token& token : tokens_)
            {
                tokens.Add(token);
            }
            tokens.Add(Token(END_TOKEN, Lexeme(end, end), 1));
            current = tokens.Begin();
        }
        public ustring GetMatch(const Span& span) const
        {
            ustring match;
            Token startToken = GetToken(span.start);
            match.Append(startToken.match.ToString());
            const uchar* e = startToken.match.end;
            for (int i = span.start + 1; i <= span.end; ++i;)
            {
                Token token = GetToken(i);
                match.Append(ustring(' ', token.match.begin - e));
                match.Append(token.match.ToString());
                e = token.match.end;
            }
            return match;
        }
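        // Renders the source line(s) containing the given token or span, followed by a
        // second line with a run of carets (^) underlining the matched text, for use in
        // error messages.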
        public ustring ErrorLines(const Token& token) const
        {
            ustring lines;
            const uchar* lineStart = LineStart(start, token.match.begin);
            const uchar* lineEnd = LineEnd(end, token.match.end);
            lines.Append(ustring(lineStart, token.match.begin));
            lines.Append(token.match.ToString());
            lines.Append(ustring(token.match.end, lineEnd));
            lines.Append('\n', 1);
            lines.Append(' ', token.match.begin - lineStart);
            lines.Append('^', Max(cast<long>(1), token.match.end - token.match.begin));
            lines.Append(' ', lineEnd - token.match.end);
            lines.Append('\n', 1);
            return lines;
        }
        public ustring ErrorLines(const Span& span) const
        {
            ustring lines;
            Token startToken = GetToken(span.start);
            Token endToken = startToken;
            const uchar* lineStart = LineStart(start, startToken.match.begin);
            if (span.end != -1 && span.end != span.start)
            {
                endToken = GetToken(span.end);
            }
            const uchar* lineEnd = LineEnd(end, endToken.match.end);
            lines.Append(ustring(lineStart, startToken.match.begin));
            lines.Append(startToken.match.ToString());
            const uchar* s = startToken.match.begin;
            const uchar* e = startToken.match.end;
            for (int i = span.start + 1; i <= span.end; ++i;)
            {
                Token token = GetToken(i);
                lines.Append(ustring(' ', token.match.begin - e));
                lines.Append(token.match.ToString());
                e = token.match.end;
            }
            lines.Append(ustring(e, lineEnd));
            lines.Append('\n', 1);
            lines.Append(' ', s - lineStart);
            lines.Append('^', Max(cast<long>(1), e - s));
            lines.Append(' ', lineEnd - e);
            lines.Append('\n', 1);
            return lines;
        }
        public void GetColumns(const Span& span, int& startCol, int& endCol) const
        {
            Token startToken = GetToken(span.start);
            Token endToken = startToken;
            const uchar* lineStart = LineStart(start, startToken.match.begin);
            if (span.end != -1 && span.end != span.start)
            {
                endToken = GetToken(span.end);
            }
            int cols = cast<int>(startToken.match.begin - lineStart);
            if (cols < 0)
            {
                cols = 0;
            }
            startCol = cols + 1;
            const uchar* lineEnd = LineEnd(end, endToken.match.end);
            if (lineEnd < lineStart)
            {
                lineEnd = lineStart;
            }
            int lineLength = cast<int>(lineEnd - lineStart);
            int spanCols = Max(cast<int>(1), Min(span.end - span.start, lineLength - cols));
            endCol = startCol + spanCols;
        }
        public void ThrowExpectationFailure(const Span& span, const ustring& name)
        {
            Token token = GetToken(span.start);
            throw ParsingException("parsing error in \'" + fileName + ":" + ToString(token.line) + "\': " + ToUtf8(name) + " expected:\n" + ToUtf8(ErrorLines(span)), fileName, span);
        }
        public string GetFarthestError() const
        {
            Token token = GetToken(farthestPos);
            string parserStateStr = GetParserStateStr();
            return "parsing error at '" + fileName + ":" + ToString(token.line) + "':\n" + ToUtf8(ErrorLines(token)) + parserStateStr;
        }
        public void ThrowFarthestError()
        {
            throw ParsingException(GetFarthestError(), fileName);
        }
        public void AddError(const Span& span, const ustring& name)
        {
            if (GetFlag(LexerFlags.synchronize) && GetFlag(LexerFlags.synchronized))
            {
                SetFlag(LexerFlags.synchronizedAtLeastOnce);
            }
            else
            {
                Token token = GetToken(span.start);
                ParsingException* error(new ParsingException("parsing error in '" + fileName + ":" + ToString(token.line) + "': " + ToUtf8(name) + " expected:\n" + ToUtf8(ErrorLines(span)), fileName, span));
                errors.Add(UniquePtr<Exception>(error));
            }
        }
        public nothrow List<UniquePtr<Exception>> Errors()
        {
            return Rvalue(errors);
        }
        public ustring RestOfLine(int maxLineLength)
        {
            ustring restOfLine(current->match.ToString() + ustring(current->match.end, pos) + ustring(pos, LineEnd(end, pos)));
            if (maxLineLength != 0)
            {
                restOfLine = restOfLine.Substring(0, maxLineLength);
            }
            return restOfLine;
        }
        public virtual int NextState(int state, uchar c)
        {
            return -1;
        }
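        // Tokenizes a single editor line starting from the given DFA state and returns the
        // tokens plus the end state, so that syntax highlighting can resume mid-construct
        // (for example inside a block comment) on the next line.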
        public TokenLine TokenizeLine(const ustring& line, int lineNumber, int startState)
        {
            pos = line.Chars();
            end = line.Chars() + line.Length();
            TokenLine tokenLine;
            tokenLine.startState = startState;
            lexeme.begin = pos;
            lexeme.end = end;
            token.match = lexeme;
            token.id = INVALID_TOKEN;
            token.line = lineNumber;
            int state = startState;
            while (pos != end)
            {
                uchar c = *pos;
                if (state == 0)
                {
                    lexeme.begin = pos;
                    token.id = INVALID_TOKEN;
                    token.line = lineNumber;
                }
                lexeme.end = pos + 1;
                int prevState = state;
                state = NextState(state, c);
                if (state == -1)
                {
                    if (prevState == 0)
                    {
                        break;
                    }
                    state = 0;
                    pos = token.match.end;
                    tokenLine.tokens.Add(token);
                    lexeme.begin = lexeme.end;
                }
                else
                {
                    ++pos;
                }
            }
            if (state != 0 && state != -1)
            {
                state = NextState(state, '\r');
            }
            if (state != 0 && state != -1)
            {
                state = NextState(state, '\n');
            }
            if (state != 0 && state != -1)
            {
                if (blockCommentStates.CFind(state) != blockCommentStates.CEnd())
                {
                    token.id = commentTokenId;
                    token.match.end = end;
                    tokenLine.tokens.Add(token);
                    tokenLine.endState = state;
                    return tokenLine;
                }
            }
            if (lexeme.begin != lexeme.end)
            {
                token.match = lexeme;
                tokenLine.tokens.Add(token);
            }
            if (state == -1)
            {
                state = 0;
            }
            tokenLine.endState = state;
            return tokenLine;
        }
        public nothrow void SetSyncTokens(const List<int>& syncTokens_)
        {
            syncTokens = syncTokens_;
        }
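        // Error recovery: when the synchronize flag is set, skips tokens until one of the
        // registered sync tokens (typically statement or declaration delimiters) is reached,
        // so that parsing can continue after an error. Returns true if a sync token was found.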
        public nothrow bool Synchronize()
        {
            if (GetFlag(LexerFlags.synchronize))
            {
                if (GetFlag(LexerFlags.synchronized)) return false;
                SetFlag(LexerFlags.synchronized);
                while (pos != end)
                {
                    int curToken = token.id;
                    for (int syncToken : syncTokens)
                    {
                        if (curToken == syncToken)
                        {
                            return true;
                        }
                    }
                    Lexer& lexer = *this;
                    ++lexer;
                }
            }
            return false;
        }
        public inline nothrow LexerFlags Flags() const
        {
            return flags;
        }
        public inline nothrow bool GetFlag(LexerFlags flag) const
        {
            return (flags & flag) != LexerFlags.none;
        }
        public inline nothrow void SetFlag(LexerFlags flag)
        {
            flags = cast<LexerFlags>(flags | flag);
        }
        public inline nothrow void ResetFlag(LexerFlags flag)
        {
            flags = cast<LexerFlags>(flags & ~flag);
        }
        public nothrow const List<int>& RuleContext() const
        {
            return ruleContext;
        }
        public nothrow const List<int>& FarthestRuleContext() const
        {
            return farthestRuleContext;
        }
        public nothrow void SetRuleNameListPtr(List<string>* ruleNameListPtr_)
        {
            ruleNameListPtr = ruleNameListPtr_;
        }
        public nothrow string GetParserStateStr() const
        {
            string parserStateStr;
            long n = farthestRuleContext.Count();
            if (ruleNameListPtr != null && n > 0)
            {
                parserStateStr.Append("\nParser state:\n");
                for (long i = 0; i < n; ++i;)
                {
                    int ruleId = farthestRuleContext[i];
                    if (ruleId >= 0 && ruleId < ruleNameListPtr->Count())
                    {
                        string ruleName = (*ruleNameListPtr)[ruleId];
                        parserStateStr.Append(ruleName).Append("\n");
                    }
                }
            }
            return parserStateStr;
        }
        public void PushRule(int ruleId)
        {
            ruleContext.Add(ruleId);
        }
        public void PopRule()
        {
            ruleContext.RemoveLast();
        }
        protected Lexeme lexeme;
        protected int line;
        private ustring content;
        private string fileName;
        private int fileIndex;
        private KeywordMap* keywordMap;
        private const uchar* start;
        private const uchar* end;
        private const uchar* pos;
        private List<Token> tokens;
        private List<Token>.Iterator current;
        private List<UniquePtr<Exception>> errors;
        private List<int> syncTokens;
        private ParsingLog* log;
        private bool countLines;
        private uchar separatorChar;
        private Stack<Span> spanStack;
        private Span currentSpan;
        private Set<int> blockCommentStates;
        private int commentTokenId;
        private LexerFlags flags;
        private long farthestPos;
        private List<int> ruleContext;
        private List<int> farthestRuleContext;
        private List<string>* ruleNameListPtr;
    }
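    // LineStart and LineEnd scan backward and forward from p to the nearest line boundary
    // ('\n' or '\r'), staying within [start, end). They are used to extract whole source
    // lines for error output.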
    public const uchar* LineStart(const uchar* start, const uchar* p)
    {
        while (p != start && *p != '\n' && *p != '\r')
        {
            --p;
        }
        if (p != start)
        {
            ++p;
        }
        return p;
    }
    public const uchar* LineEnd(const uchar* end, const uchar* p)
    {
        while (p != end && *p != '\n' && *p != '\r')
        {
            ++p;
        }
        return p;
    }
    public ustring GetErrorLines(const uchar* start, const uchar* end, const Span& externalSpan)
    {
        const uchar* startPos = start + externalSpan.start;
        if (startPos < start || startPos >= end)
        {
            return ustring();
        }
        const uchar* lineStart = LineStart(start, startPos);
        int cols = cast<int>(startPos - lineStart);
        if (cols < 0)
        {
            cols = 0;
        }
        const uchar* lineEnd = LineEnd(end, startPos);
        if (lineEnd < lineStart)
        {
            lineEnd = lineStart;
        }
        int lineLength = cast<int>(lineEnd - lineStart);
        ustring lines(lineStart, lineEnd);
        int spanCols = Max(cast<int>(1), Min(externalSpan.end - externalSpan.start, lineLength - cols));
        lines.Append('\n', 1).Append(ustring(' ', cols)).Append('^', spanCols);
        return lines;
    }
    public void GetColumns(const uchar* start, const uchar* end, const Span& externalSpan, int& startCol, int& endCol)
    {
        startCol = 0;
        endCol = 0;
        const uchar* startPos = start + externalSpan.start;
        if (startPos < start || startPos >= end)
        {
            return;
        }
        const uchar* lineStart = LineStart(start, startPos);
        int cols = cast<int>(startPos - lineStart);
        if (cols < 0)
        {
            cols = 0;
        }
        startCol = cols + 1;
        const uchar* lineEnd = LineEnd(end, startPos);
        if (lineEnd < lineStart)
        {
            lineEnd = lineStart;
        }
        int lineLength = cast<int>(lineEnd - lineStart);
        int spanCols = Max(cast<int>(1), Min(externalSpan.end - externalSpan.start, lineLength - cols));
        endCol = startCol + spanCols;
    }
    public void WriteBeginRuleToLog(Lexer& lexer, const ustring& ruleName)
    {
        lexer.Log()->WriteBeginRule(ruleName);
        lexer.Log()->IncIndent();
        lexer.Log()->WriteTry(lexer.RestOfLine(lexer.Log()->MaxLineLength()));
        lexer.Log()->IncIndent();
    }
    public void WriteSuccessToLog(Lexer& lexer, const Span& matchSpan, const ustring& ruleName)
    {
        lexer.Log()->DecIndent();
        lexer.Log()->WriteSuccess(lexer.GetMatch(matchSpan));
        lexer.Log()->DecIndent();
        lexer.Log()->WriteEndRule(ruleName);
    }
    public void WriteFailureToLog(Lexer& lexer, const ustring& ruleName)
    {
        lexer.Log()->DecIndent();
        lexer.Log()->WriteFail();
        lexer.Log()->DecIndent();
        lexer.Log()->WriteEndRule(ruleName);
    }

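    // RAII helper: pushes a rule id onto the lexer's rule context on construction and pops
    // it on destruction, keeping the context balanced across early returns and exceptions.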
    public class RuleGuard
    {
        public nothrow RuleGuard(Lexer& lexer_, int ruleId_) : lexer(lexer_)
        {
            lexer.PushRule(ruleId_);
        }
        public ~RuleGuard()
        {
            lexer.PopRule();
        }
        private Lexer& lexer;
    }
}
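
// Usage sketch (illustrative, not part of the library): a generated lexer derives from
// Lexer and overrides NextState() with its DFA transition function. 'MyLexer' below is
// hypothetical. The cursor starts past the end of the empty token list, so the first
// operator++() lexes the first token; a typical driver loop looks like this:
//
//   MyLexer lexer(content, fileName, 0);
//   ++lexer;                                       // lex the first token
//   while (*lexer != END_TOKEN)
//   {
//       Token token = lexer.GetToken(lexer.GetPos());
//       // ... consume token ...
//       ++lexer;
//   }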