using System;
using System.Collections;

namespace System.Lex
{
    public enum LexerFlags : sbyte
    {
        none = 0, synchronize = 1 << 0, synchronized = 1 << 1, synchronizedAtLeastOnce = 1 << 2, cursorSeen = 1 << 3, farthestError = 1 << 4
    }

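    // Lexer is the base class for generated lexers. A derived class supplies the
    // DFA transition function by overriding NextState(); this base class drives the
    // token recognition loop, buffers the recognized tokens, and provides error
    // reporting and parser synchronization support.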
    public class Lexer
    {
        public Lexer(const ustring& content_, const string& fileName_, int fileIndex_) :
            content(content_), fileName(fileName_), fileIndex(fileIndex_), line(1), keywordMap(null), start(content.Chars()), end(content.Chars() + content.Length()),
            pos(start), current(tokens.End()), log(null), countLines(true), separatorChar('\0'),
            commentTokenId(-1), farthestPos(GetPos())
        {
        }
        public Lexer(const uchar* start_, const uchar* end_, const string& fileName_, int fileIndex_) :
            content(), fileName(fileName_), fileIndex(fileIndex_), line(1), keywordMap(null), start(start_), end(end_), pos(start), current(tokens.End()), log(null), countLines(true), separatorChar('\0'),
            commentTokenId(-1), farthestPos(GetPos())
        {
        }
        suppress Lexer(const Lexer&);
        suppress void operator=(const Lexer&);
        public void SetBlockCommentStates(const Set<int>& blockCommentStates_)
        {
            blockCommentStates = blockCommentStates_;
        }
        public nothrow const Set<int>& BlockCommentStates() const
        {
            return blockCommentStates;
        }
        public nothrow void SetCommentTokenId(int commentTokenId_)
        {
            commentTokenId = commentTokenId_;
        }
        protected virtual nothrow int GetCommentTokenId() const
        {
            return -1;
        }
        public virtual ~Lexer()
        {
        }
        public int operator*() const
        {
            return current->id;
        }
        public void SetKeywordMap(KeywordMap* keywordMap_)
        {
            keywordMap = keywordMap_;
        }
        public KeywordMap* GetKeywordMap()
        {
            return keywordMap;
        }
        public void Retract()
        {
            token.match.end = pos;
        }
        public const string& FileName() const
        {
            return fileName;
        }
        public Span GetSpan() const
        {
            return Span(fileIndex, line, cast<int>(GetPos()));
        }
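        // PushSpan()/PopSpan() maintain a stack of spans so that nested grammar
        // rules can each accumulate their own span. SetSpan() records the first
        // reported position as the span start and any later position as the span
        // end.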
        public void PushSpan()
        {
            spanStack.Push(currentSpan);
            currentSpan = Span(fileIndex, line, -1, -1);
        }
        public Span PopSpan()
        {
            Span s = currentSpan;
            currentSpan = spanStack.Pop();
            return s;
        }
        public void SetSpan(long pos)
        {
            if (currentSpan.start == -1)
            {
                currentSpan.line = line;
                currentSpan.start = cast<int>(pos);
            }
            else
            {
                currentSpan.end = cast<int>(pos);
            }
        }
        public inline nothrow Span GetCurrentSpan() const
        {
            return currentSpan;
        }
        public void SetLine(int line_)
        {
            line = line_;
        }
        public void SetCountLines(bool countLines_)
        {
            countLines = countLines_;
        }
        public Token token;
        public const uchar* Start() const
        {
            return start;
        }
        public const uchar* End() const
        {
            return end;
        }
        public const uchar* Pos() const
        {
            return pos;
        }
        public void SetLog(ParsingLog* log_)
        {
            log = log_;
        }
        public ParsingLog* Log() const
        {
            return log;
        }
        public void SetSeparatorChar(uchar separatorChar_)
        {
            separatorChar = separatorChar_;
        }
        public void operator++()
        {
            if (current != tokens.End())
            {
                ++current;
            }
            if (current == tokens.End())
            {
                NextToken();
            }
            else
            {
                line = current->line;
            }
            if (GetFlag(LexerFlags.farthestError))
            {
                long p = GetPos();
                if (p > farthestPos)
                {
                    farthestPos = p;
                    farthestRuleContext = ruleContext;
                }
            }
        }
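        // GetPos() packs the lexer position into a single long: the current line
        // number occupies the high 32 bits and the token index (current -
        // tokens.Begin()) the low 32 bits, so for example line 3 at token index 7
        // encodes as (3 << 32) | 7. SetPos() reverses the packing.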
        public long GetPos() const
        {
            int p = cast<int>(current - tokens.Begin());
            return (cast<long>(line) << 32) | cast<long>(p);
        }
        public void SetPos(long pos)
        {
            current = tokens.Begin() + cast<int>(pos);
            line = cast<int>(pos >> 32);
        }
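        // NextToken() drives the DFA supplied by NextState(). A generated
        // NextState() is expected to record each accepting match in 'token' as it
        // goes; when it returns -1 the recorded match is emitted and scanning
        // resumes at the end of that match (maximal munch). CONTINUE_TOKEN marks
        // matches that are consumed without being emitted, such as whitespace and
        // comments.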
        public void NextToken()
        {
            int state = 0;
            while (true)
            {
                uchar c = separatorChar; // at end of input the DFA is fed the separator character, if any
                if (pos != end)
                {
                    c = *pos;
                }
                else if (c == '\0')
                {
                    break;
                }
                if (state == 0)
                {
                    lexeme.begin = pos;
                    token.id = INVALID_TOKEN;
                    token.line = line;
                }
                if (pos == end)
                {
                    lexeme.end = end;
                }
                else
                {
                    lexeme.end = pos + 1;
                }
                state = NextState(state, c);
                if (state == -1)
                {
                    if (token.id == CONTINUE_TOKEN)
                    {
                        if (pos == end)
                        {
                            break;
                        }
                        else
                        {
                            pos = token.match.end;
                        }
                        state = 0;
                        continue;
                    }
                    else if (token.id == INVALID_TOKEN)
                    {
                        if (pos == end)
                        {
                            break;
                        }
                        else
                        {
                            throw Exception("System.Lex.Lexer.NextToken(): error: invalid character \'" + ToUtf8(ustring(c, 1)) + "\' in file \'" + fileName + "\' at line " + ToString(line));
                        }
                    }
                    else
                    {
                        tokens.Add(token);
                        current = tokens.End() - 1;
                        pos = token.match.end;
                        return;
                    }
                }
                if (c == '\n' && countLines)
                {
                    ++line;
                }
                ++pos;
            }
            // end of input: flush a possible final match and append the END_TOKEN sentinel
            token.id = INVALID_TOKEN;
            state = NextState(state, '\0');
            long p = -1;
            if (token.id != INVALID_TOKEN && token.id != CONTINUE_TOKEN)
            {
                tokens.Add(token);
                current = tokens.End() - 1;
                p = GetPos();
            }
            Token endToken(END_TOKEN);
            endToken.match.begin = end;
            endToken.match.end = end;
            tokens.Add(endToken);
            if (p == -1)
            {
                current = tokens.End() - 1;
                p = GetPos();
            }
            SetPos(p);
        }
        public int GetKeywordToken(const Lexeme& lexeme) const
        {
            if (keywordMap != null)
            {
                return keywordMap->GetKeywordToken(lexeme);
            }
            else
            {
                return INVALID_TOKEN;
            }
        }
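        // ConvertExternal() converts a span expressed in token indexes into a span
        // of character offsets from the start of the input text.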
        public void ConvertExternal(Span& span)
        {
            Token startToken = GetToken(span.start);
            span.start = cast<int>(startToken.match.begin - start);
            Token endToken = startToken;
            if (span.end != -1)
            {
                endToken = GetToken(span.end);
            }
            span.end = cast<int>(endToken.match.end - start);
        }
        public Token GetToken(long pos) const
        {
            int tokenIndex = cast<int>(pos);
            if (tokenIndex >= 0 && tokenIndex < tokens.Count())
            {
                return tokens[tokenIndex];
            }
            else
            {
                throw Exception("invalid token index");
            }
        }
        public char GetChar(long pos) const
        {
            Token t = GetToken(pos);
            return cast<char>(*t.match.begin);
        }
        public wchar GetWChar(long pos) const
        {
            Token t = GetToken(pos);
            return cast<wchar>(*t.match.begin);
        }
        public uchar GetUChar(long pos) const
        {
            Token t = GetToken(pos);
            return *t.match.begin;
        }
        public int GetInt(long pos) const
        {
            Token t = GetToken(pos);
            return ParseInt(ToUtf8(t.match.ToString()));
        }
        public double GetDouble(long pos) const
        {
            Token t = GetToken(pos);
            return ParseDouble(ToUtf8(t.match.ToString()));
        }
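        // SetTokens() primes the token buffer from an externally produced token
        // list. Note that the first token is added twice, apparently so that the
        // first increment of the lexer lands on it; an END_TOKEN sentinel is
        // appended at the end.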
        public void SetTokens(const List<Token>& tokens_)
        {
            if (!tokens_.IsEmpty())
            {
                tokens.Add(tokens_.Front());
            }
            else
            {
                tokens.Add(Token(END_TOKEN, Lexeme(end, end), 1));
            }
            for (const Token& token : tokens_)
            {
                tokens.Add(token);
            }
            tokens.Add(Token(END_TOKEN, Lexeme(end, end), 1));
            current = tokens.Begin();
        }
        public ustring GetMatch(const Span& span) const
        {
            ustring match;
            Token startToken = GetToken(span.start);
            match.Append(startToken.match.ToString());
            const uchar* e = startToken.match.end;
            for (int i = span.start + 1; i <= span.end; ++i;)
            {
                Token token = GetToken(i);
                match.Append(ustring(' ', token.match.begin - e));
                match.Append(token.match.ToString());
                e = token.match.end;
            }
            return match;
        }
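        // The ErrorLines() overloads render the source line(s) containing a token
        // or span, followed by a caret line ('^') underlining the matched text,
        // for inclusion in error messages.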
        public ustring ErrorLines(const Token& token) const
        {
            ustring lines;
            const uchar* lineStart = LineStart(start, token.match.begin);
            const uchar* lineEnd = LineEnd(end, token.match.end);
            lines.Append(ustring(lineStart, token.match.begin));
            lines.Append(token.match.ToString());
            lines.Append(ustring(token.match.end, lineEnd));
            lines.Append('\n', 1);
            lines.Append(' ', token.match.begin - lineStart);
            lines.Append('^', Max(cast<long>(1), token.match.end - token.match.begin));
            lines.Append(' ', lineEnd - token.match.end);
            lines.Append('\n', 1);
            return lines;
        }
        public ustring ErrorLines(const Span& span) const
        {
            ustring lines;
            Token startToken = GetToken(span.start);
            Token endToken = startToken;
            const uchar* lineStart = LineStart(start, startToken.match.begin);
            if (span.end != -1 && span.end != span.start)
            {
                endToken = GetToken(span.end);
            }
            const uchar* lineEnd = LineEnd(end, endToken.match.end);
            lines.Append(ustring(lineStart, startToken.match.begin));
            lines.Append(startToken.match.ToString());
            const uchar* s = startToken.match.begin;
            const uchar* e = startToken.match.end;
            for (int i = span.start + 1; i <= span.end; ++i;)
            {
                Token token = GetToken(i);
                lines.Append(ustring(' ', token.match.begin - e));
                lines.Append(token.match.ToString());
                e = token.match.end;
            }
            lines.Append(ustring(e, lineEnd));
            lines.Append('\n', 1);
            lines.Append(' ', s - lineStart);
            lines.Append('^', Max(cast<long>(1), e - s));
            lines.Append(' ', lineEnd - e);
            lines.Append('\n', 1);
            return lines;
        }
        public void GetColumns(const Span& span, int& startCol, int& endCol) const
        {
            Token startToken = GetToken(span.start);
            Token endToken = startToken;
            const uchar* lineStart = LineStart(start, startToken.match.begin);
            if (span.end != -1 && span.end != span.start)
            {
                endToken = GetToken(span.end);
            }
            int cols = cast<int>(startToken.match.begin - lineStart);
            if (cols < 0)
            {
                cols = 0;
            }
            startCol = cols + 1;
            const uchar* lineEnd = LineEnd(end, endToken.match.end);
            if (lineEnd < lineStart)
            {
                lineEnd = lineStart;
            }
            int lineLength = cast<int>(lineEnd - lineStart);
            int spanCols = Max(cast<int>(1), Min(span.end - span.start, lineLength - cols));
            endCol = startCol + spanCols;
        }
        public void ThrowExpectationFailure(const Span& span, const ustring& name)
        {
            Token token = GetToken(span.start);
            throw ParsingException("parsing error in \'" + fileName + ":" + ToString(token.line) + "\': " + ToUtf8(name) + " expected:\n" + ToUtf8(ErrorLines(span)), fileName, span);
        }
        public string GetFarthestError() const
        {
            Token token = GetToken(farthestPos);
            string parserStateStr = GetParserStateStr();
            return "parsing error at '" + fileName + ":" + ToString(token.line) + "':\n" + ToUtf8(ErrorLines(token)) + parserStateStr;
        }
        public void ThrowFarthestError()
        {
            throw ParsingException(GetFarthestError(), fileName);
        }
        public void AddError(const Span& span, const ustring& name)
        {
            if (GetFlag(LexerFlags.synchronize) && GetFlag(LexerFlags.synchronized))
            {
                SetFlag(LexerFlags.synchronizedAtLeastOnce);
            }
            else
            {
                Token token = GetToken(span.start);
                ParsingException* error(new ParsingException("parsing error in '" + fileName + ":" + ToString(token.line) + "': " + ToUtf8(name) + " expected:\n" + ToUtf8(ErrorLines(span)), fileName, span));
                errors.Add(UniquePtr<Exception>(error));
            }
        }
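        // Errors() moves the accumulated error list to the caller; the lexer's own
        // list is empty afterwards.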
        public nothrow List<UniquePtr<Exception>> Errors()
        {
            return Rvalue(errors);
        }
        public ustring RestOfLine(int maxLineLength)
        {
            ustring restOfLine(current->match.ToString() + ustring(current->match.end, pos) + ustring(pos, LineEnd(end, pos)));
            if (maxLineLength != 0)
            {
                restOfLine = restOfLine.Substring(0, maxLineLength);
            }
            return restOfLine;
        }
        public virtual int NextState(int state, uchar c)
        {
            return -1;
        }
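        // TokenizeLine() tokenizes a single source line, for example for an editor
        // that highlights one line at a time. The DFA starts in 'startState' and
        // the end state is returned in TokenLine.endState, so that the next line
        // can resume inside a multi-line construct such as a block comment.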
        public TokenLine TokenizeLine(const ustring& line, int lineNumber, int startState)
        {
            pos = line.Chars();
            end = line.Chars() + line.Length();
            TokenLine tokenLine;
            tokenLine.startState = startState;
            lexeme.begin = pos;
            lexeme.end = end;
            token.match = lexeme;
            token.id = INVALID_TOKEN;
            token.line = lineNumber;
            int state = startState;
            while (pos != end)
            {
                uchar c = *pos;
                if (state == 0)
                {
                    lexeme.begin = pos;
                    token.id = INVALID_TOKEN;
                    token.line = lineNumber;
                }
                lexeme.end = pos + 1;
                int prevState = state;
                state = NextState(state, c);
                if (state == -1)
                {
                    if (prevState == 0)
                    {
                        break;
                    }
                    state = 0;
                    pos = token.match.end;
                    tokenLine.tokens.Add(token);
                    lexeme.begin = lexeme.end;
                }
                else
                {
                    ++pos;
                }
            }
            if (state != 0 && state != -1)
            {
                state = NextState(state, '\r');
            }
            if (state != 0 && state != -1)
            {
                state = NextState(state, '\n');
            }
            if (state != 0 && state != -1)
            {
                if (blockCommentStates.CFind(state) != blockCommentStates.CEnd())
                {
                    token.id = commentTokenId;
                    token.match.end = end;
                    tokenLine.tokens.Add(token);
                    tokenLine.endState = state;
                    return tokenLine;
                }
            }
            if (lexeme.begin != lexeme.end)
            {
                token.match = lexeme;
                tokenLine.tokens.Add(token);
            }
            if (state == -1)
            {
                state = 0;
            }
            tokenLine.endState = state;
            return tokenLine;
        }
        public nothrow void SetSyncTokens(const List<int>& syncTokens_)
        {
            syncTokens = syncTokens_;
        }
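        // Synchronize() skips tokens until one of the registered sync tokens is
        // reached, so that the parser can recover from an error; the synchronized
        // flag ensures that recovery is attempted only once per synchronization
        // point.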
        public nothrow bool Synchronize()
        {
            if (GetFlag(LexerFlags.synchronize))
            {
                if (GetFlag(LexerFlags.synchronized)) return false;
                SetFlag(LexerFlags.synchronized);
                while (pos != end)
                {
                    int curToken = token.id;
                    for (int syncToken : syncTokens)
                    {
                        if (curToken == syncToken)
                        {
                            return true;
                        }
                    }
                    Lexer& lexer = *this;
                    ++lexer; // advance to the next token via operator++()
                }
            }
            return false;
        }
        public inline nothrow LexerFlags Flags() const
        {
            return flags;
        }
        public inline nothrow bool GetFlag(LexerFlags flag) const
        {
            return (flags & flag) != LexerFlags.none;
        }
        public inline nothrow void SetFlag(LexerFlags flag)
        {
            flags = cast<LexerFlags>(flags | flag);
        }
        public inline nothrow void ResetFlag(LexerFlags flag)
        {
            flags = cast<LexerFlags>(flags & ~flag);
        }
        public nothrow const List<int>& RuleContext() const
        {
            return ruleContext;
        }
        public nothrow const List<int>& FarthestRuleContext() const
        {
            return farthestRuleContext;
        }
        public nothrow void SetRuleNameListPtr(List<string>* ruleNameListPtr_)
        {
            ruleNameListPtr = ruleNameListPtr_;
        }
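        // GetParserStateStr() maps the rule ids recorded in farthestRuleContext to
        // rule names via ruleNameListPtr, producing a readable dump of the active
        // rules for farthest-error messages.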
        public nothrow string GetParserStateStr() const
        {
            string parserStateStr;
            long n = farthestRuleContext.Count();
            if (ruleNameListPtr != null && n > 0)
            {
                parserStateStr.Append("\nParser state:\n");
                for (long i = 0; i < n; ++i;)
                {
                    int ruleId = farthestRuleContext[i];
                    if (ruleId >= 0 && ruleId < ruleNameListPtr->Count())
                    {
                        string ruleName = (*ruleNameListPtr)[ruleId];
                        parserStateStr.Append(ruleName).Append("\n");
                    }
                }
            }
            return parserStateStr;
        }
        public void PushRule(int ruleId)
        {
            ruleContext.Add(ruleId);
        }
        public void PopRule()
        {
            ruleContext.RemoveLast();
        }
        protected Lexeme lexeme;
        protected int line;
        private ustring content;
        private string fileName;
        private int fileIndex;
        private KeywordMap* keywordMap;
        private const uchar* start;
        private const uchar* end;
        private const uchar* pos;
        private List<Token> tokens;
        private List<Token>.Iterator current;
        private List<UniquePtr<Exception>> errors;
        private List<int> syncTokens;
        private ParsingLog* log;
        private bool countLines;
        private uchar separatorChar;
        private Stack<Span> spanStack;
        private Span currentSpan;
        private Set<int> blockCommentStates;
        private int commentTokenId;
        private LexerFlags flags;
        private long farthestPos;
        private List<int> ruleContext;
        private List<int> farthestRuleContext;
        private List<string>* ruleNameListPtr;
    }
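    // LineStart() scans backward from p to the beginning of the line containing
    // it; LineEnd() scans forward to the end of that line. Both treat '\n' and
    // '\r' as line terminators.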
    public const uchar* LineStart(const uchar* start, const uchar* p)
    {
        while (p != start && *p != '\n' && *p != '\r')
        {
            --p;
        }
        if (p != start)
        {
            ++p;
        }
        return p;
    }
    public const uchar* LineEnd(const uchar* end, const uchar* p)
    {
        while (p != end && *p != '\n' && *p != '\r')
        {
            ++p;
        }
        return p;
    }
    public ustring GetErrorLines(const uchar* start, const uchar* end, const Span& externalSpan)
    {
        const uchar* startPos = start + externalSpan.start;
        if (startPos < start || startPos >= end)
        {
            return ustring();
        }
        const uchar* lineStart = LineStart(start, startPos);
        int cols = cast<int>(startPos - lineStart);
        if (cols < 0)
        {
            cols = 0;
        }
        const uchar* lineEnd = LineEnd(end, startPos);
        if (lineEnd < lineStart)
        {
            lineEnd = lineStart;
        }
        int lineLength = cast<int>(lineEnd - lineStart);
        ustring lines(lineStart, lineEnd);
        int spanCols = Max(cast<int>(1), Min(externalSpan.end - externalSpan.start, lineLength - cols));
        lines.Append('\n', 1).Append(ustring(' ', cols)).Append('^', spanCols);
        return lines;
    }
    public void GetColumns(const uchar* start, const uchar* end, const Span& externalSpan, int& startCol, int& endCol)
    {
        startCol = 0;
        endCol = 0;
        const uchar* startPos = start + externalSpan.start;
        if (startPos < start || startPos >= end)
        {
            return;
        }
        const uchar* lineStart = LineStart(start, startPos);
        int cols = cast<int>(startPos - lineStart);
        if (cols < 0)
        {
            cols = 0;
        }
        startCol = cols + 1;
        const uchar* lineEnd = LineEnd(end, startPos);
        if (lineEnd < lineStart)
        {
            lineEnd = lineStart;
        }
        int lineLength = cast<int>(lineEnd - lineStart);
        int spanCols = Max(cast<int>(1), Min(externalSpan.end - externalSpan.start, lineLength - cols));
        endCol = startCol + spanCols;
    }
    public void WriteBeginRuleToLog(Lexer& lexer, const ustring& ruleName)
    {
        lexer.Log()->WriteBeginRule(ruleName);
        lexer.Log()->IncIndent();
        lexer.Log()->WriteTry(lexer.RestOfLine(lexer.Log()->MaxLineLength()));
        lexer.Log()->IncIndent();
    }
    public void WriteSuccessToLog(Lexer& lexer, const Span& matchSpan, const ustring& ruleName)
    {
        lexer.Log()->DecIndent();
        lexer.Log()->WriteSuccess(lexer.GetMatch(matchSpan));
        lexer.Log()->DecIndent();
        lexer.Log()->WriteEndRule(ruleName);
    }
    public void WriteFailureToLog(Lexer& lexer, const ustring& ruleName)
    {
        lexer.Log()->DecIndent();
        lexer.Log()->WriteFail();
        lexer.Log()->DecIndent();
        lexer.Log()->WriteEndRule(ruleName);
    }

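    // RuleGuard pushes a rule id onto the lexer's rule context on construction and
    // pops it on destruction, so that the farthest-error machinery can report the
    // chain of rules active at the point of failure. A minimal usage sketch,
    // assuming a generated rule function and a hypothetical rule id constant:
    //
    //   public Match ParseExpression(Lexer& lexer)
    //   {
    //       RuleGuard guard(lexer, EXPRESSION_RULE_ID); // hypothetical rule id
    //       // ... parse, with the rule popped automatically on every exit path
    //   }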
    public class RuleGuard
    {
        public nothrow RuleGuard(Lexer& lexer_, int ruleId_) : lexer(lexer_)
        {
            lexer.PushRule(ruleId_);
        }
        public ~RuleGuard()
        {
            lexer.PopRule();
        }
        private Lexer& lexer;
    }

}