// =================================
// Copyright (c) 2024 Seppo Laakko
// Distributed under the MIT license
// =================================

using System;
using System.Collections;
using System.IO;
using System.Security;

namespace System.Lex
{
    [nodiscard]
    public Result<int*> GetClassMap(const string& classMapName)
    {
        int errorId = 0;
        int* classMap = RtmGetClassMap(classMapName.Chars(), errorId);
        if (errorId > 0)
        {
            return Result<int*>(ErrorId(errorId));
        }
        return Result<int*>(classMap);
    }
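    // Example use of GetClassMap (a sketch; "MyLangClassMap" is a hypothetical name,
    // generated lexers pass the class map name emitted by the lexer generator):
    //
    //     auto classMapResult = GetClassMap("MyLangClassMap");
    //     if (classMapResult.Error()) { /* propagate classMapResult.GetErrorId() */ }
    //     lexer.SetClassMap(classMapResult.Value());

    // Lexer tokenizes a UTF-32 character range [start_, end_) with a table-driven DFA:
    // a generated subclass overrides NextState() with the transition function (the base
    // NextState() returns -1, meaning no transition) and records token.id and token.match
    // in its accepting states.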
    public class Lexer : IOBase
    {
        public Lexer(const uchar* start_, const uchar* end_, const string& fileName_, const string& classMapName_) :
            content(), fileName(fileName_), line(1), keywordMap(null), start(start_), end(end_), pos(start),
            current(tokens.End()), log(null), countLines(true), separatorChar('\0'),
            commentTokenId(-1), farthestPos(GetPos()), classMapName(classMapName_), classMap(null), fileIndex(-1)
        {
            ComputeLineStarts();
        }
        suppress Lexer(const Lexer&);
        suppress void operator=(const Lexer&);
        public inline const string& ClassMapName()
        {
            return classMapName;
        }
        public void SetBlockCommentStates(const Set<int>& blockCommentStates_)
        {
            blockCommentStates = blockCommentStates_;
        }
        public const Set<int>& BlockCommentStates() const
        {
            return blockCommentStates;
        }
        public void SetCommentTokenId(int commentTokenId_)
        {
            commentTokenId = commentTokenId_;
        }
        protected virtual int GetCommentTokenId() const
        {
            return -1;
        }
        public inline long operator*() const
        {
            return current->id;
        }
        public inline void SetKeywordMap(KeywordMap* keywordMap_)
        {
            keywordMap = keywordMap_;
        }
        public inline KeywordMap* GetKeywordMap()
        {
            return keywordMap;
        }
        public void Retract()
        {
            token.match.end = pos;
        }
        public const string& FileName() const
        {
            return fileName;
        }
        public Span GetSpan(long pos) const
        {
            Token token = GetToken(pos);
            return Span(cast<int>(token.match.begin - start), cast<int>(token.match.Length()));
        }
        public inline int FileIndex() const
        {
            return fileIndex;
        }
        public void SetFileIndex(int fileIndex_)
        {
            fileIndex = fileIndex_;
        }
        public void SetLine(int line_)
        {
            line = line_;
        }
        public void SetCountLines(bool countLines_)
        {
            countLines = countLines_;
        }
        public Token token;
        public inline const uchar* Start() const
        {
            return start;
        }
        public inline const uchar* End() const
        {
            return end;
        }
        public inline const uchar* Pos() const
        {
            return pos;
        }
        public void SetLog(ParsingLog* log_)
        {
            log = log_;
        }
        public ParsingLog* Log() const
        {
            return log;
        }
        public void SetSeparatorChar(uchar separatorChar_)
        {
            separatorChar = separatorChar_;
        }
        [nodiscard]
        public Result<bool> operator++()
        {
            if (Error())
            {
                return Result<bool>(ErrorId(GetErrorId()));
            }
            if (current != tokens.End())
            {
                ++current;
            }
            if (current == tokens.End())
            {
                auto nextTokenResult = NextToken();
                if (nextTokenResult.Error())
                {
                    return nextTokenResult;
                }
            }
            else
            {
                line = current->line;
            }
            long p = GetPos();
            if (p > farthestPos)
            {
                farthestPos = p;
                farthestRuleContext = ruleContext;
            }
            return Result<bool>(true);
        }
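        // GetPos packs the current line number into the high 32 bits of a long and the
        // index of the current token into the low 32 bits; for example line 5 at token
        // index 3 encodes as (5 << 32) | 3. Because the line occupies the high bits,
        // packed positions compare in source order, which the farthestPos tracking in
        // operator++ relies on.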
        public inline long GetPos() const
        {
            int p = cast<int>(current - tokens.Begin());
            return (cast<long>(line) << 32) | cast<long>(p);
        }
        public inline void SetPos(long pos)
        {
            current = tokens.Begin() + cast<int>(pos);
            line = cast<int>(pos >> 32);
        }
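        // NextToken drives the DFA from state 0, feeding characters to NextState() until
        // the DFA rejects (-1). At that point token.id, presumably set by the generated
        // NextState() override in its accepting states, determines the outcome:
        // CONTINUE_TOKEN restarts scanning after skipped input, INVALID_TOKEN produces an
        // error, and any other id is appended to the token list. At end of input an
        // END_TOKEN sentinel is added.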
        [nodiscard]
        public Result<bool> NextToken()
        {
            if (Error())
            {
                return Result<bool>(ErrorId(GetErrorId()));
            }
            int state = 0;
            while (true)
            {
                uchar c = separatorChar;
                if (pos != end)
                {
                    c = *pos;
                }
                else if (c == '\0')
                {
                    break;
                }
                if (state == 0)
                {
                    lexeme.begin = pos;
                    token.id = INVALID_TOKEN;
                    token.line = line;
                }
                if (pos == end)
                {
                    lexeme.end = end;
                }
                else
                {
                    lexeme.end = pos + 1;
                }
                int prevState = state;
                state = NextState(state, c);
                if (state == -1)
                {
                    if (token.id == CONTINUE_TOKEN)
                    {
                        if (pos == end)
                        {
                            break;
                        }
                        else
                        {
                            pos = token.match.end;
                        }
                        state = 0;
                        continue;
                    }
                    else if (token.id == INVALID_TOKEN)
                    {
                        if (pos == end)
                        {
                            break;
                        }
                        else
                        {
                            auto utf8 = ToUtf8(ustring(c, 1));
                            if (utf8.Error())
                            {
                                SetErrorId(utf8.GetErrorId());
                                return Result<bool>(ErrorId(utf8.GetErrorId()));
                            }
                            string errorMessage = "System.Lex.Lexer.NextToken(): error: invalid character \'" + utf8.Value() + "\' in file \'" + fileName + "\' at line " + ToString(line) +
                                ", state=" + ToString(prevState) + ", class=" + ToString(GetClass(c));
                            int errorId = AllocateError(errorMessage);
                            SetErrorId(errorId);
                            return Result<bool>(ErrorId(errorId));
                        }
                    }
                    else
                    {
                        tokens.Add(token);
                        current = tokens.End() - 1;
                        pos = token.match.end;
                        return Result<bool>(true);
                    }
                }
                if (c == '\n' && countLines)
                {
                    ++line;
                }
                ++pos;
            }
            token.id = INVALID_TOKEN;
            state = NextState(state, '\0');
            long p = -1;
            if (token.id != INVALID_TOKEN && token.id != CONTINUE_TOKEN)
            {
                tokens.Add(token);
                current = tokens.End() - 1;
                p = GetPos();
            }
            Token endToken(END_TOKEN);
            endToken.match.begin = end;
            endToken.match.end = end;
            tokens.Add(endToken);
            if (p == -1)
            {
                current = tokens.End() - 1;
                p = GetPos();
            }
            SetPos(p);
            return Result<bool>(true);
        }
        public long GetKeywordToken(const Lexeme& lexeme) const
        {
            if (keywordMap != null)
            {
                return keywordMap->GetKeywordToken(lexeme);
            }
            else
            {
                return INVALID_TOKEN;
            }
        }
        public Token GetToken(long pos)
        {
            int tokenIndex = cast<int>(pos);
            #assert(tokenIndex >= 0 && tokenIndex < tokens.Count());
            return tokens[tokenIndex];
        }
        public ustring GetMatch(long pos)
        {
            Token token = GetToken(pos);
            return token.match.ToString();
        }
        public char GetChar(long pos)
        {
            Token token = GetToken(pos);
            return cast<char>(*token.match.begin);
        }
        public wchar GetWChar(long pos)
        {
            Token token = GetToken(pos);
            return cast<wchar>(*token.match.begin);
        }
        public uchar GetUChar(long pos)
        {
            Token token = GetToken(pos);
            return *token.match.begin;
        }
        [nodiscard]
        public Result<int> GetInt(long pos)
        {
            Token token = GetToken(pos);
            auto utf8 = ToUtf8(token.match.ToString());
            if (utf8.Error())
            {
                SetErrorId(utf8.GetErrorId());
                return Result<int>(ErrorId(utf8.GetErrorId()));
            }
            return ParseInt(utf8.Value());
        }
        [nodiscard]
        public Result<double> GetDouble(long pos)
        {
            Token token = GetToken(pos);
            auto utf8 = ToUtf8(token.match.ToString());
            if (utf8.Error())
            {
                SetErrorId(utf8.GetErrorId());
                return Result<double>(ErrorId(utf8.GetErrorId()));
            }
            return ParseDouble(utf8.Value());
        }
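        // SetTokens installs an externally produced token list. The front token is added
        // twice, apparently so that 'current' can start on the duplicate and the first
        // operator++ lands on the real first token; an END_TOKEN sentinel terminates the
        // list.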
        public void SetTokens(const List<Token>& tokens_)
        {
            if (!tokens_.IsEmpty())
            {
                tokens.Add(tokens_.Front());
            }
            else
            {
                tokens.Add(Token(END_TOKEN, Lexeme(end, end), 1));
            }
            for (const Token& token : tokens_)
            {
                tokens.Add(token);
            }
            tokens.Add(Token(END_TOKEN, Lexeme(end, end), 1));
            current = tokens.Begin();
        }
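        // ErrorLines renders the source line containing the token at 'pos' followed by a
        // caret line marking the token's extent, for example (hypothetical input):
        //
        //     let x = @;
        //             ^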
        public ustring ErrorLines(long pos)
        {
            Token token = GetToken(pos);
            ustring lines;
            const uchar* lineStart = LineStart(start, token.match.begin);
            const uchar* lineEnd = LineEnd(end, token.match.end);
            lines.Append(ustring(lineStart, token.match.begin));
            lines.Append(token.match.ToString());
            lines.Append(ustring(token.match.end, lineEnd));
            lines.Append('\n', 1);
            lines.Append(' ', token.match.begin - lineStart);
            lines.Append('^', Max(cast<long>(1), token.match.end - token.match.begin));
            lines.Append(' ', lineEnd - token.match.end);
            lines.Append('\n', 1);
            return lines;
        }
        public ErrorId GetFarthestError()
        {
            ustring errorLines = ErrorLines(farthestPos);
            auto utf8 = ToUtf8(errorLines);
            if (utf8.Error())
            {
                SetErrorId(utf8.GetErrorId());
                return ErrorId(utf8.GetErrorId());
            }
            Console.Out() << utf8.Value() << endl();
            string parserStateStr = GetParserStateStr();
            string errorMessage = "parsing error at \'" + fileName + ":" + ToString(token.line) + "\':\n" + utf8.Value() + parserStateStr;
            int errorId = AllocateError(errorMessage);
            return ErrorId(errorId);
        }
        public ustring RestOfLine(int maxLineLength)
        {
            ustring restOfLine(current->match.ToString() + ustring(current->match.end, pos) + ustring(pos, LineEnd(end, pos)));
            if (maxLineLength != 0)
            {
                restOfLine = restOfLine.Substring(0, maxLineLength);
            }
            return restOfLine;
        }
        public virtual int NextState(int state, uchar c)
        {
            return -1;
        }
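        // TokenizeLine tokenizes a single line starting from DFA state 'startState' and
        // returns the tokens together with the end state, so a caller (for example an
        // editor highlighting one line at a time) can carry lexer state from line to
        // line. An end state in blockCommentStates means a block comment continues onto
        // the next line, and the rest of the line is emitted as one comment token.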
        public TokenLine TokenizeLine(const ustring& line, int lineNumber, int startState)
        {
            pos = line.Chars();
            end = line.Chars() + line.Length();
            TokenLine tokenLine;
            tokenLine.startState = startState;
            lexeme.begin = pos;
            lexeme.end = end;
            token.match = lexeme;
            token.id = INVALID_TOKEN;
            token.line = lineNumber;
            int state = startState;
            while (pos != end)
            {
                uchar c = *pos;
                if (state == 0)
                {
                    lexeme.begin = pos;
                    token.id = INVALID_TOKEN;
                    token.line = lineNumber;
                }
                lexeme.end = pos + 1;
                int prevState = state;
                state = NextState(state, c);
                if (state == -1)
                {
                    if (prevState == 0)
                    {
                        break;
                    }
                    state = 0;
                    pos = token.match.end;
                    tokenLine.tokens.Add(token);
                    lexeme.begin = lexeme.end;
                }
                else
                {
                    ++pos;
                }
            }
            if (state != 0 && state != -1)
            {
                state = NextState(state, '\r');
            }
            if (state != 0 && state != -1)
            {
                state = NextState(state, '\n');
            }
            if (state != 0 && state != -1)
            {
                if (blockCommentStates.CFind(state) != blockCommentStates.CEnd())
                {
                    token.id = commentTokenId;
                    token.match.end = end;
                    tokenLine.tokens.Add(token);
                    tokenLine.endState = state;
                    return tokenLine;
                }
            }
            if (lexeme.begin != lexeme.end)
            {
                token.match = lexeme;
                tokenLine.tokens.Add(token);
            }
            if (state == -1)
            {
                state = 0;
            }
            tokenLine.endState = state;
            return tokenLine;
        }
        public const List<long>& RuleContext() const
        {
            return ruleContext;
        }
        public const List<long>& FarthestRuleContext() const
        {
            return farthestRuleContext;
        }
        public void SetRuleNameMapPtr(Map<long, string>* ruleNameMapPtr_)
        {
            ruleNameMapPtr = ruleNameMapPtr_;
        }
        public string GetParserStateStr() const
        {
            string parserStateStr;
            long n = farthestRuleContext.Count();
            if (ruleNameMapPtr != null && n > 0)
            {
                parserStateStr.Append("\nParser state:\n");
                for (long i = 0; i < n; ++i)
                {
                    long ruleId = farthestRuleContext[i];
                    auto it = ruleNameMapPtr->CFind(ruleId);
                    if (it != ruleNameMapPtr->CEnd())
                    {
                        string ruleName = it->second;
                        parserStateStr.Append(ruleName).Append("\n");
                    }
                }
            }
            return parserStateStr;
        }
        public void PushRule(long ruleId)
        {
            ruleContext.Add(ruleId);
        }
        public void PopRule()
        {
            ruleContext.RemoveLast();
        }
        public List<int> GetLineStartIndeces() const
        {
            List<int> lineStartIndeces;
            for (long i = 0; i < lineStarts.Count(); ++i)
            {
                lineStartIndeces.Add(cast<int>(lineStarts[i] - start));
            }
            return lineStartIndeces;
        }
        public void SetClassMap(int* classMap_)
        {
            classMap = classMap_;
        }
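        // GetClass maps a character to its DFA character class via the class map;
        // 1114112 == 0x110000 is one past the largest Unicode code point, so the map
        // covers the entire code space.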
        public int GetClass(uchar c) const
        {
            if (classMap != null)
            {
                int i = cast<int>(c);
                if (i < 1114112)
                {
                    return classMap[i];
                }
            }
            return -1;
        }
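        // ComputeLineStarts records a pointer to the start of each line; the first start
        // is added twice, apparently so that lineStarts can be indexed with 1-based line
        // numbers.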
        private void ComputeLineStarts()
        {
            lineStarts.Add(pos);
            const uchar* p = pos;
            bool startOfLine = true;
            while (p != end)
            {
                if (startOfLine)
                {
                    lineStarts.Add(p);
                }
                startOfLine = *p == '\n';
                ++p;
            }
            lineStarts.Add(end);
        }
        protected Lexeme lexeme;
        protected int line;
        private ustring content;
        private string fileName;
        private KeywordMap* keywordMap;
        private const uchar* start;
        private const uchar* end;
        private const uchar* pos;
        private List<Token> tokens;
        private List<Token>.Iterator current;
        private ParsingLog* log;
        private bool countLines;
        private uchar separatorChar;
        private Set<int> blockCommentStates;
        private int commentTokenId;
        private long farthestPos;
        private List<long> ruleContext;
        private List<long> farthestRuleContext;
        private Map<long, string>* ruleNameMapPtr;
        private List<const uchar*> lineStarts;
        private string classMapName;
        private int* classMap;
        private int fileIndex;
    }

    public const uchar* LineStart(const uchar* start, const uchar* p)
    {
        while (p != start && *p != '\n' && *p != '\r')
        {
            --p;
        }
        if (p != start)
        {
            ++p;
        }
        return p;
    }

    public const uchar* LineEnd(const uchar* end, const uchar* p)
    {
        while (p != end && *p != '\n' && *p != '\r')
        {
            ++p;
        }
        return p;
    }
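
    // The Write*ToLog helpers below assume that lexer.Log() returns a non-null
    // ParsingLog; callers attach one with Lexer.SetLog() before parsing with logging
    // enabled.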
    [nodiscard]
    public Result<bool> WriteBeginRuleToLog(Lexer& lexer, const ustring& ruleName)
    {
        auto result0 = lexer.Log()->WriteBeginRule(ruleName);
        if (result0.Error())
        {
            return result0;
        }
        lexer.Log()->IncIndent();
        auto result1 = lexer.Log()->WriteTry(lexer.RestOfLine(lexer.Log()->MaxLineLength()));
        if (result1.Error())
        {
            return result1;
        }
        lexer.Log()->IncIndent();
        return Result<bool>(true);
    }

    [nodiscard]
    public Result<bool> WriteSuccessToLog(Lexer& lexer, long pos, const ustring& ruleName)
    {
        lexer.Log()->DecIndent();
        ustring match = lexer.GetMatch(pos);
        auto result0 = lexer.Log()->WriteSuccess(match);
        if (result0.Error())
        {
            return Result<bool>(ErrorId(result0.GetErrorId()));
        }
        lexer.Log()->DecIndent();
        return lexer.Log()->WriteEndRule(ruleName);
    }

    [nodiscard]
    public Result<bool> WriteFailureToLog(Lexer& lexer, const ustring& ruleName)
    {
        lexer.Log()->DecIndent();
        auto result0 = lexer.Log()->WriteFail();
        if (result0.Error())
        {
            return Result<bool>(ErrorId(result0.GetErrorId()));
        }
        lexer.Log()->DecIndent();
        return lexer.Log()->WriteEndRule(ruleName);
    }
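
    // RuleGuard maintains the lexer's rule context with RAII: the rule id is pushed on
    // construction and popped on destruction, so that GetParserStateStr() can report the
    // rule stack at the farthest parse position. A sketch of typical use inside a parser
    // rule function (MY_RULE_ID is a hypothetical rule id constant):
    //
    //     RuleGuard<Lexer> ruleGuard(lexer, MY_RULE_ID);
    //     // ... parse the rule; the id is popped automatically on scope exit.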
    public class RuleGuard<LexerT>
    {
        public RuleGuard(LexerT& lexer_, long ruleId_) : lexer(lexer_)
        {
            lexer.PushRule(ruleId_);
        }
        public ~RuleGuard()
        {
            lexer.PopRule();
        }
        private LexerT& lexer;
    }
}