1 // =================================
  2 // Copyright (c) 2024 Seppo Laakko
  3 // Distributed under the MIT license
  4 // =================================
  5 
  6 using System.IO;
  7 using System.Collections;
  8 
  9 namespace System.Unicode
 10 {
 11     public class UnicodeEngine : IOBase
 12     {
 13         public UnicodeEngine() : resultReady(false)result('\0')state(0)
 14         {
 15         }
 16         [nodiscard]
 17         public Result<bool> Put(byte x)
 18         {
 19             switch (state)
 20             {
 21                 case 0:
 22                 {
 23                     resultReady = false;
 24                     if ((x & 128u) ==  0u)
 25                     {
 26                         result = cast<uchar>(x);
 27                         resultReady = true;
 28                     }
 29                     else if ((x & 224u) ==  192u )
 30                     {
 31                         bytes[0] = x;
 32                         state = 1;
 33                     }
 34                     else if ((x & 240u) ==  224u )
 35                     {
 36                         bytes[0] = x;
 37                         state = 2;
 38                     }
 39                     else if ((x & 248u) ==  240u )
 40                     {
 41                         bytes[0] = x;
 42                         state = 4;
 43                     }
 44                     else
 45                     {
 46                         int errorId = AllocateError("invalid UTF-8 sequence");
 47                         SetErrorId(errorId);
 48                         return Result<bool>(ErrorId(errorId));
 49                     }
 50                     break;
 51                 }
 52                 case 1:
 53                 {
 54                     result = cast<uchar>(0);
 55                     bytes[1] = x;
 56                     byte b1 = bytes[1];
 57                     if ((b1 & 192u) !=  128u )
 58                     {
 59                         int errorId = AllocateError("invalid UTF-8 sequence");
 60                         SetErrorId(errorId);
 61                         return Result<bool>(ErrorId(errorId));
 62                     }
 63                     byte shift = 0u;
 64                     for (byte i = 0u; i < 6u; ++i;)
 65                     {
 66                         byte bit = b1 & 1u;
 67                         b1 = b1 >> 1u;
 68                         result = cast<uchar>(cast<uint>(result) | (cast<uint>(bit) << shift));
 69                         ++shift;
 70                     }
 71                     byte b0 = bytes[0];
 72                     for (byte i = 0u; i < 5u; ++i;)
 73                     {
 74                         byte bit = b0 & 1u;
 75                         b0 = b0 >> 1u;
 76                         result = cast<uchar>(cast<uint>(result) | (cast<uint>(bit) << shift));
 77                         ++shift;
 78                     }
 79                     resultReady = true;
 80                     state = 0;
 81                     break;
 82                 }
 83                 case 2:
 84                 {
 85                     bytes[1] = x;
 86                     state = 3;
 87                     break;
 88                 }
 89                 case 3:
 90                 {
 91                     bytes[2] = x;
 92                     result = cast<uchar>(0);
 93                     byte b2 = bytes[2];
 94                     if ((b2 & 192u) !=  128u )
 95                     {
 96                         int errorId = AllocateError("invalid UTF-8 sequence");
 97                         SetErrorId(errorId);
 98                         return Result<bool>(ErrorId(errorId));
 99                     }
100                     byte shift = 0u;
101                     for (byte i = 0u; i < 6u; ++i;)
102                     {
103                         byte bit = b2 & 1u;
104                         b2 = b2 >> 1u;
105                         result = cast<uchar>(cast<uint>(result) | (cast<uint>(bit) << shift));
106                         ++shift;
107                     }
108                     byte b1 = bytes[1];
109                     if ((b1 & 192u) !=  128u )
110                     {
111                         int errorId = AllocateError("invalid UTF-8 sequence");
112                         SetErrorId(errorId);
113                         return Result<bool>(ErrorId(errorId));
114                     }
115                     for (byte i = 0u; i < 6u; ++i;)
116                     {
117                         byte bit = b1 & 1u;
118                         b1 = b1 >> 1u;
119                         result = cast<uchar>(cast<uint>(result) | (cast<uint>(bit) << shift));
120                         ++shift;
121                     }
122                     byte b0 = bytes[0];
123                     for (byte i = 0u; i < 4u; ++i;)
124                     {
125                         byte bit = b0 & 1u;
126                         b0 = b0 >> 1u;
127                         result = cast<uchar>(cast<uint>(result) | (cast<uint>(bit) << shift));
128                         ++shift;
129                     }
130                     resultReady = true;
131                     state = 0;
132                     break;
133                 }
134                 case 4:
135                 {
136                     bytes[1] = x;
137                     state = 5;
138                     break;
139                 }
140                 case 5:
141                 {
142                     bytes[2] = x;
143                     state = 6;
144                     break;
145                 }
146                 case 6:
147                 {
148                     bytes[3] = x;
149                     result = cast<uchar>(0);
150                     byte b3 = bytes[3];
151                     if ((b3 & 192u) !=  128u )
152                     {
153                         int errorId = AllocateError("invalid UTF-8 sequence");
154                         SetErrorId(errorId);
155                         return Result<bool>(ErrorId(errorId));
156                     }
157                     byte shift = 0u;
158                     for (byte i = 0u; i < 6u; ++i;)
159                     {
160                         byte bit = b3 & 1u;
161                         b3 = b3 >> 1u;
162                         result = cast<uchar>(cast<uint>(result) | (cast<uint>(bit) << shift));
163                         ++shift;
164                     }
165                     byte b2 = bytes[2];
166                     if ((b2 & 192u) !=  128u )
167                     {
168                         int errorId = AllocateError("invalid UTF-8 sequence");
169                         SetErrorId(errorId);
170                         return Result<bool>(ErrorId(errorId));
171                     }
172                     for (byte i = 0u; i < 6u; ++i;)
173                     {
174                         byte bit = b2 & 1u;
175                         b2 = b2 >> 1u;
176                         result = cast<uchar>(cast<uint>(result) | (cast<uint>(bit) << shift));
177                         ++shift;
178                     }
179                     byte b1 = bytes[1];
180                     if ((b1 & 192u) !=  128u )
181                     {
182                         int errorId = AllocateError("invalid UTF-8 sequence");
183                         SetErrorId(errorId);
184                         return Result<bool>(ErrorId(errorId));
185                     }
186                     for (byte i = 0u; i < 6u; ++i;)
187                     {
188                         byte bit = b1 & 1u;
189                         b1 = b1 >> 1u;
190                         result = cast<uchar>(cast<uint>(result) | (cast<uint>(bit) << shift));
191                         ++shift;
192                     }
193                     byte b0 = bytes[0];
194                     for (byte i = 0u; i < 3u; ++i;)
195                     {
196                         byte bit = b0 & 1u;
197                         b0 = b0 >> 1u;
198                         result = cast<uchar>(cast<uint>(result) | (cast<uint>(bit) << shift));
199                         ++shift;
200                     }
201                     resultReady = true;
202                     state = 0;
203                     break;
204                 }
205             }
206             return Result<bool>(true);
207         }
208         public inline bool ResultReady() const
209         {
210             return resultReady;
211         }
212         public uchar Get()
213         {
214             return result;
215         }
216         private bool resultReady;
217         private uchar result;
218         private int state;
219         private byte[4] bytes;
220     }