// =================================
// Copyright (c) 2021 Seppo Laakko
// Distributed under the MIT license
// =================================
  5 
  6 using System.Collections;
  7 
  8 namespace System.Unicode
  9 {
 10     public class UnicodeEngine
 11     {
 12         public nothrow UnicodeEngine() : resultReady(false)result('\0')state(0)
 13         {
 14         }
 15         public void Put(byte x)
 16         {
 17             switch (state)
 18             {
 19                 case 0:
 20                 {
 21                     resultReady = false;
 22                     if ((x & 0x80u) == 0u)
 23                     {
 24                         result = cast<uchar>(x);
 25                         resultReady = true;
 26                     }
 27                     else if ((x & 0xE0u) == 0xC0u)
 28                     {
 29                         bytes[0] = x;
 30                         state = 1;
 31                     }
 32                     else if ((x & 0xF0u) == 0xE0u)
 33                     {
 34                         bytes[0] = x;
 35                         state = 2;
 36                     }
 37                     else if ((x & 0xF8u) == 0xF0u)
 38                     {
 39                         bytes[0] = x;
 40                         state = 4;
 41                     }
 42                     else
 43                     {
 44                         throw UnicodeException("invalid UTF-8 sequence");
 45                     }
 46                     break;
 47                 }
 48                 case 1:
 49                 {
 50                     result = cast<uchar>(0);
 51                     bytes[1] = x;
 52                     byte b1 = bytes[1];
 53                     if ((b1 & 0xC0u) != 0x80u)
 54                     {
 55                         throw UnicodeException("invalid UTF-8 sequence");
 56                     }
 57                     byte shift = 0u;
 58                     for (byte i = 0u; i < 6u; ++i;)
 59                     {
 60                         byte bit = b1 & 1u;
 61                         b1 = b1 >> 1u;
 62                         result = cast<uchar>(cast<uint>(result) | (cast<uint>(bit) << shift));
 63                         ++shift;
 64                     }
 65                     byte b0 = bytes[0];
 66                     for (byte i = 0u; i < 5u; ++i;)
 67                     {
 68                         byte bit = b0 & 1u;
 69                         b0 = b0 >> 1u;
 70                         result = cast<uchar>(cast<uint>(result) | (cast<uint>(bit) << shift));
 71                         ++shift;
 72                     }
 73                     resultReady = true;
 74                     state = 0;
 75                     break;
 76                 }
 77                 case 2:
 78                 {
 79                     bytes[1] = x;
 80                     state = 3;
 81                     break;
 82                 }
 83                 case 3:
 84                 {
 85                     bytes[2] = x;
 86                     result = cast<uchar>(0);
 87                     byte b2 = bytes[2];
 88                     if ((b2 & 0xC0u) != 0x80u)
 89                     {
 90                         throw UnicodeException("invalid UTF-8 sequence");
 91                     }
 92                     byte shift = 0u;
 93                     for (byte i = 0u; i < 6u; ++i;)
 94                     {
 95                         byte bit = b2 & 1u;
 96                         b2 = b2 >> 1u;
 97                         result = cast<uchar>(cast<uint>(result) | (cast<uint>(bit) << shift));
 98                         ++shift;
 99                     }
100                     byte b1 = bytes[1];
101                     if ((b1 & 0xC0u) != 0x80u)
102                     {
103                         throw UnicodeException("invalid UTF-8 sequence");
104                     }
105                     for (byte i = 0u; i < 6u; ++i;)
106                     {
107                         byte bit = b1 & 1u;
108                         b1 = b1 >> 1u;
109                         result = cast<uchar>(cast<uint>(result) | (cast<uint>(bit) << shift));
110                         ++shift;
111                     }
112                     byte b0 = bytes[0];
113                     for (byte i = 0u; i < 4u; ++i;)
114                     {
115                         byte bit = b0 & 1u;
116                         b0 = b0 >> 1u;
117                         result = cast<uchar>(cast<uint>(result) | (cast<uint>(bit) << shift));
118                         ++shift;
119                     }
120                     resultReady = true;
121                     state = 0;
122                     break;
123                 }
124                 case 4:
125                 {
126                     bytes[1] = x;
127                     state = 5;
128                     break;
129                 }
130                 case 5:
131                 {
132                     bytes[2] = x;
133                     state = 6;
134                     break;
135                 }
136                 case 6:
137                 {
138                     bytes[3] = x;
139                     result = cast<uchar>(0);
140                     byte b3 = bytes[3];
141                     if ((b3 & 0xC0u) != 0x80u)
142                     {
143                         throw UnicodeException("invalid UTF-8 sequence");
144                     }
145                     byte shift = 0u;
146                     for (byte i = 0u; i < 6u; ++i;)
147                     {
148                         byte bit = b3 & 1u;
149                         b3 = b3 >> 1u;
150                         result = cast<uchar>(cast<uint>(result) | (cast<uint>(bit) << shift));
151                         ++shift;
152                     }
153                     byte b2 = bytes[2];
154                     if ((b2 & 0xC0u) != 0x80u)
155                     {
156                         throw UnicodeException("invalid UTF-8 sequence");
157                     }
158                     for (byte i = 0u; i < 6u; ++i;)
159                     {
160                         byte bit = b2 & 1u;
161                         b2 = b2 >> 1u;
162                         result = cast<uchar>(cast<uint>(result) | (cast<uint>(bit) << shift));
163                         ++shift;
164                     }
165                     byte b1 = bytes[1];
166                     if ((b1 & 0xC0u) != 0x80u)
167                     {
168                         throw UnicodeException("invalid UTF-8 sequence");
169                     }
170                     for (byte i = 0u; i < 6u; ++i;)
171                     {
172                         byte bit = b1 & 1u;
173                         b1 = b1 >> 1u;
174                         result = cast<uchar>(cast<uint>(result) | (cast<uint>(bit) << shift));
175                         ++shift;
176                     }
177                     byte b0 = bytes[0];
178                     for (byte i = 0u; i < 3u; ++i;)
179                     {
180                         byte bit = b0 & 1u;
181                         b0 = b0 >> 1u;
182                         result = cast<uchar>(cast<uint>(result) | (cast<uint>(bit) << shift));
183                         ++shift;
184                     }
185                     resultReady = true;
186                     state = 0;
187                     break;
188                 }
189             }
190         }
191         public inline nothrow bool ResultReady() const
192         {
193             return resultReady;
194         }
195         public nothrow uchar Get()
196         {
197             return result;
198         }
199         private bool resultReady;
200         private uchar result;
201         private int state;
202         private byte[4] bytes;
203     }
204 }