1 // =================================
  2 // Copyright (c) 2021 Seppo Laakko
  3 // Distributed under the MIT license
  4 // =================================
  5 
  6 using System.Collections;
  7 using System.Unicode;
  8 
  9 namespace System
 10 {
 11     public ustring ToUtf32(const string& utf8Str)
 12     {
 13         ustring result;
 14         const char* p = utf8Str.Chars();
 15         long bytesRemaining = utf8Str.Length();
 16         while (bytesRemaining > 0)
 17         {
 18             char c = *p;
 19             byte x = cast<byte>(c);
 20             if ((x & 0x80u) == 0u)
 21             {
 22                 result.Append(cast<uchar>(cast<uint>(x)));
 23                 --bytesRemaining;
 24                 ++p;
 25             }
 26             else if ((x & 0xE0u) == 0xC0u)
 27             {
 28                 if (bytesRemaining < 2)
 29                 {
 30                     ThrowConversionException("invalid UTF-8 sequence");
 31                 }
 32                 uchar u = cast<uchar>(cast<uint>(0u));
 33                 byte b1 = cast<byte>(p[1]);
 34                 if ((b1 & 0xC0u) != 0x80u)
 35                 {
 36                     ThrowConversionException("invalid UTF-8 sequence");
 37                 }
 38                 byte shift = 0u;
 39                 for (byte i = 0u; i < 6u; ++i;)
 40                 {
 41                     byte bit = b1 & 1u;
 42                     b1 = b1 >> 1u;
 43                     u = cast<uchar>(cast<uint>(u) | (cast<uint>(bit) << shift));
 44                     ++shift;
 45                 }
 46                 byte b0 = x;
 47                 for (byte i = 0u; i < 5u; ++i;)
 48                 {
 49                     byte bit = b0 & 1u;
 50                     b0 = b0 >> 1u;
 51                     u = cast<uchar>(cast<uint>(u) | (cast<uint>(bit) << shift));
 52                     ++shift;
 53                 }
 54                 result.Append(u);
 55                 bytesRemaining = bytesRemaining - 2;
 56                 p = p + 2;
 57             }
 58             else if ((x & 0xF0u) == 0xE0u)
 59             {
 60                 if (bytesRemaining < 3)
 61                 {
 62                     ThrowConversionException("invalid UTF-8 sequence");
 63                 }
 64                 uchar u = cast<uchar>(cast<uint>(0u));
 65                 byte b2 = cast<byte>(p[2]);
 66                 if ((b2 & 0xC0u) != 0x80u)
 67                 {
 68                     ThrowConversionException("invalid UTF-8 sequence");
 69                 }
 70                 byte shift = 0u;
 71                 for (byte i = 0u; i < 6u; ++i;)
 72                 {
 73                     byte bit = b2 & 1u;
 74                     b2 = b2 >> 1u;
 75                     u = cast<uchar>(cast<uint>(u) | (cast<uint>(bit) << shift));
 76                     ++shift;
 77                 }
 78                 byte b1 = cast<byte>(p[1]);
 79                 if ((b1 & 0xC0u) != 0x80u)
 80                 {
 81                     ThrowConversionException("invalid UTF-8 sequence");
 82                 }
 83                 for (byte i = 0u; i < 6u; ++i;)
 84                 {
 85                     byte bit = b1 & 1u;
 86                     b1 = b1 >> 1u;
 87                     u = cast<uchar>(cast<uint>(u) | (cast<uint>(bit) << shift));
 88                     ++shift;
 89                 }
 90                 byte b0 = x;
 91                 for (byte i = 0u; i < 4u; ++i;)
 92                 {
 93                     byte bit = b0 & 1u;
 94                     b0 = b0 >> 1u;
 95                     u = cast<uchar>(cast<uint>(u) | (cast<uint>(bit) << shift));
 96                     ++shift;
 97                 }
 98                 result.Append(u);
 99                 bytesRemaining = bytesRemaining - 3;
100                 p = p + 3;
101             }
102             else if ((x & 0xF8u) == 0xF0u)
103             {
104                 if (bytesRemaining < 4)
105                 {
106                     ThrowConversionException("invalid UTF-8 sequence");
107                 }
108                 uchar u = cast<uchar>(cast<uint>(0u));
109                 byte b3 = cast<byte>(p[3]);
110                 if ((b3 & 0xC0u) != 0x80u)
111                 {
112                     ThrowConversionException("invalid UTF-8 sequence");
113                 }
114                 byte shift = 0u;
115                 for (byte i = 0u; i < 6u; ++i;)
116                 {
117                     byte bit = b3 & 1u;
118                     b3 = b3 >> 1u;
119                     u = cast<uchar>(cast<uint>(u) | (cast<uint>(bit) << shift));
120                     ++shift;
121                 }
122                 byte b2 = cast<byte>(p[2]);
123                 if ((b2 & 0xC0u) != 0x80u)
124                 {
125                     ThrowConversionException("invalid UTF-8 sequence");
126                 }
127                 for (byte i = 0u; i < 6u; ++i;)
128                 {
129                     byte bit = b2 & 1u;
130                     b2 = b2 >> 1u;
131                     u = cast<uchar>(cast<uint>(u) | (cast<uint>(bit) << shift));
132                     ++shift;
133                 }
134                 byte b1 = cast<byte>(p[1]);
135                 if ((b1 & 0xC0u) != 0x80u)
136                 {
137                     ThrowConversionException("invalid UTF-8 sequence");
138                 }
139                 for (byte i = 0u; i < 6u; ++i;)
140                 {
141                     byte bit = b1 & 1u;
142                     b1 = b1 >> 1u;
143                     u = cast<uchar>(cast<uint>(u) | (cast<uint>(bit) << shift));
144                     ++shift;
145                 }
146                 byte b0 = x;
147                 for (byte i = 0u; i < 3u; ++i;)
148                 {
149                     byte bit = b0 & 1u;
150                     b0 = b0 >> 1u;
151                     u = cast<uchar>(cast<uint>(u) | (cast<uint>(bit) << shift));
152                     ++shift;
153                 }
154                 result.Append(u);
155                 bytesRemaining = bytesRemaining - 4;
156                 p = p + 4;
157             }
158             else
159             {
160                 ThrowConversionException("invalid UTF-8 sequence");
161             }
162         }
163         return result;
164     }
165     
// Decodes a UTF-16 string into UTF-32.
//
// Code units outside 0xD800..0xDFFF are appended unchanged. A unit in
// 0xD800..0xDBFF must be followed by a unit in 0xDC00..0xDFFF; the pair is
// combined as ((w1 & 0x3FF) << 10 | (w2 & 0x3FF)) + 0x10000.
//
// ThrowConversionException("invalid UTF-16 sequence") is called for a unit in
// 0xDC00..0xDFFF that does not follow a leading unit, for a leading unit at the
// end of the input, and for a leading unit followed by anything outside
// 0xDC00..0xDFFF.
public ustring ToUtf32(const wstring& utf16Str)
{
    ustring result;
    const wchar* w = utf16Str.Chars();
    long remaining = utf16Str.Length();
    while (remaining > 0)
    {
        wchar w1 = *w++;
        --remaining;
        ushort lead = cast<ushort>(w1);
        if (lead >= 0xD800u && lead <= 0xDFFFu)
        {
            // Inside this branch lead is already >= 0xD800, so only the upper
            // bound distinguishes a leading unit from a trailing one.
            if (lead > 0xDBFFu)
            {
                ThrowConversionException("invalid UTF-16 sequence");
            }
            if (remaining > 0)
            {
                wchar w2 = *w++;
                --remaining;
                ushort trail = cast<ushort>(w2);
                if (trail >= 0xDC00u && trail <= 0xDFFFu)
                {
                    uchar uprime = cast<uchar>(((0x03FFu & cast<uint>(w1)) << 10u) | (0x03FFu & cast<uint>(w2)));
                    uchar u = cast<uchar>(cast<uint>(uprime) + 0x10000u);
                    result.Append(u);
                }
                else
                {
                    ThrowConversionException("invalid UTF-16 sequence");
                }
            }
            else
            {
                ThrowConversionException("invalid UTF-16 sequence");
            }
        }
        else
        {
            result.Append(w1);
        }
    }
    return result;
}
208 
// Encodes a UTF-32 string as UTF-16.
//
// Code points below 0x10000 become a single code unit. Code points in
// 0x10000..0x10FFFF become two units: after subtracting 0x10000, the upper ten
// bits are OR-ed into 0xD800 and the lower ten bits into 0xDC00.
//
// ThrowConversionException is called with "invalid UTF-32 code point" for values
// above 0x10FFFF and with "invalid UTF-32 code point (reserved for UTF-16)" for
// values in 0xD800..0xDFFF.
public wstring ToUtf16(const ustring& utf32Str)
{
    wstring result;
    for (uchar u : utf32Str)
    {
        uint codePoint = cast<uint>(u);
        if (codePoint > 0x10FFFFu)
        {
            ThrowConversionException("invalid UTF-32 code point");
        }
        if (codePoint < 0x10000u)
        {
            if (codePoint >= 0xD800u && codePoint <= 0xDFFFu)
            {
                ThrowConversionException("invalid UTF-32 code point (reserved for UTF-16)");
            }
            wchar x = cast<wchar>(u);
            result.Append(x);
        }
        else
        {
            // uprime is at most 0xFFFFF here, i.e. exactly 20 significant bits.
            uint uprime = codePoint - 0x10000u;
            wchar w1 = cast<wchar>(cast<ushort>(0xD800u | ((uprime >> 10u) & 0x03FFu)));
            wchar w2 = cast<wchar>(cast<ushort>(0xDC00u | (uprime & 0x03FFu)));
            result.Append(w1);
            result.Append(w2);
        }
    }
    return result;
}
248 
// Converts UTF-8 to UTF-16 by first decoding to UTF-32 and then re-encoding.
public wstring ToUtf16(const string& utf8Str)
{
    ustring utf32 = ToUtf32(utf8Str);
    return ToUtf16(utf32);
}
253 
// Encodes a UTF-32 string as UTF-8.
//
// Sequence length is chosen by magnitude: below 0x80 one byte, below 0x800 two,
// below 0x10000 three, below 0x110000 four. The lead byte carries the marker
// 0xC0 / 0xE0 / 0xF0 plus the top payload bits; each following byte is 0x80 plus
// six payload bits.
//
// ThrowConversionException("invalid UTF-32 code point") is called for values of
// 0x110000 and above.
//
// NOTE(review): values in 0xD800..0xDFFF are encoded as three bytes rather than
// rejected, unlike the UTF-32 to UTF-16 conversion in this file; that is the
// pre-existing behaviour and is kept unchanged here.
public string ToUtf8(const ustring& utf32Str)
{
    string result;
    for (uchar c : utf32Str)
    {
        uint x = cast<uint>(c);
        if (x < 0x80u)
        {
            result.Append(cast<char>(x & 0x7Fu));
        }
        else if (x < 0x800u)
        {
            byte b0 = cast<byte>(0xC0u | ((x >> 6u) & 0x1Fu));
            byte b1 = cast<byte>(0x80u | (x & 0x3Fu));
            result.Append(cast<char>(b0));
            result.Append(cast<char>(b1));
        }
        else if (x < 0x10000u)
        {
            byte b0 = cast<byte>(0xE0u | ((x >> 12u) & 0x0Fu));
            byte b1 = cast<byte>(0x80u | ((x >> 6u) & 0x3Fu));
            byte b2 = cast<byte>(0x80u | (x & 0x3Fu));
            result.Append(cast<char>(b0));
            result.Append(cast<char>(b1));
            result.Append(cast<char>(b2));
        }
        else if (x < 0x110000u)
        {
            byte b0 = cast<byte>(0xF0u | ((x >> 18u) & 0x07u));
            byte b1 = cast<byte>(0x80u | ((x >> 12u) & 0x3Fu));
            byte b2 = cast<byte>(0x80u | ((x >> 6u) & 0x3Fu));
            byte b3 = cast<byte>(0x80u | (x & 0x3Fu));
            result.Append(cast<char>(b0));
            result.Append(cast<char>(b1));
            result.Append(cast<char>(b2));
            result.Append(cast<char>(b3));
        }
        else
        {
            ThrowConversionException("invalid UTF-32 code point");
        }
    }
    return result;
}
343 
// Converts UTF-16 to UTF-8 by first decoding to UTF-32 and then re-encoding.
public string ToUtf8(const wstring& utf16Str)
{
    ustring utf32 = ToUtf32(utf16Str);
    return ToUtf8(utf32);
}
348     
// True when the general category reported for c is exactly GeneralCategoryId.Lu.
public bool IsUpperLetter(uchar c)
{
    bool matches = GetGeneralCategory(c) == GeneralCategoryId.Lu;
    return matches;
}
353 
// True when the general category reported for c is exactly GeneralCategoryId.Ll.
public bool IsLowerLetter(uchar c)
{
    bool matches = GetGeneralCategory(c) == GeneralCategoryId.Ll;
    return matches;
}
358     
// True when the general category reported for c is exactly GeneralCategoryId.Lt.
public bool IsTitleLetter(uchar c)
{
    bool matches = GetGeneralCategory(c) == GeneralCategoryId.Lt;
    return matches;
}
363     
// True when the general category reported for c is exactly GeneralCategoryId.Lm.
public bool IsModifierLetter(uchar c)
{
    bool matches = GetGeneralCategory(c) == GeneralCategoryId.Lm;
    return matches;
}
368 
// True when the general category reported for c is exactly GeneralCategoryId.Lo.
public bool IsOtherLetter(uchar c)
{
    bool matches = GetGeneralCategory(c) == GeneralCategoryId.Lo;
    return matches;
}
373     
// Tests c against GeneralCategoryId.LC through HasGeneralCategory.
// NOTE(review): assumes HasGeneralCategory takes (uchar, GeneralCategoryId); the
// previous text fused both arguments into the single token "cGeneralCategoryId".
public bool IsCasedLetter(uchar c)
{
    return HasGeneralCategory(c, GeneralCategoryId.LC);
}
378 
// Tests c against GeneralCategoryId.L through HasGeneralCategory.
// NOTE(review): assumes HasGeneralCategory takes (uchar, GeneralCategoryId); the
// previous text fused both arguments into the single token "cGeneralCategoryId".
public bool IsLetter(uchar c)
{
    return HasGeneralCategory(c, GeneralCategoryId.L);
}
383                                                 
// True when the general category reported for c is exactly GeneralCategoryId.Mn.
public bool IsNonspacingMark(uchar c)
{
    bool matches = GetGeneralCategory(c) == GeneralCategoryId.Mn;
    return matches;
}
388     
// True when the general category reported for c is exactly GeneralCategoryId.Mc.
public bool IsSpacingMark(uchar c)
{
    bool matches = GetGeneralCategory(c) == GeneralCategoryId.Mc;
    return matches;
}
393 
// True when the general category reported for c is exactly GeneralCategoryId.Me.
public bool IsEnclosingMark(uchar c)
{
    bool matches = GetGeneralCategory(c) == GeneralCategoryId.Me;
    return matches;
}
398         
// Tests c against GeneralCategoryId.M through HasGeneralCategory.
// NOTE(review): assumes HasGeneralCategory takes (uchar, GeneralCategoryId); the
// previous text fused both arguments into the single token "cGeneralCategoryId".
public bool IsMark(uchar c)
{
    return HasGeneralCategory(c, GeneralCategoryId.M);
}
403 
// True when the general category reported for c is exactly GeneralCategoryId.Nd.
public bool IsDecimalNumber(uchar c)
{
    bool matches = GetGeneralCategory(c) == GeneralCategoryId.Nd;
    return matches;
}
408     
// True when the general category reported for c is exactly GeneralCategoryId.Nl.
public bool IsLetterNumber(uchar c)
{
    bool matches = GetGeneralCategory(c) == GeneralCategoryId.Nl;
    return matches;
}
413 
// True when the general category reported for c is exactly GeneralCategoryId.No.
public bool IsOtherNumber(uchar c)
{
    bool matches = GetGeneralCategory(c) == GeneralCategoryId.No;
    return matches;
}
418 
// Tests c against GeneralCategoryId.N through HasGeneralCategory.
// NOTE(review): assumes HasGeneralCategory takes (uchar, GeneralCategoryId); the
// previous text fused both arguments into the single token "cGeneralCategoryId".
public bool IsNumber(uchar c)
{
    return HasGeneralCategory(c, GeneralCategoryId.N);
}
423 
// True when the general category reported for c is exactly GeneralCategoryId.Pc.
public bool IsConnectorPunctuation(uchar c)
{
    bool matches = GetGeneralCategory(c) == GeneralCategoryId.Pc;
    return matches;
}
428 
// True when the general category reported for c is exactly GeneralCategoryId.Pd.
public bool IsDashPunctuation(uchar c)
{
    bool matches = GetGeneralCategory(c) == GeneralCategoryId.Pd;
    return matches;
}
433 
// True when the general category reported for c is exactly GeneralCategoryId.Ps.
public bool IsOpenPunctuation(uchar c)
{
    bool matches = GetGeneralCategory(c) == GeneralCategoryId.Ps;
    return matches;
}
438 
// True when the general category reported for c is exactly GeneralCategoryId.Pe.
public bool IsClosePunctuation(uchar c)
{
    bool matches = GetGeneralCategory(c) == GeneralCategoryId.Pe;
    return matches;
}
443 
// True when the general category reported for c is exactly GeneralCategoryId.Pi.
public bool IsInitialPunctuation(uchar c)
{
    bool matches = GetGeneralCategory(c) == GeneralCategoryId.Pi;
    return matches;
}
448 
// True when the general category reported for c is exactly GeneralCategoryId.Pf.
public bool IsFinalPunctuation(uchar c)
{
    bool matches = GetGeneralCategory(c) == GeneralCategoryId.Pf;
    return matches;
}
453 
// True when the general category reported for c is exactly GeneralCategoryId.Po.
public bool IsOtherPunctuation(uchar c)
{
    bool matches = GetGeneralCategory(c) == GeneralCategoryId.Po;
    return matches;
}
458 
// Tests c against GeneralCategoryId.P through HasGeneralCategory.
// NOTE(review): assumes HasGeneralCategory takes (uchar, GeneralCategoryId); the
// previous text fused both arguments into the single token "cGeneralCategoryId".
public bool IsPunctuation(uchar c)
{
    return HasGeneralCategory(c, GeneralCategoryId.P);
}
463 
// True when the general category reported for c is exactly GeneralCategoryId.Sm.
public bool IsMathSymbol(uchar c)
{
    bool matches = GetGeneralCategory(c) == GeneralCategoryId.Sm;
    return matches;
}
468 
// True when the general category reported for c is exactly GeneralCategoryId.Sc.
public bool IsCurrencySymbol(uchar c)
{
    bool matches = GetGeneralCategory(c) == GeneralCategoryId.Sc;
    return matches;
}
473 
// True when the general category reported for c is exactly GeneralCategoryId.Sk.
public bool IsModifierSymbol(uchar c)
{
    bool matches = GetGeneralCategory(c) == GeneralCategoryId.Sk;
    return matches;
}
478 
// True when the general category reported for c is exactly GeneralCategoryId.So.
public bool IsOtherSymbol(uchar c)
{
    bool matches = GetGeneralCategory(c) == GeneralCategoryId.So;
    return matches;
}
483 
// Tests c against GeneralCategoryId.S through HasGeneralCategory.
// NOTE(review): assumes HasGeneralCategory takes (uchar, GeneralCategoryId); the
// previous text fused both arguments into the single token "cGeneralCategoryId".
public bool IsSymbol(uchar c)
{
    return HasGeneralCategory(c, GeneralCategoryId.S);
}
488 
// True when the general category reported for c is exactly GeneralCategoryId.Zs.
public bool IsSpaceSeparator(uchar c)
{
    bool matches = GetGeneralCategory(c) == GeneralCategoryId.Zs;
    return matches;
}
493 
// True when the general category reported for c is exactly GeneralCategoryId.Zl.
public bool IsLineSeparator(uchar c)
{
    bool matches = GetGeneralCategory(c) == GeneralCategoryId.Zl;
    return matches;
}
498 
// True when the general category reported for c is exactly GeneralCategoryId.Zp.
public bool IsParagraphSeparator(uchar c)
{
    bool matches = GetGeneralCategory(c) == GeneralCategoryId.Zp;
    return matches;
}
503     
// Tests c against GeneralCategoryId.Z through HasGeneralCategory.
// NOTE(review): assumes HasGeneralCategory takes (uchar, GeneralCategoryId); the
// previous text fused both arguments into the single token "cGeneralCategoryId".
public bool IsSeparator(uchar c)
{
    return HasGeneralCategory(c, GeneralCategoryId.Z);
}
508     
// True when the general category reported for c is exactly GeneralCategoryId.Cc.
public bool IsControl(uchar c)
{
    bool matches = GetGeneralCategory(c) == GeneralCategoryId.Cc;
    return matches;
}
513     
// True when the general category reported for c is exactly GeneralCategoryId.Cf.
public bool IsFormat(uchar c)
{
    bool matches = GetGeneralCategory(c) == GeneralCategoryId.Cf;
    return matches;
}
518 
// True when the general category reported for c is exactly GeneralCategoryId.Cs.
public bool IsSurrogate(uchar c)
{
    bool matches = GetGeneralCategory(c) == GeneralCategoryId.Cs;
    return matches;
}
523 
// True when the general category reported for c is exactly GeneralCategoryId.Co.
public bool IsPrivateUse(uchar c)
{
    bool matches = GetGeneralCategory(c) == GeneralCategoryId.Co;
    return matches;
}
528 
// True when the general category reported for c is exactly GeneralCategoryId.Cn.
public bool IsUnassigned(uchar c)
{
    bool matches = GetGeneralCategory(c) == GeneralCategoryId.Cn;
    return matches;
}
533 
// Tests c against GeneralCategoryId.C through HasGeneralCategory.
// NOTE(review): assumes HasGeneralCategory takes (uchar, GeneralCategoryId); the
// previous text fused both arguments into the single token "cGeneralCategoryId".
public bool IsOther(uchar c)
{
    return HasGeneralCategory(c, GeneralCategoryId.C);
}
538 
// Tests c against GeneralCategoryId.G through HasGeneralCategory.
// NOTE(review): assumes HasGeneralCategory takes (uchar, GeneralCategoryId); the
// previous text fused both arguments into the single token "cGeneralCategoryId".
public bool IsGraphic(uchar c)
{
    return HasGeneralCategory(c, GeneralCategoryId.G);
}
543 
// Tests c against GeneralCategoryId.B through HasGeneralCategory.
// NOTE(review): assumes HasGeneralCategory takes (uchar, GeneralCategoryId); the
// previous text fused both arguments into the single token "cGeneralCategoryId".
public bool IsBaseChar(uchar c)
{
    return HasGeneralCategory(c, GeneralCategoryId.B);
}
548 
// Alias for IsMark: returns whatever IsMark(c) returns.
public bool IsCombining(uchar c)
{
    bool isMark = IsMark(c);
    return isMark;
}
553 
// Returns the Upper() value stored in the character info for c.
public uchar ToUpper(uchar c)
{
    uchar upper = GetCharacterInfo(c).Upper();
    return upper;
}
558 
// Returns the Lower() value stored in the character info for c.
public uchar ToLower(uchar c)
{
    uchar lower = GetCharacterInfo(c).Lower();
    return lower;
}
563 
// Returns the Title() value stored in the character info for c.
public uchar ToTitle(uchar c)
{
    uchar title = GetCharacterInfo(c).Title();
    return title;
}
568     
// Returns the Folding() value stored in the character info for c.
public uchar ToFolding(uchar c)
{
    uchar folded = GetCharacterInfo(c).Folding();
    return folded;
}
573     
// Returns a copy of the FullUpper() string from the extended character info for c.
public ustring FullUpper(uchar c)
{
    ustring fullUpper = GetExtendedCharacterInfo(c).FullUpper();
    return fullUpper;
}
578 
// Returns a copy of the FullLower() string from the extended character info for c.
public ustring FullLower(uchar c)
{
    ustring fullLower = GetExtendedCharacterInfo(c).FullLower();
    return fullLower;
}
583 
// Returns a copy of the FullTitle() string from the extended character info for c.
public ustring FullTitle(uchar c)
{
    ustring fullTitle = GetExtendedCharacterInfo(c).FullTitle();
    return fullTitle;
}
588 
// Returns a copy of the FullFolding() string from the extended character info for c.
public ustring FullFolding(uchar c)
{
    ustring fullFolding = GetExtendedCharacterInfo(c).FullFolding();
    return fullFolding;
}
593 
// Reports the BinaryPropertyId.whiteSpace property from the character info for c.
public bool IsWhiteSpace(uchar c)
{
    bool hasProperty = GetCharacterInfo(c).GetBinaryProperty(BinaryPropertyId.whiteSpace);
    return hasProperty;
}
598 
// Reports the BinaryPropertyId.alphabetic property from the character info for c.
public bool IsAlphabetic(uchar c)
{
    bool hasProperty = GetCharacterInfo(c).GetBinaryProperty(BinaryPropertyId.alphabetic);
    return hasProperty;
}
603 
// Reports the BinaryPropertyId.asciiHexDigit property from the character info for c.
public bool IsAsciiHexDigit(uchar c)
{
    bool hasProperty = GetCharacterInfo(c).GetBinaryProperty(BinaryPropertyId.asciiHexDigit);
    return hasProperty;
}
608 
// Returns false for any c whose value is 256 or above; otherwise narrows c to
// char and returns IsDigit for it.
// NOTE(review): the range guard is < 256, not < 128 — whether non-ASCII values
// in 128..255 can pass depends on IsDigit(char), which is not visible here.
public bool IsAsciiDigit(uchar c)
{
    if (cast<uint>(c) >= 256u)
    {
        return false;
    }
    return IsDigit(cast<char>(c));
}
613 
// Reports the BinaryPropertyId.uppercase property from the character info for c.
public bool IsUppercase(uchar c)
{
    bool hasProperty = GetCharacterInfo(c).GetBinaryProperty(BinaryPropertyId.uppercase);
    return hasProperty;
}
618 
// Reports the BinaryPropertyId.lowercase property from the character info for c.
public bool IsLowercase(uchar c)
{
    bool hasProperty = GetCharacterInfo(c).GetBinaryProperty(BinaryPropertyId.lowercase);
    return hasProperty;
}
623 
// Reports the BinaryPropertyId.idStart property from the character info for c.
public bool IsIdStart(uchar c)
{
    bool hasProperty = GetCharacterInfo(c).GetBinaryProperty(BinaryPropertyId.idStart);
    return hasProperty;
}
628 
// Reports the BinaryPropertyId.idContinue property from the character info for c.
public bool IsIdCont(uchar c)
{
    bool hasProperty = GetCharacterInfo(c).GetBinaryProperty(BinaryPropertyId.idContinue);
    return hasProperty;
}
633 
// Reports the BinaryPropertyId.graphemeBase property from the character info for c.
public bool IsGraphemeBase(uchar c)
{
    bool hasProperty = GetCharacterInfo(c).GetBinaryProperty(BinaryPropertyId.graphemeBase);
    return hasProperty;
}
638 
// Reports the BinaryPropertyId.graphemeExtend property from the character info for c.
public bool IsGraphemeExtender(uchar c)
{
    bool hasProperty = GetCharacterInfo(c).GetBinaryProperty(BinaryPropertyId.graphemeExtend);
    return hasProperty;
}
643 
// Reports the BinaryPropertyId.otherLowercase property from the character info for c.
public bool IsOtherLower(uchar c)
{
    bool hasProperty = GetCharacterInfo(c).GetBinaryProperty(BinaryPropertyId.otherLowercase);
    return hasProperty;
}
648 
// Reports the BinaryPropertyId.otherUppercase property from the character info for c.
public bool IsOtherUpper(uchar c)
{
    bool hasProperty = GetCharacterInfo(c).GetBinaryProperty(BinaryPropertyId.otherUppercase);
    return hasProperty;
}
653 
// Returns a copy of CharacterName() from the extended character info for c.
public string GetCharacterName(uchar c)
{
    string name = GetExtendedCharacterInfo(c).CharacterName();
    return name;
}
658 
// Returns a copy of Unicode1Name() from the extended character info for c.
public string GetUnicode1Name(uchar c)
{
    string name = GetExtendedCharacterInfo(c).Unicode1Name();
    return name;
}
663 
// Returns NumericTypeValue() from the extended character info for c.
public NumericTypeId GetNumericType(uchar c)
{
    NumericTypeId numericType = GetExtendedCharacterInfo(c).NumericTypeValue();
    return numericType;
}
668 
// Returns a copy of NumericValue() (a ustring) from the extended character info for c.
public ustring GetNumericValue(uchar c)
{
    ustring numericValue = GetExtendedCharacterInfo(c).NumericValue();
    return numericValue;
}
673 
// Reports the BinaryPropertyId.bidiMirrored property from the character info for c.
public bool IsBidiMirrored(uchar c)
{
    bool hasProperty = GetCharacterInfo(c).GetBinaryProperty(BinaryPropertyId.bidiMirrored);
    return hasProperty;
}
678 
// Reports the BinaryPropertyId.bidiControl property from the character info for c.
public bool IsBidiControl(uchar c)
{
    bool hasProperty = GetCharacterInfo(c).GetBinaryProperty(BinaryPropertyId.bidiControl);
    return hasProperty;
}
683 
// Returns BidiMirroringGlyph() from the extended character info for c.
public uchar GetBidiMirroringGlyph(uchar c)
{
    uchar glyph = GetExtendedCharacterInfo(c).BidiMirroringGlyph();
    return glyph;
}
688 
// Returns BidiPairedBracketTypeValue() from the extended character info for c.
public BidiPairedBracketTypeId GetBidiPairedBracketType(uchar c)
{
    BidiPairedBracketTypeId bracketType = GetExtendedCharacterInfo(c).BidiPairedBracketTypeValue();
    return bracketType;
}
693 
// Returns BidiPairedBracket() from the extended character info for c.
public uchar GetBidiPairedBracket(uchar c)
{
    uchar bracket = GetExtendedCharacterInfo(c).BidiPairedBracket();
    return bracket;
}
698 
// Returns a reference to the alias list held by the extended character info for c.
// NOTE(review): the reference is only as long-lived as whatever
// GetExtendedCharacterInfo(c) returns — presumably a cached table entry; confirm.
public const List<Alias>& Aliases(uchar c)
{
    const List<Alias>& aliases = GetExtendedCharacterInfo(c).Aliases();
    return aliases;
}
703 }