1
2
3
4
5
6 using System.Collections;
7 using System.Unicode;
8
9 namespace System
10 {
// Decodes the UTF-8 byte sequence held in utf8Str into a UTF-32 string.
//
// The lead byte of each sequence selects a one-, two-, three- or four-byte form; every
// trailing byte must have the bit pattern 10xxxxxx. ThrowConversionException is called with
// "invalid UTF-8 sequence" when the input ends in the middle of a sequence, when a trailing
// byte has the wrong pattern, or when the lead byte matches none of the four forms.
//
// NOTE(review): overlong forms, encoded surrogates and four-byte values above 0x10FFFF are
// accepted here, exactly as before this rewrite - confirm whether callers rely on that.
public ustring ToUtf32(const string& utf8Str)
{
    ustring result;
    const char* p = utf8Str.Chars();
    long bytesRemaining = utf8Str.Length();
    while (bytesRemaining > 0)
    {
        byte x = cast<byte>(*p);
        if ((x & 0x80u) == 0u)
        {
            // 0xxxxxxx: the byte value is the code point.
            result.Append(cast<uchar>(cast<uint>(x)));
            --bytesRemaining;
            ++p;
        }
        else if ((x & 0xE0u) == 0xC0u)
        {
            // 110xxxxx 10xxxxxx: 5 + 6 payload bits.
            if (bytesRemaining < 2)
            {
                ThrowConversionException("invalid UTF-8 sequence");
            }
            byte b1 = cast<byte>(p[1]);
            if ((b1 & 0xC0u) != 0x80u)
            {
                ThrowConversionException("invalid UTF-8 sequence");
            }
            uint u = (cast<uint>(x & 0x1Fu) << 6u) | cast<uint>(b1 & 0x3Fu);
            result.Append(cast<uchar>(u));
            bytesRemaining = bytesRemaining - 2;
            p = p + 2;
        }
        else if ((x & 0xF0u) == 0xE0u)
        {
            // 1110xxxx 10xxxxxx 10xxxxxx: 4 + 6 + 6 payload bits.
            if (bytesRemaining < 3)
            {
                ThrowConversionException("invalid UTF-8 sequence");
            }
            byte b1 = cast<byte>(p[1]);
            byte b2 = cast<byte>(p[2]);
            if ((b2 & 0xC0u) != 0x80u)
            {
                ThrowConversionException("invalid UTF-8 sequence");
            }
            if ((b1 & 0xC0u) != 0x80u)
            {
                ThrowConversionException("invalid UTF-8 sequence");
            }
            uint u = (cast<uint>(x & 0x0Fu) << 12u) | (cast<uint>(b1 & 0x3Fu) << 6u) | cast<uint>(b2 & 0x3Fu);
            result.Append(cast<uchar>(u));
            bytesRemaining = bytesRemaining - 3;
            p = p + 3;
        }
        else if ((x & 0xF8u) == 0xF0u)
        {
            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx: 3 + 6 + 6 + 6 payload bits.
            if (bytesRemaining < 4)
            {
                ThrowConversionException("invalid UTF-8 sequence");
            }
            byte b1 = cast<byte>(p[1]);
            byte b2 = cast<byte>(p[2]);
            byte b3 = cast<byte>(p[3]);
            if ((b3 & 0xC0u) != 0x80u)
            {
                ThrowConversionException("invalid UTF-8 sequence");
            }
            if ((b2 & 0xC0u) != 0x80u)
            {
                ThrowConversionException("invalid UTF-8 sequence");
            }
            if ((b1 & 0xC0u) != 0x80u)
            {
                ThrowConversionException("invalid UTF-8 sequence");
            }
            uint u = (cast<uint>(x & 0x07u) << 18u) | (cast<uint>(b1 & 0x3Fu) << 12u) |
                (cast<uint>(b2 & 0x3Fu) << 6u) | cast<uint>(b3 & 0x3Fu);
            result.Append(cast<uchar>(u));
            bytesRemaining = bytesRemaining - 4;
            p = p + 4;
        }
        else
        {
            // A stray continuation byte (10xxxxxx) or a lead byte of the form 11111xxx.
            ThrowConversionException("invalid UTF-8 sequence");
        }
    }
    return result;
}
165
// Decodes the UTF-16 code units held in utf16Str into a UTF-32 string.
//
// A unit outside 0xD800..0xDFFF is appended unchanged. A unit in 0xD800..0xDBFF must be
// followed by a unit in 0xDC00..0xDFFF; the pair is combined into one code point at or above
// 0x10000. ThrowConversionException is called with "invalid UTF-16 sequence" for a unit in
// 0xDC00..0xDFFF that starts a sequence, for a leading unit with nothing after it, and for
// a leading unit followed by a unit outside 0xDC00..0xDFFF.
public ustring ToUtf32(const wstring& utf16Str)
{
    ustring decoded;
    const wchar* cursor = utf16Str.Chars();
    long unitsLeft = utf16Str.Length();
    while (unitsLeft > 0)
    {
        wchar lead = *cursor;
        ++cursor;
        --unitsLeft;
        ushort leadValue = cast<ushort>(lead);
        bool inSurrogateRange = leadValue >= 0xD800u && leadValue <= 0xDFFFu;
        if (!inSurrogateRange)
        {
            decoded.Append(lead);
            continue;
        }
        if (leadValue > 0xDBFFu)
        {
            // A trailing-range unit cannot start a pair.
            ThrowConversionException("invalid UTF-16 sequence");
        }
        if (unitsLeft > 0)
        {
            wchar trail = *cursor;
            ++cursor;
            --unitsLeft;
            ushort trailValue = cast<ushort>(trail);
            if (trailValue < 0xDC00u || trailValue > 0xDFFFu)
            {
                ThrowConversionException("invalid UTF-16 sequence");
            }
            else
            {
                // Ten payload bits from each unit, then the 0x10000 offset.
                uint highBits = (0x03FFu & cast<uint>(lead)) << 10u;
                uint lowBits = 0x03FFu & cast<uint>(trail);
                decoded.Append(cast<uchar>((highBits | lowBits) + 0x10000u));
            }
        }
        else
        {
            // Input ended after the leading unit.
            ThrowConversionException("invalid UTF-16 sequence");
        }
    }
    return decoded;
}
208
// Encodes the code points of utf32Str as UTF-16 code units.
//
// Code points below 0x10000 become a single unit; code points from 0x10000 to 0x10FFFF
// become two units (0xD800 | high ten bits, 0xDC00 | low ten bits of the value minus 0x10000).
// ThrowConversionException is called for a code point above 0x10FFFF and for a code point
// in 0xD800..0xDFFF.
public wstring ToUtf16(const ustring& utf32Str)
{
    wstring result;
    for (uchar u : utf32Str)
    {
        uint codePoint = cast<uint>(u);
        if (codePoint > 0x10FFFFu)
        {
            ThrowConversionException("invalid UTF-32 code point");
        }
        if (codePoint < 0x10000u)
        {
            if (codePoint >= 0xD800u && codePoint <= 0xDFFFu)
            {
                ThrowConversionException("invalid UTF-32 code point (reserved for UTF-16)");
            }
            result.Append(cast<wchar>(u));
        }
        else
        {
            // Split the 20-bit offset into two ten-bit halves.
            uint uprime = codePoint - 0x10000u;
            uint high = cast<uint>(0xD800u) | ((uprime >> 10u) & 0x03FFu);
            uint low = cast<uint>(0xDC00u) | (uprime & 0x03FFu);
            result.Append(cast<wchar>(cast<ushort>(high)));
            result.Append(cast<wchar>(cast<ushort>(low)));
        }
    }
    return result;
}
248
// Converts a UTF-8 string to UTF-16 by first decoding it with ToUtf32 and then encoding
// the result with the ustring overload of ToUtf16.
public wstring ToUtf16(const string& utf8Str)
{
    ustring codePoints = ToUtf32(utf8Str);
    return ToUtf16(codePoints);
}
253
// Encodes the code points of utf32Str as UTF-8 bytes.
//
// Values below 0x80 take one byte, below 0x800 two bytes, below 0x10000 three bytes and
// below 0x110000 four bytes. ThrowConversionException is called with
// "invalid UTF-32 code point" for any value at or above 0x110000.
//
// NOTE(review): values in 0xD800..0xDFFF are encoded as three bytes rather than rejected,
// exactly as before this rewrite - confirm whether that is intended.
public string ToUtf8(const ustring& utf32Str)
{
    string result;
    for (uchar c : utf32Str)
    {
        uint x = cast<uint>(c);
        if (x < 0x80u)
        {
            // 0xxxxxxx
            result.Append(cast<char>(x & 0x7Fu));
        }
        else if (x < 0x800u)
        {
            // 110xxxxx 10xxxxxx
            result.Append(cast<char>(0xC0u | ((x >> 6u) & 0x1Fu)));
            result.Append(cast<char>(0x80u | (x & 0x3Fu)));
        }
        else if (x < 0x10000u)
        {
            // 1110xxxx 10xxxxxx 10xxxxxx
            result.Append(cast<char>(0xE0u | ((x >> 12u) & 0x0Fu)));
            result.Append(cast<char>(0x80u | ((x >> 6u) & 0x3Fu)));
            result.Append(cast<char>(0x80u | (x & 0x3Fu)));
        }
        else if (x < 0x110000u)
        {
            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            result.Append(cast<char>(0xF0u | ((x >> 18u) & 0x07u)));
            result.Append(cast<char>(0x80u | ((x >> 12u) & 0x3Fu)));
            result.Append(cast<char>(0x80u | ((x >> 6u) & 0x3Fu)));
            result.Append(cast<char>(0x80u | (x & 0x3Fu)));
        }
        else
        {
            ThrowConversionException("invalid UTF-32 code point");
        }
    }
    return result;
}
343
// Converts a UTF-16 string to UTF-8 by first decoding it with ToUtf32 and then encoding
// the result with the ustring overload of ToUtf8.
public string ToUtf8(const wstring& utf16Str)
{
    ustring codePoints = ToUtf32(utf16Str);
    return ToUtf8(codePoints);
}
348
// Returns true when GetGeneralCategory(c) equals GeneralCategoryId.Lu.
public bool IsUpperLetter(uchar c)
{
    bool isLu = GetGeneralCategory(c) == GeneralCategoryId.Lu;
    return isLu;
}
353
// Returns true when GetGeneralCategory(c) equals GeneralCategoryId.Ll.
public bool IsLowerLetter(uchar c)
{
    bool isLl = GetGeneralCategory(c) == GeneralCategoryId.Ll;
    return isLl;
}
358
// Returns true when GetGeneralCategory(c) equals GeneralCategoryId.Lt.
public bool IsTitleLetter(uchar c)
{
    bool isLt = GetGeneralCategory(c) == GeneralCategoryId.Lt;
    return isLt;
}
363
// Returns true when GetGeneralCategory(c) equals GeneralCategoryId.Lm.
public bool IsModifierLetter(uchar c)
{
    bool isLm = GetGeneralCategory(c) == GeneralCategoryId.Lm;
    return isLm;
}
368
// Returns true when GetGeneralCategory(c) equals GeneralCategoryId.Lo.
public bool IsOtherLetter(uchar c)
{
    bool isLo = GetGeneralCategory(c) == GeneralCategoryId.Lo;
    return isLo;
}
373
// Returns the result of HasGeneralCategory for c and the group GeneralCategoryId.LC.
public bool IsCasedLetter(uchar c)
{
    bool inGroup = HasGeneralCategory(c, GeneralCategoryId.LC);
    return inGroup;
}
378
// Returns the result of HasGeneralCategory for c and the group GeneralCategoryId.L.
public bool IsLetter(uchar c)
{
    bool inGroup = HasGeneralCategory(c, GeneralCategoryId.L);
    return inGroup;
}
383
// Returns true when GetGeneralCategory(c) equals GeneralCategoryId.Mn.
public bool IsNonspacingMark(uchar c)
{
    bool isMn = GetGeneralCategory(c) == GeneralCategoryId.Mn;
    return isMn;
}
388
// Returns true when GetGeneralCategory(c) equals GeneralCategoryId.Mc.
public bool IsSpacingMark(uchar c)
{
    bool isMc = GetGeneralCategory(c) == GeneralCategoryId.Mc;
    return isMc;
}
393
// Returns true when GetGeneralCategory(c) equals GeneralCategoryId.Me.
public bool IsEnclosingMark(uchar c)
{
    bool isMe = GetGeneralCategory(c) == GeneralCategoryId.Me;
    return isMe;
}
398
// Returns the result of HasGeneralCategory for c and the group GeneralCategoryId.M.
public bool IsMark(uchar c)
{
    bool inGroup = HasGeneralCategory(c, GeneralCategoryId.M);
    return inGroup;
}
403
// Returns true when GetGeneralCategory(c) equals GeneralCategoryId.Nd.
public bool IsDecimalNumber(uchar c)
{
    bool isNd = GetGeneralCategory(c) == GeneralCategoryId.Nd;
    return isNd;
}
408
// Returns true when GetGeneralCategory(c) equals GeneralCategoryId.Nl.
public bool IsLetterNumber(uchar c)
{
    bool isNl = GetGeneralCategory(c) == GeneralCategoryId.Nl;
    return isNl;
}
413
// Returns true when GetGeneralCategory(c) equals GeneralCategoryId.No.
public bool IsOtherNumber(uchar c)
{
    bool isNo = GetGeneralCategory(c) == GeneralCategoryId.No;
    return isNo;
}
418
// Returns the result of HasGeneralCategory for c and the group GeneralCategoryId.N.
public bool IsNumber(uchar c)
{
    bool inGroup = HasGeneralCategory(c, GeneralCategoryId.N);
    return inGroup;
}
423
// Returns true when GetGeneralCategory(c) equals GeneralCategoryId.Pc.
public bool IsConnectorPunctuation(uchar c)
{
    bool isPc = GetGeneralCategory(c) == GeneralCategoryId.Pc;
    return isPc;
}
428
// Returns true when GetGeneralCategory(c) equals GeneralCategoryId.Pd.
public bool IsDashPunctuation(uchar c)
{
    bool isPd = GetGeneralCategory(c) == GeneralCategoryId.Pd;
    return isPd;
}
433
// Returns true when GetGeneralCategory(c) equals GeneralCategoryId.Ps.
public bool IsOpenPunctuation(uchar c)
{
    bool isPs = GetGeneralCategory(c) == GeneralCategoryId.Ps;
    return isPs;
}
438
// Returns true when GetGeneralCategory(c) equals GeneralCategoryId.Pe.
public bool IsClosePunctuation(uchar c)
{
    bool isPe = GetGeneralCategory(c) == GeneralCategoryId.Pe;
    return isPe;
}
443
// Returns true when GetGeneralCategory(c) equals GeneralCategoryId.Pi.
public bool IsInitialPunctuation(uchar c)
{
    bool isPi = GetGeneralCategory(c) == GeneralCategoryId.Pi;
    return isPi;
}
448
// Returns true when GetGeneralCategory(c) equals GeneralCategoryId.Pf.
public bool IsFinalPunctuation(uchar c)
{
    bool isPf = GetGeneralCategory(c) == GeneralCategoryId.Pf;
    return isPf;
}
453
// Returns true when GetGeneralCategory(c) equals GeneralCategoryId.Po.
public bool IsOtherPunctuation(uchar c)
{
    bool isPo = GetGeneralCategory(c) == GeneralCategoryId.Po;
    return isPo;
}
458
// Returns the result of HasGeneralCategory for c and the group GeneralCategoryId.P.
public bool IsPunctuation(uchar c)
{
    bool inGroup = HasGeneralCategory(c, GeneralCategoryId.P);
    return inGroup;
}
463
// Returns true when GetGeneralCategory(c) equals GeneralCategoryId.Sm.
public bool IsMathSymbol(uchar c)
{
    bool isSm = GetGeneralCategory(c) == GeneralCategoryId.Sm;
    return isSm;
}
468
// Returns true when GetGeneralCategory(c) equals GeneralCategoryId.Sc.
public bool IsCurrencySymbol(uchar c)
{
    bool isSc = GetGeneralCategory(c) == GeneralCategoryId.Sc;
    return isSc;
}
473
// Returns true when GetGeneralCategory(c) equals GeneralCategoryId.Sk.
public bool IsModifierSymbol(uchar c)
{
    bool isSk = GetGeneralCategory(c) == GeneralCategoryId.Sk;
    return isSk;
}
478
// Returns true when GetGeneralCategory(c) equals GeneralCategoryId.So.
public bool IsOtherSymbol(uchar c)
{
    bool isSo = GetGeneralCategory(c) == GeneralCategoryId.So;
    return isSo;
}
483
// Returns the result of HasGeneralCategory for c and the group GeneralCategoryId.S.
public bool IsSymbol(uchar c)
{
    bool inGroup = HasGeneralCategory(c, GeneralCategoryId.S);
    return inGroup;
}
488
// Returns true when GetGeneralCategory(c) equals GeneralCategoryId.Zs.
public bool IsSpaceSeparator(uchar c)
{
    bool isZs = GetGeneralCategory(c) == GeneralCategoryId.Zs;
    return isZs;
}
493
// Returns true when GetGeneralCategory(c) equals GeneralCategoryId.Zl.
public bool IsLineSeparator(uchar c)
{
    bool isZl = GetGeneralCategory(c) == GeneralCategoryId.Zl;
    return isZl;
}
498
// Returns true when GetGeneralCategory(c) equals GeneralCategoryId.Zp.
public bool IsParagraphSeparator(uchar c)
{
    bool isZp = GetGeneralCategory(c) == GeneralCategoryId.Zp;
    return isZp;
}
503
// Returns the result of HasGeneralCategory for c and the group GeneralCategoryId.Z.
public bool IsSeparator(uchar c)
{
    bool inGroup = HasGeneralCategory(c, GeneralCategoryId.Z);
    return inGroup;
}
508
// Returns true when GetGeneralCategory(c) equals GeneralCategoryId.Cc.
public bool IsControl(uchar c)
{
    bool isCc = GetGeneralCategory(c) == GeneralCategoryId.Cc;
    return isCc;
}
513
// Returns true when GetGeneralCategory(c) equals GeneralCategoryId.Cf.
public bool IsFormat(uchar c)
{
    bool isCf = GetGeneralCategory(c) == GeneralCategoryId.Cf;
    return isCf;
}
518
// Returns true when GetGeneralCategory(c) equals GeneralCategoryId.Cs.
public bool IsSurrogate(uchar c)
{
    bool isCs = GetGeneralCategory(c) == GeneralCategoryId.Cs;
    return isCs;
}
523
// Returns true when GetGeneralCategory(c) equals GeneralCategoryId.Co.
public bool IsPrivateUse(uchar c)
{
    bool isCo = GetGeneralCategory(c) == GeneralCategoryId.Co;
    return isCo;
}
528
// Returns true when GetGeneralCategory(c) equals GeneralCategoryId.Cn.
public bool IsUnassigned(uchar c)
{
    bool isCn = GetGeneralCategory(c) == GeneralCategoryId.Cn;
    return isCn;
}
533
// Returns the result of HasGeneralCategory for c and the group GeneralCategoryId.C.
public bool IsOther(uchar c)
{
    bool inGroup = HasGeneralCategory(c, GeneralCategoryId.C);
    return inGroup;
}
538
// Returns the result of HasGeneralCategory for c and the group GeneralCategoryId.G.
public bool IsGraphic(uchar c)
{
    bool inGroup = HasGeneralCategory(c, GeneralCategoryId.G);
    return inGroup;
}
543
// Returns the result of HasGeneralCategory for c and the group GeneralCategoryId.B.
public bool IsBaseChar(uchar c)
{
    bool inGroup = HasGeneralCategory(c, GeneralCategoryId.B);
    return inGroup;
}
548
// Alias for IsMark: returns whatever IsMark(c) returns.
public bool IsCombining(uchar c)
{
    bool isMark = IsMark(c);
    return isMark;
}
553
// Returns the Upper() value of the character info record looked up for c.
public uchar ToUpper(uchar c)
{
    uchar upper = GetCharacterInfo(c).Upper();
    return upper;
}
558
// Returns the Lower() value of the character info record looked up for c.
public uchar ToLower(uchar c)
{
    uchar lower = GetCharacterInfo(c).Lower();
    return lower;
}
563
// Returns the Title() value of the character info record looked up for c.
public uchar ToTitle(uchar c)
{
    uchar title = GetCharacterInfo(c).Title();
    return title;
}
568
// Returns the Folding() value of the character info record looked up for c.
public uchar ToFolding(uchar c)
{
    uchar folded = GetCharacterInfo(c).Folding();
    return folded;
}
573
// Returns the FullUpper() string of the extended character info record looked up for c.
public ustring FullUpper(uchar c)
{
    ustring fullUpper = GetExtendedCharacterInfo(c).FullUpper();
    return fullUpper;
}
578
// Returns the FullLower() string of the extended character info record looked up for c.
public ustring FullLower(uchar c)
{
    ustring fullLower = GetExtendedCharacterInfo(c).FullLower();
    return fullLower;
}
583
// Returns the FullTitle() string of the extended character info record looked up for c.
public ustring FullTitle(uchar c)
{
    ustring fullTitle = GetExtendedCharacterInfo(c).FullTitle();
    return fullTitle;
}
588
// Returns the FullFolding() string of the extended character info record looked up for c.
public ustring FullFolding(uchar c)
{
    ustring fullFolding = GetExtendedCharacterInfo(c).FullFolding();
    return fullFolding;
}
593
// Returns the BinaryPropertyId.whiteSpace property of the character info record for c.
public bool IsWhiteSpace(uchar c)
{
    bool hasProperty = GetCharacterInfo(c).GetBinaryProperty(BinaryPropertyId.whiteSpace);
    return hasProperty;
}
598
// Returns the BinaryPropertyId.alphabetic property of the character info record for c.
public bool IsAlphabetic(uchar c)
{
    bool hasProperty = GetCharacterInfo(c).GetBinaryProperty(BinaryPropertyId.alphabetic);
    return hasProperty;
}
603
// Returns the BinaryPropertyId.asciiHexDigit property of the character info record for c.
public bool IsAsciiHexDigit(uchar c)
{
    bool hasProperty = GetCharacterInfo(c).GetBinaryProperty(BinaryPropertyId.asciiHexDigit);
    return hasProperty;
}
608
// Returns false for any c whose value is 256 or more; otherwise narrows c to char and
// returns the result of IsDigit on it.
public bool IsAsciiDigit(uchar c)
{
    if (cast<uint>(c) >= 256u)
    {
        return false;
    }
    return IsDigit(cast<char>(c));
}
613
// Returns the BinaryPropertyId.uppercase property of the character info record for c.
public bool IsUppercase(uchar c)
{
    bool hasProperty = GetCharacterInfo(c).GetBinaryProperty(BinaryPropertyId.uppercase);
    return hasProperty;
}
618
// Returns the BinaryPropertyId.lowercase property of the character info record for c.
public bool IsLowercase(uchar c)
{
    bool hasProperty = GetCharacterInfo(c).GetBinaryProperty(BinaryPropertyId.lowercase);
    return hasProperty;
}
623
// Returns the BinaryPropertyId.idStart property of the character info record for c.
public bool IsIdStart(uchar c)
{
    bool hasProperty = GetCharacterInfo(c).GetBinaryProperty(BinaryPropertyId.idStart);
    return hasProperty;
}
628
// Returns the BinaryPropertyId.idContinue property of the character info record for c.
public bool IsIdCont(uchar c)
{
    bool hasProperty = GetCharacterInfo(c).GetBinaryProperty(BinaryPropertyId.idContinue);
    return hasProperty;
}
633
// Returns the BinaryPropertyId.graphemeBase property of the character info record for c.
public bool IsGraphemeBase(uchar c)
{
    bool hasProperty = GetCharacterInfo(c).GetBinaryProperty(BinaryPropertyId.graphemeBase);
    return hasProperty;
}
638
// Returns the BinaryPropertyId.graphemeExtend property of the character info record for c.
public bool IsGraphemeExtender(uchar c)
{
    bool hasProperty = GetCharacterInfo(c).GetBinaryProperty(BinaryPropertyId.graphemeExtend);
    return hasProperty;
}
643
// Returns the BinaryPropertyId.otherLowercase property of the character info record for c.
public bool IsOtherLower(uchar c)
{
    bool hasProperty = GetCharacterInfo(c).GetBinaryProperty(BinaryPropertyId.otherLowercase);
    return hasProperty;
}
648
// Returns the BinaryPropertyId.otherUppercase property of the character info record for c.
public bool IsOtherUpper(uchar c)
{
    bool hasProperty = GetCharacterInfo(c).GetBinaryProperty(BinaryPropertyId.otherUppercase);
    return hasProperty;
}
653
// Returns the CharacterName() string of the extended character info record looked up for c.
public string GetCharacterName(uchar c)
{
    string characterName = GetExtendedCharacterInfo(c).CharacterName();
    return characterName;
}
658
// Returns the Unicode1Name() string of the extended character info record looked up for c.
public string GetUnicode1Name(uchar c)
{
    string unicode1Name = GetExtendedCharacterInfo(c).Unicode1Name();
    return unicode1Name;
}
663
// Returns the NumericTypeValue() of the extended character info record looked up for c.
public NumericTypeId GetNumericType(uchar c)
{
    NumericTypeId numericType = GetExtendedCharacterInfo(c).NumericTypeValue();
    return numericType;
}
668
// Returns the NumericValue() string of the extended character info record looked up for c.
public ustring GetNumericValue(uchar c)
{
    ustring numericValue = GetExtendedCharacterInfo(c).NumericValue();
    return numericValue;
}
673
// Returns the BinaryPropertyId.bidiMirrored property of the character info record for c.
public bool IsBidiMirrored(uchar c)
{
    bool hasProperty = GetCharacterInfo(c).GetBinaryProperty(BinaryPropertyId.bidiMirrored);
    return hasProperty;
}
678
// Returns the BinaryPropertyId.bidiControl property of the character info record for c.
public bool IsBidiControl(uchar c)
{
    bool hasProperty = GetCharacterInfo(c).GetBinaryProperty(BinaryPropertyId.bidiControl);
    return hasProperty;
}
683
// Returns the BidiMirroringGlyph() value of the extended character info record for c.
public uchar GetBidiMirroringGlyph(uchar c)
{
    uchar mirroringGlyph = GetExtendedCharacterInfo(c).BidiMirroringGlyph();
    return mirroringGlyph;
}
688
// Returns the BidiPairedBracketTypeValue() of the extended character info record for c.
public BidiPairedBracketTypeId GetBidiPairedBracketType(uchar c)
{
    BidiPairedBracketTypeId bracketType = GetExtendedCharacterInfo(c).BidiPairedBracketTypeValue();
    return bracketType;
}
693
// Returns the BidiPairedBracket() value of the extended character info record for c.
public uchar GetBidiPairedBracket(uchar c)
{
    uchar pairedBracket = GetExtendedCharacterInfo(c).BidiPairedBracket();
    return pairedBracket;
}
698
// Returns a reference to the Aliases() list of the extended character info record for c.
public const List<Alias>& Aliases(uchar c)
{
    const List<Alias>& aliasList = GetExtendedCharacterInfo(c).Aliases();
    return aliasList;
}
703 }