1
2
3
4
5
6 #include <soulng/util/Unicode.hpp>
7 #include <soulng/util/TextUtils.hpp>
8 #include <boost/filesystem.hpp>
9 #include <cctype>
10 #include <mutex>
11
12 namespace soulng { namespace unicode {
13
// Returns the hard-coded version string.
std::string CmajorVersionStr()
{
    static const char* const versionText = "3.10.0";
    return std::string(versionText);
}
18
19 UnicodeException::UnicodeException(const std::string& message_) : std::runtime_error(message_)
20 {
21 }
22
23 Utf8ToUtf32Engine::Utf8ToUtf32Engine() : state(0), resultReady(false), result(U'\0')
24 {
25 std::memset(bytes, 0, sizeof(bytes));
26 }
27
28 void ThrowInvalidUtf8Sequence()
29 {
30 throw UnicodeException("invalid UTF-8 sequence");
31 }
32
33 void Utf8ToUtf32Engine::Put(uint8_t x)
34 {
35 switch (state)
36 {
37 case 0:
38 {
39 resultReady = false;
40 if ((x & 0x80u) == 0u)
41 {
42 result = static_cast<char32_t>(x);
43 resultReady = true;
44 }
45 else if ((x & 0xE0u) == 0xC0u)
46 {
47 bytes[0] = x;
48 state = 1;
49 }
50 else if ((x & 0xF0u) == 0xE0u)
51 {
52 bytes[0] = x;
53 state = 2;
54 }
55 else if ((x & 0xF8u) == 0xF0u)
56 {
57 bytes[0] = x;
58 state = 4;
59 }
60 else
61 {
62 ThrowInvalidUtf8Sequence();
63 }
64 break;
65 }
66 case 1:
67 {
68 result = static_cast<char32_t>(0);
69 bytes[1] = x;
70 uint8_t b1 = bytes[1];
71 if ((b1 & 0xC0u) != 0x80u)
72 {
73 ThrowInvalidUtf8Sequence();
74 }
75 uint8_t shift = 0u;
76 for (uint8_t i = 0u; i < 6u; ++i)
77 {
78 uint8_t bit = b1 & 1u;
79 b1 = b1 >> 1u;
80 result = static_cast<char32_t>(static_cast<uint32_t>(result) | (static_cast<uint32_t>(bit) << shift));
81 ++shift;
82 }
83 uint8_t b0 = bytes[0];
84 for (uint8_t i = 0u; i < 5u; ++i)
85 {
86 uint8_t bit = b0 & 1u;
87 b0 = b0 >> 1u;
88 result = static_cast<char32_t>(static_cast<uint32_t>(result) | (static_cast<uint32_t>(bit) << shift));
89 ++shift;
90 }
91 resultReady = true;
92 state = 0;
93 break;
94 }
95 case 2:
96 {
97 bytes[1] = x;
98 state = 3;
99 break;
100 }
101 case 3:
102 {
103 bytes[2] = x;
104 result = static_cast<char32_t>(0);
105 uint8_t b2 = bytes[2];
106 if ((b2 & 0xC0u) != 0x80u)
107 {
108 ThrowInvalidUtf8Sequence();
109 }
110 uint8_t shift = 0u;
111 for (uint8_t i = 0u; i < 6u; ++i)
112 {
113 uint8_t bit = b2 & 1u;
114 b2 = b2 >> 1u;
115 result = static_cast<char32_t>(static_cast<uint32_t>(result) | (static_cast<uint32_t>(bit) << shift));
116 ++shift;
117 }
118 uint8_t b1 = bytes[1];
119 if ((b1 & 0xC0u) != 0x80u)
120 {
121 ThrowInvalidUtf8Sequence();
122 }
123 for (uint8_t i = 0u; i < 6u; ++i)
124 {
125 uint8_t bit = b1 & 1u;
126 b1 = b1 >> 1u;
127 result = static_cast<char32_t>(static_cast<uint32_t>(result) | (static_cast<uint32_t>(bit) << shift));
128 ++shift;
129 }
130 uint8_t b0 = bytes[0];
131 for (uint8_t i = 0u; i < 4u; ++i)
132 {
133 uint8_t bit = b0 & 1u;
134 b0 = b0 >> 1u;
135 result = static_cast<char32_t>(static_cast<uint32_t>(result) | (static_cast<uint32_t>(bit) << shift));
136 ++shift;
137 }
138 resultReady = true;
139 state = 0;
140 break;
141 }
142 case 4:
143 {
144 bytes[1] = x;
145 state = 5;
146 break;
147 }
148 case 5:
149 {
150 bytes[2] = x;
151 state = 6;
152 break;
153 }
154 case 6:
155 {
156 bytes[3] = x;
157 result = static_cast<char32_t>(0);
158 uint8_t b3 = bytes[3];
159 if ((b3 & 0xC0u) != 0x80u)
160 {
161 ThrowInvalidUtf8Sequence();
162 }
163 uint8_t shift = 0u;
164 for (uint8_t i = 0u; i < 6u; ++i)
165 {
166 uint8_t bit = b3 & 1u;
167 b3 = b3 >> 1u;
168 result = static_cast<char32_t>(static_cast<uint32_t>(result) | (static_cast<uint32_t>(bit) << shift));
169 ++shift;
170 }
171 uint8_t b2 = bytes[2];
172 if ((b2 & 0xC0u) != 0x80u)
173 {
174 ThrowInvalidUtf8Sequence();
175 }
176 for (uint8_t i = 0u; i < 6u; ++i)
177 {
178 uint8_t bit = b2 & 1u;
179 b2 = b2 >> 1u;
180 result = static_cast<char32_t>(static_cast<uint32_t>(result) | (static_cast<uint32_t>(bit) << shift));
181 ++shift;
182 }
183 uint8_t b1 = bytes[1];
184 if ((b1 & 0xC0u) != 0x80u)
185 {
186 ThrowInvalidUtf8Sequence();
187 }
188 for (uint8_t i = 0u; i < 6u; ++i)
189 {
190 uint8_t bit = b1 & 1u;
191 b1 = b1 >> 1u;
192 result = static_cast<char32_t>(static_cast<uint32_t>(result) | (static_cast<uint32_t>(bit) << shift));
193 ++shift;
194 }
195 uint8_t b0 = bytes[0];
196 for (uint8_t i = 0u; i < 3u; ++i)
197 {
198 uint8_t bit = b0 & 1u;
199 b0 = b0 >> 1u;
200 result = static_cast<char32_t>(static_cast<uint32_t>(result) | (static_cast<uint32_t>(bit) << shift));
201 ++shift;
202 }
203 resultReady = true;
204 state = 0;
205 break;
206 }
207 }
208 }
209
210 std::u32string ToUtf32(const std::string& utf8Str)
211 {
212 std::u32string result;
213 const char* p = utf8Str.c_str();
214 int bytesRemaining = int(utf8Str.length());
215 while (bytesRemaining > 0)
216 {
217 char c = *p;
218 uint8_t x = static_cast<uint8_t>(c);
219 if ((x & 0x80u) == 0u)
220 {
221 result.append(1, static_cast<char32_t>(static_cast<uint32_t>(x)));
222 --bytesRemaining;
223 ++p;
224 }
225 else if ((x & 0xE0u) == 0xC0u)
226 {
227 if (bytesRemaining < 2)
228 {
229 ThrowInvalidUtf8Sequence();
230 }
231 char32_t u = static_cast<char32_t>(static_cast<uint32_t>(0u));
232 uint8_t b1 = static_cast<uint8_t>(p[1]);
233 if ((b1 & 0xC0u) != 0x80u)
234 {
235 ThrowInvalidUtf8Sequence();
236 }
237 uint8_t shift = 0u;
238 for (uint8_t i = 0u; i < 6u; ++i)
239 {
240 uint8_t bit = b1 & 1u;
241 b1 = b1 >> 1u;
242 u = static_cast<char32_t>(static_cast<uint32_t>(u) | (static_cast<uint32_t>(bit) << shift));
243 ++shift;
244 }
245 uint8_t b0 = x;
246 for (uint8_t i = 0u; i < 5u; ++i)
247 {
248 uint8_t bit = b0 & 1u;
249 b0 = b0 >> 1u;
250 u = static_cast<char32_t>(static_cast<uint32_t>(u) | (static_cast<uint32_t>(bit) << shift));
251 ++shift;
252 }
253 result.append(1, u);
254 bytesRemaining = bytesRemaining - 2;
255 p = p + 2;
256 }
257 else if ((x & 0xF0u) == 0xE0u)
258 {
259 if (bytesRemaining < 3)
260 {
261 ThrowInvalidUtf8Sequence();
262 }
263 char32_t u = static_cast<char32_t>(static_cast<uint32_t>(0u));
264 uint8_t b2 = static_cast<uint8_t>(p[2]);
265 if ((b2 & 0xC0u) != 0x80u)
266 {
267 ThrowInvalidUtf8Sequence();
268 }
269 uint8_t shift = 0u;
270 for (uint8_t i = 0u; i < 6u; ++i)
271 {
272 uint8_t bit = b2 & 1u;
273 b2 = b2 >> 1u;
274 u = static_cast<char32_t>(static_cast<uint32_t>(u) | (static_cast<uint32_t>(bit) << shift));
275 ++shift;
276 }
277 uint8_t b1 = static_cast<uint8_t>(p[1]);
278 if ((b1 & 0xC0u) != 0x80u)
279 {
280 ThrowInvalidUtf8Sequence();
281 }
282 for (uint8_t i = 0u; i < 6u; ++i)
283 {
284 uint8_t bit = b1 & 1u;
285 b1 = b1 >> 1u;
286 u = static_cast<char32_t>(static_cast<uint32_t>(u) | (static_cast<uint32_t>(bit) << shift));
287 ++shift;
288 }
289 uint8_t b0 = x;
290 for (uint8_t i = 0u; i < 4u; ++i)
291 {
292 uint8_t bit = b0 & 1u;
293 b0 = b0 >> 1u;
294 u = static_cast<char32_t>(static_cast<uint32_t>(u) | (static_cast<uint32_t>(bit) << shift));
295 ++shift;
296 }
297 result.append(1, u);
298 bytesRemaining = bytesRemaining - 3;
299 p = p + 3;
300 }
301 else if ((x & 0xF8u) == 0xF0u)
302 {
303 if (bytesRemaining < 4)
304 {
305 ThrowInvalidUtf8Sequence();
306 }
307 char32_t u = static_cast<char32_t>(static_cast<uint32_t>(0u));
308 uint8_t b3 = static_cast<uint8_t>(p[3]);
309 if ((b3 & 0xC0u) != 0x80u)
310 {
311 ThrowInvalidUtf8Sequence();
312 }
313 uint8_t shift = 0u;
314 for (uint8_t i = 0u; i < 6u; ++i)
315 {
316 uint8_t bit = b3 & 1u;
317 b3 = b3 >> 1u;
318 u = static_cast<char32_t>(static_cast<uint32_t>(u) | (static_cast<uint32_t>(bit) << shift));
319 ++shift;
320 }
321 uint8_t b2 = static_cast<uint8_t>(p[2]);
322 if ((b2 & 0xC0u) != 0x80u)
323 {
324 ThrowInvalidUtf8Sequence();
325 }
326 for (uint8_t i = 0u; i < 6u; ++i)
327 {
328 uint8_t bit = b2 & 1u;
329 b2 = b2 >> 1u;
330 u = static_cast<char32_t>(static_cast<uint32_t>(u) | (static_cast<uint32_t>(bit) << shift));
331 ++shift;
332 }
333 uint8_t b1 = static_cast<uint8_t>(p[1]);
334 if ((b1 & 0xC0u) != 0x80u)
335 {
336 ThrowInvalidUtf8Sequence();
337 }
338 for (uint8_t i = 0u; i < 6u; ++i)
339 {
340 uint8_t bit = b1 & 1u;
341 b1 = b1 >> 1u;
342 u = static_cast<char32_t>(static_cast<uint32_t>(u) | (static_cast<uint32_t>(bit) << shift));
343 ++shift;
344 }
345 uint8_t b0 = x;
346 for (uint8_t i = 0u; i < 3u; ++i)
347 {
348 uint8_t bit = b0 & 1u;
349 b0 = b0 >> 1u;
350 u = static_cast<char32_t>(static_cast<uint32_t>(u) | (static_cast<uint32_t>(bit) << shift));
351 ++shift;
352 }
353 result.append(1, u);
354 bytesRemaining = bytesRemaining - 4;
355 p = p + 4;
356 }
357 else
358 {
359 ThrowInvalidUtf8Sequence();
360 }
361 }
362 return result;
363 }
364
365 std::u32string ToUtf32(const std::u16string& utf16Str)
366 {
367 std::u32string result;
368 const char16_t* w = utf16Str.c_str();
369 int remaining = int(utf16Str.length());
370 while (remaining > 0)
371 {
372 char16_t w1 = *w++;
373 --remaining;
374 if (static_cast<uint16_t>(w1) < 0xD800u || static_cast<uint16_t>(w1) > 0xDFFFu)
375 {
376 result.append(1, w1);
377 }
378 else
379 {
380 if (static_cast<uint16_t>(w1) < 0xD800u || static_cast<uint16_t>(w1) > 0xDBFFu)
381 {
382 throw UnicodeException("invalid UTF-16 sequence");
383 }
384 if (remaining > 0)
385 {
386 char16_t w2 = *w++;
387 --remaining;
388 if (static_cast<uint16_t>(w2) < 0xDC00u || static_cast<uint16_t>(w2) > 0xDFFFu)
389 {
390 throw UnicodeException("invalid UTF-16 sequence");
391 }
392 else
393 {
394 char32_t uprime = static_cast<char32_t>(((0x03FFu & static_cast<uint32_t>(w1)) << 10u) | (0x03FFu & static_cast<uint32_t>(w2)));
395 char32_t u = static_cast<char32_t>(static_cast<uint32_t>(uprime) + 0x10000u);
396 result.append(1, u);
397 }
398 }
399 else
400 {
401 throw UnicodeException("invalid UTF-16 sequence");
402 }
403 }
404 }
405 return result;
406 }
407
408 std::u16string ToUtf16(const std::u32string& utf32Str)
409 {
410 std::u16string result;
411 for (char32_t u : utf32Str)
412 {
413 if (static_cast<uint32_t>(u) > 0x10FFFFu)
414 {
415 throw UnicodeException("invalid UTF-32 code point");
416 }
417 if (static_cast<uint32_t>(u) < 0x10000u)
418 {
419 if (static_cast<uint32_t>(u) >= 0xD800 && static_cast<uint32_t>(u) <= 0xDFFF)
420 {
421 throw UnicodeException("invalid UTF-32 code point (reserved for UTF-16)");
422 }
423 char16_t x = static_cast<char16_t>(u);
424 result.append(1, x);
425 }
426 else
427 {
428 char32_t uprime = static_cast<char32_t>(static_cast<uint32_t>(u) - 0x10000u);
429 char16_t w1 = static_cast<char16_t>(0xD800u);
430 char16_t w2 = static_cast<char16_t>(0xDC00u);
431 for (uint16_t i = 0u; i < 10u; ++i)
432 {
433 uint16_t bit = static_cast<uint16_t>(static_cast<uint32_t>(uprime) & (static_cast<uint32_t>(0x1u) << i));
434 w2 = static_cast<char16_t>(static_cast<uint16_t>(w2) | bit);
435 }
436 for (uint16_t i = 10u; i < 20u; ++i)
437 {
438 uint16_t bit = static_cast<uint16_t>((static_cast<uint32_t>(uprime) & (static_cast<uint32_t>(0x1u) << i)) >> 10u);
439 w1 = static_cast<char16_t>(static_cast<uint16_t>(w1) | bit);
440 }
441 result.append(1, w1);
442 result.append(1, w2);
443 }
444 }
445 return result;
446
447 }
448
449 std::u16string ToUtf16(const std::string& utf8Str)
450 {
451 return ToUtf16(ToUtf32(utf8Str));
452 }
453
454 std::string ToUtf8(const std::u32string& utf32Str)
455 {
456 std::string result;
457 for (char32_t c : utf32Str)
458 {
459 uint32_t x = static_cast<uint32_t>(c);
460 if (x < 0x80u)
461 {
462 result.append(1, static_cast<char>(x & 0x7Fu));
463 }
464 else if (x < 0x800u)
465 {
466 uint8_t b1 = 0x80u;
467 for (uint8_t i = 0u; i < 6u; ++i)
468 {
469 b1 = b1 | (static_cast<uint8_t>(x & 1u) << i);
470 x = x >> 1u;
471 }
472 uint8_t b0 = 0xC0u;
473 for (uint8_t i = 0u; i < 5u; ++i)
474 {
475 b0 = b0 | (static_cast<uint8_t>(x & 1u) << i);
476 x = x >> 1u;
477 }
478 result.append(1, static_cast<char>(b0));
479 result.append(1, static_cast<char>(b1));
480 }
481 else if (x < 0x10000u)
482 {
483 uint8_t b2 = 0x80u;
484 for (uint8_t i = 0u; i < 6u; ++i)
485 {
486 b2 = b2 | (static_cast<uint8_t>(x & 1u) << i);
487 x = x >> 1u;
488 }
489 uint8_t b1 = 0x80u;
490 for (uint8_t i = 0u; i < 6u; ++i)
491 {
492 b1 = b1 | (static_cast<uint8_t>(x & 1u) << i);
493 x = x >> 1u;
494 }
495 uint8_t b0 = 0xE0u;
496 for (uint8_t i = 0u; i < 4u; ++i)
497 {
498 b0 = b0 | (static_cast<uint8_t>(x & 1u) << i);
499 x = x >> 1u;
500 }
501 result.append(1, static_cast<char>(b0));
502 result.append(1, static_cast<char>(b1));
503 result.append(1, static_cast<char>(b2));
504 }
505 else if (x < 0x110000u)
506 {
507 uint8_t b3 = 0x80u;
508 for (uint8_t i = 0u; i < 6u; ++i)
509 {
510 b3 = b3 | (static_cast<uint8_t>(x & 1u) << i);
511 x = x >> 1u;
512 }
513 uint8_t b2 = 0x80u;
514 for (uint8_t i = 0u; i < 6u; ++i)
515 {
516 b2 = b2 | (static_cast<uint8_t>(x & 1u) << i);
517 x = x >> 1u;
518 }
519 uint8_t b1 = 0x80u;
520 for (uint8_t i = 0u; i < 6u; ++i)
521 {
522 b1 = b1 | (static_cast<uint8_t>(x & 1u) << i);
523 x = x >> 1u;
524 }
525 uint8_t b0 = 0xF0u;
526 for (uint8_t i = 0u; i < 3u; ++i)
527 {
528 b0 = b0 | (static_cast<uint8_t>(x & 1u) << i);
529 x = x >> 1u;
530 }
531 result.append(1, static_cast<char>(b0));
532 result.append(1, static_cast<char>(b1));
533 result.append(1, static_cast<char>(b2));
534 result.append(1, static_cast<char>(b3));
535 }
536 else
537 {
538 throw UnicodeException("invalid UTF-32 code point");
539 }
540 }
541 return result;
542 }
543
544 std::string ToUtf8(const std::u16string& utf16Str)
545 {
546 return ToUtf8(ToUtf32(utf16Str));
547 }
548
549 std::u32string ToUpper(const std::u32string& s)
550 {
551 std::u32string upper;
552 for (char32_t c : s)
553 {
554 upper.append(1, ToUpper(c));
555 }
556 return upper;
557 }
558
559 std::u32string ToLower(const std::u32string& s)
560 {
561 std::u32string lower;
562 for (char32_t c : s)
563 {
564 lower.append(1, ToLower(c));
565 }
566 return lower;
567 }
568
569 std::string MakeCanonicalPropertyName(const std::string& s)
570 {
571 std::string propertyName;
572 for (char c : s)
573 {
574 if (c != '_' && c != ' ' && c != '-')
575 {
576 propertyName.append(1, c);
577 }
578 }
579 return soulng::util::ToLower(propertyName);
580 }
581
582 BinaryProperty::BinaryProperty(BinaryPropertyId id_, const std::string& shortName_, const std::string& longName_) : id(id_), shortName(shortName_), longName(longName_)
583 {
584 }
585
586 void BinaryPropertyTable::Init()
587 {
588 instance.reset(new BinaryPropertyTable());
589 }
590
591 void BinaryPropertyTable::Done()
592 {
593 instance.reset();
594 }
595
596 std::unique_ptr<BinaryPropertyTable> BinaryPropertyTable::instance;
597
598 BinaryPropertyTable::BinaryPropertyTable()
599 {
600 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::asciiHexDigit, "AHex", "Ascii Hex Digit"));
601 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::alphabetic, "Alpha", "Alphabetic"));
602 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::bidiControl, "Bidi C", "Bidi Control"));
603 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::bidiMirrored, "Bidi M", "Bidi Mirrored"));
604 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::cased, "Cased", "Cased"));
605 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::compositionExclusion, "CE", "Composition Exclusion"));
606 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::caseIgnorable, "CI", "Case Ignorable"));
607 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::fullCompositionExclusion, "Comp Ex", "Full Composition Exclusion"));
608 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::changesWhenCasefolded, "CWCF", "Changes When Casefolded"));
609 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::changesWhenCaseMapped, "CWCM", "Changes When Casemapped"));
610 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::changesWhenNFKCCasefolded, "CWKCF", "Changes When NFKC Casefolded"));
611 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::changesWhenLowercased, "CWL", "Changes When Lowercased"));
612 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::changesWhenTitlecased, "CWT", "Changes When Titlecased"));
613 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::changesWhenUppercased, "CWU", "Changes When Uppercased"));
614 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::dash, "Dash", "Dash"));
615 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::deprecated, "Dep", "Deprecated"));
616 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::defaultIgnorableCodePoint, "DI", "Default Ignorable Code Point"));
617 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::diacritic, "Dia", "Diacritic"));
618 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::extender, "Ext", "Extender"));
619 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::graphemeBase, "Gr Base", "Grapheme Base"));
620 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::graphemeExtend, "Gr Ext", "Grapheme Extend"));
621 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::graphemeLink, "Gr Link", "Grapheme Link"));
622 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::hexDigit, "Hex", "Hex Digit"));
623 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::hyphen, "Hyphen", "Hyphen"));
624 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::idContinue, "IDC", "ID Continue"));
625 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::ideographic, "Ideo", "Ideographic"));
626 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::idStart, "IDS", "ID Start"));
627 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::idsBinaryOperator, "IDSB", "IDS Binary Operator"));
628 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::idsTrinaryOperator, "IDST", "IDS Trinary Operator"));
629 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::joinControl, "Join C", "Join Control"));
630 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::logicalOrderException, "LOE", "Logical Order Exception"));
631 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::lowercase, "Lower", "Lowercase"));
632 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::math, "Math", "Math"));
633 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::noncharacterCodePoint, "NChar", "Noncharacter Code Point"));
634 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::otherAlphabetic, "OAlpha", "Other Alphabetic"));
635 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::otherDefaultIgnorableCodePoint, "ODI", "Other Default Ignorable Code Point"));
636 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::otherGraphemeExtend, "OGr Ext", "Other Grapheme Extend"));
637 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::otherIdContinue, "OIDC", "Other ID Continue"));
638 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::otherIdStart, "OIDS", "Other ID Start"));
639 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::otherLowercase, "OLower", "Other Lowercase"));
640 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::otherMath, "OMath", "Other Math"));
641 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::otherUppercase, "OUpper", "Other Uppercase"));
642 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::patternSyntax, "Pat Syn", "Pattern Syntax"));
643 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::patternWhiteSpace, "Pat WS", "Pattern White Space"));
644 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::prependedConcatenationMark, "PCM", "Prepended Concatenation Mark"));
645 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::quotationMark, "QMark", "Quotation Mark"));
646 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::radical, "Radical", "Radical"));
647 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::softDotted, "SD", "Soft Dotted"));
648 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::sentenceterminal, "STerm", "Sentence Terminal"));
649 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::terminalPunctuation, "Term", "Terminal Punctuation"));
650 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::unifiedIdeograph, "UIdeo", "Unified Ideograph"));
651 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::uppercase, "Upper", "Uppercase"));
652 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::variationSelector, "VS", "Variation Selector"));
653 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::whiteSpace, "WSpace", "White Space"));
654 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::xidContinue, "XIDC", "XID Continue"));
655 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::xidStart, "XIDS", "XID Start"));
656 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::expandsOnNFC, "XO NFC", "Expands On NFC"));
657 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::expandsOnNFD, "XO NFD", "Expands On NFD"));
658 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::expandsOnNFKC, "XO NFKC", "Expands On NFKC"));
659 binaryProperties.push_back(BinaryProperty(BinaryPropertyId::expandsOnNFKD, "XO NFKD", "Expands On NFKD"));
660
661 for (const BinaryProperty& binaryProperty : binaryProperties)
662 {
663 binaryPropertyIdMap[binaryProperty.Id()] = &binaryProperty;
664 shortNameMap[MakeCanonicalPropertyName(binaryProperty.ShortName())] = &binaryProperty;
665 longNameMap[MakeCanonicalPropertyName(binaryProperty.LongName())] = &binaryProperty;
666 }
667 }
668
669 const BinaryProperty& BinaryPropertyTable::GetBinaryProperty(BinaryPropertyId binaryPropertyId) const
670 {
671 auto it = binaryPropertyIdMap.find(binaryPropertyId);
672 if (it != binaryPropertyIdMap.cend())
673 {
674 return *it->second;
675 }
676 else
677 {
678 throw UnicodeException("binary property " + std::to_string(static_cast<int>(binaryPropertyId)) + " not found");
679 }
680 }
681
682 bool BinaryPropertyTable::IsBinaryProperty(const std::string& shortName) const
683 {
684 return shortNameMap.find(MakeCanonicalPropertyName(shortName)) != shortNameMap.cend();
685 }
686
687 const BinaryProperty& BinaryPropertyTable::GetBinaryPropertyByShortName(const std::string& shortName) const
688 {
689 auto it = shortNameMap.find(MakeCanonicalPropertyName(shortName));
690 if (it != shortNameMap.cend())
691 {
692 return *it->second;
693 }
694 else
695 {
696 throw UnicodeException("binary property '" + shortName + "' not found");
697 }
698 }
699
700 const BinaryProperty& BinaryPropertyTable::GetBinaryPropertyByLongName(const std::string& longName) const
701 {
702 auto it = longNameMap.find(MakeCanonicalPropertyName(longName));
703 if (it != longNameMap.cend())
704 {
705 return *it->second;
706 }
707 else
708 {
709 throw UnicodeException("binary property '" + longName + "' not found");
710 }
711 }
712
713 Block::Block(BlockId id_, const std::string& shortName_, const std::string& longName_, char32_t start_, char32_t end_) : id(id_), shortName(shortName_), longName(longName_), start(start_), end(end_)
714 {
715 }
716
717 std::unique_ptr<BlockTable> BlockTable::instance;
718
719 void BlockTable::Init()
720 {
721 instance.reset(new BlockTable());
722 }
723
724 void BlockTable::Done()
725 {
726 instance.reset();
727 }
728
729 BlockTable::BlockTable()
730 {
731 blocks.push_back(Block(BlockId::ascii, "ASCII", "Basic Latin", 0x0000, 0x007F));
732 blocks.push_back(Block(BlockId::latin1Sup, "Latin 1 Sup", "Latin-1 Supplement", 0x0080, 0x00FF));
733 blocks.push_back(Block(BlockId::latinExtA, "Latin Ext A", "Latin Extended-A", 0x0100, 0x017F));
734 blocks.push_back(Block(BlockId::latinExtB, "Latin Ext B", "Latin Extended-B", 0x0180, 0x0024F));
735 blocks.push_back(Block(BlockId::ipaExt, "IPA Ext", "IPA Extensions", 0x0250, 0x02AF));
736 blocks.push_back(Block(BlockId::modifierLetters, "Modifier Letters", "Spacing Modifier Letters", 0x02B0, 0x02FF));
737 blocks.push_back(Block(BlockId::diacriticals, "Diacriticals", "Combining Diacritical Marks", 0x0300, 0x036F));
738 blocks.push_back(Block(BlockId::greek, "Greek", "Greek and Coptic", 0x0370, 0x03FF));
739 blocks.push_back(Block(BlockId::cyrillic, "Cyrillic", "Cyrillic", 0x0400, 0x04FF));
740 blocks.push_back(Block(BlockId::cyrillicSup, "Cyrillic Sup", "Cyrillic Supplement", 0x0500, 0x052F));
741 blocks.push_back(Block(BlockId::armenian, "Armenian", "Armenian", 0x0530, 0x058F));
742 blocks.push_back(Block(BlockId::hebrew, "Hebrew", "Hebrew", 0x0590, 0x05FF));
743 blocks.push_back(Block(BlockId::arabic, "Arabic", "Arabic", 0x0600, 0x06FF));
744 blocks.push_back(Block(BlockId::syriac, "Syriac", "Syriac", 0x0700, 0x074F));
745 blocks.push_back(Block(BlockId::arabicSup, "Arabic Sup", "Arabic Supplement", 0x0750, 0x077F));
746 blocks.push_back(Block(BlockId::thaana, "Thaana", "Thaana", 0x0780, 0x07BF));
747 blocks.push_back(Block(BlockId::nko, "Nko", "Nko", 0x07C0, 0x07FF));
748 blocks.push_back(Block(BlockId::samaritan, "Samaritan", "Samaritan", 0x0800, 0x083F));
749 blocks.push_back(Block(BlockId::mandaic, "Mandaic", "Mandaic", 0x0840, 0x085F));
750 blocks.push_back(Block(BlockId::syriacSup, "Syriac Sup", "Syriac Supplement", 0x0860, 0x086F));
751 blocks.push_back(Block(BlockId::arabicExtA, "Arabic Ext A", "Arabic Extended-A", 0x08A0, 0x08FF));
752 blocks.push_back(Block(BlockId::devanagari, "Devanagari", "Devanagari", 0x0900, 0x097F));
753 blocks.push_back(Block(BlockId::bengali, "Bengali", "Bengali", 0x0980, 0x09FF));
754 blocks.push_back(Block(BlockId::gurmukhi, "Gurmukhi", "Gurmukhi", 0x0A00, 0x0A7F));
755 blocks.push_back(Block(BlockId::gujarati, "Gujarati", "Gujarati", 0x0A80, 0x0AFF));
756 blocks.push_back(Block(BlockId::oriya, "Oriya", "Oriya", 0x0B00, 0x0B7F));
757 blocks.push_back(Block(BlockId::tamil, "Tamil", "Tamil", 0x0B80, 0x0BFF));
758 blocks.push_back(Block(BlockId::telugu, "Telugu", "Telugu", 0x0C00, 0x0C7F));
759 blocks.push_back(Block(BlockId::kannada, "Kannada", "Kannada", 0x0C80, 0x0CFF));
760 blocks.push_back(Block(BlockId::malayalam, "Malayalam", "Malayalam", 0x0D00, 0x0D7F));
761 blocks.push_back(Block(BlockId::sinhala, "Sinhala", "Sinhala", 0x0D80, 0x0DFF));
762 blocks.push_back(Block(BlockId::thai, "Thai", "Thai", 0x0E00, 0x0E7F));
763 blocks.push_back(Block(BlockId::lao, "Lao", "Lao", 0x0E80, 0x0EFF));
764 blocks.push_back(Block(BlockId::tibetan, "Tibetan", "Tibetan", 0x0F00, 0x0FFF));
765 blocks.push_back(Block(BlockId::myanmar, "Myanmar", "Myanmar", 0x1000, 0x109F));
766 blocks.push_back(Block(BlockId::georgian, "Georgian", "Georgian", 0x10A0, 0x10FF));
767 blocks.push_back(Block(BlockId::jamo, "Jamo", "Hangul Jamo", 0x1100, 0x11FF));
768 blocks.push_back(Block(BlockId::ethiopic, "Ethiopic", "Ethiopic", 0x1200, 0x137F));
769 blocks.push_back(Block(BlockId::ethiopicSup, "Ethiopic Sup", "Ethiopic Supplement", 0x1380, 0x139F));
770 blocks.push_back(Block(BlockId::cherokee, "Cherokee", "Cherokee", 0x13A0, 0x13FF));
771 blocks.push_back(Block(BlockId::ucas, "UCAS", "Unified Canadian Aboriginal Syllabics", 0x1400, 0x167F));
772 blocks.push_back(Block(BlockId::ogham, "Ogham", "Ogham", 0x1680, 0x169F));
773 blocks.push_back(Block(BlockId::runic, "Runic", "Runic", 0x16A0, 0x16FF));
774 blocks.push_back(Block(BlockId::tagalog, "Tagalog", "Tagalog", 0x1700, 0x171F));
775 blocks.push_back(Block(BlockId::hanunoo, "Hanunoo", "Hanunoo", 0x1720, 0x173F));
776 blocks.push_back(Block(BlockId::buhid, "Buhid", "Buhid", 0x1740, 0x175F));
777 blocks.push_back(Block(BlockId::tagbanwa, "Tagbanwa", "Tagbanwa", 0x1760, 0x177F));
778 blocks.push_back(Block(BlockId::khmer, "Khmer", "Khmer", 0x1780, 0x17FF));
779 blocks.push_back(Block(BlockId::mongolian, "Mongolian", "Mongolian", 0x1800, 0x18AF));
780 blocks.push_back(Block(BlockId::ucasExt, "UCAS Ext", "Unified Canadian Aboriginal Syllabics Extended", 0x18B0, 0x18FF));
781 blocks.push_back(Block(BlockId::limbu, "Limbu", "Limbu", 0x1900, 0x194F));
782 blocks.push_back(Block(BlockId::taiLe, "Tai Le", "Tai Le", 0x1950, 0x197F));
783 blocks.push_back(Block(BlockId::newTaiLue, "New Tai Lue", "New Tai Lue", 0x1980, 0x19DF));
784 blocks.push_back(Block(BlockId::khmerSymbols, "Khmer Symbols", "Khmer Symbols", 0x19E0, 0x19FF));
785 blocks.push_back(Block(BlockId::buginese, "Buginese", "Buginese", 0x1A00, 0x1A1F));
786 blocks.push_back(Block(BlockId::taiTham, "Tai Tham", "Tai Tham", 0x1A20, 0x1AAF));
787 blocks.push_back(Block(BlockId::diacriticalsExt, "Diacriticals Ext", "Combining Diacritical Marks Extended", 0x1AB0, 0x1AFF));
788 blocks.push_back(Block(BlockId::balinese, "Balinese", "Balinese", 0x1B00, 0x1B7F));
789 blocks.push_back(Block(BlockId::sundanese, "Sundanese", "Sundanese", 0x1B80, 0x1BBF));
790 blocks.push_back(Block(BlockId::batak, "Batak", "Batak", 0x1BC0, 0x1BFF));
791 blocks.push_back(Block(BlockId::lepcha, "Lepcha", "Lepcha", 0x1C00, 0x1C4F));
792 blocks.push_back(Block(BlockId::olChiki, "Ol Chiki", "Ol Chiki", 0x1C50, 0x1C7F));
793 blocks.push_back(Block(BlockId::cyrillicExtC, "Cyrillic Ext C", "Cyrillic Extended-C", 0x1C80, 0x1C8F));
794 blocks.push_back(Block(BlockId::georgianExt, "Georgian Ext", "Georgian Extended", 0x1C90, 0x1CBF));
795 blocks.push_back(Block(BlockId::sundaneseSup, "Sundanese Sup", "Sundanese Supplement", 0x1CC0, 0x1CCF));
796 blocks.push_back(Block(BlockId::vedicExt, "Vedic Ext", "Vedic Extensions", 0x1CD0, 0x1CFF));
797 blocks.push_back(Block(BlockId::phoneticExt, "Phonetic Ext", "Phonetic Extensions", 0x1D00, 0x1D7F));
798 blocks.push_back(Block(BlockId::phoneticExtSup, "Phonetic Ext Sup", "Phonetic Extensions Supplement", 0x1D80, 0x1DBF));
799 blocks.push_back(Block(BlockId::diacriticalsSup, "Diacriticals Sup", "Combining Diacritical Marks Supplement", 0x1DC0, 0x1DFF));
800 blocks.push_back(Block(BlockId::latinExtAdditional, "Latin Ext Additional", "Latin Extended Additional", 0x1E00, 0x1EFF));
801 blocks.push_back(Block(BlockId::greekExt, "Greek Ext", "Greek Extended", 0x1F00, 0x1FFF));
802 blocks.push_back(Block(BlockId::punctuation, "Punctuation", "General Punctuation", 0x2000, 0x206F));
803 blocks.push_back(Block(BlockId::superAndSub, "Super And Sub", "Superscripts and Subscripts", 0x2070, 0x209F));
804 blocks.push_back(Block(BlockId::currencySymbols, "Currency Symbols", "Currency Symbols", 0x20A0, 0x20CF));
805 blocks.push_back(Block(BlockId::diariticalsForSymbols, "Diacriticals For Symbols", "Combining Diacritical Marks for Symbols", 0x20D0, 0x20FF));
806 blocks.push_back(Block(BlockId::letterlikeSymbols, "Letterlike Symbols", "Letterlike Symbols", 0x2100, 0x214F));
807 blocks.push_back(Block(BlockId::numberForms, "Number Forms", "Number Forms", 0x2150, 0x218F));
808 blocks.push_back(Block(BlockId::arrows, "Arrows", "Arrows", 0x2190, 0x21FF));
809 blocks.push_back(Block(BlockId::mathOperators, "Math Operators", "Mathematical Operators", 0x2200, 0x22FF));
810 blocks.push_back(Block(BlockId::miscTechnical, "Misc Technical", "Miscellaneous Technical", 0x2300, 0x23FF));
811 blocks.push_back(Block(BlockId::controlPictures, "Control Pictures", "Control Pictures", 0x2400, 0x243F));
812 blocks.push_back(Block(BlockId::ocr, "OCR", "Optical Character Regognition", 0x2440, 0x245F));
813 blocks.push_back(Block(BlockId::enclosedAlphanum, "Enclosed Alphanum", "Enclosed Alphanumerics", 0x2460, 0x24FF));
814 blocks.push_back(Block(BlockId::boxDrawing, "Box Drawing", "Box Drawing", 0x2500, 0x257F));
815 blocks.push_back(Block(BlockId::blockElements, "Block Elements", "Block Elements", 0x2580, 0x259F));
816 blocks.push_back(Block(BlockId::geometricShapes, "Geometric Shapes", "Geometric Shapes", 0x25A0, 0x25FF));
817 blocks.push_back(Block(BlockId::miscSymbols, "Misc Symbols", "Miscellaneous Symbols", 0x2600, 0x26FF));
818 blocks.push_back(Block(BlockId::dingbats, "Dingbats", "Dingbats", 0x2700, 0x27BF));
819 blocks.push_back(Block(BlockId::miscMathSymbolsA, "Misc Math Symbols A", "Miscellaneous Mathematical Symbols - A", 0x27C0, 0x27EF));
820 blocks.push_back(Block(BlockId::supArrowsA, "Sup Arrows A", "Supplemental Arrows-A", 0x27F0, 0x27FF));
821 blocks.push_back(Block(BlockId::braille, "Braille", "Braille Patterns", 0x2800, 0x28FF));
822 blocks.push_back(Block(BlockId::supArrowsB, "Sup Arrows B", "Supplemental Arrows-B", 0x2900, 0x297F));
823 blocks.push_back(Block(BlockId::miscMathSymbolsB, "Misc Math Symbols B", "Miscellaneous Mathematical Symbols-B", 0x2980, 0x29FF));
824 blocks.push_back(Block(BlockId::supMathOperators, "Sup Math Operators", "Supplemental Mathematical Operators", 0x2A00, 0x2AFF));
825 blocks.push_back(Block(BlockId::miscArrows, "Misc Arrows", "Miscellaneous Symbols and Arrows", 0x2B00, 0x2BFF));
826 blocks.push_back(Block(BlockId::glagolitic, "Glagolitic", "Glagolitic", 0x2C00, 0x2C5F));
827 blocks.push_back(Block(BlockId::latinExtC, "Latin Ext C", "Latin Extended-C", 0x2C60, 0x2C7F));
828 blocks.push_back(Block(BlockId::coptic, "Coptic", "Coptic", 0x2C80, 0x2CFF));
829 blocks.push_back(Block(BlockId::georgianSup, "Georgian Sup", "Georgian Supplement", 0x2D00, 0x2D2F));
830 blocks.push_back(Block(BlockId::tifinagh, "Tifinagh", "Tifinagh", 0x2D30, 0x2D7F));
831 blocks.push_back(Block(BlockId::ethiopicExt, "Ethiopic Ext", "Ethiopic Extended", 0x2D80, 0x2DDF));
832 blocks.push_back(Block(BlockId::cyrillicExtA, "Cyrillic Ext A", "Cyrillic Extended-A", 0x2DE0, 0x2DFF));
833 blocks.push_back(Block(BlockId::supPunctuation, "Sup Punctuation", "Supplemental Punctuation", 0x2E00, 0x2E7F));
834 blocks.push_back(Block(BlockId::cjkRadicalsSup, "CJK Radicals Sup", "CJK Radicals Supplement", 0x2E80, 0x2EFF));
835 blocks.push_back(Block(BlockId::kangxi, "Kangxi", "Kangxi Radicals", 0x2F00, 0x2FDF));
836 blocks.push_back(Block(BlockId::idc, "IDC", "Ideographic Description Characters", 0x2FF0, 0x2FFF));
837 blocks.push_back(Block(BlockId::cjkSymbols, "CJK Symbols", "CJK Symbols and Punctuation", 0x3000, 0x303F));
838 blocks.push_back(Block(BlockId::hiragana, "Hiragana", "Hiragana", 0x3040, 0x309F));
839 blocks.push_back(Block(BlockId::katakana, "Katakana", "Katakana", 0x30A0, 0x30FF));
840 blocks.push_back(Block(BlockId::bopomofo, "Bopomofo", "Bopomofo", 0x3100, 0x312F));
841 blocks.push_back(Block(BlockId::compatJamo, "Compat Jamo", "Hangul Compatibility Jamo", 0x3130, 0x318F));
842 blocks.push_back(Block(BlockId::kanbun, "Kanbun", "Kanbun", 0x3190, 0x319F));
843 blocks.push_back(Block(BlockId::bopomofoExt, "Bopomofo Ext", "Bopomofo Extended", 0x31A0, 0x31BF));
844 blocks.push_back(Block(BlockId::cjkStrokes, "CJK Strokes", "CJK Strokes", 0x31C0, 0x31EF));
845 blocks.push_back(Block(BlockId::katakanaExt, "Katakana Ext", "Katakana Phonetic Extensions", 0x31F0, 0x31FF));
846 blocks.push_back(Block(BlockId::enclosedCjk, "Enclosed CJK", "Enclosed CJK Letters and Months", 0x3200, 0x32FF));
847 blocks.push_back(Block(BlockId::cjkCompat, "CJK Compat", "CJK Compatibility", 0x3300, 0x33FF));
848 blocks.push_back(Block(BlockId::cjkExtA, "CJK Ext A", "CJK Unified Ideographic Extension A", 0x3400, 0x4DBF));
849 blocks.push_back(Block(BlockId::yijing, "Yijing", "Yijing Hexagram Symbols", 0x4DC0, 0x4DFF));
850 blocks.push_back(Block(BlockId::cjk, "CJK", "CJK Unified Ideographs", 0x4E00, 0x9FFF));
851 blocks.push_back(Block(BlockId::yiSyllables, "Yi Syllables", "Yi Syllables", 0xA000, 0xA48F));
852 blocks.push_back(Block(BlockId::yiRadicals, "Yi Radicals", "Yi Radicals", 0xA090, 0xA4CF));
853 blocks.push_back(Block(BlockId::lisu, "Lisu", "Lisu", 0xA0D0, 0xA4FF));
854 blocks.push_back(Block(BlockId::vai, "Vai", "Vai", 0xA500, 0xA63F));
855 blocks.push_back(Block(BlockId::cyrillicExtB, "Cyrillic Ext B", "Cyrillic Extended-B", 0xA640, 0xA69F));
856 blocks.push_back(Block(BlockId::bamum, "Bamum", "Bamum", 0xA6A0, 0xA6FF));
857 blocks.push_back(Block(BlockId::modifierToneLetters, "Modifier Tone Letters", "Modifier Tone Letters", 0xA700, 0xA71F));
858 blocks.push_back(Block(BlockId::latinExtD, "Latin Ext D", "Latin Extended-D", 0xA720, 0xA7FF));
859 blocks.push_back(Block(BlockId::sylotiNagri, "Syloti Nagri", "Syloti Nagri", 0xA800, 0xA82F));
860 blocks.push_back(Block(BlockId::indicNumberForms, "Indic Number Forms", "Common Indic Number Forms", 0xA830, 0xA83F));
861 blocks.push_back(Block(BlockId::phagsPa, "Phags Pa", "Phags-Pa", 0xA840, 0xA87F));
862 blocks.push_back(Block(BlockId::saurashtra, "Saurashtra", "Saurashtra", 0xA880, 0xA8DF));
863 blocks.push_back(Block(BlockId::devanagariExt, "Devanagari Ext", "Devanagari Extended", 0xA8E0, 0xA8FF));
864 blocks.push_back(Block(BlockId::kayahLi, "Kayah Li", "Kayah Li", 0xA900, 0xA92F));
865 blocks.push_back(Block(BlockId::rejang, "Rejang", "Rejang", 0xA930, 0xA95F));
866 blocks.push_back(Block(BlockId::jamoExtA, "Jamo Ext A", "Hangul Jamo Extended-A", 0xA960, 0xA97F));
867 blocks.push_back(Block(BlockId::javanese, "Javanese", "Javanese", 0xA980, 0xA9DF));
868 blocks.push_back(Block(BlockId::myanmarExtB, "Myanmar Ext B", "Myanmar Extended - B", 0xA9E0, 0xA9FF));
869 blocks.push_back(Block(BlockId::cham, "Cham", "Cham", 0xAA00, 0xAA5F));
870 blocks.push_back(Block(BlockId::myanmarExtA, "Myanmar Ext A", "Myanmar Extended-A", 0xAA60, 0xAA7F));
871 blocks.push_back(Block(BlockId::taiViet, "Tai Viet", "Tai Viet", 0xAA80, 0xAADF));
872 blocks.push_back(Block(BlockId::meeteiMayekExt, "Meetei Mayek Ext", "Meetei Mayek Extensions", 0xAAE0, 0xAAFF));
873 blocks.push_back(Block(BlockId::ethiopicExtA, "Ethiopic Ext A", "Ethiopic Extended-A", 0xAB00, 0xAB2F));
874 blocks.push_back(Block(BlockId::latinExtE, "Latin Ext E", "Latin Extended-E", 0xAB30, 0xAB6F));
875 blocks.push_back(Block(BlockId::cherokeeSup, "Cherokee Sup", "Cherokee Supplement", 0xAB70, 0xABBF));
876 blocks.push_back(Block(BlockId::meeteiMayek, "Meetei Mayek", "Meetei Mayek", 0xABC0, 0xABFF));
877 blocks.push_back(Block(BlockId::hangul, "Hangul", "Hangul Syllables", 0xAC00, 0xD7AF));
878 blocks.push_back(Block(BlockId::jamoExtB, "Jamo Ext B", "Hangul Jamo Extended-B", 0xD7B0, 0xD7FF));
879 blocks.push_back(Block(BlockId::highSurrogates, "High Surrogates", "High Surrogates", 0xD800, 0xDB7F));
880 blocks.push_back(Block(BlockId::highPuSurrogates, "High PU Surrogates", "High Private Use Surrogates", 0xDB80, 0xDBFF));
881 blocks.push_back(Block(BlockId::lowSurrogates, "Low Surrogates", "Low Surrogates", 0xDC00, 0xDFFF));
882 blocks.push_back(Block(BlockId::pua, "PUA", "Private Use Area", 0xE000, 0xF8FF));
883 blocks.push_back(Block(BlockId::cjkCompatIdeographs, "CJK Compat Ideographs", "CJK Compatibility Ideographs", 0xF900, 0xFAFF));
884 blocks.push_back(Block(BlockId::alphabeticPf, "Alphabetic PF", "Alphabetic Presentations Forms", 0xFB00, 0xFB4F));
885 blocks.push_back(Block(BlockId::arabicPfA, "Arabic PF A", "Arabic Presentation Forms-A", 0xFB50, 0xFDFF));
886 blocks.push_back(Block(BlockId::vs, "VS", "Variation Selectors", 0xFE00, 0xFE0F));
887 blocks.push_back(Block(BlockId::verticalForms, "Vertical Forms", "Vertical Forms", 0xFE10, 0xFE1F));
888 blocks.push_back(Block(BlockId::halfMarks, "Half Marks", "Combining Half Marks", 0xFE20, 0xFE2F));
889 blocks.push_back(Block(BlockId::cjkCompatForms, "CJK Compat Forms", "CJK Compatibility Forms", 0xFE30, 0xFE4F));
890 blocks.push_back(Block(BlockId::smallForms, "Small Forms", "Small Form Variants", 0xFE50, 0xFE6F));
891 blocks.push_back(Block(BlockId::arabicPfB, "Arabic PF B", "Arabic Presentation Forms-B", 0xFE70, 0xFEFF));
892 blocks.push_back(Block(BlockId::halfAndFullForms, "Half And Full Forms", "Halfwidth and Fullwidth Forms", 0xFF00, 0xFFEF));
893 blocks.push_back(Block(BlockId::specials, "Specials", "Specials", 0xFFF0, 0xFFFF));
894 blocks.push_back(Block(BlockId::linearBSyllabary, "Linear B Syllabary", "Linear B Syllabary", 0x10000, 0x1007F));
895 blocks.push_back(Block(BlockId::linearBIdeograms, "Linear B Ideograms", "Linear B Ideograms", 0x10080, 0x100FF));
896 blocks.push_back(Block(BlockId::aegeanNumbers, "Aegean Numbers", "Aegean Numbers", 0x10100, 0x1013F));
897 blocks.push_back(Block(BlockId::ancientGreekNumbers, "Ancient Greek Numbers", "Ancient Greek Numbers", 0x10140, 0x1018F));
898 blocks.push_back(Block(BlockId::ancientSymbols, "Ancient Symbols", "Ancient Symbols", 0x10190, 0x101CF));
899 blocks.push_back(Block(BlockId::phaistos, "Phaistos", "Phaistos Disc", 0x101D0, 0x101FF));
900 blocks.push_back(Block(BlockId::lycian, "Lycian", "Lycian", 0x10280, 0x1029F));
901 blocks.push_back(Block(BlockId::carian, "Carian", "Carian", 0x102A0, 0x102DF));
902 blocks.push_back(Block(BlockId::copticEpactNumbers, "Coptic Epact Numbers", "Coptic Epact Numbers", 0x102E0, 0x102FF));
903 blocks.push_back(Block(BlockId::oldItalic, "Old Italic", "Old Italic", 0x10300, 0x1032F));
904 blocks.push_back(Block(BlockId::gothic, "Gothic", "Gothic", 0x10330, 0x1034F));
905 blocks.push_back(Block(BlockId::oldPermic, "Old Permic", "Old Permic", 0x10350, 0x1037F));
906 blocks.push_back(Block(BlockId::ugaritic, "Ugaritic", "Ugaritic", 0x10380, 0x1039F));
907 blocks.push_back(Block(BlockId::oldPersian, "Old Persian", "Old Persian", 0x103A0, 0x103DF));
908 blocks.push_back(Block(BlockId::deseret, "Deseret", "Deseret", 0x10400, 0x1044F));
909 blocks.push_back(Block(BlockId::shavian, "Shavian", "Shavian", 0x10450, 0x1047F));
910 blocks.push_back(Block(BlockId::osmanya, "Osmanya", "Osmanya", 0x10480, 0x104AF));
911 blocks.push_back(Block(BlockId::osage, "Osage", "Osage", 0x104B0, 0x104FF));
912 blocks.push_back(Block(BlockId::elbasan, "Elbasan", "Elbasan", 0x10500, 0x1052F));
913 blocks.push_back(Block(BlockId::caucasianAlbanian, "Caucasian Albanian", "Caucasian Albanian", 0x10530, 0x1056F));
914 blocks.push_back(Block(BlockId::linearA, "Linear A", "Linear A", 0x10600, 0x1077F));
915 blocks.push_back(Block(BlockId::cypriotSyllabary, "Cypriot Syllabary", "Cypriot Syllabary", 0x10800, 0x1083F));
916 blocks.push_back(Block(BlockId::imperialAramaic, "Imperial Aramaic", "Imperial Aramaic", 0x10840, 0x1085F));
917 blocks.push_back(Block(BlockId::palmyrene, "Palmyrene", "Palmyrene", 0x10860, 0x1087F));
918 blocks.push_back(Block(BlockId::nabataean, "Nabataean", "Nabataean", 0x10880, 0x108AF));
919 blocks.push_back(Block(BlockId::hatran, "Hatran", "Hatran", 0x108E0, 0x108FF));
920 blocks.push_back(Block(BlockId::phoenician, "Phoenician", "Phoenician", 0x10900, 0x1091F));
921 blocks.push_back(Block(BlockId::lydian, "Lydian", "Lydian", 0x10920, 0x1093F));
922 blocks.push_back(Block(BlockId::meroiticHieroglyphs, "Meroitic Hieroglyphs", "Meroitic Hieroglyphs", 0x10980, 0x1099F));
923 blocks.push_back(Block(BlockId::meroiticCursive, "Meroitic Cursive", "Meroitic Cursive", 0x109A0, 0x109FF));
924 blocks.push_back(Block(BlockId::kharoshthi, "Kharoshthi", "Kharoshthi", 0x10A00, 0x10A5F));
925 blocks.push_back(Block(BlockId::oldSouthArabian, "Old South Arabian", "Old South Arabian", 0x10A60, 0x10A7F));
926 blocks.push_back(Block(BlockId::oldNorthArabian, "Old North Arabian", "Old North Arabian", 0x10A80, 0x10A9F));
927 blocks.push_back(Block(BlockId::manichean, "Manichaean", "Manichaean", 0x10AC0, 0x10AFF));
928 blocks.push_back(Block(BlockId::avestan, "Avestan", "Avestan", 0x10B00, 0x10B3F));
929 blocks.push_back(Block(BlockId::inscriptionalParthian, "Inscriptional Parthian", "Inscriptional Parthian", 0x10B40, 0x10B5F));
930 blocks.push_back(Block(BlockId::inscriptionalPahlavi, "Inscriptional Pahlavi", "Inscriptional Pahlavi", 0x10B60, 0x10B7F));
931 blocks.push_back(Block(BlockId::psalterPahlavi, "Psalter Pahlavi", "Psalter Pahlavi", 0x10B80, 0x10BAF));
932 blocks.push_back(Block(BlockId::oldTurkic, "Old Turkic", "Old Turkic", 0x10C00, 0x10C4F));
933 blocks.push_back(Block(BlockId::oldHungarian, "Old Hungarian", "Old Hungarian", 0x10C80, 0x10CFF));
934 blocks.push_back(Block(BlockId::hanifiRohingya, "Hanifi Rohingya", "Hanifi Rohingya", 0x10D00, 0x10D3F));
935 blocks.push_back(Block(BlockId::rumi, "Rumi", "Rumi Numeral Symbols", 0x10E60, 0x10E7F));
936 blocks.push_back(Block(BlockId::oldSogdian, "Old Sogdian", "Old Sogdian", 0x10F00, 0x10F2F));
937 blocks.push_back(Block(BlockId::sogdian, "Sogdian", "Sogdian", 0x10F30, 0x10F6F));
938 blocks.push_back(Block(BlockId::elymaic, "Elymaic", "Elymaic", 0x10FE0, 0x10FFF));
939 blocks.push_back(Block(BlockId::brahmi, "Brahmi", "Brahmi", 0x11000, 0x1107F));
940 blocks.push_back(Block(BlockId::kaithi, "Kaithi", "Kaithi", 0x11080, 0x110CF));
941 blocks.push_back(Block(BlockId::soraSompeng, "Sora Sompeng", "Sora Sompeng", 0x110D0, 0x110FF));
942 blocks.push_back(Block(BlockId::chakma, "Chakma", "Chakma", 0x11100, 0x1114F));
943 blocks.push_back(Block(BlockId::mahajani, "Mahajani", "Mahajani", 0x11150, 0x1117F));
944 blocks.push_back(Block(BlockId::sharada, "Sharada", "Sharada", 0x11180, 0x111DF));
945 blocks.push_back(Block(BlockId::sinhalaArchaicNumbers, "Sinhala Archaic Numbers", "Sinhala Archaic Numbers", 0x111E0, 0x111FF));
946 blocks.push_back(Block(BlockId::khojki, "Khojki", "Khojki", 0x11200, 0x1124F));
947 blocks.push_back(Block(BlockId::multani, "Multani", "Multani", 0x11280, 0x112AF));
948 blocks.push_back(Block(BlockId::khudawadi, "Khudawadi", "Khudawadi", 0x112B0, 0x112FF));
949 blocks.push_back(Block(BlockId::grantha, "Grantha", "Grantha", 0x11300, 0x1137F));
950 blocks.push_back(Block(BlockId::newa, "Newa", "Newa", 0x11400, 0x1147F));
951 blocks.push_back(Block(BlockId::tirhuta, "Tirhuta", "Tirhuta", 0x11480, 0x114DF));
952 blocks.push_back(Block(BlockId::siddham, "Siddham", "Siddham", 0x11580, 0x115FF));
953 blocks.push_back(Block(BlockId::modi, "Modi", "Modi", 0x11600, 0x1165F));
954 blocks.push_back(Block(BlockId::mongolianSup, "Mongolian Sup", "Mongolian Supplement", 0x11660, 0x1167F));
955 blocks.push_back(Block(BlockId::takri, "Takri", "Takri", 0x11680, 0x116CF));
956 blocks.push_back(Block(BlockId::ahom, "Ahom", "Ahom", 0x11700, 0x1173F));
957 blocks.push_back(Block(BlockId::dogra, "Dogra", "Dogra", 0x11800, 0x1184F));
958 blocks.push_back(Block(BlockId::warangCiti, "Warang Citi", "Warang Citi", 0x118A0, 0x118FF));
959 blocks.push_back(Block(BlockId::nandinagari, "Nandinagari", "Nandinagari", 0x119A0, 0x119FF));
960 blocks.push_back(Block(BlockId::zanabazarSquare, "Zanabazar Square", "Zanabazar Square", 0x11A00, 0x11A4FF));
961 blocks.push_back(Block(BlockId::soyombo, "Soyombo", "Soyombo", 0x11A50, 0x11AAF));
962 blocks.push_back(Block(BlockId::pauCinHau, "Pau Cin Hau", "Pau Cin Hau", 0x11AC0, 0x11AFF));
963 blocks.push_back(Block(BlockId::bhaisuki, "Bhaiksuki", "Bhaiksuki", 0x11C00, 0x11C6F));
964 blocks.push_back(Block(BlockId::marchen, "Marchen", "Marchen", 0x11C70, 0x11CBF));
965 blocks.push_back(Block(BlockId::masaramGondi, "Masaram Gondi", "Masaram Gondi", 0x11D00, 0x11D5F));
966 blocks.push_back(Block(BlockId::gunjalaGondi, "Gunjala Gondi", "Gunjala Gondi", 0x11D60, 0x11DAF));
967 blocks.push_back(Block(BlockId::makasar, "Makasar", "Makasar", 0x11EE0, 0x11EFF));
968 blocks.push_back(Block(BlockId::tamilSup, "Tamil Sup", "Tamil Sup", 0x11FC0, 0x11FFE));
969 blocks.push_back(Block(BlockId::cuneiform, "Cuneiform", "Cuneiform", 0x12000, 0x123FF));
970 blocks.push_back(Block(BlockId::cuneiformNumbers, "Cuneiform Numbers", "Cuneiform Numbers and Punctuation", 0x12400, 0x1247F));
971 blocks.push_back(Block(BlockId::earlyDynasticCuneiform, "Early Dynastic Cuneiform", "Early Dynastic Cuneiform", 0x12480, 0x1254F));
972 blocks.push_back(Block(BlockId::egyptianHieroglyphs, "Egyptian Hieroglyphs", "Egyptian Hieroglyphs", 0x13000, 0x1342F));
973 blocks.push_back(Block(BlockId::egyptianHieroglyphFormatControls, "Egyptian Hieroglyph Format Controls", "Egyptian Hieroglyph Format Controls", 0x13430, 0x1343F));
974 blocks.push_back(Block(BlockId::anatolianHieroglyphs, "Anatolian Hieroglyphs", "Anatolian Hieroglyphs", 0x14400, 0x1467F));
975 blocks.push_back(Block(BlockId::bamumSup, "Bamum Sup", "Bamum Supplement", 0x16800, 0x16A3F));
976 blocks.push_back(Block(BlockId::mro, "Mro", "Mro", 0x16A40, 0x16A6F));
977 blocks.push_back(Block(BlockId::bassaVah, "Bassa Vah", "Bassa Vah", 0x16AD0, 0x16AFF));
978 blocks.push_back(Block(BlockId::pahawhHmong, "Pahawh Hmong", "Pahawh Hmong", 0x16B00, 0x16B8F));
979 blocks.push_back(Block(BlockId::medefaidrin, "Medefaidrin", "Medefaidrin", 0x16E40, 0x16E9F));
980 blocks.push_back(Block(BlockId::miao, "Miao", "Miao", 0x16F00, 0x16F9F));
981 blocks.push_back(Block(BlockId::ideographicSymbols, "Ideographic Symbols", "Ideographic Symbols and Punctuation", 0x16FE0, 0x16FFF));
982 blocks.push_back(Block(BlockId::tangut, "Tangut", "Tangut", 0x17000, 0x187FF));
983 blocks.push_back(Block(BlockId::tangutComponents, "Tangut Components", "Tangut Components", 0x18800, 0x18AFF));
984 blocks.push_back(Block(BlockId::kanaSup, "Kana Sup", "Kana Supplement", 0x1B000, 0x1B0FF));
985 blocks.push_back(Block(BlockId::kanaExtA, "Kana Ext A", "Kana Extended-A", 0x1B100, 0x1B12F));
986 blocks.push_back(Block(BlockId::smallKanaExt, "Small Kana Ext", "Small Kana Extension", 0x1B130, 0x1B16F));
987 blocks.push_back(Block(BlockId::nushu, "Nushu", "Nushu", 0x1B170, 0x1B2FF));
988 blocks.push_back(Block(BlockId::duployan, "Duployan", "Duployan", 0x1BC00, 0x1BC9F));
989 blocks.push_back(Block(BlockId::shorthandFormatControls, "Shorthand Format Controls", "Shorthand Format Controls", 0x1BCA0, 0x1BCAF));
990 blocks.push_back(Block(BlockId::byzantineMusic, "Byzantine Music", "Byzantine Musical Symbols", 0x1D000, 0x1D0FF));
991 blocks.push_back(Block(BlockId::music, "Music", "Musical Symbols", 0x1D100, 0x1D1FF));
992 blocks.push_back(Block(BlockId::ancientGreekMusic, "Ancient Greek Music", "Ancient Greek Musical Notation", 0x1D200, 0x1D24F));
993 blocks.push_back(Block(BlockId::mayanNumerals, "Mayan Numerals", "Mayan Numerals", 0x1D2E0, 0x1D2FF));
994 blocks.push_back(Block(BlockId::taiXuanJing, "Tai Xuan Jing", "Tai Xuan Jing Symbols", 0x1D300, 0x1D35F));
995 blocks.push_back(Block(BlockId::countingRod, "Counting Rod", "Counting Rod Numerals", 0x1D360, 0x1D37F));
996 blocks.push_back(Block(BlockId::mathAlphanum, "Math Alphanum", "Mathematical Alphanumeric Symbols", 0x1D400, 0x1D7FF));
997 blocks.push_back(Block(BlockId::suttonSignWriting, "Sutton SignWriting", "Sutton SignWriting", 0x1D800, 0x1DAAF));
998 blocks.push_back(Block(BlockId::glagoliticSup, "Glagolitic Sup", "Glagolitic Supplement", 0x1E000, 0x1E02F));
999 blocks.push_back(Block(BlockId::nyiakengPuachueHmong, "Nyiakeng Puachue Hmong", "Nyiakeng Puachue Hmong", 0x1E100, 0x1E14F));
1000 blocks.push_back(Block(BlockId::wancho, "Wancho", "Wancho", 0x1E2C0, 0x1E2FF));
1001 blocks.push_back(Block(BlockId::mendeKikakui, "Mende Kikakui", "Mende Kikakui", 0x1E800, 0x1E8DF));
1002 blocks.push_back(Block(BlockId::adlam, "Adlam", "Adlam", 0x1E900, 0x1E95F));
1003 blocks.push_back(Block(BlockId::indicSiyaqNumbers, "Indic Siyaq Numbers", "Indic Siyaq Numbers", 0x1EC70, 0x1ECBF));
1004 blocks.push_back(Block(BlockId::ottomanSiyaqNumbers, "Ottoman Siyaq Numbers", "Ottoman Siyaq Numbers", 0x1ED00, 0x1ED4F));
1005 blocks.push_back(Block(BlockId::arabicMath, "Arabic Math", "Arabic Mathematical Alphabetic Symbols", 0x1EE00, 0x1EEFF));
1006 blocks.push_back(Block(BlockId::mahjong, "Mahjong", "Mahjong Tiles", 0x1F000, 0x1F02F));
1007 blocks.push_back(Block(BlockId::domino, "Domino", "Domino Tiles", 0x1F030, 0x1F09F));
1008 blocks.push_back(Block(BlockId::playingCards, "Playing Cards", "Playing Cards", 0x1F0A0, 0x1F0FF));
1009 blocks.push_back(Block(BlockId::enclosedAlphanumSup, "Enclosed Alphanum Sup", "Enclosed Alphanumeric Supplement", 0x1F100, 0x1F1FF));
1010 blocks.push_back(Block(BlockId::enclosedIdeographicSup, "Enclosed Ideographic Sup", "Enclosed Ideographic Supplement", 0x1F200, 0x1F2FF));
1011 blocks.push_back(Block(BlockId::miscPictographs, "Misc Pictographs", "Miscellaneous Symbols and Pictographs", 0x1F300, 0x1F5FF));
1012 blocks.push_back(Block(BlockId::emoticons, "Emoticons", "Emoticons", 0x1F600, 0x1F64F));
1013 blocks.push_back(Block(BlockId::ornamentalDingbats, "Ornamental Dingbats", "Ornamental Dingbats", 0x1F650, 0x1F67F));
1014 blocks.push_back(Block(BlockId::transportAndMap, "Transport And Map", "Transport and Map Symbols", 0x1F680, 0x1F6FF));
1015 blocks.push_back(Block(BlockId::alchemical, "Alchemical", "Alchemical Symbols", 0x1F700, 0x1F77F));
1016 blocks.push_back(Block(BlockId::geometricShapesExt, "Geometric Shapes Ext", "Geometric Shapes Extended", 0x1F780, 0x1F7FF));
1017 blocks.push_back(Block(BlockId::supArrowsC, "Sup Arrows C", "Supplemental Arrows-C", 0x1F800, 0x1F8FF));
1018 blocks.push_back(Block(BlockId::supSymbolsAndPictographs, "Sup Symbols And Pictographs", "Supplemental Symbols and Pictographs", 0x1F900, 0x1F9FF));
1019 blocks.push_back(Block(BlockId::chessSymbols, "Chess Symbols", "Chess Symbols", 0x1FA00, 0x1FA6F));
1020 blocks.push_back(Block(BlockId::symbolsAndPictographsExtA, "Symbols And Pictographs Ext A", "Symbols And Pictographs Extended A", 0x1FA70, 0x1FAFF));
1021 blocks.push_back(Block(BlockId::cjkExtB, "CJK Ext B", "CJK Unified Ideographs Extension B", 0x20000, 0x2A6DF));
1022 blocks.push_back(Block(BlockId::cjkExtC, "CJK Ext C", "CJK Unified Ideographs Extension C", 0x2A700, 0x2B73F));
1023 blocks.push_back(Block(BlockId::cjkExtD, "CJK Ext D", "CJK Unified Ideographs Extension D", 0x2B740, 0x2B81F));
1024 blocks.push_back(Block(BlockId::cjkExtE, "CJK Ext E", "CJK Unified Ideographs Extension E", 0x2B820, 0x2CEAF));
1025 blocks.push_back(Block(BlockId::cjkExtF, "CJK Ext F", "CJK Unified Ideographs Extension F", 0x2CEB0, 0x2EBEF));
1026 blocks.push_back(Block(BlockId::cjkCompatIdeographsSup, "CJK Compat Ideographs Sup", "CJK Compatibility Ideographs Supplement", 0x2F800, 0x2FA1F));
1027 blocks.push_back(Block(BlockId::tags, "Tags", "Tags", 0xE0000, 0xE007F));
1028 blocks.push_back(Block(BlockId::vsSup, "VS Sup", "Variation Selectors Supplement", 0xE0100, 0xE01EF));
1029 blocks.push_back(Block(BlockId::supPuaA, "Sup PUA A", "Supplementary Private Use Area-A", 0xF0000, 0xFFFFF));
1030 blocks.push_back(Block(BlockId::supPuaB, "Sup PUA B", "Supplementary Private Use Area-B", 0x100000, 0x10FFFF));
1031
1032 for (const Block& block : blocks)
1033 {
1034 blockIdMap[block.Id()] = █
1035 shortNameMap[MakeCanonicalPropertyName(block.ShortName())] = █
1036 longNameMap[MakeCanonicalPropertyName(block.LongName())] = █
1037 }
1038 }
1039
1040 const Block& BlockTable::GetBlock(BlockId blockId) const
1041 {
1042 auto it = blockIdMap.find(blockId);
1043 if (it != blockIdMap.cend())
1044 {
1045 return *it->second;
1046 }
1047 else
1048 {
1049 throw UnicodeException("block id " + std::to_string(static_cast<int>(blockId)) + " not found");
1050 }
1051 }
1052
1053 const Block& BlockTable::GetBlockByShortName(const std::string& shortName) const
1054 {
1055 auto it = shortNameMap.find(MakeCanonicalPropertyName(shortName));
1056 if (it != shortNameMap.cend())
1057 {
1058 return *it->second;
1059 }
1060 else
1061 {
1062 throw UnicodeException("block '" + shortName + "' not found");
1063 }
1064 }
1065
1066 const Block& BlockTable::GetBlockByLongName(const std::string& longName) const
1067 {
1068 auto it = longNameMap.find(MakeCanonicalPropertyName(longName));
1069 if (it != longNameMap.cend())
1070 {
1071 return *it->second;
1072 }
1073 else
1074 {
1075 throw UnicodeException("block '" + longName + "' not found");
1076 }
1077 }
1078
1079 GeneralCategory::GeneralCategory(GeneralCategoryId id_, const std::string& shortName_, const std::string& longName_) : id(id_), shortName(shortName_), longName(longName_)
1080 {
1081 }
1082
1083 void GeneralCategoryTable::Init()
1084 {
1085 instance.reset(new GeneralCategoryTable());
1086 }
1087
1088 void GeneralCategoryTable::Done()
1089 {
1090 instance.reset();
1091 }
1092
1093 std::unique_ptr<GeneralCategoryTable> GeneralCategoryTable::instance;
1094
1095 GeneralCategoryTable::GeneralCategoryTable()
1096 {
1097 generalCategories.push_back(GeneralCategory(GeneralCategoryId::Lu, "Lu", "Uppercase Letter"));
1098 generalCategories.push_back(GeneralCategory(GeneralCategoryId::Lu, "Ll", "Lowercase Letter"));
1099 generalCategories.push_back(GeneralCategory(GeneralCategoryId::Lt, "Lt", "Titlecase Letter"));
1100 generalCategories.push_back(GeneralCategory(GeneralCategoryId::LC, "LC", "Cased Letter"));
1101 generalCategories.push_back(GeneralCategory(GeneralCategoryId::Lm, "Lm", "Modifier Letter"));
1102 generalCategories.push_back(GeneralCategory(GeneralCategoryId::Lo, "Lo", "Other Letter"));
1103 generalCategories.push_back(GeneralCategory(GeneralCategoryId::L, "L", "Letter"));
1104 generalCategories.push_back(GeneralCategory(GeneralCategoryId::Mn, "Mn", "Nonspacing Mark"));
1105 generalCategories.push_back(GeneralCategory(GeneralCategoryId::Mc, "Mc", "Spacing Mark"));
1106 generalCategories.push_back(GeneralCategory(GeneralCategoryId::Me, "Me", "Enclosing Mark"));
1107 generalCategories.push_back(GeneralCategory(GeneralCategoryId::M, "M", "Mark"));
1108 generalCategories.push_back(GeneralCategory(GeneralCategoryId::Nd, "Nd", "Decimal Number"));
1109 generalCategories.push_back(GeneralCategory(GeneralCategoryId::Nl, "Nl", "Letter Number"));
1110 generalCategories.push_back(GeneralCategory(GeneralCategoryId::No, "No", "Other Number"));
1111 generalCategories.push_back(GeneralCategory(GeneralCategoryId::N, "N", "Number"));
1112 generalCategories.push_back(GeneralCategory(GeneralCategoryId::Pc, "Pc", "Connector Punctuation"));
1113 generalCategories.push_back(GeneralCategory(GeneralCategoryId::Pd, "Pd", "Dash Punctuation"));
1114 generalCategories.push_back(GeneralCategory(GeneralCategoryId::Ps, "Ps", "Open Punctuation"));
1115 generalCategories.push_back(GeneralCategory(GeneralCategoryId::Pe, "Pe", "Close Punctuation"));
1116 generalCategories.push_back(GeneralCategory(GeneralCategoryId::Pi, "Pi", "Initial Punctuation"));
1117 generalCategories.push_back(GeneralCategory(GeneralCategoryId::Pf, "Pf", "Final Punctuation"));
1118 generalCategories.push_back(GeneralCategory(GeneralCategoryId::Po, "Po", "Other Punctuation"));
1119 generalCategories.push_back(GeneralCategory(GeneralCategoryId::P, "P", "Punctuation"));
1120 generalCategories.push_back(GeneralCategory(GeneralCategoryId::Sm, "Sm", "Math Symbol"));
1121 generalCategories.push_back(GeneralCategory(GeneralCategoryId::Sc, "Sc", "Currency Symbol"));
1122 generalCategories.push_back(GeneralCategory(GeneralCategoryId::Sk, "Sk", "Modifier Symbol"));
1123 generalCategories.push_back(GeneralCategory(GeneralCategoryId::So, "So", "Other Symbol"));
1124 generalCategories.push_back(GeneralCategory(GeneralCategoryId::S, "S", "Symbol"));
1125 generalCategories.push_back(GeneralCategory(GeneralCategoryId::Zs, "Zs", "Space Separator"));
1126 generalCategories.push_back(GeneralCategory(GeneralCategoryId::Zl, "Zl", "Line Separator"));
1127 generalCategories.push_back(GeneralCategory(GeneralCategoryId::Zp, "Zp", "Paragraph Separator"));
1128 generalCategories.push_back(GeneralCategory(GeneralCategoryId::Z, "Z", "Separator"));
1129 generalCategories.push_back(GeneralCategory(GeneralCategoryId::Cc, "Cc", "Control"));
1130 generalCategories.push_back(GeneralCategory(GeneralCategoryId::Cf, "Cf", "Format"));
1131 generalCategories.push_back(GeneralCategory(GeneralCategoryId::Cs, "Cs", "Surrogate"));
1132 generalCategories.push_back(GeneralCategory(GeneralCategoryId::Co, "Co", "Private Use"));
1133 generalCategories.push_back(GeneralCategory(GeneralCategoryId::Cn, "Cn", "Unassigned"));
1134 generalCategories.push_back(GeneralCategory(GeneralCategoryId::C, "C", "Other"));
1135 generalCategories.push_back(GeneralCategory(GeneralCategoryId::G, "G", "Graphic"));
1136 generalCategories.push_back(GeneralCategory(GeneralCategoryId::B, "B", "Base"));
1137 for (const GeneralCategory& generalCategory : generalCategories)
1138 {
1139 generalCategoryIdMap[generalCategory.Id()] = &generalCategory;
1140 shortNameMap[MakeCanonicalPropertyName(generalCategory.ShortName())] = &generalCategory;
1141 longNameMap[MakeCanonicalPropertyName(generalCategory.LongName())] = &generalCategory;
1142 }
1143 }
1144
1145 const GeneralCategory& GeneralCategoryTable::GetGeneralCategory(GeneralCategoryId generalCategoryId) const
1146 {
1147 auto it = generalCategoryIdMap.find(generalCategoryId);
1148 if (it != generalCategoryIdMap.cend())
1149 {
1150 return *it->second;
1151 }
1152 else
1153 {
1154 throw UnicodeException("general category " + std::to_string(static_cast<int>(generalCategoryId)) + " not found");
1155 }
1156 }
1157
1158 const GeneralCategory& GeneralCategoryTable::GetGeneralCategoryByShortName(const std::string& shortName) const
1159 {
1160 auto it = shortNameMap.find(MakeCanonicalPropertyName(shortName));
1161 if (it != shortNameMap.cend())
1162 {
1163 return *it->second;
1164 }
1165 else
1166 {
1167 throw UnicodeException("general category '" + shortName + "' not found");
1168 }
1169 }
1170
1171 const GeneralCategory& GeneralCategoryTable::GetGeneralCategoryByLongName(const std::string& longName) const
1172 {
1173 auto it = longNameMap.find(MakeCanonicalPropertyName(longName));
1174 if (it != longNameMap.cend())
1175 {
1176 return *it->second;
1177 }
1178 else
1179 {
1180 throw UnicodeException("general category '" + longName + "' not found");
1181 }
1182 }
1183
1184 Age::Age(AgeId id_, const std::string& version_) : id(id_), version(version_)
1185 {
1186 }
1187
1188 void AgeTable::Init()
1189 {
1190 instance.reset(new AgeTable());
1191 }
1192
1193 void AgeTable::Done()
1194 {
1195 instance.reset();
1196 }
1197
1198 std::unique_ptr<AgeTable> AgeTable::instance;
1199
1200 AgeTable::AgeTable()
1201 {
1202 ages.push_back(Age(AgeId::age_1_1, "1.1"));
1203 ages.push_back(Age(AgeId::age_2_0, "2.0"));
1204 ages.push_back(Age(AgeId::age_2_1, "2.1"));
1205 ages.push_back(Age(AgeId::age_3_0, "3.0"));
1206 ages.push_back(Age(AgeId::age_3_1, "3.1"));
1207 ages.push_back(Age(AgeId::age_3_2, "3.2"));
1208 ages.push_back(Age(AgeId::age_4_0, "4.0"));
1209 ages.push_back(Age(AgeId::age_4_1, "4.1"));
1210 ages.push_back(Age(AgeId::age_5_0, "5.0"));
1211 ages.push_back(Age(AgeId::age_5_1, "5.1"));
1212 ages.push_back(Age(AgeId::age_5_2, "5.2"));
1213 ages.push_back(Age(AgeId::age_6_0, "6.0"));
1214 ages.push_back(Age(AgeId::age_6_1, "6.1"));
1215 ages.push_back(Age(AgeId::age_6_2, "6.2"));
1216 ages.push_back(Age(AgeId::age_6_3, "6.3"));
1217 ages.push_back(Age(AgeId::age_7_0, "7.0"));
1218 ages.push_back(Age(AgeId::age_8_0, "8.0"));
1219 ages.push_back(Age(AgeId::age_9_0, "9.0"));
1220 ages.push_back(Age(AgeId::age_10_0, "10.0"));
1221 ages.push_back(Age(AgeId::age_11_0, "11.0"));
1222 ages.push_back(Age(AgeId::age_12_0, "12.0"));
1223 ages.push_back(Age(AgeId::age_12_1, "12.1"));
1224 for (const Age& age : ages)
1225 {
1226 ageIdMap[age.Id()] = &age;
1227 versionMap[age.Version()] = &age;
1228 }
1229 }
1230
1231 const Age& AgeTable::GetAge(AgeId id) const
1232 {
1233 auto it = ageIdMap.find(id);
1234 if (it != ageIdMap.cend())
1235 {
1236 return *it->second;
1237 }
1238 else
1239 {
1240 throw UnicodeException("Unicode age " + std::to_string(static_cast<int>(id)) + " not found");
1241 }
1242 }
1243
1244 const Age& AgeTable::GetAge(const std::string& version) const
1245 {
1246 auto it = versionMap.find(version);
1247 if (it != versionMap.cend())
1248 {
1249 return *it->second;
1250 }
1251 else
1252 {
1253 throw UnicodeException("Unicode age '" + version + "' not found");
1254 }
1255 }
1256
1257 Script::Script(ScriptId id_, const std::string& shortName_, const std::string& longName_) : id(id_), shortName(shortName_), longName(longName_)
1258 {
1259 }
1260
1261 void ScriptTable::Init()
1262 {
1263 instance.reset(new ScriptTable());
1264 }
1265
1266 void ScriptTable::Done()
1267 {
1268 instance.reset();
1269 }
1270
1271 std::unique_ptr<ScriptTable> ScriptTable::instance;
1272
1273 ScriptTable::ScriptTable()
1274 {
1275 scripts.push_back(Script(ScriptId::adlm, "Adlm", "Adlam"));
1276 scripts.push_back(Script(ScriptId::aghb, "Aghb", "Caucasian Albanian"));
1277 scripts.push_back(Script(ScriptId::ahom, "Ahom", "Ahom"));
1278 scripts.push_back(Script(ScriptId::arab, "Arab", "Arabic"));
1279 scripts.push_back(Script(ScriptId::armi, "Armi", "Imperial Aramaic"));
1280 scripts.push_back(Script(ScriptId::armn, "Armn", "Armenian"));
1281 scripts.push_back(Script(ScriptId::avst, "Avst", "Avestan"));
1282 scripts.push_back(Script(ScriptId::bali, "Bali", "Balinese"));
1283 scripts.push_back(Script(ScriptId::bamu, "Bamu", "Bamum"));
1284 scripts.push_back(Script(ScriptId::bass, "Bass", "Bassa Vah"));
1285 scripts.push_back(Script(ScriptId::batk, "Batk", "Batak"));
1286 scripts.push_back(Script(ScriptId::beng, "Beng", "Bengali"));
1287 scripts.push_back(Script(ScriptId::bhks, "Bhks", "Bhaisuki"));
1288 scripts.push_back(Script(ScriptId::bopo, "Bopo", "Bopomofo"));
1289 scripts.push_back(Script(ScriptId::brah, "Brah", "Brahmi"));
1290 scripts.push_back(Script(ScriptId::brai, "Brai", "Braille"));
1291 scripts.push_back(Script(ScriptId::bugi, "Bugi", "Buginese"));
1292 scripts.push_back(Script(ScriptId::buhd, "Buhd", "Buhid"));
1293 scripts.push_back(Script(ScriptId::cakm, "Cakm", "Chakma"));
1294 scripts.push_back(Script(ScriptId::cans, "Cans", "Canadian Aboriginal"));
1295 scripts.push_back(Script(ScriptId::cari, "Cari", "Carian"));
1296 scripts.push_back(Script(ScriptId::cham, "Cham", "Cham"));
1297 scripts.push_back(Script(ScriptId::cher, "Cher", "Cherokee"));
1298 scripts.push_back(Script(ScriptId::copt, "Copt", "Coptic"));
1299 scripts.push_back(Script(ScriptId::cprt, "Cprt", "Cypriot"));
1300 scripts.push_back(Script(ScriptId::cyrl, "Cyrl", "Cyrillic"));
1301 scripts.push_back(Script(ScriptId::deva, "Deva", "Devanagari"));
1302 scripts.push_back(Script(ScriptId::dogr, "Dogr", "Dogra"));
1303 scripts.push_back(Script(ScriptId::dsrt, "Dsrt", "Deseret"));
1304 scripts.push_back(Script(ScriptId::dupl, "Dupl", "Duployan"));
1305 scripts.push_back(Script(ScriptId::egyp, "Egyp", "Egyptian Hieroglyphs"));
1306 scripts.push_back(Script(ScriptId::elba, "Elba", "Elbasan"));
1307 scripts.push_back(Script(ScriptId::elym, "Elym", "Elymaic"));
1308 scripts.push_back(Script(ScriptId::ethi, "Ethi", "Ethiopian"));
1309 scripts.push_back(Script(ScriptId::geor, "Geor", "Georgian"));
1310 scripts.push_back(Script(ScriptId::glag, "Glag", "Glagolitic"));
1311 scripts.push_back(Script(ScriptId::gong, "Gong", "Gunjala Gondi"));
1312 scripts.push_back(Script(ScriptId::gonm, "Gonm", "Masaram Gondi"));
1313 scripts.push_back(Script(ScriptId::goth, "Goth", "Gothic"));
1314 scripts.push_back(Script(ScriptId::gran, "Gran", "Grantha"));
1315 scripts.push_back(Script(ScriptId::grek, "Grek", "Greek"));
1316 scripts.push_back(Script(ScriptId::gujr, "Gujr", "Gujarati"));
1317 scripts.push_back(Script(ScriptId::guru, "Guru", "Gurmukhi"));
1318 scripts.push_back(Script(ScriptId::hang, "Hang", "Hangul"));
1319 scripts.push_back(Script(ScriptId::hani, "Hani", "Han"));
1320 scripts.push_back(Script(ScriptId::hano, "Hano", "Hanunoo"));
1321 scripts.push_back(Script(ScriptId::hatr, "Hatr", "Hatran"));
1322 scripts.push_back(Script(ScriptId::hebr, "Hebr", "Hebrew"));
1323 scripts.push_back(Script(ScriptId::hira, "Hira", "Hiragana"));
1324 scripts.push_back(Script(ScriptId::hluw, "Hluw", "Anatolian Hieroglyphs"));
1325 scripts.push_back(Script(ScriptId::hmng, "Hmng", "Pahawh Hmong"));
1326 scripts.push_back(Script(ScriptId::hmnp, "Hmnp", "Nyiakeng Puachue Hmong"));
1327 scripts.push_back(Script(ScriptId::hrkt, "Hrkt", "Katakana Or Hiragana"));
1328 scripts.push_back(Script(ScriptId::hung, "Hung", "Old Hungarian"));
1329 scripts.push_back(Script(ScriptId::ital, "Ital", "Old Italic"));
1330 scripts.push_back(Script(ScriptId::java, "Java", "Javanese"));
1331 scripts.push_back(Script(ScriptId::kali, "Kali", "Kayah Li"));
1332 scripts.push_back(Script(ScriptId::kana, "Kana", "Katakana"));
1333 scripts.push_back(Script(ScriptId::khar, "Khar", "Kharoshthi"));
1334 scripts.push_back(Script(ScriptId::khmr, "Khmr", "Khmer"));
1335 scripts.push_back(Script(ScriptId::khoj, "Khoj", "Khojki"));
1336 scripts.push_back(Script(ScriptId::knda, "Knda", "Kannada"));
1337 scripts.push_back(Script(ScriptId::kthi, "Kthi", "Kaithi"));
1338 scripts.push_back(Script(ScriptId::lana, "Lana", "Tai Tham"));
1339 scripts.push_back(Script(ScriptId::laoo, "Laoo", "Lao"));
1340 scripts.push_back(Script(ScriptId::latn, "Latn", "Latin"));
1341 scripts.push_back(Script(ScriptId::lepc, "Lepc", "Lepcha"));
1342 scripts.push_back(Script(ScriptId::limb, "Limb", "Limbu"));
1343 scripts.push_back(Script(ScriptId::lina, "Lina", "Linear A"));
1344 scripts.push_back(Script(ScriptId::linb, "Linb", "Linear B"));
1345 scripts.push_back(Script(ScriptId::lisu, "Lisu", "Lisu"));
1346 scripts.push_back(Script(ScriptId::lyci, "Lyci", "Lycian"));
1347 scripts.push_back(Script(ScriptId::lydi, "Lydi", "Lydian"));
1348 scripts.push_back(Script(ScriptId::mahj, "Mahj", "Mahajani"));
1349 scripts.push_back(Script(ScriptId::maka, "Maka", "Makasar"));
1350 scripts.push_back(Script(ScriptId::mand, "Mand", "Mandaic"));
1351 scripts.push_back(Script(ScriptId::mani, "Mani", "Manichaean"));
1352 scripts.push_back(Script(ScriptId::marc, "Marc", "Marchen"));
1353 scripts.push_back(Script(ScriptId::medf, "Medf", "Medefaidrin"));
1354 scripts.push_back(Script(ScriptId::mend, "Mend", "Mende Kikakui"));
1355 scripts.push_back(Script(ScriptId::merc, "Merc", "Meroitic Cursive"));
1356 scripts.push_back(Script(ScriptId::mero, "Mero", "Meroitic Hieroglyphs"));
1357 scripts.push_back(Script(ScriptId::mlym, "Mlym", "Malayalam"));
1358 scripts.push_back(Script(ScriptId::modi, "Modi", "Modi"));
1359 scripts.push_back(Script(ScriptId::mong, "Mong", "Mongolian"));
1360 scripts.push_back(Script(ScriptId::mroo, "Mroo", "Mro"));
1361 scripts.push_back(Script(ScriptId::mtei, "Mtei", "Meetei Mayak"));
1362 scripts.push_back(Script(ScriptId::mult, "Mult", "Multani"));
1363 scripts.push_back(Script(ScriptId::mymr, "Mymr", "Myanmar"));
1364 scripts.push_back(Script(ScriptId::nand, "Nand", "Nandinagari"));
1365 scripts.push_back(Script(ScriptId::narb, "Narb", "Old North Arabian"));
1366 scripts.push_back(Script(ScriptId::nbat, "Nbat", "Nabataean"));
1367 scripts.push_back(Script(ScriptId::newa, "Newa", "Newa"));
1368 scripts.push_back(Script(ScriptId::nkoo, "Nkoo", "Nko"));
1369 scripts.push_back(Script(ScriptId::nshu, "Nshu", "Nushu"));
1370 scripts.push_back(Script(ScriptId::ogam, "Ogam", "Ogham"));
1371 scripts.push_back(Script(ScriptId::olck, "Olck", "Ol Chiki"));
1372 scripts.push_back(Script(ScriptId::orkh, "Orkh", "Old Turkic"));
1373 scripts.push_back(Script(ScriptId::orya, "Orya", "Oriya"));
1374 scripts.push_back(Script(ScriptId::osge, "Osge", "Osage"));
1375 scripts.push_back(Script(ScriptId::osma, "Osma", "Osmanya"));
1376 scripts.push_back(Script(ScriptId::palm, "Palm", "Palmyrene"));
1377 scripts.push_back(Script(ScriptId::pauc, "Pauc", "Pau Cin Hau"));
1378 scripts.push_back(Script(ScriptId::perm, "Perm", "Old Permic"));
1379 scripts.push_back(Script(ScriptId::phag, "Phag", "Phags Pa"));
1380 scripts.push_back(Script(ScriptId::phli, "Phli", "Inscriptional Pahlavi"));
1381 scripts.push_back(Script(ScriptId::phlp, "Phlp", "Psalter Pahlavi"));
1382 scripts.push_back(Script(ScriptId::phnx, "Phnx", "Phoenician"));
1383 scripts.push_back(Script(ScriptId::plrd, "Plrd", "Miao"));
1384 scripts.push_back(Script(ScriptId::prti, "Prti", "Inscriptional Parthian"));
1385 scripts.push_back(Script(ScriptId::rjng, "Rjng", "Rejang"));
1386 scripts.push_back(Script(ScriptId::rohg, "Rohg", "Hanifi Rohingya"));
1387 scripts.push_back(Script(ScriptId::runr, "Runr", "Runic"));
1388 scripts.push_back(Script(ScriptId::samr, "Samr", "Samaritan"));
1389 scripts.push_back(Script(ScriptId::sarb, "Sarb", "Old South Arabian"));
1390 scripts.push_back(Script(ScriptId::saur, "Saur", "Saurashtra"));
1391 scripts.push_back(Script(ScriptId::sgnw, "Sgnw", "SignWriting"));
1392 scripts.push_back(Script(ScriptId::shaw, "Shaw", "Shawian"));
1393 scripts.push_back(Script(ScriptId::shrd, "Shrd", "Sharada"));
1394 scripts.push_back(Script(ScriptId::sidd, "Sidd", "Shiddham"));
1395 scripts.push_back(Script(ScriptId::sind, "Sind", "Khudawadi"));
1396 scripts.push_back(Script(ScriptId::sinh, "Sinh", "Sinhala"));
1397 scripts.push_back(Script(ScriptId::sogd, "Sogd", "Sogdian"));
1398 scripts.push_back(Script(ScriptId::sogo, "Sogo", "Old Sogdian"));
1399 scripts.push_back(Script(ScriptId::sora, "Sora", "Sora Sompeng"));
1400 scripts.push_back(Script(ScriptId::soyo, "Soyo", "Soyombo"));
1401 scripts.push_back(Script(ScriptId::sund, "Sund", "Sundanese"));
1402 scripts.push_back(Script(ScriptId::sylo, "Sylo", "Syloti Nagri"));
1403 scripts.push_back(Script(ScriptId::syrc, "Syrc", "Syriac"));
1404 scripts.push_back(Script(ScriptId::tagb, "Tagb", "Tagbanwa"));
1405 scripts.push_back(Script(ScriptId::takr, "Takr", "Takri"));
1406 scripts.push_back(Script(ScriptId::tale, "Tale", "Tai Le"));
1407 scripts.push_back(Script(ScriptId::talu, "Talu", "New Tai Lue"));
1408 scripts.push_back(Script(ScriptId::taml, "Taml", "Tamil"));
1409 scripts.push_back(Script(ScriptId::tang, "Tang", "Tangut"));
1410 scripts.push_back(Script(ScriptId::tavt, "Tavt", "Tai Viet"));
1411 scripts.push_back(Script(ScriptId::telu, "Telu", "Telugu"));
1412 scripts.push_back(Script(ScriptId::tfng, "Tfng", "Tifinag"));
1413 scripts.push_back(Script(ScriptId::tglg, "Tglg", "Tagalog"));
1414 scripts.push_back(Script(ScriptId::thaa, "Thaa", "Thaana"));
1415 scripts.push_back(Script(ScriptId::thai, "Thai", "Thai"));
1416 scripts.push_back(Script(ScriptId::tibt, "Tibt", "Tibetan"));
1417 scripts.push_back(Script(ScriptId::tirh, "Tirh", "Tirhuta"));
1418 scripts.push_back(Script(ScriptId::ugar, "Ugar", "Ugaritic"));
1419 scripts.push_back(Script(ScriptId::vaii, "Vaii", "Vai"));
1420 scripts.push_back(Script(ScriptId::wara, "Wara", "Warang Citi"));
1421 scripts.push_back(Script(ScriptId::wcho, "Wcho", "Wcho"));
1422 scripts.push_back(Script(ScriptId::xpeo, "Xpeo", "Old Persian"));
1423 scripts.push_back(Script(ScriptId::xsux, "Xsux", "Cuneiform"));
1424 scripts.push_back(Script(ScriptId::yiii, "Yiii", "Yi"));
1425 scripts.push_back(Script(ScriptId::zanb, "Zanb", "Zanabazar Square"));
1426 scripts.push_back(Script(ScriptId::zinh, "Zinh", "Inherited"));
1427 scripts.push_back(Script(ScriptId::zyyy, "Zyyy", "Common"));
1428 scripts.push_back(Script(ScriptId::zzzz, "Zzzz", "Unknown"));
1429
1430 for (const Script& script : scripts)
1431 {
1432 scriptIdMap[script.Id()] = &script;
1433 shortNameMap[MakeCanonicalPropertyName(script.ShortName())] = &script;
1434 longNameMap[MakeCanonicalPropertyName(script.LongName())] = &script;
1435 }
1436 }
1437
1438 const Script& ScriptTable::GetScript(ScriptId id) const
1439 {
1440 auto it = scriptIdMap.find(id);
1441 if (it != scriptIdMap.cend())
1442 {
1443 return *it->second;
1444 }
1445 else
1446 {
1447 throw UnicodeException("script id " + std::to_string(static_cast<int>(id)) + " not found");
1448 }
1449 }
1450
1451 const Script& ScriptTable::GetScriptByShortName(const std::string& shortName) const
1452 {
1453 auto it = shortNameMap.find(MakeCanonicalPropertyName(shortName));
1454 if (it != shortNameMap.cend())
1455 {
1456 return *it->second;
1457 }
1458 else
1459 {
1460 throw UnicodeException("script '" + shortName + "' not found");
1461 }
1462 }
1463
1464 const Script& ScriptTable::GetScriptByLongName(const std::string& longName) const
1465 {
1466 auto it = longNameMap.find(MakeCanonicalPropertyName(longName));
1467 if (it != longNameMap.cend())
1468 {
1469 return *it->second;
1470 }
1471 else
1472 {
1473 throw UnicodeException("script '" + longName + "' not found");
1474 }
1475 }
1476
1477 CharacterInfo::CharacterInfo() :
1478 binaryProperties(0), generalCategory(GeneralCategoryId::none), upper(0), lower(0), title(0), folding(0), block(BlockId::none), age(AgeId::age_unassigned), script(ScriptId::none)
1479 {
1480 }
1481
1482 void CharacterInfo::Write(BinaryWriter& writer)
1483 {
1484 writer.Write(binaryProperties);
1485 writer.Write(static_cast<uint32_t>(generalCategory));
1486 writer.Write(upper);
1487 writer.Write(lower);
1488 writer.Write(title);
1489 writer.Write(folding);
1490 writer.Write(static_cast<uint16_t>(block));
1491 writer.Write(static_cast<uint8_t>(age));
1492 writer.Write(static_cast<uint8_t>(script));
1493 }
1494
1495 void CharacterInfo::Read(BinaryReader& reader)
1496 {
1497 binaryProperties = reader.ReadULong();
1498 generalCategory = static_cast<GeneralCategoryId>(reader.ReadUInt());
1499 upper = reader.ReadUChar();
1500 lower = reader.ReadUChar();
1501 title = reader.ReadUChar();
1502 folding = reader.ReadUChar();
1503 block = static_cast<BlockId>(reader.ReadUShort());
1504 age = static_cast<AgeId>(reader.ReadByte());
1505 script = static_cast<ScriptId>(reader.ReadByte());
1506 }
1507
1508 NumericType::NumericType(NumericTypeId id_, const std::string& shortName_, const std::string& longName_) : id(id_), shortName(shortName_), longName(longName_)
1509 {
1510 }
1511
1512 void NumericTypeTable::Init()
1513 {
1514 instance.reset(new NumericTypeTable());
1515 }
1516
1517 void NumericTypeTable::Done()
1518 {
1519 instance.reset();
1520 }
1521
// Owning pointer to the NumericTypeTable singleton; set by Init() and cleared by Done().
std::unique_ptr<NumericTypeTable> NumericTypeTable::instance;
1523
1524 NumericTypeTable::NumericTypeTable()
1525 {
1526 numericTypes.push_back(NumericType(NumericTypeId::none, "None", "None"));
1527 numericTypes.push_back(NumericType(NumericTypeId::de, "De", "Decimal"));
1528 numericTypes.push_back(NumericType(NumericTypeId::di, "Di", "Digit"));
1529 numericTypes.push_back(NumericType(NumericTypeId::nu, "Nu", "Numeric"));
1530 for (const NumericType& numericType : numericTypes)
1531 {
1532 numericTypeMap[numericType.Id()] = &numericType;
1533 shortNameMap[MakeCanonicalPropertyName(numericType.ShortName())] = &numericType;
1534 longNameMap[MakeCanonicalPropertyName(numericType.LongName())] = &numericType;;
1535 }
1536 }
1537
1538 const NumericType& NumericTypeTable::GetNumericType(NumericTypeId id) const
1539 {
1540 auto it = numericTypeMap.find(id);
1541 if (it != numericTypeMap.cend())
1542 {
1543 return *it->second;
1544 }
1545 else
1546 {
1547 throw UnicodeException("numeric type " + std::to_string(static_cast<int>(id)) + " not found");
1548 }
1549 }
1550
1551 const NumericType& NumericTypeTable::GetNumericTypeByShortName(const std::string& shortName) const
1552 {
1553 auto it = shortNameMap.find(MakeCanonicalPropertyName(shortName));
1554 if (it != shortNameMap.cend())
1555 {
1556 return *it->second;
1557 }
1558 else
1559 {
1560 throw UnicodeException("numeric type '" + shortName + "' not found");
1561 }
1562 }
1563
1564 const NumericType& NumericTypeTable::GetNumericTypeByLongName(const std::string& longName) const
1565 {
1566 auto it = longNameMap.find(MakeCanonicalPropertyName(longName));
1567 if (it != longNameMap.cend())
1568 {
1569 return *it->second;
1570 }
1571 else
1572 {
1573 throw UnicodeException("numeric type '" + longName + "' not found");
1574 }
1575 }
1576
1577 BidiClass::BidiClass(BidiClassId id_, const std::string& shortName_, const std::string& longName_) : id(id_), shortName(shortName_), longName(longName_)
1578 {
1579 }
1580
1581 void BidiClassTable::Init()
1582 {
1583 instance.reset(new BidiClassTable());
1584 }
1585
1586 void BidiClassTable::Done()
1587 {
1588 instance.reset();
1589 }
1590
// Owning pointer to the BidiClassTable singleton; set by Init() and cleared by Done().
std::unique_ptr<BidiClassTable> BidiClassTable::instance;
1592
1593 BidiClassTable::BidiClassTable()
1594 {
1595 bidiClasses.push_back(BidiClass(BidiClassId::al, "AL", "Arabic Letter"));
1596 bidiClasses.push_back(BidiClass(BidiClassId::an, "AN", "Arabic Number"));
1597 bidiClasses.push_back(BidiClass(BidiClassId::b, "B", "Paragraph Separator"));
1598 bidiClasses.push_back(BidiClass(BidiClassId::bn, "BN", "Boundary Neutral"));
1599 bidiClasses.push_back(BidiClass(BidiClassId::cs, "CS", "Common Separator"));
1600 bidiClasses.push_back(BidiClass(BidiClassId::en, "EN", "European Number"));
1601 bidiClasses.push_back(BidiClass(BidiClassId::es, "ES", "European Separator"));
1602 bidiClasses.push_back(BidiClass(BidiClassId::et, "ET", "European Terminator"));
1603 bidiClasses.push_back(BidiClass(BidiClassId::fsi, "FSI", "First Strong Isolate"));
1604 bidiClasses.push_back(BidiClass(BidiClassId::l, "L", "Left To Right"));
1605 bidiClasses.push_back(BidiClass(BidiClassId::lre, "LRE", "Left To Right Embedding"));
1606 bidiClasses.push_back(BidiClass(BidiClassId::lri, "LRI", "Left To Right Isolate"));
1607 bidiClasses.push_back(BidiClass(BidiClassId::lro, "LRO", "Left To Right Override"));
1608 bidiClasses.push_back(BidiClass(BidiClassId::nsm, "NSM", "Nonspacing Mark"));
1609 bidiClasses.push_back(BidiClass(BidiClassId::on, "ON", "Other Neutral"));
1610 bidiClasses.push_back(BidiClass(BidiClassId::pdf, "PDF", "Pop Directional Format"));
1611 bidiClasses.push_back(BidiClass(BidiClassId::pdi, "PDI", "Pop Directional Isolate"));
1612 bidiClasses.push_back(BidiClass(BidiClassId::r, "R", "Right To Left"));
1613 bidiClasses.push_back(BidiClass(BidiClassId::rle, "RLE", "Right To Left Embedding"));
1614 bidiClasses.push_back(BidiClass(BidiClassId::rli, "RLI", "Right To Left Isolate"));
1615 bidiClasses.push_back(BidiClass(BidiClassId::rlo, "RLO", "Right To Left Override"));
1616 bidiClasses.push_back(BidiClass(BidiClassId::s, "S", "Segment Separator"));
1617 bidiClasses.push_back(BidiClass(BidiClassId::ws, "WS", "White Space"));
1618
1619 for (const BidiClass& bidiClass : bidiClasses)
1620 {
1621 bidiClassMap[bidiClass.Id()] = &bidiClass;
1622 shortNameMap[MakeCanonicalPropertyName(bidiClass.ShortName())] = &bidiClass;
1623 longNameMap[MakeCanonicalPropertyName(bidiClass.LongName())] = &bidiClass;
1624 }
1625 }
1626
1627 const BidiClass& BidiClassTable::GetBidiClass(BidiClassId id) const
1628 {
1629 auto it = bidiClassMap.find(id);
1630 if (it != bidiClassMap.cend())
1631 {
1632 return *it->second;
1633 }
1634 else
1635 {
1636 throw UnicodeException("bidi class " + std::to_string(static_cast<int>(id)) + " not found");
1637 }
1638 }
1639
1640 const BidiClass& BidiClassTable::GetBidiClassByShortName(const std::string& shortName) const
1641 {
1642 auto it = shortNameMap.find(MakeCanonicalPropertyName(shortName));
1643 if (it != shortNameMap.cend())
1644 {
1645 return *it->second;
1646 }
1647 else
1648 {
1649 throw UnicodeException("bidi class '" + shortName + "' not found");
1650 }
1651 }
1652
1653 const BidiClass& BidiClassTable::GetBidiClassByLongName(const std::string& longName) const
1654 {
1655 auto it = longNameMap.find(MakeCanonicalPropertyName(longName));
1656 if (it != longNameMap.cend())
1657 {
1658 return *it->second;
1659 }
1660 else
1661 {
1662 throw UnicodeException("bidi class '" + longName + "' not found");
1663 }
1664 }
1665
1666 BidiPairedBracketType::BidiPairedBracketType(BidiPairedBracketTypeId id_, const std::string& shortName_, const std::string& longName_) : id(id_), shortName(shortName_), longName(longName_)
1667 {
1668 }
1669
1670 void BidiPairedBracketTypeTable::Init()
1671 {
1672 instance.reset(new BidiPairedBracketTypeTable());
1673 }
1674
1675 void BidiPairedBracketTypeTable::Done()
1676 {
1677 instance.reset();
1678 }
1679
// Owning pointer to the BidiPairedBracketTypeTable singleton; set by Init() and cleared by Done().
std::unique_ptr<BidiPairedBracketTypeTable> BidiPairedBracketTypeTable::instance;
1681
1682 BidiPairedBracketTypeTable::BidiPairedBracketTypeTable()
1683 {
1684 bidiPairedBracketTypes.push_back(BidiPairedBracketType(BidiPairedBracketTypeId::o, "O", "Open"));
1685 bidiPairedBracketTypes.push_back(BidiPairedBracketType(BidiPairedBracketTypeId::c, "C", "Close"));
1686 bidiPairedBracketTypes.push_back(BidiPairedBracketType(BidiPairedBracketTypeId::none, "N", "None"));
1687 for (const BidiPairedBracketType& type : bidiPairedBracketTypes)
1688 {
1689 typeMap[type.Id()] = &type;
1690 shortNameMap[MakeCanonicalPropertyName(type.ShortName())] = &type;
1691 longNameMap[MakeCanonicalPropertyName(type.LongName())] = &type;
1692 }
1693 }
1694
1695 const BidiPairedBracketType& BidiPairedBracketTypeTable::GetBidiPairedBracketType(BidiPairedBracketTypeId id) const
1696 {
1697 auto it = typeMap.find(id);
1698 if (it != typeMap.cend())
1699 {
1700 return *it->second;
1701 }
1702 else
1703 {
1704 throw UnicodeException("Bidi paired bracket type " + std::to_string(static_cast<int>(id)) + " not found");
1705 }
1706 }
1707
1708 const BidiPairedBracketType& BidiPairedBracketTypeTable::GetBidiPairedBracketTypeByShortName(const std::string& shortName) const
1709 {
1710 auto it = shortNameMap.find(MakeCanonicalPropertyName(shortName));
1711 if (it != shortNameMap.cend())
1712 {
1713 return *it->second;
1714 }
1715 else
1716 {
1717 throw UnicodeException("Bidi paired bracket type '" + shortName + "' not found");
1718 }
1719 }
1720
1721 const BidiPairedBracketType& BidiPairedBracketTypeTable::GetBidiPairedBracketTypeByLongName(const std::string& longName) const
1722 {
1723 auto it = longNameMap.find(MakeCanonicalPropertyName(longName));
1724 if (it != longNameMap.cend())
1725 {
1726 return *it->second;
1727 }
1728 else
1729 {
1730 throw UnicodeException("Bidi paired bracket type '" + longName + "' not found");
1731 }
1732 }
1733
1734 AliasType::AliasType(AliasTypeId id_, const std::string& name_) : id(id_), name(name_)
1735 {
1736 }
1737
1738 void AliasTypeTable::Init()
1739 {
1740 instance.reset(new AliasTypeTable());
1741 }
1742
1743 void AliasTypeTable::Done()
1744 {
1745 instance.reset();
1746 }
1747
// Owning pointer to the AliasTypeTable singleton; set by Init() and cleared by Done().
std::unique_ptr<AliasTypeTable> AliasTypeTable::instance;
1749
1750 AliasTypeTable::AliasTypeTable()
1751 {
1752 aliasTypes.push_back(AliasType(AliasTypeId::abbreviation, "abbreviation"));
1753 aliasTypes.push_back(AliasType(AliasTypeId::alternate, "alternate"));
1754 aliasTypes.push_back(AliasType(AliasTypeId::control, "control"));
1755 aliasTypes.push_back(AliasType(AliasTypeId::correction, "correction"));
1756 aliasTypes.push_back(AliasType(AliasTypeId::figment, "figment"));
1757 for (const AliasType& aliasType : aliasTypes)
1758 {
1759 aliasTypeMap[aliasType.Id()] = &aliasType;
1760 typeNameMap[aliasType.Name()] = &aliasType;
1761 }
1762 }
1763
1764 const AliasType& AliasTypeTable::GetAliasType(AliasTypeId id) const
1765 {
1766 auto it = aliasTypeMap.find(id);
1767 if (it != aliasTypeMap.cend())
1768 {
1769 return *it->second;
1770 }
1771 else
1772 {
1773 throw UnicodeException("alias type " + std::to_string(static_cast<int>(id)) + " not found");
1774 }
1775 }
1776
1777 const AliasType& AliasTypeTable::GetAliasType(const std::string& typeName) const
1778 {
1779 auto it = typeNameMap.find(MakeCanonicalPropertyName(typeName));
1780 if (it != typeNameMap.cend())
1781 {
1782 return *it->second;
1783 }
1784 else
1785 {
1786 throw UnicodeException("alias type '" + typeName + "' not found");
1787 }
1788 }
1789
1790 Alias::Alias() : typeId(AliasTypeId::none), name()
1791 {
1792 }
1793
1794 Alias::Alias(AliasTypeId typeId_, const std::string& name_) : typeId(typeId_), name(name_)
1795 {
1796 }
1797
1798 void Alias::Write(BinaryWriter& writer)
1799 {
1800 writer.Write(static_cast<uint8_t>(typeId));
1801 writer.Write(name);
1802 }
1803
1804 void Alias::Read(BinaryReader& reader)
1805 {
1806 typeId = static_cast<AliasTypeId>(reader.ReadByte());
1807 name = reader.ReadUtf8String();
1808 }
1809
1810 ExtendedCharacterInfo::ExtendedCharacterInfo() : characterName(), unicode1Name(), canonicalCombiningClass(0), fullUpper(), fullLower(), fullTitle(), fullFolding(), bidiClass(BidiClassId::none),
1811 numericType(NumericTypeId::none), numericValue(), bidiPairedBracketType(BidiPairedBracketTypeId::none), bidiMirroringGlyph(0), bidiPairedBracket(0)
1812 {
1813 }
1814
1815 void ExtendedCharacterInfo::SetCharacterName(const std::string& characterName_)
1816 {
1817 characterName = characterName_;
1818 }
1819
1820 void ExtendedCharacterInfo::SetUnicode1Name(const std::string& unicode1Name_)
1821 {
1822 unicode1Name = unicode1Name_;
1823 }
1824
1825 void ExtendedCharacterInfo::Write(BinaryWriter& writer)
1826 {
1827 writer.Write(characterName);
1828 writer.Write(unicode1Name);
1829 writer.Write(static_cast<uint8_t>(canonicalCombiningClass));
1830 uint8_t nu = static_cast<uint8_t>(fullUpper.length());
1831 writer.Write(nu);
1832 for (uint8_t i = 0; i < nu; ++i)
1833 {
1834 writer.Write(fullUpper[i]);
1835 }
1836 uint8_t nl = static_cast<uint8_t>(fullLower.length());
1837 writer.Write(nl);
1838 for (uint8_t i = 0; i < nl; ++i)
1839 {
1840 writer.Write(fullLower[i]);
1841 }
1842 uint8_t nt = static_cast<uint8_t>(fullTitle.length());
1843 writer.Write(nt);
1844 for (uint8_t i = 0; i < nt; ++i)
1845 {
1846 writer.Write(fullTitle[i]);
1847 }
1848 uint8_t nf = static_cast<uint8_t>(fullFolding.length());
1849 writer.Write(nf);
1850 for (uint8_t i = 0; i < nf; ++i)
1851 {
1852 writer.Write(fullFolding[i]);
1853 }
1854 writer.Write(static_cast<uint8_t>(bidiClass));
1855 writer.Write(static_cast<uint8_t>(numericType));
1856 writer.Write(numericValue);
1857 uint8_t na = static_cast<uint8_t>(aliases.size());
1858 writer.Write(na);
1859 for (uint8_t i = 0; i < na; ++i)
1860 {
1861 aliases[i].Write(writer);
1862 }
1863 writer.Write(bidiMirroringGlyph);
1864 writer.Write(static_cast<uint8_t>(bidiPairedBracketType));
1865 writer.Write(bidiPairedBracket);
1866 }
1867
1868 void ExtendedCharacterInfo::Read(BinaryReader& reader)
1869 {
1870 characterName = reader.ReadUtf8String();
1871 unicode1Name = reader.ReadUtf8String();
1872 canonicalCombiningClass = reader.ReadByte();
1873 uint8_t nu = reader.ReadByte();
1874 for (uint8_t i = 0; i < nu; ++i)
1875 {
1876 fullUpper.append(1, reader.ReadUChar());
1877 }
1878 uint8_t nl = reader.ReadByte();
1879 for (uint8_t i = 0; i < nl; ++i)
1880 {
1881 fullLower.append(1, reader.ReadUChar());
1882 }
1883 uint8_t nt = reader.ReadByte();
1884 for (uint8_t i = 0; i < nt; ++i)
1885 {
1886 fullTitle.append(1, reader.ReadUChar());
1887 }
1888 uint8_t nf = reader.ReadByte();
1889 for (uint8_t i = 0; i < nf; ++i)
1890 {
1891 fullFolding.append(1, reader.ReadUChar());
1892 }
1893 bidiClass = static_cast<BidiClassId>(reader.ReadByte());
1894 numericType = static_cast<NumericTypeId>(reader.ReadByte());
1895 numericValue = reader.ReadUtf8String();
1896 uint8_t na = reader.ReadByte();
1897 for (uint8_t i = 0; i < na; ++i)
1898 {
1899 Alias alias;
1900 alias.Read(reader);
1901 aliases.push_back(alias);
1902 }
1903 bidiMirroringGlyph = reader.ReadUChar();
1904 bidiPairedBracketType = static_cast<BidiPairedBracketTypeId>(reader.ReadByte());
1905 bidiPairedBracket = reader.ReadUChar();
1906 }
1907
1908 CharacterInfoPage::CharacterInfoPage()
1909 {
1910 characterInfos.resize(numInfosInPage);
1911 }
1912
1913 CharacterInfo& CharacterInfoPage::GetCharacterInfo(int index)
1914 {
1915 if (index < 0 || index > characterInfos.size())
1916 {
1917 throw UnicodeException("invalid character info index");
1918 }
1919 return characterInfos[index];
1920 }
1921
1922 const CharacterInfo& CharacterInfoPage::GetCharacterInfo(int index) const
1923 {
1924 if (index < 0 || index > characterInfos.size())
1925 {
1926 throw UnicodeException("invalid character info index");
1927 }
1928 return characterInfos[index];
1929 }
1930
1931 void CharacterInfoPage::Write(BinaryWriter& writer)
1932 {
1933 for (int i = 0; i < characterInfos.size(); ++i)
1934 {
1935 CharacterInfo& info = characterInfos[i];
1936 info.Write(writer);
1937 }
1938 }
1939
1940 void CharacterInfoPage::Read(BinaryReader& reader)
1941 {
1942 for (int i = 0; i < characterInfos.size(); ++i)
1943 {
1944 CharacterInfo& info = characterInfos[i];
1945 info.Read(reader);
1946 }
1947 }
1948
1949 ExtendedCharacterInfoPage::ExtendedCharacterInfoPage()
1950 {
1951 extendedCharacterInfos.resize(numInfosInPage);
1952 }
1953
1954 const ExtendedCharacterInfo& ExtendedCharacterInfoPage::GetExtendedCharacterInfo(int index) const
1955 {
1956 if (index < 0 || index > extendedCharacterInfos.size())
1957 {
1958 throw UnicodeException("invalid extended character info index");
1959 }
1960 return extendedCharacterInfos[index];
1961 }
1962
1963 ExtendedCharacterInfo& ExtendedCharacterInfoPage::GetExtendedCharacterInfo(int index)
1964 {
1965 if (index < 0 || index > extendedCharacterInfos.size())
1966 {
1967 throw UnicodeException("invalid extended character info index");
1968 }
1969 return extendedCharacterInfos[index];
1970 }
1971
1972 void ExtendedCharacterInfoPage::Write(BinaryWriter& writer)
1973 {
1974 int n = extendedCharacterInfos.size();
1975 for (int i = 0; i < n; ++i)
1976 {
1977 extendedCharacterInfos[i].Write(writer);
1978 }
1979 }
1980
1981 void ExtendedCharacterInfoPage::Read(BinaryReader& reader)
1982 {
1983 int n = extendedCharacterInfos.size();
1984 for (int i = 0; i < n; ++i)
1985 {
1986 extendedCharacterInfos[i].Read(reader);
1987 }
1988 }
1989
1990 void ExtendedCharacterInfoHeader::AllocatePages(int numExtendedPages)
1991 {
1992 extendedPageStarts.resize(numExtendedPages);
1993 }
1994
1995 void ExtendedCharacterInfoHeader::Write(BinaryWriter& writer)
1996 {
1997 uint32_t n = extendedPageStarts.size();
1998 writer.Write(n);
1999 for (uint32_t i = 0; i < n; ++i)
2000 {
2001 writer.Write(extendedPageStarts[i]);
2002 }
2003 }
2004
2005 void ExtendedCharacterInfoHeader::Read(BinaryReader& reader)
2006 {
2007 uint32_t n = reader.ReadUInt();
2008 for (uint32_t i = 0; i < n; ++i)
2009 {
2010 uint32_t start = reader.ReadUInt();
2011 extendedPageStarts.push_back(start);
2012 }
2013 }
2014
2015 uint32_t ExtendedCharacterInfoHeader::GetPageStart(int pageIndex) const
2016 {
2017 if (pageIndex < 0 || pageIndex >= extendedPageStarts.size())
2018 {
2019 throw UnicodeException("invalid extended page index" + std::to_string(pageIndex));
2020 }
2021 return extendedPageStarts[pageIndex];
2022 }
2023
2024 void ExtendedCharacterInfoHeader::SetPageStart(int pageIndex, uint32_t extendedPageStart)
2025 {
2026 if (pageIndex < 0 || pageIndex >= extendedPageStarts.size())
2027 {
2028 throw UnicodeException("invalid extended page index" + std::to_string(pageIndex));
2029 }
2030 extendedPageStarts[pageIndex] = extendedPageStart;
2031 }
2032
2033 void CharacterTable::Init()
2034 {
2035 instance.reset(new CharacterTable());
2036 }
2037
2038 void CharacterTable::Done()
2039 {
2040 instance.reset();
2041 }
2042
// Owning pointer to the CharacterTable singleton; set by Init() and cleared by Done().
std::unique_ptr<CharacterTable> CharacterTable::instance;
2044
// Eight-byte signature written at the start of the UCD binary file by
// CharacterTable::WriteHeader: the ASCII letters "CMAJUCD" followed by
// current_cmajor_ucd_version. CharacterTable::ReadHeader checks the first
// seven bytes as the magic and the eighth as the file format version.
const uint8_t headerMagic[8] =
{
    static_cast<uint8_t>('C'), static_cast<uint8_t>('M'), static_cast<uint8_t>('A'), static_cast<uint8_t>('J'),
    static_cast<uint8_t>('U'), static_cast<uint8_t>('C'), static_cast<uint8_t>('D'), current_cmajor_ucd_version
};
2050
2051 std::string CmajorRoot()
2052 {
2053 std::string cmajorRoot;
2054 const char* cmajorRootEnv = getenv("CMAJOR_ROOT");
2055 if (cmajorRootEnv)
2056 {
2057 cmajorRoot = cmajorRootEnv;
2058 }
2059 if (cmajorRoot.empty())
2060 {
2061 throw UnicodeException("please set 'CMAJOR_ROOT' environment variable to contain /path/to/cmajor-" + CmajorVersionStr() + " directory.");
2062 }
2063 return cmajorRoot;
2064 }
2065
2066 std::string CmajorUcdFilePath()
2067 {
2068 return (boost::filesystem::path(CmajorRoot()) / boost::filesystem::path("unicode") / boost::filesystem::path("cmajor_ucd.bin")).generic_string();
2069 }
2070
2071 CharacterTable::CharacterTable() : headerRead(false), extendedHeaderStart(0), extendedHeaderEnd(0), extendedHeaderRead(false)
2072 {
2073 }
2074
2075 void CharacterTable::Write()
2076 {
2077 std::string ucdFilePath = CmajorUcdFilePath();
2078 BinaryWriter writer(ucdFilePath);
2079 WriteHeader(writer);
2080 writer.Seek(headerSize);
2081 int n = pages.size();
2082 for (int i = 0; i < n; ++i)
2083 {
2084 CharacterInfoPage* page = pages[i].get();
2085 page->Write(writer);
2086 }
2087 extendedHeaderStart = writer.Pos();
2088 int nx = extendedPages.size();
2089 extendedHeader.AllocatePages(nx);
2090 extendedHeader.Write(writer);
2091 extendedHeaderEnd = writer.Pos();
2092 for (int i = 0; i < nx; ++i)
2093 {
2094 extendedHeader.SetPageStart(i, writer.Pos());
2095 ExtendedCharacterInfoPage* extendedPage = extendedPages[i].get();
2096 extendedPage->Write(writer);
2097 }
2098 writer.Seek(extendedHeaderStart);
2099 extendedHeader.Write(writer);
2100 writer.Seek(0);
2101 WriteHeader(writer);
2102 }
2103
2104 void CharacterTable::WriteHeader(BinaryWriter& writer)
2105 {
2106 for (int i = 0; i < 8; ++i)
2107 {
2108 writer.Write(headerMagic[i]);
2109 }
2110 writer.Write(uint32_t(extendedHeaderStart));
2111 writer.Write(uint32_t(extendedHeaderEnd));
2112 }
2113
2114 void CharacterTable::ReadHeader(BinaryReader& reader)
2115 {
2116 headerRead = true;
2117 uint8_t magic[8];
2118 for (int i = 0; i < 8; ++i)
2119 {
2120 magic[i] = reader.ReadByte();
2121 }
2122 for (int i = 0; i < 7; ++i)
2123 {
2124 if (magic[i] != headerMagic[i])
2125 {
2126 throw UnicodeException("invalid cmajor_ucd.bin header magic: 'CMAJUCD' expected");
2127 }
2128 }
2129 if (magic[7] != headerMagic[7])
2130 {
2131 throw UnicodeException("invalid cmajor_ucd.bin version: version " + std::string(1, headerMagic[7]) + " expected, version " + std::string(1, magic[7]) + " read");
2132 }
2133 extendedHeaderStart = reader.ReadUInt();
2134 extendedHeaderEnd = reader.ReadUInt();
2135 reader.Skip(headerSize - 16);
2136 }
2137
2138 void CharacterTable::ReadExtendedHeader(BinaryReader& reader)
2139 {
2140 extendedHeaderRead = true;
2141 extendedHeader.Read(reader);
2142 }
2143
2144 std::mutex mtx;
2145
2146 const CharacterInfo& CharacterTable::GetCharacterInfo(char32_t codePoint)
2147 {
2148 if (codePoint > 0x10FFFF)
2149 {
2150 throw UnicodeException("invalid Unicode code point " + std::to_string(codePoint));
2151 }
2152 int pageIndex = codePoint / numInfosInPage;
2153 if (pages.size() <= pageIndex)
2154 {
2155 std::lock_guard<std::mutex> lock(mtx);
2156 while (pages.size() <= pageIndex)
2157 {
2158 pages.push_back(std::unique_ptr<CharacterInfoPage>());
2159 }
2160 }
2161 CharacterInfoPage* page = pages[pageIndex].get();
2162 if (!page)
2163 {
2164 std::lock_guard<std::mutex> lock(mtx);
2165 if (!page)
2166 {
2167 std::string ucdFilePath = CmajorUcdFilePath();
2168 BinaryReader reader(ucdFilePath);
2169 uint32_t pageStart = 0;
2170 if (!headerRead)
2171 {
2172 ReadHeader(reader);
2173 pageStart = characterInfoPageSize * pageIndex;
2174 }
2175 else
2176 {
2177 pageStart = headerSize + characterInfoPageSize * pageIndex;
2178 }
2179 reader.Skip(pageStart);
2180 page = new CharacterInfoPage();
2181 page->Read(reader);
2182 pages[pageIndex] = std::move(std::unique_ptr<CharacterInfoPage>(page));
2183 }
2184 }
2185 int infoIndex = codePoint % numInfosInPage;
2186 return page->GetCharacterInfo(infoIndex);
2187 }
2188
2189 CharacterInfo& CharacterTable::CreateCharacterInfo(char32_t codePoint)
2190 {
2191 if (codePoint > 0x10FFFF)
2192 {
2193 throw UnicodeException("invalid Unicode code point " + std::to_string(codePoint));
2194 }
2195 int pageIndex = codePoint / numInfosInPage;
2196 while (pages.size() <= pageIndex)
2197 {
2198 pages.push_back(std::unique_ptr<CharacterInfoPage>(new CharacterInfoPage()));
2199 }
2200 int infoIndex = codePoint % numInfosInPage;
2201 CharacterInfoPage* page = pages[pageIndex].get();
2202 return page->GetCharacterInfo(infoIndex);
2203 }
2204
2205 const ExtendedCharacterInfo& CharacterTable::GetExtendedCharacterInfo(char32_t codePoint)
2206 {
2207 if (codePoint > 0x10FFFF)
2208 {
2209 throw UnicodeException("invalid Unicode code point " + std::to_string(codePoint));
2210 }
2211 int pageIndex = codePoint / numInfosInPage;
2212 if (extendedPages.size() <= pageIndex)
2213 {
2214 std::lock_guard<std::mutex> lock(mtx);
2215 while (extendedPages.size() <= pageIndex)
2216 {
2217 extendedPages.push_back(std::unique_ptr<ExtendedCharacterInfoPage>());
2218 }
2219 }
2220 ExtendedCharacterInfoPage* extendedPage = extendedPages[pageIndex].get();
2221 if (!extendedPage)
2222 {
2223 std::lock_guard<std::mutex> lock(mtx);
2224 if (!extendedPage)
2225 {
2226 std::string ucdFilePath = CmajorUcdFilePath();
2227 BinaryReader reader(ucdFilePath);
2228 uint32_t start = 0;
2229 uint32_t pageStart = 0;
2230 if (!headerRead)
2231 {
2232 ReadHeader(reader);
2233 start = extendedHeaderStart - headerSize;
2234 }
2235 else
2236 {
2237 start = extendedHeaderStart;
2238 }
2239 if (!extendedHeaderRead)
2240 {
2241 reader.Skip(start);
2242 ReadExtendedHeader(reader);
2243 pageStart = extendedHeader.GetPageStart(pageIndex) - extendedHeaderEnd;
2244 }
2245 else
2246 {
2247 pageStart = extendedHeader.GetPageStart(pageIndex);
2248 }
2249 reader.Skip(pageStart);
2250 extendedPage = new ExtendedCharacterInfoPage();
2251 extendedPage->Read(reader);
2252 extendedPages[pageIndex] = std::move(std::unique_ptr<ExtendedCharacterInfoPage>(extendedPage));
2253 }
2254 }
2255 int infoIndex = codePoint % numInfosInPage;
2256 return extendedPage->GetExtendedCharacterInfo(infoIndex);
2257 }
2258
2259 ExtendedCharacterInfo& CharacterTable::CreateExtendedCharacterInfo(char32_t codePoint)
2260 {
2261 if (codePoint > 0x10FFFF)
2262 {
2263 throw UnicodeException("invalid Unicode code point " + std::to_string(codePoint));
2264 }
2265 int pageIndex = codePoint / numInfosInPage;
2266 while (extendedPages.size() <= pageIndex)
2267 {
2268 extendedPages.push_back(std::unique_ptr<ExtendedCharacterInfoPage>(new ExtendedCharacterInfoPage()));
2269 }
2270 int infoIndex = codePoint % numInfosInPage;
2271 ExtendedCharacterInfoPage* extendedPage = extendedPages[pageIndex].get();
2272 return extendedPage->GetExtendedCharacterInfo(infoIndex);
2273 }
2274
// Returns true exactly when 'c' is one of the ASCII digits '0'..'9'.
//
// The range is tested directly instead of calling std::isdigit: some implementations
// classify additional single-byte characters as digits depending on the active locale,
// which would make the result depend on the environment for code points 128..255.
bool IsAsciiDigit(char32_t c)
{
    return c >= U'0' && c <= U'9';
}
2283
2284 void UnicodeInit()
2285 {
2286 CharacterTable::Init();
2287 BinaryPropertyTable::Init();
2288 GeneralCategoryTable::Init();
2289 AgeTable::Init();
2290 ScriptTable::Init();
2291 BlockTable::Init();
2292 BidiClassTable::Init();
2293 BidiPairedBracketTypeTable::Init();
2294 NumericTypeTable::Init();
2295 AliasTypeTable::Init();
2296 }
2297
2298 void UnicodeDone()
2299 {
2300 AliasTypeTable::Done();
2301 NumericTypeTable::Done();
2302 BidiPairedBracketTypeTable::Done();
2303 BidiClassTable::Done();
2304 BlockTable::Done();
2305 ScriptTable::Done();
2306 AgeTable::Done();
2307 GeneralCategoryTable::Done();
2308 BinaryPropertyTable::Done();
2309 CharacterTable::Done();
2310 }
2311
2312 } }