1 // =================================
   2 // Copyright (c) 2021 Seppo Laakko
   3 // Distributed under the MIT license
   4 // =================================
   5 
   6 #include <soulng/util/Unicode.hpp>
   7 #include <soulng/util/TextUtils.hpp>
   8 #include <boost/filesystem.hpp>
   9 #include <cctype>
  10 #include <mutex>
  11 
  12 namespace soulng { namespace unicode {
  13 
// Returns the hard-coded version string "3.10.0".
std::string CmajorVersionStr()
{
    const char* const versionText = "3.10.0";
    return std::string(versionText);
}
  18 
  19 UnicodeException::UnicodeException(const std::string& message_) : std::runtime_error(message_)
  20 {
  21 }
  22 
  23 Utf8ToUtf32Engine::Utf8ToUtf32Engine() : state(0)resultReady(false)result(U'\0')
  24 {
  25     std::memset(bytes0sizeof(bytes));
  26 }
  27 
  28 void ThrowInvalidUtf8Sequence()
  29 {
  30     throw UnicodeException("invalid UTF-8 sequence");
  31 }
  32 
  33 void Utf8ToUtf32Engine::Put(uint8_t x)
  34 {
  35     switch (state)
  36     {
  37         case 0:
  38         {
  39             resultReady = false;
  40             if ((x & 0x80u) == 0u)
  41             {
  42                 result = static_cast<char32_t>(x);
  43                 resultReady = true;
  44             }
  45             else if ((x & 0xE0u) == 0xC0u)
  46             {
  47                 bytes[0] = x;
  48                 state = 1;
  49             }
  50             else if ((x & 0xF0u) == 0xE0u)
  51             {
  52                 bytes[0] = x;
  53                 state = 2;
  54             }
  55             else if ((x & 0xF8u) == 0xF0u)
  56             {
  57                 bytes[0] = x;
  58                 state = 4;
  59             }
  60             else
  61             {
  62                 ThrowInvalidUtf8Sequence();
  63             }
  64             break;
  65         }
  66         case 1:
  67         {
  68             result = static_cast<char32_t>(0);
  69             bytes[1] = x;
  70             uint8_t b1 = bytes[1];
  71             if ((b1 & 0xC0u) != 0x80u)
  72             {
  73                 ThrowInvalidUtf8Sequence();
  74             }
  75             uint8_t shift = 0u;
  76             for (uint8_t i = 0u; i < 6u; ++i)
  77             {
  78                 uint8_t bit = b1 & 1u;
  79                 b1 = b1 >> 1u;
  80                 result = static_cast<char32_t>(static_cast<uint32_t>(result) | (static_cast<uint32_t>(bit) << shift));
  81                 ++shift;
  82             }
  83             uint8_t b0 = bytes[0];
  84             for (uint8_t i = 0u; i < 5u; ++i)
  85             {
  86                 uint8_t bit = b0 & 1u;
  87                 b0 = b0 >> 1u;
  88                 result = static_cast<char32_t>(static_cast<uint32_t>(result) | (static_cast<uint32_t>(bit) << shift));
  89                 ++shift;
  90             }
  91             resultReady = true;
  92             state = 0;
  93             break;
  94         }
  95         case 2:
  96         {
  97             bytes[1] = x;
  98             state = 3;
  99             break;
 100         }
 101         case 3:
 102         {
 103             bytes[2] = x;
 104             result = static_cast<char32_t>(0);
 105             uint8_t b2 = bytes[2];
 106             if ((b2 & 0xC0u) != 0x80u)
 107             {
 108                 ThrowInvalidUtf8Sequence();
 109             }
 110             uint8_t shift = 0u;
 111             for (uint8_t i = 0u; i < 6u; ++i)
 112             {
 113                 uint8_t bit = b2 & 1u;
 114                 b2 = b2 >> 1u;
 115                 result = static_cast<char32_t>(static_cast<uint32_t>(result) | (static_cast<uint32_t>(bit) << shift));
 116                 ++shift;
 117             }
 118             uint8_t b1 = bytes[1];
 119             if ((b1 & 0xC0u) != 0x80u)
 120             {
 121                 ThrowInvalidUtf8Sequence();
 122             }
 123             for (uint8_t i = 0u; i < 6u; ++i)
 124             {
 125                 uint8_t bit = b1 & 1u;
 126                 b1 = b1 >> 1u;
 127                 result = static_cast<char32_t>(static_cast<uint32_t>(result) | (static_cast<uint32_t>(bit) << shift));
 128                 ++shift;
 129             }
 130             uint8_t b0 = bytes[0];
 131             for (uint8_t i = 0u; i < 4u; ++i)
 132             {
 133                 uint8_t bit = b0 & 1u;
 134                 b0 = b0 >> 1u;
 135                 result = static_cast<char32_t>(static_cast<uint32_t>(result) | (static_cast<uint32_t>(bit) << shift));
 136                 ++shift;
 137             }
 138             resultReady = true;
 139             state = 0;
 140             break;
 141         }
 142         case 4:
 143         {
 144             bytes[1] = x;
 145             state = 5;
 146             break;
 147         }
 148         case 5:
 149         {
 150             bytes[2] = x;
 151             state = 6;
 152             break;
 153         }
 154         case 6:
 155         {
 156             bytes[3] = x;
 157             result = static_cast<char32_t>(0);
 158             uint8_t b3 = bytes[3];
 159             if ((b3 & 0xC0u) != 0x80u)
 160             {
 161                 ThrowInvalidUtf8Sequence();
 162             }
 163             uint8_t shift = 0u;
 164             for (uint8_t i = 0u; i < 6u; ++i)
 165             {
 166                 uint8_t bit = b3 & 1u;
 167                 b3 = b3 >> 1u;
 168                 result = static_cast<char32_t>(static_cast<uint32_t>(result) | (static_cast<uint32_t>(bit) << shift));
 169                 ++shift;
 170             }
 171             uint8_t b2 = bytes[2];
 172             if ((b2 & 0xC0u) != 0x80u)
 173             {
 174                 ThrowInvalidUtf8Sequence();
 175             }
 176             for (uint8_t i = 0u; i < 6u; ++i)
 177             {
 178                 uint8_t bit = b2 & 1u;
 179                 b2 = b2 >> 1u;
 180                 result = static_cast<char32_t>(static_cast<uint32_t>(result) | (static_cast<uint32_t>(bit) << shift));
 181                 ++shift;
 182             }
 183             uint8_t b1 = bytes[1];
 184             if ((b1 & 0xC0u) != 0x80u)
 185             {
 186                 ThrowInvalidUtf8Sequence();
 187             }
 188             for (uint8_t i = 0u; i < 6u; ++i)
 189             {
 190                 uint8_t bit = b1 & 1u;
 191                 b1 = b1 >> 1u;
 192                 result = static_cast<char32_t>(static_cast<uint32_t>(result) | (static_cast<uint32_t>(bit) << shift));
 193                 ++shift;
 194             }
 195             uint8_t b0 = bytes[0];
 196             for (uint8_t i = 0u; i < 3u; ++i)
 197             {
 198                 uint8_t bit = b0 & 1u;
 199                 b0 = b0 >> 1u;
 200                 result = static_cast<char32_t>(static_cast<uint32_t>(result) | (static_cast<uint32_t>(bit) << shift));
 201                 ++shift;
 202             }
 203             resultReady = true;
 204             state = 0;
 205             break;
 206         }
 207     }
 208 }
 209 
 210 std::u32string ToUtf32(const std::string& utf8Str)
 211 {
 212     std::u32string result;
 213     const char* p = utf8Str.c_str();
 214     int bytesRemaining = int(utf8Str.length());
 215     while (bytesRemaining > 0)
 216     {
 217         char c = *p;
 218         uint8_t x = static_cast<uint8_t>(c);
 219         if ((x & 0x80u) == 0u)
 220         {
 221             result.append(1static_cast<char32_t>(static_cast<uint32_t>(x)));
 222             --bytesRemaining;
 223             ++p;
 224         }
 225         else if ((x & 0xE0u) == 0xC0u)
 226         {
 227             if (bytesRemaining < 2)
 228             {
 229                 ThrowInvalidUtf8Sequence();
 230             }
 231             char32_t u = static_cast<char32_t>(static_cast<uint32_t>(0u));
 232             uint8_t b1 = static_cast<uint8_t>(p[1]);
 233             if ((b1 & 0xC0u) != 0x80u)
 234             {
 235                 ThrowInvalidUtf8Sequence();
 236             }
 237             uint8_t shift = 0u;
 238             for (uint8_t i = 0u; i < 6u; ++i)
 239             {
 240                 uint8_t bit = b1 & 1u;
 241                 b1 = b1 >> 1u;
 242                 u = static_cast<char32_t>(static_cast<uint32_t>(u) | (static_cast<uint32_t>(bit) << shift));
 243                 ++shift;
 244             }
 245             uint8_t b0 = x;
 246             for (uint8_t i = 0u; i < 5u; ++i)
 247             {
 248                 uint8_t bit = b0 & 1u;
 249                 b0 = b0 >> 1u;
 250                 u = static_cast<char32_t>(static_cast<uint32_t>(u) | (static_cast<uint32_t>(bit) << shift));
 251                 ++shift;
 252             }
 253             result.append(1u);
 254             bytesRemaining = bytesRemaining - 2;
 255             p = p + 2;
 256         }
 257         else if ((x & 0xF0u) == 0xE0u)
 258         {
 259             if (bytesRemaining < 3)
 260             {
 261                 ThrowInvalidUtf8Sequence();
 262             }
 263             char32_t u = static_cast<char32_t>(static_cast<uint32_t>(0u));
 264             uint8_t b2 = static_cast<uint8_t>(p[2]);
 265             if ((b2 & 0xC0u) != 0x80u)
 266             {
 267                 ThrowInvalidUtf8Sequence();
 268             }
 269             uint8_t shift = 0u;
 270             for (uint8_t i = 0u; i < 6u; ++i)
 271             {
 272                 uint8_t bit = b2 & 1u;
 273                 b2 = b2 >> 1u;
 274                 u = static_cast<char32_t>(static_cast<uint32_t>(u) | (static_cast<uint32_t>(bit) << shift));
 275                 ++shift;
 276             }
 277             uint8_t b1 = static_cast<uint8_t>(p[1]);
 278             if ((b1 & 0xC0u) != 0x80u)
 279             {
 280                 ThrowInvalidUtf8Sequence();
 281             }
 282             for (uint8_t i = 0u; i < 6u; ++i)
 283             {
 284                 uint8_t bit = b1 & 1u;
 285                 b1 = b1 >> 1u;
 286                 u = static_cast<char32_t>(static_cast<uint32_t>(u) | (static_cast<uint32_t>(bit) << shift));
 287                 ++shift;
 288             }
 289             uint8_t b0 = x;
 290             for (uint8_t i = 0u; i < 4u; ++i)
 291             {
 292                 uint8_t bit = b0 & 1u;
 293                 b0 = b0 >> 1u;
 294                 u = static_cast<char32_t>(static_cast<uint32_t>(u) | (static_cast<uint32_t>(bit) << shift));
 295                 ++shift;
 296             }
 297             result.append(1u);
 298             bytesRemaining = bytesRemaining - 3;
 299             p = p + 3;
 300         }
 301         else if ((x & 0xF8u) == 0xF0u)
 302         {
 303             if (bytesRemaining < 4)
 304             {
 305                 ThrowInvalidUtf8Sequence();
 306             }
 307             char32_t u = static_cast<char32_t>(static_cast<uint32_t>(0u));
 308             uint8_t b3 = static_cast<uint8_t>(p[3]);
 309             if ((b3 & 0xC0u) != 0x80u)
 310             {
 311                 ThrowInvalidUtf8Sequence();
 312             }
 313             uint8_t shift = 0u;
 314             for (uint8_t i = 0u; i < 6u; ++i)
 315             {
 316                 uint8_t bit = b3 & 1u;
 317                 b3 = b3 >> 1u;
 318                 u = static_cast<char32_t>(static_cast<uint32_t>(u) | (static_cast<uint32_t>(bit) << shift));
 319                 ++shift;
 320             }
 321             uint8_t b2 = static_cast<uint8_t>(p[2]);
 322             if ((b2 & 0xC0u) != 0x80u)
 323             {
 324                 ThrowInvalidUtf8Sequence();
 325             }
 326             for (uint8_t i = 0u; i < 6u; ++i)
 327             {
 328                 uint8_t bit = b2 & 1u;
 329                 b2 = b2 >> 1u;
 330                 u = static_cast<char32_t>(static_cast<uint32_t>(u) | (static_cast<uint32_t>(bit) << shift));
 331                 ++shift;
 332             }
 333             uint8_t b1 = static_cast<uint8_t>(p[1]);
 334             if ((b1 & 0xC0u) != 0x80u)
 335             {
 336                 ThrowInvalidUtf8Sequence();
 337             }
 338             for (uint8_t i = 0u; i < 6u; ++i)
 339             {
 340                 uint8_t bit = b1 & 1u;
 341                 b1 = b1 >> 1u;
 342                 u = static_cast<char32_t>(static_cast<uint32_t>(u) | (static_cast<uint32_t>(bit) << shift));
 343                 ++shift;
 344             }
 345             uint8_t b0 = x;
 346             for (uint8_t i = 0u; i < 3u; ++i)
 347             {
 348                 uint8_t bit = b0 & 1u;
 349                 b0 = b0 >> 1u;
 350                 u = static_cast<char32_t>(static_cast<uint32_t>(u) | (static_cast<uint32_t>(bit) << shift));
 351                 ++shift;
 352             }
 353             result.append(1u);
 354             bytesRemaining = bytesRemaining - 4;
 355             p = p + 4;
 356         }
 357         else
 358         {
 359             ThrowInvalidUtf8Sequence();
 360         }
 361     }
 362     return result;
 363 }
 364 
 365 std::u32string ToUtf32(const std::u16string& utf16Str)
 366 {
 367     std::u32string result;
 368     const char16_t* w = utf16Str.c_str();
 369     int remaining = int(utf16Str.length());
 370     while (remaining > 0)
 371     {
 372         char16_t w1 = *w++;
 373         --remaining;
 374         if (static_cast<uint16_t>(w1) < 0xD800u || static_cast<uint16_t>(w1) > 0xDFFFu)
 375         {
 376             result.append(1w1);
 377         }
 378         else
 379         {
 380             if (static_cast<uint16_t>(w1) < 0xD800u || static_cast<uint16_t>(w1) > 0xDBFFu)
 381             {
 382                 throw UnicodeException("invalid UTF-16 sequence");
 383             }
 384             if (remaining > 0)
 385             {
 386                 char16_t w2 = *w++;
 387                 --remaining;
 388                 if (static_cast<uint16_t>(w2) < 0xDC00u || static_cast<uint16_t>(w2) > 0xDFFFu)
 389                 {
 390                     throw UnicodeException("invalid UTF-16 sequence");
 391                 }
 392                 else
 393                 {
 394                     char32_t uprime = static_cast<char32_t>(((0x03FFu & static_cast<uint32_t>(w1)) << 10u) | (0x03FFu & static_cast<uint32_t>(w2)));
 395                     char32_t u = static_cast<char32_t>(static_cast<uint32_t>(uprime) + 0x10000u);
 396                     result.append(1u);
 397                 }
 398             }
 399             else
 400             {
 401                 throw UnicodeException("invalid UTF-16 sequence");
 402             }
 403         }
 404     }
 405     return result;
 406 }
 407 
 408 std::u16string ToUtf16(const std::u32string& utf32Str)
 409 {
 410     std::u16string result;
 411     for (char32_t u : utf32Str)
 412     {
 413         if (static_cast<uint32_t>(u) > 0x10FFFFu)
 414         {
 415             throw UnicodeException("invalid UTF-32 code point");
 416         }
 417         if (static_cast<uint32_t>(u) < 0x10000u)
 418         {
 419             if (static_cast<uint32_t>(u) >= 0xD800 && static_cast<uint32_t>(u) <= 0xDFFF)
 420             {
 421                 throw UnicodeException("invalid UTF-32 code point (reserved for UTF-16)");
 422             }
 423             char16_t x = static_cast<char16_t>(u);
 424             result.append(1x);
 425         }
 426         else
 427         {
 428             char32_t uprime = static_cast<char32_t>(static_cast<uint32_t>(u) - 0x10000u);
 429             char16_t w1 = static_cast<char16_t>(0xD800u);
 430             char16_t w2 = static_cast<char16_t>(0xDC00u);
 431             for (uint16_t i = 0u; i < 10u; ++i)
 432             {
 433                 uint16_t bit = static_cast<uint16_t>(static_cast<uint32_t>(uprime) & (static_cast<uint32_t>(0x1u) << i));
 434                 w2 = static_cast<char16_t>(static_cast<uint16_t>(w2) | bit);
 435             }
 436             for (uint16_t i = 10u; i < 20u; ++i)
 437             {
 438                 uint16_t bit = static_cast<uint16_t>((static_cast<uint32_t>(uprime) & (static_cast<uint32_t>(0x1u) << i)) >> 10u);
 439                 w1 = static_cast<char16_t>(static_cast<uint16_t>(w1) | bit);
 440             }
 441             result.append(1w1);
 442             result.append(1w2);
 443         }
 444     }
 445     return result;
 446 
 447 }
 448 
 449 std::u16string ToUtf16(const std::string& utf8Str)
 450 {
 451     return ToUtf16(ToUtf32(utf8Str));
 452 }
 453 
 454 std::string ToUtf8(const std::u32string& utf32Str)
 455 {
 456     std::string result;
 457     for (char32_t c : utf32Str)
 458     {
 459         uint32_t x = static_cast<uint32_t>(c);
 460         if (x < 0x80u)
 461         {
 462             result.append(1static_cast<char>(x & 0x7Fu));
 463         }
 464         else if (x < 0x800u)
 465         {
 466             uint8_t b1 = 0x80u;
 467             for (uint8_t i = 0u; i < 6u; ++i)
 468             {
 469                 b1 = b1 | (static_cast<uint8_t>(x & 1u) << i);
 470                 x = x >> 1u;
 471             }
 472             uint8_t b0 = 0xC0u;
 473             for (uint8_t i = 0u; i < 5u; ++i)
 474             {
 475                 b0 = b0 | (static_cast<uint8_t>(x & 1u) << i);
 476                 x = x >> 1u;
 477             }
 478             result.append(1static_cast<char>(b0));
 479             result.append(1static_cast<char>(b1));
 480         }
 481         else if (x < 0x10000u)
 482         {
 483             uint8_t b2 = 0x80u;
 484             for (uint8_t i = 0u; i < 6u; ++i)
 485             {
 486                 b2 = b2 | (static_cast<uint8_t>(x & 1u) << i);
 487                 x = x >> 1u;
 488             }
 489             uint8_t b1 = 0x80u;
 490             for (uint8_t i = 0u; i < 6u; ++i)
 491             {
 492                 b1 = b1 | (static_cast<uint8_t>(x & 1u) << i);
 493                 x = x >> 1u;
 494             }
 495             uint8_t b0 = 0xE0u;
 496             for (uint8_t i = 0u; i < 4u; ++i)
 497             {
 498                 b0 = b0 | (static_cast<uint8_t>(x & 1u) << i);
 499                 x = x >> 1u;
 500             }
 501             result.append(1static_cast<char>(b0));
 502             result.append(1static_cast<char>(b1));
 503             result.append(1static_cast<char>(b2));
 504         }
 505         else if (x < 0x110000u)
 506         {
 507             uint8_t b3 = 0x80u;
 508             for (uint8_t i = 0u; i < 6u; ++i)
 509             {
 510                 b3 = b3 | (static_cast<uint8_t>(x & 1u) << i);
 511                 x = x >> 1u;
 512             }
 513             uint8_t b2 = 0x80u;
 514             for (uint8_t i = 0u; i < 6u; ++i)
 515             {
 516                 b2 = b2 | (static_cast<uint8_t>(x & 1u) << i);
 517                 x = x >> 1u;
 518             }
 519             uint8_t b1 = 0x80u;
 520             for (uint8_t i = 0u; i < 6u; ++i)
 521             {
 522                 b1 = b1 | (static_cast<uint8_t>(x & 1u) << i);
 523                 x = x >> 1u;
 524             }
 525             uint8_t b0 = 0xF0u;
 526             for (uint8_t i = 0u; i < 3u; ++i)
 527             {
 528                 b0 = b0 | (static_cast<uint8_t>(x & 1u) << i);
 529                 x = x >> 1u;
 530             }
 531             result.append(1static_cast<char>(b0));
 532             result.append(1static_cast<char>(b1));
 533             result.append(1static_cast<char>(b2));
 534             result.append(1static_cast<char>(b3));
 535         }
 536         else
 537         {
 538             throw UnicodeException("invalid UTF-32 code point");
 539         }
 540     }
 541     return result;
 542 }
 543 
 544 std::string ToUtf8(const std::u16string& utf16Str)
 545 {
 546     return ToUtf8(ToUtf32(utf16Str));
 547 }
 548 
 549 std::u32string ToUpper(const std::u32string& s)
 550 {
 551     std::u32string upper;
 552     for (char32_t c : s)
 553     {
 554         upper.append(1ToUpper(c));
 555     }
 556     return upper;
 557 }
 558 
 559 std::u32string ToLower(const std::u32string& s)
 560 {
 561     std::u32string lower;
 562     for (char32_t c : s)
 563     {
 564         lower.append(1ToLower(c));
 565     }
 566     return lower;
 567 }
 568 
 569 std::string MakeCanonicalPropertyName(const std::string& s)
 570 {
 571     std::string propertyName;
 572     for (char c : s)
 573     {
 574         if (c != '_' && c != ' ' && c != '-')
 575         {
 576             propertyName.append(1c);
 577         }
 578     }
 579     return soulng::util::ToLower(propertyName);
 580 }
 581 
 582 BinaryProperty::BinaryProperty(BinaryPropertyId id_const std::string& shortName_const std::string& longName_) : id(id_)shortName(shortName_)longName(longName_)
 583 {
 584 }
 585 
 586 void BinaryPropertyTable::Init()
 587 {
 588     instance.reset(new BinaryPropertyTable());
 589 }
 590 
 591 void BinaryPropertyTable::Done()
 592 {
 593     instance.reset();
 594 }
 595 
// Owning pointer to the shared table; set by Init() and cleared by Done().
std::unique_ptr<BinaryPropertyTable> BinaryPropertyTable::instance;
 597 
 598 BinaryPropertyTable::BinaryPropertyTable()
 599 {
 600     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::asciiHexDigit"AHex""Ascii Hex Digit"));
 601     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::alphabetic"Alpha""Alphabetic"));
 602     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::bidiControl"Bidi C""Bidi Control"));
 603     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::bidiMirrored"Bidi M""Bidi Mirrored"));
 604     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::cased"Cased""Cased"));
 605     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::compositionExclusion"CE""Composition Exclusion"));
 606     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::caseIgnorable"CI""Case Ignorable"));
 607     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::fullCompositionExclusion"Comp Ex""Full Composition Exclusion"));
 608     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::changesWhenCasefolded"CWCF""Changes When Casefolded"));
 609     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::changesWhenCaseMapped"CWCM""Changes When Casemapped"));
 610     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::changesWhenNFKCCasefolded"CWKCF""Changes When NFKC Casefolded"));
 611     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::changesWhenLowercased"CWL""Changes When Lowercased"));
 612     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::changesWhenTitlecased"CWT""Changes When Titlecased"));
 613     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::changesWhenUppercased"CWU""Changes When Uppercased"));
 614     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::dash"Dash""Dash"));
 615     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::deprecated"Dep""Deprecated"));
 616     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::defaultIgnorableCodePoint"DI""Default Ignorable Code Point"));
 617     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::diacritic"Dia""Diacritic"));
 618     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::extender"Ext""Extender"));
 619     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::graphemeBase"Gr Base""Grapheme Base"));
 620     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::graphemeExtend"Gr Ext""Grapheme Extend"));
 621     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::graphemeLink"Gr Link""Grapheme Link"));
 622     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::hexDigit"Hex""Hex Digit"));
 623     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::hyphen"Hyphen""Hyphen"));
 624     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::idContinue"IDC""ID Continue"));
 625     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::ideographic"Ideo""Ideographic"));
 626     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::idStart"IDS""ID Start"));
 627     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::idsBinaryOperator"IDSB""IDS Binary Operator"));
 628     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::idsTrinaryOperator"IDST""IDS Trinary Operator"));
 629     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::joinControl"Join C""Join Control"));
 630     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::logicalOrderException"LOE""Logical Order Exception"));
 631     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::lowercase"Lower""Lowercase"));
 632     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::math"Math""Math"));
 633     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::noncharacterCodePoint"NChar""Noncharacter Code Point"));
 634     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::otherAlphabetic"OAlpha""Other Alphabetic"));
 635     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::otherDefaultIgnorableCodePoint"ODI""Other Default Ignorable Code Point"));
 636     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::otherGraphemeExtend"OGr Ext""Other Grapheme Extend"));
 637     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::otherIdContinue"OIDC""Other ID Continue"));
 638     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::otherIdStart"OIDS""Other ID Start"));
 639     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::otherLowercase"OLower""Other Lowercase"));
 640     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::otherMath"OMath""Other Math"));
 641     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::otherUppercase"OUpper""Other Uppercase"));
 642     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::patternSyntax"Pat Syn""Pattern Syntax"));
 643     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::patternWhiteSpace"Pat WS""Pattern White Space"));
 644     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::prependedConcatenationMark"PCM""Prepended Concatenation Mark"));
 645     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::quotationMark"QMark""Quotation Mark"));
 646     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::radical"Radical""Radical"));
 647     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::softDotted"SD""Soft Dotted"));
 648     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::sentenceterminal"STerm""Sentence Terminal"));
 649     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::terminalPunctuation"Term""Terminal Punctuation"));
 650     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::unifiedIdeograph"UIdeo""Unified Ideograph"));
 651     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::uppercase"Upper""Uppercase"));
 652     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::variationSelector"VS""Variation Selector"));
 653     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::whiteSpace"WSpace""White Space"));
 654     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::xidContinue"XIDC""XID Continue"));
 655     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::xidStart"XIDS""XID Start"));
 656     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::expandsOnNFC"XO NFC""Expands On NFC"));
 657     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::expandsOnNFD"XO NFD""Expands On NFD"));
 658     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::expandsOnNFKC"XO NFKC""Expands On NFKC"));
 659     binaryProperties.push_back(BinaryProperty(BinaryPropertyId::expandsOnNFKD"XO NFKD""Expands On NFKD"));
 660 
 661     for (const BinaryProperty& binaryProperty : binaryProperties)
 662     {
 663         binaryPropertyIdMap[binaryProperty.Id()] = &binaryProperty;
 664         shortNameMap[MakeCanonicalPropertyName(binaryProperty.ShortName())] = &binaryProperty;
 665         longNameMap[MakeCanonicalPropertyName(binaryProperty.LongName())] = &binaryProperty;
 666     }
 667 }
 668 
 669 const BinaryProperty& BinaryPropertyTable::GetBinaryProperty(BinaryPropertyId binaryPropertyId) const
 670 {
 671     auto it = binaryPropertyIdMap.find(binaryPropertyId);
 672     if (it != binaryPropertyIdMap.cend())
 673     {
 674         return *it->second;
 675     }
 676     else
 677     {
 678         throw UnicodeException("binary property " + std::to_string(static_cast<int>(binaryPropertyId)) + " not found");
 679     }
 680 }
 681 
 682 bool BinaryPropertyTable::IsBinaryProperty(const std::string& shortName) const
 683 {
 684     return shortNameMap.find(MakeCanonicalPropertyName(shortName)) != shortNameMap.cend();
 685 }
 686 
 687 const BinaryProperty& BinaryPropertyTable::GetBinaryPropertyByShortName(const std::string& shortName) const
 688 {
 689     auto it = shortNameMap.find(MakeCanonicalPropertyName(shortName));
 690     if (it != shortNameMap.cend())
 691     {
 692         return *it->second;
 693     }
 694     else
 695     {
 696         throw UnicodeException("binary property '" + shortName + "' not found");
 697     }
 698 }
 699 
 700 const BinaryProperty& BinaryPropertyTable::GetBinaryPropertyByLongName(const std::string& longName) const
 701 {
 702     auto it = longNameMap.find(MakeCanonicalPropertyName(longName));
 703     if (it != longNameMap.cend())
 704     {
 705         return *it->second;
 706     }
 707     else
 708     {
 709         throw UnicodeException("binary property '" + longName + "' not found");
 710     }
 711 }
 712 
 713 Block::Block(BlockId id_const std::string& shortName_const std::string& longName_char32_t start_char32_t end_) : id(id_)shortName(shortName_)longName(longName_)start(start_)end(end_)
 714 {
 715 }
 716 
// Owning pointer to the shared table; set by Init() and cleared by Done().
std::unique_ptr<BlockTable> BlockTable::instance;
 718 
 719 void BlockTable::Init()
 720 {
 721     instance.reset(new BlockTable());
 722 }
 723 
 724 void BlockTable::Done()
 725 {
 726     instance.reset();
 727 }
 728 
 729 BlockTable::BlockTable()
 730 {
 731     blocks.push_back(Block(BlockId::ascii"ASCII""Basic Latin"0x00000x007F));
 732     blocks.push_back(Block(BlockId::latin1Sup"Latin 1 Sup""Latin-1 Supplement"0x00800x00FF));
 733     blocks.push_back(Block(BlockId::latinExtA"Latin Ext A""Latin Extended-A"0x01000x017F));
 734     blocks.push_back(Block(BlockId::latinExtB"Latin Ext B""Latin Extended-B"0x01800x0024F));
 735     blocks.push_back(Block(BlockId::ipaExt"IPA Ext""IPA Extensions"0x02500x02AF));
 736     blocks.push_back(Block(BlockId::modifierLetters"Modifier Letters""Spacing Modifier Letters"0x02B00x02FF));
 737     blocks.push_back(Block(BlockId::diacriticals"Diacriticals""Combining Diacritical Marks"0x03000x036F));
 738     blocks.push_back(Block(BlockId::greek"Greek""Greek and Coptic"0x03700x03FF));
 739     blocks.push_back(Block(BlockId::cyrillic"Cyrillic""Cyrillic"0x04000x04FF));
 740     blocks.push_back(Block(BlockId::cyrillicSup"Cyrillic Sup""Cyrillic Supplement"0x05000x052F));
 741     blocks.push_back(Block(BlockId::armenian"Armenian""Armenian"0x05300x058F));
 742     blocks.push_back(Block(BlockId::hebrew"Hebrew""Hebrew"0x05900x05FF));
 743     blocks.push_back(Block(BlockId::arabic"Arabic""Arabic"0x06000x06FF));
 744     blocks.push_back(Block(BlockId::syriac"Syriac""Syriac"0x07000x074F));
 745     blocks.push_back(Block(BlockId::arabicSup"Arabic Sup""Arabic Supplement"0x07500x077F));
 746     blocks.push_back(Block(BlockId::thaana"Thaana""Thaana"0x07800x07BF));
 747     blocks.push_back(Block(BlockId::nko"Nko""Nko"0x07C00x07FF));
 748     blocks.push_back(Block(BlockId::samaritan"Samaritan""Samaritan"0x08000x083F));
 749     blocks.push_back(Block(BlockId::mandaic"Mandaic""Mandaic"0x08400x085F));
 750     blocks.push_back(Block(BlockId::syriacSup"Syriac Sup""Syriac Supplement"0x08600x086F));
 751     blocks.push_back(Block(BlockId::arabicExtA"Arabic Ext A""Arabic Extended-A"0x08A00x08FF));
 752     blocks.push_back(Block(BlockId::devanagari"Devanagari""Devanagari"0x09000x097F));
 753     blocks.push_back(Block(BlockId::bengali"Bengali""Bengali"0x09800x09FF));
 754     blocks.push_back(Block(BlockId::gurmukhi"Gurmukhi""Gurmukhi"0x0A000x0A7F));
 755     blocks.push_back(Block(BlockId::gujarati"Gujarati""Gujarati"0x0A800x0AFF));
 756     blocks.push_back(Block(BlockId::oriya"Oriya""Oriya"0x0B000x0B7F));
 757     blocks.push_back(Block(BlockId::tamil"Tamil""Tamil"0x0B800x0BFF));
 758     blocks.push_back(Block(BlockId::telugu"Telugu""Telugu"0x0C000x0C7F));
 759     blocks.push_back(Block(BlockId::kannada"Kannada""Kannada"0x0C800x0CFF));
 760     blocks.push_back(Block(BlockId::malayalam"Malayalam""Malayalam"0x0D000x0D7F));
 761     blocks.push_back(Block(BlockId::sinhala"Sinhala""Sinhala"0x0D800x0DFF));
 762     blocks.push_back(Block(BlockId::thai"Thai""Thai"0x0E000x0E7F));
 763     blocks.push_back(Block(BlockId::lao"Lao""Lao"0x0E800x0EFF));
 764     blocks.push_back(Block(BlockId::tibetan"Tibetan""Tibetan"0x0F000x0FFF));
 765     blocks.push_back(Block(BlockId::myanmar"Myanmar""Myanmar"0x10000x109F));
 766     blocks.push_back(Block(BlockId::georgian"Georgian""Georgian"0x10A00x10FF));
 767     blocks.push_back(Block(BlockId::jamo"Jamo""Hangul Jamo"0x11000x11FF));
 768     blocks.push_back(Block(BlockId::ethiopic"Ethiopic""Ethiopic"0x12000x137F));
 769     blocks.push_back(Block(BlockId::ethiopicSup"Ethiopic Sup""Ethiopic Supplement"0x13800x139F));
 770     blocks.push_back(Block(BlockId::cherokee"Cherokee""Cherokee"0x13A00x13FF));
 771     blocks.push_back(Block(BlockId::ucas"UCAS""Unified Canadian Aboriginal Syllabics"0x14000x167F));
 772     blocks.push_back(Block(BlockId::ogham"Ogham""Ogham"0x16800x169F));
 773     blocks.push_back(Block(BlockId::runic"Runic""Runic"0x16A00x16FF));
 774     blocks.push_back(Block(BlockId::tagalog"Tagalog""Tagalog"0x17000x171F));
 775     blocks.push_back(Block(BlockId::hanunoo"Hanunoo""Hanunoo"0x17200x173F));
 776     blocks.push_back(Block(BlockId::buhid"Buhid""Buhid"0x17400x175F));
 777     blocks.push_back(Block(BlockId::tagbanwa"Tagbanwa""Tagbanwa"0x17600x177F));
 778     blocks.push_back(Block(BlockId::khmer"Khmer""Khmer"0x17800x17FF));
 779     blocks.push_back(Block(BlockId::mongolian"Mongolian""Mongolian"0x18000x18AF));
 780     blocks.push_back(Block(BlockId::ucasExt"UCAS Ext""Unified Canadian Aboriginal Syllabics Extended"0x18B00x18FF));
 781     blocks.push_back(Block(BlockId::limbu"Limbu""Limbu"0x19000x194F));
 782     blocks.push_back(Block(BlockId::taiLe"Tai Le""Tai Le"0x19500x197F));
 783     blocks.push_back(Block(BlockId::newTaiLue"New Tai Lue""New Tai Lue"0x19800x19DF));
 784     blocks.push_back(Block(BlockId::khmerSymbols"Khmer Symbols""Khmer Symbols"0x19E00x19FF));
 785     blocks.push_back(Block(BlockId::buginese"Buginese""Buginese"0x1A000x1A1F));
 786     blocks.push_back(Block(BlockId::taiTham"Tai Tham""Tai Tham"0x1A200x1AAF));
 787     blocks.push_back(Block(BlockId::diacriticalsExt"Diacriticals Ext""Combining Diacritical Marks Extended"0x1AB00x1AFF));
 788     blocks.push_back(Block(BlockId::balinese"Balinese""Balinese"0x1B000x1B7F));
 789     blocks.push_back(Block(BlockId::sundanese"Sundanese""Sundanese"0x1B800x1BBF));
 790     blocks.push_back(Block(BlockId::batak"Batak""Batak"0x1BC00x1BFF));
 791     blocks.push_back(Block(BlockId::lepcha"Lepcha""Lepcha"0x1C000x1C4F));
 792     blocks.push_back(Block(BlockId::olChiki"Ol Chiki""Ol Chiki"0x1C500x1C7F));
 793     blocks.push_back(Block(BlockId::cyrillicExtC"Cyrillic Ext C""Cyrillic Extended-C"0x1C800x1C8F));
 794     blocks.push_back(Block(BlockId::georgianExt"Georgian Ext""Georgian Extended"0x1C900x1CBF));
 795     blocks.push_back(Block(BlockId::sundaneseSup"Sundanese Sup""Sundanese Supplement"0x1CC00x1CCF));
 796     blocks.push_back(Block(BlockId::vedicExt"Vedic Ext""Vedic Extensions"0x1CD00x1CFF));
 797     blocks.push_back(Block(BlockId::phoneticExt"Phonetic Ext""Phonetic Extensions"0x1D000x1D7F));
 798     blocks.push_back(Block(BlockId::phoneticExtSup"Phonetic Ext Sup""Phonetic Extensions Supplement"0x1D800x1DBF));
 799     blocks.push_back(Block(BlockId::diacriticalsSup"Diacriticals Sup""Combining Diacritical Marks Supplement"0x1DC00x1DFF));
 800     blocks.push_back(Block(BlockId::latinExtAdditional"Latin Ext Additional""Latin Extended Additional"0x1E000x1EFF));
 801     blocks.push_back(Block(BlockId::greekExt"Greek Ext""Greek Extended"0x1F000x1FFF));
 802     blocks.push_back(Block(BlockId::punctuation"Punctuation""General Punctuation"0x20000x206F));
 803     blocks.push_back(Block(BlockId::superAndSub"Super And Sub""Superscripts and Subscripts"0x20700x209F));
 804     blocks.push_back(Block(BlockId::currencySymbols"Currency Symbols""Currency Symbols"0x20A00x20CF));
 805     blocks.push_back(Block(BlockId::diariticalsForSymbols"Diacriticals For Symbols""Combining Diacritical Marks for Symbols"0x20D00x20FF));
 806     blocks.push_back(Block(BlockId::letterlikeSymbols"Letterlike Symbols""Letterlike Symbols"0x21000x214F));
 807     blocks.push_back(Block(BlockId::numberForms"Number Forms""Number Forms"0x21500x218F));
 808     blocks.push_back(Block(BlockId::arrows"Arrows""Arrows"0x21900x21FF));
 809     blocks.push_back(Block(BlockId::mathOperators"Math Operators""Mathematical Operators"0x22000x22FF));
 810     blocks.push_back(Block(BlockId::miscTechnical"Misc Technical""Miscellaneous Technical"0x23000x23FF));
 811     blocks.push_back(Block(BlockId::controlPictures"Control Pictures""Control Pictures"0x24000x243F));
 812     blocks.push_back(Block(BlockId::ocr"OCR""Optical Character Regognition"0x24400x245F));
 813     blocks.push_back(Block(BlockId::enclosedAlphanum"Enclosed Alphanum""Enclosed Alphanumerics"0x24600x24FF));
 814     blocks.push_back(Block(BlockId::boxDrawing"Box Drawing""Box Drawing"0x25000x257F));
 815     blocks.push_back(Block(BlockId::blockElements"Block Elements""Block Elements"0x25800x259F));
 816     blocks.push_back(Block(BlockId::geometricShapes"Geometric Shapes""Geometric Shapes"0x25A00x25FF));
 817     blocks.push_back(Block(BlockId::miscSymbols"Misc Symbols""Miscellaneous Symbols"0x26000x26FF));
 818     blocks.push_back(Block(BlockId::dingbats"Dingbats""Dingbats"0x27000x27BF));
 819     blocks.push_back(Block(BlockId::miscMathSymbolsA"Misc Math Symbols A""Miscellaneous Mathematical Symbols - A"0x27C00x27EF));
 820     blocks.push_back(Block(BlockId::supArrowsA"Sup Arrows A""Supplemental Arrows-A"0x27F00x27FF));
 821     blocks.push_back(Block(BlockId::braille"Braille""Braille Patterns"0x28000x28FF));
 822     blocks.push_back(Block(BlockId::supArrowsB"Sup Arrows B""Supplemental Arrows-B"0x29000x297F));
 823     blocks.push_back(Block(BlockId::miscMathSymbolsB"Misc Math Symbols B""Miscellaneous Mathematical Symbols-B"0x29800x29FF));
 824     blocks.push_back(Block(BlockId::supMathOperators"Sup Math Operators""Supplemental Mathematical Operators"0x2A000x2AFF));
 825     blocks.push_back(Block(BlockId::miscArrows"Misc Arrows""Miscellaneous Symbols and Arrows"0x2B000x2BFF));
 826     blocks.push_back(Block(BlockId::glagolitic"Glagolitic""Glagolitic"0x2C000x2C5F));
 827     blocks.push_back(Block(BlockId::latinExtC"Latin Ext C""Latin Extended-C"0x2C600x2C7F));
 828     blocks.push_back(Block(BlockId::coptic"Coptic""Coptic"0x2C800x2CFF));
 829     blocks.push_back(Block(BlockId::georgianSup"Georgian Sup""Georgian Supplement"0x2D000x2D2F));
 830     blocks.push_back(Block(BlockId::tifinagh"Tifinagh""Tifinagh"0x2D300x2D7F));
 831     blocks.push_back(Block(BlockId::ethiopicExt"Ethiopic Ext""Ethiopic Extended"0x2D800x2DDF));
 832     blocks.push_back(Block(BlockId::cyrillicExtA"Cyrillic Ext A""Cyrillic Extended-A"0x2DE00x2DFF));
 833     blocks.push_back(Block(BlockId::supPunctuation"Sup Punctuation""Supplemental Punctuation"0x2E000x2E7F));
 834     blocks.push_back(Block(BlockId::cjkRadicalsSup"CJK Radicals Sup""CJK Radicals Supplement"0x2E800x2EFF));
 835     blocks.push_back(Block(BlockId::kangxi"Kangxi""Kangxi Radicals"0x2F000x2FDF));
 836     blocks.push_back(Block(BlockId::idc"IDC""Ideographic Description Characters"0x2FF00x2FFF));
 837     blocks.push_back(Block(BlockId::cjkSymbols"CJK Symbols""CJK Symbols and Punctuation"0x30000x303F));
 838     blocks.push_back(Block(BlockId::hiragana"Hiragana""Hiragana"0x30400x309F));
 839     blocks.push_back(Block(BlockId::katakana"Katakana""Katakana"0x30A00x30FF));
 840     blocks.push_back(Block(BlockId::bopomofo"Bopomofo""Bopomofo"0x31000x312F));
 841     blocks.push_back(Block(BlockId::compatJamo"Compat Jamo""Hangul Compatibility Jamo"0x31300x318F));
 842     blocks.push_back(Block(BlockId::kanbun"Kanbun""Kanbun"0x31900x319F));
 843     blocks.push_back(Block(BlockId::bopomofoExt"Bopomofo Ext""Bopomofo Extended"0x31A00x31BF));
 844     blocks.push_back(Block(BlockId::cjkStrokes"CJK Strokes""CJK Strokes"0x31C00x31EF));
 845     blocks.push_back(Block(BlockId::katakanaExt"Katakana Ext""Katakana Phonetic Extensions"0x31F00x31FF));
 846     blocks.push_back(Block(BlockId::enclosedCjk"Enclosed CJK""Enclosed CJK Letters and Months"0x32000x32FF));
 847     blocks.push_back(Block(BlockId::cjkCompat"CJK Compat""CJK Compatibility"0x33000x33FF));
 848     blocks.push_back(Block(BlockId::cjkExtA"CJK Ext A""CJK Unified Ideographic Extension A"0x34000x4DBF));
 849     blocks.push_back(Block(BlockId::yijing"Yijing""Yijing Hexagram Symbols"0x4DC00x4DFF));
 850     blocks.push_back(Block(BlockId::cjk"CJK""CJK Unified Ideographs"0x4E000x9FFF));
 851     blocks.push_back(Block(BlockId::yiSyllables"Yi Syllables""Yi Syllables"0xA0000xA48F));
 852     blocks.push_back(Block(BlockId::yiRadicals"Yi Radicals""Yi Radicals"0xA0900xA4CF));
 853     blocks.push_back(Block(BlockId::lisu"Lisu""Lisu"0xA0D00xA4FF));
 854     blocks.push_back(Block(BlockId::vai"Vai""Vai"0xA5000xA63F));
 855     blocks.push_back(Block(BlockId::cyrillicExtB"Cyrillic Ext B""Cyrillic Extended-B"0xA6400xA69F));
 856     blocks.push_back(Block(BlockId::bamum"Bamum""Bamum"0xA6A00xA6FF));
 857     blocks.push_back(Block(BlockId::modifierToneLetters"Modifier Tone Letters""Modifier Tone Letters"0xA7000xA71F));
 858     blocks.push_back(Block(BlockId::latinExtD"Latin Ext D""Latin Extended-D"0xA7200xA7FF));
 859     blocks.push_back(Block(BlockId::sylotiNagri"Syloti Nagri""Syloti Nagri"0xA8000xA82F));
 860     blocks.push_back(Block(BlockId::indicNumberForms"Indic Number Forms""Common Indic Number Forms"0xA8300xA83F));
 861     blocks.push_back(Block(BlockId::phagsPa"Phags Pa""Phags-Pa"0xA8400xA87F));
 862     blocks.push_back(Block(BlockId::saurashtra"Saurashtra""Saurashtra"0xA8800xA8DF));
 863     blocks.push_back(Block(BlockId::devanagariExt"Devanagari Ext""Devanagari Extended"0xA8E00xA8FF));
 864     blocks.push_back(Block(BlockId::kayahLi"Kayah Li""Kayah Li"0xA9000xA92F));
 865     blocks.push_back(Block(BlockId::rejang"Rejang""Rejang"0xA9300xA95F));
 866     blocks.push_back(Block(BlockId::jamoExtA"Jamo Ext A""Hangul Jamo Extended-A"0xA9600xA97F));
 867     blocks.push_back(Block(BlockId::javanese"Javanese""Javanese"0xA9800xA9DF));
 868     blocks.push_back(Block(BlockId::myanmarExtB"Myanmar Ext B""Myanmar Extended - B"0xA9E00xA9FF));
 869     blocks.push_back(Block(BlockId::cham"Cham""Cham"0xAA000xAA5F));
 870     blocks.push_back(Block(BlockId::myanmarExtA"Myanmar Ext A""Myanmar Extended-A"0xAA600xAA7F));
 871     blocks.push_back(Block(BlockId::taiViet"Tai Viet""Tai Viet"0xAA800xAADF));
 872     blocks.push_back(Block(BlockId::meeteiMayekExt"Meetei Mayek Ext""Meetei Mayek Extensions"0xAAE00xAAFF));
 873     blocks.push_back(Block(BlockId::ethiopicExtA"Ethiopic Ext A""Ethiopic Extended-A"0xAB000xAB2F));
 874     blocks.push_back(Block(BlockId::latinExtE"Latin Ext E""Latin Extended-E"0xAB300xAB6F));
 875     blocks.push_back(Block(BlockId::cherokeeSup"Cherokee Sup""Cherokee Supplement"0xAB700xABBF));
 876     blocks.push_back(Block(BlockId::meeteiMayek"Meetei Mayek""Meetei Mayek"0xABC00xABFF));
 877     blocks.push_back(Block(BlockId::hangul"Hangul""Hangul Syllables"0xAC000xD7AF));
 878     blocks.push_back(Block(BlockId::jamoExtB"Jamo Ext B""Hangul Jamo Extended-B"0xD7B00xD7FF));
 879     blocks.push_back(Block(BlockId::highSurrogates"High Surrogates""High Surrogates"0xD8000xDB7F));
 880     blocks.push_back(Block(BlockId::highPuSurrogates "High PU Surrogates""High Private Use Surrogates"0xDB800xDBFF));
 881     blocks.push_back(Block(BlockId::lowSurrogates"Low Surrogates""Low Surrogates"0xDC000xDFFF));
 882     blocks.push_back(Block(BlockId::pua"PUA""Private Use Area"0xE0000xF8FF));
 883     blocks.push_back(Block(BlockId::cjkCompatIdeographs"CJK Compat Ideographs""CJK Compatibility Ideographs"0xF9000xFAFF));
 884     blocks.push_back(Block(BlockId::alphabeticPf"Alphabetic PF""Alphabetic Presentations Forms"0xFB000xFB4F));
 885     blocks.push_back(Block(BlockId::arabicPfA"Arabic PF A""Arabic Presentation Forms-A"0xFB500xFDFF));
 886     blocks.push_back(Block(BlockId::vs"VS""Variation Selectors"0xFE000xFE0F));
 887     blocks.push_back(Block(BlockId::verticalForms"Vertical Forms""Vertical Forms"0xFE100xFE1F));
 888     blocks.push_back(Block(BlockId::halfMarks"Half Marks""Combining Half Marks"0xFE200xFE2F));
 889     blocks.push_back(Block(BlockId::cjkCompatForms"CJK Compat Forms""CJK Compatibility Forms"0xFE300xFE4F));
 890     blocks.push_back(Block(BlockId::smallForms"Small Forms""Small Form Variants"0xFE500xFE6F));
 891     blocks.push_back(Block(BlockId::arabicPfB"Arabic PF B""Arabic Presentation Forms-B"0xFE700xFEFF));
 892     blocks.push_back(Block(BlockId::halfAndFullForms"Half And Full Forms""Halfwidth and Fullwidth Forms"0xFF000xFFEF));
 893     blocks.push_back(Block(BlockId::specials"Specials""Specials"0xFFF00xFFFF));
 894     blocks.push_back(Block(BlockId::linearBSyllabary"Linear B Syllabary""Linear B Syllabary"0x100000x1007F));
 895     blocks.push_back(Block(BlockId::linearBIdeograms"Linear B Ideograms""Linear B Ideograms"0x100800x100FF));
 896     blocks.push_back(Block(BlockId::aegeanNumbers"Aegean Numbers""Aegean Numbers"0x101000x1013F));
 897     blocks.push_back(Block(BlockId::ancientGreekNumbers"Ancient Greek Numbers""Ancient Greek Numbers"0x101400x1018F));
 898     blocks.push_back(Block(BlockId::ancientSymbols"Ancient Symbols""Ancient Symbols"0x101900x101CF));
 899     blocks.push_back(Block(BlockId::phaistos"Phaistos""Phaistos Disc"0x101D00x101FF));
 900     blocks.push_back(Block(BlockId::lycian"Lycian""Lycian"0x102800x1029F));
 901     blocks.push_back(Block(BlockId::carian"Carian""Carian"0x102A00x102DF));
 902     blocks.push_back(Block(BlockId::copticEpactNumbers"Coptic Epact Numbers""Coptic Epact Numbers"0x102E00x102FF));
 903     blocks.push_back(Block(BlockId::oldItalic"Old Italic""Old Italic"0x103000x1032F));
 904     blocks.push_back(Block(BlockId::gothic"Gothic""Gothic"0x103300x1034F));
 905     blocks.push_back(Block(BlockId::oldPermic"Old Permic""Old Permic"0x103500x1037F));
 906     blocks.push_back(Block(BlockId::ugaritic"Ugaritic""Ugaritic"0x103800x1039F));
 907     blocks.push_back(Block(BlockId::oldPersian"Old Persian""Old Persian"0x103A00x103DF));
 908     blocks.push_back(Block(BlockId::deseret"Deseret""Deseret"0x104000x1044F));
 909     blocks.push_back(Block(BlockId::shavian"Shavian""Shavian"0x104500x1047F));
 910     blocks.push_back(Block(BlockId::osmanya"Osmanya""Osmanya"0x104800x104AF));
 911     blocks.push_back(Block(BlockId::osage"Osage""Osage"0x104B00x104FF));
 912     blocks.push_back(Block(BlockId::elbasan"Elbasan""Elbasan"0x105000x1052F));
 913     blocks.push_back(Block(BlockId::caucasianAlbanian"Caucasian Albanian""Caucasian Albanian"0x105300x1056F));
 914     blocks.push_back(Block(BlockId::linearA"Linear A""Linear A"0x106000x1077F));
 915     blocks.push_back(Block(BlockId::cypriotSyllabary"Cypriot Syllabary""Cypriot Syllabary"0x108000x1083F));
 916     blocks.push_back(Block(BlockId::imperialAramaic"Imperial Aramaic""Imperial Aramaic"0x108400x1085F));
 917     blocks.push_back(Block(BlockId::palmyrene"Palmyrene""Palmyrene"0x108600x1087F));
 918     blocks.push_back(Block(BlockId::nabataean"Nabataean""Nabataean"0x108800x108AF));
 919     blocks.push_back(Block(BlockId::hatran"Hatran""Hatran"0x108E00x108FF));
 920     blocks.push_back(Block(BlockId::phoenician"Phoenician""Phoenician"0x109000x1091F));
 921     blocks.push_back(Block(BlockId::lydian"Lydian""Lydian"0x109200x1093F));
 922     blocks.push_back(Block(BlockId::meroiticHieroglyphs"Meroitic Hieroglyphs""Meroitic Hieroglyphs"0x109800x1099F));
 923     blocks.push_back(Block(BlockId::meroiticCursive"Meroitic Cursive""Meroitic Cursive"0x109A00x109FF));
 924     blocks.push_back(Block(BlockId::kharoshthi"Kharoshthi""Kharoshthi"0x10A000x10A5F));
 925     blocks.push_back(Block(BlockId::oldSouthArabian"Old South Arabian""Old South Arabian"0x10A600x10A7F));
 926     blocks.push_back(Block(BlockId::oldNorthArabian"Old North Arabian""Old North Arabian"0x10A800x10A9F));
 927     blocks.push_back(Block(BlockId::manichean"Manichaean""Manichaean"0x10AC00x10AFF));
 928     blocks.push_back(Block(BlockId::avestan"Avestan""Avestan"0x10B000x10B3F));
 929     blocks.push_back(Block(BlockId::inscriptionalParthian"Inscriptional Parthian""Inscriptional Parthian"0x10B400x10B5F));
 930     blocks.push_back(Block(BlockId::inscriptionalPahlavi"Inscriptional Pahlavi""Inscriptional Pahlavi"0x10B600x10B7F));
 931     blocks.push_back(Block(BlockId::psalterPahlavi"Psalter Pahlavi""Psalter Pahlavi"0x10B800x10BAF));
 932     blocks.push_back(Block(BlockId::oldTurkic"Old Turkic""Old Turkic"0x10C000x10C4F));
 933     blocks.push_back(Block(BlockId::oldHungarian"Old Hungarian""Old Hungarian"0x10C800x10CFF));
 934     blocks.push_back(Block(BlockId::hanifiRohingya"Hanifi Rohingya""Hanifi Rohingya"0x10D000x10D3F));
 935     blocks.push_back(Block(BlockId::rumi"Rumi""Rumi Numeral Symbols"0x10E600x10E7F));
 936     blocks.push_back(Block(BlockId::oldSogdian"Old Sogdian""Old Sogdian"0x10F000x10F2F));
 937     blocks.push_back(Block(BlockId::sogdian"Sogdian""Sogdian"0x10F300x10F6F));
 938     blocks.push_back(Block(BlockId::elymaic"Elymaic""Elymaic"0x10FE00x10FFF));
 939     blocks.push_back(Block(BlockId::brahmi"Brahmi""Brahmi"0x110000x1107F));
 940     blocks.push_back(Block(BlockId::kaithi"Kaithi""Kaithi"0x110800x110CF));
 941     blocks.push_back(Block(BlockId::soraSompeng"Sora Sompeng""Sora Sompeng"0x110D00x110FF));
 942     blocks.push_back(Block(BlockId::chakma"Chakma""Chakma"0x111000x1114F));
 943     blocks.push_back(Block(BlockId::mahajani"Mahajani""Mahajani"0x111500x1117F));
 944     blocks.push_back(Block(BlockId::sharada"Sharada""Sharada"0x111800x111DF));
 945     blocks.push_back(Block(BlockId::sinhalaArchaicNumbers"Sinhala Archaic Numbers""Sinhala Archaic Numbers"0x111E00x111FF));
 946     blocks.push_back(Block(BlockId::khojki"Khojki""Khojki"0x112000x1124F));
 947     blocks.push_back(Block(BlockId::multani"Multani""Multani"0x112800x112AF));
 948     blocks.push_back(Block(BlockId::khudawadi"Khudawadi""Khudawadi"0x112B00x112FF));
 949     blocks.push_back(Block(BlockId::grantha"Grantha""Grantha"0x113000x1137F));
 950     blocks.push_back(Block(BlockId::newa"Newa""Newa"0x114000x1147F));
 951     blocks.push_back(Block(BlockId::tirhuta"Tirhuta""Tirhuta"0x114800x114DF));
 952     blocks.push_back(Block(BlockId::siddham"Siddham""Siddham"0x115800x115FF));
 953     blocks.push_back(Block(BlockId::modi"Modi""Modi"0x116000x1165F));
 954     blocks.push_back(Block(BlockId::mongolianSup"Mongolian Sup""Mongolian Supplement"0x116600x1167F));
 955     blocks.push_back(Block(BlockId::takri"Takri""Takri"0x116800x116CF));
 956     blocks.push_back(Block(BlockId::ahom"Ahom""Ahom"0x117000x1173F));
 957     blocks.push_back(Block(BlockId::dogra"Dogra""Dogra"0x118000x1184F));
 958     blocks.push_back(Block(BlockId::warangCiti"Warang Citi""Warang Citi"0x118A00x118FF));
 959     blocks.push_back(Block(BlockId::nandinagari"Nandinagari""Nandinagari"0x119A00x119FF));
 960     blocks.push_back(Block(BlockId::zanabazarSquare"Zanabazar Square""Zanabazar Square"0x11A000x11A4FF));
 961     blocks.push_back(Block(BlockId::soyombo"Soyombo""Soyombo"0x11A500x11AAF));
 962     blocks.push_back(Block(BlockId::pauCinHau"Pau Cin Hau""Pau Cin Hau"0x11AC00x11AFF));
 963     blocks.push_back(Block(BlockId::bhaisuki"Bhaiksuki""Bhaiksuki"0x11C000x11C6F));
 964     blocks.push_back(Block(BlockId::marchen"Marchen""Marchen"0x11C700x11CBF));
 965     blocks.push_back(Block(BlockId::masaramGondi"Masaram Gondi""Masaram Gondi"0x11D000x11D5F));
 966     blocks.push_back(Block(BlockId::gunjalaGondi"Gunjala Gondi""Gunjala Gondi"0x11D600x11DAF));
 967     blocks.push_back(Block(BlockId::makasar"Makasar""Makasar"0x11EE00x11EFF));
 968     blocks.push_back(Block(BlockId::tamilSup"Tamil Sup""Tamil Sup"0x11FC00x11FFE));
 969     blocks.push_back(Block(BlockId::cuneiform"Cuneiform""Cuneiform"0x120000x123FF));
 970     blocks.push_back(Block(BlockId::cuneiformNumbers"Cuneiform Numbers""Cuneiform Numbers and Punctuation"0x124000x1247F));
 971     blocks.push_back(Block(BlockId::earlyDynasticCuneiform"Early Dynastic Cuneiform""Early Dynastic Cuneiform"0x124800x1254F));
 972     blocks.push_back(Block(BlockId::egyptianHieroglyphs"Egyptian Hieroglyphs""Egyptian Hieroglyphs"0x130000x1342F));
 973     blocks.push_back(Block(BlockId::egyptianHieroglyphFormatControls"Egyptian Hieroglyph Format Controls""Egyptian Hieroglyph Format Controls"0x134300x1343F));
 974     blocks.push_back(Block(BlockId::anatolianHieroglyphs"Anatolian Hieroglyphs""Anatolian Hieroglyphs"0x144000x1467F));
 975     blocks.push_back(Block(BlockId::bamumSup"Bamum Sup""Bamum Supplement"0x168000x16A3F));
 976     blocks.push_back(Block(BlockId::mro"Mro""Mro"0x16A400x16A6F));
 977     blocks.push_back(Block(BlockId::bassaVah"Bassa Vah""Bassa Vah"0x16AD00x16AFF));
 978     blocks.push_back(Block(BlockId::pahawhHmong"Pahawh Hmong""Pahawh Hmong"0x16B000x16B8F));
 979     blocks.push_back(Block(BlockId::medefaidrin"Medefaidrin""Medefaidrin"0x16E400x16E9F));
 980     blocks.push_back(Block(BlockId::miao"Miao""Miao"0x16F000x16F9F));
 981     blocks.push_back(Block(BlockId::ideographicSymbols"Ideographic Symbols""Ideographic Symbols and Punctuation"0x16FE00x16FFF));
 982     blocks.push_back(Block(BlockId::tangut"Tangut""Tangut"0x170000x187FF));
 983     blocks.push_back(Block(BlockId::tangutComponents"Tangut Components""Tangut Components"0x188000x18AFF));
 984     blocks.push_back(Block(BlockId::kanaSup"Kana Sup""Kana Supplement"0x1B0000x1B0FF));
 985     blocks.push_back(Block(BlockId::kanaExtA"Kana Ext A""Kana Extended-A"0x1B1000x1B12F));
 986     blocks.push_back(Block(BlockId::smallKanaExt"Small Kana Ext""Small Kana Extension"0x1B1300x1B16F));
 987     blocks.push_back(Block(BlockId::nushu"Nushu""Nushu"0x1B1700x1B2FF));
 988     blocks.push_back(Block(BlockId::duployan"Duployan""Duployan"0x1BC000x1BC9F));
 989     blocks.push_back(Block(BlockId::shorthandFormatControls"Shorthand Format Controls""Shorthand Format Controls"0x1BCA00x1BCAF));
 990     blocks.push_back(Block(BlockId::byzantineMusic"Byzantine Music""Byzantine Musical Symbols"0x1D0000x1D0FF));
 991     blocks.push_back(Block(BlockId::music"Music""Musical Symbols"0x1D1000x1D1FF));
 992     blocks.push_back(Block(BlockId::ancientGreekMusic"Ancient Greek Music""Ancient Greek Musical Notation"0x1D2000x1D24F));
 993     blocks.push_back(Block(BlockId::mayanNumerals"Mayan Numerals""Mayan Numerals"0x1D2E00x1D2FF));
 994     blocks.push_back(Block(BlockId::taiXuanJing"Tai Xuan Jing""Tai Xuan Jing Symbols"0x1D3000x1D35F));
 995     blocks.push_back(Block(BlockId::countingRod"Counting Rod""Counting Rod Numerals"0x1D3600x1D37F));
 996     blocks.push_back(Block(BlockId::mathAlphanum"Math Alphanum""Mathematical Alphanumeric Symbols"0x1D4000x1D7FF));
 997     blocks.push_back(Block(BlockId::suttonSignWriting"Sutton SignWriting""Sutton SignWriting"0x1D8000x1DAAF));
 998     blocks.push_back(Block(BlockId::glagoliticSup"Glagolitic Sup""Glagolitic Supplement"0x1E0000x1E02F));
 999     blocks.push_back(Block(BlockId::nyiakengPuachueHmong"Nyiakeng Puachue Hmong""Nyiakeng Puachue Hmong"0x1E1000x1E14F));
1000     blocks.push_back(Block(BlockId::wancho"Wancho""Wancho"0x1E2C00x1E2FF));
1001     blocks.push_back(Block(BlockId::mendeKikakui"Mende Kikakui""Mende Kikakui"0x1E8000x1E8DF));
1002     blocks.push_back(Block(BlockId::adlam"Adlam""Adlam"0x1E9000x1E95F));
1003     blocks.push_back(Block(BlockId::indicSiyaqNumbers"Indic Siyaq Numbers""Indic Siyaq Numbers"0x1EC700x1ECBF));
1004     blocks.push_back(Block(BlockId::ottomanSiyaqNumbers"Ottoman Siyaq Numbers""Ottoman Siyaq Numbers"0x1ED000x1ED4F));
1005     blocks.push_back(Block(BlockId::arabicMath"Arabic Math""Arabic Mathematical Alphabetic Symbols"0x1EE000x1EEFF));
1006     blocks.push_back(Block(BlockId::mahjong"Mahjong""Mahjong Tiles"0x1F0000x1F02F));
1007     blocks.push_back(Block(BlockId::domino"Domino""Domino Tiles"0x1F0300x1F09F));
1008     blocks.push_back(Block(BlockId::playingCards"Playing Cards""Playing Cards"0x1F0A00x1F0FF));
1009     blocks.push_back(Block(BlockId::enclosedAlphanumSup"Enclosed Alphanum Sup""Enclosed Alphanumeric Supplement"0x1F1000x1F1FF));
1010     blocks.push_back(Block(BlockId::enclosedIdeographicSup"Enclosed Ideographic Sup""Enclosed Ideographic Supplement"0x1F2000x1F2FF));
1011     blocks.push_back(Block(BlockId::miscPictographs"Misc Pictographs""Miscellaneous Symbols and Pictographs"0x1F3000x1F5FF));
1012     blocks.push_back(Block(BlockId::emoticons"Emoticons""Emoticons"0x1F6000x1F64F));
1013     blocks.push_back(Block(BlockId::ornamentalDingbats"Ornamental Dingbats""Ornamental Dingbats"0x1F6500x1F67F));
1014     blocks.push_back(Block(BlockId::transportAndMap"Transport And Map""Transport and Map Symbols"0x1F6800x1F6FF));
1015     blocks.push_back(Block(BlockId::alchemical"Alchemical""Alchemical Symbols"0x1F7000x1F77F));
1016     blocks.push_back(Block(BlockId::geometricShapesExt"Geometric Shapes Ext""Geometric Shapes Extended"0x1F7800x1F7FF));
1017     blocks.push_back(Block(BlockId::supArrowsC"Sup Arrows C""Supplemental Arrows-C"0x1F8000x1F8FF));
1018     blocks.push_back(Block(BlockId::supSymbolsAndPictographs"Sup Symbols And Pictographs""Supplemental Symbols and Pictographs"0x1F9000x1F9FF));
1019     blocks.push_back(Block(BlockId::chessSymbols"Chess Symbols""Chess Symbols"0x1FA000x1FA6F));
1020     blocks.push_back(Block(BlockId::symbolsAndPictographsExtA"Symbols And Pictographs Ext A""Symbols And Pictographs Extended A"0x1FA700x1FAFF));
1021     blocks.push_back(Block(BlockId::cjkExtB"CJK Ext B""CJK Unified Ideographs Extension B"0x200000x2A6DF));
1022     blocks.push_back(Block(BlockId::cjkExtC"CJK Ext C""CJK Unified Ideographs Extension C"0x2A7000x2B73F));
1023     blocks.push_back(Block(BlockId::cjkExtD"CJK Ext D""CJK Unified Ideographs Extension D"0x2B7400x2B81F));
1024     blocks.push_back(Block(BlockId::cjkExtE"CJK Ext E""CJK Unified Ideographs Extension E"0x2B8200x2CEAF));
1025     blocks.push_back(Block(BlockId::cjkExtF"CJK Ext F""CJK Unified Ideographs Extension F"0x2CEB00x2EBEF));
1026     blocks.push_back(Block(BlockId::cjkCompatIdeographsSup"CJK Compat Ideographs Sup""CJK Compatibility Ideographs Supplement"0x2F8000x2FA1F));
1027     blocks.push_back(Block(BlockId::tags"Tags""Tags"0xE00000xE007F));
1028     blocks.push_back(Block(BlockId::vsSup"VS Sup""Variation Selectors Supplement"0xE01000xE01EF));
1029     blocks.push_back(Block(BlockId::supPuaA"Sup PUA A""Supplementary Private Use Area-A"0xF00000xFFFFF));
1030     blocks.push_back(Block(BlockId::supPuaB"Sup PUA B""Supplementary Private Use Area-B"0x1000000x10FFFF));
1031 
1032     for (const Block& block : blocks)
1033     {
1034         blockIdMap[block.Id()] = &block;
1035         shortNameMap[MakeCanonicalPropertyName(block.ShortName())] = &block;
1036         longNameMap[MakeCanonicalPropertyName(block.LongName())] = &block;
1037     }
1038 }
1039 
1040 const Block& BlockTable::GetBlock(BlockId blockId) const
1041 {
1042     auto it = blockIdMap.find(blockId);
1043     if (it != blockIdMap.cend())
1044     {
1045         return *it->second;
1046     }
1047     else
1048     {
1049         throw UnicodeException("block id " + std::to_string(static_cast<int>(blockId)) + " not found");
1050     }
1051 }
1052 
1053 const Block& BlockTable::GetBlockByShortName(const std::string& shortName) const
1054 {
1055     auto it = shortNameMap.find(MakeCanonicalPropertyName(shortName));
1056     if (it != shortNameMap.cend())
1057     {
1058         return *it->second;
1059     }
1060     else
1061     {
1062         throw UnicodeException("block '" + shortName + "' not found");
1063     }
1064 }
1065 
1066 const Block& BlockTable::GetBlockByLongName(const std::string& longName) const
1067 {
1068     auto it = longNameMap.find(MakeCanonicalPropertyName(longName));
1069     if (it != longNameMap.cend())
1070     {
1071         return *it->second;
1072     }
1073     else
1074     {
1075         throw UnicodeException("block '" + longName + "' not found");
1076     }
1077 }
1078 
1079 GeneralCategory::GeneralCategory(GeneralCategoryId id_const std::string& shortName_const std::string& longName_) : id(id_)shortName(shortName_)longName(longName_)
1080 {
1081 }
1082 
1083 void GeneralCategoryTable::Init()
1084 {
1085     instance.reset(new GeneralCategoryTable());
1086 }
1087 
1088 void GeneralCategoryTable::Done()
1089 {
1090     instance.reset();
1091 }
1092 
// Owning pointer to the general category table; set by Init() and released by Done().
std::unique_ptr<GeneralCategoryTable> GeneralCategoryTable::instance;
1094 
1095 GeneralCategoryTable::GeneralCategoryTable()
1096 {
1097     generalCategories.push_back(GeneralCategory(GeneralCategoryId::Lu"Lu""Uppercase Letter"));
1098     generalCategories.push_back(GeneralCategory(GeneralCategoryId::Lu"Ll""Lowercase Letter"));
1099     generalCategories.push_back(GeneralCategory(GeneralCategoryId::Lt"Lt""Titlecase Letter"));
1100     generalCategories.push_back(GeneralCategory(GeneralCategoryId::LC"LC""Cased Letter"));
1101     generalCategories.push_back(GeneralCategory(GeneralCategoryId::Lm"Lm""Modifier Letter"));
1102     generalCategories.push_back(GeneralCategory(GeneralCategoryId::Lo"Lo""Other Letter"));
1103     generalCategories.push_back(GeneralCategory(GeneralCategoryId::L"L""Letter"));
1104     generalCategories.push_back(GeneralCategory(GeneralCategoryId::Mn"Mn""Nonspacing Mark"));
1105     generalCategories.push_back(GeneralCategory(GeneralCategoryId::Mc"Mc""Spacing Mark"));
1106     generalCategories.push_back(GeneralCategory(GeneralCategoryId::Me"Me""Enclosing Mark"));
1107     generalCategories.push_back(GeneralCategory(GeneralCategoryId::M"M""Mark"));
1108     generalCategories.push_back(GeneralCategory(GeneralCategoryId::Nd"Nd""Decimal Number"));
1109     generalCategories.push_back(GeneralCategory(GeneralCategoryId::Nl"Nl""Letter Number"));
1110     generalCategories.push_back(GeneralCategory(GeneralCategoryId::No"No""Other Number"));
1111     generalCategories.push_back(GeneralCategory(GeneralCategoryId::N"N""Number"));
1112     generalCategories.push_back(GeneralCategory(GeneralCategoryId::Pc"Pc""Connector Punctuation"));
1113     generalCategories.push_back(GeneralCategory(GeneralCategoryId::Pd"Pd""Dash Punctuation"));
1114     generalCategories.push_back(GeneralCategory(GeneralCategoryId::Ps"Ps""Open Punctuation"));
1115     generalCategories.push_back(GeneralCategory(GeneralCategoryId::Pe"Pe""Close Punctuation"));
1116     generalCategories.push_back(GeneralCategory(GeneralCategoryId::Pi"Pi""Initial Punctuation"));
1117     generalCategories.push_back(GeneralCategory(GeneralCategoryId::Pf"Pf""Final Punctuation"));
1118     generalCategories.push_back(GeneralCategory(GeneralCategoryId::Po"Po""Other Punctuation"));
1119     generalCategories.push_back(GeneralCategory(GeneralCategoryId::P"P""Punctuation"));
1120     generalCategories.push_back(GeneralCategory(GeneralCategoryId::Sm"Sm""Math Symbol"));
1121     generalCategories.push_back(GeneralCategory(GeneralCategoryId::Sc"Sc""Currency Symbol"));
1122     generalCategories.push_back(GeneralCategory(GeneralCategoryId::Sk"Sk""Modifier Symbol"));
1123     generalCategories.push_back(GeneralCategory(GeneralCategoryId::So"So""Other Symbol"));
1124     generalCategories.push_back(GeneralCategory(GeneralCategoryId::S"S""Symbol"));
1125     generalCategories.push_back(GeneralCategory(GeneralCategoryId::Zs"Zs""Space Separator"));
1126     generalCategories.push_back(GeneralCategory(GeneralCategoryId::Zl"Zl""Line Separator"));
1127     generalCategories.push_back(GeneralCategory(GeneralCategoryId::Zp"Zp""Paragraph Separator"));
1128     generalCategories.push_back(GeneralCategory(GeneralCategoryId::Z"Z""Separator"));
1129     generalCategories.push_back(GeneralCategory(GeneralCategoryId::Cc"Cc""Control"));
1130     generalCategories.push_back(GeneralCategory(GeneralCategoryId::Cf"Cf""Format"));
1131     generalCategories.push_back(GeneralCategory(GeneralCategoryId::Cs"Cs""Surrogate"));
1132     generalCategories.push_back(GeneralCategory(GeneralCategoryId::Co"Co""Private Use"));
1133     generalCategories.push_back(GeneralCategory(GeneralCategoryId::Cn"Cn""Unassigned"));
1134     generalCategories.push_back(GeneralCategory(GeneralCategoryId::C"C""Other"));
1135     generalCategories.push_back(GeneralCategory(GeneralCategoryId::G"G""Graphic"));
1136     generalCategories.push_back(GeneralCategory(GeneralCategoryId::B"B""Base"));
1137     for (const GeneralCategory& generalCategory : generalCategories)
1138     {
1139         generalCategoryIdMap[generalCategory.Id()] = &generalCategory;
1140         shortNameMap[MakeCanonicalPropertyName(generalCategory.ShortName())] = &generalCategory;
1141         longNameMap[MakeCanonicalPropertyName(generalCategory.LongName())] = &generalCategory;
1142     }
1143 }
1144 
1145 const GeneralCategory& GeneralCategoryTable::GetGeneralCategory(GeneralCategoryId generalCategoryId) const
1146 {
1147     auto it = generalCategoryIdMap.find(generalCategoryId);
1148     if (it != generalCategoryIdMap.cend())
1149     {
1150         return *it->second;
1151     }
1152     else
1153     {
1154         throw UnicodeException("general category " + std::to_string(static_cast<int>(generalCategoryId)) +  " not found");
1155     }
1156 }
1157 
1158 const GeneralCategory& GeneralCategoryTable::GetGeneralCategoryByShortName(const std::string& shortName) const
1159 {
1160     auto it = shortNameMap.find(MakeCanonicalPropertyName(shortName));
1161     if (it != shortNameMap.cend())
1162     {
1163         return *it->second;
1164     }
1165     else
1166     {
1167         throw UnicodeException("general category '" + shortName + "' not found");
1168     }
1169 }
1170 
1171 const GeneralCategory& GeneralCategoryTable::GetGeneralCategoryByLongName(const std::string& longName) const
1172 {
1173     auto it = longNameMap.find(MakeCanonicalPropertyName(longName));
1174     if (it != longNameMap.cend())
1175     {
1176         return *it->second;
1177     }
1178     else
1179     {
1180         throw UnicodeException("general category '" + longName + "' not found");
1181     }
1182 }
1183 
1184 Age::Age(AgeId id_const std::string& version_) : id(id_)version(version_)
1185 {
1186 }
1187 
1188 void AgeTable::Init()
1189 {
1190     instance.reset(new AgeTable());
1191 }
1192 
1193 void AgeTable::Done()
1194 {
1195     instance.reset();
1196 }
1197 
// Owning pointer to the AgeTable singleton; assigned by Init() and released by Done().
std::unique_ptr<AgeTable> AgeTable::instance;
1199 
1200 AgeTable::AgeTable()
1201 {
1202     ages.push_back(Age(AgeId::age_1_1"1.1"));
1203     ages.push_back(Age(AgeId::age_2_0"2.0"));
1204     ages.push_back(Age(AgeId::age_2_1"2.1"));
1205     ages.push_back(Age(AgeId::age_3_0"3.0"));
1206     ages.push_back(Age(AgeId::age_3_1"3.1"));
1207     ages.push_back(Age(AgeId::age_3_2"3.2"));
1208     ages.push_back(Age(AgeId::age_4_0"4.0"));
1209     ages.push_back(Age(AgeId::age_4_1"4.1"));
1210     ages.push_back(Age(AgeId::age_5_0"5.0"));
1211     ages.push_back(Age(AgeId::age_5_1"5.1"));
1212     ages.push_back(Age(AgeId::age_5_2"5.2"));
1213     ages.push_back(Age(AgeId::age_6_0"6.0"));
1214     ages.push_back(Age(AgeId::age_6_1"6.1"));
1215     ages.push_back(Age(AgeId::age_6_2"6.2"));
1216     ages.push_back(Age(AgeId::age_6_3"6.3"));
1217     ages.push_back(Age(AgeId::age_7_0"7.0"));
1218     ages.push_back(Age(AgeId::age_8_0"8.0"));
1219     ages.push_back(Age(AgeId::age_9_0"9.0"));
1220     ages.push_back(Age(AgeId::age_10_0"10.0"));
1221     ages.push_back(Age(AgeId::age_11_0"11.0"));
1222     ages.push_back(Age(AgeId::age_12_0"12.0"));
1223     ages.push_back(Age(AgeId::age_12_1"12.1"));
1224     for (const Age& age : ages)
1225     {
1226         ageIdMap[age.Id()] = &age;
1227         versionMap[age.Version()] = &age;
1228     }
1229 }
1230 
1231 const Age& AgeTable::GetAge(AgeId id) const
1232 {
1233     auto it = ageIdMap.find(id);
1234     if (it != ageIdMap.cend())
1235     {
1236         return *it->second;
1237     }
1238     else
1239     {
1240         throw UnicodeException("Unicode age " + std::to_string(static_cast<int>(id)) + " not found");
1241     }
1242 }
1243 
1244 const Age& AgeTable::GetAge(const std::string& version) const
1245 {
1246     auto it = versionMap.find(version);
1247     if (it != versionMap.cend())
1248     {
1249         return *it->second;
1250     }
1251     else
1252     {
1253         throw UnicodeException("Unicode age '" + version + "' not found");
1254     }
1255 }
1256 
1257 Script::Script(ScriptId id_const std::string& shortName_const std::string& longName_) : id(id_)shortName(shortName_)longName(longName_)
1258 {
1259 }
1260 
1261 void ScriptTable::Init()
1262 {
1263     instance.reset(new ScriptTable());
1264 }
1265 
1266 void ScriptTable::Done()
1267 {
1268     instance.reset();
1269 }
1270 
// Owning pointer to the ScriptTable singleton; assigned by Init() and released by Done().
std::unique_ptr<ScriptTable> ScriptTable::instance;
1272 
1273 ScriptTable::ScriptTable()
1274 {
1275     scripts.push_back(Script(ScriptId::adlm"Adlm""Adlam"));
1276     scripts.push_back(Script(ScriptId::aghb"Aghb""Caucasian Albanian"));
1277     scripts.push_back(Script(ScriptId::ahom"Ahom""Ahom"));
1278     scripts.push_back(Script(ScriptId::arab"Arab""Arabic"));
1279     scripts.push_back(Script(ScriptId::armi"Armi""Imperial Aramaic"));
1280     scripts.push_back(Script(ScriptId::armn"Armn""Armenian"));
1281     scripts.push_back(Script(ScriptId::avst"Avst""Avestan"));
1282     scripts.push_back(Script(ScriptId::bali"Bali""Balinese"));
1283     scripts.push_back(Script(ScriptId::bamu"Bamu""Bamum"));
1284     scripts.push_back(Script(ScriptId::bass"Bass""Bassa Vah"));
1285     scripts.push_back(Script(ScriptId::batk"Batk""Batak"));
1286     scripts.push_back(Script(ScriptId::beng"Beng""Bengali"));
1287     scripts.push_back(Script(ScriptId::bhks"Bhks""Bhaisuki"));
1288     scripts.push_back(Script(ScriptId::bopo"Bopo""Bopomofo"));
1289     scripts.push_back(Script(ScriptId::brah"Brah""Brahmi"));
1290     scripts.push_back(Script(ScriptId::brai"Brai""Braille"));
1291     scripts.push_back(Script(ScriptId::bugi"Bugi""Buginese"));
1292     scripts.push_back(Script(ScriptId::buhd"Buhd""Buhid"));
1293     scripts.push_back(Script(ScriptId::cakm"Cakm""Chakma"));
1294     scripts.push_back(Script(ScriptId::cans"Cans""Canadian Aboriginal"));
1295     scripts.push_back(Script(ScriptId::cari"Cari""Carian"));
1296     scripts.push_back(Script(ScriptId::cham"Cham""Cham"));
1297     scripts.push_back(Script(ScriptId::cher"Cher""Cherokee"));
1298     scripts.push_back(Script(ScriptId::copt"Copt""Coptic"));
1299     scripts.push_back(Script(ScriptId::cprt"Cprt""Cypriot"));
1300     scripts.push_back(Script(ScriptId::cyrl"Cyrl""Cyrillic"));
1301     scripts.push_back(Script(ScriptId::deva"Deva""Devanagari"));
1302     scripts.push_back(Script(ScriptId::dogr"Dogr""Dogra"));
1303     scripts.push_back(Script(ScriptId::dsrt"Dsrt""Deseret"));
1304     scripts.push_back(Script(ScriptId::dupl"Dupl""Duployan"));
1305     scripts.push_back(Script(ScriptId::egyp"Egyp""Egyptian Hieroglyphs"));
1306     scripts.push_back(Script(ScriptId::elba"Elba""Elbasan"));
1307     scripts.push_back(Script(ScriptId::elym"Elym""Elymaic"));
1308     scripts.push_back(Script(ScriptId::ethi"Ethi""Ethiopian"));
1309     scripts.push_back(Script(ScriptId::geor"Geor""Georgian"));
1310     scripts.push_back(Script(ScriptId::glag"Glag""Glagolitic"));
1311     scripts.push_back(Script(ScriptId::gong"Gong""Gunjala Gondi"));
1312     scripts.push_back(Script(ScriptId::gonm"Gonm""Masaram Gondi"));
1313     scripts.push_back(Script(ScriptId::goth"Goth""Gothic"));
1314     scripts.push_back(Script(ScriptId::gran"Gran""Grantha"));
1315     scripts.push_back(Script(ScriptId::grek"Grek""Greek"));
1316     scripts.push_back(Script(ScriptId::gujr"Gujr""Gujarati"));
1317     scripts.push_back(Script(ScriptId::guru"Guru""Gurmukhi"));
1318     scripts.push_back(Script(ScriptId::hang"Hang""Hangul"));
1319     scripts.push_back(Script(ScriptId::hani"Hani""Han"));
1320     scripts.push_back(Script(ScriptId::hano"Hano""Hanunoo"));
1321     scripts.push_back(Script(ScriptId::hatr"Hatr""Hatran"));
1322     scripts.push_back(Script(ScriptId::hebr"Hebr""Hebrew"));
1323     scripts.push_back(Script(ScriptId::hira"Hira""Hiragana"));
1324     scripts.push_back(Script(ScriptId::hluw"Hluw""Anatolian Hieroglyphs"));
1325     scripts.push_back(Script(ScriptId::hmng"Hmng""Pahawh Hmong"));
1326     scripts.push_back(Script(ScriptId::hmnp"Hmnp""Nyiakeng Puachue Hmong"));
1327     scripts.push_back(Script(ScriptId::hrkt"Hrkt""Katakana Or Hiragana"));
1328     scripts.push_back(Script(ScriptId::hung"Hung""Old Hungarian"));
1329     scripts.push_back(Script(ScriptId::ital"Ital""Old Italic"));
1330     scripts.push_back(Script(ScriptId::java"Java""Javanese"));
1331     scripts.push_back(Script(ScriptId::kali"Kali""Kayah Li"));
1332     scripts.push_back(Script(ScriptId::kana"Kana""Katakana"));
1333     scripts.push_back(Script(ScriptId::khar"Khar""Kharoshthi"));
1334     scripts.push_back(Script(ScriptId::khmr"Khmr""Khmer"));
1335     scripts.push_back(Script(ScriptId::khoj"Khoj""Khojki"));
1336     scripts.push_back(Script(ScriptId::knda"Knda""Kannada"));
1337     scripts.push_back(Script(ScriptId::kthi"Kthi""Kaithi"));
1338     scripts.push_back(Script(ScriptId::lana"Lana""Tai Tham"));
1339     scripts.push_back(Script(ScriptId::laoo"Laoo""Lao"));
1340     scripts.push_back(Script(ScriptId::latn"Latn""Latin"));
1341     scripts.push_back(Script(ScriptId::lepc"Lepc""Lepcha"));
1342     scripts.push_back(Script(ScriptId::limb"Limb""Limbu"));
1343     scripts.push_back(Script(ScriptId::lina"Lina""Linear A"));
1344     scripts.push_back(Script(ScriptId::linb"Linb""Linear B"));
1345     scripts.push_back(Script(ScriptId::lisu"Lisu""Lisu"));
1346     scripts.push_back(Script(ScriptId::lyci"Lyci""Lycian"));
1347     scripts.push_back(Script(ScriptId::lydi"Lydi""Lydian"));
1348     scripts.push_back(Script(ScriptId::mahj"Mahj""Mahajani"));
1349     scripts.push_back(Script(ScriptId::maka"Maka""Makasar"));
1350     scripts.push_back(Script(ScriptId::mand"Mand""Mandaic"));
1351     scripts.push_back(Script(ScriptId::mani"Mani""Manichaean"));
1352     scripts.push_back(Script(ScriptId::marc"Marc""Marchen"));
1353     scripts.push_back(Script(ScriptId::medf"Medf""Medefaidrin"));
1354     scripts.push_back(Script(ScriptId::mend"Mend""Mende Kikakui"));
1355     scripts.push_back(Script(ScriptId::merc"Merc""Meroitic Cursive"));
1356     scripts.push_back(Script(ScriptId::mero"Mero""Meroitic Hieroglyphs"));
1357     scripts.push_back(Script(ScriptId::mlym"Mlym""Malayalam"));
1358     scripts.push_back(Script(ScriptId::modi"Modi""Modi"));
1359     scripts.push_back(Script(ScriptId::mong"Mong""Mongolian"));
1360     scripts.push_back(Script(ScriptId::mroo"Mroo""Mro"));
1361     scripts.push_back(Script(ScriptId::mtei"Mtei""Meetei Mayak"));
1362     scripts.push_back(Script(ScriptId::mult"Mult""Multani"));
1363     scripts.push_back(Script(ScriptId::mymr"Mymr""Myanmar"));
1364     scripts.push_back(Script(ScriptId::nand"Nand""Nandinagari"));
1365     scripts.push_back(Script(ScriptId::narb"Narb""Old North Arabian"));
1366     scripts.push_back(Script(ScriptId::nbat"Nbat""Nabataean"));
1367     scripts.push_back(Script(ScriptId::newa"Newa""Newa"));
1368     scripts.push_back(Script(ScriptId::nkoo"Nkoo""Nko"));
1369     scripts.push_back(Script(ScriptId::nshu"Nshu""Nushu"));
1370     scripts.push_back(Script(ScriptId::ogam"Ogam""Ogham"));
1371     scripts.push_back(Script(ScriptId::olck"Olck""Ol Chiki"));
1372     scripts.push_back(Script(ScriptId::orkh"Orkh""Old Turkic"));
1373     scripts.push_back(Script(ScriptId::orya"Orya""Oriya"));
1374     scripts.push_back(Script(ScriptId::osge"Osge""Osage"));
1375     scripts.push_back(Script(ScriptId::osma"Osma""Osmanya"));
1376     scripts.push_back(Script(ScriptId::palm"Palm""Palmyrene"));
1377     scripts.push_back(Script(ScriptId::pauc"Pauc""Pau Cin Hau"));
1378     scripts.push_back(Script(ScriptId::perm"Perm""Old Permic"));
1379     scripts.push_back(Script(ScriptId::phag"Phag""Phags Pa"));
1380     scripts.push_back(Script(ScriptId::phli"Phli""Inscriptional Pahlavi"));
1381     scripts.push_back(Script(ScriptId::phlp"Phlp""Psalter Pahlavi"));
1382     scripts.push_back(Script(ScriptId::phnx"Phnx""Phoenician"));
1383     scripts.push_back(Script(ScriptId::plrd"Plrd""Miao"));
1384     scripts.push_back(Script(ScriptId::prti"Prti""Inscriptional Parthian"));
1385     scripts.push_back(Script(ScriptId::rjng"Rjng""Rejang"));
1386     scripts.push_back(Script(ScriptId::rohg"Rohg""Hanifi Rohingya"));
1387     scripts.push_back(Script(ScriptId::runr"Runr""Runic"));
1388     scripts.push_back(Script(ScriptId::samr"Samr""Samaritan"));
1389     scripts.push_back(Script(ScriptId::sarb"Sarb""Old South Arabian"));
1390     scripts.push_back(Script(ScriptId::saur"Saur""Saurashtra"));
1391     scripts.push_back(Script(ScriptId::sgnw"Sgnw""SignWriting"));
1392     scripts.push_back(Script(ScriptId::shaw"Shaw""Shawian"));
1393     scripts.push_back(Script(ScriptId::shrd"Shrd""Sharada"));
1394     scripts.push_back(Script(ScriptId::sidd"Sidd""Shiddham"));
1395     scripts.push_back(Script(ScriptId::sind"Sind""Khudawadi"));
1396     scripts.push_back(Script(ScriptId::sinh"Sinh""Sinhala"));
1397     scripts.push_back(Script(ScriptId::sogd"Sogd""Sogdian"));
1398     scripts.push_back(Script(ScriptId::sogo"Sogo""Old Sogdian"));
1399     scripts.push_back(Script(ScriptId::sora"Sora""Sora Sompeng"));
1400     scripts.push_back(Script(ScriptId::soyo"Soyo""Soyombo"));
1401     scripts.push_back(Script(ScriptId::sund"Sund""Sundanese"));
1402     scripts.push_back(Script(ScriptId::sylo"Sylo""Syloti Nagri"));
1403     scripts.push_back(Script(ScriptId::syrc"Syrc""Syriac"));
1404     scripts.push_back(Script(ScriptId::tagb"Tagb""Tagbanwa"));
1405     scripts.push_back(Script(ScriptId::takr"Takr""Takri"));
1406     scripts.push_back(Script(ScriptId::tale"Tale""Tai Le"));
1407     scripts.push_back(Script(ScriptId::talu"Talu""New Tai Lue"));
1408     scripts.push_back(Script(ScriptId::taml"Taml""Tamil"));
1409     scripts.push_back(Script(ScriptId::tang"Tang""Tangut"));
1410     scripts.push_back(Script(ScriptId::tavt"Tavt""Tai Viet"));
1411     scripts.push_back(Script(ScriptId::telu"Telu""Telugu"));
1412     scripts.push_back(Script(ScriptId::tfng"Tfng""Tifinag"));
1413     scripts.push_back(Script(ScriptId::tglg"Tglg""Tagalog"));
1414     scripts.push_back(Script(ScriptId::thaa"Thaa""Thaana"));
1415     scripts.push_back(Script(ScriptId::thai"Thai""Thai"));
1416     scripts.push_back(Script(ScriptId::tibt"Tibt""Tibetan"));
1417     scripts.push_back(Script(ScriptId::tirh"Tirh""Tirhuta"));
1418     scripts.push_back(Script(ScriptId::ugar"Ugar""Ugaritic"));
1419     scripts.push_back(Script(ScriptId::vaii"Vaii""Vai"));
1420     scripts.push_back(Script(ScriptId::wara"Wara""Warang Citi"));
1421     scripts.push_back(Script(ScriptId::wcho"Wcho""Wcho"));
1422     scripts.push_back(Script(ScriptId::xpeo"Xpeo""Old Persian"));
1423     scripts.push_back(Script(ScriptId::xsux"Xsux""Cuneiform"));
1424     scripts.push_back(Script(ScriptId::yiii"Yiii""Yi"));
1425     scripts.push_back(Script(ScriptId::zanb"Zanb""Zanabazar Square"));
1426     scripts.push_back(Script(ScriptId::zinh"Zinh""Inherited"));
1427     scripts.push_back(Script(ScriptId::zyyy"Zyyy""Common"));
1428     scripts.push_back(Script(ScriptId::zzzz"Zzzz""Unknown"));
1429 
1430     for (const Script& script : scripts)
1431     {
1432         scriptIdMap[script.Id()] = &script;
1433         shortNameMap[MakeCanonicalPropertyName(script.ShortName())] = &script;
1434         longNameMap[MakeCanonicalPropertyName(script.LongName())] = &script;
1435     }
1436 }
1437 
1438 const Script& ScriptTable::GetScript(ScriptId id) const
1439 {
1440     auto it = scriptIdMap.find(id);
1441     if (it != scriptIdMap.cend())
1442     {
1443         return *it->second;
1444     }
1445     else
1446     {
1447         throw UnicodeException("script id " + std::to_string(static_cast<int>(id)) + " not found");
1448     }
1449 }
1450 
1451 const Script& ScriptTable::GetScriptByShortName(const std::string& shortName) const
1452 {
1453     auto it = shortNameMap.find(MakeCanonicalPropertyName(shortName));
1454     if (it != shortNameMap.cend())
1455     {
1456         return *it->second;
1457     }
1458     else
1459     {
1460         throw UnicodeException("script '" + shortName + "' not found");
1461     }
1462 }
1463 
1464 const Script& ScriptTable::GetScriptByLongName(const std::string& longName) const
1465 {
1466     auto it = longNameMap.find(MakeCanonicalPropertyName(longName));
1467     if (it != longNameMap.cend())
1468     {
1469         return *it->second;
1470     }
1471     else
1472     {
1473         throw UnicodeException("script '" + longName + "' not found");
1474     }
1475 }
1476 
1477 CharacterInfo::CharacterInfo() :
1478     binaryProperties(0)generalCategory(GeneralCategoryId::none)upper(0)lower(0)title(0)folding(0)block(BlockId::none)age(AgeId::age_unassigned)script(ScriptId::none)
1479 {
1480 }
1481 
1482 void CharacterInfo::Write(BinaryWriter& writer)
1483 {
1484     writer.Write(binaryProperties);
1485     writer.Write(static_cast<uint32_t>(generalCategory));
1486     writer.Write(upper);
1487     writer.Write(lower);
1488     writer.Write(title);
1489     writer.Write(folding);
1490     writer.Write(static_cast<uint16_t>(block));
1491     writer.Write(static_cast<uint8_t>(age));
1492     writer.Write(static_cast<uint8_t>(script));
1493 }
1494 
1495 void CharacterInfo::Read(BinaryReader& reader)
1496 {
1497     binaryProperties = reader.ReadULong();
1498     generalCategory = static_cast<GeneralCategoryId>(reader.ReadUInt());
1499     upper = reader.ReadUChar();
1500     lower = reader.ReadUChar();
1501     title = reader.ReadUChar();
1502     folding = reader.ReadUChar();
1503     block = static_cast<BlockId>(reader.ReadUShort());
1504     age = static_cast<AgeId>(reader.ReadByte());
1505     script = static_cast<ScriptId>(reader.ReadByte());
1506 }
1507 
1508 NumericType::NumericType(NumericTypeId id_const std::string& shortName_const std::string& longName_) : id(id_)shortName(shortName_)longName(longName_)
1509 {
1510 }
1511 
1512 void NumericTypeTable::Init()
1513 {
1514     instance.reset(new NumericTypeTable());
1515 }
1516 
1517 void NumericTypeTable::Done()
1518 {
1519     instance.reset();
1520 }
1521 
// Owning pointer to the NumericTypeTable singleton; assigned by Init() and released by Done().
std::unique_ptr<NumericTypeTable> NumericTypeTable::instance;
1523 
1524 NumericTypeTable::NumericTypeTable()
1525 {
1526     numericTypes.push_back(NumericType(NumericTypeId::none"None""None"));
1527     numericTypes.push_back(NumericType(NumericTypeId::de"De""Decimal"));
1528     numericTypes.push_back(NumericType(NumericTypeId::di"Di""Digit"));
1529     numericTypes.push_back(NumericType(NumericTypeId::nu"Nu""Numeric"));
1530     for (const NumericType& numericType : numericTypes)
1531     {
1532         numericTypeMap[numericType.Id()] = &numericType;
1533         shortNameMap[MakeCanonicalPropertyName(numericType.ShortName())] = &numericType;
1534         longNameMap[MakeCanonicalPropertyName(numericType.LongName())] = &numericType;;
1535     }
1536 }
1537 
1538 const NumericType& NumericTypeTable::GetNumericType(NumericTypeId id) const
1539 {
1540     auto it = numericTypeMap.find(id);
1541     if (it != numericTypeMap.cend())
1542     {
1543         return *it->second;
1544     }
1545     else
1546     {
1547         throw UnicodeException("numeric type " + std::to_string(static_cast<int>(id)) + " not found");
1548     }
1549 }
1550 
1551 const NumericType& NumericTypeTable::GetNumericTypeByShortName(const std::string& shortName) const
1552 {
1553     auto it = shortNameMap.find(MakeCanonicalPropertyName(shortName));
1554     if (it != shortNameMap.cend())
1555     {
1556         return *it->second;
1557     }
1558     else
1559     {
1560         throw UnicodeException("numeric type '" + shortName + "' not found");
1561     }
1562 }
1563 
1564 const NumericType& NumericTypeTable::GetNumericTypeByLongName(const std::string& longName) const
1565 {
1566     auto it = longNameMap.find(MakeCanonicalPropertyName(longName));
1567     if (it != longNameMap.cend())
1568     {
1569         return *it->second;
1570     }
1571     else
1572     {
1573         throw UnicodeException("numeric type '" + longName + "' not found");
1574     }
1575 }
1576 
1577 BidiClass::BidiClass(BidiClassId id_const std::string& shortName_const std::string& longName_) : id(id_)shortName(shortName_)longName(longName_)
1578 {
1579 }
1580 
1581 void BidiClassTable::Init()
1582 {
1583     instance.reset(new BidiClassTable());
1584 }
1585 
1586 void BidiClassTable::Done()
1587 {
1588     instance.reset();
1589 }
1590 
// Owning pointer to the BidiClassTable singleton; assigned by Init() and released by Done().
std::unique_ptr<BidiClassTable> BidiClassTable::instance;
1592 
1593 BidiClassTable::BidiClassTable()
1594 {
1595     bidiClasses.push_back(BidiClass(BidiClassId::al"AL""Arabic Letter"));
1596     bidiClasses.push_back(BidiClass(BidiClassId::an"AN""Arabic Number"));
1597     bidiClasses.push_back(BidiClass(BidiClassId::b"B""Paragraph Separator"));
1598     bidiClasses.push_back(BidiClass(BidiClassId::bn"BN""Boundary Neutral"));
1599     bidiClasses.push_back(BidiClass(BidiClassId::cs"CS""Common Separator"));
1600     bidiClasses.push_back(BidiClass(BidiClassId::en"EN""European Number"));
1601     bidiClasses.push_back(BidiClass(BidiClassId::es"ES""European Separator"));
1602     bidiClasses.push_back(BidiClass(BidiClassId::et"ET""European Terminator"));
1603     bidiClasses.push_back(BidiClass(BidiClassId::fsi"FSI""First Strong Isolate"));
1604     bidiClasses.push_back(BidiClass(BidiClassId::l"L""Left To Right"));
1605     bidiClasses.push_back(BidiClass(BidiClassId::lre"LRE""Left To Right Embedding"));
1606     bidiClasses.push_back(BidiClass(BidiClassId::lri"LRI""Left To Right Isolate"));
1607     bidiClasses.push_back(BidiClass(BidiClassId::lro"LRO""Left To Right Override"));
1608     bidiClasses.push_back(BidiClass(BidiClassId::nsm"NSM""Nonspacing Mark"));
1609     bidiClasses.push_back(BidiClass(BidiClassId::on"ON""Other Neutral"));
1610     bidiClasses.push_back(BidiClass(BidiClassId::pdf"PDF""Pop Directional Format"));
1611     bidiClasses.push_back(BidiClass(BidiClassId::pdi"PDI""Pop Directional Isolate"));
1612     bidiClasses.push_back(BidiClass(BidiClassId::r"R""Right To Left"));
1613     bidiClasses.push_back(BidiClass(BidiClassId::rle"RLE""Right To Left Embedding"));
1614     bidiClasses.push_back(BidiClass(BidiClassId::rli"RLI""Right To Left Isolate"));
1615     bidiClasses.push_back(BidiClass(BidiClassId::rlo"RLO""Right To Left Override"));
1616     bidiClasses.push_back(BidiClass(BidiClassId::s"S""Segment Separator"));
1617     bidiClasses.push_back(BidiClass(BidiClassId::ws"WS""White Space"));
1618 
1619     for (const BidiClass& bidiClass : bidiClasses)
1620     {
1621         bidiClassMap[bidiClass.Id()] = &bidiClass;
1622         shortNameMap[MakeCanonicalPropertyName(bidiClass.ShortName())] = &bidiClass;
1623         longNameMap[MakeCanonicalPropertyName(bidiClass.LongName())] = &bidiClass;
1624     }
1625 }
1626 
1627 const BidiClass& BidiClassTable::GetBidiClass(BidiClassId id) const
1628 {
1629     auto it = bidiClassMap.find(id);
1630     if (it != bidiClassMap.cend())
1631     {
1632         return *it->second;
1633     }
1634     else
1635     {
1636         throw UnicodeException("bidi class " + std::to_string(static_cast<int>(id)) + " not found");
1637     }
1638 }
1639 
1640 const BidiClass& BidiClassTable::GetBidiClassByShortName(const std::string& shortName) const
1641 {
1642     auto it = shortNameMap.find(MakeCanonicalPropertyName(shortName));
1643     if (it != shortNameMap.cend())
1644     {
1645         return *it->second;
1646     }
1647     else
1648     {
1649         throw UnicodeException("bidi class '" + shortName + "' not found");
1650     }
1651 }
1652 
1653 const BidiClass& BidiClassTable::GetBidiClassByLongName(const std::string& longName) const
1654 {
1655     auto it = longNameMap.find(MakeCanonicalPropertyName(longName));
1656     if (it != longNameMap.cend())
1657     {
1658         return *it->second;
1659     }
1660     else
1661     {
1662         throw UnicodeException("bidi class '" + longName + "' not found");
1663     }
1664 }
1665 
1666 BidiPairedBracketType::BidiPairedBracketType(BidiPairedBracketTypeId id_const std::string& shortName_const std::string& longName_) : id(id_)shortName(shortName_)longName(longName_)
1667 {
1668 }
1669 
1670 void BidiPairedBracketTypeTable::Init()
1671 {
1672     instance.reset(new BidiPairedBracketTypeTable());
1673 }
1674 
1675 void BidiPairedBracketTypeTable::Done()
1676 {
1677     instance.reset();
1678 }
1679 
// Owning pointer to the BidiPairedBracketTypeTable singleton; assigned by Init() and released by Done().
std::unique_ptr<BidiPairedBracketTypeTable> BidiPairedBracketTypeTable::instance;
1681 
1682 BidiPairedBracketTypeTable::BidiPairedBracketTypeTable()
1683 {
1684     bidiPairedBracketTypes.push_back(BidiPairedBracketType(BidiPairedBracketTypeId::o"O""Open"));
1685     bidiPairedBracketTypes.push_back(BidiPairedBracketType(BidiPairedBracketTypeId::c"C""Close"));
1686     bidiPairedBracketTypes.push_back(BidiPairedBracketType(BidiPairedBracketTypeId::none"N""None"));
1687     for (const BidiPairedBracketType& type : bidiPairedBracketTypes)
1688     {
1689         typeMap[type.Id()] = &type;
1690         shortNameMap[MakeCanonicalPropertyName(type.ShortName())] = &type;
1691         longNameMap[MakeCanonicalPropertyName(type.LongName())] = &type;
1692     }
1693 }
1694 
1695 const BidiPairedBracketType& BidiPairedBracketTypeTable::GetBidiPairedBracketType(BidiPairedBracketTypeId id) const
1696 {
1697     auto it = typeMap.find(id);
1698     if (it != typeMap.cend())
1699     {
1700         return *it->second;
1701     }
1702     else
1703     {
1704         throw UnicodeException("Bidi paired bracket type " + std::to_string(static_cast<int>(id)) + " not found");
1705     }
1706 }
1707 
1708 const BidiPairedBracketType& BidiPairedBracketTypeTable::GetBidiPairedBracketTypeByShortName(const std::string& shortName) const
1709 {
1710     auto it = shortNameMap.find(MakeCanonicalPropertyName(shortName));
1711     if (it != shortNameMap.cend())
1712     {
1713         return *it->second;
1714     }
1715     else
1716     {
1717         throw UnicodeException("Bidi paired bracket type '" + shortName + "' not found");
1718     }
1719 }
1720 
1721 const BidiPairedBracketType& BidiPairedBracketTypeTable::GetBidiPairedBracketTypeByLongName(const std::string& longName) const
1722 {
1723     auto it = longNameMap.find(MakeCanonicalPropertyName(longName));
1724     if (it != longNameMap.cend())
1725     {
1726         return *it->second;
1727     }
1728     else
1729     {
1730         throw UnicodeException("Bidi paired bracket type '" + longName + "' not found");
1731     }
1732 }
1733 
1734 AliasType::AliasType(AliasTypeId id_const std::string& name_) : id(id_)name(name_)
1735 {
1736 }
1737 
1738 void AliasTypeTable::Init()
1739 {
1740     instance.reset(new AliasTypeTable());
1741 }
1742 
1743 void AliasTypeTable::Done()
1744 {
1745     instance.reset();
1746 }
1747 
// Owning pointer to the AliasTypeTable singleton; assigned by Init() and released by Done().
std::unique_ptr<AliasTypeTable> AliasTypeTable::instance;
1749 
1750 AliasTypeTable::AliasTypeTable()
1751 {
1752     aliasTypes.push_back(AliasType(AliasTypeId::abbreviation"abbreviation"));
1753     aliasTypes.push_back(AliasType(AliasTypeId::alternate"alternate"));
1754     aliasTypes.push_back(AliasType(AliasTypeId::control"control"));
1755     aliasTypes.push_back(AliasType(AliasTypeId::correction"correction"));
1756     aliasTypes.push_back(AliasType(AliasTypeId::figment"figment"));
1757     for (const AliasType& aliasType : aliasTypes)
1758     {
1759         aliasTypeMap[aliasType.Id()] = &aliasType;
1760         typeNameMap[aliasType.Name()] = &aliasType;
1761     }
1762 }
1763 
1764 const AliasType& AliasTypeTable::GetAliasType(AliasTypeId id) const
1765 {
1766     auto it = aliasTypeMap.find(id);
1767     if (it != aliasTypeMap.cend())
1768     {
1769         return *it->second;
1770     }
1771     else
1772     {
1773         throw UnicodeException("alias type " + std::to_string(static_cast<int>(id)) + " not found");
1774     }
1775 }
1776 
1777 const AliasType& AliasTypeTable::GetAliasType(const std::string& typeName) const
1778 {
1779     auto it = typeNameMap.find(MakeCanonicalPropertyName(typeName));
1780     if (it != typeNameMap.cend())
1781     {
1782         return *it->second;
1783     }
1784     else
1785     {
1786         throw UnicodeException("alias type '" + typeName + "' not found");
1787     }
1788 }
1789 
1790 Alias::Alias() : typeId(AliasTypeId::none)name()
1791 {
1792 }
1793 
1794 Alias::Alias(AliasTypeId typeId_const std::string& name_) : typeId(typeId_)name(name_)
1795 {
1796 }
1797 
1798 void Alias::Write(BinaryWriter& writer)
1799 {
1800     writer.Write(static_cast<uint8_t>(typeId));
1801     writer.Write(name);
1802 }
1803 
1804 void Alias::Read(BinaryReader& reader)
1805 {
1806     typeId = static_cast<AliasTypeId>(reader.ReadByte());
1807     name = reader.ReadUtf8String();
1808 }
1809 
1810 ExtendedCharacterInfo::ExtendedCharacterInfo() : characterName()unicode1Name()canonicalCombiningClass(0)fullUpper()fullLower()fullTitle()fullFolding()bidiClass(BidiClassId::none)
1811     numericType(NumericTypeId::none)numericValue()bidiPairedBracketType(BidiPairedBracketTypeId::none)bidiMirroringGlyph(0)bidiPairedBracket(0)
1812 {
1813 }
1814 
1815 void ExtendedCharacterInfo::SetCharacterName(const std::string& characterName_)
1816 {
1817     characterName = characterName_;
1818 }
1819 
1820 void ExtendedCharacterInfo::SetUnicode1Name(const std::string& unicode1Name_)
1821 {
1822     unicode1Name = unicode1Name_;
1823 }
1824 
1825 void ExtendedCharacterInfo::Write(BinaryWriter& writer)
1826 {
1827     writer.Write(characterName);
1828     writer.Write(unicode1Name);
1829     writer.Write(static_cast<uint8_t>(canonicalCombiningClass));
1830     uint8_t nu = static_cast<uint8_t>(fullUpper.length());
1831     writer.Write(nu);
1832     for (uint8_t i = 0; i < nu; ++i)
1833     {
1834         writer.Write(fullUpper[i]);
1835     }
1836     uint8_t nl = static_cast<uint8_t>(fullLower.length());
1837     writer.Write(nl);
1838     for (uint8_t i = 0; i < nl; ++i)
1839     {
1840         writer.Write(fullLower[i]);
1841     }
1842     uint8_t nt = static_cast<uint8_t>(fullTitle.length());
1843     writer.Write(nt);
1844     for (uint8_t i = 0; i < nt; ++i)
1845     {
1846         writer.Write(fullTitle[i]);
1847     }
1848     uint8_t nf = static_cast<uint8_t>(fullFolding.length());
1849     writer.Write(nf);
1850     for (uint8_t i = 0; i < nf; ++i)
1851     {
1852         writer.Write(fullFolding[i]);
1853     }
1854     writer.Write(static_cast<uint8_t>(bidiClass));
1855     writer.Write(static_cast<uint8_t>(numericType));
1856     writer.Write(numericValue);
1857     uint8_t na = static_cast<uint8_t>(aliases.size());
1858     writer.Write(na);
1859     for (uint8_t i = 0; i < na; ++i)
1860     {
1861         aliases[i].Write(writer);
1862     }
1863     writer.Write(bidiMirroringGlyph);
1864     writer.Write(static_cast<uint8_t>(bidiPairedBracketType));
1865     writer.Write(bidiPairedBracket);
1866 }
1867 
1868 void ExtendedCharacterInfo::Read(BinaryReader& reader)
1869 {
1870     characterName = reader.ReadUtf8String();
1871     unicode1Name = reader.ReadUtf8String();
1872     canonicalCombiningClass = reader.ReadByte();
1873     uint8_t nu = reader.ReadByte();
1874     for (uint8_t i = 0; i < nu; ++i)
1875     {
1876         fullUpper.append(1reader.ReadUChar());
1877     }
1878     uint8_t nl = reader.ReadByte();
1879     for (uint8_t i = 0; i < nl; ++i)
1880     {
1881         fullLower.append(1reader.ReadUChar());
1882     }
1883     uint8_t nt = reader.ReadByte();
1884     for (uint8_t i = 0; i < nt; ++i)
1885     {
1886         fullTitle.append(1reader.ReadUChar());
1887     }
1888     uint8_t nf = reader.ReadByte();
1889     for (uint8_t i = 0; i < nf; ++i)
1890     {
1891         fullFolding.append(1reader.ReadUChar());
1892     }
1893     bidiClass = static_cast<BidiClassId>(reader.ReadByte());
1894     numericType = static_cast<NumericTypeId>(reader.ReadByte());
1895     numericValue = reader.ReadUtf8String();
1896     uint8_t na = reader.ReadByte();
1897     for (uint8_t i = 0; i < na; ++i)
1898     {
1899         Alias alias;
1900         alias.Read(reader);
1901         aliases.push_back(alias);
1902     }
1903     bidiMirroringGlyph = reader.ReadUChar();
1904     bidiPairedBracketType = static_cast<BidiPairedBracketTypeId>(reader.ReadByte());
1905     bidiPairedBracket = reader.ReadUChar();
1906 }
1907 
1908 CharacterInfoPage::CharacterInfoPage()
1909 {
1910     characterInfos.resize(numInfosInPage);
1911 }
1912 
1913 CharacterInfo& CharacterInfoPage::GetCharacterInfo(int index)
1914 {
1915     if (index < 0 || index > characterInfos.size())
1916     {
1917         throw UnicodeException("invalid character info index");
1918     }
1919     return characterInfos[index];
1920 }
1921 
1922 const CharacterInfo& CharacterInfoPage::GetCharacterInfo(int index) const
1923 {
1924     if (index < 0 || index > characterInfos.size())
1925     {
1926         throw UnicodeException("invalid character info index");
1927     }
1928     return characterInfos[index];
1929 }
1930 
1931 void CharacterInfoPage::Write(BinaryWriter& writer)
1932 {
1933     for (int i = 0; i < characterInfos.size(); ++i)
1934     {
1935         CharacterInfo& info = characterInfos[i];
1936         info.Write(writer);
1937     }
1938 }
1939 
1940 void CharacterInfoPage::Read(BinaryReader& reader)
1941 {
1942     for (int i = 0; i < characterInfos.size(); ++i)
1943     {
1944         CharacterInfo& info = characterInfos[i];
1945         info.Read(reader);
1946     }
1947 }
1948 
1949 ExtendedCharacterInfoPage::ExtendedCharacterInfoPage()
1950 {
1951     extendedCharacterInfos.resize(numInfosInPage);
1952 }
1953 
1954 const ExtendedCharacterInfo& ExtendedCharacterInfoPage::GetExtendedCharacterInfo(int index) const
1955 {
1956     if (index < 0 || index > extendedCharacterInfos.size())
1957     {
1958         throw UnicodeException("invalid extended character info index");
1959     }
1960     return extendedCharacterInfos[index];
1961 }
1962 
1963 ExtendedCharacterInfo& ExtendedCharacterInfoPage::GetExtendedCharacterInfo(int index)
1964 {
1965     if (index < 0 || index > extendedCharacterInfos.size())
1966     {
1967         throw UnicodeException("invalid extended character info index");
1968     }
1969     return extendedCharacterInfos[index];
1970 }
1971 
1972 void ExtendedCharacterInfoPage::Write(BinaryWriter& writer)
1973 {
1974     int n = extendedCharacterInfos.size();
1975     for (int i = 0; i < n; ++i)
1976     {
1977         extendedCharacterInfos[i].Write(writer);
1978     }
1979 }
1980 
1981 void ExtendedCharacterInfoPage::Read(BinaryReader& reader)
1982 {
1983     int n = extendedCharacterInfos.size();
1984     for (int i = 0; i < n; ++i)
1985     {
1986         extendedCharacterInfos[i].Read(reader);
1987     }
1988 }
1989 
1990 void ExtendedCharacterInfoHeader::AllocatePages(int numExtendedPages)
1991 {
1992     extendedPageStarts.resize(numExtendedPages);
1993 }
1994 
1995 void ExtendedCharacterInfoHeader::Write(BinaryWriter& writer)
1996 {
1997     uint32_t n = extendedPageStarts.size();
1998     writer.Write(n);
1999     for (uint32_t i = 0; i < n; ++i)
2000     {
2001         writer.Write(extendedPageStarts[i]);
2002     }
2003 }
2004 
2005 void ExtendedCharacterInfoHeader::Read(BinaryReader& reader)
2006 {
2007     uint32_t n = reader.ReadUInt();
2008     for (uint32_t i = 0; i < n; ++i)
2009     {
2010         uint32_t start = reader.ReadUInt();
2011         extendedPageStarts.push_back(start);
2012     }
2013 }
2014 
2015 uint32_t ExtendedCharacterInfoHeader::GetPageStart(int pageIndex) const
2016 {
2017     if (pageIndex < 0 || pageIndex >= extendedPageStarts.size())
2018     {
2019         throw UnicodeException("invalid extended page index" + std::to_string(pageIndex));
2020     }
2021     return extendedPageStarts[pageIndex];
2022 }
2023 
2024 void ExtendedCharacterInfoHeader::SetPageStart(int pageIndexuint32_t extendedPageStart)
2025 {
2026     if (pageIndex < 0 || pageIndex >= extendedPageStarts.size())
2027     {
2028         throw UnicodeException("invalid extended page index" + std::to_string(pageIndex));
2029     }
2030     extendedPageStarts[pageIndex] = extendedPageStart;
2031 }
2032 
2033 void CharacterTable::Init()
2034 {
2035     instance.reset(new CharacterTable());
2036 }
2037 
2038 void CharacterTable::Done()
2039 {
2040     instance.reset();
2041 }
2042 
2043 std::unique_ptr<CharacterTable> CharacterTable::instance;
2044 
2045 const uint8_t headerMagic[8] =
2046 {
2047     static_cast<uint8_t>('C')static_cast<uint8_t>('M')static_cast<uint8_t>('A')static_cast<uint8_t>('J')
2048     static_cast<uint8_t>('U')static_cast<uint8_t>('C')static_cast<uint8_t>('D')current_cmajor_ucd_version
2049 };
2050 
2051 std::string CmajorRoot()
2052 {
2053     std::string cmajorRoot;
2054     const char* cmajorRootEnv = getenv("CMAJOR_ROOT");
2055     if (cmajorRootEnv)
2056     {
2057         cmajorRoot = cmajorRootEnv;
2058     }
2059     if (cmajorRoot.empty())
2060     {
2061         throw UnicodeException("please set 'CMAJOR_ROOT' environment variable to contain /path/to/cmajor-" + CmajorVersionStr() + " directory.");
2062     }
2063     return cmajorRoot;
2064 }
2065 
2066 std::string CmajorUcdFilePath()
2067 {
2068     return (boost::filesystem::path(CmajorRoot()) / boost::filesystem::path("unicode") / boost::filesystem::path("cmajor_ucd.bin")).generic_string();
2069 }
2070 
2071 CharacterTable::CharacterTable() : headerRead(false)extendedHeaderStart(0)extendedHeaderEnd(0)extendedHeaderRead(false)
2072 {
2073 }
2074 
2075 void CharacterTable::Write()
2076 {
2077     std::string ucdFilePath = CmajorUcdFilePath();
2078     BinaryWriter writer(ucdFilePath);
2079     WriteHeader(writer);
2080     writer.Seek(headerSize);
2081     int n = pages.size();
2082     for (int i = 0; i < n; ++i)
2083     {
2084         CharacterInfoPage* page = pages[i].get();
2085         page->Write(writer);
2086     }
2087     extendedHeaderStart = writer.Pos();
2088     int nx = extendedPages.size();
2089     extendedHeader.AllocatePages(nx);
2090     extendedHeader.Write(writer);
2091     extendedHeaderEnd = writer.Pos();
2092     for (int i = 0; i < nx; ++i)
2093     {
2094         extendedHeader.SetPageStart(iwriter.Pos());
2095         ExtendedCharacterInfoPage* extendedPage = extendedPages[i].get();
2096         extendedPage->Write(writer);
2097     }
2098     writer.Seek(extendedHeaderStart);
2099     extendedHeader.Write(writer);
2100     writer.Seek(0);
2101     WriteHeader(writer);
2102 }
2103 
2104 void CharacterTable::WriteHeader(BinaryWriter& writer)
2105 {
2106     for (int i = 0; i < 8; ++i)
2107     {
2108         writer.Write(headerMagic[i]);
2109     }
2110     writer.Write(uint32_t(extendedHeaderStart));
2111     writer.Write(uint32_t(extendedHeaderEnd));
2112 }
2113 
2114 void CharacterTable::ReadHeader(BinaryReader& reader)
2115 {
2116     headerRead = true;
2117     uint8_t magic[8];
2118     for (int i = 0; i < 8; ++i)
2119     {
2120         magic[i] = reader.ReadByte();
2121     }
2122     for (int i = 0; i < 7; ++i)
2123     {
2124         if (magic[i] != headerMagic[i])
2125         {
2126             throw UnicodeException("invalid cmajor_ucd.bin header magic: 'CMAJUCD' expected");
2127         }
2128     }
2129     if (magic[7] != headerMagic[7])
2130     {
2131         throw UnicodeException("invalid cmajor_ucd.bin version: version " + std::string(1headerMagic[7]) + " expected, version " + std::string(1magic[7]) + " read");
2132     }
2133     extendedHeaderStart = reader.ReadUInt();
2134     extendedHeaderEnd = reader.ReadUInt();
2135     reader.Skip(headerSize - 16);
2136 }
2137 
2138 void CharacterTable::ReadExtendedHeader(BinaryReader& reader)
2139 {
2140     extendedHeaderRead = true;
2141     extendedHeader.Read(reader);
2142 }
2143 
2144 std::mutex mtx;
2145 
2146 const CharacterInfo& CharacterTable::GetCharacterInfo(char32_t codePoint)
2147 {
2148     if (codePoint > 0x10FFFF)
2149     {
2150         throw UnicodeException("invalid Unicode code point " + std::to_string(codePoint));
2151     }
2152     int pageIndex = codePoint / numInfosInPage;
2153     if (pages.size() <= pageIndex)
2154     {
2155         std::lock_guard<std::mutex> lock(mtx);
2156         while (pages.size() <= pageIndex)
2157         {
2158             pages.push_back(std::unique_ptr<CharacterInfoPage>());
2159         }
2160     }
2161     CharacterInfoPage* page = pages[pageIndex].get();
2162     if (!page)
2163     {
2164         std::lock_guard<std::mutex> lock(mtx);
2165         if (!page)
2166         {
2167             std::string ucdFilePath = CmajorUcdFilePath();
2168             BinaryReader reader(ucdFilePath);
2169             uint32_t pageStart = 0;
2170             if (!headerRead)
2171             {
2172                 ReadHeader(reader);
2173                 pageStart = characterInfoPageSize * pageIndex;
2174             }
2175             else
2176             {
2177                 pageStart = headerSize + characterInfoPageSize * pageIndex;
2178             }
2179             reader.Skip(pageStart);
2180             page = new CharacterInfoPage();
2181             page->Read(reader);
2182             pages[pageIndex] = std::move(std::unique_ptr<CharacterInfoPage>(page));
2183         }
2184     }
2185     int infoIndex = codePoint % numInfosInPage;
2186     return page->GetCharacterInfo(infoIndex);
2187 }
2188 
2189 CharacterInfo& CharacterTable::CreateCharacterInfo(char32_t codePoint)
2190 {
2191     if (codePoint > 0x10FFFF)
2192     {
2193         throw UnicodeException("invalid Unicode code point " + std::to_string(codePoint));
2194     }
2195     int pageIndex = codePoint / numInfosInPage;
2196     while (pages.size() <= pageIndex)
2197     {
2198         pages.push_back(std::unique_ptr<CharacterInfoPage>(new CharacterInfoPage()));
2199     }
2200     int infoIndex = codePoint % numInfosInPage;
2201     CharacterInfoPage* page = pages[pageIndex].get();
2202     return page->GetCharacterInfo(infoIndex);
2203 }
2204 
2205 const ExtendedCharacterInfo& CharacterTable::GetExtendedCharacterInfo(char32_t codePoint)
2206 {
2207     if (codePoint > 0x10FFFF)
2208     {
2209         throw UnicodeException("invalid Unicode code point " + std::to_string(codePoint));
2210     }
2211     int pageIndex = codePoint / numInfosInPage;
2212     if (extendedPages.size() <= pageIndex)
2213     {
2214         std::lock_guard<std::mutex> lock(mtx);
2215         while (extendedPages.size() <= pageIndex)
2216         {
2217             extendedPages.push_back(std::unique_ptr<ExtendedCharacterInfoPage>());
2218         }
2219     }
2220     ExtendedCharacterInfoPage* extendedPage = extendedPages[pageIndex].get();
2221     if (!extendedPage)
2222     {
2223         std::lock_guard<std::mutex> lock(mtx);
2224         if (!extendedPage)
2225         {
2226             std::string ucdFilePath = CmajorUcdFilePath();
2227             BinaryReader reader(ucdFilePath);
2228             uint32_t start = 0;
2229             uint32_t pageStart = 0;
2230             if (!headerRead)
2231             {
2232                 ReadHeader(reader);
2233                 start = extendedHeaderStart - headerSize;
2234             }
2235             else
2236             {
2237                 start = extendedHeaderStart;
2238             }
2239             if (!extendedHeaderRead)
2240             {
2241                 reader.Skip(start);
2242                 ReadExtendedHeader(reader);
2243                 pageStart = extendedHeader.GetPageStart(pageIndex) - extendedHeaderEnd;
2244             }
2245             else
2246             {
2247                 pageStart = extendedHeader.GetPageStart(pageIndex);
2248             }
2249             reader.Skip(pageStart);
2250             extendedPage = new ExtendedCharacterInfoPage();
2251             extendedPage->Read(reader);
2252             extendedPages[pageIndex] = std::move(std::unique_ptr<ExtendedCharacterInfoPage>(extendedPage));
2253         }
2254     }
2255     int infoIndex = codePoint % numInfosInPage;
2256     return extendedPage->GetExtendedCharacterInfo(infoIndex);
2257 }
2258 
2259 ExtendedCharacterInfo& CharacterTable::CreateExtendedCharacterInfo(char32_t codePoint)
2260 {
2261     if (codePoint > 0x10FFFF)
2262     {
2263         throw UnicodeException("invalid Unicode code point " + std::to_string(codePoint));
2264     }
2265     int pageIndex = codePoint / numInfosInPage;
2266     while (extendedPages.size() <= pageIndex)
2267     {
2268         extendedPages.push_back(std::unique_ptr<ExtendedCharacterInfoPage>(new ExtendedCharacterInfoPage()));
2269     }
2270     int infoIndex = codePoint % numInfosInPage;
2271     ExtendedCharacterInfoPage* extendedPage = extendedPages[pageIndex].get();
2272     return extendedPage->GetExtendedCharacterInfo(infoIndex);
2273 }
2274 
// Returns true exactly when `c` is one of the ASCII decimal digits '0'..'9'.
// A plain range check is used instead of std::isdigit so that the answer cannot depend on
// the C library or the active locale for the non-ASCII values 128..255, and no narrowing
// cast of the code point is needed.
bool IsAsciiDigit(char32_t c)
{
    return c >= U'0' && c <= U'9';
}
2283 
2284 void UnicodeInit()
2285 {
2286     CharacterTable::Init();
2287     BinaryPropertyTable::Init();
2288     GeneralCategoryTable::Init();
2289     AgeTable::Init();
2290     ScriptTable::Init();
2291     BlockTable::Init();
2292     BidiClassTable::Init();
2293     BidiPairedBracketTypeTable::Init();
2294     NumericTypeTable::Init();
2295     AliasTypeTable::Init();
2296 }
2297 
2298 void UnicodeDone()
2299 {
2300     AliasTypeTable::Done();
2301     NumericTypeTable::Done();
2302     BidiPairedBracketTypeTable::Done();
2303     BidiClassTable::Done();
2304     BlockTable::Done();
2305     ScriptTable::Done();
2306     AgeTable::Done();
2307     GeneralCategoryTable::Done();
2308     BinaryPropertyTable::Done();
2309     CharacterTable::Done();
2310 }
2311 
} } // namespace soulng::unicode