Mercurial > hg > MonetDB
changeset 64972:d2c2a51cc7e5
Merge with Jul2017 branch.
author | Sjoerd Mullender <sjoerd@acm.org> |
---|---|
date | Fri, 20 Oct 2017 20:19:04 +0200 (2017-10-20) |
parents | 5633d013012c (current diff) e04a93ced123 (diff) |
children | 2e1bfee1461d |
files | MonetDB.spec configure.ag monetdb5/modules/atoms/str.c |
diffstat | 3 files changed, 2219 insertions(+), 738 deletions(-) [+] |
line wrap: on
line diff
--- a/MonetDB.spec +++ b/MonetDB.spec @@ -970,7 +970,7 @@ fi --enable-monetdb5=yes \ --enable-netcdf=no \ --enable-odbc=yes \ - --enable-optimize=yes \ + --enable-optimize=no \ --enable-profile=no \ --enable-pyintegration=%{?with_pyintegration:yes}%{!?with_pyintegration:no} \ --enable-rintegration=%{?with_rintegration:yes}%{!?with_rintegration:no} \
--- a/configure.ag +++ b/configure.ag @@ -326,19 +326,19 @@ AC_ARG_ENABLE([debug], [AS_HELP_STRING([--enable-debug], [enable full debugging (default=yes for development sources)])], [enable_debug=$enableval], - [enable_debug=def_$dft_debug]) + [enable_debug=$dft_debug]) AC_ARG_ENABLE([assert], [AS_HELP_STRING([--enable-assert], [enable assertions in the code (default=yes for development sources)])], [enable_assert=$enableval], - [enable_assert=def_$dft_assert]) + [enable_assert=$dft_assert]) AC_ARG_ENABLE([optimize], [AS_HELP_STRING([--enable-optimize], [enable extra optimization (default=no)])], [enable_optimize=$enableval], - [enable_optimize=def_$dft_optimize]) + [enable_optimize=$dft_optimize]) AC_ARG_ENABLE([strict], [AS_HELP_STRING([--enable-strict], @@ -362,7 +362,7 @@ dft_profile=$need_profile AC_ARG_ENABLE([profile], [AS_HELP_STRING([--enable-profile], [enable profiling (default=no)])], [enable_profile=$enableval], - [enable_profile=def_$dft_profile]) + [enable_profile=$dft_profile]) need_instrument=no dft_instrument=$need_instrument @@ -370,7 +370,7 @@ AC_ARG_ENABLE([instrument], [AS_HELP_STRING([--enable-instrument], [enable instrument (default=no)])], [enable_instrument=$enableval], - [enable_instrument=def_$dft_instrument]) + [enable_instrument=$dft_instrument]) # RIPEMD160 is patent free, academic and European, but unfortunately # can't use it by default, as that would exclude JDBC usage (Java
--- a/monetdb5/modules/atoms/str.c +++ b/monetdb5/modules/atoms/str.c @@ -96,51 +96,10 @@ * high-performance hash-lookup (all code inlined). */ -/* This table was generated from the Unicode 5.0.0 spec. The table is - * generated by using the codes for conversion to lower case and for - * conversion to title case and upper case. A few code points have - * been moved in order to get reasonable conversions (if two code - * points are converted to the same value, the first one in this table - * wins). The code points that have been interchanged are: - * U+0345 (COMBINING GREEK YPOGEGRAMMENI) / U+03B9 (GREEK SMALL LETTER IOTA) <-> U+0399 (GREEK CAPITAL LETTER IOTA) - * U+00B5 (MICRO SIGN) / U+03BC (GREEK SMALL LETTER MU) <-> U+039C (GREEK CAPITAL LETTER MU) - * U+03C2 (GREEK SMALL LETTER FINAL SIGMA) / U+03C3 (GREEK SMALL LETTER SIGMA) <-> U+3A3 (GREEK CAPITAL LETTER SIGMA) - * - * In addition, there are a few code points where there are different - * versions for upper case and title case. These had to be switched - * around a little so that the mappings are done sensibly. - * - * The following combinations are included in this order: - * lower case <-> title case - * lower case <- upper case - * upper case -> title case - * The conversion title case -> upper case was removed - * - * The relevant code points are: - * U+01C4 (LATIN CAPITAL LETTER DZ WITH CARON) - * U+01C5 (LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON) - * U+01C6 (LATIN SMALL LETTER DZ WITH CARON) - * U+01C7 (LATIN CAPITAL LETTER LJ) - * U+01C8 (LATIN CAPITAL LETTER L WITH SMALL LETTER J) - * U+01C9 (LATIN SMALL LETTER LJ) - * U+01CA (LATIN CAPITAL LETTER NJ) - * U+01CB (LATIN CAPITAL LETTER N WITH SMALL LETTER J) - * U+01CC (LATIN SMALL LETTER NJ) - * U+01F1 (LATIN CAPITAL LETTER DZ) - * U+01F2 (LATIN CAPITAL LETTER D WITH SMALL LETTER Z) - * U+01F3 (LATIN SMALL LETTER DZ) - * - * The script used was basically: -(cut -d\; -f1,14 UnicodeData.txt | sed -n 's/\(.*\);\(..*\)/\2;\1/p' - cut -d\; -f1,15 UnicodeData.txt | grep -v ';$' - cut -d\; -f1,13 UnicodeData.txt | grep -v ';$' -) | grep -v '^\([^ ]*\);\1$' | sort -t\; -u | sed 's/\(.*\);\(.*\)/{0x\1,0x\2,},/' - * with some hand munging afterward. The data file is UnicodeData.txt - * from http://www.unicode.org/. - */ +/* These tables were generated from the Unicode 10.0.0 spec. */ struct UTF8_lower_upper { - unsigned int lower, upper; -} UTF8_lower_upper[] = { + unsigned int from, to; +} UTF8_toUpper[] = { /* code points with non-null uppercase conversion */ { 0x0061, 0x0041, }, { 0x0062, 0x0042, }, { 0x0063, 0x0043, }, @@ -150,10 +109,8 @@ struct UTF8_lower_upper { { 0x0067, 0x0047, }, { 0x0068, 0x0048, }, { 0x0069, 0x0049, }, - { 0x0069, 0x0130, }, { 0x006A, 0x004A, }, { 0x006B, 0x004B, }, - { 0x006B, 0x212A, }, { 0x006C, 0x004C, }, { 0x006D, 0x004D, }, { 0x006E, 0x004E, }, @@ -169,14 +126,13 @@ struct UTF8_lower_upper { { 0x0078, 0x0058, }, { 0x0079, 0x0059, }, { 0x007A, 0x005A, }, - { 0x03BC, 0x039C, }, + { 0x00B5, 0x039C, }, { 0x00E0, 0x00C0, }, { 0x00E1, 0x00C1, }, { 0x00E2, 0x00C2, }, { 0x00E3, 0x00C3, }, { 0x00E4, 0x00C4, }, { 0x00E5, 0x00C5, }, - { 0x00E5, 0x212B, }, { 0x00E6, 0x00C6, }, { 0x00E7, 0x00C7, }, { 0x00E8, 0x00C8, }, @@ -286,15 +242,12 @@ struct UTF8_lower_upper { { 0x01B9, 0x01B8, }, { 0x01BD, 0x01BC, }, { 0x01BF, 0x01F7, }, - { 0x01C6, 0x01C5, }, + { 0x01C5, 0x01C4, }, { 0x01C6, 0x01C4, }, - { 0x01C4, 0x01C5, }, - { 0x01C9, 0x01C8, }, + { 0x01C8, 0x01C7, }, { 0x01C9, 0x01C7, }, - { 0x01C7, 0x01C8, }, - { 0x01CC, 0x01CB, }, + { 0x01CB, 0x01CA, }, { 0x01CC, 0x01CA, }, - { 0x01CA, 0x01CB, }, { 0x01CE, 0x01CD, }, { 0x01D0, 0x01CF, }, { 0x01D2, 0x01D1, }, @@ -313,9 +266,8 @@ struct UTF8_lower_upper { { 0x01EB, 0x01EA, }, { 0x01ED, 0x01EC, }, { 0x01EF, 0x01EE, }, - { 0x01F3, 0x01F2, }, + { 0x01F2, 0x01F1, }, { 0x01F3, 0x01F1, }, - { 0x01F1, 0x01F2, }, { 0x01F5, 0x01F4, }, { 0x01F9, 0x01F8, }, { 0x01FB, 0x01FA, }, @@ -347,36 +299,54 @@ struct UTF8_lower_upper { { 0x0231, 0x0230, }, { 0x0233, 0x0232, }, { 0x023C, 0x023B, }, + { 0x023F, 0x2C7E, }, + { 0x0240, 0x2C7F, }, { 0x0242, 0x0241, }, { 0x0247, 0x0246, }, { 0x0249, 0x0248, }, { 0x024B, 0x024A, }, { 0x024D, 0x024C, }, { 0x024F, 0x024E, }, + { 0x0250, 0x2C6F, }, + { 0x0251, 0x2C6D, }, + { 0x0252, 0x2C70, }, { 0x0253, 0x0181, }, { 0x0254, 0x0186, }, { 0x0256, 0x0189, }, { 0x0257, 0x018A, }, { 0x0259, 0x018F, }, { 0x025B, 0x0190, }, + { 0x025C, 0xA7AB, }, { 0x0260, 0x0193, }, + { 0x0261, 0xA7AC, }, { 0x0263, 0x0194, }, + { 0x0265, 0xA78D, }, + { 0x0266, 0xA7AA, }, { 0x0268, 0x0197, }, { 0x0269, 0x0196, }, + { 0x026A, 0xA7AE, }, { 0x026B, 0x2C62, }, + { 0x026C, 0xA7AD, }, { 0x026F, 0x019C, }, + { 0x0271, 0x2C6E, }, { 0x0272, 0x019D, }, { 0x0275, 0x019F, }, { 0x027D, 0x2C64, }, { 0x0280, 0x01A6, }, { 0x0283, 0x01A9, }, + { 0x0287, 0xA7B1, }, { 0x0288, 0x01AE, }, { 0x0289, 0x0244, }, { 0x028A, 0x01B1, }, { 0x028B, 0x01B2, }, { 0x028C, 0x0245, }, { 0x0292, 0x01B7, }, - { 0x03B9, 0x0399, }, + { 0x029D, 0xA7B2, }, + { 0x029E, 0xA7B0, }, + { 0x0345, 0x0399, }, + { 0x0371, 0x0370, }, + { 0x0373, 0x0372, }, + { 0x0377, 0x0376, }, { 0x037B, 0x03FD, }, { 0x037C, 0x03FE, }, { 0x037D, 0x03FF, }, @@ -392,25 +362,23 @@ struct UTF8_lower_upper { { 0x03B6, 0x0396, }, { 0x03B7, 0x0397, }, { 0x03B8, 0x0398, }, - { 0x03B8, 0x03F4, }, - { 0x0345, 0x0399, }, + { 0x03B9, 0x0399, }, { 0x03BA, 0x039A, }, { 0x03BB, 0x039B, }, - { 0x00B5, 0x039C, }, + { 0x03BC, 0x039C, }, { 0x03BD, 0x039D, }, { 0x03BE, 0x039E, }, { 0x03BF, 0x039F, }, { 0x03C0, 0x03A0, }, { 0x03C1, 0x03A1, }, + { 0x03C2, 0x03A3, }, { 0x03C3, 0x03A3, }, - { 0x03C2, 0x03A3, }, { 0x03C4, 0x03A4, }, { 0x03C5, 0x03A5, }, { 0x03C6, 0x03A6, }, { 0x03C7, 0x03A7, }, { 0x03C8, 0x03A8, }, { 0x03C9, 0x03A9, }, - { 0x03C9, 0x2126, }, { 0x03CA, 0x03AA, }, { 0x03CB, 0x03AB, }, { 0x03CC, 0x038C, }, @@ -420,6 +388,7 @@ struct UTF8_lower_upper { { 0x03D1, 0x0398, }, { 0x03D5, 0x03A6, }, { 0x03D6, 0x03A0, }, + { 0x03D7, 0x03CF, }, { 0x03D9, 0x03D8, }, { 0x03DB, 0x03DA, }, { 0x03DD, 0x03DC, }, @@ -435,6 +404,7 @@ struct UTF8_lower_upper { { 0x03F0, 0x039A, }, { 0x03F1, 0x03A1, }, { 0x03F2, 0x03F9, }, + { 0x03F3, 0x037F, }, { 0x03F5, 0x0395, }, { 0x03F8, 0x03F7, }, { 0x03FB, 0x03FA, }, @@ -572,6 +542,20 @@ struct UTF8_lower_upper { { 0x050F, 0x050E, }, { 0x0511, 0x0510, }, { 0x0513, 0x0512, }, + { 0x0515, 0x0514, }, + { 0x0517, 0x0516, }, + { 0x0519, 0x0518, }, + { 0x051B, 0x051A, }, + { 0x051D, 0x051C, }, + { 0x051F, 0x051E, }, + { 0x0521, 0x0520, }, + { 0x0523, 0x0522, }, + { 0x0525, 0x0524, }, + { 0x0527, 0x0526, }, + { 0x0529, 0x0528, }, + { 0x052B, 0x052A, }, + { 0x052D, 0x052C, }, + { 0x052F, 0x052E, }, { 0x0561, 0x0531, }, { 0x0562, 0x0532, }, { 0x0563, 0x0533, }, @@ -610,6 +594,22 @@ struct UTF8_lower_upper { { 0x0584, 0x0554, }, { 0x0585, 0x0555, }, { 0x0586, 0x0556, }, + { 0x13F8, 0x13F0, }, + { 0x13F9, 0x13F1, }, + { 0x13FA, 0x13F2, }, + { 0x13FB, 0x13F3, }, + { 0x13FC, 0x13F4, }, + { 0x13FD, 0x13F5, }, + { 0x1C80, 0x0412, }, + { 0x1C81, 0x0414, }, + { 0x1C82, 0x041E, }, + { 0x1C83, 0x0421, }, + { 0x1C84, 0x0422, }, + { 0x1C85, 0x0422, }, + { 0x1C86, 0x042A, }, + { 0x1C87, 0x0462, }, + { 0x1C88, 0xA64A, }, + { 0x1D79, 0xA77D, }, { 0x1D7D, 0x2C63, }, { 0x1E01, 0x1E00, }, { 0x1E03, 0x1E02, }, @@ -732,6 +732,9 @@ struct UTF8_lower_upper { { 0x1EF5, 0x1EF4, }, { 0x1EF7, 0x1EF6, }, { 0x1EF9, 0x1EF8, }, + { 0x1EFB, 0x1EFA, }, + { 0x1EFD, 0x1EFC, }, + { 0x1EFF, 0x1EFE, }, { 0x1F00, 0x1F08, }, { 0x1F01, 0x1F09, }, { 0x1F02, 0x1F0A, }, @@ -926,6 +929,7 @@ struct UTF8_lower_upper { { 0x2C68, 0x2C67, }, { 0x2C6A, 0x2C69, }, { 0x2C6C, 0x2C6B, }, + { 0x2C73, 0x2C72, }, { 0x2C76, 0x2C75, }, { 0x2C81, 0x2C80, }, { 0x2C83, 0x2C82, }, @@ -977,6 +981,9 @@ struct UTF8_lower_upper { { 0x2CDF, 0x2CDE, }, { 0x2CE1, 0x2CE0, }, { 0x2CE3, 0x2CE2, }, + { 0x2CEC, 0x2CEB, }, + { 0x2CEE, 0x2CED, }, + { 0x2CF3, 0x2CF2, }, { 0x2D00, 0x10A0, }, { 0x2D01, 0x10A1, }, { 0x2D02, 0x10A2, }, @@ -1015,6 +1022,186 @@ struct UTF8_lower_upper { { 0x2D23, 0x10C3, }, { 0x2D24, 0x10C4, }, { 0x2D25, 0x10C5, }, + { 0x2D27, 0x10C7, }, + { 0x2D2D, 0x10CD, }, + { 0xA641, 0xA640, }, + { 0xA643, 0xA642, }, + { 0xA645, 0xA644, }, + { 0xA647, 0xA646, }, + { 0xA649, 0xA648, }, + { 0xA64B, 0xA64A, }, + { 0xA64D, 0xA64C, }, + { 0xA64F, 0xA64E, }, + { 0xA651, 0xA650, }, + { 0xA653, 0xA652, }, + { 0xA655, 0xA654, }, + { 0xA657, 0xA656, }, + { 0xA659, 0xA658, }, + { 0xA65B, 0xA65A, }, + { 0xA65D, 0xA65C, }, + { 0xA65F, 0xA65E, }, + { 0xA661, 0xA660, }, + { 0xA663, 0xA662, }, + { 0xA665, 0xA664, }, + { 0xA667, 0xA666, }, + { 0xA669, 0xA668, }, + { 0xA66B, 0xA66A, }, + { 0xA66D, 0xA66C, }, + { 0xA681, 0xA680, }, + { 0xA683, 0xA682, }, + { 0xA685, 0xA684, }, + { 0xA687, 0xA686, }, + { 0xA689, 0xA688, }, + { 0xA68B, 0xA68A, }, + { 0xA68D, 0xA68C, }, + { 0xA68F, 0xA68E, }, + { 0xA691, 0xA690, }, + { 0xA693, 0xA692, }, + { 0xA695, 0xA694, }, + { 0xA697, 0xA696, }, + { 0xA699, 0xA698, }, + { 0xA69B, 0xA69A, }, + { 0xA723, 0xA722, }, + { 0xA725, 0xA724, }, + { 0xA727, 0xA726, }, + { 0xA729, 0xA728, }, + { 0xA72B, 0xA72A, }, + { 0xA72D, 0xA72C, }, + { 0xA72F, 0xA72E, }, + { 0xA733, 0xA732, }, + { 0xA735, 0xA734, }, + { 0xA737, 0xA736, }, + { 0xA739, 0xA738, }, + { 0xA73B, 0xA73A, }, + { 0xA73D, 0xA73C, }, + { 0xA73F, 0xA73E, }, + { 0xA741, 0xA740, }, + { 0xA743, 0xA742, }, + { 0xA745, 0xA744, }, + { 0xA747, 0xA746, }, + { 0xA749, 0xA748, }, + { 0xA74B, 0xA74A, }, + { 0xA74D, 0xA74C, }, + { 0xA74F, 0xA74E, }, + { 0xA751, 0xA750, }, + { 0xA753, 0xA752, }, + { 0xA755, 0xA754, }, + { 0xA757, 0xA756, }, + { 0xA759, 0xA758, }, + { 0xA75B, 0xA75A, }, + { 0xA75D, 0xA75C, }, + { 0xA75F, 0xA75E, }, + { 0xA761, 0xA760, }, + { 0xA763, 0xA762, }, + { 0xA765, 0xA764, }, + { 0xA767, 0xA766, }, + { 0xA769, 0xA768, }, + { 0xA76B, 0xA76A, }, + { 0xA76D, 0xA76C, }, + { 0xA76F, 0xA76E, }, + { 0xA77A, 0xA779, }, + { 0xA77C, 0xA77B, }, + { 0xA77F, 0xA77E, }, + { 0xA781, 0xA780, }, + { 0xA783, 0xA782, }, + { 0xA785, 0xA784, }, + { 0xA787, 0xA786, }, + { 0xA78C, 0xA78B, }, + { 0xA791, 0xA790, }, + { 0xA793, 0xA792, }, + { 0xA797, 0xA796, }, + { 0xA799, 0xA798, }, + { 0xA79B, 0xA79A, }, + { 0xA79D, 0xA79C, }, + { 0xA79F, 0xA79E, }, + { 0xA7A1, 0xA7A0, }, + { 0xA7A3, 0xA7A2, }, + { 0xA7A5, 0xA7A4, }, + { 0xA7A7, 0xA7A6, }, + { 0xA7A9, 0xA7A8, }, + { 0xA7B5, 0xA7B4, }, + { 0xA7B7, 0xA7B6, }, + { 0xAB53, 0xA7B3, }, + { 0xAB70, 0x13A0, }, + { 0xAB71, 0x13A1, }, + { 0xAB72, 0x13A2, }, + { 0xAB73, 0x13A3, }, + { 0xAB74, 0x13A4, }, + { 0xAB75, 0x13A5, }, + { 0xAB76, 0x13A6, }, + { 0xAB77, 0x13A7, }, + { 0xAB78, 0x13A8, }, + { 0xAB79, 0x13A9, }, + { 0xAB7A, 0x13AA, }, + { 0xAB7B, 0x13AB, }, + { 0xAB7C, 0x13AC, }, + { 0xAB7D, 0x13AD, }, + { 0xAB7E, 0x13AE, }, + { 0xAB7F, 0x13AF, }, + { 0xAB80, 0x13B0, }, + { 0xAB81, 0x13B1, }, + { 0xAB82, 0x13B2, }, + { 0xAB83, 0x13B3, }, + { 0xAB84, 0x13B4, }, + { 0xAB85, 0x13B5, }, + { 0xAB86, 0x13B6, }, + { 0xAB87, 0x13B7, }, + { 0xAB88, 0x13B8, }, + { 0xAB89, 0x13B9, }, + { 0xAB8A, 0x13BA, }, + { 0xAB8B, 0x13BB, }, + { 0xAB8C, 0x13BC, }, + { 0xAB8D, 0x13BD, }, + { 0xAB8E, 0x13BE, }, + { 0xAB8F, 0x13BF, }, + { 0xAB90, 0x13C0, }, + { 0xAB91, 0x13C1, }, + { 0xAB92, 0x13C2, }, + { 0xAB93, 0x13C3, }, + { 0xAB94, 0x13C4, }, + { 0xAB95, 0x13C5, }, + { 0xAB96, 0x13C6, }, + { 0xAB97, 0x13C7, }, + { 0xAB98, 0x13C8, }, + { 0xAB99, 0x13C9, }, + { 0xAB9A, 0x13CA, }, + { 0xAB9B, 0x13CB, }, + { 0xAB9C, 0x13CC, }, + { 0xAB9D, 0x13CD, }, + { 0xAB9E, 0x13CE, }, + { 0xAB9F, 0x13CF, }, + { 0xABA0, 0x13D0, }, + { 0xABA1, 0x13D1, }, + { 0xABA2, 0x13D2, }, + { 0xABA3, 0x13D3, }, + { 0xABA4, 0x13D4, }, + { 0xABA5, 0x13D5, }, + { 0xABA6, 0x13D6, }, + { 0xABA7, 0x13D7, }, + { 0xABA8, 0x13D8, }, + { 0xABA9, 0x13D9, }, + { 0xABAA, 0x13DA, }, + { 0xABAB, 0x13DB, }, + { 0xABAC, 0x13DC, }, + { 0xABAD, 0x13DD, }, + { 0xABAE, 0x13DE, }, + { 0xABAF, 0x13DF, }, + { 0xABB0, 0x13E0, }, + { 0xABB1, 0x13E1, }, + { 0xABB2, 0x13E2, }, + { 0xABB3, 0x13E3, }, + { 0xABB4, 0x13E4, }, + { 0xABB5, 0x13E5, }, + { 0xABB6, 0x13E6, }, + { 0xABB7, 0x13E7, }, + { 0xABB8, 0x13E8, }, + { 0xABB9, 0x13E9, }, + { 0xABBA, 0x13EA, }, + { 0xABBB, 0x13EB, }, + { 0xABBC, 0x13EC, }, + { 0xABBD, 0x13ED, }, + { 0xABBE, 0x13EE, }, + { 0xABBF, 0x13EF, }, { 0xFF41, 0xFF21, }, { 0xFF42, 0xFF22, }, { 0xFF43, 0xFF23, }, @@ -1081,42 +1268,1547 @@ struct UTF8_lower_upper { { 0x1044D, 0x10425, }, { 0x1044E, 0x10426, }, { 0x1044F, 0x10427, }, + { 0x104D8, 0x104B0, }, + { 0x104D9, 0x104B1, }, + { 0x104DA, 0x104B2, }, + { 0x104DB, 0x104B3, }, + { 0x104DC, 0x104B4, }, + { 0x104DD, 0x104B5, }, + { 0x104DE, 0x104B6, }, + { 0x104DF, 0x104B7, }, + { 0x104E0, 0x104B8, }, + { 0x104E1, 0x104B9, }, + { 0x104E2, 0x104BA, }, + { 0x104E3, 0x104BB, }, + { 0x104E4, 0x104BC, }, + { 0x104E5, 0x104BD, }, + { 0x104E6, 0x104BE, }, + { 0x104E7, 0x104BF, }, + { 0x104E8, 0x104C0, }, + { 0x104E9, 0x104C1, }, + { 0x104EA, 0x104C2, }, + { 0x104EB, 0x104C3, }, + { 0x104EC, 0x104C4, }, + { 0x104ED, 0x104C5, }, + { 0x104EE, 0x104C6, }, + { 0x104EF, 0x104C7, }, + { 0x104F0, 0x104C8, }, + { 0x104F1, 0x104C9, }, + { 0x104F2, 0x104CA, }, + { 0x104F3, 0x104CB, }, + { 0x104F4, 0x104CC, }, + { 0x104F5, 0x104CD, }, + { 0x104F6, 0x104CE, }, + { 0x104F7, 0x104CF, }, + { 0x104F8, 0x104D0, }, + { 0x104F9, 0x104D1, }, + { 0x104FA, 0x104D2, }, + { 0x104FB, 0x104D3, }, + { 0x10CC0, 0x10C80, }, + { 0x10CC1, 0x10C81, }, + { 0x10CC2, 0x10C82, }, + { 0x10CC3, 0x10C83, }, + { 0x10CC4, 0x10C84, }, + { 0x10CC5, 0x10C85, }, + { 0x10CC6, 0x10C86, }, + { 0x10CC7, 0x10C87, }, + { 0x10CC8, 0x10C88, }, + { 0x10CC9, 0x10C89, }, + { 0x10CCA, 0x10C8A, }, + { 0x10CCB, 0x10C8B, }, + { 0x10CCC, 0x10C8C, }, + { 0x10CCD, 0x10C8D, }, + { 0x10CCE, 0x10C8E, }, + { 0x10CCF, 0x10C8F, }, + { 0x10CD0, 0x10C90, }, + { 0x10CD1, 0x10C91, }, + { 0x10CD2, 0x10C92, }, + { 0x10CD3, 0x10C93, }, + { 0x10CD4, 0x10C94, }, + { 0x10CD5, 0x10C95, }, + { 0x10CD6, 0x10C96, }, + { 0x10CD7, 0x10C97, }, + { 0x10CD8, 0x10C98, }, + { 0x10CD9, 0x10C99, }, + { 0x10CDA, 0x10C9A, }, + { 0x10CDB, 0x10C9B, }, + { 0x10CDC, 0x10C9C, }, + { 0x10CDD, 0x10C9D, }, + { 0x10CDE, 0x10C9E, }, + { 0x10CDF, 0x10C9F, }, + { 0x10CE0, 0x10CA0, }, + { 0x10CE1, 0x10CA1, }, + { 0x10CE2, 0x10CA2, }, + { 0x10CE3, 0x10CA3, }, + { 0x10CE4, 0x10CA4, }, + { 0x10CE5, 0x10CA5, }, + { 0x10CE6, 0x10CA6, }, + { 0x10CE7, 0x10CA7, }, + { 0x10CE8, 0x10CA8, }, + { 0x10CE9, 0x10CA9, }, + { 0x10CEA, 0x10CAA, }, + { 0x10CEB, 0x10CAB, }, + { 0x10CEC, 0x10CAC, }, + { 0x10CED, 0x10CAD, }, + { 0x10CEE, 0x10CAE, }, + { 0x10CEF, 0x10CAF, }, + { 0x10CF0, 0x10CB0, }, + { 0x10CF1, 0x10CB1, }, + { 0x10CF2, 0x10CB2, }, + { 0x118C0, 0x118A0, }, + { 0x118C1, 0x118A1, }, + { 0x118C2, 0x118A2, }, + { 0x118C3, 0x118A3, }, + { 0x118C4, 0x118A4, }, + { 0x118C5, 0x118A5, }, + { 0x118C6, 0x118A6, }, + { 0x118C7, 0x118A7, }, + { 0x118C8, 0x118A8, }, + { 0x118C9, 0x118A9, }, + { 0x118CA, 0x118AA, }, + { 0x118CB, 0x118AB, }, + { 0x118CC, 0x118AC, }, + { 0x118CD, 0x118AD, }, + { 0x118CE, 0x118AE, }, + { 0x118CF, 0x118AF, }, + { 0x118D0, 0x118B0, }, + { 0x118D1, 0x118B1, }, + { 0x118D2, 0x118B2, }, + { 0x118D3, 0x118B3, }, + { 0x118D4, 0x118B4, }, + { 0x118D5, 0x118B5, }, + { 0x118D6, 0x118B6, }, + { 0x118D7, 0x118B7, }, + { 0x118D8, 0x118B8, }, + { 0x118D9, 0x118B9, }, + { 0x118DA, 0x118BA, }, + { 0x118DB, 0x118BB, }, + { 0x118DC, 0x118BC, }, + { 0x118DD, 0x118BD, }, + { 0x118DE, 0x118BE, }, + { 0x118DF, 0x118BF, }, + { 0x1E922, 0x1E900, }, + { 0x1E923, 0x1E901, }, + { 0x1E924, 0x1E902, }, + { 0x1E925, 0x1E903, }, + { 0x1E926, 0x1E904, }, + { 0x1E927, 0x1E905, }, + { 0x1E928, 0x1E906, }, + { 0x1E929, 0x1E907, }, + { 0x1E92A, 0x1E908, }, + { 0x1E92B, 0x1E909, }, + { 0x1E92C, 0x1E90A, }, + { 0x1E92D, 0x1E90B, }, + { 0x1E92E, 0x1E90C, }, + { 0x1E92F, 0x1E90D, }, + { 0x1E930, 0x1E90E, }, + { 0x1E931, 0x1E90F, }, + { 0x1E932, 0x1E910, }, + { 0x1E933, 0x1E911, }, + { 0x1E934, 0x1E912, }, + { 0x1E935, 0x1E913, }, + { 0x1E936, 0x1E914, }, + { 0x1E937, 0x1E915, }, + { 0x1E938, 0x1E916, }, + { 0x1E939, 0x1E917, }, + { 0x1E93A, 0x1E918, }, + { 0x1E93B, 0x1E919, }, + { 0x1E93C, 0x1E91A, }, + { 0x1E93D, 0x1E91B, }, + { 0x1E93E, 0x1E91C, }, + { 0x1E93F, 0x1E91D, }, + { 0x1E940, 0x1E91E, }, + { 0x1E941, 0x1E91F, }, + { 0x1E942, 0x1E920, }, + { 0x1E943, 0x1E921, }, +}, UTF8_toLower[] = { /* code points with non-null lowercase conversion */ + { 0x0041, 0x0061, }, + { 0x0042, 0x0062, }, + { 0x0043, 0x0063, }, + { 0x0044, 0x0064, }, + { 0x0045, 0x0065, }, + { 0x0046, 0x0066, }, + { 0x0047, 0x0067, }, + { 0x0048, 0x0068, }, + { 0x0049, 0x0069, }, + { 0x004A, 0x006A, }, + { 0x004B, 0x006B, }, + { 0x004C, 0x006C, }, + { 0x004D, 0x006D, }, + { 0x004E, 0x006E, }, + { 0x004F, 0x006F, }, + { 0x0050, 0x0070, }, + { 0x0051, 0x0071, }, + { 0x0052, 0x0072, }, + { 0x0053, 0x0073, }, + { 0x0054, 0x0074, }, + { 0x0055, 0x0075, }, + { 0x0056, 0x0076, }, + { 0x0057, 0x0077, }, + { 0x0058, 0x0078, }, + { 0x0059, 0x0079, }, + { 0x005A, 0x007A, }, + { 0x00C0, 0x00E0, }, + { 0x00C1, 0x00E1, }, + { 0x00C2, 0x00E2, }, + { 0x00C3, 0x00E3, }, + { 0x00C4, 0x00E4, }, + { 0x00C5, 0x00E5, }, + { 0x00C6, 0x00E6, }, + { 0x00C7, 0x00E7, }, + { 0x00C8, 0x00E8, }, + { 0x00C9, 0x00E9, }, + { 0x00CA, 0x00EA, }, + { 0x00CB, 0x00EB, }, + { 0x00CC, 0x00EC, }, + { 0x00CD, 0x00ED, }, + { 0x00CE, 0x00EE, }, + { 0x00CF, 0x00EF, }, + { 0x00D0, 0x00F0, }, + { 0x00D1, 0x00F1, }, + { 0x00D2, 0x00F2, }, + { 0x00D3, 0x00F3, }, + { 0x00D4, 0x00F4, }, + { 0x00D5, 0x00F5, }, + { 0x00D6, 0x00F6, }, + { 0x00D8, 0x00F8, }, + { 0x00D9, 0x00F9, }, + { 0x00DA, 0x00FA, }, + { 0x00DB, 0x00FB, }, + { 0x00DC, 0x00FC, }, + { 0x00DD, 0x00FD, }, + { 0x00DE, 0x00FE, }, + { 0x0100, 0x0101, }, + { 0x0102, 0x0103, }, + { 0x0104, 0x0105, }, + { 0x0106, 0x0107, }, + { 0x0108, 0x0109, }, + { 0x010A, 0x010B, }, + { 0x010C, 0x010D, }, + { 0x010E, 0x010F, }, + { 0x0110, 0x0111, }, + { 0x0112, 0x0113, }, + { 0x0114, 0x0115, }, + { 0x0116, 0x0117, }, + { 0x0118, 0x0119, }, + { 0x011A, 0x011B, }, + { 0x011C, 0x011D, }, + { 0x011E, 0x011F, }, + { 0x0120, 0x0121, }, + { 0x0122, 0x0123, }, + { 0x0124, 0x0125, }, + { 0x0126, 0x0127, }, + { 0x0128, 0x0129, }, + { 0x012A, 0x012B, }, + { 0x012C, 0x012D, }, + { 0x012E, 0x012F, }, + { 0x0130, 0x0069, }, + { 0x0132, 0x0133, }, + { 0x0134, 0x0135, }, + { 0x0136, 0x0137, }, + { 0x0139, 0x013A, }, + { 0x013B, 0x013C, }, + { 0x013D, 0x013E, }, + { 0x013F, 0x0140, }, + { 0x0141, 0x0142, }, + { 0x0143, 0x0144, }, + { 0x0145, 0x0146, }, + { 0x0147, 0x0148, }, + { 0x014A, 0x014B, }, + { 0x014C, 0x014D, }, + { 0x014E, 0x014F, }, + { 0x0150, 0x0151, }, + { 0x0152, 0x0153, }, + { 0x0154, 0x0155, }, + { 0x0156, 0x0157, }, + { 0x0158, 0x0159, }, + { 0x015A, 0x015B, }, + { 0x015C, 0x015D, }, + { 0x015E, 0x015F, }, + { 0x0160, 0x0161, }, + { 0x0162, 0x0163, }, + { 0x0164, 0x0165, }, + { 0x0166, 0x0167, }, + { 0x0168, 0x0169, }, + { 0x016A, 0x016B, }, + { 0x016C, 0x016D, }, + { 0x016E, 0x016F, }, + { 0x0170, 0x0171, }, + { 0x0172, 0x0173, }, + { 0x0174, 0x0175, }, + { 0x0176, 0x0177, }, + { 0x0178, 0x00FF, }, + { 0x0179, 0x017A, }, + { 0x017B, 0x017C, }, + { 0x017D, 0x017E, }, + { 0x0181, 0x0253, }, + { 0x0182, 0x0183, }, + { 0x0184, 0x0185, }, + { 0x0186, 0x0254, }, + { 0x0187, 0x0188, }, + { 0x0189, 0x0256, }, + { 0x018A, 0x0257, }, + { 0x018B, 0x018C, }, + { 0x018E, 0x01DD, }, + { 0x018F, 0x0259, }, + { 0x0190, 0x025B, }, + { 0x0191, 0x0192, }, + { 0x0193, 0x0260, }, + { 0x0194, 0x0263, }, + { 0x0196, 0x0269, }, + { 0x0197, 0x0268, }, + { 0x0198, 0x0199, }, + { 0x019C, 0x026F, }, + { 0x019D, 0x0272, }, + { 0x019F, 0x0275, }, + { 0x01A0, 0x01A1, }, + { 0x01A2, 0x01A3, }, + { 0x01A4, 0x01A5, }, + { 0x01A6, 0x0280, }, + { 0x01A7, 0x01A8, }, + { 0x01A9, 0x0283, }, + { 0x01AC, 0x01AD, }, + { 0x01AE, 0x0288, }, + { 0x01AF, 0x01B0, }, + { 0x01B1, 0x028A, }, + { 0x01B2, 0x028B, }, + { 0x01B3, 0x01B4, }, + { 0x01B5, 0x01B6, }, + { 0x01B7, 0x0292, }, + { 0x01B8, 0x01B9, }, + { 0x01BC, 0x01BD, }, + { 0x01C4, 0x01C6, }, + { 0x01C5, 0x01C6, }, + { 0x01C7, 0x01C9, }, + { 0x01C8, 0x01C9, }, + { 0x01CA, 0x01CC, }, + { 0x01CB, 0x01CC, }, + { 0x01CD, 0x01CE, }, + { 0x01CF, 0x01D0, }, + { 0x01D1, 0x01D2, }, + { 0x01D3, 0x01D4, }, + { 0x01D5, 0x01D6, }, + { 0x01D7, 0x01D8, }, + { 0x01D9, 0x01DA, }, + { 0x01DB, 0x01DC, }, + { 0x01DE, 0x01DF, }, + { 0x01E0, 0x01E1, }, + { 0x01E2, 0x01E3, }, + { 0x01E4, 0x01E5, }, + { 0x01E6, 0x01E7, }, + { 0x01E8, 0x01E9, }, + { 0x01EA, 0x01EB, }, + { 0x01EC, 0x01ED, }, + { 0x01EE, 0x01EF, }, + { 0x01F1, 0x01F3, }, + { 0x01F2, 0x01F3, }, + { 0x01F4, 0x01F5, }, + { 0x01F6, 0x0195, }, + { 0x01F7, 0x01BF, }, + { 0x01F8, 0x01F9, }, + { 0x01FA, 0x01FB, }, + { 0x01FC, 0x01FD, }, + { 0x01FE, 0x01FF, }, + { 0x0200, 0x0201, }, + { 0x0202, 0x0203, }, + { 0x0204, 0x0205, }, + { 0x0206, 0x0207, }, + { 0x0208, 0x0209, }, + { 0x020A, 0x020B, }, + { 0x020C, 0x020D, }, + { 0x020E, 0x020F, }, + { 0x0210, 0x0211, }, + { 0x0212, 0x0213, }, + { 0x0214, 0x0215, }, + { 0x0216, 0x0217, }, + { 0x0218, 0x0219, }, + { 0x021A, 0x021B, }, + { 0x021C, 0x021D, }, + { 0x021E, 0x021F, }, + { 0x0220, 0x019E, }, + { 0x0222, 0x0223, }, + { 0x0224, 0x0225, }, + { 0x0226, 0x0227, }, + { 0x0228, 0x0229, }, + { 0x022A, 0x022B, }, + { 0x022C, 0x022D, }, + { 0x022E, 0x022F, }, + { 0x0230, 0x0231, }, + { 0x0232, 0x0233, }, + { 0x023A, 0x2C65, }, + { 0x023B, 0x023C, }, + { 0x023D, 0x019A, }, + { 0x023E, 0x2C66, }, + { 0x0241, 0x0242, }, + { 0x0243, 0x0180, }, + { 0x0244, 0x0289, }, + { 0x0245, 0x028C, }, + { 0x0246, 0x0247, }, + { 0x0248, 0x0249, }, + { 0x024A, 0x024B, }, + { 0x024C, 0x024D, }, + { 0x024E, 0x024F, }, + { 0x0370, 0x0371, }, + { 0x0372, 0x0373, }, + { 0x0376, 0x0377, }, + { 0x037F, 0x03F3, }, + { 0x0386, 0x03AC, }, + { 0x0388, 0x03AD, }, + { 0x0389, 0x03AE, }, + { 0x038A, 0x03AF, }, + { 0x038C, 0x03CC, }, + { 0x038E, 0x03CD, }, + { 0x038F, 0x03CE, }, + { 0x0391, 0x03B1, }, + { 0x0392, 0x03B2, }, + { 0x0393, 0x03B3, }, + { 0x0394, 0x03B4, }, + { 0x0395, 0x03B5, }, + { 0x0396, 0x03B6, }, + { 0x0397, 0x03B7, }, + { 0x0398, 0x03B8, }, + { 0x0399, 0x03B9, }, + { 0x039A, 0x03BA, }, + { 0x039B, 0x03BB, }, + { 0x039C, 0x03BC, }, + { 0x039D, 0x03BD, }, + { 0x039E, 0x03BE, }, + { 0x039F, 0x03BF, }, + { 0x03A0, 0x03C0, }, + { 0x03A1, 0x03C1, }, + { 0x03A3, 0x03C3, }, + { 0x03A4, 0x03C4, }, + { 0x03A5, 0x03C5, }, + { 0x03A6, 0x03C6, }, + { 0x03A7, 0x03C7, }, + { 0x03A8, 0x03C8, }, + { 0x03A9, 0x03C9, }, + { 0x03AA, 0x03CA, }, + { 0x03AB, 0x03CB, }, + { 0x03CF, 0x03D7, }, + { 0x03D8, 0x03D9, }, + { 0x03DA, 0x03DB, }, + { 0x03DC, 0x03DD, }, + { 0x03DE, 0x03DF, }, + { 0x03E0, 0x03E1, }, + { 0x03E2, 0x03E3, }, + { 0x03E4, 0x03E5, }, + { 0x03E6, 0x03E7, }, + { 0x03E8, 0x03E9, }, + { 0x03EA, 0x03EB, }, + { 0x03EC, 0x03ED, }, + { 0x03EE, 0x03EF, }, + { 0x03F4, 0x03B8, }, + { 0x03F7, 0x03F8, }, + { 0x03F9, 0x03F2, }, + { 0x03FA, 0x03FB, }, + { 0x03FD, 0x037B, }, + { 0x03FE, 0x037C, }, + { 0x03FF, 0x037D, }, + { 0x0400, 0x0450, }, + { 0x0401, 0x0451, }, + { 0x0402, 0x0452, }, + { 0x0403, 0x0453, }, + { 0x0404, 0x0454, }, + { 0x0405, 0x0455, }, + { 0x0406, 0x0456, }, + { 0x0407, 0x0457, }, + { 0x0408, 0x0458, }, + { 0x0409, 0x0459, }, + { 0x040A, 0x045A, }, + { 0x040B, 0x045B, }, + { 0x040C, 0x045C, }, + { 0x040D, 0x045D, }, + { 0x040E, 0x045E, }, + { 0x040F, 0x045F, }, + { 0x0410, 0x0430, }, + { 0x0411, 0x0431, }, + { 0x0412, 0x0432, }, + { 0x0413, 0x0433, }, + { 0x0414, 0x0434, }, + { 0x0415, 0x0435, }, + { 0x0416, 0x0436, }, + { 0x0417, 0x0437, }, + { 0x0418, 0x0438, }, + { 0x0419, 0x0439, }, + { 0x041A, 0x043A, }, + { 0x041B, 0x043B, }, + { 0x041C, 0x043C, }, + { 0x041D, 0x043D, }, + { 0x041E, 0x043E, }, + { 0x041F, 0x043F, }, + { 0x0420, 0x0440, }, + { 0x0421, 0x0441, }, + { 0x0422, 0x0442, }, + { 0x0423, 0x0443, }, + { 0x0424, 0x0444, }, + { 0x0425, 0x0445, }, + { 0x0426, 0x0446, }, + { 0x0427, 0x0447, }, + { 0x0428, 0x0448, }, + { 0x0429, 0x0449, }, + { 0x042A, 0x044A, }, + { 0x042B, 0x044B, }, + { 0x042C, 0x044C, }, + { 0x042D, 0x044D, }, + { 0x042E, 0x044E, }, + { 0x042F, 0x044F, }, + { 0x0460, 0x0461, }, + { 0x0462, 0x0463, }, + { 0x0464, 0x0465, }, + { 0x0466, 0x0467, }, + { 0x0468, 0x0469, }, + { 0x046A, 0x046B, }, + { 0x046C, 0x046D, }, + { 0x046E, 0x046F, }, + { 0x0470, 0x0471, }, + { 0x0472, 0x0473, }, + { 0x0474, 0x0475, }, + { 0x0476, 0x0477, }, + { 0x0478, 0x0479, }, + { 0x047A, 0x047B, }, + { 0x047C, 0x047D, }, + { 0x047E, 0x047F, }, + { 0x0480, 0x0481, }, + { 0x048A, 0x048B, }, + { 0x048C, 0x048D, }, + { 0x048E, 0x048F, }, + { 0x0490, 0x0491, }, + { 0x0492, 0x0493, }, + { 0x0494, 0x0495, }, + { 0x0496, 0x0497, }, + { 0x0498, 0x0499, }, + { 0x049A, 0x049B, }, + { 0x049C, 0x049D, }, + { 0x049E, 0x049F, }, + { 0x04A0, 0x04A1, }, + { 0x04A2, 0x04A3, }, + { 0x04A4, 0x04A5, }, + { 0x04A6, 0x04A7, }, + { 0x04A8, 0x04A9, }, + { 0x04AA, 0x04AB, }, + { 0x04AC, 0x04AD, }, + { 0x04AE, 0x04AF, }, + { 0x04B0, 0x04B1, }, + { 0x04B2, 0x04B3, }, + { 0x04B4, 0x04B5, }, + { 0x04B6, 0x04B7, }, + { 0x04B8, 0x04B9, }, + { 0x04BA, 0x04BB, }, + { 0x04BC, 0x04BD, }, + { 0x04BE, 0x04BF, }, + { 0x04C0, 0x04CF, }, + { 0x04C1, 0x04C2, }, + { 0x04C3, 0x04C4, }, + { 0x04C5, 0x04C6, }, + { 0x04C7, 0x04C8, }, + { 0x04C9, 0x04CA, }, + { 0x04CB, 0x04CC, }, + { 0x04CD, 0x04CE, }, + { 0x04D0, 0x04D1, }, + { 0x04D2, 0x04D3, }, + { 0x04D4, 0x04D5, }, + { 0x04D6, 0x04D7, }, + { 0x04D8, 0x04D9, }, + { 0x04DA, 0x04DB, }, + { 0x04DC, 0x04DD, }, + { 0x04DE, 0x04DF, }, + { 0x04E0, 0x04E1, }, + { 0x04E2, 0x04E3, }, + { 0x04E4, 0x04E5, }, + { 0x04E6, 0x04E7, }, + { 0x04E8, 0x04E9, }, + { 0x04EA, 0x04EB, }, + { 0x04EC, 0x04ED, }, + { 0x04EE, 0x04EF, }, + { 0x04F0, 0x04F1, }, + { 0x04F2, 0x04F3, }, + { 0x04F4, 0x04F5, }, + { 0x04F6, 0x04F7, }, + { 0x04F8, 0x04F9, }, + { 0x04FA, 0x04FB, }, + { 0x04FC, 0x04FD, }, + { 0x04FE, 0x04FF, }, + { 0x0500, 0x0501, }, + { 0x0502, 0x0503, }, + { 0x0504, 0x0505, }, + { 0x0506, 0x0507, }, + { 0x0508, 0x0509, }, + { 0x050A, 0x050B, }, + { 0x050C, 0x050D, }, + { 0x050E, 0x050F, }, + { 0x0510, 0x0511, }, + { 0x0512, 0x0513, }, + { 0x0514, 0x0515, }, + { 0x0516, 0x0517, }, + { 0x0518, 0x0519, }, + { 0x051A, 0x051B, }, + { 0x051C, 0x051D, }, + { 0x051E, 0x051F, }, + { 0x0520, 0x0521, }, + { 0x0522, 0x0523, }, + { 0x0524, 0x0525, }, + { 0x0526, 0x0527, }, + { 0x0528, 0x0529, }, + { 0x052A, 0x052B, }, + { 0x052C, 0x052D, }, + { 0x052E, 0x052F, }, + { 0x0531, 0x0561, }, + { 0x0532, 0x0562, }, + { 0x0533, 0x0563, }, + { 0x0534, 0x0564, }, + { 0x0535, 0x0565, }, + { 0x0536, 0x0566, }, + { 0x0537, 0x0567, }, + { 0x0538, 0x0568, }, + { 0x0539, 0x0569, }, + { 0x053A, 0x056A, }, + { 0x053B, 0x056B, }, + { 0x053C, 0x056C, }, + { 0x053D, 0x056D, }, + { 0x053E, 0x056E, }, + { 0x053F, 0x056F, }, + { 0x0540, 0x0570, }, + { 0x0541, 0x0571, }, + { 0x0542, 0x0572, }, + { 0x0543, 0x0573, }, + { 0x0544, 0x0574, }, + { 0x0545, 0x0575, }, + { 0x0546, 0x0576, }, + { 0x0547, 0x0577, }, + { 0x0548, 0x0578, }, + { 0x0549, 0x0579, }, + { 0x054A, 0x057A, }, + { 0x054B, 0x057B, }, + { 0x054C, 0x057C, }, + { 0x054D, 0x057D, }, + { 0x054E, 0x057E, }, + { 0x054F, 0x057F, }, + { 0x0550, 0x0580, }, + { 0x0551, 0x0581, }, + { 0x0552, 0x0582, }, + { 0x0553, 0x0583, }, + { 0x0554, 0x0584, }, + { 0x0555, 0x0585, }, + { 0x0556, 0x0586, }, + { 0x10A0, 0x2D00, }, + { 0x10A1, 0x2D01, }, + { 0x10A2, 0x2D02, }, + { 0x10A3, 0x2D03, }, + { 0x10A4, 0x2D04, }, + { 0x10A5, 0x2D05, }, + { 0x10A6, 0x2D06, }, + { 0x10A7, 0x2D07, }, + { 0x10A8, 0x2D08, }, + { 0x10A9, 0x2D09, }, + { 0x10AA, 0x2D0A, }, + { 0x10AB, 0x2D0B, }, + { 0x10AC, 0x2D0C, }, + { 0x10AD, 0x2D0D, }, + { 0x10AE, 0x2D0E, }, + { 0x10AF, 0x2D0F, }, + { 0x10B0, 0x2D10, }, + { 0x10B1, 0x2D11, }, + { 0x10B2, 0x2D12, }, + { 0x10B3, 0x2D13, }, + { 0x10B4, 0x2D14, }, + { 0x10B5, 0x2D15, }, + { 0x10B6, 0x2D16, }, + { 0x10B7, 0x2D17, }, + { 0x10B8, 0x2D18, }, + { 0x10B9, 0x2D19, }, + { 0x10BA, 0x2D1A, }, + { 0x10BB, 0x2D1B, }, + { 0x10BC, 0x2D1C, }, + { 0x10BD, 0x2D1D, }, + { 0x10BE, 0x2D1E, }, + { 0x10BF, 0x2D1F, }, + { 0x10C0, 0x2D20, }, + { 0x10C1, 0x2D21, }, + { 0x10C2, 0x2D22, }, + { 0x10C3, 0x2D23, }, + { 0x10C4, 0x2D24, }, + { 0x10C5, 0x2D25, }, + { 0x10C7, 0x2D27, }, + { 0x10CD, 0x2D2D, }, + { 0x13A0, 0xAB70, }, + { 0x13A1, 0xAB71, }, + { 0x13A2, 0xAB72, }, + { 0x13A3, 0xAB73, }, + { 0x13A4, 0xAB74, }, + { 0x13A5, 0xAB75, }, + { 0x13A6, 0xAB76, }, + { 0x13A7, 0xAB77, }, + { 0x13A8, 0xAB78, }, + { 0x13A9, 0xAB79, }, + { 0x13AA, 0xAB7A, }, + { 0x13AB, 0xAB7B, }, + { 0x13AC, 0xAB7C, }, + { 0x13AD, 0xAB7D, }, + { 0x13AE, 0xAB7E, }, + { 0x13AF, 0xAB7F, }, + { 0x13B0, 0xAB80, }, + { 0x13B1, 0xAB81, }, + { 0x13B2, 0xAB82, }, + { 0x13B3, 0xAB83, }, + { 0x13B4, 0xAB84, }, + { 0x13B5, 0xAB85, }, + { 0x13B6, 0xAB86, }, + { 0x13B7, 0xAB87, }, + { 0x13B8, 0xAB88, }, + { 0x13B9, 0xAB89, }, + { 0x13BA, 0xAB8A, }, + { 0x13BB, 0xAB8B, }, + { 0x13BC, 0xAB8C, }, + { 0x13BD, 0xAB8D, }, + { 0x13BE, 0xAB8E, }, + { 0x13BF, 0xAB8F, }, + { 0x13C0, 0xAB90, }, + { 0x13C1, 0xAB91, }, + { 0x13C2, 0xAB92, }, + { 0x13C3, 0xAB93, }, + { 0x13C4, 0xAB94, }, + { 0x13C5, 0xAB95, }, + { 0x13C6, 0xAB96, }, + { 0x13C7, 0xAB97, }, + { 0x13C8, 0xAB98, }, + { 0x13C9, 0xAB99, }, + { 0x13CA, 0xAB9A, }, + { 0x13CB, 0xAB9B, }, + { 0x13CC, 0xAB9C, }, + { 0x13CD, 0xAB9D, }, + { 0x13CE, 0xAB9E, }, + { 0x13CF, 0xAB9F, }, + { 0x13D0, 0xABA0, }, + { 0x13D1, 0xABA1, }, + { 0x13D2, 0xABA2, }, + { 0x13D3, 0xABA3, }, + { 0x13D4, 0xABA4, }, + { 0x13D5, 0xABA5, }, + { 0x13D6, 0xABA6, }, + { 0x13D7, 0xABA7, }, + { 0x13D8, 0xABA8, }, + { 0x13D9, 0xABA9, }, + { 0x13DA, 0xABAA, }, + { 0x13DB, 0xABAB, }, + { 0x13DC, 0xABAC, }, + { 0x13DD, 0xABAD, }, + { 0x13DE, 0xABAE, }, + { 0x13DF, 0xABAF, }, + { 0x13E0, 0xABB0, }, + { 0x13E1, 0xABB1, }, + { 0x13E2, 0xABB2, }, + { 0x13E3, 0xABB3, }, + { 0x13E4, 0xABB4, }, + { 0x13E5, 0xABB5, }, + { 0x13E6, 0xABB6, }, + { 0x13E7, 0xABB7, }, + { 0x13E8, 0xABB8, }, + { 0x13E9, 0xABB9, }, + { 0x13EA, 0xABBA, }, + { 0x13EB, 0xABBB, }, + { 0x13EC, 0xABBC, }, + { 0x13ED, 0xABBD, }, + { 0x13EE, 0xABBE, }, + { 0x13EF, 0xABBF, }, + { 0x13F0, 0x13F8, }, + { 0x13F1, 0x13F9, }, + { 0x13F2, 0x13FA, }, + { 0x13F3, 0x13FB, }, + { 0x13F4, 0x13FC, }, + { 0x13F5, 0x13FD, }, + { 0x1E00, 0x1E01, }, + { 0x1E02, 0x1E03, }, + { 0x1E04, 0x1E05, }, + { 0x1E06, 0x1E07, }, + { 0x1E08, 0x1E09, }, + { 0x1E0A, 0x1E0B, }, + { 0x1E0C, 0x1E0D, }, + { 0x1E0E, 0x1E0F, }, + { 0x1E10, 0x1E11, }, + { 0x1E12, 0x1E13, }, + { 0x1E14, 0x1E15, }, + { 0x1E16, 0x1E17, }, + { 0x1E18, 0x1E19, }, + { 0x1E1A, 0x1E1B, }, + { 0x1E1C, 0x1E1D, }, + { 0x1E1E, 0x1E1F, }, + { 0x1E20, 0x1E21, }, + { 0x1E22, 0x1E23, }, + { 0x1E24, 0x1E25, }, + { 0x1E26, 0x1E27, }, + { 0x1E28, 0x1E29, }, + { 0x1E2A, 0x1E2B, }, + { 0x1E2C, 0x1E2D, }, + { 0x1E2E, 0x1E2F, }, + { 0x1E30, 0x1E31, }, + { 0x1E32, 0x1E33, }, + { 0x1E34, 0x1E35, }, + { 0x1E36, 0x1E37, }, + { 0x1E38, 0x1E39, }, + { 0x1E3A, 0x1E3B, }, + { 0x1E3C, 0x1E3D, }, + { 0x1E3E, 0x1E3F, }, + { 0x1E40, 0x1E41, }, + { 0x1E42, 0x1E43, }, + { 0x1E44, 0x1E45, }, + { 0x1E46, 0x1E47, }, + { 0x1E48, 0x1E49, }, + { 0x1E4A, 0x1E4B, }, + { 0x1E4C, 0x1E4D, }, + { 0x1E4E, 0x1E4F, }, + { 0x1E50, 0x1E51, }, + { 0x1E52, 0x1E53, }, + { 0x1E54, 0x1E55, }, + { 0x1E56, 0x1E57, }, + { 0x1E58, 0x1E59, }, + { 0x1E5A, 0x1E5B, }, + { 0x1E5C, 0x1E5D, }, + { 0x1E5E, 0x1E5F, }, + { 0x1E60, 0x1E61, }, + { 0x1E62, 0x1E63, }, + { 0x1E64, 0x1E65, }, + { 0x1E66, 0x1E67, }, + { 0x1E68, 0x1E69, }, + { 0x1E6A, 0x1E6B, }, + { 0x1E6C, 0x1E6D, }, + { 0x1E6E, 0x1E6F, }, + { 0x1E70, 0x1E71, }, + { 0x1E72, 0x1E73, }, + { 0x1E74, 0x1E75, }, + { 0x1E76, 0x1E77, }, + { 0x1E78, 0x1E79, }, + { 0x1E7A, 0x1E7B, }, + { 0x1E7C, 0x1E7D, }, + { 0x1E7E, 0x1E7F, }, + { 0x1E80, 0x1E81, }, + { 0x1E82, 0x1E83, }, + { 0x1E84, 0x1E85, }, + { 0x1E86, 0x1E87, }, + { 0x1E88, 0x1E89, }, + { 0x1E8A, 0x1E8B, }, + { 0x1E8C, 0x1E8D, }, + { 0x1E8E, 0x1E8F, }, + { 0x1E90, 0x1E91, }, + { 0x1E92, 0x1E93, }, + { 0x1E94, 0x1E95, }, + { 0x1E9E, 0x00DF, }, + { 0x1EA0, 0x1EA1, }, + { 0x1EA2, 0x1EA3, }, + { 0x1EA4, 0x1EA5, }, + { 0x1EA6, 0x1EA7, }, + { 0x1EA8, 0x1EA9, }, + { 0x1EAA, 0x1EAB, }, + { 0x1EAC, 0x1EAD, }, + { 0x1EAE, 0x1EAF, }, + { 0x1EB0, 0x1EB1, }, + { 0x1EB2, 0x1EB3, }, + { 0x1EB4, 0x1EB5, }, + { 0x1EB6, 0x1EB7, }, + { 0x1EB8, 0x1EB9, }, + { 0x1EBA, 0x1EBB, }, + { 0x1EBC, 0x1EBD, }, + { 0x1EBE, 0x1EBF, }, + { 0x1EC0, 0x1EC1, }, + { 0x1EC2, 0x1EC3, }, + { 0x1EC4, 0x1EC5, }, + { 0x1EC6, 0x1EC7, }, + { 0x1EC8, 0x1EC9, }, + { 0x1ECA, 0x1ECB, }, + { 0x1ECC, 0x1ECD, }, + { 0x1ECE, 0x1ECF, }, + { 0x1ED0, 0x1ED1, }, + { 0x1ED2, 0x1ED3, }, + { 0x1ED4, 0x1ED5, }, + { 0x1ED6, 0x1ED7, }, + { 0x1ED8, 0x1ED9, }, + { 0x1EDA, 0x1EDB, }, + { 0x1EDC, 0x1EDD, }, + { 0x1EDE, 0x1EDF, }, + { 0x1EE0, 0x1EE1, }, + { 0x1EE2, 0x1EE3, }, + { 0x1EE4, 0x1EE5, }, + { 0x1EE6, 0x1EE7, }, + { 0x1EE8, 0x1EE9, }, + { 0x1EEA, 0x1EEB, }, + { 0x1EEC, 0x1EED, }, + { 0x1EEE, 0x1EEF, }, + { 0x1EF0, 0x1EF1, }, + { 0x1EF2, 0x1EF3, }, + { 0x1EF4, 0x1EF5, }, + { 0x1EF6, 0x1EF7, }, + { 0x1EF8, 0x1EF9, }, + { 0x1EFA, 0x1EFB, }, + { 0x1EFC, 0x1EFD, }, + { 0x1EFE, 0x1EFF, }, + { 0x1F08, 0x1F00, }, + { 0x1F09, 0x1F01, }, + { 0x1F0A, 0x1F02, }, + { 0x1F0B, 0x1F03, }, + { 0x1F0C, 0x1F04, }, + { 0x1F0D, 0x1F05, }, + { 0x1F0E, 0x1F06, }, + { 0x1F0F, 0x1F07, }, + { 0x1F18, 0x1F10, }, + { 0x1F19, 0x1F11, }, + { 0x1F1A, 0x1F12, }, + { 0x1F1B, 0x1F13, }, + { 0x1F1C, 0x1F14, }, + { 0x1F1D, 0x1F15, }, + { 0x1F28, 0x1F20, }, + { 0x1F29, 0x1F21, }, + { 0x1F2A, 0x1F22, }, + { 0x1F2B, 0x1F23, }, + { 0x1F2C, 0x1F24, }, + { 0x1F2D, 0x1F25, }, + { 0x1F2E, 0x1F26, }, + { 0x1F2F, 0x1F27, }, + { 0x1F38, 0x1F30, }, + { 0x1F39, 0x1F31, }, + { 0x1F3A, 0x1F32, }, + { 0x1F3B, 0x1F33, }, + { 0x1F3C, 0x1F34, }, + { 0x1F3D, 0x1F35, }, + { 0x1F3E, 0x1F36, }, + { 0x1F3F, 0x1F37, }, + { 0x1F48, 0x1F40, }, + { 0x1F49, 0x1F41, }, + { 0x1F4A, 0x1F42, }, + { 0x1F4B, 0x1F43, }, + { 0x1F4C, 0x1F44, }, + { 0x1F4D, 0x1F45, }, + { 0x1F59, 0x1F51, }, + { 0x1F5B, 0x1F53, }, + { 0x1F5D, 0x1F55, }, + { 0x1F5F, 0x1F57, }, + { 0x1F68, 0x1F60, }, + { 0x1F69, 0x1F61, }, + { 0x1F6A, 0x1F62, }, + { 0x1F6B, 0x1F63, }, + { 0x1F6C, 0x1F64, }, + { 0x1F6D, 0x1F65, }, + { 0x1F6E, 0x1F66, }, + { 0x1F6F, 0x1F67, }, + { 0x1F88, 0x1F80, }, + { 0x1F89, 0x1F81, }, + { 0x1F8A, 0x1F82, }, + { 0x1F8B, 0x1F83, }, + { 0x1F8C, 0x1F84, }, + { 0x1F8D, 0x1F85, }, + { 0x1F8E, 0x1F86, }, + { 0x1F8F, 0x1F87, }, + { 0x1F98, 0x1F90, }, + { 0x1F99, 0x1F91, }, + { 0x1F9A, 0x1F92, }, + { 0x1F9B, 0x1F93, }, + { 0x1F9C, 0x1F94, }, + { 0x1F9D, 0x1F95, }, + { 0x1F9E, 0x1F96, }, + { 0x1F9F, 0x1F97, }, + { 0x1FA8, 0x1FA0, }, + { 0x1FA9, 0x1FA1, }, + { 0x1FAA, 0x1FA2, }, + { 0x1FAB, 0x1FA3, }, + { 0x1FAC, 0x1FA4, }, + { 0x1FAD, 0x1FA5, }, + { 0x1FAE, 0x1FA6, }, + { 0x1FAF, 0x1FA7, }, + { 0x1FB8, 0x1FB0, }, + { 0x1FB9, 0x1FB1, }, + { 0x1FBA, 0x1F70, }, + { 0x1FBB, 0x1F71, }, + { 0x1FBC, 0x1FB3, }, + { 0x1FC8, 0x1F72, }, + { 0x1FC9, 0x1F73, }, + { 0x1FCA, 0x1F74, }, + { 0x1FCB, 0x1F75, }, + { 0x1FCC, 0x1FC3, }, + { 0x1FD8, 0x1FD0, }, + { 0x1FD9, 0x1FD1, }, + { 0x1FDA, 0x1F76, }, + { 0x1FDB, 0x1F77, }, + { 0x1FE8, 0x1FE0, }, + { 0x1FE9, 0x1FE1, }, + { 0x1FEA, 0x1F7A, }, + { 0x1FEB, 0x1F7B, }, + { 0x1FEC, 0x1FE5, }, + { 0x1FF8, 0x1F78, }, + { 0x1FF9, 0x1F79, }, + { 0x1FFA, 0x1F7C, }, + { 0x1FFB, 0x1F7D, }, + { 0x1FFC, 0x1FF3, }, + { 0x2126, 0x03C9, }, + { 0x212A, 0x006B, }, + { 0x212B, 0x00E5, }, + { 0x2132, 0x214E, }, + { 0x2160, 0x2170, }, + { 0x2161, 0x2171, }, + { 0x2162, 0x2172, }, + { 0x2163, 0x2173, }, + { 0x2164, 0x2174, }, + { 0x2165, 0x2175, }, + { 0x2166, 0x2176, }, + { 0x2167, 0x2177, }, + { 0x2168, 0x2178, }, + { 0x2169, 0x2179, }, + { 0x216A, 0x217A, }, + { 0x216B, 0x217B, }, + { 0x216C, 0x217C, }, + { 0x216D, 0x217D, }, + { 0x216E, 0x217E, }, + { 0x216F, 0x217F, }, + { 0x2183, 0x2184, }, + { 0x24B6, 0x24D0, }, + { 0x24B7, 0x24D1, }, + { 0x24B8, 0x24D2, }, + { 0x24B9, 0x24D3, }, + { 0x24BA, 0x24D4, }, + { 0x24BB, 0x24D5, }, + { 0x24BC, 0x24D6, }, + { 0x24BD, 0x24D7, }, + { 0x24BE, 0x24D8, }, + { 0x24BF, 0x24D9, }, + { 0x24C0, 0x24DA, }, + { 0x24C1, 0x24DB, }, + { 0x24C2, 0x24DC, }, + { 0x24C3, 0x24DD, }, + { 0x24C4, 0x24DE, }, + { 0x24C5, 0x24DF, }, + { 0x24C6, 0x24E0, }, + { 0x24C7, 0x24E1, }, + { 0x24C8, 0x24E2, }, + { 0x24C9, 0x24E3, }, + { 0x24CA, 0x24E4, }, + { 0x24CB, 0x24E5, }, + { 0x24CC, 0x24E6, }, + { 0x24CD, 0x24E7, }, + { 0x24CE, 0x24E8, }, + { 0x24CF, 0x24E9, }, + { 0x2C00, 0x2C30, }, + { 0x2C01, 0x2C31, }, + { 0x2C02, 0x2C32, }, + { 0x2C03, 0x2C33, }, + { 0x2C04, 0x2C34, }, + { 0x2C05, 0x2C35, }, + { 0x2C06, 0x2C36, }, + { 0x2C07, 0x2C37, }, + { 0x2C08, 0x2C38, }, + { 0x2C09, 0x2C39, }, + { 0x2C0A, 0x2C3A, }, + { 0x2C0B, 0x2C3B, }, + { 0x2C0C, 0x2C3C, }, + { 0x2C0D, 0x2C3D, }, + { 0x2C0E, 0x2C3E, }, + { 0x2C0F, 0x2C3F, }, + { 0x2C10, 0x2C40, }, + { 0x2C11, 0x2C41, }, + { 0x2C12, 0x2C42, }, + { 0x2C13, 0x2C43, }, + { 0x2C14, 0x2C44, }, + { 0x2C15, 0x2C45, }, + { 0x2C16, 0x2C46, }, + { 0x2C17, 0x2C47, }, + { 0x2C18, 0x2C48, }, + { 0x2C19, 0x2C49, }, + { 0x2C1A, 0x2C4A, }, + { 0x2C1B, 0x2C4B, }, + { 0x2C1C, 0x2C4C, }, + { 0x2C1D, 0x2C4D, }, + { 0x2C1E, 0x2C4E, }, + { 0x2C1F, 0x2C4F, }, + { 0x2C20, 0x2C50, }, + { 0x2C21, 0x2C51, }, + { 0x2C22, 0x2C52, }, + { 0x2C23, 0x2C53, }, + { 0x2C24, 0x2C54, }, + { 0x2C25, 0x2C55, }, + { 0x2C26, 0x2C56, }, + { 0x2C27, 0x2C57, }, + { 0x2C28, 0x2C58, }, + { 0x2C29, 0x2C59, }, + { 0x2C2A, 0x2C5A, }, + { 0x2C2B, 0x2C5B, }, + { 0x2C2C, 0x2C5C, }, + { 0x2C2D, 0x2C5D, }, + { 0x2C2E, 0x2C5E, }, + { 0x2C60, 0x2C61, }, + { 0x2C62, 0x026B, }, + { 0x2C63, 0x1D7D, }, + { 0x2C64, 0x027D, }, + { 0x2C67, 0x2C68, }, + { 0x2C69, 0x2C6A, }, + { 0x2C6B, 0x2C6C, }, + { 0x2C6D, 0x0251, }, + { 0x2C6E, 0x0271, }, + { 0x2C6F, 0x0250, }, + { 0x2C70, 0x0252, }, + { 0x2C72, 0x2C73, }, + { 0x2C75, 0x2C76, }, + { 0x2C7E, 0x023F, }, + { 0x2C7F, 0x0240, }, + { 0x2C80, 0x2C81, }, + { 0x2C82, 0x2C83, }, + { 0x2C84, 0x2C85, }, + { 0x2C86, 0x2C87, }, + { 0x2C88, 0x2C89, }, + { 0x2C8A, 0x2C8B, }, + { 0x2C8C, 0x2C8D, }, + { 0x2C8E, 0x2C8F, }, + { 0x2C90, 0x2C91, }, + { 0x2C92, 0x2C93, }, + { 0x2C94, 0x2C95, }, + { 0x2C96, 0x2C97, }, + { 0x2C98, 0x2C99, }, + { 0x2C9A, 0x2C9B, }, + { 0x2C9C, 0x2C9D, }, + { 0x2C9E, 0x2C9F, }, + { 0x2CA0, 0x2CA1, }, + { 0x2CA2, 0x2CA3, }, + { 0x2CA4, 0x2CA5, }, + { 0x2CA6, 0x2CA7, }, + { 0x2CA8, 0x2CA9, }, + { 0x2CAA, 0x2CAB, }, + { 0x2CAC, 0x2CAD, }, + { 0x2CAE, 0x2CAF, }, + { 0x2CB0, 0x2CB1, }, + { 0x2CB2, 0x2CB3, }, + { 0x2CB4, 0x2CB5, }, + { 0x2CB6, 0x2CB7, }, + { 0x2CB8, 0x2CB9, }, + { 0x2CBA, 0x2CBB, }, + { 0x2CBC, 0x2CBD, }, + { 0x2CBE, 0x2CBF, }, + { 0x2CC0, 0x2CC1, }, + { 0x2CC2, 0x2CC3, }, + { 0x2CC4, 0x2CC5, }, + { 0x2CC6, 0x2CC7, }, + { 0x2CC8, 0x2CC9, }, + { 0x2CCA, 0x2CCB, }, + { 0x2CCC, 0x2CCD, }, + { 0x2CCE, 0x2CCF, }, + { 0x2CD0, 0x2CD1, }, + { 0x2CD2, 0x2CD3, }, + { 0x2CD4, 0x2CD5, }, + { 0x2CD6, 0x2CD7, }, + { 0x2CD8, 0x2CD9, }, + { 0x2CDA, 0x2CDB, }, + { 0x2CDC, 0x2CDD, }, + { 0x2CDE, 0x2CDF, }, + { 0x2CE0, 0x2CE1, }, + { 0x2CE2, 0x2CE3, }, + { 0x2CEB, 0x2CEC, }, + { 0x2CED, 0x2CEE, }, + { 0x2CF2, 0x2CF3, }, + { 0xA640, 0xA641, }, + { 0xA642, 0xA643, }, + { 0xA644, 0xA645, }, + { 0xA646, 0xA647, }, + { 0xA648, 0xA649, }, + { 0xA64A, 0xA64B, }, + { 0xA64C, 0xA64D, }, + { 0xA64E, 0xA64F, }, + { 0xA650, 0xA651, }, + { 0xA652, 0xA653, }, + { 0xA654, 0xA655, }, + { 0xA656, 0xA657, }, + { 0xA658, 0xA659, }, + { 0xA65A, 0xA65B, }, + { 0xA65C, 0xA65D, }, + { 0xA65E, 0xA65F, }, + { 0xA660, 0xA661, }, + { 0xA662, 0xA663, }, + { 0xA664, 0xA665, }, + { 0xA666, 0xA667, }, + { 0xA668, 0xA669, }, + { 0xA66A, 0xA66B, }, + { 0xA66C, 0xA66D, }, + { 0xA680, 0xA681, }, + { 0xA682, 0xA683, }, + { 0xA684, 0xA685, }, + { 0xA686, 0xA687, }, + { 0xA688, 0xA689, }, + { 0xA68A, 0xA68B, }, + { 0xA68C, 0xA68D, }, + { 0xA68E, 0xA68F, }, + { 0xA690, 0xA691, }, + { 0xA692, 0xA693, }, + { 0xA694, 0xA695, }, + { 0xA696, 0xA697, }, + { 0xA698, 0xA699, }, + { 0xA69A, 0xA69B, }, + { 0xA722, 0xA723, }, + { 0xA724, 0xA725, }, + { 0xA726, 0xA727, }, + { 0xA728, 0xA729, }, + { 0xA72A, 0xA72B, }, + { 0xA72C, 0xA72D, }, + { 0xA72E, 0xA72F, }, + { 0xA732, 0xA733, }, + { 0xA734, 0xA735, }, + { 0xA736, 0xA737, }, + { 0xA738, 0xA739, }, + { 0xA73A, 0xA73B, }, + { 0xA73C, 0xA73D, }, + { 0xA73E, 0xA73F, }, + { 0xA740, 0xA741, }, + { 0xA742, 0xA743, }, + { 0xA744, 0xA745, }, + { 0xA746, 0xA747, }, + { 0xA748, 0xA749, }, + { 0xA74A, 0xA74B, }, + { 0xA74C, 0xA74D, }, + { 0xA74E, 0xA74F, }, + { 0xA750, 0xA751, }, + { 0xA752, 0xA753, }, + { 0xA754, 0xA755, }, + { 0xA756, 0xA757, }, + { 0xA758, 0xA759, }, + { 0xA75A, 0xA75B, }, + { 0xA75C, 0xA75D, }, + { 0xA75E, 0xA75F, }, + { 0xA760, 0xA761, }, + { 0xA762, 0xA763, }, + { 0xA764, 0xA765, }, + { 0xA766, 0xA767, }, + { 0xA768, 0xA769, }, + { 0xA76A, 0xA76B, }, + { 0xA76C, 0xA76D, }, + { 0xA76E, 0xA76F, }, + { 0xA779, 0xA77A, }, + { 0xA77B, 0xA77C, }, + { 0xA77D, 0x1D79, }, + { 0xA77E, 0xA77F, }, + { 0xA780, 0xA781, }, + { 0xA782, 0xA783, }, + { 0xA784, 0xA785, }, + { 0xA786, 0xA787, }, + { 0xA78B, 0xA78C, }, + { 0xA78D, 0x0265, }, + { 0xA790, 0xA791, }, + { 0xA792, 0xA793, }, + { 0xA796, 0xA797, }, + { 0xA798, 0xA799, }, + { 0xA79A, 0xA79B, }, + { 0xA79C, 0xA79D, }, + { 0xA79E, 0xA79F, }, + { 0xA7A0, 0xA7A1, }, + { 0xA7A2, 0xA7A3, }, + { 0xA7A4, 0xA7A5, }, + { 0xA7A6, 0xA7A7, }, + { 0xA7A8, 0xA7A9, }, + { 0xA7AA, 0x0266, }, + { 0xA7AB, 0x025C, }, + { 0xA7AC, 0x0261, }, + { 0xA7AD, 0x026C, }, + { 0xA7AE, 0x026A, }, + { 0xA7B0, 0x029E, }, + { 0xA7B1, 0x0287, }, + { 0xA7B2, 0x029D, }, + { 0xA7B3, 0xAB53, }, + { 0xA7B4, 0xA7B5, }, + { 0xA7B6, 0xA7B7, }, + { 0xFF21, 0xFF41, }, + { 0xFF22, 0xFF42, }, + { 0xFF23, 0xFF43, }, + { 0xFF24, 0xFF44, }, + { 0xFF25, 0xFF45, }, + { 0xFF26, 0xFF46, }, + { 0xFF27, 0xFF47, }, + { 0xFF28, 0xFF48, }, + { 0xFF29, 0xFF49, }, + { 0xFF2A, 0xFF4A, }, + { 0xFF2B, 0xFF4B, }, + { 0xFF2C, 0xFF4C, }, + { 0xFF2D, 0xFF4D, }, + { 0xFF2E, 0xFF4E, }, + { 0xFF2F, 0xFF4F, }, + { 0xFF30, 0xFF50, }, + { 0xFF31, 0xFF51, }, + { 0xFF32, 0xFF52, }, + { 0xFF33, 0xFF53, }, + { 0xFF34, 0xFF54, }, + { 0xFF35, 0xFF55, }, + { 0xFF36, 0xFF56, }, + { 0xFF37, 0xFF57, }, + { 0xFF38, 0xFF58, }, + { 0xFF39, 0xFF59, }, + { 0xFF3A, 0xFF5A, }, + { 0x10400, 0x10428, }, + { 0x10401, 0x10429, }, + { 0x10402, 0x1042A, }, + { 0x10403, 0x1042B, }, + { 0x10404, 0x1042C, }, + { 0x10405, 0x1042D, }, + { 0x10406, 0x1042E, }, + { 0x10407, 0x1042F, }, + { 0x10408, 0x10430, }, + { 0x10409, 0x10431, }, + { 0x1040A, 0x10432, }, + { 0x1040B, 0x10433, }, + { 0x1040C, 0x10434, }, + { 0x1040D, 0x10435, }, + { 0x1040E, 0x10436, }, + { 0x1040F, 0x10437, }, + { 0x10410, 0x10438, }, + { 0x10411, 0x10439, }, + { 0x10412, 0x1043A, }, + { 0x10413, 0x1043B, }, + { 0x10414, 0x1043C, }, + { 0x10415, 0x1043D, }, + { 0x10416, 0x1043E, }, + { 0x10417, 0x1043F, }, + { 0x10418, 0x10440, }, + { 0x10419, 0x10441, }, + { 0x1041A, 0x10442, }, + { 0x1041B, 0x10443, }, + { 0x1041C, 0x10444, }, + { 0x1041D, 0x10445, }, + { 0x1041E, 0x10446, }, + { 0x1041F, 0x10447, }, + { 0x10420, 0x10448, }, + { 0x10421, 0x10449, }, + { 0x10422, 0x1044A, }, + { 0x10423, 0x1044B, }, + { 0x10424, 0x1044C, }, + { 0x10425, 0x1044D, }, + { 0x10426, 0x1044E, }, + { 0x10427, 0x1044F, }, + { 0x104B0, 0x104D8, }, + { 0x104B1, 0x104D9, }, + { 0x104B2, 0x104DA, }, + { 0x104B3, 0x104DB, }, + { 0x104B4, 0x104DC, }, + { 0x104B5, 0x104DD, }, + { 0x104B6, 0x104DE, }, + { 0x104B7, 0x104DF, }, + { 0x104B8, 0x104E0, }, + { 0x104B9, 0x104E1, }, + { 0x104BA, 0x104E2, }, + { 0x104BB, 0x104E3, }, + { 0x104BC, 0x104E4, }, + { 0x104BD, 0x104E5, }, + { 0x104BE, 0x104E6, }, + { 0x104BF, 0x104E7, }, + { 0x104C0, 0x104E8, }, + { 0x104C1, 0x104E9, }, + { 0x104C2, 0x104EA, }, + { 0x104C3, 0x104EB, }, + { 0x104C4, 0x104EC, }, + { 0x104C5, 0x104ED, }, + { 0x104C6, 0x104EE, }, + { 0x104C7, 0x104EF, }, + { 0x104C8, 0x104F0, }, + { 0x104C9, 0x104F1, }, + { 0x104CA, 0x104F2, }, + { 0x104CB, 0x104F3, }, + { 0x104CC, 0x104F4, }, + { 0x104CD, 0x104F5, }, + { 0x104CE, 0x104F6, }, + { 0x104CF, 0x104F7, }, + { 0x104D0, 0x104F8, }, + { 0x104D1, 0x104F9, }, + { 0x104D2, 0x104FA, }, + { 0x104D3, 0x104FB, }, + { 0x10C80, 0x10CC0, }, + { 0x10C81, 0x10CC1, }, + { 0x10C82, 0x10CC2, }, + { 0x10C83, 0x10CC3, }, + { 0x10C84, 0x10CC4, }, + { 0x10C85, 0x10CC5, }, + { 0x10C86, 0x10CC6, }, + { 0x10C87, 0x10CC7, }, + { 0x10C88, 0x10CC8, }, + { 0x10C89, 0x10CC9, }, + { 0x10C8A, 0x10CCA, }, + { 0x10C8B, 0x10CCB, }, + { 0x10C8C, 0x10CCC, }, + { 0x10C8D, 0x10CCD, }, + { 0x10C8E, 0x10CCE, }, + { 0x10C8F, 0x10CCF, }, + { 0x10C90, 0x10CD0, }, + { 0x10C91, 0x10CD1, }, + { 0x10C92, 0x10CD2, }, + { 0x10C93, 0x10CD3, }, + { 0x10C94, 0x10CD4, }, + { 0x10C95, 0x10CD5, }, + { 0x10C96, 0x10CD6, }, + { 0x10C97, 0x10CD7, }, + { 0x10C98, 0x10CD8, }, + { 0x10C99, 0x10CD9, }, + { 0x10C9A, 0x10CDA, }, + { 0x10C9B, 0x10CDB, }, + { 0x10C9C, 0x10CDC, }, + { 0x10C9D, 0x10CDD, }, + { 0x10C9E, 0x10CDE, }, + { 0x10C9F, 0x10CDF, }, + { 0x10CA0, 0x10CE0, }, + { 0x10CA1, 0x10CE1, }, + { 0x10CA2, 0x10CE2, }, + { 0x10CA3, 0x10CE3, }, + { 0x10CA4, 0x10CE4, }, + { 0x10CA5, 0x10CE5, }, + { 0x10CA6, 0x10CE6, }, + { 0x10CA7, 0x10CE7, }, + { 0x10CA8, 0x10CE8, }, + { 0x10CA9, 0x10CE9, }, + { 0x10CAA, 0x10CEA, }, + { 0x10CAB, 0x10CEB, }, + { 0x10CAC, 0x10CEC, }, + { 0x10CAD, 0x10CED, }, + { 0x10CAE, 0x10CEE, }, + { 0x10CAF, 0x10CEF, }, + { 0x10CB0, 0x10CF0, }, + { 0x10CB1, 0x10CF1, }, + { 0x10CB2, 0x10CF2, }, + { 0x118A0, 0x118C0, }, + { 0x118A1, 0x118C1, }, + { 0x118A2, 0x118C2, }, + { 0x118A3, 0x118C3, }, + { 0x118A4, 0x118C4, }, + { 0x118A5, 0x118C5, }, + { 0x118A6, 0x118C6, }, + { 0x118A7, 0x118C7, }, + { 0x118A8, 0x118C8, }, + { 0x118A9, 0x118C9, }, + { 0x118AA, 0x118CA, }, + { 0x118AB, 0x118CB, }, + { 0x118AC, 0x118CC, }, + { 0x118AD, 0x118CD, }, + { 0x118AE, 0x118CE, }, + { 0x118AF, 0x118CF, }, + { 0x118B0, 0x118D0, }, + { 0x118B1, 0x118D1, }, + { 0x118B2, 0x118D2, }, + { 0x118B3, 0x118D3, }, + { 0x118B4, 0x118D4, }, + { 0x118B5, 0x118D5, }, + { 0x118B6, 0x118D6, }, + { 0x118B7, 0x118D7, }, + { 0x118B8, 0x118D8, }, + { 0x118B9, 0x118D9, }, + { 0x118BA, 0x118DA, }, + { 0x118BB, 0x118DB, }, + { 0x118BC, 0x118DC, }, + { 0x118BD, 0x118DD, }, + { 0x118BE, 0x118DE, }, + { 0x118BF, 0x118DF, }, + { 0x1E900, 0x1E922, }, + { 0x1E901, 0x1E923, }, + { 0x1E902, 0x1E924, }, + { 0x1E903, 0x1E925, }, + { 0x1E904, 0x1E926, }, + { 0x1E905, 0x1E927, }, + { 0x1E906, 0x1E928, }, + { 0x1E907, 0x1E929, }, + { 0x1E908, 0x1E92A, }, + { 0x1E909, 0x1E92B, }, + { 0x1E90A, 0x1E92C, }, + { 0x1E90B, 0x1E92D, }, + { 0x1E90C, 0x1E92E, }, + { 0x1E90D, 0x1E92F, }, + { 0x1E90E, 0x1E930, }, + { 0x1E90F, 0x1E931, }, + { 0x1E910, 0x1E932, }, + { 0x1E911, 0x1E933, }, + { 0x1E912, 0x1E934, }, + { 0x1E913, 0x1E935, }, + { 0x1E914, 0x1E936, }, + { 0x1E915, 0x1E937, }, + { 0x1E916, 0x1E938, }, + { 0x1E917, 0x1E939, }, + { 0x1E918, 0x1E93A, }, + { 0x1E919, 0x1E93B, }, + { 0x1E91A, 0x1E93C, }, + { 0x1E91B, 0x1E93D, }, + { 0x1E91C, 0x1E93E, }, + { 0x1E91D, 0x1E93F, }, + { 0x1E91E, 0x1E940, }, + { 0x1E91F, 0x1E941, }, + { 0x1E920, 0x1E942, }, + { 0x1E921, 0x1E943, }, }; -#define UTF8_CONVERSIONS (sizeof(UTF8_lower_upper) / sizeof(UTF8_lower_upper[0])) +static BAT *UTF8_toUpperFrom = NULL, *UTF8_toUpperTo = NULL, + *UTF8_toLowerFrom = NULL, *UTF8_toLowerTo = NULL; + +#ifndef NDEBUG +static void +UTF8_assert(const char *s) +{ + int c; -static BAT *UTF8_upperBat = NULL, *UTF8_lowerBat = NULL; + if (s == NULL) + return; + if (*s == '\200' && s[1] == '\0') + return; /* str_nil */ + while ((c = *s++) != '\0') { + if ((c & 0x80) == 0) + continue; + if ((*s++ & 0xC0) != 0x80) + assert(0); + if ((c & 0xE0) == 0xC0) + continue; + if ((*s++ & 0xC0) != 0x80) + assert(0); + if ((c & 0xF0) == 0xE0) + continue; + if ((*s++ & 0xC0) != 0x80) + assert(0); + if ((c & 0xF8) == 0xF0) + continue; + assert(0); + } +} +#else +#define UTF8_assert(s) ((void) 0) +#endif str strPrelude(void *ret) { (void) ret; - if (UTF8_upperBat == NULL) { - int i = UTF8_CONVERSIONS; + if (UTF8_toUpperFrom == NULL) { + size_t i; - UTF8_upperBat = COLnew(0, TYPE_int, UTF8_CONVERSIONS, TRANSIENT); - UTF8_lowerBat = COLnew(0, TYPE_int, UTF8_CONVERSIONS, TRANSIENT); - if (UTF8_upperBat == NULL || UTF8_lowerBat == NULL) + UTF8_toUpperFrom = COLnew(0, TYPE_int, 1500, TRANSIENT); + UTF8_toUpperTo = COLnew(0, TYPE_int, 1500, TRANSIENT); + UTF8_toLowerFrom = COLnew(0, TYPE_int, 1500, TRANSIENT); + UTF8_toLowerTo = COLnew(0, TYPE_int, 1500, TRANSIENT); + if (UTF8_toUpperFrom == NULL || UTF8_toUpperTo == NULL || + UTF8_toLowerFrom == NULL || UTF8_toLowerTo == NULL) { goto bailout; + } - while (--i >= 0) { - if (BUNappend(UTF8_upperBat, &UTF8_lower_upper[i].upper, FALSE) != GDK_SUCCEED || - BUNappend(UTF8_lowerBat, &UTF8_lower_upper[i].lower, FALSE) != GDK_SUCCEED) { + for (i = 0; i < sizeof(UTF8_toUpper) / sizeof(UTF8_toUpper[0]); i++) { + if (BUNappend(UTF8_toUpperFrom, &UTF8_toUpper[i].from, FALSE) != GDK_SUCCEED || + BUNappend(UTF8_toUpperTo, &UTF8_toUpper[i].to, FALSE) != GDK_SUCCEED) goto bailout; - } } - if (BBPrename(UTF8_upperBat->batCacheid, "monet_unicode_toupper") != 0 || - BBPrename(UTF8_lowerBat->batCacheid, "monet_unicode_tolower") != 0) { + + for (i = 0; i < sizeof(UTF8_toLower) / sizeof(UTF8_toLower[0]); i++) { + if (BUNappend(UTF8_toLowerFrom, &UTF8_toLower[i].from, FALSE) != GDK_SUCCEED || + BUNappend(UTF8_toLowerTo, &UTF8_toLower[i].to, FALSE) != GDK_SUCCEED) + goto bailout; + } + + if (BBPrename(UTF8_toUpperFrom->batCacheid, "monet_unicode_upper_from") != 0 || + BBPrename(UTF8_toUpperTo->batCacheid, "monet_unicode_upper_to") != 0 || + BBPrename(UTF8_toLowerFrom->batCacheid, "monet_unicode_lower_from") != 0 || + BBPrename(UTF8_toLowerTo->batCacheid, "monet_unicode_lower_to") != 0) { goto bailout; } } return MAL_SUCCEED; bailout: - BBPreclaim(UTF8_upperBat); - BBPreclaim(UTF8_lowerBat); - UTF8_upperBat = NULL; - UTF8_lowerBat = NULL; + BBPreclaim(UTF8_toUpperFrom); + BBPreclaim(UTF8_toUpperTo); + BBPreclaim(UTF8_toLowerFrom); + BBPreclaim(UTF8_toLowerTo); + UTF8_toUpperFrom = NULL; + UTF8_toUpperTo = NULL; + UTF8_toLowerFrom = NULL; + UTF8_toLowerTo = NULL; throw(MAL, "str.prelude", GDK_EXCEPTION); } @@ -1124,95 +2816,71 @@ str strEpilogue(void *ret) { (void) ret; - if (UTF8_upperBat) - BBPunfix(UTF8_upperBat->batCacheid); - if (UTF8_lowerBat) - BBPunfix(UTF8_lowerBat->batCacheid); - UTF8_upperBat = UTF8_lowerBat = NULL; + if (UTF8_toUpperFrom) + BBPunfix(UTF8_toUpperFrom->batCacheid); + if (UTF8_toUpperTo) + BBPunfix(UTF8_toUpperTo->batCacheid); + if (UTF8_toLowerFrom) + BBPunfix(UTF8_toLowerFrom->batCacheid); + if (UTF8_toLowerTo) + BBPunfix(UTF8_toLowerTo->batCacheid); + UTF8_toUpperFrom = NULL; + UTF8_toUpperTo = NULL; + UTF8_toLowerFrom = NULL; + UTF8_toLowerTo = NULL; return MAL_SUCCEED; } -/* Get the last char in (X2), and #bytes it takes, but do not decrease the pos in (X2) - * The ELSE IF conditions are computed by comparing the left most byte with the - * (mask-bits - 1). The '-1' is to use '>' i.s.o. '>='. - * See gdk_atoms.c for UTF-8 encoding, especially, definitions of the mask-bits */ +/* Get the last char in (X2), and #bytes it takes, but do not decrease + * the pos in (X2). See gdk_atoms.c for UTF-8 encoding */ #define UTF8_LASTCHAR(X1, SZ, X2, SZ2) \ do { \ - if (*((X2)+SZ2-1) < 0x80) { \ - (X1) = *((X2)+SZ2-1); \ + if (((X2)[SZ2-1] & 0x80) == 0) { \ + (X1) = (X2)[SZ2-1]; \ (SZ) = 1; \ - } else if (*((X2)+SZ2-2) > 0xBF) { \ - (X1) = (*((X2)+SZ2-2) & 0x1F) << 6; \ - (X1) |= (*((X2)+SZ2-1) & 0x3F); \ + } else if (((X2)[SZ2-2] & 0xE0) == 0xC0) { \ + (X1) = ((X2)[SZ2-2] & 0x1F) << 6; \ + (X1) |= ((X2)[SZ2-1] & 0x3F); \ (SZ) = 2; \ - } else if (*((X2)+SZ2-3) > 0xDF) { \ - (X1) = (*((X2)+SZ2-3) & 0x0F) << 12; \ - (X1) |= (*((X2)+SZ2-2) & 0x3F) << 6; \ - (X1) |= (*((X2)+SZ2-1) & 0x3F); \ + } else if (((X2)[SZ2-3] & 0xF0) == 0xE0) { \ + (X1) = ((X2)[SZ2-3] & 0x0F) << 12; \ + (X1) |= ((X2)[SZ2-2] & 0x3F) << 6; \ + (X1) |= ((X2)[SZ2-1] & 0x3F); \ (SZ) = 3; \ - } else if (*((X2)+SZ2-4) > 0xEF) { \ - (X1) = (*((X2)+SZ2-4) & 0x07) << 18; \ - (X1) |= (*((X2)+SZ2-3) & 0x3F) << 12; \ - (X1) |= (*((X2)+SZ2-2) & 0x3F) << 6; \ - (X1) |= (*((X2)+SZ2-1) & 0x3F); \ + } else if (((X2)[SZ2-4] & 0xF8) == 0xF0) { \ + (X1) = ((X2)[SZ2-4] & 0x07) << 18; \ + (X1) |= ((X2)[SZ2-3] & 0x3F) << 12; \ + (X1) |= ((X2)[SZ2-2] & 0x3F) << 6; \ + (X1) |= ((X2)[SZ2-1] & 0x3F); \ (SZ) = 4; \ - } else if (*((X2)+SZ2-5) > 0xF7) { \ - (X1) = (*((X2)+SZ2-5) & 0x03) << 24; \ - (X1) |= (*((X2)+SZ2-4) & 0x3F) << 18; \ - (X1) |= (*((X2)+SZ2-3) & 0x3F) << 12; \ - (X1) |= (*((X2)+SZ2-2) & 0x3F) << 6; \ - (X1) |= (*((X2)+SZ2-1) & 0x3F); \ - (SZ) = 5; \ - } else if (*((X2)+SZ2-6) > 0xFB) { \ - (X1) = (*((X2)+SZ2-6) & 0x01) << 30; \ - (X1) |= (*((X2)+SZ2-5) & 0x3F) << 24; \ - (X1) |= (*((X2)+SZ2-4) & 0x3F) << 18; \ - (X1) |= (*((X2)+SZ2-3) & 0x3F) << 12; \ - (X1) |= (*((X2)+SZ2-2) & 0x3F) << 6; \ - (X1) |= (*((X2)+SZ2-1) & 0x3F); \ - (SZ) = 6; \ } else { \ (X1) = int_nil; \ (SZ) = 0; \ } \ } while (0) -/* Get the first char in (X2), and #bytes it takes, but do not increase the pos in (X2) */ +/* Get the first char in (X2), and #bytes it takes, but do not + * increase the pos in (X2) */ #define UTF8_NEXTCHAR(X1, SZ, X2) \ do { \ - if (*(X2) < 0x80) { \ - (X1) = *(X2); \ + if (((X2)[0] & 0x80) == 0) { \ + (X1) = (X2)[0]; \ (SZ) = 1; \ - } else if (*(X2) < 0xE0) { \ - (X1) = ( *(X2) & 0x1F) << 6; \ - (X1) |= (*((X2)+1) & 0x3F); \ + } else if (((X2)[0] & 0xE0) == 0xC0) { \ + (X1) = ((X2)[0] & 0x1F) << 6; \ + (X1) |= ((X2)[1] & 0x3F); \ (SZ) = 2; \ - } else if (*(X2) < 0xF0) { \ - (X1) = ( *(X2) & 0x0F) << 12; \ - (X1) |= (*((X2)+1) & 0x3F) << 6; \ - (X1) |= (*((X2)+2) & 0x3F); \ + } else if (((X2)[0] & 0xF0) == 0xE0) { \ + (X1) = ((X2)[0] & 0x0F) << 12; \ + (X1) |= ((X2)[1] & 0x3F) << 6; \ + (X1) |= ((X2)[2] & 0x3F); \ (SZ) = 3; \ - } else if (*(X2) < 0xF8) { \ - (X1) = ( *(X2) & 0x07) << 18; \ - (X1) |= (*((X2)+1) & 0x3F) << 12; \ - (X1) |= (*((X2)+2) & 0x3F) << 6; \ - (X1) |= (*((X2)+3) & 0x3F); \ + } else if (((X2)[0] & 0xF8) == 0xF0) { \ + (X1) = ((X2)[0] & 0x07) << 18; \ + (X1) |= ((X2)[1] & 0x3F) << 12; \ + (X1) |= ((X2)[2] & 0x3F) << 6; \ + (X1) |= ((X2)[3] & 0x3F); \ (SZ) = 4; \ - } else if (*(X2) < 0xFC) { \ - (X1) = ( *(X2) & 0x03) << 24; \ - (X1) |= (*((X2)+1) & 0x3F) << 18; \ - (X1) |= (*((X2)+2) & 0x3F) << 12; \ - (X1) |= (*((X2)+3) & 0x3F) << 6; \ - (X1) |= (*((X2)+4) & 0x3F); \ - (SZ) = 5; \ - } else if (*(X2) < 0xFE) { \ - (X1) = ( *(X2) & 0x01) << 30; \ - (X1) |= (*((X2)+1) & 0x3F) << 24; \ - (X1) |= (*((X2)+2) & 0x3F) << 18; \ - (X1) |= (*((X2)+3) & 0x3F) << 12; \ - (X1) |= (*((X2)+4) & 0x3F) << 6; \ - (X1) |= (*((X2)+5) & 0x3F); \ - (SZ) = 6; \ } else { \ (X1) = int_nil; \ (SZ) = 0; \ @@ -1222,39 +2890,24 @@ strEpilogue(void *ret) /* Get the first char in (X2), and #bytes it takes */ #define UTF8_GETCHAR_SZ(X1, SZ, X2) \ do { \ - if (*(X2) < 0x80) { \ + if ((*(X2) & 0x80) == 0) { \ (X1) = *(X2)++; \ (SZ) = 1; \ - } else if (*(X2) < 0xE0) { \ + } else if ((*(X2) & 0xE0) == 0xC0) { \ (X1) = (*(X2)++ & 0x1F) << 6; \ (X1) |= (*(X2)++ & 0x3F); \ (SZ) = 2; \ - } else if (*(X2) < 0xF0) { \ + } else if ((*(X2) & 0xF0) == 0xE0) { \ (X1) = (*(X2)++ & 0x0F) << 12; \ (X1) |= (*(X2)++ & 0x3F) << 6; \ (X1) |= (*(X2)++ & 0x3F); \ (SZ) = 3; \ - } else if (*(X2) < 0xF8) { \ + } else if ((*(X2) & 0xF8) == 0xF0) { \ (X1) = (*(X2)++ & 0x07) << 18; \ (X1) |= (*(X2)++ & 0x3F) << 12; \ (X1) |= (*(X2)++ & 0x3F) << 6; \ (X1) |= (*(X2)++ & 0x3F); \ (SZ) = 4; \ - } else if (*(X2) < 0xFC) { \ - (X1) = (*(X2)++ & 0x03) << 24; \ - (X1) |= (*(X2)++ & 0x3F) << 18; \ - (X1) |= (*(X2)++ & 0x3F) << 12; \ - (X1) |= (*(X2)++ & 0x3F) << 6; \ - (X1) |= (*(X2)++ & 0x3F); \ - (SZ) = 5; \ - } else if (*(X2) < 0xFE) { \ - (X1) = (*(X2)++ & 0x01) << 30; \ - (X1) |= (*(X2)++ & 0x3F) << 24; \ - (X1) |= (*(X2)++ & 0x3F) << 18; \ - (X1) |= (*(X2)++ & 0x3F) << 12; \ - (X1) |= (*(X2)++ & 0x3F) << 6; \ - (X1) |= (*(X2)++ & 0x3F); \ - (SZ) = 6; \ } else { \ (X1) = int_nil; \ (SZ) = 0; \ @@ -1263,200 +2916,119 @@ strEpilogue(void *ret) #define UTF8_GETCHAR(X1, X2) \ do { \ - if (*(X2) < 0x80) { \ + if ((*(X2) & 0x80) == 0) { \ (X1) = *(X2)++; \ - } else if (*(X2) < 0xE0) { \ + } else if ((*(X2) & 0xE0) == 0xC0) { \ (X1) = (*(X2)++ & 0x1F) << 6; \ (X1) |= (*(X2)++ & 0x3F); \ - } else if (*(X2) < 0xF0) { \ + } else if ((*(X2) & 0xF0) == 0xE0) { \ (X1) = (*(X2)++ & 0x0F) << 12; \ (X1) |= (*(X2)++ & 0x3F) << 6; \ (X1) |= (*(X2)++ & 0x3F); \ - } else if (*(X2) < 0xF8) { \ + } else if ((*(X2) & 0xF8) == 0xF0) { \ (X1) = (*(X2)++ & 0x07) << 18; \ (X1) |= (*(X2)++ & 0x3F) << 12; \ (X1) |= (*(X2)++ & 0x3F) << 6; \ (X1) |= (*(X2)++ & 0x3F); \ - } else if (*(X2) < 0xFC) { \ - (X1) = (*(X2)++ & 0x03) << 24; \ - (X1) |= (*(X2)++ & 0x3F) << 18; \ - (X1) |= (*(X2)++ & 0x3F) << 12; \ - (X1) |= (*(X2)++ & 0x3F) << 6; \ - (X1) |= (*(X2)++ & 0x3F); \ - } else if (*(X2) < 0xFE) { \ - (X1) = (*(X2)++ & 0x01) << 30; \ - (X1) |= (*(X2)++ & 0x3F) << 24; \ - (X1) |= (*(X2)++ & 0x3F) << 18; \ - (X1) |= (*(X2)++ & 0x3F) << 12; \ - (X1) |= (*(X2)++ & 0x3F) << 6; \ - (X1) |= (*(X2)++ & 0x3F); \ } else { \ (X1) = int_nil; \ } \ } while (0) -#define UTF8_PUTCHAR(X1,X2) \ - do { \ - if ((X1) < 0 || (SIZEOF_INT > 4 && (int) (X1) >= 0x80000000)) { \ - *(X2)++ = '\200'; \ - } else if ((X1) < 0x80) { \ - *(X2)++ = (X1); \ - } else if ((X1) < 0x800) { \ - *(X2)++ = 0xC0 | ((X1) >> 6); \ - *(X2)++ = 0x80 | ((X1) & 0x3F); \ - } else if ((X1) < 0x10000) { \ - *(X2)++ = 0xE0 | ((X1) >> 12); \ - *(X2)++ = 0x80 | (((X1) >> 6) & 0x3F); \ - *(X2)++ = 0x80 | ((X1) & 0x3F); \ - } else if ((X1) < 0x200000) { \ - *(X2)++ = 0xF0 | ((X1) >> 18); \ - *(X2)++ = 0x80 | (((X1) >> 12) & 0x3F); \ - *(X2)++ = 0x80 | (((X1) >> 6) & 0x3F); \ - *(X2)++ = 0x80 | ((X1) & 0x3F); \ - } else if ((X1) < 0x4000000) { \ - *(X2)++ = 0xF8 | ((X1) >> 24); \ - *(X2)++ = 0x80 | (((X1) >> 18) & 0x3F); \ - *(X2)++ = 0x80 | (((X1) >> 12) & 0x3F); \ - *(X2)++ = 0x80 | (((X1) >> 6) & 0x3F); \ - *(X2)++ = 0x80 | ((X1) & 0x3F); \ - } else /* if ((X1) < 0x80000000) */ { \ - *(X2)++ = 0xFC | ((X1) >> 30); \ - *(X2)++ = 0x80 | (((X1) >> 24) & 0x3F); \ - *(X2)++ = 0x80 | (((X1) >> 18) & 0x3F); \ - *(X2)++ = 0x80 | (((X1) >> 12) & 0x3F); \ - *(X2)++ = 0x80 | (((X1) >> 6) & 0x3F); \ - *(X2)++ = 0x80 | ((X1) & 0x3F); \ - } \ +#define UTF8_PUTCHAR(X1,X2) \ + do { \ + if ((X1) < 0 || (X1) > 0x10FFFF) { \ + goto illegal; \ + } else if ((X1) <= 0x7F) { \ + *(X2)++ = (X1); \ + } else if ((X1) <= 0x7FF) { \ + *(X2)++ = 0xC0 | ((X1) >> 6); \ + *(X2)++ = 0x80 | ((X1) & 0x3F); \ + } else if ((X1) <= 0xFFFF) { \ + *(X2)++ = 0xE0 | ((X1) >> 12); \ + *(X2)++ = 0x80 | (((X1) >> 6) & 0x3F); \ + *(X2)++ = 0x80 | ((X1) & 0x3F); \ + } else { \ + *(X2)++ = 0xF0 | ((X1) >> 18); \ + *(X2)++ = 0x80 | (((X1) >> 12) & 0x3F); \ + *(X2)++ = 0x80 | (((X1) >> 6) & 0x3F); \ + *(X2)++ = 0x80 | ((X1) & 0x3F); \ + } \ } while (0) -static inline int -UTF8_strlen(const char *val) +static inline size_t +UTF8_strlen(const char *s) { - const unsigned char *s = (const unsigned char *) val; - int pos = 0; + size_t pos = 0; + + UTF8_assert(s); + + if (GDK_STRNIL(s)) + return 1; while (*s) { - int c = *s++; - - pos++; - if (c < 0xC0) - continue; - if (*s++ < 0x80) - return int_nil; - if (c < 0xE0) - continue; - if (*s++ < 0x80) - return int_nil; - if (c < 0xF0) - continue; - if (*s++ < 0x80) - return int_nil; - if (c < 0xF8) - continue; - if (*s++ < 0x80) - return int_nil; - if (c < 0xFC) - continue; - if (*s++ < 0x80) - return int_nil; + /* just count leading bytes of encoded code points; only works + * for correctly encoded UTF-8 */ + pos += (*s++ & 0xC0) != 0x80; } return pos; } static inline int -UTF8_strpos(const char *val, const char *end) +UTF8_strpos(const char *s, const char *end) { - const unsigned char *s = (const unsigned char *) val; int pos = 0; - if (s > (unsigned char *) end) { + UTF8_assert(s); + + if (s > end) { return -1; } - while (s < (unsigned char *) end) { - int c = *s++; - - pos++; - if (c == 0) - return -1; - if (c < 0xC0) - continue; - if (*s++ < 0x80) - return -1; - if (c < 0xE0) - continue; - if (*s++ < 0x80) - return -1; - if (c < 0xF0) - continue; - if (*s++ < 0x80) - return -1; - if (c < 0xF8) - continue; - if (*s++ < 0x80) - return -1; - if (c < 0xFC) - continue; - if (*s++ < 0x80) - return -1; + while (s < end) { + /* just count leading bytes of encoded code points; only works + * for correctly encoded UTF-8 */ + pos += (*s++ & 0xC0) != 0x80; } return pos; } static inline str -UTF8_strtail(const char *val, int pos) +UTF8_strtail(const char *s, int pos) { - const unsigned char *s = (const unsigned char *) val; - - while (*s && pos-- > 0) { - int c = *s++; - - if (c < 0xC0) - continue; - if (*s++ < 0x80) - return NULL; - if (c < 0xE0) - continue; - if (*s++ < 0x80) - return NULL; - if (c < 0xF0) - continue; - if (*s++ < 0x80) - return NULL; - if (c < 0xF8) - continue; - if (*s++ < 0x80) - return NULL; - if (c < 0xFC) - continue; - if (*s++ < 0x80) - return NULL; + UTF8_assert(s); + while (*s) { + if ((*s & 0xC0) != 0x80) { + if (pos <= 0) + break; + pos--; + } + s++; } return (str) s; } static str -convertCase(BAT *from, BAT *to, str *res, const char *s, const char *malfunc) +convertCase(BAT *from, BAT *to, str *res, const char *src, const char *malfunc) { BATiter toi = bat_iterator(to); BATiter fromi = bat_iterator(from); - size_t len = strlen(s); - unsigned char *dst; - const unsigned char *src = (const unsigned char *) s; - const unsigned char *end = (const unsigned char *) (src + len); + size_t len = strlen(src); + char *dst; + const char *end = src + len; BUN UTF8_CONV_r; - int lower_to_upper = from == UTF8_lowerBat; + int lower_to_upper = from == UTF8_toUpperFrom; - if (strNil(s)) { + if (strNil(src)) { *res = GDKstrdup(str_nil); } else { *res = GDKmalloc(len + 1); if (*res != NULL) { - dst = (unsigned char *) *res; + dst = *res; while (src < end) { int c; UTF8_GETCHAR(c, src); - if (c < 0x80) { + if ((c & 0x80) == 0) { /* for ASCII characters we don't need to do a hash * lookup */ if (lower_to_upper) { @@ -1467,25 +3039,26 @@ convertCase(BAT *from, BAT *to, str *res c += 'a' - 'A'; } } else { + /* use hash, even though BAT is sorted */ HASHfnd_int(UTF8_CONV_r, fromi, &c); if (UTF8_CONV_r != BUN_NONE) c = *(int*) BUNtloc(toi, UTF8_CONV_r); } - if (dst + 6 > (unsigned char *) *res + len) { + if (dst + 4 > *res + len) { /* not guaranteed to fit, so allocate more space; * also allocate enough for the rest of the * source */ - size_t off = dst - (unsigned char *) *res; + size_t off = dst - *res; - dst = GDKrealloc(*res, (len += 6 + (end - src)) + 1); + dst = GDKrealloc(*res, (len += 4 + (end - src)) + 1); if (dst == NULL) { /* if realloc fails, original buffer is still * allocated, so free it */ GDKfree(*res); goto hashfnd_failed; } - *res = (char *) dst; - dst = (unsigned char *) *res + off; + *res = dst; + dst = *res + off; } UTF8_PUTCHAR(c, dst); } @@ -1496,6 +3069,8 @@ convertCase(BAT *from, BAT *to, str *res return MAL_SUCCEED; hashfnd_failed: throw(MAL, malfunc, SQLSTATE(HY001) MAL_MALLOC_FAIL); + illegal: + throw(MAL, malfunc, SQLSTATE(42000) "Illegal Unicode code point"); } /* @@ -1545,13 +3120,16 @@ STRlike(const char *s, const char *pat, } str -STRlikewrap(bit *ret, const str *s, const str *pat, const str *esc){ - *ret = STRlike(*s,*pat,*esc); +STRlikewrap(bit *ret, const str *s, const str *pat, const str *esc) +{ + *ret = STRlike(*s, *pat, *esc); return MAL_SUCCEED; } + str -STRlikewrap2(bit *ret, const str *s, const str *pat){ - *ret = STRlike(*s,*pat,0); +STRlikewrap2(bit *ret, const str *s, const str *pat) +{ + *ret = STRlike(*s, *pat, NULL); return MAL_SUCCEED; } @@ -1567,9 +3145,6 @@ STRtostr(str *res, const str *src) return MAL_SUCCEED; } -/* - * The concatenate operator requires a type in most cases. - */ str STRConcat(str *res, const str *val1, const str *val2) { @@ -1596,7 +3171,7 @@ STRConcat(str *res, const str *val1, con str STRLength(int *res, const str *arg1) { - int l; + size_t l; const char *s = *arg1; if (strNil(s)) { @@ -1604,8 +3179,10 @@ STRLength(int *res, const str *arg1) return MAL_SUCCEED; } l = UTF8_strlen(s); - assert(l <INT_MAX); - *res = l; + assert(l < INT_MAX); + if (l > INT_MAX) + l = INT_MAX; + *res = (int) l; return MAL_SUCCEED; } @@ -1615,7 +3192,7 @@ STRBytes(int *res, const str *arg1) size_t l; l = strlen(*arg1); - assert(l <INT_MAX); + assert(l < INT_MAX); *res = (int) l; return MAL_SUCCEED; } @@ -1630,19 +3207,14 @@ STRTail(str *res, const str *arg1, const *res = GDKstrdup(str_nil); } else { if (off < 0) { - int len = UTF8_strlen(s); + size_t len = UTF8_strlen(s); - if (len == int_nil) { - *res = GDKstrdup(str_nil); - if (*res == NULL) - throw(MAL, "str.tail", SQLSTATE(HY001) MAL_MALLOC_FAIL); - return MAL_SUCCEED; - } - off = len + off; + assert(len <= INT_MAX); + off += (int) len; if (off < 0) off = 0; } - *res = (char *) GDKstrdup(UTF8_strtail(s, off)); + *res = GDKstrdup(UTF8_strtail(s, off)); } if (*res == NULL) throw(MAL, "str.tail", SQLSTATE(HY001) MAL_MALLOC_FAIL); @@ -1652,7 +3224,8 @@ STRTail(str *res, const str *arg1, const str STRSubString(str *res, const str *arg1, const int *offset, const int *length) { - int len, off = *offset, l = *length; + size_t len; + int off = *offset, l = *length; const char *s = *arg1; if (strNil(s) || off == int_nil || l == int_nil) { @@ -1663,13 +3236,10 @@ STRSubString(str *res, const str *arg1, } if (off < 0) { len = UTF8_strlen(s); - if (len == int_nil) { - *res = GDKstrdup(str_nil); - if (*res == NULL) - throw(MAL, "str.substring", SQLSTATE(HY001) MAL_MALLOC_FAIL); - return MAL_SUCCEED; - } - off = len + off; + assert(len <= INT_MAX); + if (len > INT_MAX) + len = INT_MAX; + off += (int) len; if (off < 0) { l += off; off = 0; @@ -1683,7 +3253,7 @@ STRSubString(str *res, const str *arg1, return MAL_SUCCEED; } s = UTF8_strtail(s, off); - len = (int) (UTF8_strtail(s, l) - s); + len = (size_t) (UTF8_strtail(s, l) - s); *res = GDKmalloc(len + 1); if (*res == NULL) throw(MAL, "str.substring", SQLSTATE(HY001) MAL_MALLOC_FAIL); @@ -1695,23 +3265,35 @@ STRSubString(str *res, const str *arg1, str STRFromWChr(str *res, const int *c) { - str s = *res = GDKmalloc(7); + str s; + if (*c == int_nil) { + *res = GDKstrdup(str_nil); + if (*res == NULL) + throw(MAL, "str.unicode", SQLSTATE(HY001) MAL_MALLOC_FAIL); + return MAL_SUCCEED; + } + + s = *res = GDKmalloc(5); if (*res == NULL) throw(MAL, "str.unicode", SQLSTATE(HY001) MAL_MALLOC_FAIL); UTF8_PUTCHAR(*c, s); *s = 0; return MAL_SUCCEED; + illegal: + GDKfree(*res); + *res = NULL; + throw(MAL, "str.unicode", SQLSTATE(42000) "Illegal Unicode code point"); } +/* return the Unicode code point of arg1 at position at */ str STRWChrAt(int *res, const str *arg1, const int *at) { /* 64bit: should have lng arg */ const char *s = *arg1; - const unsigned char *u; - if (strNil(*arg1) || *at == int_nil || *at < 0) { + if (strNil(s) || *at == int_nil || *at < 0) { *res = int_nil; return MAL_SUCCEED; } @@ -1720,15 +3302,14 @@ STRWChrAt(int *res, const str *arg1, con *res = int_nil; return MAL_SUCCEED; } - u = (const unsigned char *) s; - UTF8_GETCHAR(*res, u); + UTF8_GETCHAR(*res, s); return MAL_SUCCEED; } +/* returns whether arg1 starts with arg2 */ str STRPrefix(bit *res, const str *arg1, const str *arg2) { - size_t pl, i; const char *s = *arg1; const char *prefix = *arg2; @@ -1736,25 +3317,15 @@ STRPrefix(bit *res, const str *arg1, con *res = bit_nil; return MAL_SUCCEED; } - pl = strlen(prefix); - if (strlen(s) < pl) { - *res = 0; - return MAL_SUCCEED; - } - *res = 1; - for (i = 0; i < pl; i++) { - if (s[i] != prefix[i]) { - *res = 0; - break; - } - } + *res = strncmp(s, prefix, strlen(prefix)) == 0; return MAL_SUCCEED; } +/* returns whether arg1 ends with arg2 */ str STRSuffix(bit *res, const str *arg1, const str *arg2) { - size_t i, sl, sul; + size_t sl, sul; const char *s = *arg1; const char *suffix = *arg2; @@ -1765,37 +3336,30 @@ STRSuffix(bit *res, const str *arg1, con sl = strlen(s); sul = strlen(suffix); - if (sl < sul) { + if (sl < sul) *res = 0; - return MAL_SUCCEED; - } - *res = 1; - for (i = 0; i < sul; i++) { - if (s[sl - 1 - i] != suffix[sul - 1 - i]) { - *res = 0; - break; - } - } + else + *res = strcmp(s + sl - sul, suffix) == 0; return MAL_SUCCEED; } str STRLower(str *res, const str *arg1) { - return convertCase(UTF8_upperBat, UTF8_lowerBat, res, *arg1, "str.lower"); + return convertCase(UTF8_toLowerFrom, UTF8_toLowerTo, res, *arg1, "str.lower"); } str STRUpper(str *res, const str *arg1) { - return convertCase(UTF8_lowerBat, UTF8_upperBat, res, *arg1, "str.upper"); + return convertCase(UTF8_toUpperFrom, UTF8_toUpperTo, res, *arg1, "str.upper"); } +/* find first occurrence of needle in haystack */ str STRstrSearch(int *res, const str *haystack, const str *needle) { /* 64bit: should return lng */ - char *p; const char *s = *haystack; const char *s2 = *needle; @@ -1803,20 +3367,19 @@ STRstrSearch(int *res, const str *haysta *res = int_nil; return MAL_SUCCEED; } - if ((p = strstr(s, s2)) != 0) - *res = UTF8_strpos(s, p); + if ((s2 = strstr(s, s2)) != NULL) + *res = UTF8_strpos(s, s2); else *res = -1; return MAL_SUCCEED; } +/* find last occurrence of arg2 in arg1 */ str STRReverseStrSearch(int *res, const str *arg1, const str *arg2) { /* 64bit: should return lng */ size_t len, slen; - const char *p, *q; - size_t i; const char *s = *arg1; const char *s2 = *arg2; @@ -1827,13 +3390,15 @@ STRReverseStrSearch(int *res, const str *res = -1; len = strlen(s); slen = strlen(s2); - for (p = s + len - slen; p >= s; p--) { - for (i = 0, q = p; i < slen && *q == s2[i]; i++, q++) - ; - if (i == slen) { - *res = UTF8_strpos(s, p); - break; - } + *res = -1; /* changed if found */ + if (len >= slen) { + const char *p = s + len - slen; + do { + if (strncmp(p, s2, slen) == 0) { + *res = UTF8_strpos(s, p); + break; + } + } while (p-- > s); } return MAL_SUCCEED; } @@ -1848,19 +3413,19 @@ STRsplitpart(str *res, str *haystack, st const char *s2 = *needle; if (strNil(s) || *field == int_nil) { - *res = GDKstrdup(""); + *res = GDKstrdup(str_nil); if (*res == NULL) throw(MAL, "str.splitpart", SQLSTATE(HY001) MAL_MALLOC_FAIL); return MAL_SUCCEED; } if (*field <= 0) { - throw(MAL, "str.splitpart", "field position must be greater than zero"); + throw(MAL, "str.splitpart", SQLSTATE(42000) "field position must be greater than zero"); } len = strlen(s2); - while ((p = strstr(s, s2)) != 0 && f > 1) { + while ((p = strstr(s, s2)) != NULL && f > 1) { s = p + len; f--; } @@ -1872,257 +3437,312 @@ STRsplitpart(str *res, str *haystack, st return MAL_SUCCEED; } - if (p == 0) { + if (p == NULL) { len = strlen(s); - } else if ((p = strstr(s, s2)) != 0) { + } else { len = (size_t) (p - s); - } else { - len = strlen(s); } - if (len == 0) { - *res = GDKstrdup(""); - if (*res == NULL) - throw(MAL, "str.splitpart", SQLSTATE(HY001) MAL_MALLOC_FAIL); - return MAL_SUCCEED; - } - *res = GDKmalloc(len + 1); + *res = GDKstrndup(s, len); if (*res == NULL) throw(MAL, "str.splitpart", SQLSTATE(HY001) MAL_MALLOC_FAIL); - strncpy(*res, s, len); - (*res)[len] = 0; - return MAL_SUCCEED; -} - -str -STRStrip(str *res, const str *arg1) -{ - const char *start = *arg1; - const char *s; - size_t len; - - while (GDKisspace(*start)) - start++; - - /* Remove the trailing spaces. Make sure not to pass the start */ - /* pointer in case a string only contains spaces. */ - s = start + strlen(start); - while (s > start && GDKisspace(*(s - 1))) - s--; - - len = s - start + 1; - *res = GDKmalloc(len); - if (*res == NULL) - throw(MAL, "str.trim", SQLSTATE(HY001) MAL_MALLOC_FAIL); - memcpy(*res, start, len - 1); - (*res)[len - 1] = '\0'; return MAL_SUCCEED; } -/* Remove the longest string containing only characters from 'arg2' from the start of 'arg1' - * - * Example: trim('zzzytrimzyxyyz', 'xyz') - * Result: trim - */ -str -STRStrip2(str *res, const str *arg1, const str *arg2) +/* returns number of bytes to remove from left to strip the codepoints in rm */ +static size_t +lstrip(const char *s, size_t len, const int *rm, size_t nrm) { - const char *s = *arg1, *s2 = *arg2; - const unsigned char *u = NULL; - int *toRm = NULL; /* candidate list of to be removed characters, converted to INT */ - int i = 0, rm_cnt = UTF8_strlen(s2); - size_t len = strlen(*arg1); + int c; + size_t i, n, skip = 0; + + while (len > 0) { + UTF8_NEXTCHAR(c, n, s); + assert(n > 0 && n <= len); + for (i = 0; i < nrm; i++) { + if (rm[i] == c) { + s += n; + skip += n; + len -= n; + break; + } + } + if (i == nrm) + break; + } + return skip; +} + +/* returns the resulting length of s after stripping codepoints in rm + * from the right */ +static size_t +rstrip(const char *s, size_t len, const int *rm, size_t nrm) +{ + int c; + size_t i, n; - toRm = GDKmalloc(sizeof(int) * rm_cnt); - if (toRm == NULL) - throw(MAL, "str.trim", SQLSTATE(HY001) MAL_MALLOC_FAIL); - u = (const unsigned char *) s2; - for (i = 0; i < rm_cnt; i++) - UTF8_GETCHAR(toRm[i], u); - /* Just a sanity check that all bytes of s2 are consumed */ - if (u[0] != '\0') { - GDKfree(toRm); - throw(MAL, "str.trim", "Invalid UTF-8 string %s", *arg2); + while (len > 0) { + UTF8_LASTCHAR(c, n, s, len); + assert(n > 0 && n <= len); + for (i = 0; i < nrm; i++) { + if (rm[i] == c) { + len -= n; + break; + } + } + if (i == nrm) + break; } + return len; +} - if (strNil(s)) { +const int whitespace[] = { + ' ', /* space */ + '\t', /* tab (character tabulation) */ + '\n', /* line feed */ + '\r', /* carriage return */ + '\f', /* form feed */ + '\v', /* vertical tab (line tabulation) */ +/* below the code points that have the Unicode Zs (space separator) property */ + 0x00A0, /* no-break space */ + 0x1680, /* ogham space mark */ + 0x2000, /* en quad */ + 0x2001, /* em quad */ + 0x2002, /* en space */ + 0x2003, /* em space */ + 0x2004, /* three-per-em space */ + 0x2005, /* four-per-em space */ + 0x2006, /* six-per-em space */ + 0x2007, /* figure space */ + 0x2008, /* punctuation space */ + 0x2009, /* thin space */ + 0x200A, /* hair space */ + 0x202F, /* narrow no-break space */ + 0x205F, /* medium mathematical space */ + 0x3000, /* ideographic space */ +}; +#define NSPACES (sizeof(whitespace) / sizeof(whitespace[0])) + +/* remove all whitespace from either side of arg1 */ +str +STRStrip(str *res, const str *arg1) +{ + const char *s = *arg1; + size_t len; + size_t n; + + if (GDK_STRNIL(s)) { *res = GDKstrdup(str_nil); } else { - int c = 0, sz = 0; - const unsigned char *v = NULL; - - /* trim left */ - u = (const unsigned char *) s; - do { - UTF8_NEXTCHAR(c, sz, u); - - for (i = 0; i < rm_cnt; i++) { - if (toRm[i] == c) { - u += sz; - break; - } - } - } while (i < rm_cnt); - /* trim right */ - v = (const unsigned char *) s; - do { - UTF8_LASTCHAR(c, sz, v, len); - - for (i = 0; i < rm_cnt; i++) { - if (toRm[i] == c) { - len -= sz; - break; - } - } - } while (i < rm_cnt); - *res = GDKstrndup((const char*)u, len - ((const char*)u - s)); + len = strlen(s); + n = lstrip(s, len, whitespace, NSPACES); + s += n; + len -= n; + n = rstrip(s, len, whitespace, NSPACES); + *res = GDKstrndup(s, n); } - - GDKfree(toRm); if (*res == NULL) - throw(MAL, "str.ltrim", SQLSTATE(HY001) MAL_MALLOC_FAIL); + throw(MAL, "str.trim", SQLSTATE(HY001) MAL_MALLOC_FAIL); return MAL_SUCCEED; } +/* remove all whitespace from the start (left) of arg1 */ str STRLtrim(str *res, const str *arg1) { const char *s = *arg1; - if (strNil(s)) { + size_t len; + size_t n; + + if (GDK_STRNIL(s)) { *res = GDKstrdup(str_nil); } else { - while (GDKisspace(*s)) - s++; - *res = GDKstrdup(s); + len = strlen(s); + n = lstrip(s, len, whitespace, NSPACES); + *res = GDKstrndup(s + n, len - n); } if (*res == NULL) throw(MAL, "str.ltrim", SQLSTATE(HY001) MAL_MALLOC_FAIL); return MAL_SUCCEED; } -/* Remove the longest string containing only characters from 'arg2' from the start of 'arg1' - * - * Example: ltrim('zzzytrim', 'xyz') - * Result: trim - */ -str -STRLtrim2(str *res, const str *arg1, const str *arg2) -{ - const char *s = *arg1, *s2 = *arg2; - const unsigned char *u = NULL; - int *toRm = NULL; /* candidate list of to be removed characters, converted to INT */ - int i = 0, rm_cnt = UTF8_strlen(s2); - - toRm = GDKmalloc(sizeof(int) * rm_cnt); - if (toRm == NULL) - throw(MAL, "str.ltrim", SQLSTATE(HY001) MAL_MALLOC_FAIL); - u = (const unsigned char *) s2; - for (i = 0; i < rm_cnt; i++) - UTF8_GETCHAR(toRm[i], u); - /* Just a sanity check that all bytes of s2 are consumed */ - if (u[0] != '\0') { - GDKfree(toRm); - throw(MAL, "str.ltrim", "Invalid UTF-8 string %s", *arg2); - } - - if (strNil(s)) { - *res = GDKstrdup(str_nil); - } else { - int c = 0, sz = 0; - - u = (const unsigned char *) s; - do { - UTF8_NEXTCHAR(c, sz, u); - - for (i = 0; i < rm_cnt; i++) { - if (toRm[i] == c) { - u += sz; - break; - } - } - } while (i < rm_cnt); - *res = GDKstrdup((const char*)u); - } - - GDKfree(toRm); - if (*res == NULL) - throw(MAL, "str.ltrim", SQLSTATE(HY001) MAL_MALLOC_FAIL); - return MAL_SUCCEED; -} - +/* remove all whitespace from the end (right) of arg1 */ str STRRtrim(str *res, const str *arg1) { const char *s = *arg1; - size_t len = strlen(*arg1); + size_t len; + size_t n; - if (strNil(s)) { + if (GDK_STRNIL(s)) { *res = GDKstrdup(str_nil); } else { - while (len > 0 && GDKisspace(s[len - 1])) - len--; - *res = GDKmalloc(len + 1); - if (*res != NULL) { - memcpy(*res, s, len); - (*res)[len] = '\0'; - } + len = strlen(s); + n = rstrip(s, len, whitespace, NSPACES); + *res = GDKstrndup(s, n); } if (*res == NULL) throw(MAL, "str.rtrim", SQLSTATE(HY001) MAL_MALLOC_FAIL); return MAL_SUCCEED; } -/* Remove the longest string containing only characters from 'arg2' from the end of 'arg1' - * - * Example: rtrim('trimxxxxxxxxx', 'xyz') - * Result: trim - */ -str -STRRtrim2(str *res, const str *arg1, const str *arg2) +/* return a list of codepoints in s */ +static int * +trimchars(const char *s, size_t *n) { - const char *s = *arg1, *s2 = *arg2; - const unsigned char *u = NULL; - int *toRm = NULL; - int i = 0, rm_cnt = UTF8_strlen(*arg2); - size_t len = strlen(*arg1); + size_t len = 0; + int *chars = GDKmalloc(strlen(s) * sizeof(int)); + int c; + + if (chars == NULL) + return NULL; - toRm = GDKmalloc(sizeof(int) * rm_cnt); - if (toRm == NULL) - throw(MAL, "str.rtrim", SQLSTATE(HY001) MAL_MALLOC_FAIL); - u = (const unsigned char *) s2; - for (i = 0; i < rm_cnt; i++) - UTF8_GETCHAR(toRm[i], u); - /* Just a sanity check that all bytes of arg2 are consumed */ - if (u[0] != '\0') { - GDKfree(toRm); - throw(MAL, "str.rtrim", "Invalid UTF-8 string %s", *arg2); + while (*s) { + UTF8_GETCHAR(c, s); + assert(c != int_nil); + chars[len++] = c; } + *n = len; + return chars; +} - if (strNil(s)) { +/* remove the longest string containing only characters from arg2 from + * either side of arg1 */ +str +STRStrip2(str *res, const str *arg1, const str *arg2) +{ + const char *s = *arg1; + size_t len; + size_t n; + size_t nchars; + int *chars; + + if (GDK_STRNIL(s)) { *res = GDKstrdup(str_nil); } else { - int c = 0, sz = 0; - u = (unsigned char *) s; - do { - UTF8_LASTCHAR(c, sz, u, len); + chars = trimchars(*arg2, &nchars); + if (chars == NULL) + throw(MAL, "str.trim", SQLSTATE(HY001) MAL_MALLOC_FAIL); + len = strlen(s); + n = lstrip(s, len, chars, nchars); + s += n; + len -= n; + n = rstrip(s, len, chars, nchars); + GDKfree(chars); + *res = GDKstrndup(s, n); + } + if (*res == NULL) + throw(MAL, "str.trim", SQLSTATE(HY001) MAL_MALLOC_FAIL); + return MAL_SUCCEED; +} - for (i = 0; i < rm_cnt; i++) { - if (toRm[i] == c) { - len -= sz; - break; - } - } - } while (i < rm_cnt); - *res = GDKstrndup(s, len); +/* remove the longest string containing only characters from arg2 from + * the start (left) of arg1 */ +str +STRLtrim2(str *res, const str *arg1, const str *arg2) +{ + const char *s = *arg1; + size_t len; + size_t n; + size_t nchars; + int *chars; + + if (GDK_STRNIL(s)) { + *res = GDKstrdup(str_nil); + } else { + chars = trimchars(*arg2, &nchars); + if (chars == NULL) + throw(MAL, "str.trim", SQLSTATE(HY001) MAL_MALLOC_FAIL); + len = strlen(s); + n = lstrip(s, len, chars, nchars); + *res = GDKstrndup(s + n, len - n); } - - GDKfree(toRm); if (*res == NULL) throw(MAL, "str.ltrim", SQLSTATE(HY001) MAL_MALLOC_FAIL); return MAL_SUCCEED; } -/* Fill up 'arg1' to lenth 'len' by prepending whitespaces. +/* remove the longest string containing only characters from arg2 from + * the end (right) of arg1 */ +str +STRRtrim2(str *res, const str *arg1, const str *arg2) +{ + const char *s = *arg1; + size_t len; + size_t n; + size_t nchars; + int *chars; + + if (GDK_STRNIL(s)) { + *res = GDKstrdup(str_nil); + } else { + chars = trimchars(*arg2, &nchars); + if (chars == NULL) + throw(MAL, "str.trim", SQLSTATE(HY001) MAL_MALLOC_FAIL); + len = strlen(s); + n = rstrip(s, len, chars, nchars); + *res = GDKstrndup(s, n); + } + if (*res == NULL) + throw(MAL, "str.rtrim", SQLSTATE(HY001) MAL_MALLOC_FAIL); + return MAL_SUCCEED; +} + +static char * +pad(const char *s, const char *pad, int len, int left) +{ + size_t slen, padlen, repeats, residual, i; + char *res; + + if (GDK_STRNIL(s) || GDK_STRNIL(pad) || len == int_nil) + return GDKstrdup(str_nil); + + if (len < 0) + len = 0; + + slen = UTF8_strlen(s); + + if (slen > (size_t) len) { + /* truncate */ + pad = UTF8_strtail(s, len); + return GDKstrndup(s, pad - s); + } + + padlen = UTF8_strlen(pad); + if (slen == (size_t) len || padlen == 0) { + /* nothing to do (no padding if there is no pad string) */ + return GDKstrdup(s); + } + + repeats = ((size_t) len - slen) / padlen; + residual = ((size_t) len - slen) % padlen; + if (residual > 0) + residual = (size_t) (UTF8_strtail(pad, (int) residual) - pad); + padlen = strlen(pad); + slen = strlen(s); + res = GDKmalloc(slen + repeats * padlen + residual + 1); + if (res == NULL) + return NULL; + if (left) { + for (i = 0; i < repeats; i++) + memcpy(res + i * padlen, pad, padlen); + if (residual > 0) + memcpy(res + repeats * padlen, pad, residual); + if (slen > 0) + memcpy(res + repeats * padlen + residual, s, slen); + } else { + if (slen > 0) + memcpy(res, s, slen); + for (i = 0; i < repeats; i++) + memcpy(res + slen + i * padlen, pad, padlen); + if (residual > 0) + memcpy(res + slen + repeats * padlen, pad, residual); + } + res[repeats * padlen + residual + slen] = 0; + return res; +} + +/* Fill up 'arg1' to length 'len' by prepending whitespaces. * If 'arg1' is already longer than 'len', then it's truncated on the right * (NB: this is the PostgreSQL definition). * @@ -2132,36 +3752,13 @@ STRRtrim2(str *res, const str *arg1, con str STRLpad(str *res, const str *arg1, const int *len) { - const char *s = *arg1; - int pad_cnt = *len - UTF8_strlen(s); /* #whitespaces to be prepended */ - - if (pad_cnt == 0) { - *res = GDKstrdup(s); - } else if (pad_cnt < 0) { /* truncate */ - s = UTF8_strtail(s, *len); - *res = GDKstrndup(*arg1, s - *arg1); - } else { /* pad_cnt > 0: fill */ - int i = 0; - size_t s_len = strlen(s), - res_len = pad_cnt + s_len; - char *r = GDKmalloc(res_len+1); - - if (r == NULL) - throw(MAL, "str.lpad", SQLSTATE(HY001) MAL_MALLOC_FAIL); - for (i = 0; i < pad_cnt; i++) { - r[i] = ' '; - } - memcpy(r + pad_cnt, s, s_len); - r[res_len] = '\0'; - *res = r; - } - + *res = pad(*arg1, " ", *len, 1); if (*res == NULL) throw(MAL, "str.lpad", SQLSTATE(HY001) MAL_MALLOC_FAIL); return MAL_SUCCEED; } -/* Fill up 'arg1' to lenth 'len' by appending whitespaces. +/* Fill up 'arg1' to length 'len' by appending whitespaces. * If 'arg1' is already longer than 'len', then it's truncated (on the right) * (NB: this is the PostgreSQL definition). * @@ -2171,36 +3768,13 @@ STRLpad(str *res, const str *arg1, const str STRRpad(str *res, const str *arg1, const int *len) { - const char *s = *arg1; - int pad_cnt = *len - UTF8_strlen(s); /* #whitespaces to be appended */ - - if (pad_cnt == 0) { - *res = GDKstrdup(s); - } else if (pad_cnt < 0) { /* truncate */ - s = UTF8_strtail(s, *len); - *res = GDKstrndup(*arg1, s - *arg1); - } else { /* pad_cnt > 0: fill */ - size_t i = 0, - s_len = strlen(s), - res_len = pad_cnt + s_len; - char *r = GDKmalloc(res_len+1); - - if (r == NULL) - throw(MAL, "str.lpad", SQLSTATE(HY001) MAL_MALLOC_FAIL); - memcpy(r, s, s_len); - for (i = s_len; i < res_len; i++) { - r[i] = ' '; - } - r[res_len] = '\0'; - *res = r; - } - + *res = pad(*arg1, " ", *len, 0); if (*res == NULL) - throw(MAL, "str.lpad", SQLSTATE(HY001) MAL_MALLOC_FAIL); + throw(MAL, "str.rpad", SQLSTATE(HY001) MAL_MALLOC_FAIL); return MAL_SUCCEED; } -/* Fill up 'arg1' to lenth 'len' by prepending characters from 'arg2' +/* Fill up 'arg1' to length 'len' by prepending characters from 'arg2' * If 'arg1' is already longer than 'len', then it's truncated on the right * (NB: this is the PostgreSQL definition). * @@ -2210,62 +3784,16 @@ STRRpad(str *res, const str *arg1, const str STRLpad2(str *res, const str *arg1, const int *len, const str *arg2) { - const char *s = *arg1; - int pad_cnt = *len - UTF8_strlen(s); /* #chars to be prepended */ - - if (pad_cnt == 0) { - *res = GDKstrdup(s); - } else if (pad_cnt < 0) { /* truncate */ - s = UTF8_strtail(s, *len); - *res = GDKstrndup(*arg1, s - *arg1); - } else { /* pad_cnt > 0: fill */ - const char *s2 = *arg2, *s2_tmp = *arg2; - char *r = NULL; - const unsigned char *u = NULL; - int i, c, sz, s2_cnt, nr_repeat, nr_residual; - size_t s_len, s2_len, repeat_len, residual_len, res_len; + if (**arg2 == 0) + throw(MAL, "str.lpad", SQLSTATE(42000) ILLEGAL_ARGUMENT ": pad string is empty"); - i = 0; - c = 0; - sz = 0; - s2_cnt = UTF8_strlen(s2); - if (s2_cnt == 0) - throw(MAL, "str.lpad", ILLEGAL_ARGUMENT ": pad string is empty"); - nr_repeat = pad_cnt / s2_cnt; - nr_residual = pad_cnt % s2_cnt; - s_len = strlen(s); - s2_len = strlen(s2); - repeat_len = s2_len * nr_repeat; - residual_len = 0; - res_len = s_len + repeat_len; - u = (const unsigned char *) s2_tmp; - for (i = 0; i < nr_residual; i++) { - UTF8_GETCHAR_SZ(c, sz, u); - residual_len += sz; - } - res_len += residual_len; - r = GDKmalloc(res_len+1); - if (r == NULL) - throw(MAL, "str.lpad", SQLSTATE(HY001) MAL_MALLOC_FAIL); - for (i = 0; i < pad_cnt; i++) { - r[i] = ' '; - } - - for (i = 0; i < nr_repeat; i++) { - memcpy(r + s2_len*i, s2, s2_len); - } - memcpy(r + repeat_len, s2, residual_len); - memcpy(r + repeat_len + residual_len, s, s_len); - r[res_len] = '\0'; - *res = r; - } - + *res = pad(*arg1, *arg2, *len, 1); if (*res == NULL) throw(MAL, "str.lpad", SQLSTATE(HY001) MAL_MALLOC_FAIL); return MAL_SUCCEED; } -/* Fill up 'arg1' to lenth 'len' by appending characters from 'arg2' +/* Fill up 'arg1' to length 'len' by appending characters from 'arg2' * If 'arg1' is already longer than 'len', then it's truncated (on the right) * (NB: this is the PostgreSQL definition). * @@ -2275,59 +3803,12 @@ STRLpad2(str *res, const str *arg1, cons str STRRpad2(str *res, const str *arg1, const int *len, const str *arg2) { - const char *s = *arg1; - int pad_cnt = *len - UTF8_strlen(s); /* #chars to be appended */ - - if (pad_cnt == 0) { - *res = GDKstrdup(s); - } else if (pad_cnt < 0) { /* truncate */ - s = UTF8_strtail(s, *len); - *res = GDKstrndup(*arg1, s - *arg1); - } else { /* pad_cnt > 0: fill */ - const char *s2 = *arg2, *s2_tmp = *arg2; - char *r = NULL; - const unsigned char *u = NULL; - int i, c, sz, s2_cnt, nr_repeat, nr_residual; - size_t s_len, s2_len, repeat_len, residual_len, res_len; + if (**arg2 == 0) + throw(MAL, "str.rpad", SQLSTATE(42000) ILLEGAL_ARGUMENT ": pad string is empty"); - i = 0; - c = 0; - sz = 0; - s2_cnt = UTF8_strlen(s2); - if (s2_cnt == 0) - throw(MAL, "str.rpad", ILLEGAL_ARGUMENT ": pad string is empty"); - nr_repeat = pad_cnt / s2_cnt; - nr_residual = pad_cnt % s2_cnt; - s_len = strlen(s); - s2_len = strlen(s2); - repeat_len = s2_len * nr_repeat; - residual_len = 0; - res_len = s_len + repeat_len; - - u = (const unsigned char *)s2_tmp; - for (i = 0; i < nr_residual; i++) { - UTF8_GETCHAR_SZ(c, sz, u); - residual_len += sz; - } - res_len += residual_len; - r = GDKmalloc(res_len+1); - if (r == NULL) - throw(MAL, "str.lpad", SQLSTATE(HY001) MAL_MALLOC_FAIL); - for (i = 0; i < pad_cnt; i++) { - r[i] = ' '; - } - - memcpy(r, s, s_len); - for (i = 0; i < nr_repeat; i++) { - memcpy(r + s_len + s2_len*i, s2, s2_len); - } - memcpy(r + s_len + repeat_len, s2, residual_len); - r[res_len] = '\0'; - *res = r; - } - + *res = pad(*arg1, *arg2, *len, 0); if (*res == NULL) - throw(MAL, "str.lpad", SQLSTATE(HY001) MAL_MALLOC_FAIL); + throw(MAL, "str.rpad", SQLSTATE(HY001) MAL_MALLOC_FAIL); return MAL_SUCCEED; } @@ -2453,7 +3934,7 @@ STRinsert(str *ret, const str *s, const throw(MAL, "str.insert", SQLSTATE(HY001) MAL_MALLOC_FAIL); } if (*l < 0) - throw(MAL, "str.insert", ILLEGAL_ARGUMENT); + throw(MAL, "str.insert", SQLSTATE(42000) ILLEGAL_ARGUMENT); if (strt < 0) { if ((size_t) -strt <= l1) strt = (int) (l1 + strt);