mirror of
https://github.com/php/php-src.git
synced 2026-04-22 15:38:49 +02:00
128768a450
In 04e59c916f, I amended the UTF-8 conversion code, so that when given invalid input, it would emit a number of error markers harmonizing with the WHATWG's specification of the standard UTF-8 decoding algorithm. (Which, gentle reader of commit logs, you can find online at https://encoding.spec.whatwg.org/#utf-8-decoder.) However, the code in 04e59c916f was faulty in the case that a truncated UTF-8 code unit starts with 0xF1. Then, in dc1ba61d09, when making a small refactoring to a different part of the UTF-8 conversion code, I inexplicably broke part of the working code, causing the same fault which was already present with truncated UTF-8 code units starting with 0xF1 to also occur with 0xF2 and 0xF3. I don't remember what inane thoughts I was thinking when I pulled off this feat of utter mental confusion. None of these cases were covered by unit tests, by the way. Thankfully, my trusty fuzzer picked up on this when testing the new implementation of mb_parse_str (since the legacy UTF-8 conversion filter did not suffer from the same problem, and I was fuzzing to find any differences in behavior between the old and new implementations). Fortuitously, the fuzzer also picked up another issue which was present in 04e59c916f. I was emitting only one error marker for truncated code units starting with 0xE0 or 0xED, in cases where the WHATWG standard indicates two should be emitted. Examples are 0xE0 0x9F <END OF STRING> or 0xED 0xA0 <END OF STRING>. Code units starting with 0xE0-0xED should have 3 bytes. If the first byte is 0xE0, the second MUST be 0xA0 or greater. (Otherwise, the codepoint could have fit in a two-byte code unit.) And if the first byte is 0xED, the second MUST be 0x9F or less. According to the WHATWG algorithm, step 4, if the second byte is outside the legal range, then the decoder should emit an error... AND reprocess the out-of-range byte. The reprocessing will then cause another error.
That's why the decoder should indicate two errors and not one.
1094 lines
29 KiB
PHP
1094 lines
29 KiB
PHP
--TEST--
|
|
Torture test for UTF-{7,8,16,32}
|
|
--EXTENSIONS--
|
|
mbstring
|
|
--SKIPIF--
|
|
<?php
|
|
if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
|
|
?>
|
|
--FILE--
|
|
<?php
|
|
srand(232); /* Make results consistent */
|
|
mb_substitute_character(0x25); // '%'
|
|
include('encoding_tests.inc');
|
|
|
|
// all ranges of valid codepoints in UnicodeData.txt
|
|
$validRanges = [
|
|
[0x0, 0x377],
|
|
[0x37a, 0x37f],
|
|
[0x384, 0x38a],
|
|
[0x38c, 0x38c],
|
|
[0x38e, 0x3a1],
|
|
[0x3a3, 0x52f],
|
|
[0x531, 0x556],
|
|
[0x559, 0x58a],
|
|
[0x58d, 0x58f],
|
|
[0x591, 0x5c7],
|
|
[0x5d0, 0x5ea],
|
|
[0x5ef, 0x5f4],
|
|
[0x600, 0x61c],
|
|
[0x61e, 0x70d],
|
|
[0x70f, 0x74a],
|
|
[0x74d, 0x7b1],
|
|
[0x7c0, 0x7fa],
|
|
[0x7fd, 0x82d],
|
|
[0x830, 0x83e],
|
|
[0x840, 0x85b],
|
|
[0x85e, 0x85e],
|
|
[0x860, 0x86a],
|
|
[0x8a0, 0x8b4],
|
|
[0x8b6, 0x8c7],
|
|
[0x8d3, 0x983],
|
|
[0x985, 0x98c],
|
|
[0x98f, 0x990],
|
|
[0x993, 0x9a8],
|
|
[0x9aa, 0x9b0],
|
|
[0x9b2, 0x9b2],
|
|
[0x9b6, 0x9b9],
|
|
[0x9bc, 0x9c4],
|
|
[0x9c7, 0x9c8],
|
|
[0x9cb, 0x9ce],
|
|
[0x9d7, 0x9d7],
|
|
[0x9dc, 0x9dd],
|
|
[0x9df, 0x9e3],
|
|
[0x9e6, 0x9fe],
|
|
[0xa01, 0xa03],
|
|
[0xa05, 0xa0a],
|
|
[0xa0f, 0xa10],
|
|
[0xa13, 0xa28],
|
|
[0xa2a, 0xa30],
|
|
[0xa32, 0xa33],
|
|
[0xa35, 0xa36],
|
|
[0xa38, 0xa39],
|
|
[0xa3c, 0xa3c],
|
|
[0xa3e, 0xa42],
|
|
[0xa47, 0xa48],
|
|
[0xa4b, 0xa4d],
|
|
[0xa51, 0xa51],
|
|
[0xa59, 0xa5c],
|
|
[0xa5e, 0xa5e],
|
|
[0xa66, 0xa76],
|
|
[0xa81, 0xa83],
|
|
[0xa85, 0xa8d],
|
|
[0xa8f, 0xa91],
|
|
[0xa93, 0xaa8],
|
|
[0xaaa, 0xab0],
|
|
[0xab2, 0xab3],
|
|
[0xab5, 0xab9],
|
|
[0xabc, 0xac5],
|
|
[0xac7, 0xac9],
|
|
[0xacb, 0xacd],
|
|
[0xad0, 0xad0],
|
|
[0xae0, 0xae3],
|
|
[0xae6, 0xaf1],
|
|
[0xaf9, 0xaff],
|
|
[0xb01, 0xb03],
|
|
[0xb05, 0xb0c],
|
|
[0xb0f, 0xb10],
|
|
[0xb13, 0xb28],
|
|
[0xb2a, 0xb30],
|
|
[0xb32, 0xb33],
|
|
[0xb35, 0xb39],
|
|
[0xb3c, 0xb44],
|
|
[0xb47, 0xb48],
|
|
[0xb4b, 0xb4d],
|
|
[0xb55, 0xb57],
|
|
[0xb5c, 0xb5d],
|
|
[0xb5f, 0xb63],
|
|
[0xb66, 0xb77],
|
|
[0xb82, 0xb83],
|
|
[0xb85, 0xb8a],
|
|
[0xb8e, 0xb90],
|
|
[0xb92, 0xb95],
|
|
[0xb99, 0xb9a],
|
|
[0xb9c, 0xb9c],
|
|
[0xb9e, 0xb9f],
|
|
[0xba3, 0xba4],
|
|
[0xba8, 0xbaa],
|
|
[0xbae, 0xbb9],
|
|
[0xbbe, 0xbc2],
|
|
[0xbc6, 0xbc8],
|
|
[0xbca, 0xbcd],
|
|
[0xbd0, 0xbd0],
|
|
[0xbd7, 0xbd7],
|
|
[0xbe6, 0xbfa],
|
|
[0xc00, 0xc0c],
|
|
[0xc0e, 0xc10],
|
|
[0xc12, 0xc28],
|
|
[0xc2a, 0xc39],
|
|
[0xc3d, 0xc44],
|
|
[0xc46, 0xc48],
|
|
[0xc4a, 0xc4d],
|
|
[0xc55, 0xc56],
|
|
[0xc58, 0xc5a],
|
|
[0xc60, 0xc63],
|
|
[0xc66, 0xc6f],
|
|
[0xc77, 0xc8c],
|
|
[0xc8e, 0xc90],
|
|
[0xc92, 0xca8],
|
|
[0xcaa, 0xcb3],
|
|
[0xcb5, 0xcb9],
|
|
[0xcbc, 0xcc4],
|
|
[0xcc6, 0xcc8],
|
|
[0xcca, 0xccd],
|
|
[0xcd5, 0xcd6],
|
|
[0xcde, 0xcde],
|
|
[0xce0, 0xce3],
|
|
[0xce6, 0xcef],
|
|
[0xcf1, 0xcf2],
|
|
[0xd00, 0xd0c],
|
|
[0xd0e, 0xd10],
|
|
[0xd12, 0xd44],
|
|
[0xd46, 0xd48],
|
|
[0xd4a, 0xd4f],
|
|
[0xd54, 0xd63],
|
|
[0xd66, 0xd7f],
|
|
[0xd81, 0xd83],
|
|
[0xd85, 0xd96],
|
|
[0xd9a, 0xdb1],
|
|
[0xdb3, 0xdbb],
|
|
[0xdbd, 0xdbd],
|
|
[0xdc0, 0xdc6],
|
|
[0xdca, 0xdca],
|
|
[0xdcf, 0xdd4],
|
|
[0xdd6, 0xdd6],
|
|
[0xdd8, 0xddf],
|
|
[0xde6, 0xdef],
|
|
[0xdf2, 0xdf4],
|
|
[0xe01, 0xe3a],
|
|
[0xe3f, 0xe5b],
|
|
[0xe81, 0xe82],
|
|
[0xe84, 0xe84],
|
|
[0xe86, 0xe8a],
|
|
[0xe8c, 0xea3],
|
|
[0xea5, 0xea5],
|
|
[0xea7, 0xebd],
|
|
[0xec0, 0xec4],
|
|
[0xec6, 0xec6],
|
|
[0xec8, 0xecd],
|
|
[0xed0, 0xed9],
|
|
[0xedc, 0xedf],
|
|
[0xf00, 0xf47],
|
|
[0xf49, 0xf6c],
|
|
[0xf71, 0xf97],
|
|
[0xf99, 0xfbc],
|
|
[0xfbe, 0xfcc],
|
|
[0xfce, 0xfda],
|
|
[0x1000, 0x10c5],
|
|
[0x10c7, 0x10c7],
|
|
[0x10cd, 0x10cd],
|
|
[0x10d0, 0x1248],
|
|
[0x124a, 0x124d],
|
|
[0x1250, 0x1256],
|
|
[0x1258, 0x1258],
|
|
[0x125a, 0x125d],
|
|
[0x1260, 0x1288],
|
|
[0x128a, 0x128d],
|
|
[0x1290, 0x12b0],
|
|
[0x12b2, 0x12b5],
|
|
[0x12b8, 0x12be],
|
|
[0x12c0, 0x12c0],
|
|
[0x12c2, 0x12c5],
|
|
[0x12c8, 0x12d6],
|
|
[0x12d8, 0x1310],
|
|
[0x1312, 0x1315],
|
|
[0x1318, 0x135a],
|
|
[0x135d, 0x137c],
|
|
[0x1380, 0x1399],
|
|
[0x13a0, 0x13f5],
|
|
[0x13f8, 0x13fd],
|
|
[0x1400, 0x169c],
|
|
[0x16a0, 0x16f8],
|
|
[0x1700, 0x170c],
|
|
[0x170e, 0x1714],
|
|
[0x1720, 0x1736],
|
|
[0x1740, 0x1753],
|
|
[0x1760, 0x176c],
|
|
[0x176e, 0x1770],
|
|
[0x1772, 0x1773],
|
|
[0x1780, 0x17dd],
|
|
[0x17e0, 0x17e9],
|
|
[0x17f0, 0x17f9],
|
|
[0x1800, 0x180e],
|
|
[0x1810, 0x1819],
|
|
[0x1820, 0x1878],
|
|
[0x1880, 0x18aa],
|
|
[0x18b0, 0x18f5],
|
|
[0x1900, 0x191e],
|
|
[0x1920, 0x192b],
|
|
[0x1930, 0x193b],
|
|
[0x1940, 0x1940],
|
|
[0x1944, 0x196d],
|
|
[0x1970, 0x1974],
|
|
[0x1980, 0x19ab],
|
|
[0x19b0, 0x19c9],
|
|
[0x19d0, 0x19da],
|
|
[0x19de, 0x1a1b],
|
|
[0x1a1e, 0x1a5e],
|
|
[0x1a60, 0x1a7c],
|
|
[0x1a7f, 0x1a89],
|
|
[0x1a90, 0x1a99],
|
|
[0x1aa0, 0x1aad],
|
|
[0x1ab0, 0x1ac0],
|
|
[0x1b00, 0x1b4b],
|
|
[0x1b50, 0x1b7c],
|
|
[0x1b80, 0x1bf3],
|
|
[0x1bfc, 0x1c37],
|
|
[0x1c3b, 0x1c49],
|
|
[0x1c4d, 0x1c88],
|
|
[0x1c90, 0x1cba],
|
|
[0x1cbd, 0x1cc7],
|
|
[0x1cd0, 0x1cfa],
|
|
[0x1d00, 0x1df9],
|
|
[0x1dfb, 0x1f15],
|
|
[0x1f18, 0x1f1d],
|
|
[0x1f20, 0x1f45],
|
|
[0x1f48, 0x1f4d],
|
|
[0x1f50, 0x1f57],
|
|
[0x1f59, 0x1f59],
|
|
[0x1f5b, 0x1f5b],
|
|
[0x1f5d, 0x1f5d],
|
|
[0x1f5f, 0x1f7d],
|
|
[0x1f80, 0x1fb4],
|
|
[0x1fb6, 0x1fc4],
|
|
[0x1fc6, 0x1fd3],
|
|
[0x1fd6, 0x1fdb],
|
|
[0x1fdd, 0x1fef],
|
|
[0x1ff2, 0x1ff4],
|
|
[0x1ff6, 0x1ffe],
|
|
[0x2000, 0x2064],
|
|
[0x2066, 0x2071],
|
|
[0x2074, 0x208e],
|
|
[0x2090, 0x209c],
|
|
[0x20a0, 0x20bf],
|
|
[0x20d0, 0x20f0],
|
|
[0x2100, 0x218b],
|
|
[0x2190, 0x2426],
|
|
[0x2440, 0x244a],
|
|
[0x2460, 0x2b73],
|
|
[0x2b76, 0x2b95],
|
|
[0x2b97, 0x2c2e],
|
|
[0x2c30, 0x2c5e],
|
|
[0x2c60, 0x2cf3],
|
|
[0x2cf9, 0x2d25],
|
|
[0x2d27, 0x2d27],
|
|
[0x2d2d, 0x2d2d],
|
|
[0x2d30, 0x2d67],
|
|
[0x2d6f, 0x2d70],
|
|
[0x2d7f, 0x2d96],
|
|
[0x2da0, 0x2da6],
|
|
[0x2da8, 0x2dae],
|
|
[0x2db0, 0x2db6],
|
|
[0x2db8, 0x2dbe],
|
|
[0x2dc0, 0x2dc6],
|
|
[0x2dc8, 0x2dce],
|
|
[0x2dd0, 0x2dd6],
|
|
[0x2dd8, 0x2dde],
|
|
[0x2de0, 0x2e52],
|
|
[0x2e80, 0x2e99],
|
|
[0x2e9b, 0x2ef3],
|
|
[0x2f00, 0x2fd5],
|
|
[0x2ff0, 0x2ffb],
|
|
[0x3000, 0x303f],
|
|
[0x3041, 0x3096],
|
|
[0x3099, 0x30ff],
|
|
[0x3105, 0x312f],
|
|
[0x3131, 0x318e],
|
|
[0x3190, 0x31e3],
|
|
[0x31f0, 0x321e],
|
|
[0x3220, 0x3400],
|
|
[0x4dbf, 0x4e00],
|
|
[0x9ffc, 0x9ffc],
|
|
[0xa000, 0xa48c],
|
|
[0xa490, 0xa4c6],
|
|
[0xa4d0, 0xa62b],
|
|
[0xa640, 0xa6f7],
|
|
[0xa700, 0xa7bf],
|
|
[0xa7c2, 0xa7ca],
|
|
[0xa7f5, 0xa82c],
|
|
[0xa830, 0xa839],
|
|
[0xa840, 0xa877],
|
|
[0xa880, 0xa8c5],
|
|
[0xa8ce, 0xa8d9],
|
|
[0xa8e0, 0xa953],
|
|
[0xa95f, 0xa97c],
|
|
[0xa980, 0xa9cd],
|
|
[0xa9cf, 0xa9d9],
|
|
[0xa9de, 0xa9fe],
|
|
[0xaa00, 0xaa36],
|
|
[0xaa40, 0xaa4d],
|
|
[0xaa50, 0xaa59],
|
|
[0xaa5c, 0xaac2],
|
|
[0xaadb, 0xaaf6],
|
|
[0xab01, 0xab06],
|
|
[0xab09, 0xab0e],
|
|
[0xab11, 0xab16],
|
|
[0xab20, 0xab26],
|
|
[0xab28, 0xab2e],
|
|
[0xab30, 0xab6b],
|
|
[0xab70, 0xabed],
|
|
[0xabf0, 0xabf9],
|
|
[0xac00, 0xac00],
|
|
[0xd7a3, 0xd7a3],
|
|
[0xd7b0, 0xd7c6],
|
|
[0xd7cb, 0xd7fb],
|
|
[0xd800, 0xd800],
|
|
[0xdb7f, 0xdb80],
|
|
[0xdbff, 0xdc00],
|
|
[0xdfff, 0xe000],
|
|
[0xf8ff, 0xfa6d],
|
|
[0xfa70, 0xfad9],
|
|
[0xfb00, 0xfb06],
|
|
[0xfb13, 0xfb17],
|
|
[0xfb1d, 0xfb36],
|
|
[0xfb38, 0xfb3c],
|
|
[0xfb3e, 0xfb3e],
|
|
[0xfb40, 0xfb41],
|
|
[0xfb43, 0xfb44],
|
|
[0xfb46, 0xfbc1],
|
|
[0xfbd3, 0xfd3f],
|
|
[0xfd50, 0xfd8f],
|
|
[0xfd92, 0xfdc7],
|
|
[0xfdf0, 0xfdfd],
|
|
[0xfe00, 0xfe19],
|
|
[0xfe20, 0xfe52],
|
|
[0xfe54, 0xfe66],
|
|
[0xfe68, 0xfe6b],
|
|
[0xfe70, 0xfe74],
|
|
[0xfe76, 0xfefc],
|
|
[0xfeff, 0xfeff],
|
|
[0xff01, 0xffbe],
|
|
[0xffc2, 0xffc7],
|
|
[0xffca, 0xffcf],
|
|
[0xffd2, 0xffd7],
|
|
[0xffda, 0xffdc],
|
|
[0xffe0, 0xffe6],
|
|
[0xffe8, 0xffee],
|
|
[0xfff9, 0xfffd],
|
|
[0x10000, 0x1000b],
|
|
[0x1000d, 0x10026],
|
|
[0x10028, 0x1003a],
|
|
[0x1003c, 0x1003d],
|
|
[0x1003f, 0x1004d],
|
|
[0x10050, 0x1005d],
|
|
[0x10080, 0x100fa],
|
|
[0x10100, 0x10102],
|
|
[0x10107, 0x10133],
|
|
[0x10137, 0x1018e],
|
|
[0x10190, 0x1019c],
|
|
[0x101a0, 0x101a0],
|
|
[0x101d0, 0x101fd],
|
|
[0x10280, 0x1029c],
|
|
[0x102a0, 0x102d0],
|
|
[0x102e0, 0x102fb],
|
|
[0x10300, 0x10323],
|
|
[0x1032d, 0x1034a],
|
|
[0x10350, 0x1037a],
|
|
[0x10380, 0x1039d],
|
|
[0x1039f, 0x103c3],
|
|
[0x103c8, 0x103d5],
|
|
[0x10400, 0x1049d],
|
|
[0x104a0, 0x104a9],
|
|
[0x104b0, 0x104d3],
|
|
[0x104d8, 0x104fb],
|
|
[0x10500, 0x10527],
|
|
[0x10530, 0x10563],
|
|
[0x1056f, 0x1056f],
|
|
[0x10600, 0x10736],
|
|
[0x10740, 0x10755],
|
|
[0x10760, 0x10767],
|
|
[0x10800, 0x10805],
|
|
[0x10808, 0x10808],
|
|
[0x1080a, 0x10835],
|
|
[0x10837, 0x10838],
|
|
[0x1083c, 0x1083c],
|
|
[0x1083f, 0x10855],
|
|
[0x10857, 0x1089e],
|
|
[0x108a7, 0x108af],
|
|
[0x108e0, 0x108f2],
|
|
[0x108f4, 0x108f5],
|
|
[0x108fb, 0x1091b],
|
|
[0x1091f, 0x10939],
|
|
[0x1093f, 0x1093f],
|
|
[0x10980, 0x109b7],
|
|
[0x109bc, 0x109cf],
|
|
[0x109d2, 0x10a03],
|
|
[0x10a05, 0x10a06],
|
|
[0x10a0c, 0x10a13],
|
|
[0x10a15, 0x10a17],
|
|
[0x10a19, 0x10a35],
|
|
[0x10a38, 0x10a3a],
|
|
[0x10a3f, 0x10a48],
|
|
[0x10a50, 0x10a58],
|
|
[0x10a60, 0x10a9f],
|
|
[0x10ac0, 0x10ae6],
|
|
[0x10aeb, 0x10af6],
|
|
[0x10b00, 0x10b35],
|
|
[0x10b39, 0x10b55],
|
|
[0x10b58, 0x10b72],
|
|
[0x10b78, 0x10b91],
|
|
[0x10b99, 0x10b9c],
|
|
[0x10ba9, 0x10baf],
|
|
[0x10c00, 0x10c48],
|
|
[0x10c80, 0x10cb2],
|
|
[0x10cc0, 0x10cf2],
|
|
[0x10cfa, 0x10d27],
|
|
[0x10d30, 0x10d39],
|
|
[0x10e60, 0x10e7e],
|
|
[0x10e80, 0x10ea9],
|
|
[0x10eab, 0x10ead],
|
|
[0x10eb0, 0x10eb1],
|
|
[0x10f00, 0x10f27],
|
|
[0x10f30, 0x10f59],
|
|
[0x10fb0, 0x10fcb],
|
|
[0x10fe0, 0x10ff6],
|
|
[0x11000, 0x1104d],
|
|
[0x11052, 0x1106f],
|
|
[0x1107f, 0x110c1],
|
|
[0x110cd, 0x110cd],
|
|
[0x110d0, 0x110e8],
|
|
[0x110f0, 0x110f9],
|
|
[0x11100, 0x11134],
|
|
[0x11136, 0x11147],
|
|
[0x11150, 0x11176],
|
|
[0x11180, 0x111df],
|
|
[0x111e1, 0x111f4],
|
|
[0x11200, 0x11211],
|
|
[0x11213, 0x1123e],
|
|
[0x11280, 0x11286],
|
|
[0x11288, 0x11288],
|
|
[0x1128a, 0x1128d],
|
|
[0x1128f, 0x1129d],
|
|
[0x1129f, 0x112a9],
|
|
[0x112b0, 0x112ea],
|
|
[0x112f0, 0x112f9],
|
|
[0x11300, 0x11303],
|
|
[0x11305, 0x1130c],
|
|
[0x1130f, 0x11310],
|
|
[0x11313, 0x11328],
|
|
[0x1132a, 0x11330],
|
|
[0x11332, 0x11333],
|
|
[0x11335, 0x11339],
|
|
[0x1133b, 0x11344],
|
|
[0x11347, 0x11348],
|
|
[0x1134b, 0x1134d],
|
|
[0x11350, 0x11350],
|
|
[0x11357, 0x11357],
|
|
[0x1135d, 0x11363],
|
|
[0x11366, 0x1136c],
|
|
[0x11370, 0x11374],
|
|
[0x11400, 0x1145b],
|
|
[0x1145d, 0x11461],
|
|
[0x11480, 0x114c7],
|
|
[0x114d0, 0x114d9],
|
|
[0x11580, 0x115b5],
|
|
[0x115b8, 0x115dd],
|
|
[0x11600, 0x11644],
|
|
[0x11650, 0x11659],
|
|
[0x11660, 0x1166c],
|
|
[0x11680, 0x116b8],
|
|
[0x116c0, 0x116c9],
|
|
[0x11700, 0x1171a],
|
|
[0x1171d, 0x1172b],
|
|
[0x11730, 0x1173f],
|
|
[0x11800, 0x1183b],
|
|
[0x118a0, 0x118f2],
|
|
[0x118ff, 0x11906],
|
|
[0x11909, 0x11909],
|
|
[0x1190c, 0x11913],
|
|
[0x11915, 0x11916],
|
|
[0x11918, 0x11935],
|
|
[0x11937, 0x11938],
|
|
[0x1193b, 0x11946],
|
|
[0x11950, 0x11959],
|
|
[0x119a0, 0x119a7],
|
|
[0x119aa, 0x119d7],
|
|
[0x119da, 0x119e4],
|
|
[0x11a00, 0x11a47],
|
|
[0x11a50, 0x11aa2],
|
|
[0x11ac0, 0x11af8],
|
|
[0x11c00, 0x11c08],
|
|
[0x11c0a, 0x11c36],
|
|
[0x11c38, 0x11c45],
|
|
[0x11c50, 0x11c6c],
|
|
[0x11c70, 0x11c8f],
|
|
[0x11c92, 0x11ca7],
|
|
[0x11ca9, 0x11cb6],
|
|
[0x11d00, 0x11d06],
|
|
[0x11d08, 0x11d09],
|
|
[0x11d0b, 0x11d36],
|
|
[0x11d3a, 0x11d3a],
|
|
[0x11d3c, 0x11d3d],
|
|
[0x11d3f, 0x11d47],
|
|
[0x11d50, 0x11d59],
|
|
[0x11d60, 0x11d65],
|
|
[0x11d67, 0x11d68],
|
|
[0x11d6a, 0x11d8e],
|
|
[0x11d90, 0x11d91],
|
|
[0x11d93, 0x11d98],
|
|
[0x11da0, 0x11da9],
|
|
[0x11ee0, 0x11ef8],
|
|
[0x11fb0, 0x11fb0],
|
|
[0x11fc0, 0x11ff1],
|
|
[0x11fff, 0x12399],
|
|
[0x12400, 0x1246e],
|
|
[0x12470, 0x12474],
|
|
[0x12480, 0x12543],
|
|
[0x13000, 0x1342e],
|
|
[0x13430, 0x13438],
|
|
[0x14400, 0x14646],
|
|
[0x16800, 0x16a38],
|
|
[0x16a40, 0x16a5e],
|
|
[0x16a60, 0x16a69],
|
|
[0x16a6e, 0x16a6f],
|
|
[0x16ad0, 0x16aed],
|
|
[0x16af0, 0x16af5],
|
|
[0x16b00, 0x16b45],
|
|
[0x16b50, 0x16b59],
|
|
[0x16b5b, 0x16b61],
|
|
[0x16b63, 0x16b77],
|
|
[0x16b7d, 0x16b8f],
|
|
[0x16e40, 0x16e9a],
|
|
[0x16f00, 0x16f4a],
|
|
[0x16f4f, 0x16f87],
|
|
[0x16f8f, 0x16f9f],
|
|
[0x16fe0, 0x16fe4],
|
|
[0x16ff0, 0x16ff1],
|
|
[0x17000, 0x17000],
|
|
[0x187f7, 0x187f7],
|
|
[0x18800, 0x18cd5],
|
|
[0x18d00, 0x18d00],
|
|
[0x18d08, 0x18d08],
|
|
[0x1b000, 0x1b11e],
|
|
[0x1b150, 0x1b152],
|
|
[0x1b164, 0x1b167],
|
|
[0x1b170, 0x1b2fb],
|
|
[0x1bc00, 0x1bc6a],
|
|
[0x1bc70, 0x1bc7c],
|
|
[0x1bc80, 0x1bc88],
|
|
[0x1bc90, 0x1bc99],
|
|
[0x1bc9c, 0x1bca3],
|
|
[0x1d000, 0x1d0f5],
|
|
[0x1d100, 0x1d126],
|
|
[0x1d129, 0x1d1e8],
|
|
[0x1d200, 0x1d245],
|
|
[0x1d2e0, 0x1d2f3],
|
|
[0x1d300, 0x1d356],
|
|
[0x1d360, 0x1d378],
|
|
[0x1d400, 0x1d454],
|
|
[0x1d456, 0x1d49c],
|
|
[0x1d49e, 0x1d49f],
|
|
[0x1d4a2, 0x1d4a2],
|
|
[0x1d4a5, 0x1d4a6],
|
|
[0x1d4a9, 0x1d4ac],
|
|
[0x1d4ae, 0x1d4b9],
|
|
[0x1d4bb, 0x1d4bb],
|
|
[0x1d4bd, 0x1d4c3],
|
|
[0x1d4c5, 0x1d505],
|
|
[0x1d507, 0x1d50a],
|
|
[0x1d50d, 0x1d514],
|
|
[0x1d516, 0x1d51c],
|
|
[0x1d51e, 0x1d539],
|
|
[0x1d53b, 0x1d53e],
|
|
[0x1d540, 0x1d544],
|
|
[0x1d546, 0x1d546],
|
|
[0x1d54a, 0x1d550],
|
|
[0x1d552, 0x1d6a5],
|
|
[0x1d6a8, 0x1d7cb],
|
|
[0x1d7ce, 0x1da8b],
|
|
[0x1da9b, 0x1da9f],
|
|
[0x1daa1, 0x1daaf],
|
|
[0x1e000, 0x1e006],
|
|
[0x1e008, 0x1e018],
|
|
[0x1e01b, 0x1e021],
|
|
[0x1e023, 0x1e024],
|
|
[0x1e026, 0x1e02a],
|
|
[0x1e100, 0x1e12c],
|
|
[0x1e130, 0x1e13d],
|
|
[0x1e140, 0x1e149],
|
|
[0x1e14e, 0x1e14f],
|
|
[0x1e2c0, 0x1e2f9],
|
|
[0x1e2ff, 0x1e2ff],
|
|
[0x1e800, 0x1e8c4],
|
|
[0x1e8c7, 0x1e8d6],
|
|
[0x1e900, 0x1e94b],
|
|
[0x1e950, 0x1e959],
|
|
[0x1e95e, 0x1e95f],
|
|
[0x1ec71, 0x1ecb4],
|
|
[0x1ed01, 0x1ed3d],
|
|
[0x1ee00, 0x1ee03],
|
|
[0x1ee05, 0x1ee1f],
|
|
[0x1ee21, 0x1ee22],
|
|
[0x1ee24, 0x1ee24],
|
|
[0x1ee27, 0x1ee27],
|
|
[0x1ee29, 0x1ee32],
|
|
[0x1ee34, 0x1ee37],
|
|
[0x1ee39, 0x1ee39],
|
|
[0x1ee3b, 0x1ee3b],
|
|
[0x1ee42, 0x1ee42],
|
|
[0x1ee47, 0x1ee47],
|
|
[0x1ee49, 0x1ee49],
|
|
[0x1ee4b, 0x1ee4b],
|
|
[0x1ee4d, 0x1ee4f],
|
|
[0x1ee51, 0x1ee52],
|
|
[0x1ee54, 0x1ee54],
|
|
[0x1ee57, 0x1ee57],
|
|
[0x1ee59, 0x1ee59],
|
|
[0x1ee5b, 0x1ee5b],
|
|
[0x1ee5d, 0x1ee5d],
|
|
[0x1ee5f, 0x1ee5f],
|
|
[0x1ee61, 0x1ee62],
|
|
[0x1ee64, 0x1ee64],
|
|
[0x1ee67, 0x1ee6a],
|
|
[0x1ee6c, 0x1ee72],
|
|
[0x1ee74, 0x1ee77],
|
|
[0x1ee79, 0x1ee7c],
|
|
[0x1ee7e, 0x1ee7e],
|
|
[0x1ee80, 0x1ee89],
|
|
[0x1ee8b, 0x1ee9b],
|
|
[0x1eea1, 0x1eea3],
|
|
[0x1eea5, 0x1eea9],
|
|
[0x1eeab, 0x1eebb],
|
|
[0x1eef0, 0x1eef1],
|
|
[0x1f000, 0x1f02b],
|
|
[0x1f030, 0x1f093],
|
|
[0x1f0a0, 0x1f0ae],
|
|
[0x1f0b1, 0x1f0bf],
|
|
[0x1f0c1, 0x1f0cf],
|
|
[0x1f0d1, 0x1f0f5],
|
|
[0x1f100, 0x1f1ad],
|
|
[0x1f1e6, 0x1f202],
|
|
[0x1f210, 0x1f23b],
|
|
[0x1f240, 0x1f248],
|
|
[0x1f250, 0x1f251],
|
|
[0x1f260, 0x1f265],
|
|
[0x1f300, 0x1f6d7],
|
|
[0x1f6e0, 0x1f6ec],
|
|
[0x1f6f0, 0x1f6fc],
|
|
[0x1f700, 0x1f773],
|
|
[0x1f780, 0x1f7d8],
|
|
[0x1f7e0, 0x1f7eb],
|
|
[0x1f800, 0x1f80b],
|
|
[0x1f810, 0x1f847],
|
|
[0x1f850, 0x1f859],
|
|
[0x1f860, 0x1f887],
|
|
[0x1f890, 0x1f8ad],
|
|
[0x1f8b0, 0x1f8b1],
|
|
[0x1f900, 0x1f978],
|
|
[0x1f97a, 0x1f9cb],
|
|
[0x1f9cd, 0x1fa53],
|
|
[0x1fa60, 0x1fa6d],
|
|
[0x1fa70, 0x1fa74],
|
|
[0x1fa78, 0x1fa7a],
|
|
[0x1fa80, 0x1fa86],
|
|
[0x1fa90, 0x1faa8],
|
|
[0x1fab0, 0x1fab6],
|
|
[0x1fac0, 0x1fac2],
|
|
[0x1fad0, 0x1fad6],
|
|
[0x1fb00, 0x1fb92],
|
|
[0x1fb94, 0x1fbca],
|
|
[0x1fbf0, 0x1fbf9],
|
|
[0x20000, 0x20000],
|
|
[0x2a6dd, 0x2a6dd],
|
|
[0x2a700, 0x2a700],
|
|
[0x2b734, 0x2b734],
|
|
[0x2b740, 0x2b740],
|
|
[0x2b81d, 0x2b81d],
|
|
[0x2b820, 0x2b820],
|
|
[0x2cea1, 0x2cea1],
|
|
[0x2ceb0, 0x2ceb0],
|
|
[0x2ebe0, 0x2ebe0],
|
|
[0x2f800, 0x2fa1d],
|
|
[0x30000, 0x30000],
|
|
[0x3134a, 0x3134a],
|
|
[0xe0001, 0xe0001],
|
|
[0xe0020, 0xe007f],
|
|
[0xe0100, 0xe01ef],
|
|
[0xf0000, 0xf0000],
|
|
[0xffffd, 0xffffd],
|
|
[0x100000, 0x100000],
|
|
[0x10fffd, 0x10fffd]];
|
|
|
|
// Set of codepoints used by the tests below. Each key is the codepoint packed
// as 4 big-endian bytes (i.e. its UTF-32BE encoding); the value is a `true` marker.
$validCodepoints = [];

foreach ($validRanges as [$first, $last]) {
    for ($codepoint = $first; $codepoint <= $last; $codepoint++) {
        // Leave out the surrogate range U+D800..U+DFFF
        if ($codepoint >= 0xD800 && $codepoint <= 0xDFFF) {
            continue;
        }
        // U+FEFF is left out as well (presumably because it doubles as the byte order mark)
        if ($codepoint === 0xFEFF) {
            continue;
        }
        $validCodepoints[pack('N', $codepoint)] = true;
    }
}
|
|
|
|
/**
 * Push every codepoint in the global $validCodepoints set through $encoding.
 *
 * The UTF-32BE keys are shuffled and consumed from the end of the list in
 * batches of 20-30 codepoints (the final batch takes whatever is left). Each
 * batch is concatenated into one UTF-32BE string, converted to $encoding with
 * mb_convert_encoding(), and passed to testValidString() (defined outside this
 * file's visible code) together with the original string.
 *
 * Aborts the script via die() if mb_convert_encoding() returns false.
 *
 * @param string $encoding Name of the encoding under test
 */
function testValidCodepoints($encoding) {
    global $validCodepoints;

    $remaining = array_keys($validCodepoints);
    shuffle($remaining);

    while (count($remaining) > 0) {
        $batchSize = min(rand(20, 30), count($remaining));

        // Cut the batch off the tail of the list; reverse it so the characters
        // are concatenated last-element-first
        $batch = array_splice($remaining, -$batchSize);
        $string = implode('', array_reverse($batch));

        $converted = mb_convert_encoding($string, $encoding, 'UTF-32BE');
        if ($converted === false) {
            die("mb_convert_encoding failed to convert UTF-32BE to $encoding." .
                "\nString: " . bin2hex($string));
        }

        testValidString($converted, $string, $encoding, 'UTF-32BE');
    }
}
|
|
|
|
/**
 * Check a table of invalid byte sequences for $encoding.
 *
 * For every entry, one valid codepoint (taken from a shuffled copy of the
 * global $validCodepoints keys) is converted to $encoding and appended to the
 * invalid bytes. testInvalidString() (defined outside this file's visible
 * code) then receives that input along with the expected UTF-32BE result:
 * the entry's expected output followed by the same valid codepoint.
 *
 * @param array  $invalid  Map of invalid input bytes => expected UTF-32BE output
 * @param string $encoding Name of the encoding under test
 */
function testInvalidCodepoints($invalid, $encoding) {
    global $validCodepoints;

    $pool = array_keys($validCodepoints);
    shuffle($pool);

    foreach ($invalid as $bad => $expected) {
        // A valid trailing character, already in UTF-32BE form
        $trailer = array_pop($pool);

        $input = $bad . mb_convert_encoding($trailer, $encoding, 'UTF-32BE');
        $output = $expected . $trailer;

        testInvalidString($input, $output, $encoding, 'UTF-32BE');
    }
}
|
|
|
|
echo "== UTF-8 ==\n";
|
|
|
|
testValidCodepoints('UTF-8');
|
|
|
|
testValidString('', '', 'UTF-8', 'UTF-32BE');
|
|
|
|
$invalid = array(
|
|
// Codepoints outside of valid 0-0x10FFFF range for Unicode
|
|
"\xF4\x90\x80\x80" => str_repeat("\x00\x00\x00%", 4), // CP 0x110000
|
|
"\xF7\x80\x80\x80" => str_repeat("\x00\x00\x00%", 4), // CP 0x1C0000
|
|
"\xF7\xBF\xBF\xBF" => str_repeat("\x00\x00\x00%", 4), // CP 0x1FFFFF
|
|
|
|
// Reserved range for UTF-16 surrogate pairs
|
|
"\xED\xA0\x80" => str_repeat("\x00\x00\x00%", 3), // CP 0xD800
|
|
"\xED\xAF\xBF" => str_repeat("\x00\x00\x00%", 3), // CP 0xDBFF
|
|
"\xED\xBF\xBF" => str_repeat("\x00\x00\x00%", 3), // CP 0xDFFF
|
|
|
|
// Truncated characters
|
|
"\xDF" => "\x00\x00\x00%", // should have been 2-byte
|
|
"\xEF\xBF" => "\x00\x00\x00%", // should have been 3-byte
|
|
"\xF0\xBF\xBF" => "\x00\x00\x00%", // should have been 4-byte
|
|
"\xF1\x96" => "\x00\x00\x00%",
|
|
"\xF1\x96\x80" => "\x00\x00\x00%",
|
|
"\xF2\x94" => "\x00\x00\x00%",
|
|
"\xF2\x94\x80" => "\x00\x00\x00%",
|
|
"\xF3\x94" => "\x00\x00\x00%",
|
|
"\xF3\x94\x80" => "\x00\x00\x00%",
|
|
"\xE0\x9F" => "\x00\x00\x00%\x00\x00\x00%",
|
|
"\xED\xA6" => "\x00\x00\x00%\x00\x00\x00%",
|
|
|
|
// Multi-byte characters which end too soon and go to ASCII
|
|
"\xDFA" => "\x00\x00\x00%\x00\x00\x00A",
|
|
"\xEF\xBFA" => "\x00\x00\x00%\x00\x00\x00A",
|
|
"\xF0\xBFA" => "\x00\x00\x00%\x00\x00\x00A",
|
|
"\xF0\xBF\xBFA" => "\x00\x00\x00%\x00\x00\x00A",
|
|
|
|
// Multi-byte characters which end too soon and go to another MB char
|
|
"\xDF\xDF\xBF" => "\x00\x00\x00%\x00\x00\x07\xFF",
|
|
"\xEF\xBF\xDF\xBF" => "\x00\x00\x00%\x00\x00\x07\xFF",
|
|
"\xF0\xBF\xBF\xDF\xBF" => "\x00\x00\x00%\x00\x00\x07\xFF",
|
|
|
|
// Multi-byte characters which end too soon and go to a junk byte
|
|
// (Which isn't even valid to start a new character)
|
|
"\xF0\xBF\xBF\xFF" => str_repeat("\x00\x00\x00%", 2),
|
|
"\xF0\xBF\xFF" => str_repeat("\x00\x00\x00%", 2),
|
|
|
|
// Continuation bytes which appear outside of a MB char
|
|
"\x80" => "\x00\x00\x00%",
|
|
"A\x80" => "\x00\x00\x00A\x00\x00\x00%",
|
|
"\xDF\xBF\x80" => "\x00\x00\x07\xFF\x00\x00\x00%",
|
|
|
|
// Overlong code units
|
|
// (Using more bytes than needed to encode a character)
|
|
"\xC1\xBF" => str_repeat("\x00\x00\x00%", 2), // didn't need 2 bytes
|
|
"\xE0\x9F\xBF" => str_repeat("\x00\x00\x00%", 3), // didn't need 3 bytes
|
|
"\xF0\x8F\xBF\xBF" => str_repeat("\x00\x00\x00%", 4) // didn't need 4 bytes
|
|
);
|
|
|
|
testInvalidCodepoints($invalid, 'UTF-8');
|
|
|
|
echo "== UTF-16 ==\n";
|
|
|
|
testValidCodepoints("UTF-16");
|
|
testValidCodepoints("UTF-16LE");
|
|
testValidCodepoints("UTF-16BE");
|
|
|
|
testValidString('', '', 'UTF-16', 'UTF-32BE');
|
|
testValidString('', '', 'UTF-16LE', 'UTF-32BE');
|
|
testValidString('', '', 'UTF-16BE', 'UTF-32BE');
|
|
|
|
$invalid = array(
|
|
// UTF-16 _cannot_ represent codepoints bigger than 0x10FFFF, so we're not
|
|
// worried about that. But there are plenty of other ways to mess up...
|
|
|
|
// Second half of surrogate pair comes first
|
|
"\xDC\x01\xD8\x02" => "\x00\x00\x00%\x00\x00\x00%",
|
|
|
|
// First half of surrogate pair not followed by second part
|
|
"\xD8\x01\x00A" => "\x00\x00\x00%\x00\x00\x00A",
|
|
|
|
// First half of surrogate pair at end of string
|
|
"\xD8\x01" => "\x00\x00\x00%",
|
|
);
|
|
|
|
testInvalidCodepoints($invalid, 'UTF-16');
|
|
testInvalidCodepoints($invalid, 'UTF-16BE');
|
|
|
|
// Truncated strings
|
|
testInvalidString("\x00", "\x00\x00\x00%", 'UTF-16', 'UTF-32BE');
|
|
testInvalidString("\x00A\x01", "\x00\x00\x00A\x00\x00\x00%", 'UTF-16', 'UTF-32BE');
|
|
testInvalidString("\x00", "\x00\x00\x00%", 'UTF-16BE', 'UTF-32BE');
|
|
testInvalidString("\x00A\x01", "\x00\x00\x00A\x00\x00\x00%", 'UTF-16BE', 'UTF-32BE');
|
|
|
|
$invalid = array(
|
|
// Second half of surrogate pair comes first
|
|
"\x01\xDC\x02\xD8" => "\x00\x00\x00%\x00\x00\x00%",
|
|
|
|
// First half of surrogate pair not followed by second part
|
|
"\x01\xD8A\x00" => "\x00\x00\x00%\x00\x00\x00A",
|
|
|
|
// First half of surrogate pair at end of string
|
|
"\x01\xD8" => "\x00\x00\x00%",
|
|
|
|
// Two successive codepoints which are both the 1st part of a surrogate pair
|
|
"\x01\xD8\x02\xD8" => "\x00\x00\x00%\x00\x00\x00%"
|
|
);
|
|
|
|
testInvalidCodepoints($invalid, 'UTF-16LE');
|
|
|
|
// Truncated
|
|
testInvalidString("\x00", "\x00\x00\x00%", 'UTF-16LE', 'UTF-32BE');
|
|
testInvalidString("A\x00\x01", "\x00\x00\x00A\x00\x00\x00%", 'UTF-16LE', 'UTF-32BE');
|
|
|
|
// Test treatment of BOM
|
|
testValidString("\xFE\xFF\x12\x34", "\x00\x00\x12\x34", 'UTF-16', 'UTF-32BE', false);
|
|
testValidString("\xFF\xFE\x12\x34", "\x00\x00\x34\x12", 'UTF-16', 'UTF-32BE', false);
|
|
|
|
// Test treatment of (illegal) codepoints between U+D800 and U+DFFF
|
|
testValidString("\xD8\x00", "\xD8\x00", 'UCS-2BE', 'UTF-16BE', false);
|
|
testValidString("\xDB\xFF", "\xDB\xFF", 'UCS-2BE', 'UTF-16BE', false);
|
|
testValidString("\xDC\x00", "\xDC\x00", 'UCS-2BE', 'UTF-16BE', false);
|
|
testValidString("\xD8\x00", "\x00\xD8", 'UCS-2BE', 'UTF-16LE', false);
|
|
testValidString("\xDC\x00", "\x00\xDC", 'UCS-2BE', 'UTF-16LE', false);
|
|
|
|
// Try codepoint over U+10FFFF
|
|
convertInvalidString("\x00\x11\x56\x78", "\x00%", 'UCS-4BE', 'UTF-16BE');
|
|
convertInvalidString("\x00\x11\x56\x78", "%\x00", 'UCS-4BE', 'UTF-16LE');
|
|
|
|
echo "== UTF-32 ==\n";
|
|
|
|
testValidCodepoints("UTF-32LE");
|
|
testValidCodepoints("UTF-32BE");
|
|
|
|
// Empty string
|
|
testValidString('', '', 'UTF-32', 'UTF-32BE');
|
|
testValidString('', '', 'UTF-32BE', 'UTF-32');
|
|
testValidString('', '', 'UTF-32LE', 'UTF-32BE');
|
|
|
|
$invalid = array(
|
|
// Codepoints which are too big
|
|
"\x00\x11\x00\x00" => "\x00\x00\x00%",
|
|
"\x80\x00\x00\x00" => "\x00\x00\x00%",
|
|
"\xff\xff\xfe\xff" => "\x00\x00\x00%",
|
|
|
|
// Surrogates
|
|
"\x00\x00\xd8\x00" => "\x00\x00\x00%",
|
|
"\x00\x00\xdb\xff" => "\x00\x00\x00%",
|
|
"\x00\x00\xdc\x00" => "\x00\x00\x00%",
|
|
"\x00\x00\xdf\xff" => "\x00\x00\x00%",
|
|
);
|
|
|
|
testInvalidCodepoints($invalid, 'UTF-32');
|
|
testInvalidCodepoints($invalid, 'UTF-32BE');
|
|
|
|
// Truncated code units
// A trailing fragment of 3, 2 or 1 bytes is expected to produce a single error
// marker. The same three lengths are exercised with 'UTF-32' and with
// 'UTF-32BE' as the source encoding (previously the UTF-32BE cases repeated
// the 1-byte input three times, leaving the 2- and 3-byte cases untested).
testInvalidString("\x00\x01\x01", "\x00\x00\x00%", 'UTF-32', 'UTF-32BE');
testInvalidString("\x00\x01", "\x00\x00\x00%", 'UTF-32', 'UTF-32BE');
testInvalidString("\x00", "\x00\x00\x00%", 'UTF-32', 'UTF-32BE');
testInvalidString("\x00\x01\x01", "\x00\x00\x00%", 'UTF-32BE', 'UTF-32');
testInvalidString("\x00\x01", "\x00\x00\x00%", 'UTF-32BE', 'UTF-32');
testInvalidString("\x00", "\x00\x00\x00%", 'UTF-32BE', 'UTF-32');
|
|
|
|
$invalid = array(
|
|
// Codepoints which are too big
|
|
"\x00\x00\x11\x00" => "\x00\x00\x00%",
|
|
"\x00\x00\x00\x80" => "\x00\x00\x00%",
|
|
"\xff\xfe\xff\xff" => "\x00\x00\x00%",
|
|
|
|
// Surrogates
|
|
"\x00\xd8\x00\x00" => "\x00\x00\x00%",
|
|
"\xff\xdb\x00\x00" => "\x00\x00\x00%",
|
|
"\x00\xdc\x00\x00" => "\x00\x00\x00%",
|
|
"\xff\xdf\x00\x00" => "\x00\x00\x00%",
|
|
);
|
|
|
|
testInvalidCodepoints($invalid, 'UTF-32LE');
|
|
|
|
// Truncated code units
|
|
testInvalidString("\x00\x01\x01", "\x00\x00\x00%", 'UTF-32LE', 'UTF-32BE');
|
|
testInvalidString("\x00\x01", "\x00\x00\x00%", 'UTF-32LE', 'UTF-32BE');
|
|
testInvalidString("\x00", "\x00\x00\x00%", 'UTF-32LE', 'UTF-32BE');
|
|
|
|
// Test treatment of BOM
|
|
testValidString("\x00\x00\xFE\xFF\x00\x00\x12\x34", "\x00\x00\x12\x34", 'UTF-32', 'UTF-32BE', false);
|
|
testValidString("\xFF\xFE\x00\x00\x12\x34\x00\x00", "\x00\x00\x34\x12", 'UTF-32', 'UTF-32BE', false);
|
|
|
|
// Test treatment of (illegal) codepoints between U+D800 and U+DFFF
|
|
testValidString("\xD8\x00", "\x00\x00\xD8\x00", 'UCS-2BE', 'UTF-32BE', false);
|
|
testValidString("\xDB\xFF", "\x00\x00\xDB\xFF", 'UCS-2BE', 'UTF-32BE', false);
|
|
testValidString("\xDC\x00", "\x00\x00\xDC\x00", 'UCS-2BE', 'UTF-32BE', false);
|
|
testValidString("\xD8\x00", "\x00\xD8\x00\x00", 'UCS-2BE', 'UTF-32LE', false);
|
|
testValidString("\xDC\x00", "\x00\xDC\x00\x00", 'UCS-2BE', 'UTF-32LE', false);
|
|
|
|
echo "== UTF-7 ==\n";
|
|
|
|
testValidString('', '', 'UTF-7', 'UTF-32BE');
|
|
|
|
// 'Direct' characters
|
|
foreach (range(ord('A'), ord('Z')) as $byte)
|
|
testValidString(chr($byte), "\x00\x00\x00" . chr($byte), 'UTF-7', 'UTF-32BE');
|
|
foreach (range(ord('a'), ord('z')) as $byte)
|
|
testValidString(chr($byte), "\x00\x00\x00" . chr($byte), 'UTF-7', 'UTF-32BE');
|
|
foreach (range(ord('0'), ord('9')) as $byte)
|
|
testValidString(chr($byte), "\x00\x00\x00" . chr($byte), 'UTF-7', 'UTF-32BE');
|
|
foreach (str_split("'(),-./:?") as $char)
|
|
testValidString($char, "\x00\x00\x00" . $char, 'UTF-7', 'UTF-32BE');
|
|
|
|
// 'Optional direct' characters are Base64-encoded in mbstring's implementation
|
|
|
|
// Whitespace
|
|
foreach (str_split(" \t\r\n\x00") as $char)
|
|
testValidString($char, "\x00\x00\x00" . $char, 'UTF-7', 'UTF-32BE');
|
|
|
|
// Encoding + as +-
|
|
testValidString('+-', "\x00\x00\x00+", 'UTF-7', 'UTF-32BE', false);
|
|
|
|
// UTF-16 + Base64 encoding
|
|
/**
 * Convert $str from $encoding to UTF-16BE and Base64-encode the result,
 * without the trailing '=' padding.
 *
 * @param string $str      Text to encode
 * @param string $encoding Encoding that $str is currently in
 * @return string Unpadded Base64 text
 */
function encode($str, $encoding) {
    $utf16 = mb_convert_encoding($str, 'UTF-16BE', $encoding);

    // Base64 encoding for UTF-7 doesn't use '=' for padding
    return rtrim(base64_encode($utf16), '=');
}
|
|
|
|
// Each byte value 0-255, written as the UTF-16BE code unit "\x00<byte>" inside
// a '+...-' Base64 section, is checked against "\x00\x00\x00<byte>" in UTF-32BE.
// $reversible (the last argument to testValidString) is false for ASCII
// letters, digits, and the characters listed in the strpos() haystack below.
for ($i = 0; $i < 256; $i++) {
    $isAlnum = ($i >= ord('A') && $i <= ord('Z'))
        || ($i >= ord('a') && $i <= ord('z'))
        || ($i >= ord('0') && $i <= ord('9'));
    $isListed = strpos("'(),-./:?\x00 \t\r\n", chr($i)) !== false;
    $reversible = !($isAlnum || $isListed);

    testValidString('+' . encode("\x00" . chr($i), 'UTF-16BE') . '-', "\x00\x00\x00" . chr($i), 'UTF-7', 'UTF-32BE', $reversible);
}
|
|
|
|
testValidString('+' . encode("\x12\x34", 'UTF-16BE') . '-', "\x00\x00\x12\x34", 'UTF-7', 'UTF-32BE');
|
|
testValidString('+' . encode("\x12\x34\x56\x78", 'UTF-16BE') . '-', "\x00\x00\x12\x34\x00\x00\x56\x78", 'UTF-7', 'UTF-32BE');
|
|
testValidString('+' . encode("\x12\x34\x56\x78\x00\x40", 'UTF-16BE') . '-', "\x00\x00\x12\x34\x00\x00\x56\x78\x00\x00\x00\x40", 'UTF-7', 'UTF-32BE');
|
|
testValidString('+' . encode("\xFF\xEE\xEE\xFF", 'UTF-16BE') . '-', "\x00\x00\xFF\xEE\x00\x00\xEE\xFF", 'UTF-7', 'UTF-32BE');
|
|
|
|
// Surrogate pair
|
|
testValidString('+' . encode("\x00\x01\x04\x00", 'UTF-32BE') . '-', "\x00\x01\x04\x00", 'UTF-7', 'UTF-32BE');
|
|
testValidString('+' . encode("\x00\x00\x00A\x00\x01\x04\x00\x00\x00\x00B", 'UTF-32BE') . '-', "\x00\x00\x00A\x00\x01\x04\x00\x00\x00\x00B", 'UTF-7', 'UTF-32BE', false);
|
|
testValidString('+' . encode("\x00\x01\x04\x00\x00\x01\x04\x00", 'UTF-32BE') . '-', "\x00\x01\x04\x00\x00\x01\x04\x00", 'UTF-7', 'UTF-32BE');
|
|
|
|
// Unterminated + section
|
|
// (This is not considered illegal)
|
|
testValidString('+' . encode('ABC', 'ASCII'), "\x00A\x00B\x00C", 'UTF-7', 'UTF-16BE', false);
|
|
|
|
// + sections immediately after each other
|
|
// (This isn't illegal either)
|
|
testValidString('+' . encode('AB', 'ASCII') . '-+' . encode('CD', 'ASCII') . '-', "\x00A\x00B\x00C\x00D", 'UTF-7', 'UTF-16BE', false);
|
|
|
|
// + sections not immediately after each other
|
|
// (Just trying to be exhaustive here)
|
|
testValidString('+' . encode('AB', 'ASCII') . '-!+' . encode('CD', 'ASCII') . '-', "\x00A\x00B\x00!\x00C\x00D", 'UTF-7', 'UTF-16BE', false);
|
|
|
|
// + section terminated by a non-Base64 ASCII character which is NOT -
|
|
for ($i = 0; $i < 128; $i++) {
    // Skip ASCII letters and digits, plus '+', '/', '-', '\' and '~'; every
    // other 7-bit byte is used to end the Base64 section
    $isAlnum = ($i >= ord('A') && $i <= ord('Z'))
        || ($i >= ord('a') && $i <= ord('z'))
        || ($i >= ord('0') && $i <= ord('9'));
    if ($isAlnum || in_array($i, [ord('+'), ord('/'), ord('-'), ord('\\'), ord('~')], true)) {
        continue;
    }

    $char = chr($i);
    testValidString('+' . encode("\x12\x34", 'UTF-16BE') . $char, "\x00\x00\x12\x34\x00\x00\x00" . $char, 'UTF-7', 'UTF-32BE', false);
}
|
|
|
|
// Non-direct character followed by direct character
|
|
testValidString('%A', '+ACU-A', 'ASCII', 'UTF-7');
|
|
testValidString('%%A', '+ACUAJQ-A', 'ASCII', 'UTF-7');
|
|
testValidString('%%%A', '+ACUAJQAl-A', 'ASCII', 'UTF-7');
|
|
|
|
// Now let's see how UTF-7 can go BAD...
|
|
|
|
/**
 * Base64-encode the raw bytes of $str and drop the trailing '=' padding.
 * No character-set conversion is applied to $str first.
 *
 * @param string $str Bytes to encode
 * @return string Unpadded Base64 text
 */
function rawEncode($str) {
    $base64 = base64_encode($str);
    return rtrim($base64, '=');
}
|
|
|
|
// Totally bogus byte
|
|
testInvalidString("\xFF", "%", 'UTF-7', 'UTF-8');
|
|
// Totally bogus codepoint... '+ACU-' is '%' in UTF-7
|
|
testInvalidString("\x12\x34\x56\x78", "+ACU-", 'UTF-32BE', 'UTF-7');
|
|
|
|
// First, messed up UTF16 in + section
|
|
// Second half of surrogate pair coming first
|
|
testInvalidString('+' . rawEncode("\xDC\x01\xD8\x02") . '-', "\x00\x00\x00%\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
|
|
testInvalidString('+' . rawEncode("\x00.\xDC\x01\xD8\x02") . '-', "\x00\x00\x00.\x00\x00\x00%\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
|
|
testInvalidString('+' . rawEncode("\x00.\x00.\xDC\x01\xD8\x02") . '-', "\x00\x00\x00.\x00\x00\x00.\x00\x00\x00%\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
|
|
|
|
// First half of surrogate pair not followed by second half
|
|
testInvalidString('+' . rawEncode("\xD8\x01\x00A") . '-', "\x00\x00\x00%\x00\x00\x00A", 'UTF-7', 'UTF-32BE');
|
|
testInvalidString('+' . rawEncode("\xD8\x01\xD9\x02") . '-', "\x00\x00\x00%\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
|
|
testInvalidString('+' . rawEncode("\x00.\xD8\x01\x00A") . '-', "\x00\x00\x00.\x00\x00\x00%\x00\x00\x00A", 'UTF-7', 'UTF-32BE');
|
|
testInvalidString('+' . rawEncode("\x00.\xD8\x01\xD9\x02") . '-', "\x00\x00\x00.\x00\x00\x00%\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
|
|
testInvalidString('+' . rawEncode("\x00.\x00.\xD8\x01\x00A") . '-', "\x00\x00\x00.\x00\x00\x00.\x00\x00\x00%\x00\x00\x00A", 'UTF-7', 'UTF-32BE');
|
|
testInvalidString('+' . rawEncode("\x00.\x00.\xD8\x01\xD9\x02") . '-', "\x00\x00\x00.\x00\x00\x00.\x00\x00\x00%\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
|
|
|
|
// First half of surrogate pair appearing at end of string
|
|
testInvalidString('+' . rawEncode("\xD8\x01") . '-', "\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
|
|
testInvalidString('+' . rawEncode("\xD8\x01"), "\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
|
|
|
|
// Truncated string
|
|
testInvalidString('+' . rawEncode("\x01") . '-', "\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
|
|
testInvalidString('+l', "\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
|
|
|
|
// And then, messed up Base64 encoding
|
|
|
|
// Bad padding on + section (not zeroes)
|
|
$encoded = encode("\x12\x34", 'UTF-16BE'); // 3 Base64 bytes, 2 bits of padding...
|
|
$corrupted = substr($encoded, 0, 2) . chr(ord($encoded[2]) + 1);
|
|
testInvalidString('+' . $corrupted . '-', "\x00\x00\x12\x34\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
|
|
|
|
// Characters which are not Base64 (and not even ASCII) appearing in Base64 section
|
|
testInvalidString("+\x80", "\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
|
|
|
|
// Try codepoint over U+10FFFF; '+ACU-' is the error marker '%'
|
|
convertInvalidString("\x12\x34\x56\x78", "+ACU-", 'UCS-4BE', 'UTF-7');
|
|
convertInvalidString("\x00\x11\x56\x78", "+ACU-", 'UCS-4BE', 'UTF-7');
|
|
|
|
// If error marker character needs to be ASCII-encoded but is able to serve as an
|
|
// ending character for a Base64 section, no need to add an additional dash
|
|
mb_substitute_character(0x3F); // ?
|
|
convertInvalidString("\x1E\xBE", '+AB4?', 'UTF-7', 'UTF-7');
|
|
|
|
echo "Done!\n";
|
|
|
|
?>
|
|
--EXPECT--
|
|
== UTF-8 ==
|
|
== UTF-16 ==
|
|
== UTF-32 ==
|
|
== UTF-7 ==
|
|
Done!
|