[JsonPath] Fix parsing invalid Unicode codepoints

This commit is contained in:
Nicolas Grekas
2025-07-22 13:33:58 +02:00
parent 5d2b68ba70
commit dbe072242b
2 changed files with 24 additions and 20 deletions

View File

@@ -12,6 +12,7 @@
namespace Symfony\Component\JsonPath;
use Symfony\Component\JsonPath\Exception\InvalidArgumentException;
use Symfony\Component\JsonPath\Exception\InvalidJsonPathException;
use Symfony\Component\JsonPath\Exception\InvalidJsonStringInputException;
use Symfony\Component\JsonPath\Exception\JsonCrawlerException;
use Symfony\Component\JsonPath\Tokenizer\JsonPathToken;
@@ -83,7 +84,7 @@ final class JsonCrawler implements JsonCrawlerInterface
return $this->evaluateTokensOnDecodedData($tokens, $data);
} catch (InvalidArgumentException $e) {
throw $e;
} catch (\Throwable $e) {
} catch (InvalidJsonPathException $e) {
throw new JsonCrawlerException($query, $e->getMessage(), previous: $e);
}
}
@@ -329,7 +330,7 @@ final class JsonCrawler implements JsonCrawlerInterface
return \array_key_exists($key, $value) ? [$value[$key]] : [];
}
throw new \LogicException(\sprintf('Unsupported bracket expression "%s".', $expr));
throw new InvalidJsonPathException(\sprintf('Unsupported bracket expression "%s".', $expr));
}
private function evaluateFilter(string $expr, mixed $value): array

View File

@@ -117,7 +117,7 @@ final class JsonPathUtils
't' => "\t",
'u' => self::unescapeUnicodeSequence($str, $i),
$quoteChar => $quoteChar,
default => throw new JsonCrawlerException('', \sprintf('Invalid escape sequence "\\%s" in %s-quoted string', $str[$i + 1], "'" === $quoteChar ? 'single' : 'double')),
default => throw new JsonCrawlerException('', \sprintf('Invalid escape sequence "\\%s" in %s-quoted string.', $str[$i + 1], "'" === $quoteChar ? 'single' : 'double')),
};
++$i;
@@ -132,30 +132,33 @@ final class JsonPathUtils
private static function unescapeUnicodeSequence(string $str, int &$i): string
{
if (!isset($str[$i + 5]) || !ctype_xdigit(substr($str, $i + 2, 4))) {
throw new JsonCrawlerException('', 'Invalid unicode escape sequence');
throw new JsonCrawlerException('', 'Invalid unicode escape sequence.');
}
$hex = substr($str, $i + 2, 4);
$codepoint = hexdec(substr($str, $i + 2, 4));
$codepoint = hexdec($hex);
// looks like a valid Unicode codepoint, string length is sufficient and it starts with \u
if (0xD800 <= $codepoint && $codepoint <= 0xDBFF && isset($str[$i + 11]) && '\\' === $str[$i + 6] && 'u' === $str[$i + 7]) {
$lowHex = substr($str, $i + 8, 4);
if (ctype_xdigit($lowHex)) {
$lowSurrogate = hexdec($lowHex);
if (0xDC00 <= $lowSurrogate && $lowSurrogate <= 0xDFFF) {
$codepoint = 0x10000 + (($codepoint & 0x3FF) << 10) + ($lowSurrogate & 0x3FF);
$i += 10; // skip surrogate pair
return mb_chr($codepoint, 'UTF-8');
}
}
if (0xD800 <= $codepoint
&& $codepoint <= 0xDBFF
&& isset($str[$i + 11])
&& '\\' === $str[$i + 6]
&& 'u' === $str[$i + 7]
&& ctype_xdigit($lowSurrogate = substr($str, $i + 8, 4))
&& 0xDC00 <= ($lowSurrogate = hexdec($lowSurrogate))
&& $lowSurrogate <= 0xDFFF
) {
$codepoint = 0x10000 + (($codepoint & 0x3FF) << 10) + ($lowSurrogate & 0x3FF);
$i += 10; // skip surrogate pair
} else {
// single Unicode character or invalid surrogate, skip the sequence
$i += 4;
}
// single Unicode character or invalid surrogate, skip the sequence
$i += 4;
if (false === $chr = mb_chr($codepoint, 'UTF-8')) {
throw new JsonCrawlerException('', \sprintf('Invalid Unicode codepoint: U+%04X.', $codepoint));
}
return mb_chr($codepoint, 'UTF-8');
return $chr;
}
/**