From 66c8a1a512589678a145929cb6969543b001fb92 Mon Sep 17 00:00:00 2001 From: Nicolas Grekas Date: Tue, 22 Jul 2025 13:33:58 +0200 Subject: [PATCH] [JsonPath] Fix parsing invalid Unicode codepoints --- .../Component/JsonPath/JsonCrawler.php | 5 ++- .../Component/JsonPath/JsonPathUtils.php | 39 ++++++++++--------- 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/src/Symfony/Component/JsonPath/JsonCrawler.php b/src/Symfony/Component/JsonPath/JsonCrawler.php index d66b328a7149..8e2b32f3452e 100644 --- a/src/Symfony/Component/JsonPath/JsonCrawler.php +++ b/src/Symfony/Component/JsonPath/JsonCrawler.php @@ -12,6 +12,7 @@ namespace Symfony\Component\JsonPath; use Symfony\Component\JsonPath\Exception\InvalidArgumentException; +use Symfony\Component\JsonPath\Exception\InvalidJsonPathException; use Symfony\Component\JsonPath\Exception\InvalidJsonStringInputException; use Symfony\Component\JsonPath\Exception\JsonCrawlerException; use Symfony\Component\JsonPath\Tokenizer\JsonPathToken; @@ -83,7 +84,7 @@ private function evaluate(JsonPath $query): array return $this->evaluateTokensOnDecodedData($tokens, $data); } catch (InvalidArgumentException $e) { throw $e; - } catch (\Throwable $e) { + } catch (InvalidJsonPathException $e) { throw new JsonCrawlerException($query, $e->getMessage(), previous: $e); } } @@ -329,7 +330,7 @@ private function evaluateBracket(string $expr, mixed $value): array return \array_key_exists($key, $value) ? [$value[$key]] : []; } - throw new \LogicException(\sprintf('Unsupported bracket expression "%s".', $expr)); + throw new InvalidJsonPathException(\sprintf('Unsupported bracket expression "%s".', $expr)); } private function evaluateFilter(string $expr, mixed $value): array diff --git a/src/Symfony/Component/JsonPath/JsonPathUtils.php b/src/Symfony/Component/JsonPath/JsonPathUtils.php index b6667afad205..18134c23d3ea 100644 --- a/src/Symfony/Component/JsonPath/JsonPathUtils.php +++ b/src/Symfony/Component/JsonPath/JsonPathUtils.php @@ -117,7 +117,7 @@ public static function unescapeString(string $str, string $quoteChar): string 't' => "\t", 'u' => self::unescapeUnicodeSequence($str, $i), $quoteChar => $quoteChar, - default => throw new JsonCrawlerException('', \sprintf('Invalid escape sequence "\\%s" in %s-quoted string', $str[$i + 1], "'" === $quoteChar ? 'single' : 'double')), + default => throw new JsonCrawlerException('', \sprintf('Invalid escape sequence "\\%s" in %s-quoted string.', $str[$i + 1], "'" === $quoteChar ? 'single' : 'double')), }; ++$i; @@ -132,30 +132,33 @@ public static function unescapeString(string $str, string $quoteChar): string private static function unescapeUnicodeSequence(string $str, int &$i): string { if (!isset($str[$i + 5]) || !ctype_xdigit(substr($str, $i + 2, 4))) { - throw new JsonCrawlerException('', 'Invalid unicode escape sequence'); + throw new JsonCrawlerException('', 'Invalid unicode escape sequence.'); } - $hex = substr($str, $i + 2, 4); + $codepoint = hexdec(substr($str, $i + 2, 4)); - $codepoint = hexdec($hex); // looks like a valid Unicode codepoint, string length is sufficient and it starts with \u - if (0xD800 <= $codepoint && $codepoint <= 0xDBFF && isset($str[$i + 11]) && '\\' === $str[$i + 6] && 'u' === $str[$i + 7]) { - $lowHex = substr($str, $i + 8, 4); - if (ctype_xdigit($lowHex)) { - $lowSurrogate = hexdec($lowHex); - if (0xDC00 <= $lowSurrogate && $lowSurrogate <= 0xDFFF) { - $codepoint = 0x10000 + (($codepoint & 0x3FF) << 10) + ($lowSurrogate & 0x3FF); - $i += 10; // skip surrogate pair - - return mb_chr($codepoint, 'UTF-8'); - } - } + if (0xD800 <= $codepoint + && $codepoint <= 0xDBFF + && isset($str[$i + 11]) + && '\\' === $str[$i + 6] + && 'u' === $str[$i + 7] + && ctype_xdigit($lowSurrogate = substr($str, $i + 8, 4)) + && 0xDC00 <= ($lowSurrogate = hexdec($lowSurrogate)) + && $lowSurrogate <= 0xDFFF + ) { + $codepoint = 0x10000 + (($codepoint & 0x3FF) << 10) + ($lowSurrogate & 0x3FF); + $i += 10; // skip surrogate pair + } else { + // single Unicode character or invalid surrogate, skip the sequence + $i += 4; } - // single Unicode character or invalid surrogate, skip the sequence - $i += 4; + if (false === $chr = mb_chr($codepoint, 'UTF-8')) { + throw new JsonCrawlerException('', \sprintf('Invalid Unicode codepoint: U+%04X.', $codepoint)); + } - return mb_chr($codepoint, 'UTF-8'); + return $chr; } /**