1
0
mirror of https://github.com/php/php-src.git synced 2026-04-29 11:13:36 +02:00

Merge branch 'PHP-8.4'

* PHP-8.4:
  Fix GH-17481: UTF-8 corruption in \Dom\HTMLDocument
  Fix GH-17486: Incorrect error line numbers reported in Dom\HTMLDocument::createFromString
This commit is contained in:
Niels Dossche
2025-01-17 16:25:23 +01:00
3 changed files with 104 additions and 2 deletions
+32 -2
View File
@@ -553,9 +553,32 @@ static bool dom_decode_encode_fast_path(
size_t *tree_error_offset
)
{
decoding_encoding_ctx->decode.status = LXB_STATUS_OK;
const lxb_char_t *buf_ref = *buf_ref_ref;
/* If we returned for needing more bytes, we need to finish up the buffer for the old codepoint. */
if (decoding_encoding_ctx->decode.status == LXB_STATUS_CONTINUE) {
lxb_char_t buf[4];
lxb_char_t *buf_ptr = buf;
lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(&decoding_encoding_ctx->decode, &buf_ref, buf_end);
if (lxb_encoding_encode_utf_8_single(&decoding_encoding_ctx->encode, &buf_ptr, buf + sizeof(buf), codepoint) > sizeof(buf)) {
buf_ptr = zend_mempcpy(buf, LXB_ENCODING_REPLACEMENT_BYTES, LXB_ENCODING_REPLACEMENT_SIZE);
}
decoding_encoding_ctx->decode.status = LXB_STATUS_OK;
if (!dom_process_parse_chunk(
ctx,
document,
parser,
buf_ptr - buf,
buf,
buf_ptr - buf,
tokenizer_error_offset,
tree_error_offset
)) {
goto fail_oom;
}
}
const lxb_char_t *last_output = buf_ref;
while (buf_ref != buf_end) {
/* Fast path converts non-validated UTF-8 -> validated UTF-8 */
@@ -904,6 +927,13 @@ PHP_METHOD(Dom_HTMLDocument, createFromString)
if (!result) {
goto fail_oom;
}
/* In the string case we have a single buffer that acts as a sliding window.
* The `current_input_characters` field starts pointing at the start of the buffer, but needs to slide along the
* sliding window as well. */
if (application_data.current_input_characters) {
application_data.current_input_characters += chunk_size;
}
}
if (!dom_parse_decode_encode_finish(&ctx, document, parser, &decoding_encoding_ctx, &tokenizer_error_offset, &tree_error_offset)) {
@@ -0,0 +1,33 @@
--TEST--
GH-17481 (UTF-8 corruption in \Dom\HTMLDocument)
--EXTENSIONS--
dom
--FILE--
<?php
// Regression inputs for GH-17481 (UTF-8 corruption in \Dom\HTMLDocument).
//
// Each entry is [payload, appendClosingHtmlTag].
//
// The payloads must consist of multi-byte UTF-8 characters; an empty needle
// would make str_repeat() return '' and turn the case into a no-op.
// The character is written as an escape so the file stays pure ASCII here and
// cannot be damaged by tooling that strips or re-encodes non-ASCII bytes.
$threeByteChar = "\u{2013}"; // EN DASH, encoded as 3 bytes (E2 80 93) in UTF-8

$inputs = [
    [str_repeat($threeByteChar, 4096), false],
    [str_repeat('😏', 4096), false], // 4-byte UTF-8 character
    [str_repeat($threeByteChar, 4096), true],
    [str_repeat('😏', 4096), true],
    // The prefix "<!DOCTYPE HTML><html>" is 21 bytes long, so 1358 three-byte
    // characters end at byte 4095 while the 1359th spans bytes 4096-4098.
    // NOTE(review): presumably chosen so a character straddles a 4096-byte
    // parser chunk boundary - confirm against the parser's chunk size.
    [str_repeat($threeByteChar, 1358), false],
    [str_repeat($threeByteChar, 1359), false],
];

foreach ($inputs as [$input, $endTag]) {
    $Data = "<!DOCTYPE HTML><html>$input";
    if ($endTag) {
        $Data .= '</html>';
    }

    // Parse as UTF-8 and check that the body text round-trips unchanged.
    $Document = \Dom\HTMLDocument::createFromString($Data, 0, 'UTF-8');
    var_dump($Document->body->textContent === $input);
}
?>
--EXPECT--
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
@@ -0,0 +1,39 @@
--TEST--
GH-17486 (Incorrect error line numbers reported in Dom\HTMLDocument::createFromString)
--EXTENSIONS--
dom
--INI--
error_reporting=E_ALL
--CREDITS--
xPaw
--FILE--
<?php
// Regression script for GH-17486: the same HTML is parsed once from a string
// and once from a file; the expected output lists one tokenizer warning per
// parse, both reporting line 7, column 9.

// 50000-character attribute value placed ahead of the erroneous markup.
// NOTE(review): presumably long enough that the error lies beyond the first
// chunk the parser processes - confirm against the parser's chunk size.
$repeated = str_repeat('a', 50000);
// The heredoc text is the test payload itself: the expected warnings pin the
// position of the "&#x0;" reference, so its lines must not be altered.
$html = <<<HTML
<!DOCTYPE html>
<html lang="en">
<body>
<svg>
<path d="{$repeated}" />
</svg>
<div>&#x0;</div>
</body>
</html>
HTML;
// First parse: directly from the in-memory string.
\Dom\HTMLDocument::createFromString($html);
// Second parse: write the identical document to a temporary file next to this
// test and parse it from disk.
file_put_contents(__DIR__ . '/gh17486.tmp', $html);
\Dom\HTMLDocument::createFromFile(__DIR__ . '/gh17486.tmp');
?>
--CLEAN--
<?php
// Delete the temporary HTML file created by the test script; the @ operator
// silences any error from unlink(), e.g. when the file does not exist.
@unlink(__DIR__ . '/gh17486.tmp');
?>
--EXPECTF--
Warning: Dom\HTMLDocument::createFromString(): tokenizer error null-character-reference in Entity, line: 7, column: 9 in %s on line %d
Warning: Dom\HTMLDocument::createFromFile(): tokenizer error null-character-reference in %s line: 7, column: 9 in %s on line %d