1
0
mirror of https://github.com/php/php-src.git synced 2026-03-24 00:02:20 +01:00

Fix GH-17481: UTF-8 corruption in \Dom\HTMLDocument

We need to properly handle the case where the previous call returned
because it had too few bytes to complete a code point. This needs to be
handled separately, because otherwise the while loop just performs a
partial byte copy.

Closes GH-17489.
This commit is contained in:
Niels Dossche
2025-01-16 20:11:35 +01:00
parent 21c170c75a
commit 2952e164a9
3 changed files with 59 additions and 2 deletions

1
NEWS
View File

@@ -21,6 +21,7 @@ PHP NEWS
. Fixed bug GH-17397 (Assertion failure ext/dom/php_dom.c). (nielsdos)
. Fixed bug GH-17486 (Incorrect error line numbers reported in
Dom\HTMLDocument::createFromString). (nielsdos)
. Fixed bug GH-17481 (UTF-8 corruption in \Dom\HTMLDocument). (nielsdos)
- Enchant:
. Fix crashes in enchant when passing null bytes. (nielsdos)

View File

@@ -528,9 +528,32 @@ static bool dom_decode_encode_fast_path(
size_t *tree_error_offset
)
{
decoding_encoding_ctx->decode.status = LXB_STATUS_OK;
const lxb_char_t *buf_ref = *buf_ref_ref;
/* If we returned for needing more bytes, we need to finish up the buffer for the old codepoint. */
if (decoding_encoding_ctx->decode.status == LXB_STATUS_CONTINUE) {
lxb_char_t buf[4];
lxb_char_t *buf_ptr = buf;
lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(&decoding_encoding_ctx->decode, &buf_ref, buf_end);
if (lxb_encoding_encode_utf_8_single(&decoding_encoding_ctx->encode, &buf_ptr, buf + sizeof(buf), codepoint) > sizeof(buf)) {
buf_ptr = zend_mempcpy(buf, LXB_ENCODING_REPLACEMENT_BYTES, LXB_ENCODING_REPLACEMENT_SIZE);
}
decoding_encoding_ctx->decode.status = LXB_STATUS_OK;
if (!dom_process_parse_chunk(
ctx,
document,
parser,
buf_ptr - buf,
buf,
buf_ptr - buf,
tokenizer_error_offset,
tree_error_offset
)) {
goto fail_oom;
}
}
const lxb_char_t *last_output = buf_ref;
while (buf_ref != buf_end) {
/* Fast path converts non-validated UTF-8 -> validated UTF-8 */

View File

@@ -0,0 +1,33 @@
--TEST--
GH-17481 (UTF-8 corruption in \Dom\HTMLDocument)
--EXTENSIONS--
dom
--FILE--
<?php
/*
 * Regression script for GH-17481: builds several HTML documents whose body
 * text consists of one repeated string, parses each with
 * \Dom\HTMLDocument::createFromString() as UTF-8, and dumps whether the
 * parsed body's textContent is identical to the original input.
 *
 * Each entry of $inputs is [text to embed, whether to append '</html>'].
 *
 * NOTE(review): four of the repeated literals below are empty strings in this
 * copy, which makes those cases compare '' to '' and pass trivially. The
 * 1358/1359 repeat counts suggest a 3-byte UTF-8 character was intended
 * (the 21-byte "<!DOCTYPE HTML><html>" prefix + 1358 * 3 = 4095 bytes,
 * presumably straddling a 4096-byte chunk boundary) and was lost in
 * transcription — confirm against the upstream test file.
 */
$inputs = [
[str_repeat('', 4096), false],
[str_repeat('😏', 4096), false],
[str_repeat('', 4096), true],
[str_repeat('😏', 4096), true],
[str_repeat('', 1358), false],
[str_repeat('', 1359), false],
];
foreach ($inputs as [$input, $endTag]) {
// No <body> tag is written; the text directly follows <html>.
$Data = "<!DOCTYPE HTML><html>$input";
if ($endTag) {
// Same input, but with the document explicitly closed.
$Data .= '</html>';
}
$Document = \Dom\HTMLDocument::createFromString($Data, 0, 'UTF-8');
// Strict comparison: any byte-level change to the text yields false.
var_dump($Document->body->textContent === $input);
}
?>
--EXPECT--
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)