From 2952e164a94d0fb82b752093d383cf18efffc4f5 Mon Sep 17 00:00:00 2001 From: Niels Dossche <7771979+nielsdos@users.noreply.github.com> Date: Thu, 16 Jan 2025 20:11:35 +0100 Subject: [PATCH] Fix GH-17481: UTF-8 corruption in \Dom\HTMLDocument We need to properly handle the case where we previously returned because there were too few bytes. This needs to be handled separately because the while loop otherwise just performs a partial byte copy. Closes GH-17489. --- NEWS | 1 + ext/dom/html_document.c | 27 +++++++++++++-- .../tests/modern/html/encoding/gh17481.phpt | 33 +++++++++++++++++++ 3 files changed, 59 insertions(+), 2 deletions(-) create mode 100644 ext/dom/tests/modern/html/encoding/gh17481.phpt diff --git a/NEWS b/NEWS index 2e541dab322..f90bc7f9113 100644 --- a/NEWS +++ b/NEWS @@ -21,6 +21,7 @@ PHP NEWS . Fixed bug GH-17397 (Assertion failure ext/dom/php_dom.c). (nielsdos) . Fixed bug GH-17486 (Incorrect error line numbers reported in Dom\HTMLDocument::createFromString). (nielsdos) + . Fixed bug GH-17481 (UTF-8 corruption in \Dom\HTMLDocument). (nielsdos) - Enchant: . Fix crashes in enchant when passing null bytes. (nielsdos) diff --git a/ext/dom/html_document.c b/ext/dom/html_document.c index 240fa71a0cc..254c18deb1b 100644 --- a/ext/dom/html_document.c +++ b/ext/dom/html_document.c @@ -528,9 +528,32 @@ static bool dom_decode_encode_fast_path( size_t *tree_error_offset ) { - decoding_encoding_ctx->decode.status = LXB_STATUS_OK; - const lxb_char_t *buf_ref = *buf_ref_ref; + + /* If we returned for needing more bytes, we need to finish up the buffer for the old codepoint. 
*/ + if (decoding_encoding_ctx->decode.status == LXB_STATUS_CONTINUE) { + lxb_char_t buf[4]; + lxb_char_t *buf_ptr = buf; + lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(&decoding_encoding_ctx->decode, &buf_ref, buf_end); + if (lxb_encoding_encode_utf_8_single(&decoding_encoding_ctx->encode, &buf_ptr, buf + sizeof(buf), codepoint) > sizeof(buf)) { + buf_ptr = zend_mempcpy(buf, LXB_ENCODING_REPLACEMENT_BYTES, LXB_ENCODING_REPLACEMENT_SIZE); + } + decoding_encoding_ctx->decode.status = LXB_STATUS_OK; + + if (!dom_process_parse_chunk( + ctx, + document, + parser, + buf_ptr - buf, + buf, + buf_ptr - buf, + tokenizer_error_offset, + tree_error_offset + )) { + goto fail_oom; + } + } + const lxb_char_t *last_output = buf_ref; while (buf_ref != buf_end) { /* Fast path converts non-validated UTF-8 -> validated UTF-8 */ diff --git a/ext/dom/tests/modern/html/encoding/gh17481.phpt b/ext/dom/tests/modern/html/encoding/gh17481.phpt new file mode 100644 index 00000000000..74e13e13009 --- /dev/null +++ b/ext/dom/tests/modern/html/encoding/gh17481.phpt @@ -0,0 +1,33 @@ +--TEST-- +GH-17481 (UTF-8 corruption in \Dom\HTMLDocument) +--EXTENSIONS-- +dom +--FILE-- +$input"; + if ($endTag) { + $Data .= ''; + } + $Document = \Dom\HTMLDocument::createFromString($Data, 0, 'UTF-8'); + var_dump($Document->body->textContent === $input); +} + +?> +--EXPECT-- +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true)