Fix GH-17481: UTF-8 corruption in \Dom\HTMLDocument

We need to properly handle the case where the previous call returned because it had too few bytes to complete a codepoint. This must be handled separately because the while loop otherwise just performs a partial byte copy. Closes GH-17489.
2026-03-24 00:02:20 +01:00 · 2025-01-16 20:11:35 +01:00
parent 21c170c75a
commit 2952e164a9
3 changed files with 59 additions and 2 deletions
--- a/1
+++ b/1
@@ -21,6 +21,7 @@ PHP                                                                        NEWS
  . Fixed bug GH-17397 (Assertion failure ext/dom/php_dom.c). (nielsdos)
  . Fixed bug GH-17486 (Incorrect error line numbers reported in
    Dom\HTMLDocument::createFromString). (nielsdos)
+  . Fixed bug GH-17481 (UTF-8 corruption in \Dom\HTMLDocument). (nielsdos)

 - Enchant:
  . Fix crashes in enchant when passing null bytes. (nielsdos)
--- a/ext/dom/html_document.c
+++ b/ext/dom/html_document.c
@@ -528,9 +528,32 @@ static bool dom_decode_encode_fast_path(
 	size_t *tree_error_offset
 )
 {
-	decoding_encoding_ctx->decode.status = LXB_STATUS_OK;
-
 	const lxb_char_t *buf_ref = *buf_ref_ref;
+
+	/* If the previous call returned because it needed more bytes, first finish decoding the codepoint that was left incomplete. */
+	if (decoding_encoding_ctx->decode.status == LXB_STATUS_CONTINUE) {
+		lxb_char_t buf[4];
+		lxb_char_t *buf_ptr = buf;
+		lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(&decoding_encoding_ctx->decode, &buf_ref, buf_end);
+		if (lxb_encoding_encode_utf_8_single(&decoding_encoding_ctx->encode, &buf_ptr, buf + sizeof(buf), codepoint) > sizeof(buf)) {
+			buf_ptr = zend_mempcpy(buf, LXB_ENCODING_REPLACEMENT_BYTES, LXB_ENCODING_REPLACEMENT_SIZE);
+		}
+		decoding_encoding_ctx->decode.status = LXB_STATUS_OK;
+
+		if (!dom_process_parse_chunk(
+			ctx,
+			document,
+			parser,
+			buf_ptr - buf,
+			buf,
+			buf_ptr - buf,
+			tokenizer_error_offset,
+			tree_error_offset
+		)) {
+			goto fail_oom;
+		}
+	}
+
 	const lxb_char_t *last_output = buf_ref;
 	while (buf_ref != buf_end) {
 		/* Fast path converts non-validated UTF-8 -> validated UTF-8 */
--- a/ext/dom/tests/modern/html/encoding/gh17481.phpt
+++ b/ext/dom/tests/modern/html/encoding/gh17481.phpt
@@ -0,0 +1,33 @@
+--TEST--
+GH-17481 (UTF-8 corruption in \Dom\HTMLDocument)
+--EXTENSIONS--
+dom
+--FILE--
+<?php
+
+$inputs = [ // each entry: [body text, whether to append a closing </html> tag]
+    [str_repeat('–', 4096), false], // U+2013 is a 3-byte UTF-8 sequence
+    [str_repeat('😏', 4096), false], // U+1F60F is a 4-byte UTF-8 sequence
+    [str_repeat('–', 4096), true],
+    [str_repeat('😏', 4096), true],
+    [str_repeat('–', 1358), false], // 21-byte prefix + 1358 * 3 = 4095 bytes in total
+    [str_repeat('–', 1359), false], // 21-byte prefix + 1359 * 3 = 4098 bytes; presumably straddles a 4096-byte chunk boundary (TODO confirm chunk size)
+];
+
+foreach ($inputs as [$input, $endTag]) {
+    $Data = "<!DOCTYPE HTML><html>$input"; // this prefix is 21 bytes long
+    if ($endTag) {
+        $Data .= '</html>';
+    }
+    $Document = \Dom\HTMLDocument::createFromString($Data, 0, 'UTF-8');
+    var_dump($Document->body->textContent === $input); // the text must round-trip byte-for-byte
+}
+
+?>
+--EXPECT--
+bool(true)
+bool(true)
+bool(true)
+bool(true)
+bool(true)
+bool(true)