mirror of
https://github.com/php/php-src.git
synced 2026-03-24 00:02:20 +01:00
Fix GH-17481: UTF-8 corruption in \Dom\HTMLDocument
We need to properly handle the case where we resume after having returned because too few bytes were available. This must be handled separately, because otherwise the while loop just performs a partial byte copy. Closes GH-17489.
This commit is contained in:
1
NEWS
1
NEWS
@@ -21,6 +21,7 @@ PHP NEWS
|
||||
. Fixed bug GH-17397 (Assertion failure ext/dom/php_dom.c). (nielsdos)
|
||||
. Fixed bug GH-17486 (Incorrect error line numbers reported in
|
||||
Dom\HTMLDocument::createFromString). (nielsdos)
|
||||
. Fixed bug GH-17481 (UTF-8 corruption in \Dom\HTMLDocument). (nielsdos)
|
||||
|
||||
- Enchant:
|
||||
. Fix crashes in enchant when passing null bytes. (nielsdos)
|
||||
|
||||
@@ -528,9 +528,32 @@ static bool dom_decode_encode_fast_path(
|
||||
size_t *tree_error_offset
|
||||
)
|
||||
{
|
||||
decoding_encoding_ctx->decode.status = LXB_STATUS_OK;
|
||||
|
||||
const lxb_char_t *buf_ref = *buf_ref_ref;
|
||||
|
||||
/* If we returned for needing more bytes, we need to finish up the buffer for the old codepoint. */
|
||||
if (decoding_encoding_ctx->decode.status == LXB_STATUS_CONTINUE) {
|
||||
lxb_char_t buf[4];
|
||||
lxb_char_t *buf_ptr = buf;
|
||||
lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(&decoding_encoding_ctx->decode, &buf_ref, buf_end);
|
||||
if (lxb_encoding_encode_utf_8_single(&decoding_encoding_ctx->encode, &buf_ptr, buf + sizeof(buf), codepoint) > sizeof(buf)) {
|
||||
buf_ptr = zend_mempcpy(buf, LXB_ENCODING_REPLACEMENT_BYTES, LXB_ENCODING_REPLACEMENT_SIZE);
|
||||
}
|
||||
decoding_encoding_ctx->decode.status = LXB_STATUS_OK;
|
||||
|
||||
if (!dom_process_parse_chunk(
|
||||
ctx,
|
||||
document,
|
||||
parser,
|
||||
buf_ptr - buf,
|
||||
buf,
|
||||
buf_ptr - buf,
|
||||
tokenizer_error_offset,
|
||||
tree_error_offset
|
||||
)) {
|
||||
goto fail_oom;
|
||||
}
|
||||
}
|
||||
|
||||
const lxb_char_t *last_output = buf_ref;
|
||||
while (buf_ref != buf_end) {
|
||||
/* Fast path converts non-validated UTF-8 -> validated UTF-8 */
|
||||
|
||||
33
ext/dom/tests/modern/html/encoding/gh17481.phpt
Normal file
33
ext/dom/tests/modern/html/encoding/gh17481.phpt
Normal file
@@ -0,0 +1,33 @@
|
||||
--TEST--
|
||||
GH-17481 (UTF-8 corruption in \Dom\HTMLDocument)
|
||||
--EXTENSIONS--
|
||||
dom
|
||||
--FILE--
|
||||
<?php
|
||||
|
||||
$inputs = [
|
||||
[str_repeat('–', 4096), false],
|
||||
[str_repeat('😏', 4096), false],
|
||||
[str_repeat('–', 4096), true],
|
||||
[str_repeat('😏', 4096), true],
|
||||
[str_repeat('–', 1358), false],
|
||||
[str_repeat('–', 1359), false],
|
||||
];
|
||||
|
||||
foreach ($inputs as [$input, $endTag]) {
|
||||
$Data = "<!DOCTYPE HTML><html>$input";
|
||||
if ($endTag) {
|
||||
$Data .= '</html>';
|
||||
}
|
||||
$Document = \Dom\HTMLDocument::createFromString($Data, 0, 'UTF-8');
|
||||
var_dump($Document->body->textContent === $input);
|
||||
}
|
||||
|
||||
?>
|
||||
--EXPECT--
|
||||
bool(true)
|
||||
bool(true)
|
||||
bool(true)
|
||||
bool(true)
|
||||
bool(true)
|
||||
bool(true)
|
||||
Reference in New Issue
Block a user