mirror of
https://github.com/php/php-src.git
synced 2026-04-29 11:13:36 +02:00
Merge branch 'PHP-8.4'
* PHP-8.4:
  Fix GH-17481: UTF-8 corruption in \Dom\HTMLDocument
  Fix GH-17486: Incorrect error line numbers reported in Dom\HTMLDocument::createFromString
This commit is contained in:
+32
-2
@@ -553,9 +553,32 @@ static bool dom_decode_encode_fast_path(
|
||||
size_t *tree_error_offset
|
||||
)
|
||||
{
|
||||
decoding_encoding_ctx->decode.status = LXB_STATUS_OK;
|
||||
|
||||
const lxb_char_t *buf_ref = *buf_ref_ref;
|
||||
|
||||
/* If we returned for needing more bytes, we need to finish up the buffer for the old codepoint. */
|
||||
if (decoding_encoding_ctx->decode.status == LXB_STATUS_CONTINUE) {
|
||||
lxb_char_t buf[4];
|
||||
lxb_char_t *buf_ptr = buf;
|
||||
lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(&decoding_encoding_ctx->decode, &buf_ref, buf_end);
|
||||
if (lxb_encoding_encode_utf_8_single(&decoding_encoding_ctx->encode, &buf_ptr, buf + sizeof(buf), codepoint) > sizeof(buf)) {
|
||||
buf_ptr = zend_mempcpy(buf, LXB_ENCODING_REPLACEMENT_BYTES, LXB_ENCODING_REPLACEMENT_SIZE);
|
||||
}
|
||||
decoding_encoding_ctx->decode.status = LXB_STATUS_OK;
|
||||
|
||||
if (!dom_process_parse_chunk(
|
||||
ctx,
|
||||
document,
|
||||
parser,
|
||||
buf_ptr - buf,
|
||||
buf,
|
||||
buf_ptr - buf,
|
||||
tokenizer_error_offset,
|
||||
tree_error_offset
|
||||
)) {
|
||||
goto fail_oom;
|
||||
}
|
||||
}
|
||||
|
||||
const lxb_char_t *last_output = buf_ref;
|
||||
while (buf_ref != buf_end) {
|
||||
/* Fast path converts non-validated UTF-8 -> validated UTF-8 */
|
||||
@@ -904,6 +927,13 @@ PHP_METHOD(Dom_HTMLDocument, createFromString)
|
||||
if (!result) {
|
||||
goto fail_oom;
|
||||
}
|
||||
|
||||
/* In the string case we have a single buffer that acts as a sliding window.
|
||||
* The `current_input_characters` field starts pointing at the start of the buffer, but needs to slide along the
|
||||
* sliding window as well. */
|
||||
if (application_data.current_input_characters) {
|
||||
application_data.current_input_characters += chunk_size;
|
||||
}
|
||||
}
|
||||
|
||||
if (!dom_parse_decode_encode_finish(&ctx, document, parser, &decoding_encoding_ctx, &tokenizer_error_offset, &tree_error_offset)) {
|
||||
|
||||
@@ -0,0 +1,33 @@
|
||||
--TEST--
|
||||
GH-17481 (UTF-8 corruption in \Dom\HTMLDocument)
|
||||
--EXTENSIONS--
|
||||
dom
|
||||
--FILE--
|
||||
<?php

// Regression script for GH-17481: multi-byte UTF-8 text must survive parsing
// byte-for-byte.
//
// Each case is [body text, append an explicit </html> end tag?].
// '–' encodes to 3 bytes and '😏' to 4 bytes in UTF-8. The repeat counts are
// presumably chosen so that a character straddles an internal parser chunk
// boundary -- NOTE(review): confirm the chunk size against the C side.
$cases = [
    [str_repeat('–', 4096), false],
    [str_repeat('😏', 4096), false],
    [str_repeat('–', 4096), true],
    [str_repeat('😏', 4096), true],
    [str_repeat('–', 1358), false],
    [str_repeat('–', 1359), false],
];

foreach ($cases as [$text, $closeHtml]) {
    // Build the markup in one expression; the end tag is optional.
    $markup = '<!DOCTYPE HTML><html>' . $text . ($closeHtml ? '</html>' : '');

    $dom = \Dom\HTMLDocument::createFromString($markup, 0, 'UTF-8');

    // The parsed body text must be identical to what went in.
    var_dump($dom->body->textContent === $text);
}

?>
|
||||
--EXPECT--
|
||||
bool(true)
|
||||
bool(true)
|
||||
bool(true)
|
||||
bool(true)
|
||||
bool(true)
|
||||
bool(true)
|
||||
@@ -0,0 +1,39 @@
|
||||
--TEST--
|
||||
GH-17486 (Incorrect error line numbers reported in Dom\HTMLDocument::createFromString)
|
||||
--EXTENSIONS--
|
||||
dom
|
||||
--INI--
|
||||
error_reporting=E_ALL
|
||||
--CREDITS--
|
||||
xPaw
|
||||
--FILE--
|
||||
<?php

// Regression script for GH-17486: tokenizer errors must be reported with the
// same line/column whether the document is parsed from a string or from a file.

// A very long attribute value placed before the offending markup, so the error
// occurs far from the start of the input.
$repeated = str_repeat('a', 50000);

// The <div> must contain a numeric character reference to NUL: that is what
// raises the "null-character-reference" tokenizer error the expectations match.
// A literal U+FFFD replacement character (what the garbled copy contained)
// raises no such error.
// NOTE(review): the exact spelling of the reference (&#x0; vs &#0;) and the
// original indentation inside the heredoc were lost; both influence the
// reported column, so confirm against the upstream test file.
$html = <<<HTML
<!DOCTYPE html>
<html lang="en">
<body>
<svg>
<path d="{$repeated}" />
</svg>
<div>&#x0;</div>
</body>
</html>
HTML;

// String input path.
\Dom\HTMLDocument::createFromString($html);

// File input path: same markup, expected to yield the same error position.
file_put_contents(__DIR__ . '/gh17486.tmp', $html);
\Dom\HTMLDocument::createFromFile(__DIR__ . '/gh17486.tmp');

?>
|
||||
--CLEAN--
|
||||
<?php
|
||||
// Best-effort removal of the temporary file written by the FILE section;
// the @ operator silences the warning unlink() emits if the file is absent.
@unlink(__DIR__ . '/gh17486.tmp');
|
||||
?>
|
||||
--EXPECTF--
|
||||
Warning: Dom\HTMLDocument::createFromString(): tokenizer error null-character-reference in Entity, line: 7, column: 9 in %s on line %d
|
||||
|
||||
Warning: Dom\HTMLDocument::createFromFile(): tokenizer error null-character-reference in %s line: 7, column: 9 in %s on line %d
|
||||
Reference in New Issue
Block a user