1
0
mirror of https://github.com/php/php-src.git synced 2026-03-24 00:02:20 +01:00

Implement #53655: Improve speed of DOMNode::C14N() on large XML documents

The XPath query is in accordance to spec [1]. However, we can do it in a
simpler way. We can use a custom callback function instead of a linear
search in XPath to check if a node is visible. Note that comment nodes
are handled internally by libxml2 already, so we do not need to
differentiate between node types. The callback will do an upwards
traversal of the tree until the root of the canonicalization is reached.
In practice this will speed up the application a lot.

[1] https://www.w3.org/TR/2001/REC-xml-c14n-20010315 section 2.1

Closes GH-12278.
This commit is contained in:
Niels Dossche
2023-09-22 18:46:18 +02:00
parent ab30f27ce3
commit 5d68d61943
3 changed files with 31 additions and 16 deletions

2
NEWS
View File

@@ -7,6 +7,8 @@ Core:
DOM:
. Added DOMNode::compareDocumentPosition(). (nielsdos)
. Implement #53655 (Improve speed of DOMNode::C14N() on large XML documents).
(nielsdos)
Intl:
. Added IntlDateFormatter::PATTERN constant. (David Carlier)

View File

@@ -107,3 +107,6 @@ PHP 8.4 UPGRADE NOTES
14. Performance Improvements
========================================
* The performance of DOMNode::C14N() is greatly improved for the case without
an xpath query. This can give a time improvement of easily two order of
magnitude for documents with tens of thousands of nodes.

View File

@@ -1736,6 +1736,25 @@ PHP_METHOD(DOMNode, lookupNamespaceURI)
}
/* }}} end dom_node_lookup_namespace_uri */
static int dom_canonicalize_node_parent_lookup_cb(void *user_data, xmlNodePtr node, xmlNodePtr parent)
{
xmlNodePtr root = user_data;
/* We have to unroll the first iteration because node->parent
* is not necessarily equal to parent due to libxml2 tree rules (ns decls out of the tree for example). */
if (node == root) {
return 1;
}
node = parent;
while (node != NULL) {
if (node == root) {
return 1;
}
node = node->parent;
}
return 0;
}
static void dom_canonicalization(INTERNAL_FUNCTION_PARAMETERS, int mode) /* {{{ */
{
zval *id;
@@ -1779,22 +1798,10 @@ static void dom_canonicalization(INTERNAL_FUNCTION_PARAMETERS, int mode) /* {{{
php_libxml_invalidate_node_list_cache_from_doc(docp);
bool simple_node_parent_lookup_callback = false;
if (xpath_array == NULL) {
if (nodep->type != XML_DOCUMENT_NODE) {
ctxp = xmlXPathNewContext(docp);
ctxp->node = nodep;
xpathobjp = xmlXPathEvalExpression((xmlChar *) "(.//. | .//@* | .//namespace::*)", ctxp);
ctxp->node = NULL;
if (xpathobjp && xpathobjp->type == XPATH_NODESET) {
nodeset = xpathobjp->nodesetval;
} else {
if (xpathobjp) {
xmlXPathFreeObject(xpathobjp);
}
xmlXPathFreeContext(ctxp);
zend_throw_error(NULL, "XPath query did not return a nodeset");
RETURN_THROWS();
}
simple_node_parent_lookup_callback = true;
}
} else {
/*xpath query from xpath_array */
@@ -1873,8 +1880,11 @@ static void dom_canonicalization(INTERNAL_FUNCTION_PARAMETERS, int mode) /* {{{
}
if (buf != NULL) {
ret = xmlC14NDocSaveTo(docp, nodeset, exclusive, inclusive_ns_prefixes,
with_comments, buf);
if (simple_node_parent_lookup_callback) {
ret = xmlC14NExecute(docp, dom_canonicalize_node_parent_lookup_cb, nodep, exclusive, inclusive_ns_prefixes, with_comments, buf);
} else {
ret = xmlC14NDocSaveTo(docp, nodeset, exclusive, inclusive_ns_prefixes, with_comments, buf);
}
}
if (inclusive_ns_prefixes != NULL) {