From 5d68d619436baeaeea78adae51dbd538d62abe66 Mon Sep 17 00:00:00 2001 From: Niels Dossche <7771979+nielsdos@users.noreply.github.com> Date: Fri, 22 Sep 2023 18:46:18 +0200 Subject: [PATCH] Implement #53655: Improve speed of DOMNode::C14N() on large XML documents The XPath query is in accordance to spec [1]. However, we can do it in a simpler way. We can use a custom callback function instead of a linear search in XPath to check if a node is visible. Note that comment nodes are handled internally by libxml2 already, so we do not need to differentiate between node types. The callback will do an upwards traversal of the tree until the root of the canonicalization is reached. In practice this will speed up the application a lot. [1] https://www.w3.org/TR/2001/REC-xml-c14n-20010315 section 2.1 Closes GH-12278. --- NEWS | 2 ++ UPGRADING | 3 +++ ext/dom/node.c | 42 ++++++++++++++++++++++++++---------------- 3 files changed, 31 insertions(+), 16 deletions(-) diff --git a/NEWS b/NEWS index 2c84a7fb780..53d6a71945a 100644 --- a/NEWS +++ b/NEWS @@ -7,6 +7,8 @@ Core: DOM: . Added DOMNode::compareDocumentPosition(). (nielsdos) + . Implement #53655 (Improve speed of DOMNode::C14N() on large XML documents). + (nielsdos) Intl: . Added IntlDateFormatter::PATTERN constant. (David Carlier) diff --git a/UPGRADING b/UPGRADING index d37d5e78270..54a2f0e8c33 100644 --- a/UPGRADING +++ b/UPGRADING @@ -107,3 +107,6 @@ PHP 8.4 UPGRADE NOTES 14. Performance Improvements ======================================== +* The performance of DOMNode::C14N() is greatly improved for the case without + an xpath query. This can give a time improvement of easily two order of + magnitude for documents with tens of thousands of nodes. diff --git a/ext/dom/node.c b/ext/dom/node.c index 9bcf8384e58..b208a2d8a8f 100644 --- a/ext/dom/node.c +++ b/ext/dom/node.c @@ -1736,6 +1736,25 @@ PHP_METHOD(DOMNode, lookupNamespaceURI) } /* }}} end dom_node_lookup_namespace_uri */ +static int dom_canonicalize_node_parent_lookup_cb(void *user_data, xmlNodePtr node, xmlNodePtr parent) +{ + xmlNodePtr root = user_data; + /* We have to unroll the first iteration because node->parent + * is not necessarily equal to parent due to libxml2 tree rules (ns decls out of the tree for example). */ + if (node == root) { + return 1; + } + node = parent; + while (node != NULL) { + if (node == root) { + return 1; + } + node = node->parent; + } + + return 0; +} + static void dom_canonicalization(INTERNAL_FUNCTION_PARAMETERS, int mode) /* {{{ */ { zval *id; @@ -1779,22 +1798,10 @@ static void dom_canonicalization(INTERNAL_FUNCTION_PARAMETERS, int mode) /* {{{ php_libxml_invalidate_node_list_cache_from_doc(docp); + bool simple_node_parent_lookup_callback = false; if (xpath_array == NULL) { if (nodep->type != XML_DOCUMENT_NODE) { - ctxp = xmlXPathNewContext(docp); - ctxp->node = nodep; - xpathobjp = xmlXPathEvalExpression((xmlChar *) "(.//. | .//@* | .//namespace::*)", ctxp); - ctxp->node = NULL; - if (xpathobjp && xpathobjp->type == XPATH_NODESET) { - nodeset = xpathobjp->nodesetval; - } else { - if (xpathobjp) { - xmlXPathFreeObject(xpathobjp); - } - xmlXPathFreeContext(ctxp); - zend_throw_error(NULL, "XPath query did not return a nodeset"); - RETURN_THROWS(); - } + simple_node_parent_lookup_callback = true; } } else { /*xpath query from xpath_array */ @@ -1873,8 +1880,11 @@ static void dom_canonicalization(INTERNAL_FUNCTION_PARAMETERS, int mode) /* {{{ } if (buf != NULL) { - ret = xmlC14NDocSaveTo(docp, nodeset, exclusive, inclusive_ns_prefixes, - with_comments, buf); + if (simple_node_parent_lookup_callback) { + ret = xmlC14NExecute(docp, dom_canonicalize_node_parent_lookup_cb, nodep, exclusive, inclusive_ns_prefixes, with_comments, buf); + } else { + ret = xmlC14NDocSaveTo(docp, nodeset, exclusive, inclusive_ns_prefixes, with_comments, buf); + } } if (inclusive_ns_prefixes != NULL) {