diff --git a/ext/dom/lexbor/lexbor/html/tokenizer.c b/ext/dom/lexbor/lexbor/html/tokenizer.c
index a399758c6a3..0bd9aec504f 100755
--- a/ext/dom/lexbor/lexbor/html/tokenizer.c
+++ b/ext/dom/lexbor/lexbor/html/tokenizer.c
@@ -315,12 +315,13 @@ lxb_html_tokenizer_chunk(lxb_html_tokenizer_t *tkz, const lxb_char_t *data,
tkz->last = end;
while (data < end) {
+ size_t current_column = tkz->current_column;
const lxb_char_t *new_data = tkz->state(tkz, data, end);
while (data < new_data) {
/* Codepoints < 0x80 are encoded the same as their ASCII counterpart, so '\n' will uniquely identify a newline. */
if (*data == '\n') {
tkz->current_line++;
- tkz->current_column = 0;
+ current_column = 0;
} else {
/* Other characters can be mapped back to the unicode codepoint offset because UTF-8 is a prefix code.
* Continuation bytes start with 0b10XXXXXX so we can skip those to only get the start of an encoded code point. */
@@ -328,11 +329,12 @@ lxb_html_tokenizer_chunk(lxb_html_tokenizer_t *tkz, const lxb_char_t *data,
/* Continuation byte, do nothing */
} else {
/* First byte for a codepoint */
- tkz->current_column++;
+ current_column++;
}
}
data++;
}
+ tkz->current_column = current_column;
}
return tkz->status;
diff --git a/ext/dom/lexbor/patches/0001-Expose-line-and-column-information-for-use-in-PHP.patch b/ext/dom/lexbor/patches/0001-Expose-line-and-column-information-for-use-in-PHP.patch
index f59db2195bf..7549403ea09 100644
--- a/ext/dom/lexbor/patches/0001-Expose-line-and-column-information-for-use-in-PHP.patch
+++ b/ext/dom/lexbor/patches/0001-Expose-line-and-column-information-for-use-in-PHP.patch
@@ -1,4 +1,4 @@
-From faee2893e499bdcaa3a511bcff197366b8a87968 Mon Sep 17 00:00:00 2001
+From 9d60c0fda0b51e9374a234c48df36130d2c988ee Mon Sep 17 00:00:00 2001
From: Niels Dossche <7771979+nielsdos@users.noreply.github.com>
Date: Sat, 26 Aug 2023 15:08:59 +0200
Subject: [PATCH] Expose line and column information for use in PHP
@@ -6,13 +6,13 @@ Subject: [PATCH] Expose line and column information for use in PHP
---
source/lexbor/dom/interfaces/node.h | 2 ++
source/lexbor/html/token.h | 2 ++
- source/lexbor/html/tokenizer.c | 22 +++++++++++++++++++++-
+ source/lexbor/html/tokenizer.c | 24 +++++++++++++++++++++++-
source/lexbor/html/tokenizer.h | 2 ++
source/lexbor/html/tokenizer/state.h | 2 ++
source/lexbor/html/tree.c | 11 +++++++++++
source/lexbor/html/tree/error.c | 5 +++--
source/lexbor/html/tree/error.h | 5 +++--
- 8 files changed, 46 insertions(+), 5 deletions(-)
+ 8 files changed, 48 insertions(+), 5 deletions(-)
diff --git a/source/lexbor/dom/interfaces/node.h b/source/lexbor/dom/interfaces/node.h
index 4a10197..ff9c924 100755
@@ -41,7 +41,7 @@ index 79accd0..0b7f4fd 100755
const lxb_char_t *text_start;
const lxb_char_t *text_end;
diff --git a/source/lexbor/html/tokenizer.c b/source/lexbor/html/tokenizer.c
-index 741bced..a399758 100755
+index 741bced..0bd9aec 100755
--- a/source/lexbor/html/tokenizer.c
+++ b/source/lexbor/html/tokenizer.c
@@ -91,6 +91,7 @@ lxb_html_tokenizer_init(lxb_html_tokenizer_t *tkz)
@@ -61,17 +61,18 @@ index 741bced..a399758 100755
return LXB_STATUS_OK;
}
-@@ -312,7 +315,24 @@ lxb_html_tokenizer_chunk(lxb_html_tokenizer_t *tkz, const lxb_char_t *data,
+@@ -312,7 +315,26 @@ lxb_html_tokenizer_chunk(lxb_html_tokenizer_t *tkz, const lxb_char_t *data,
tkz->last = end;
while (data < end) {
- data = tkz->state(tkz, data, end);
++ size_t current_column = tkz->current_column;
+ const lxb_char_t *new_data = tkz->state(tkz, data, end);
+ while (data < new_data) {
+ /* Codepoints < 0x80 are encoded the same as their ASCII counterpart, so '\n' will uniquely identify a newline. */
+ if (*data == '\n') {
+ tkz->current_line++;
-+ tkz->current_column = 0;
++ current_column = 0;
+ } else {
+ /* Other characters can be mapped back to the unicode codepoint offset because UTF-8 is a prefix code.
+ * Continuation bytes start with 0b10XXXXXX so we can skip those to only get the start of an encoded code point. */
@@ -79,11 +80,12 @@ index 741bced..a399758 100755
+ /* Continuation byte, do nothing */
+ } else {
+ /* First byte for a codepoint */
-+ tkz->current_column++;
++ current_column++;
+ }
+ }
+ data++;
+ }
++ tkz->current_column = current_column;
}
return tkz->status;
@@ -182,5 +184,5 @@ index 2fd06cb..ed1859f 100755
lxb_html_tree_error_t;
--
-2.41.0
+2.43.0