1
0
mirror of https://github.com/php/php-src.git synced 2026-04-09 09:03:04 +02:00

New memcpy()-based wordwrap() implementation. The simple case

(single-character break, no forced break) appears to be about 60%
faster, and there's simply no comparison for non-simple cases with
non-trivial amounts of text. The old algorithm was O(n^2) (with an
unfortunately large constant factor) because of the use of strncat(),
the new one is O(n). Added some more tests, too.
@ - Made wordwrap() significantly faster. (Jim)
# test case: $t = join('',file('ChangeLog')); $w = wordwrap($t,10,"\n",1);
# new code completes in less than a second. i'm still waiting for the
# old code to finish.
This commit is contained in:
jim winstead
2002-01-05 20:46:43 +00:00
parent e56fb1639b
commit ca15b22212
2 changed files with 87 additions and 97 deletions

View File

@@ -613,9 +613,11 @@ PHP_FUNCTION(ltrim)
Wraps buffer to selected number of characters using string break char */
PHP_FUNCTION(wordwrap)
{
char *text, *breakchar = "\n", *newtext;
const char *text, *breakchar = "\n";
char *newtext;
int textlen, breakcharlen = 1, newtextlen;
long linelength = 75, i = 0, l = 0, pgr = 0, last = 0;
long current = 0, laststart = 0, lastspace = 0;
long linelength = 75;
zend_bool docut = 0;
if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|lsb", &text, &textlen, &linelength, &breakchar, &breakcharlen, &docut) == FAILURE) {
@@ -634,121 +636,96 @@ PHP_FUNCTION(wordwrap)
additional storage space */
if (breakcharlen == 1 && !docut) {
newtext = estrndup(text, textlen);
while (newtext[i] != '\0') {
/* prescan line to see if it is greater than linelength */
l = 0;
while (newtext[i+l] != breakchar[0]) {
if (newtext[i+l] == '\0') {
l--;
break;
}
l++;
laststart = lastspace = 0;
for (current = 0; current < textlen; current++) {
if (text[current] == breakchar[0]) {
laststart = lastspace = current;
}
if (l >= linelength) {
pgr = l;
l = linelength;
/* needs breaking; work backwards to find previous word */
while (l >= 0) {
if (newtext[i+l] == ' ') {
newtext[i+l] = breakchar[0];
break;
}
l--;
}
if (l == -1) {
/* couldn't break is backwards, try looking forwards */
l = linelength;
while (l <= pgr) {
if(newtext[i+l] == ' ') {
newtext[i+l] = breakchar[0];
break;
}
l++;
}
else if (text[current] == ' ') {
if (current - laststart >= linelength) {
newtext[current] = breakchar[0];
laststart = current;
}
lastspace = current;
}
else if (current - laststart >= linelength
&& laststart != lastspace) {
newtext[lastspace] = breakchar[0];
laststart = lastspace;
}
i += l + 1;
}
RETURN_STRINGL(newtext, textlen, 0);
}
else {
/* Multiple character line break */
newtextlen = textlen * (breakcharlen + 1) + 1;
/* Multiple character line break or forced cut */
if (linelength > 0) {
newtextlen = textlen + (textlen/linelength) * breakcharlen + 1;
}
else {
newtextlen = textlen * (breakcharlen + 1) + 1;
}
newtext = emalloc(newtextlen);
newtext[0] = '\0';
i = 0;
while (text[i] != '\0') {
/* now keep track of the actual new text length */
newtextlen = 0;
/* prescan line to see if it is greater than linelength */
l = 0;
while (text[i+l] != '\0') {
if (text[i+l] == breakchar[0]) {
if (breakcharlen == 1 || !strncmp(text+i+l, breakchar, breakcharlen))
break;
}
l++;
laststart = lastspace = 0;
for (current = 0; current < textlen; current++) {
/* when we hit an existing break, copy to new buffer, and
* fix up laststart and lastspace */
if (text[current] == breakchar[0]
&& current + breakcharlen < textlen
&& !strncmp(text+current, breakchar, breakcharlen)) {
memcpy(newtext+newtextlen, text+laststart, current-laststart+breakcharlen);
newtextlen += current-laststart+breakcharlen;
current += breakcharlen - 1;
laststart = lastspace = current + 1;
}
if (l >= linelength) {
pgr = l;
l = linelength;
/* needs breaking; work backwards to find previous word */
while (l >= 0) {
if (text[i+l] == ' ') {
strncat(newtext, text+last, i+l-last);
strncat(newtext, breakchar, breakcharlen);
last = i + l + 1;
break;
}
l--;
/* if it is a space, check if it is at the line boundary,
* copy and insert a break, or just keep track of it */
else if (text[current] == ' ') {
if (current - laststart >= linelength) {
memcpy(newtext+newtextlen, text+laststart, current-laststart);
newtextlen += current - laststart;
memcpy(newtext+newtextlen, breakchar, breakcharlen);
newtextlen += breakcharlen;
laststart = current + 1;
}
if (l == -1) {
/* couldn't break it backwards, try looking forwards */
l = linelength - 1;
while (l <= pgr) {
if (!docut) {
if (text[i+l] == ' ') {
strncat(newtext, text+last, i+l-last);
strncat(newtext, breakchar, breakcharlen);
last = i + l + 1;
++l;
break;
}
}
/* cut if longer than allowed */
else {
if (text[i+l] == ' ' || l > i-last) {
strncat(newtext, text+last, i+l-last+1);
strncat(newtext, breakchar, breakcharlen);
last = i + l + 1;
++l;
break;
}
}
l++;
}
}
i += l + 1;
lastspace = current;
}
else {
i += (l ? l : 1);
/* if we are cutting, and we've accumulated enough
* characters, copy and insert a break. */
else if (current - laststart >= linelength && docut) {
memcpy(newtext+newtextlen, text+laststart, current-laststart);
newtextlen += current - laststart;
memcpy(newtext+newtextlen, breakchar, breakcharlen);
newtextlen += breakcharlen;
laststart = lastspace = current;
}
/* if the current word puts us over the linelength, copy
* back up until the last space, insert a break, and move
* up the laststart */
else if (current - laststart >= linelength
&& laststart < lastspace) {
memcpy(newtext+newtextlen, text+laststart, lastspace-laststart);
newtextlen += lastspace - laststart;
memcpy(newtext+newtextlen, breakchar, breakcharlen);
newtextlen += breakcharlen;
laststart = lastspace = lastspace + 1;
}
}
if (i + l > last) {
strncat(newtext, text+last, i+l-last);
/* copy over any stragglers */
if (laststart != current) {
memcpy(newtext+newtextlen, text+laststart, current-laststart);
newtextlen += current - laststart;
}
RETURN_STRINGL(newtext, strlen(newtext), 0);
newtext[newtextlen] = '\0';
RETURN_STRINGL(newtext, newtextlen, 0);
}
}
/* }}} */

View File

@@ -11,6 +11,19 @@ $tests = <<<TESTS
"12345\\n12345\\n12345\\n12345" === wordwrap("12345 12345 12345 12345",0)
"12345ab12345ab12345ab12345" === wordwrap("12345 12345 12345 12345",0,"ab")
"12345 12345ab1234567890ab1234567890" === wordwrap("12345 12345 1234567890 1234567890",12,"ab")
"123ab123ab123" === wordwrap("123ab123ab123", 3, "ab")
"123ab123ab123" === wordwrap("123ab123ab123", 5, "ab")
"123ab 123ab123" === wordwrap("123 123ab123", 3, "ab")
"123ab123ab123" === wordwrap("123 123ab123", 5, "ab")
"123 123ab123" === wordwrap("123 123 123", 10, "ab")
"123ab123ab123" === wordwrap("123ab123ab123", 3, "ab", 1)
"123ab123ab123" === wordwrap("123ab123ab123", 5, "ab", 1)
"123ab 12ab3ab123" === wordwrap("123 123ab123", 3, "ab", 1)
"123 ab123ab123" === wordwrap("123 123ab123", 5, "ab", 1)
"123 123ab 123" === wordwrap("123 123 123", 8, "ab", 1)
"123 123ab45 123" === wordwrap("123 12345 123", 8, "ab", 1)
"1ab2ab3ab4" === wordwrap("1234", 1, "ab", 1)
TESTS;
include('../../../../tests/quicktester.inc');