1
0
mirror of https://github.com/php/php-src.git synced 2026-04-28 18:53:33 +02:00

Unicode upgrade for fgetcsv()

This commit is contained in:
Sara Golemon
2006-12-05 04:13:47 +00:00
parent 40b496d63f
commit 78f65cc451
4 changed files with 583 additions and 339 deletions
+539 -338
View File
@@ -1932,43 +1932,6 @@ PHPAPI PHP_FUNCTION(fread)
}
/* }}} */
static const char *php_fgetcsv_lookup_trailing_spaces(const char *ptr, size_t len, const char delimiter TSRMLS_DC)
{
int inc_len;
unsigned char last_chars[2] = { 0, 0 };
while (len > 0) {
inc_len = (*ptr == '\0' ? 1: php_mblen(ptr, len));
switch (inc_len) {
case -2:
case -1:
inc_len = 1;
php_mblen(NULL, 0);
break;
case 0:
goto quit_loop;
case 1:
default:
last_chars[0] = last_chars[1];
last_chars[1] = *ptr;
break;
}
ptr += inc_len;
len -= inc_len;
}
quit_loop:
switch (last_chars[1]) {
case '\n':
if (last_chars[0] == '\r') {
return ptr - 2;
}
/* break is omitted intentionally */
case '\r':
return ptr - 1;
}
return ptr;
}
#define FPUTCSV_FLD_CHK(c) memchr(Z_STRVAL_PP(field), c, Z_STRLEN_PP(field))
/* {{{ proto int fputcsv(resource fp, array fields [, string delimiter [, string enclosure]])
@@ -2072,87 +2035,149 @@ PHP_FUNCTION(fputcsv)
}
/* }}} */
/* {{{ proto array fgetcsv(resource fp [,int length [, string delimiter [, string enclosure]]])
/* {{{ proto array fgetcsv(resource fp [,int length [, string delimiter [, string enclosure[, string escape]]]]) U
Get line from file pointer and parse for CSV fields */
/* UTODO: Accept unicode contents */
#define PHP_FGETCSV_TRUNCATE(field) \
if (argc > 4) { \
/* Caller knows about new semantics since they're using new param, allow multichar */ \
} else if (field##_type == IS_STRING && field##_len > 1) { \
php_error_docref(NULL TSRMLS_CC, E_NOTICE, #field " must be a single character"); \
delimiter_len = 1; \
} else if (field##_type == IS_UNICODE && u_countChar32((UChar*)field, field##_len) > 1) { \
int __tmp = 0; \
php_error_docref(NULL TSRMLS_CC, E_NOTICE, #field " must be a single character"); \
U16_FWD_1(((UChar*)field), __tmp, field##_len); \
field##_len = __tmp; \
}
PHP_FUNCTION(fgetcsv)
{
char delimiter = ','; /* allow this to be set as parameter */
char enclosure = '"'; /* allow this to be set as parameter */
/* first section exactly as php_fgetss */
long len = 0;
size_t buf_len;
char *buf;
zend_uchar delimiter_type = IS_STRING, enclosure_type = IS_STRING, escape_type = IS_STRING;
char *delimiter = ",", *enclosure = "\"", *escape = "\\";
int delimiter_len = 1, enclosure_len = 1, escape_len = 1;
long len = -1;
zstr buf;
int buf_len, argc = ZEND_NUM_ARGS();
php_stream *stream;
zval *zstream;
zend_uchar delimiter_free = 0, enclosure_free = 0, escape_free = 0;
{
zval *fd, **len_zv = NULL;
char *delimiter_str = NULL;
int delimiter_str_len = 0;
char *enclosure_str = NULL;
int enclosure_str_len = 0;
if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "r|Zss",
&fd, &len_zv, &delimiter_str, &delimiter_str_len,
&enclosure_str, &enclosure_str_len) == FAILURE) {
return;
}
if (delimiter_str != NULL) {
/* Make sure that there is at least one character in string */
if (delimiter_str_len < 1) {
php_error_docref(NULL TSRMLS_CC, E_WARNING, "delimiter must be a character");
RETURN_FALSE;
} else if (delimiter_str_len > 1) {
php_error_docref(NULL TSRMLS_CC, E_NOTICE, "delimiter must be a single character");
}
/* use first character from string */
delimiter = delimiter_str[0];
}
if (enclosure_str != NULL) {
if (enclosure_str_len < 1) {
php_error_docref(NULL TSRMLS_CC, E_WARNING, "enclosure must be a character");
RETURN_FALSE;
} else if (enclosure_str_len > 1) {
php_error_docref(NULL TSRMLS_CC, E_NOTICE, "enclosure must be a single character");
}
/* use first character from string */
enclosure = enclosure_str[0];
}
if (len_zv != NULL && Z_TYPE_PP(len_zv) != IS_NULL) {
convert_to_long_ex(len_zv);
len = Z_LVAL_PP(len_zv);
if (len < 0) {
php_error_docref(NULL TSRMLS_CC, E_WARNING, "Length parameter may not be negative");
RETURN_FALSE;
} else if (len == 0) {
len = -1;
}
} else {
len = -1;
}
PHP_STREAM_TO_ZVAL(stream, &fd);
if (zend_parse_parameters(argc TSRMLS_CC, "r|l!ttt", &zstream, &len,
&delimiter, &delimiter_len, &delimiter_type,
&enclosure, &enclosure_len, &enclosure_type,
&escape, &escape_len, &escape_type) == FAILURE) {
return;
}
if (len < 0) {
if ((buf = php_stream_get_line(stream, NULL_ZSTR, 0, &buf_len)) == NULL) {
RETURN_FALSE;
PHP_STREAM_TO_ZVAL(stream, &zstream);
/* Make sure that there is at least one character in string,
* For userspace BC purposes we generally limit delimiters and enclosures to 1 character,
* though the code now supports multiple characters
*
* If this function is called with all five parameters however,
* then multiple characters are allowed for all subarguments */
if (delimiter_len < 1) {
php_error_docref(NULL TSRMLS_CC, E_WARNING, "delimiter must be a character");
RETURN_FALSE;
} else PHP_FGETCSV_TRUNCATE(delimiter);
if (enclosure_len < 1) {
php_error_docref(NULL TSRMLS_CC, E_WARNING, "enclosure must be a character");
RETURN_FALSE;
} else PHP_FGETCSV_TRUNCATE(enclosure);
if (escape_len < 1) {
php_error_docref(NULL TSRMLS_CC, E_WARNING, "escape must be a character");
RETURN_FALSE;
}
if (len < -1) {
php_error_docref(NULL TSRMLS_CC, E_WARNING, "Length parameter may not be negative");
RETURN_FALSE;
} else if (len == 0) {
len = -1;
}
if (stream->readbuf_type == IS_STRING) {
/* Binary mode stream needs binary delmiter/enclosure */
if (delimiter_type == IS_UNICODE) {
if (FAILURE == zend_unicode_to_string(ZEND_U_CONVERTER(UG(runtime_encoding_conv)), &delimiter, &delimiter_len, (UChar*)delimiter, delimiter_len TSRMLS_CC)) {
php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed converting delimiter from unicode");
RETVAL_FALSE;
goto cleanup;
}
delimiter_free = 1;
}
if (enclosure_type == IS_UNICODE) {
if (FAILURE == zend_unicode_to_string(ZEND_U_CONVERTER(UG(runtime_encoding_conv)), &enclosure, &enclosure_len, (UChar*)enclosure, enclosure_len TSRMLS_CC)) {
php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed converting enclosure from unicode");
RETVAL_FALSE;
goto cleanup;
}
enclosure_free = 1;
}
if (escape_type == IS_UNICODE) {
if (FAILURE == zend_unicode_to_string(ZEND_U_CONVERTER(UG(runtime_encoding_conv)), &escape, &escape_len, (UChar*)escape, escape_len TSRMLS_CC)) {
php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed converting escape from unicode");
RETVAL_FALSE;
goto cleanup;
}
escape_free = 1;
}
} else {
buf = emalloc(len + 1);
if (php_stream_get_line(stream, ZSTR(buf), len + 1, &buf_len) == NULL) {
efree(buf);
RETURN_FALSE;
/* Unicode mode stream needs unicode delimiter/enclosure */
if (delimiter_type == IS_STRING) {
if (FAILURE == zend_string_to_unicode(ZEND_U_CONVERTER(UG(runtime_encoding_conv)), (UChar**)&delimiter, &delimiter_len, delimiter, delimiter_len TSRMLS_CC)) {
php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed converting delimiter to unicode");
RETVAL_FALSE;
goto cleanup;
}
delimiter_free = 1;
}
if (enclosure_type == IS_STRING) {
if (FAILURE == zend_string_to_unicode(ZEND_U_CONVERTER(UG(runtime_encoding_conv)), (UChar**)&enclosure, &enclosure_len, enclosure, enclosure_len TSRMLS_CC)) {
php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed converting enclosure to unicode");
RETVAL_FALSE;
goto cleanup;
}
enclosure_free = 1;
}
if (escape_type == IS_STRING) {
if (FAILURE == zend_string_to_unicode(ZEND_U_CONVERTER(UG(runtime_encoding_conv)), (UChar**)&escape, &escape_len, escape, escape_len TSRMLS_CC)) {
php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed converting escape to unicode");
RETVAL_FALSE;
goto cleanup;
}
escape_free = 1;
}
}
php_fgetcsv(stream, delimiter, enclosure, buf_len, buf, return_value TSRMLS_CC);
buf.v = php_stream_get_line_ex(stream, stream->readbuf_type, NULL_ZSTR, 0, len, &buf_len);
if (!buf.v) {
/* No data */
RETVAL_FALSE;
goto cleanup;
}
if (stream->readbuf_type == IS_UNICODE) {
/* Unicode mode */
php_u_fgetcsv(stream, (UChar*)delimiter, delimiter_len, (UChar*)enclosure, enclosure_len, (UChar*)escape, escape_len, buf.u, buf_len, return_value TSRMLS_CC);
} else {
/* Binary mode */
php_fgetcsv_ex(stream, delimiter, delimiter_len, enclosure, enclosure_len, escape, escape_len, buf.s, buf_len, return_value TSRMLS_CC);
}
cleanup:
if (delimiter_free) {
efree(delimiter);
}
if (enclosure_free) {
efree(enclosure);
}
if (escape_free) {
efree(escape);
}
}
/* }}} */
@@ -2161,266 +2186,442 @@ PHPAPI void php_fgetcsv(php_stream *stream, /* {{{ */
size_t buf_len, char *buf,
zval *return_value TSRMLS_DC)
{
char *temp, *tptr, *bptr, *line_end, *limit;
const char escape_char = '\\';
char *delim = &delimiter, *enc = &enclosure, *buffer = buf;
int delim_len = 1, enc_len = 1, buffer_len = buf_len;
zend_uchar type = IS_STRING;
size_t temp_len, line_end_len;
int inc_len;
if (stream) {
type = stream->readbuf_type;
}
/* initialize internal state */
php_mblen(NULL, 0);
if (type == IS_UNICODE) {
UChar esc = '\\';
/* Now into new section that parses buf for delimiter/enclosure fields */
/* Unicode stream, but binary delimiter/enclosures/prefetch, promote to unicode */
if (FAILURE == zend_string_to_unicode(ZEND_U_CONVERTER(UG(runtime_encoding_conv)), (UChar**)&delim, &delim_len, &delimiter, 1 TSRMLS_CC)) {
INIT_PZVAL(return_value);
return;
}
if (FAILURE == zend_string_to_unicode(ZEND_U_CONVERTER(UG(runtime_encoding_conv)), (UChar**)&enc, &enc_len, &enclosure, 1 TSRMLS_CC)) {
efree(delim);
INIT_PZVAL(return_value);
return;
}
if (FAILURE == zend_string_to_unicode(ZEND_U_CONVERTER(UG(runtime_encoding_conv)), (UChar**)&buffer, &buffer_len, buf, buf_len TSRMLS_CC)) {
efree(delim);
efree(enc);
INIT_PZVAL(return_value);
return;
}
/* Strip trailing space from buf, saving end of line in case required for enclosure field */
php_u_fgetcsv(stream, (UChar*)delim, delim_len, (UChar*)enc, enc_len, &esc, 1,
(UChar*)buffer, buffer_len, return_value TSRMLS_CC);
bptr = buf;
tptr = (char *)php_fgetcsv_lookup_trailing_spaces(buf, buf_len, delimiter TSRMLS_CC);
line_end_len = buf_len - (size_t)(tptr - buf);
line_end = limit = tptr;
/* Types converted, free storage */
efree(delim);
efree(enc);
efree(buffer);
} else {
/* Binary stream with binary delimiter/enclosures/prefetch */
php_fgetcsv_ex(stream, delim, delim_len, enc, enc_len, "\\", 1, buffer, buffer_len, return_value TSRMLS_CC);
}
}
/* reserve workspace for building each individual field */
temp_len = buf_len;
temp = emalloc(temp_len + line_end_len + 1);
typedef enum _php_fgetcsv_state {
PHP_FGETCSV_READY,
PHP_FGETCSV_FIELD_NO_ENC,
PHP_FGETCSV_FIELD_WITH_ENC,
PHP_FGETCSV_POST_ENC,
} php_fgetcsv_state;
#define PHP_FGETCSV_BIN_CHECK(p, e, m, mlen) ((p) < (e) && (((mlen) == 1 && *(p) == *(m)) || ((mlen) > 1 && (((e) - (p)) >= (mlen)) && memcmp((p), (m), (mlen)) == 0)))
/* Binary mode fgetcsv */
PHPAPI void php_fgetcsv_ex(php_stream *stream,
char *delimiter, int delimiter_len,
char *enclosure, int enclosure_len,
char *escape, int escape_len,
char *buffer, int buffer_len,
zval *return_value TSRMLS_DC)
{
php_fgetcsv_state state = PHP_FGETCSV_READY;
char *p = buffer, *e = buffer + buffer_len, *field_start = NULL, *field_end = NULL;
/* Initialize return array */
array_init(return_value);
/* Main loop to read CSV fields */
/* NB this routine will return a single null entry for a blank line */
while(p < e) {
switch (state) {
case PHP_FGETCSV_READY:
ready_state:
/* Ready to start a new field */
do {
char *comp_end, *hunk_begin;
tptr = temp;
/* 1. Strip any leading space */
for (;;) {
inc_len = (bptr < limit ? (*bptr == '\0' ? 1: php_mblen(bptr, limit - bptr)): 0);
switch (inc_len) {
case -2:
case -1:
inc_len = 1;
php_mblen(NULL, 0);
/* Is there nothing left to scan? */
if (*p == '\r' || *p == '\n') {
/* Terminal delimiter, treat as empty field */
p++;
add_next_index_stringl(return_value, "", 0, 1);
break;
case 0:
goto quit_loop_1;
case 1:
if (!isspace((int)*(unsigned char *)bptr) || *bptr == delimiter) {
goto quit_loop_1;
}
/* Is it enclosed? */
if (PHP_FGETCSV_BIN_CHECK(p, e, enclosure, enclosure_len)) {
/* Enclosure encountered, switch state */
state = PHP_FGETCSV_FIELD_WITH_ENC;
p += enclosure_len;
field_start = p;
break;
}
/* Is it an immediate delimiter? */
if (PHP_FGETCSV_BIN_CHECK(p, e, delimiter, delimiter_len)) {
/* Immediate delimiter, treate as empty field */
p += delimiter_len;
add_next_index_stringl(return_value, "", 0, 1);
break;
}
/* Whitespace? */
if (*p == ' ' || *p == '\t') {
p++;
if (p >= e) break;
goto ready_state;
}
/* Is it an escape character? */
if (PHP_FGETCSV_BIN_CHECK(p, e, escape, escape_len)) {
/* Skip escape sequence and let next char be treated as literal */
p += escape_len;
/* FALL THROUGH */
}
/* Otherwise, starting a new field without enclosures */
state = PHP_FGETCSV_FIELD_NO_ENC;
field_start = p;
field_end = NULL;
p++;
break;
case PHP_FGETCSV_FIELD_WITH_ENC:
with_enc:
/* Check for ending enclosure */
if (PHP_FGETCSV_BIN_CHECK(p, e, enclosure, enclosure_len)) {
/* Enclosure encountered, is it paired? */
if (PHP_FGETCSV_BIN_CHECK(p + enclosure_len, e, enclosure, enclosure_len)) {
/* Double enclosure gets translated to single enclosure */
memmove(p, p + enclosure_len, (e - p) - enclosure_len);
e -= enclosure_len;
p += enclosure_len;
goto with_enc;
} else {
/* Genuine end enclosure, switch state */
field_end = p;
p += enclosure_len;
state = PHP_FGETCSV_POST_ENC;
goto post_enc;
}
}
/* Check for field escapes */
if (PHP_FGETCSV_BIN_CHECK(p, e, escape, escape_len)) {
p += escape_len + 1;
/* Reprocess for ending enclosures */
goto with_enc;
}
/* Simple character */
if (e - p) {
p++;
}
/* Hungry? */
if (((e - p) < enclosure_len) && stream) {
/* Feed me! */
int new_len;
char *new_buf = php_stream_get_line(stream, NULL_ZSTR, 0, &new_len);
if (new_buf) {
int tmp_len = new_len + e - field_start;
char *tmp = emalloc(tmp_len);
/* Realign scan buffer */
memcpy(tmp, field_start, e - field_start);
memcpy(tmp + (e - field_start), new_buf, new_len);
field_start = tmp;
if (field_end) {
field_end = tmp + (field_end - field_start);
}
efree(buffer);
efree(new_buf);
buffer = tmp;
buffer_len = tmp_len;
p = buffer;
e = buffer + buffer_len;
}
}
if ((e - p) == 0) {
/* Nothing left to consume the buffer, use it */
add_next_index_stringl(return_value, field_start, p - field_start, 1);
/* Loop is dying anyway, but be pedantic */
state = PHP_FGETCSV_READY;
field_start = field_end = NULL;
break;
default:
goto quit_loop_1;
}
bptr += inc_len;
}
break;
case PHP_FGETCSV_POST_ENC:
post_enc:
/* Check for delimiters or EOL */
if (p >= e || *p == '\r' || *p == '\n' || PHP_FGETCSV_BIN_CHECK(p, e, delimiter, delimiter_len)) {
int field_len = field_end - field_start;
char *field;
if ((p - enclosure_len) > field_end) {
/* There's cruft, append it to the proper field */
int cruft_len = p - (field_end + enclosure_len);
field = emalloc(field_len + cruft_len + 1);
memcpy(field, field_start, field_len);
memcpy(field + field_len, field_end + enclosure_len, cruft_len);
field_len += cruft_len;
field[field_len] = 0;
} else {
field = estrndup(field_start, field_end - field_start);
}
add_next_index_stringl(return_value, field, field_len, 0);
/* Reset scanner */
state = PHP_FGETCSV_READY;
field_start = field_end = NULL;
p += delimiter_len;
goto ready_state;
}
/* Queue anything else as cruft */
p++;
break;
case PHP_FGETCSV_FIELD_NO_ENC:
/* Check for escapes */
if (PHP_FGETCSV_BIN_CHECK(p, e, escape, escape_len)) {
p += escape_len + 1;
}
/* Check for delimiter */
if (p >= e || *p == '\r' || *p == '\n' || PHP_FGETCSV_BIN_CHECK(p, e, delimiter, delimiter_len)) {
add_next_index_stringl(return_value, field_start, p - field_start, 1);
/* Reset scanner */
state = PHP_FGETCSV_READY;
field_start = field_end = NULL;
p += delimiter_len;
goto ready_state;
}
/* Simple character */
p++;
break;
}
}
quit_loop_1:
/* 2. Read field, leaving bptr pointing at start of next field */
if (inc_len != 0 && *bptr == enclosure) {
int state = 0;
efree(buffer);
}
/* }}} */
bptr++; /* move on to first character in field */
hunk_begin = bptr;
#define PHP_FGETCSV_UNI_CHECK(p, e, m, mlen) ((p) < (e) && (((mlen) == 1 && *(p) == *(m)) || ((mlen) > 1 && (((e) - (p)) >= (mlen)) && memcmp((p), (m), UBYTES(mlen)) == 0)))
/* 2A. handle enclosure delimited field */
for (;;) {
switch (inc_len) {
case 0:
switch (state) {
case 2:
memcpy(tptr, hunk_begin, bptr - hunk_begin - 1);
tptr += (bptr - hunk_begin - 1);
hunk_begin = bptr;
goto quit_loop_2;
/* Unicode mode fgetcsv */
PHPAPI void php_u_fgetcsv(php_stream *stream,
UChar *delimiter, int delimiter_len,
UChar *enclosure, int enclosure_len,
UChar *escape, int escape_len,
UChar *buffer, int buffer_len,
zval *return_value TSRMLS_DC)
{
php_fgetcsv_state state = PHP_FGETCSV_READY;
UChar *p = buffer, *e = buffer + buffer_len, *field_start = NULL, *field_end = NULL;
case 1:
memcpy(tptr, hunk_begin, bptr - hunk_begin);
tptr += (bptr - hunk_begin);
hunk_begin = bptr;
/* break is omitted intentionally */
array_init(return_value);
case 0: {
char *new_buf;
size_t new_len;
char *new_temp;
while(p < e) {
switch (state) {
case PHP_FGETCSV_READY:
ready_state:
/* Ready to start a new field */
memcpy(tptr, hunk_begin, bptr - hunk_begin);
tptr += (bptr - hunk_begin);
hunk_begin = bptr;
if (hunk_begin != line_end) {
memcpy(tptr, hunk_begin, bptr - hunk_begin);
tptr += (bptr - hunk_begin);
hunk_begin = bptr;
}
/* add the embedded line end to the field */
memcpy(tptr, line_end, line_end_len);
tptr += line_end_len;
if ((new_buf = php_stream_get_line(stream, NULL_ZSTR, 0, &new_len)) == NULL) {
/* we've got an unterminated enclosure,
* assign all the data from the start of
* the enclosure to end of data to the
* last element */
if ((size_t)temp_len > (size_t)(limit - buf)) {
goto quit_loop_2;
}
zval_dtor(return_value);
RETVAL_FALSE;
goto out;
}
temp_len += new_len;
new_temp = erealloc(temp, temp_len);
tptr = new_temp + (size_t)(tptr - temp);
temp = new_temp;
efree(buf);
buf_len = new_len;
bptr = buf = new_buf;
hunk_begin = buf;
line_end = limit = (char *)php_fgetcsv_lookup_trailing_spaces(buf, buf_len, delimiter TSRMLS_CC);
line_end_len = buf_len - (size_t)(limit - buf);
state = 0;
} break;
}
break;
case -2:
case -1:
php_mblen(NULL, 0);
/* break is omitted intentionally */
case 1:
/* we need to determine if the enclosure is
* 'real' or is it escaped */
switch (state) {
case 1: /* escaped */
bptr++;
state = 0;
break;
case 2: /* embedded enclosure ? let's check it */
if (*bptr != enclosure) {
/* real enclosure */
memcpy(tptr, hunk_begin, bptr - hunk_begin - 1);
tptr += (bptr - hunk_begin - 1);
hunk_begin = bptr;
goto quit_loop_2;
}
memcpy(tptr, hunk_begin, bptr - hunk_begin);
tptr += (bptr - hunk_begin);
bptr++;
hunk_begin = bptr;
state = 0;
break;
default:
if (*bptr == escape_char) {
state = 1;
} else if (*bptr == enclosure) {
state = 2;
}
bptr++;
break;
}
break;
default:
switch (state) {
case 2:
/* real enclosure */
memcpy(tptr, hunk_begin, bptr - hunk_begin - 1);
tptr += (bptr - hunk_begin - 1);
hunk_begin = bptr;
goto quit_loop_2;
case 1:
bptr += inc_len;
memcpy(tptr, hunk_begin, bptr - hunk_begin);
tptr += (bptr - hunk_begin);
hunk_begin = bptr;
break;
default:
bptr += inc_len;
break;
}
break;
/* Is there nothing left to scan? */
if (*p == '\r' || *p == '\n') {
/* Terminal delimiter, treat as empty field */
p++;
add_next_index_stringl(return_value, "", 0, 1);
break;
}
inc_len = (bptr < limit ? (*bptr == '\0' ? 1: php_mblen(bptr, limit - bptr)): 0);
}
quit_loop_2:
/* look up for a delimiter */
for (;;) {
switch (inc_len) {
case 0:
goto quit_loop_3;
case -2:
case -1:
inc_len = 1;
php_mblen(NULL, 0);
/* break is omitted intentionally */
case 1:
if (*bptr == delimiter) {
goto quit_loop_3;
}
break;
default:
break;
/* Is it enclosed? */
if (PHP_FGETCSV_UNI_CHECK(p, e, enclosure, enclosure_len)) {
/* Enclosure encountered, switch state */
state = PHP_FGETCSV_FIELD_WITH_ENC;
p += enclosure_len;
field_start = p;
break;
}
bptr += inc_len;
inc_len = (bptr < limit ? (*bptr == '\0' ? 1: php_mblen(bptr, limit - bptr)): 0);
}
quit_loop_3:
memcpy(tptr, hunk_begin, bptr - hunk_begin);
tptr += (bptr - hunk_begin);
bptr += inc_len;
comp_end = tptr;
} else {
/* 2B. Handle non-enclosure field */
hunk_begin = bptr;
for (;;) {
switch (inc_len) {
case 0:
goto quit_loop_4;
case -2:
case -1:
inc_len = 1;
php_mblen(NULL, 0);
/* break is omitted intentionally */
case 1:
if (*bptr == delimiter) {
goto quit_loop_4;
}
break;
default:
break;
/* Is it an immediate delimiter? */
if (PHP_FGETCSV_UNI_CHECK(p, e, delimiter, delimiter_len)) {
/* Immediate delimiter, treate as empty field */
p += delimiter_len;
add_next_index_unicodel(return_value, (UChar*)"", 0, 1);
break;
}
bptr += inc_len;
inc_len = (bptr < limit ? (*bptr == '\0' ? 1: php_mblen(bptr, limit - bptr)): 0);
}
quit_loop_4:
memcpy(tptr, hunk_begin, bptr - hunk_begin);
tptr += (bptr - hunk_begin);
comp_end = (char *)php_fgetcsv_lookup_trailing_spaces(temp, tptr - temp, delimiter TSRMLS_CC);
if (*bptr == delimiter) {
bptr++;
}
/* Whitespace? */
if (*p == ' ' || *p == '\t') {
p++;
if (p >= e) break;
goto ready_state;
}
/* Is it an escape character? */
if (PHP_FGETCSV_UNI_CHECK(p, e, escape, escape_len)) {
/* Skip escape sequence and let next char be treated as literal */
p += escape_len;
/* FALL THROUGH */
}
/* Otherwise, starting a new field without enclosures */
state = PHP_FGETCSV_FIELD_NO_ENC;
field_start = p;
field_end = NULL;
p++;
break;
case PHP_FGETCSV_FIELD_WITH_ENC:
with_enc:
/* Check for ending enclosure */
if (PHP_FGETCSV_UNI_CHECK(p, e, enclosure, enclosure_len)) {
/* Enclosure encountered, is it paired? */
if (PHP_FGETCSV_UNI_CHECK(p + enclosure_len, e, enclosure, enclosure_len)) {
/* Double enclosure gets translated to single enclosure */
memmove(p, p + enclosure_len, (e - p) - enclosure_len);
e -= enclosure_len;
p += enclosure_len;
goto with_enc;
} else {
/* Genuine end enclosure, switch state */
field_end = p;
p += enclosure_len;
state = PHP_FGETCSV_POST_ENC;
goto post_enc;
}
}
/* Check for field escapes */
if (PHP_FGETCSV_UNI_CHECK(p, e, escape, escape_len)) {
p += escape_len + 1;
/* Reprocess for ending enclosures */
goto with_enc;
}
/* Simple character */
if (e - p) {
p++;
}
/* Hungry? */
if (((e - p) < enclosure_len) && stream) {
/* Feed me! */
int new_len;
UChar *new_buf = (UChar*)php_stream_get_line_ex(stream, IS_UNICODE, NULL_ZSTR, 0, 0, &new_len);
if (new_buf) {
int tmp_len = new_len + e - field_start;
UChar *tmp = eumalloc(tmp_len);
/* Realign scan buffer, ick -- expensive */
memcpy(tmp, field_start, UBYTES(e - field_start));
memcpy(tmp + (e - field_start), new_buf, UBYTES(new_len));
field_start = tmp;
if (field_end) {
field_end = tmp + (field_end - field_start);
}
efree(buffer);
efree(new_buf);
buffer = tmp;
buffer_len = tmp_len;
p = buffer;
e = buffer + buffer_len;
}
}
if ((e - p) == 0) {
/* Nothing left to consume the buffer */
add_next_index_unicodel(return_value, field_start, p - field_start, 1);
/* Loop is dying, but cleanup anyway */
state = PHP_FGETCSV_READY;
field_start = field_end = NULL;
break;
}
break;
case PHP_FGETCSV_POST_ENC:
post_enc:
/* Check for delimiters or EOL */
if (p >= e || *p == '\r' || *p == '\n' || PHP_FGETCSV_UNI_CHECK(p, e, delimiter, delimiter_len)) {
int field_len = field_end - field_start;
UChar *field;
if ((p - enclosure_len) > field_end) {
/* There's cruft, append it to the regular field */
int cruft_len = p - (field_end + enclosure_len);
field = eumalloc(field_len + cruft_len + 1);
memcpy(field, field_start, field_len);
memcpy(field + field_len, field_end + enclosure_len, UBYTES(cruft_len));
field_len += cruft_len;
field[field_len] = 0;
} else {
field = eustrndup(field_start, field_len);
}
add_next_index_unicodel(return_value, field, field_len, 0);
/* Reset scanner state */
state = PHP_FGETCSV_READY;
field_start = field_end = NULL;
p += delimiter_len;
goto ready_state;
}
/* Queue anything else as cruft */
p++;
break;
case PHP_FGETCSV_FIELD_NO_ENC:
/* Check for escapes */
if (PHP_FGETCSV_UNI_CHECK(p, e, escape, escape_len)) {
p += escape_len + 1;
}
/* Check for delimiter */
if (p >= e || *p == '\r' || *p == '\n' || PHP_FGETCSV_UNI_CHECK(p, e, delimiter, delimiter_len)) {
add_next_index_unicodel(return_value, field_start, p - field_start, 1);
state = PHP_FGETCSV_READY;
field_start = field_end = NULL;
p += delimiter_len;
goto ready_state;
}
/* Simple character */
p++;
break;
}
}
/* 3. Now pass our field back to php */
*comp_end = '\0';
add_next_index_stringl(return_value, temp, comp_end - temp, 1);
} while (inc_len > 0);
out:
efree(temp);
efree(buf);
efree(buffer);
}
/* }}} */
+5
View File
@@ -77,6 +77,11 @@ PHPAPI int php_copy_file_ex(char *src, char *dest, int src_chk TSRMLS_DC);
PHPAPI int php_mkdir_ex(char *dir, long mode, int options TSRMLS_DC);
PHPAPI int php_mkdir(char *dir, long mode TSRMLS_DC);
PHPAPI void php_fgetcsv(php_stream *stream, char delimiter, char enclosure, size_t buf_len, char *buf, zval *return_value TSRMLS_DC);
PHPAPI void php_fgetcsv_ex(php_stream *stream, char *delimiter, int delimiter_len, char *enclosure, int enclosure_len, char *escape, int escape_len,
char *buffer, int buffer_len, zval *return_value TSRMLS_DC);
PHPAPI void php_u_fgetcsv(php_stream *stream, UChar *delimiter, int delimiter_len, UChar *enclosure, int enclosure_len, UChar *escape, int escape_len,
UChar *buffer, int buffer_len, zval *return_value TSRMLS_DC);
#define META_DEF_BUFSIZE 8192
+38
View File
@@ -46,3 +46,41 @@ array(4) {
2,4,5,line3
"
}
--UEXPECT--
array(4) {
[0]=>
unicode(1) "6"
[1]=>
unicode(1) "7"
[2]=>
unicode(1) "8"
[3]=>
unicode(5) "line1"
}
array(4) {
[0]=>
unicode(1) "1"
[1]=>
unicode(1) "2"
[2]=>
unicode(1) "3"
[3]=>
unicode(186) "line2
2,4,5,line3
2,4,5,line3
2,4,5,line3
2,4,5,line3
2,4,5,line3
2,4,5,line3
2,4,5,line3
2,4,5,line3
2,4,5,line3
2,4,5,line3
2,4,5,line3
2,4,5,line3
2,4,5,line3
2,4,5,line3
2,4,5,line3
"
}
+1 -1
View File
@@ -28,7 +28,7 @@ various fgetcsv() functionality tests
$file = dirname(__FILE__) . 'fgetcsv.csv';
@unlink($file);
foreach ($list as $v) {
$fp = fopen($file, "w");
$fp = fopen($file, "wt");
fwrite($fp, $v . "\n");
fclose($fp);