1
0
mirror of https://github.com/php/php-src.git synced 2026-04-22 07:28:09 +02:00
Files
archived-php-src/ext/standard/url.c
T
Dmitry Stogov 27f38798a1 Fast parameter parsing API
This API is experemental. It may be changed or removed.
It should be used only for really often used functions.
(Keep the original parsing code and wrap usage with #ifndef FAST_ZPP)
2014-07-11 16:32:20 +04:00

797 lines
19 KiB
C

/*
+----------------------------------------------------------------------+
| PHP Version 5 |
+----------------------------------------------------------------------+
| Copyright (c) 1997-2014 The PHP Group |
+----------------------------------------------------------------------+
| This source file is subject to version 3.01 of the PHP license, |
| that is bundled with this package in the file LICENSE, and is |
| available through the world-wide-web at the following url: |
| http://www.php.net/license/3_01.txt |
| If you did not receive a copy of the PHP license and are unable to |
| obtain it through the world-wide-web, please send a note to |
| license@php.net so we can mail you a copy immediately. |
+----------------------------------------------------------------------+
| Author: Jim Winstead <jimw@php.net> |
+----------------------------------------------------------------------+
*/
/* $Id$ */
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <sys/types.h>
#include "php.h"
#include "url.h"
#include "file.h"
#ifdef _OSD_POSIX
#ifndef APACHE
#error On this EBCDIC platform, PHP is only supported as an Apache module.
#else /*APACHE*/
#ifndef CHARSET_EBCDIC
#define CHARSET_EBCDIC /* this machine uses EBCDIC, not ASCII! */
#endif
#include "ebcdic.h"
#endif /*APACHE*/
#endif /*_OSD_POSIX*/
/* {{{ free_url
*/
PHPAPI void php_url_free(php_url *theurl)
{
if (theurl->scheme)
efree(theurl->scheme);
if (theurl->user)
efree(theurl->user);
if (theurl->pass)
efree(theurl->pass);
if (theurl->host)
efree(theurl->host);
if (theurl->path)
efree(theurl->path);
if (theurl->query)
efree(theurl->query);
if (theurl->fragment)
efree(theurl->fragment);
efree(theurl);
}
/* }}} */
/* {{{ php_replace_controlchars
*/
PHPAPI char *php_replace_controlchars_ex(char *str, int len)
{
unsigned char *s = (unsigned char *)str;
unsigned char *e = (unsigned char *)str + len;
if (!str) {
return (NULL);
}
while (s < e) {
if (iscntrl(*s)) {
*s='_';
}
s++;
}
return (str);
}
/* }}} */
PHPAPI char *php_replace_controlchars(char *str)
{
return php_replace_controlchars_ex(str, strlen(str));
}
PHPAPI php_url *php_url_parse(char const *str)
{
return php_url_parse_ex(str, strlen(str));
}
/* {{{ php_url_parse
*/
PHPAPI php_url *php_url_parse_ex(char const *str, int length)
{
char port_buf[6];
php_url *ret = ecalloc(1, sizeof(php_url));
char const *s, *e, *p, *pp, *ue;
s = str;
ue = s + length;
/* parse scheme */
if ((e = memchr(s, ':', length)) && (e - s)) {
/* validate scheme */
p = s;
while (p < e) {
/* scheme = 1*[ lowalpha | digit | "+" | "-" | "." ] */
if (!isalpha(*p) && !isdigit(*p) && *p != '+' && *p != '.' && *p != '-') {
if (e + 1 < ue) {
goto parse_port;
} else {
goto just_path;
}
}
p++;
}
if (*(e + 1) == '\0') { /* only scheme is available */
ret->scheme = estrndup(s, (e - s));
php_replace_controlchars_ex(ret->scheme, (e - s));
goto end;
}
/*
* certain schemas like mailto: and zlib: may not have any / after them
* this check ensures we support those.
*/
if (*(e+1) != '/') {
/* check if the data we get is a port this allows us to
* correctly parse things like a.com:80
*/
p = e + 1;
while (isdigit(*p)) {
p++;
}
if ((*p == '\0' || *p == '/') && (p - e) < 7) {
goto parse_port;
}
ret->scheme = estrndup(s, (e-s));
php_replace_controlchars_ex(ret->scheme, (e - s));
length -= ++e - s;
s = e;
goto just_path;
} else {
ret->scheme = estrndup(s, (e-s));
php_replace_controlchars_ex(ret->scheme, (e - s));
if (*(e+2) == '/') {
s = e + 3;
if (!strncasecmp("file", ret->scheme, sizeof("file"))) {
if (*(e + 3) == '/') {
/* support windows drive letters as in:
file:///c:/somedir/file.txt
*/
if (*(e + 5) == ':') {
s = e + 4;
}
goto nohost;
}
}
} else {
if (!strncasecmp("file", ret->scheme, sizeof("file"))) {
s = e + 1;
goto nohost;
} else {
length -= ++e - s;
s = e;
goto just_path;
}
}
}
} else if (e) { /* no scheme; starts with colon: look for port */
parse_port:
p = e + 1;
pp = p;
while (pp-p < 6 && isdigit(*pp)) {
pp++;
}
if (pp - p > 0 && pp - p < 6 && (*pp == '/' || *pp == '\0')) {
long port;
memcpy(port_buf, p, (pp - p));
port_buf[pp - p] = '\0';
port = strtol(port_buf, NULL, 10);
if (port > 0 && port <= 65535) {
ret->port = (unsigned short) port;
} else {
if (ret->scheme) efree(ret->scheme);
efree(ret);
return NULL;
}
} else if (p == pp && *pp == '\0') {
if (ret->scheme) efree(ret->scheme);
efree(ret);
return NULL;
} else if (*s == '/' && *(s+1) == '/') { /* relative-scheme URL */
s += 2;
} else {
goto just_path;
}
} else if (*s == '/' && *(s+1) == '/') { /* relative-scheme URL */
s += 2;
} else {
just_path:
ue = s + length;
goto nohost;
}
e = ue;
if (!(p = memchr(s, '/', (ue - s)))) {
char *query, *fragment;
query = memchr(s, '?', (ue - s));
fragment = memchr(s, '#', (ue - s));
if (query && fragment) {
if (query > fragment) {
e = fragment;
} else {
e = query;
}
} else if (query) {
e = query;
} else if (fragment) {
e = fragment;
}
} else {
e = p;
}
/* check for login and password */
if ((p = zend_memrchr(s, '@', (e-s)))) {
if ((pp = memchr(s, ':', (p-s)))) {
if ((pp-s) > 0) {
ret->user = estrndup(s, (pp-s));
php_replace_controlchars_ex(ret->user, (pp - s));
}
pp++;
if (p-pp > 0) {
ret->pass = estrndup(pp, (p-pp));
php_replace_controlchars_ex(ret->pass, (p-pp));
}
} else {
ret->user = estrndup(s, (p-s));
php_replace_controlchars_ex(ret->user, (p-s));
}
s = p + 1;
}
/* check for port */
if (*s == '[' && *(e-1) == ']') {
/* Short circuit portscan,
we're dealing with an
IPv6 embedded address */
p = s;
} else {
/* memrchr is a GNU specific extension
Emulate for wide compatability */
for(p = e; p >= s && *p != ':'; p--);
}
if (p >= s && *p == ':') {
if (!ret->port) {
p++;
if (e-p > 5) { /* port cannot be longer then 5 characters */
if (ret->scheme) efree(ret->scheme);
if (ret->user) efree(ret->user);
if (ret->pass) efree(ret->pass);
efree(ret);
return NULL;
} else if (e - p > 0) {
long port;
memcpy(port_buf, p, (e - p));
port_buf[e - p] = '\0';
port = strtol(port_buf, NULL, 10);
if (port > 0 && port <= 65535) {
ret->port = (unsigned short)port;
} else {
if (ret->scheme) efree(ret->scheme);
if (ret->user) efree(ret->user);
if (ret->pass) efree(ret->pass);
efree(ret);
return NULL;
}
}
p--;
}
} else {
p = e;
}
/* check if we have a valid host, if we don't reject the string as url */
if ((p-s) < 1) {
if (ret->scheme) efree(ret->scheme);
if (ret->user) efree(ret->user);
if (ret->pass) efree(ret->pass);
efree(ret);
return NULL;
}
ret->host = estrndup(s, (p-s));
php_replace_controlchars_ex(ret->host, (p - s));
if (e == ue) {
return ret;
}
s = e;
nohost:
if ((p = memchr(s, '?', (ue - s)))) {
pp = strchr(s, '#');
if (pp && pp < p) {
if (pp - s) {
ret->path = estrndup(s, (pp-s));
php_replace_controlchars_ex(ret->path, (pp - s));
}
p = pp;
goto label_parse;
}
if (p - s) {
ret->path = estrndup(s, (p-s));
php_replace_controlchars_ex(ret->path, (p - s));
}
if (pp) {
if (pp - ++p) {
ret->query = estrndup(p, (pp-p));
php_replace_controlchars_ex(ret->query, (pp - p));
}
p = pp;
goto label_parse;
} else if (++p - ue) {
ret->query = estrndup(p, (ue-p));
php_replace_controlchars_ex(ret->query, (ue - p));
}
} else if ((p = memchr(s, '#', (ue - s)))) {
if (p - s) {
ret->path = estrndup(s, (p-s));
php_replace_controlchars_ex(ret->path, (p - s));
}
label_parse:
p++;
if (ue - p) {
ret->fragment = estrndup(p, (ue-p));
php_replace_controlchars_ex(ret->fragment, (ue - p));
}
} else {
ret->path = estrndup(s, (ue-s));
php_replace_controlchars_ex(ret->path, (ue - s));
}
end:
return ret;
}
/* }}} */
/* {{{ proto mixed parse_url(string url, [int url_component])
Parse a URL and return its components */
PHP_FUNCTION(parse_url)
{
char *str;
int str_len;
php_url *resource;
long key = -1;
if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &str_len, &key) == FAILURE) {
return;
}
resource = php_url_parse_ex(str, str_len);
if (resource == NULL) {
/* @todo Find a method to determine why php_url_parse_ex() failed */
RETURN_FALSE;
}
if (key > -1) {
switch (key) {
case PHP_URL_SCHEME:
if (resource->scheme != NULL) RETVAL_STRING(resource->scheme);
break;
case PHP_URL_HOST:
if (resource->host != NULL) RETVAL_STRING(resource->host);
break;
case PHP_URL_PORT:
if (resource->port != 0) RETVAL_LONG(resource->port);
break;
case PHP_URL_USER:
if (resource->user != NULL) RETVAL_STRING(resource->user);
break;
case PHP_URL_PASS:
if (resource->pass != NULL) RETVAL_STRING(resource->pass);
break;
case PHP_URL_PATH:
if (resource->path != NULL) RETVAL_STRING(resource->path);
break;
case PHP_URL_QUERY:
if (resource->query != NULL) RETVAL_STRING(resource->query);
break;
case PHP_URL_FRAGMENT:
if (resource->fragment != NULL) RETVAL_STRING(resource->fragment);
break;
default:
php_error_docref(NULL TSRMLS_CC, E_WARNING, "Invalid URL component identifier %ld", key);
RETVAL_FALSE;
}
goto done;
}
/* allocate an array for return */
array_init(return_value);
/* add the various elements to the array */
if (resource->scheme != NULL)
add_assoc_string(return_value, "scheme", resource->scheme);
if (resource->host != NULL)
add_assoc_string(return_value, "host", resource->host);
if (resource->port != 0)
add_assoc_long(return_value, "port", resource->port);
if (resource->user != NULL)
add_assoc_string(return_value, "user", resource->user);
if (resource->pass != NULL)
add_assoc_string(return_value, "pass", resource->pass);
if (resource->path != NULL)
add_assoc_string(return_value, "path", resource->path);
if (resource->query != NULL)
add_assoc_string(return_value, "query", resource->query);
if (resource->fragment != NULL)
add_assoc_string(return_value, "fragment", resource->fragment);
done:
php_url_free(resource);
}
/* }}} */
/* {{{ php_htoi
*/
static int php_htoi(char *s)
{
int value;
int c;
c = ((unsigned char *)s)[0];
if (isupper(c))
c = tolower(c);
value = (c >= '0' && c <= '9' ? c - '0' : c - 'a' + 10) * 16;
c = ((unsigned char *)s)[1];
if (isupper(c))
c = tolower(c);
value += c >= '0' && c <= '9' ? c - '0' : c - 'a' + 10;
return (value);
}
/* }}} */
/* rfc1738:
...The characters ";",
"/", "?", ":", "@", "=" and "&" are the characters which may be
reserved for special meaning within a scheme...
...Thus, only alphanumerics, the special characters "$-_.+!*'(),", and
reserved characters used for their reserved purposes may be used
unencoded within a URL...
For added safety, we only leave -_. unencoded.
*/
static unsigned char hexchars[] = "0123456789ABCDEF";
/* {{{ php_url_encode
*/
PHPAPI zend_string *php_url_encode(char const *s, int len)
{
register unsigned char c;
unsigned char *to;
unsigned char const *from, *end;
zend_string *start;
from = (unsigned char *)s;
end = (unsigned char *)s + len;
start = STR_ALLOC(3 * len, 0);
to = (unsigned char*)start->val;
while (from < end) {
c = *from++;
if (c == ' ') {
*to++ = '+';
#ifndef CHARSET_EBCDIC
} else if ((c < '0' && c != '-' && c != '.') ||
(c < 'A' && c > '9') ||
(c > 'Z' && c < 'a' && c != '_') ||
(c > 'z')) {
to[0] = '%';
to[1] = hexchars[c >> 4];
to[2] = hexchars[c & 15];
to += 3;
#else /*CHARSET_EBCDIC*/
} else if (!isalnum(c) && strchr("_-.", c) == NULL) {
/* Allow only alphanumeric chars and '_', '-', '.'; escape the rest */
to[0] = '%';
to[1] = hexchars[os_toascii[c] >> 4];
to[2] = hexchars[os_toascii[c] & 15];
to += 3;
#endif /*CHARSET_EBCDIC*/
} else {
*to++ = c;
}
}
*to = '\0';
start = STR_REALLOC(start, to - (unsigned char*)start->val, 0);
return start;
}
/* }}} */
/* {{{ proto string urlencode(string str)
URL-encodes string */
PHP_FUNCTION(urlencode)
{
zend_string *in_str;
#ifndef FAST_ZPP
if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "S", &in_str) == FAILURE) {
return;
}
#else
ZEND_PARSE_PARAMETERS_START(1, 1)
Z_PARAM_STR(in_str)
ZEND_PARSE_PARAMETERS_END();
#endif
RETURN_STR(php_url_encode(in_str->val, in_str->len));
}
/* }}} */
/* {{{ proto string urldecode(string str)
Decodes URL-encoded string */
PHP_FUNCTION(urldecode)
{
zend_string *in_str, *out_str;
#ifndef FAST_ZPP
if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "S", &in_str) == FAILURE) {
return;
}
#else
ZEND_PARSE_PARAMETERS_START(1, 1)
Z_PARAM_STR(in_str)
ZEND_PARSE_PARAMETERS_END();
#endif
out_str = STR_INIT(in_str->val, in_str->len, 0);
out_str->len = php_url_decode(out_str->val, out_str->len);
RETURN_NEW_STR(out_str);
}
/* }}} */
/* {{{ php_url_decode
*/
PHPAPI int php_url_decode(char *str, int len)
{
char *dest = str;
char *data = str;
while (len--) {
if (*data == '+') {
*dest = ' ';
}
else if (*data == '%' && len >= 2 && isxdigit((int) *(data + 1))
&& isxdigit((int) *(data + 2))) {
#ifndef CHARSET_EBCDIC
*dest = (char) php_htoi(data + 1);
#else
*dest = os_toebcdic[(char) php_htoi(data + 1)];
#endif
data += 2;
len -= 2;
} else {
*dest = *data;
}
data++;
dest++;
}
*dest = '\0';
return dest - str;
}
/* }}} */
/* {{{ php_raw_url_encode
*/
PHPAPI zend_string *php_raw_url_encode(char const *s, int len)
{
register int x, y;
zend_string *str;
str = STR_ALLOC(3 * len, 0);
for (x = 0, y = 0; len--; x++, y++) {
str->val[y] = (unsigned char) s[x];
#ifndef CHARSET_EBCDIC
if ((str->val[y] < '0' && str->val[y] != '-' && str->val[y] != '.') ||
(str->val[y] < 'A' && str->val[y] > '9') ||
(str->val[y] > 'Z' && str->val[y] < 'a' && str->val[y] != '_') ||
(str->val[y] > 'z' && str->val[y] != '~')) {
str->val[y++] = '%';
str->val[y++] = hexchars[(unsigned char) s[x] >> 4];
str->val[y] = hexchars[(unsigned char) s[x] & 15];
#else /*CHARSET_EBCDIC*/
if (!isalnum(str->val[y]) && str->valchr("_-.~", str->val[y]) != NULL) {
str->val[y++] = '%';
str->val[y++] = hexchars[os_toascii[(unsigned char) s[x]] >> 4];
str->val[y] = hexchars[os_toascii[(unsigned char) s[x]] & 15];
#endif /*CHARSET_EBCDIC*/
}
}
str->val[y] = '\0';
str = STR_REALLOC(str, y, 0);
return str;
}
/* }}} */
/* {{{ proto string rawurlencode(string str)
URL-encodes string */
PHP_FUNCTION(rawurlencode)
{
zend_string *in_str;
#ifndef FAST_ZPP
if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "S", &in_str) == FAILURE) {
return;
}
#else
ZEND_PARSE_PARAMETERS_START(1, 1)
Z_PARAM_STR(in_str)
ZEND_PARSE_PARAMETERS_END();
#endif
RETURN_STR(php_raw_url_encode(in_str->val, in_str->len));
}
/* }}} */
/* {{{ proto string rawurldecode(string str)
Decodes URL-encodes string */
PHP_FUNCTION(rawurldecode)
{
zend_string *in_str, *out_str;
#ifndef FAST_ZPP
if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "S", &in_str) == FAILURE) {
return;
}
#else
ZEND_PARSE_PARAMETERS_START(1, 1)
Z_PARAM_STR(in_str)
ZEND_PARSE_PARAMETERS_END();
#endif
out_str = STR_INIT(in_str->val, in_str->len, 0);
out_str->len = php_raw_url_decode(out_str->val, out_str->len);
RETURN_NEW_STR(out_str);
}
/* }}} */
/* {{{ php_raw_url_decode
*/
PHPAPI int php_raw_url_decode(char *str, int len)
{
char *dest = str;
char *data = str;
while (len--) {
if (*data == '%' && len >= 2 && isxdigit((int) *(data + 1))
&& isxdigit((int) *(data + 2))) {
#ifndef CHARSET_EBCDIC
*dest = (char) php_htoi(data + 1);
#else
*dest = os_toebcdic[(char) php_htoi(data + 1)];
#endif
data += 2;
len -= 2;
} else {
*dest = *data;
}
data++;
dest++;
}
*dest = '\0';
return dest - str;
}
/* }}} */
/* {{{ proto array get_headers(string url[, int format])
fetches all the headers sent by the server in response to a HTTP request */
PHP_FUNCTION(get_headers)
{
char *url;
int url_len;
php_stream_context *context;
php_stream *stream;
zval *prev_val, *hdr = NULL, *h;
HashTable *hashT;
long format = 0;
if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &url, &url_len, &format) == FAILURE) {
return;
}
context = FG(default_context) ? FG(default_context) : (FG(default_context) = php_stream_context_alloc(TSRMLS_C));
if (!(stream = php_stream_open_wrapper_ex(url, "r", REPORT_ERRORS | STREAM_USE_URL | STREAM_ONLY_GET_HEADERS, NULL, context))) {
RETURN_FALSE;
}
if (Z_TYPE(stream->wrapperdata) != IS_ARRAY) {
php_stream_close(stream);
RETURN_FALSE;
}
array_init(return_value);
/* check for curl-wrappers that provide headers via a special "headers" element */
if ((h = zend_hash_str_find(HASH_OF(&stream->wrapperdata), "headers", sizeof("headers")-1)) != NULL && Z_TYPE_P(h) == IS_ARRAY) {
/* curl-wrappers don't load data until the 1st read */
if (!Z_ARRVAL_P(h)->nNumOfElements) {
php_stream_getc(stream);
}
h = zend_hash_str_find(HASH_OF(&stream->wrapperdata), "headers", sizeof("headers")-1);
hashT = Z_ARRVAL_P(h);
} else {
hashT = HASH_OF(&stream->wrapperdata);
}
ZEND_HASH_FOREACH_VAL(hashT, hdr) {
if (Z_TYPE_P(hdr) != IS_STRING) {
continue;
}
if (!format) {
no_name_header:
add_next_index_str(return_value, STR_COPY(Z_STR_P(hdr)));
} else {
char c;
char *s, *p;
if ((p = strchr(Z_STRVAL_P(hdr), ':'))) {
c = *p;
*p = '\0';
s = p + 1;
while (isspace((int)*(unsigned char *)s)) {
s++;
}
if ((prev_val = zend_hash_str_find(HASH_OF(return_value), Z_STRVAL_P(hdr), (p - Z_STRVAL_P(hdr)))) == NULL) {
add_assoc_stringl_ex(return_value, Z_STRVAL_P(hdr), (p - Z_STRVAL_P(hdr) + 1), s, (Z_STRLEN_P(hdr) - (s - Z_STRVAL_P(hdr))));
} else { /* some headers may occur more then once, therefor we need to remake the string into an array */
convert_to_array(prev_val);
add_next_index_stringl(prev_val, s, (Z_STRLEN_P(hdr) - (s - Z_STRVAL_P(hdr))));
}
*p = c;
} else {
goto no_name_header;
}
}
} ZEND_HASH_FOREACH_END();
php_stream_close(stream);
}
/* }}} */
/*
* Local variables:
* tab-width: 4
* c-basic-offset: 4
* End:
* vim600: sw=4 ts=4 fdm=marker
* vim<600: sw=4 ts=4
*/