mirror of
https://github.com/php/php-src.git
synced 2026-03-31 04:32:19 +02:00
Currently, php-fuzz-mbstring only confirms that no crashes (including ASAN violations) occur when converting text from one encoding to another. Try performing each conversion operation with two different sizes for the intermediate buffer which is used to pass data from the decoder to the encoder. If the encoding conversion code is correct, the size of that intermediate buffer shouldn't matter; we should always get exactly the same results. This is a much stricter test, which is more likely to catch bugs.
119 lines
4.3 KiB
C
119 lines
4.3 KiB
C
/*
   +----------------------------------------------------------------------+
   | Copyright (c) The PHP Group                                          |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | https://www.php.net/license/3_01.txt                                 |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
   | Authors: Stanislav Malyshev <stas@php.net>                           |
   +----------------------------------------------------------------------+
 */
|
|
|
|
|
|
#include "zend.h"
|
|
#include "fuzzer.h"
|
|
#include "fuzzer-sapi.h"
|
|
#include "ext/mbstring/mbstring.h"
|
|
|
|
zend_string* convert_encoding(const uint8_t *Data, size_t Size, const mbfl_encoding *FromEncoding, const mbfl_encoding *ToEncoding, size_t BufSize, unsigned int *NumErrors)
|
|
{
|
|
uint32_t *wchar_buf = ecalloc(BufSize, sizeof(uint32_t));
|
|
unsigned int state = 0;
|
|
|
|
mb_convert_buf buf;
|
|
mb_convert_buf_init(&buf, Size, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR);
|
|
|
|
while (Size) {
|
|
size_t out_len = FromEncoding->to_wchar((unsigned char**)&Data, &Size, wchar_buf, BufSize, &state);
|
|
ZEND_ASSERT(out_len <= BufSize);
|
|
ToEncoding->from_wchar(wchar_buf, out_len, &buf, !Size);
|
|
}
|
|
|
|
*NumErrors = buf.errors;
|
|
zend_string *result = mb_convert_buf_result(&buf, ToEncoding);
|
|
efree(wchar_buf);
|
|
return result;
|
|
}
|
|
|
|
/* Assert that two zend_strings are byte-for-byte identical.
 *
 * First asserts that both strings have the same length, then asserts that every
 * byte of `str1` equals the byte at the same offset in `str2`. Failures are
 * reported through ZEND_ASSERT; nothing is returned. */
void assert_zend_string_eql(zend_string *str1, zend_string *str2)
{
	const size_t len = ZSTR_LEN(str1);

	ZEND_ASSERT(len == ZSTR_LEN(str2));

	/* The index must be a size_t: string lengths are size_t, so an `int` counter
	 * would be compared as signed vs. unsigned and would overflow (undefined
	 * behavior) on strings longer than INT_MAX */
	for (size_t i = 0; i < len; i++) {
		ZEND_ASSERT(ZSTR_VAL(str1)[i] == ZSTR_VAL(str2)[i]);
	}
}
|
|
|
|
int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
|
|
const uint8_t *Comma1 = memchr(Data, ',', Size);
|
|
if (!Comma1) {
|
|
return 0;
|
|
}
|
|
|
|
size_t ToEncodingNameLen = Comma1 - Data;
|
|
char *ToEncodingName = estrndup((char *) Data, ToEncodingNameLen);
|
|
Data = Comma1 + 1;
|
|
Size -= ToEncodingNameLen + 1;
|
|
|
|
const uint8_t *Comma2 = memchr(Data, ',', Size);
|
|
if (!Comma2) {
|
|
efree(ToEncodingName);
|
|
return 0;
|
|
}
|
|
|
|
size_t FromEncodingNameLen = Comma2 - Data;
|
|
char *FromEncodingName = estrndup((char *) Data, FromEncodingNameLen);
|
|
Data = Comma2 + 1;
|
|
Size -= FromEncodingNameLen + 1;
|
|
|
|
const mbfl_encoding *ToEncoding = mbfl_name2encoding(ToEncodingName);
|
|
const mbfl_encoding *FromEncoding = mbfl_name2encoding(FromEncodingName);
|
|
|
|
if (!ToEncoding || !FromEncoding || Size < 2 || fuzzer_request_startup() == FAILURE) {
|
|
efree(ToEncodingName);
|
|
efree(FromEncodingName);
|
|
return 0;
|
|
}
|
|
|
|
/* Rather than converting an entire (possibly very long) string at once, mbstring converts
|
|
* strings 'chunk by chunk'; the decoder will run until it fills up its output buffer with
|
|
* wchars, then the encoder will process those wchars, then the decoder runs again until it
|
|
* again fills up its output buffer, and so on
|
|
*
|
|
* The most error-prone part of the decoder/encoder code is where we exit a decoder/encoder
|
|
* function and save its state to allow later resumption
|
|
* To stress-test that aspect of the decoders/encoders, try performing an encoding conversion
|
|
* operation with different, random buffer sizes
|
|
* If the code is correct, the result should always be the same either way */
|
|
size_t bufsize1 = *Data++;
|
|
size_t bufsize2 = *Data++;
|
|
bufsize1 = MAX(bufsize1, MBSTRING_MIN_WCHAR_BUFSIZE);
|
|
bufsize2 = MAX(bufsize2, MBSTRING_MIN_WCHAR_BUFSIZE);
|
|
Size -= 2;
|
|
|
|
unsigned int errors1 = 0, errors2 = 0;
|
|
|
|
zend_string *Result1 = convert_encoding(Data, Size, FromEncoding, ToEncoding, bufsize1, &errors1);
|
|
zend_string *Result2 = convert_encoding(Data, Size, FromEncoding, ToEncoding, bufsize2, &errors2);
|
|
|
|
assert_zend_string_eql(Result1, Result2);
|
|
ZEND_ASSERT(errors1 == errors2);
|
|
|
|
zend_string_release(Result1);
|
|
zend_string_release(Result2);
|
|
efree(ToEncodingName);
|
|
efree(FromEncodingName);
|
|
|
|
fuzzer_request_shutdown();
|
|
return 0;
|
|
}
|
|
|
|
int LLVMFuzzerInitialize(int *argc, char ***argv) {
|
|
fuzzer_init_php(NULL);
|
|
return 0;
|
|
}
|