Welcome! Log In Create A New Profile

Advanced

[njs] Introduced UTF-8 decoder according to WHATWG encoding spec.

Alexander Borisov
July 15, 2020 12:22PM
details: https://hg.nginx.org/njs/rev/855edd76bdb6
branches:
changeset: 1472:855edd76bdb6
user: Alexander Borisov <alexander.borisov@nginx.com>
date: Wed Jul 15 19:19:19 2020 +0300
description:
Introduced UTF-8 decoder according to WHATWG encoding spec.

diffstat:

src/njs_json.c | 8 +-
src/njs_parser.c | 73 ++++++---
src/njs_string.c | 293 +++++++++++++++++++++----------------
src/njs_unicode.h | 4 +
src/njs_utf8.c | 333 ++++++++++++++++++++----------------------
src/njs_utf8.h | 40 ++--
src/test/njs_unit_test.c | 44 ++++-
src/test/unicode_unit_test.c | 53 ++++--
8 files changed, 466 insertions(+), 382 deletions(-)

diffs (truncated from 1394 to 1000 lines):

diff -r 63106bd2e9bf -r 855edd76bdb6 src/njs_json.c
--- a/src/njs_json.c Wed Jul 15 19:19:18 2020 +0300
+++ b/src/njs_json.c Wed Jul 15 19:19:19 2020 +0300
@@ -728,7 +728,7 @@ njs_json_parse_string(njs_json_parse_ctx
if (njs_surrogate_any(utf)) {

if (utf > 0xdbff || p[0] != '\\' || p[1] != 'u') {
- s = njs_utf8_encode(s, NJS_UTF8_REPLACEMENT);
+ s = njs_utf8_encode(s, NJS_UNICODE_REPLACEMENT);
continue;
}

@@ -741,12 +741,12 @@ njs_json_parse_string(njs_json_parse_ctx
utf = njs_string_surrogate_pair(utf, utf_low);

} else if (njs_surrogate_leading(utf_low)) {
- utf = NJS_UTF8_REPLACEMENT;
- s = njs_utf8_encode(s, NJS_UTF8_REPLACEMENT);
+ utf = NJS_UNICODE_REPLACEMENT;
+ s = njs_utf8_encode(s, NJS_UNICODE_REPLACEMENT);

} else {
utf = utf_low;
- s = njs_utf8_encode(s, NJS_UTF8_REPLACEMENT);
+ s = njs_utf8_encode(s, NJS_UNICODE_REPLACEMENT);
}
}

diff -r 63106bd2e9bf -r 855edd76bdb6 src/njs_parser.c
--- a/src/njs_parser.c Wed Jul 15 19:19:18 2020 +0300
+++ b/src/njs_parser.c Wed Jul 15 19:19:19 2020 +0300
@@ -7896,11 +7896,12 @@ njs_int_t
njs_parser_string_create(njs_vm_t *vm, njs_lexer_token_t *token,
njs_value_t *value)
{
- u_char *dst;
- ssize_t size, length;
- uint32_t cp;
- njs_str_t *src;
- const u_char *p, *end;
+ u_char *dst;
+ ssize_t size, length;
+ uint32_t cp;
+ njs_str_t *src;
+ const u_char *p, *end;
+ njs_unicode_decode_t ctx;

src = &token->text;

@@ -7914,10 +7915,17 @@ njs_parser_string_create(njs_vm_t *vm, n
p = src->start;
end = src->start + src->length;

+ njs_utf8_decode_init(&ctx);
+
while (p < end) {
- cp = njs_utf8_safe_decode(&p, end);
-
- dst = njs_utf8_encode(dst, cp);
+ cp = njs_utf8_decode(&ctx, &p, end);
+
+ if (cp <= NJS_UNICODE_MAX_CODEPOINT) {
+ dst = njs_utf8_encode(dst, cp);
+
+ } else {
+ dst = njs_utf8_encode(dst, NJS_UNICODE_REPLACEMENT);
+ }
}

if (length > NJS_STRING_MAP_STRIDE && size != length) {
@@ -7932,12 +7940,13 @@ static njs_token_type_t
njs_parser_escape_string_create(njs_parser_t *parser, njs_lexer_token_t *token,
njs_value_t *value)
{
- u_char c, *start, *dst;
- size_t size, length, hex_length;
- uint64_t cp, cp_pair;
- njs_int_t ret;
- njs_str_t *string;
- const u_char *src, *end, *hex_end;
+ u_char c, *start, *dst;
+ size_t size, length, hex_length;
+ uint64_t cp, cp_pair;
+ njs_int_t ret;
+ njs_str_t *string;
+ const u_char *src, *end, *hex_end;
+ njs_unicode_decode_t ctx;

ret = njs_parser_escape_string_calc_length(parser, token, &size, &length);
if (njs_slow_path(ret != NJS_OK)) {
@@ -8053,7 +8062,13 @@ njs_parser_escape_string_create(njs_pars

src--;

- cp = njs_utf8_safe_decode2(&src, end);
+ njs_utf8_decode_init(&ctx);
+
+ cp = njs_utf8_decode(&ctx, &src, end);
+ if (cp > NJS_UNICODE_MAX_CODEPOINT) {
+ cp = NJS_UNICODE_REPLACEMENT;
+ }
+
dst = njs_utf8_encode(dst, cp);

continue;
@@ -8076,12 +8091,12 @@ njs_parser_escape_string_create(njs_pars
cp = njs_string_surrogate_pair(cp_pair, cp);

} else if (njs_slow_path(njs_surrogate_leading(cp))) {
- cp = NJS_UTF8_REPLACEMENT;
+ cp = NJS_UNICODE_REPLACEMENT;

dst = njs_utf8_encode(dst, (uint32_t) cp);

} else {
- dst = njs_utf8_encode(dst, NJS_UTF8_REPLACEMENT);
+ dst = njs_utf8_encode(dst, NJS_UNICODE_REPLACEMENT);
}

cp_pair = 0;
@@ -8092,7 +8107,7 @@ njs_parser_escape_string_create(njs_pars
continue;
}

- cp = NJS_UTF8_REPLACEMENT;
+ cp = NJS_UNICODE_REPLACEMENT;
}

dst = njs_utf8_encode(dst, (uint32_t) cp);
@@ -8116,10 +8131,11 @@ static njs_int_t
njs_parser_escape_string_calc_length(njs_parser_t *parser,
njs_lexer_token_t *token, size_t *out_size, size_t *out_length)
{
- size_t size, length, hex_length;
- uint64_t cp, cp_pair;
- njs_str_t *string;
- const u_char *ptr, *src, *end, *hex_end;
+ size_t size, length, hex_length;
+ uint64_t cp, cp_pair;
+ njs_str_t *string;
+ const u_char *ptr, *src, *end, *hex_end;
+ njs_unicode_decode_t ctx;

size = 0;
length = 0;
@@ -8173,7 +8189,12 @@ njs_parser_escape_string_calc_length(njs
}

if (*src >= 0x80) {
- cp = njs_utf8_safe_decode2(&src, end);
+ njs_utf8_decode_init(&ctx);
+
+ cp = njs_utf8_decode(&ctx, &src, end);
+ if (cp > NJS_UNICODE_MAX_CODEPOINT) {
+ cp = NJS_UNICODE_REPLACEMENT;
+ }

size += njs_utf8_size(cp);
length++;
@@ -8220,13 +8241,13 @@ njs_parser_escape_string_calc_length(njs
cp = njs_string_surrogate_pair(cp_pair, cp);

} else if (njs_slow_path(njs_surrogate_leading(cp))) {
- cp = NJS_UTF8_REPLACEMENT;
+ cp = NJS_UNICODE_REPLACEMENT;

size += njs_utf8_size(cp);
length++;

} else {
- size += njs_utf8_size(NJS_UTF8_REPLACEMENT);
+ size += njs_utf8_size(NJS_UNICODE_REPLACEMENT);
length++;
}

@@ -8238,7 +8259,7 @@ njs_parser_escape_string_calc_length(njs
continue;
}

- cp = NJS_UTF8_REPLACEMENT;
+ cp = NJS_UNICODE_REPLACEMENT;
}

size += njs_utf8_size(cp);
diff -r 63106bd2e9bf -r 855edd76bdb6 src/njs_string.c
--- a/src/njs_string.c Wed Jul 15 19:19:18 2020 +0300
+++ b/src/njs_string.c Wed Jul 15 19:19:19 2020 +0300
@@ -20,10 +20,8 @@ static njs_int_t njs_string_slice_prop(n
njs_slice_prop_t *slice, njs_value_t *args, njs_uint_t nargs);
static njs_int_t njs_string_slice_args(njs_vm_t *vm, njs_slice_prop_t *slice,
njs_value_t *args, njs_uint_t nargs);
-static njs_int_t njs_string_from_char_code(njs_vm_t *vm,
- njs_value_t *args, njs_uint_t nargs, njs_index_t unused);
-static njs_int_t njs_string_from_code_point(njs_vm_t *vm, njs_value_t *args,
- njs_uint_t nargs, njs_index_t unused);
+static njs_int_t njs_string_from_char_code(njs_vm_t *vm, njs_value_t *args,
+ njs_uint_t nargs, njs_index_t is_point);
static njs_int_t njs_string_bytes_from(njs_vm_t *vm, njs_value_t *args,
njs_uint_t nargs, njs_index_t unused);
static njs_int_t njs_string_bytes_from_array_like(njs_vm_t *vm,
@@ -545,7 +543,7 @@ static const njs_object_prop_t njs_stri
{
.type = NJS_PROPERTY,
.name = njs_string("fromCharCode"),
- .value = njs_native_function(njs_string_from_char_code, 1),
+ .value = njs_native_function2(njs_string_from_char_code, 1, 0),
.writable = 1,
.configurable = 1,
},
@@ -553,7 +551,7 @@ static const njs_object_prop_t njs_stri
{
.type = NJS_PROPERTY,
.name = njs_string("fromCodePoint"),
- .value = njs_native_function(njs_string_from_code_point, 1),
+ .value = njs_native_function2(njs_string_from_char_code, 1, 1),
.writable = 1,
.configurable = 1,
},
@@ -1029,13 +1027,14 @@ static njs_int_t
njs_string_prototype_to_bytes(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs,
njs_index_t unused)
{
- u_char *p;
- size_t length;
- uint32_t byte;
- njs_int_t ret;
- const u_char *s, *end;
- njs_slice_prop_t slice;
- njs_string_prop_t string;
+ u_char *p;
+ size_t length;
+ uint32_t byte;
+ njs_int_t ret;
+ const u_char *s, *end;
+ njs_slice_prop_t slice;
+ njs_string_prop_t string;
+ njs_unicode_decode_t ctx;

ret = njs_string_object_validate(vm, njs_arg(args, nargs, 0));
if (njs_slow_path(ret != NJS_OK)) {
@@ -1064,8 +1063,10 @@ njs_string_prototype_to_bytes(njs_vm_t *

length = slice.length;

+ njs_utf8_decode_init(&ctx);
+
while (length != 0 && s < end) {
- byte = njs_utf8_decode(&s, end);
+ byte = njs_utf8_decode(&ctx, &s, end);

if (njs_slow_path(byte > 0xFF)) {
njs_release(vm, &vm->retval);
@@ -1463,13 +1464,14 @@ static njs_int_t
njs_string_prototype_char_code_at(njs_vm_t *vm, njs_value_t *args,
njs_uint_t nargs, njs_index_t unused)
{
- double num;
- size_t length;
- int64_t index;
- uint32_t code;
- njs_int_t ret;
- const u_char *start, *end;
- njs_string_prop_t string;
+ double num;
+ size_t length;
+ int64_t index;
+ uint32_t code;
+ njs_int_t ret;
+ const u_char *start, *end;
+ njs_string_prop_t string;
+ njs_unicode_decode_t ctx;

ret = njs_string_object_validate(vm, njs_arg(args, nargs, 0));
if (njs_slow_path(ret != NJS_OK)) {
@@ -1493,10 +1495,12 @@ njs_string_prototype_char_code_at(njs_vm
code = string.start[index];

} else {
+ njs_utf8_decode_init(&ctx);
+
/* UTF-8 string. */
end = string.start + string.size;
start = njs_string_offset(string.start, end, index);
- code = njs_utf8_decode(&start, end);
+ code = njs_utf8_decode(&ctx, &start, end);
}

num = code;
@@ -1829,14 +1833,27 @@ njs_decode_base64_core(njs_vm_t *vm, njs


static njs_int_t
-njs_string_from_char_code(njs_vm_t *vm, njs_value_t *args,
- njs_uint_t nargs, njs_index_t unused)
+njs_string_from_char_code(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs,
+ njs_index_t is_point)
{
- u_char *p;
- size_t size;
- uint16_t code;
- njs_int_t ret;
- njs_uint_t i;
+ double num;
+ u_char *p, *start, *end;
+ ssize_t len;
+ int32_t code;
+ uint32_t cp;
+ uint64_t length, size;
+ njs_int_t ret;
+ njs_uint_t i;
+ njs_unicode_decode_t ctx;
+ u_char buf[4];
+
+ size = 0;
+ length = 0;
+
+ cp = 0x00;
+ end = buf + sizeof(buf);
+
+ njs_utf16_decode_init(&ctx);

for (i = 1; i < nargs; i++) {
if (!njs_is_numeric(&args[i])) {
@@ -1845,73 +1862,76 @@ njs_string_from_char_code(njs_vm_t *vm,
return ret;
}
}
+
+ if (is_point) {
+ num = njs_number(&args[i]);
+ if (isnan(num)) {
+ goto range_error;
+ }
+
+ code = num;
+
+ if (code != num || code < 0 || code > 0x10FFFF) {
+ goto range_error;
+ }
+
+ } else {
+ code = njs_number_to_uint16(njs_number(&args[i]));
+ }
+
+ start = buf;
+ len = njs_utf16_encode(code, &start, end);
+
+ start = buf;
+ cp = njs_utf16_decode(&ctx, (const u_char **) &start, start + len);
+
+ if (cp > NJS_UNICODE_MAX_CODEPOINT) {
+ if (cp == NJS_UNICODE_CONTINUE) {
+ continue;
+ }
+
+ cp = NJS_UNICODE_REPLACEMENT;
+ }
+
+ size += njs_utf8_size(cp);
+ length++;
}

- size = 0;
-
- for (i = 1; i < nargs; i++) {
- code = njs_number_to_uint16(njs_number(&args[i]));
- size += njs_utf8_size_uint16(code);
+ if (cp == NJS_UNICODE_CONTINUE) {
+ size += njs_utf8_size(NJS_UNICODE_REPLACEMENT);
+ length++;
}

- p = njs_string_alloc(vm, &vm->retval, size, nargs - 1);
+ p = njs_string_alloc(vm, &vm->retval, size, length);
if (njs_slow_path(p == NULL)) {
return NJS_ERROR;
}

- for (i = 1; i < nargs; i++) {
- code = njs_number_to_uint16(njs_number(&args[i]));
- p = njs_utf8_encode(p, code);
- }
-
- return NJS_OK;
-}
-
-
-static njs_int_t
-njs_string_from_code_point(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs,
- njs_index_t unused)
-{
- u_char *p;
- double num;
- size_t size;
- int32_t code;
- njs_int_t ret;
- njs_uint_t i;
+ njs_utf16_decode_init(&ctx);

for (i = 1; i < nargs; i++) {
- if (!njs_is_numeric(&args[i])) {
- ret = njs_value_to_numeric(vm, &args[i], &args[i]);
- if (ret != NJS_OK) {
- return ret;
- }
- }
- }
-
- size = 0;
-
- for (i = 1; i < nargs; i++) {
- num = njs_number(&args[i]);
- if (isnan(num)) {
- goto range_error;
+ if (is_point) {
+ code = njs_number(&args[i]);
+
+ } else {
+ code = njs_number_to_uint16(njs_number(&args[i]));
}

- code = num;
-
- if (code != num || code < 0 || code >= 0x110000) {
- goto range_error;
+ start = buf;
+ len = njs_utf16_encode(code, &start, end);
+
+ start = buf;
+ cp = njs_utf16_decode(&ctx, (const u_char **) &start, start + len);
+
+ if (cp > NJS_UNICODE_MAX_CODEPOINT) {
+ if (cp == NJS_UNICODE_CONTINUE && i + 1 != nargs) {
+ continue;
+ }
+
+ cp = NJS_UNICODE_REPLACEMENT;
}

- size += njs_utf8_size(code);
- }
-
- p = njs_string_alloc(vm, &vm->retval, size, nargs - 1);
- if (njs_slow_path(p == NULL)) {
- return NJS_ERROR;
- }
-
- for (i = 1; i < nargs; i++) {
- p = njs_utf8_encode(p, njs_number(&args[i]));
+ p = njs_utf8_encode(p, cp);
}

return NJS_OK;
@@ -2591,11 +2611,12 @@ static njs_int_t
njs_string_prototype_trim(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs,
njs_index_t mode)
{
- uint32_t u, trim, length;
- njs_int_t ret;
- njs_value_t *value;
- const u_char *p, *prev, *start, *end;
- njs_string_prop_t string;
+ uint32_t u, trim, length;
+ njs_int_t ret;
+ njs_value_t *value;
+ const u_char *p, *prev, *start, *end;
+ njs_string_prop_t string;
+ njs_unicode_decode_t ctx;

value = njs_argument(args, 0);
ret = njs_string_object_validate(vm, value);
@@ -2651,13 +2672,15 @@ njs_string_prototype_trim(njs_vm_t *vm,
/* UTF-8 string. */

if (mode & NJS_TRIM_START) {
+ njs_utf8_decode_init(&ctx);
+
for ( ;; ) {
if (start == end) {
goto empty;
}

p = start;
- u = njs_utf8_decode(&start, end);
+ u = njs_utf8_decode(&ctx, &start, end);

if (njs_utf8_is_whitespace(u)) {
trim++;
@@ -2672,6 +2695,8 @@ njs_string_prototype_trim(njs_vm_t *vm,
if (mode & NJS_TRIM_END) {
prev = end;

+ njs_utf8_decode_init(&ctx);
+
for ( ;; ) {
if (start == prev) {
goto empty;
@@ -2679,7 +2704,7 @@ njs_string_prototype_trim(njs_vm_t *vm,

prev = njs_utf8_prev(prev);
p = prev;
- u = njs_utf8_decode(&p, end);
+ u = njs_utf8_decode(&ctx, &p, end);

if (njs_utf8_is_whitespace(u)) {
trim++;
@@ -3640,11 +3665,12 @@ njs_string_prototype_replace(njs_vm_t *v
double
njs_string_to_number(const njs_value_t *value, njs_bool_t parse_float)
{
- double num;
- size_t size;
- uint32_t u;
- njs_bool_t minus;
- const u_char *p, *start, *end;
+ double num;
+ size_t size;
+ uint32_t u;
+ njs_bool_t minus;
+ const u_char *p, *start, *end;
+ njs_unicode_decode_t ctx;

const size_t infinity = njs_length("Infinity");

@@ -3660,9 +3686,11 @@ njs_string_to_number(const njs_value_t *

end = p + size;

+ njs_utf8_decode_init(&ctx);
+
while (p < end) {
start = p;
- u = njs_utf8_decode(&p, end);
+ u = njs_utf8_decode(&ctx, &p, end);

if (!njs_utf8_is_whitespace(u)) {
p = start;
@@ -4179,15 +4207,16 @@ njs_int_t
njs_string_encode_uri(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs,
njs_index_t component)
{
- u_char byte, *dst;
- uint64_t size;
- uint32_t cp, cp_low;
- njs_int_t ret;
- njs_value_t *value;
- const u_char *src, *end;
- const uint32_t *escape;
- njs_string_prop_t string;
- u_char encode[4];
+ u_char byte, *dst;
+ uint64_t size;
+ uint32_t cp, cp_low;
+ njs_int_t ret;
+ njs_value_t *value;
+ const u_char *src, *end;
+ const uint32_t *escape;
+ njs_string_prop_t string;
+ njs_unicode_decode_t ctx;
+ u_char encode[4];

static const uint32_t escape_uri[] = {
0xffffffff, /* 1111 1111 1111 1111 1111 1111 1111 1111 */
@@ -4257,8 +4286,10 @@ njs_string_encode_uri(njs_vm_t *vm, njs_
} else {
/* UTF-8 string. */

+ njs_utf8_decode_init(&ctx);
+
while (src < end) {
- cp = njs_utf8_decode(&src, end);
+ cp = njs_utf8_decode(&ctx, &src, end);

if (cp < 0x80 && !njs_need_escape(escape, cp)) {
size++;
@@ -4271,7 +4302,7 @@ njs_string_encode_uri(njs_vm_t *vm, njs_
}

if (njs_surrogate_leading(cp)) {
- cp_low = njs_utf8_decode(&src, end);
+ cp_low = njs_utf8_decode(&ctx, &src, end);

if (njs_slow_path(!njs_surrogate_trailing(cp_low))) {
goto uri_error;
@@ -4310,11 +4341,13 @@ njs_string_encode_uri(njs_vm_t *vm, njs_

/* UTF-8 string. */

+ njs_utf8_decode_init(&ctx);
+
while (src < end) {
- cp = njs_utf8_decode(&src, end);
+ cp = njs_utf8_decode(&ctx, &src, end);

if (njs_slow_path(njs_surrogate_leading(cp))) {
- cp_low = njs_utf8_decode(&src, end);
+ cp_low = njs_utf8_decode(&ctx, &src, end);
cp = njs_string_surrogate_pair(cp, cp_low);
}

@@ -4337,11 +4370,14 @@ njs_inline uint32_t
njs_string_decode_uri_cp(const int8_t *hex, const u_char **start,
const u_char *end, njs_bool_t expect_percent)
{
- int8_t d0, d1;
- uint32_t cp;
- const u_char *p;
-
- cp = njs_utf8_decode(start, end);
+ int8_t d0, d1;
+ uint32_t cp;
+ const u_char *p;
+ njs_unicode_decode_t ctx;
+
+ njs_utf8_decode_init(&ctx);
+
+ cp = njs_utf8_decode(&ctx, start, end);
if (njs_fast_path(cp != '%')) {
return expect_percent ? 0xFFFFFFFF: cp;
}
@@ -4378,18 +4414,19 @@ njs_int_t
njs_string_decode_uri(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs,
njs_index_t component)
{
- u_char *dst;
- int64_t size, length;
- uint32_t cp;
- njs_int_t ret;
- njs_chb_t chain;
- njs_uint_t i, n;
- njs_bool_t percent;
- njs_value_t *value;
- const u_char *src, *p, *end;
- const uint32_t *reserve;
- njs_string_prop_t string;
- u_char encode[4];
+ u_char *dst;
+ int64_t size, length;
+ uint32_t cp;
+ njs_int_t ret;
+ njs_chb_t chain;
+ njs_uint_t i, n;
+ njs_bool_t percent;
+ njs_value_t *value;
+ const u_char *src, *p, *end;
+ const uint32_t *reserve;
+ njs_string_prop_t string;
+ njs_unicode_decode_t ctx;
+ u_char encode[4];

static const uint32_t reserve_uri[] = {
0x00000000, /* 0000 0000 0000 0000 0000 0000 0000 0000 */
@@ -4472,6 +4509,8 @@ njs_string_decode_uri(njs_vm_t *vm, njs_

njs_chb_init(&chain, vm->mem_pool);

+ njs_utf8_decode_init(&ctx);
+
while (src < end) {
percent = (src[0] == '%');
cp = njs_string_decode_uri_cp(hex, &src, end, 0);
@@ -4529,8 +4568,8 @@ njs_string_decode_uri(njs_vm_t *vm, njs_
}

p = encode;
- cp = njs_utf8_decode(&p, p + n);
- if (njs_slow_path(cp == 0xFFFFFFFF)) {
+ cp = njs_utf8_decode(&ctx, &p, p + n);
+ if (njs_slow_path(cp > NJS_UNICODE_MAX_CODEPOINT)) {
goto uri_error;
}

diff -r 63106bd2e9bf -r 855edd76bdb6 src/njs_unicode.h
--- a/src/njs_unicode.h Wed Jul 15 19:19:18 2020 +0300
+++ b/src/njs_unicode.h Wed Jul 15 19:19:19 2020 +0300
@@ -9,6 +9,7 @@


enum {
+ NJS_UNICODE_REPLACEMENT = 0xFFFD,
NJS_UNICODE_MAX_CODEPOINT = 0x10FFFF,
NJS_UNICODE_ERROR = 0x1FFFFF,
NJS_UNICODE_CONTINUE = 0x2FFFFF
@@ -16,6 +17,9 @@ enum {

typedef struct {
uint32_t codepoint;
+
+ unsigned need;
+ u_char lower;
u_char upper;
} njs_unicode_decode_t;

diff -r 63106bd2e9bf -r 855edd76bdb6 src/njs_utf8.c
--- a/src/njs_utf8.c Wed Jul 15 19:19:18 2020 +0300
+++ b/src/njs_utf8.c Wed Jul 15 19:19:19 2020 +0300
@@ -56,211 +56,166 @@ njs_utf8_encode(u_char *p, uint32_t u)
}


-/*
- * njs_utf8_decode() decodes UTF-8 sequences and returns a valid
- * character 0x00 - 0x10FFFF, or 0xFFFFFFFF for invalid or overlong
- * UTF-8 sequence.
- */
+njs_inline njs_int_t
+njs_utf8_boundary(njs_unicode_decode_t *ctx, const u_char **data,
+ unsigned *need, u_char lower, u_char upper)
+{
+ u_char ch;

-uint32_t
-njs_utf8_decode(const u_char **start, const u_char *end)
-{
- uint32_t u;
+ ch = **data;

- u = (uint32_t) **start;
-
- if (u < 0x80) {
- (*start)++;
- return u;
+ if (ch < lower || ch > upper) {
+ return NJS_ERROR;
}

- return njs_utf8_decode2(start, end);
+ (*data)++;
+ (*need)--;
+ ctx->codepoint = (ctx->codepoint << 6) | (ch & 0x3F);
+
+ return NJS_OK;
}


-/*
- * njs_utf8_decode2() decodes two and more bytes UTF-8 sequences only
- * and returns a valid character 0x80 - 0x10FFFF, OR 0xFFFFFFFF for
- * invalid or overlong UTF-8 sequence.
- */
-
-uint32_t
-njs_utf8_decode2(const u_char **start, const u_char *end)
+njs_inline void
+njs_utf8_boundary_set(njs_unicode_decode_t *ctx, const u_char ch,
+ u_char first, u_char second, u_char lower, u_char upper)
{
- u_char c;
- size_t n;
- uint32_t u, overlong;
- const u_char *p;
-
- p = *start;
- u = (uint32_t) *p;
-
- if (u >= 0xE0) {
-
- if (u >= 0xF0) {
-
- if (njs_slow_path(u > 0xF4)) {
- /*
- * The maximum valid Unicode character is 0x10FFFF
- * which is encoded as 0xF4 0x8F 0xBF 0xBF.
- */
- return 0xFFFFFFFF;
- }
-
- u &= 0x07;
- overlong = 0x00FFFF;
- n = 3;
-
- } else {
- u &= 0x0F;
- overlong = 0x07FF;
- n = 2;
- }
+ if (ch == first) {
+ ctx->lower = lower;
+ ctx->upper = 0xBF;

- } else if (u >= 0xC2) {
-
- /* 0x80 is encoded as 0xC2 0x80. */
-
- u &= 0x1F;
- overlong = 0x007F;
- n = 1;
-
- } else {
- /* u <= 0xC2 */
- return 0xFFFFFFFF;
+ } else if (ch == second) {
+ ctx->lower = 0x80;
+ ctx->upper = upper;
}
-
- p++;
-
- if (njs_fast_path(p + n <= end)) {
-
- do {
- c = *p++;
- /*
- * The byte must in the 0x80 - 0xBF range.
- * Values below 0x80 become >= 0x80.
- */
- c = c - 0x80;
-
- if (njs_slow_path(c > 0x3F)) {
- return 0xFFFFFFFF;
- }
-
- u = (u << 6) | c;
- n--;
-
- } while (n != 0);
-
- if (overlong < u && u < 0x110000) {
- *start = p;
- return u;
- }
- }
-
- return 0xFFFFFFFF;
}


uint32_t
-njs_utf8_safe_decode(const u_char **start, const u_char *end)
-{
- uint32_t u;
-
- u = (uint32_t) **start;
-
- if (u < 0x80) {
- (*start)++;
- return u;
- }
-
- return njs_utf8_safe_decode2(start, end);
-}
-
-
-uint32_t
-njs_utf8_safe_decode2(const u_char **start, const u_char *end)
+njs_utf8_decode(njs_unicode_decode_t *ctx, const u_char **start,
+ const u_char *end)
{
u_char c;
- size_t n;
- uint32_t u, overlong;
+ unsigned need;
+ njs_int_t ret;
const u_char *p;

- p = *start;
- u = (uint32_t) *p;
-
- if (u >= 0xE0) {
-
- if (u >= 0xF0) {
+ if (ctx->need != 0) {
+ need = ctx->need;
+ ctx->need = 0;

- if (njs_slow_path(u > 0xF4)) {
- /*
- * The maximum valid Unicode character is 0x10FFFF
- * which is encoded as 0xF4 0x8F 0xBF 0xBF.
- */
- goto fail_one;
+ if (ctx->lower != 0x00) {
+ ret = njs_utf8_boundary(ctx, start, &need, ctx->lower, ctx->upper);
+ if (njs_slow_path(ret != NJS_OK)) {
+ goto failed;
}

- u &= 0x07;
- overlong = 0x00FFFF;
- n = 3;
+ ctx->lower = 0x00;
+ }
+
+ goto decode;
+ }
+
+ c = *(*start)++;
+
+ if (c < 0x80) {
+ return c;

- } else {
- u &= 0x0F;
- overlong = 0x07FF;
- n = 2;
+ } else if (c <= 0xDF) {
+ if (c < 0xC2) {
+ return NJS_UNICODE_ERROR;
+ }
+
+ need = 1;
+ ctx->codepoint = c & 0x1F;
+
+ } else if (c < 0xF0) {
+ need = 2;
+ ctx->codepoint = c & 0x0F;
+
+ if (*start == end) {
+ njs_utf8_boundary_set(ctx, c, 0xE0, 0xED, 0xA0, 0x9F);
+ goto next;
}

- } else if (u >= 0xC2) {
+ ret = NJS_OK;
+
+ if (c == 0xE0) {
+ ret = njs_utf8_boundary(ctx, start, &need, 0xA0, 0xBF);

- /* 0x80 is encoded as 0xC2 0x80. */
+ } else if (c == 0xED) {
+ ret = njs_utf8_boundary(ctx, start, &need, 0x80, 0x9F);
+ }
+
+ if (njs_slow_path(ret != NJS_OK)) {
+ goto failed;
+ }
+
+ } else if (c < 0xF5) {
+ need = 3;
+ ctx->codepoint = c & 0x07;

- u &= 0x1F;
- overlong = 0x007F;
- n = 1;
+ if (*start == end) {
+ njs_utf8_boundary_set(ctx, c, 0xF0, 0xF4, 0x90, 0x8F);
+ goto next;
+ }
+
+ ret = NJS_OK;
+
+ if (c == 0xF0) {
+ ret = njs_utf8_boundary(ctx, start, &need, 0x90, 0xBF);
+
+ } else if (c == 0xF4) {
+ ret = njs_utf8_boundary(ctx, start, &need, 0x80, 0x8F);
+ }
+
+ if (njs_slow_path(ret != NJS_OK)) {
+ goto failed;
+ }

} else {
- /* u <= 0xC2 */
- goto fail_one;
+ return NJS_UNICODE_ERROR;
}

- p++;
+decode:
+
+ for (p = *start; p < end; p++) {
+ c = *p;

- while (p < end && n != 0) {
- c = *p++;
- /*
- * The byte must in the 0x80 - 0xBF range.
- * Values below 0x80 become >= 0x80.
- */
- c = c - 0x80;
+ if (c < 0x80 || c > 0xBF) {
+ *start = p;

- if (njs_slow_path(c > 0x3F)) {
- *start = --p;
- return NJS_UTF8_REPLACEMENT;
+ goto failed;
}

- u = (u << 6) | c;
- n--;
+ ctx->codepoint = (ctx->codepoint << 6) | (c & 0x3F);
+
+ if (--need == 0) {
+ *start = p + 1;
+
+ return ctx->codepoint;
+ }
}

*start = p;

- if (n == 0 && overlong < u && u < 0x110000) {
- return u;
- }
+next:

- return NJS_UTF8_REPLACEMENT;
+ ctx->need = need;
_______________________________________________
nginx-devel mailing list
nginx-devel@nginx.org
http://mailman.nginx.org/mailman/listinfo/nginx-devel
Subject Author Views Posted

[njs] Introduced UTF-8 decoder according to WHATWG encoding spec.

Alexander Borisov 95 July 15, 2020 12:22PM



Sorry, you do not have permission to post/reply in this forum.

Online Users

Guests: 91
Record Number of Users: 6 on February 13, 2018
Record Number of Guests: 421 on December 02, 2018
Powered by nginx      Powered by FreeBSD      PHP Powered      Powered by MariaDB      ipv6 ready