Welcome! Log In | Create A New Profile

Advanced

[njs] Improved UTF-8 encoding/decoding.

Alexander Borisov
August 26, 2020 02:28PM
details: https://hg.nginx.org/njs/rev/b98eb205a37b
branches:
changeset: 1505:b98eb205a37b
user: Alexander Borisov <alexander.borisov@nginx.com>
date: Wed Aug 26 21:05:46 2020 +0300
description:
Improved UTF-8 encoding/decoding.

diffstat:

src/njs_encoding.c | 135 ++++++----------------------------------------------
src/njs_parser.c | 18 +-----
src/njs_utf8.c | 101 ++++++++++++++++++++++++---------------
src/njs_utf8.h | 37 ++++++++++++-
4 files changed, 117 insertions(+), 174 deletions(-)

diffs (443 lines):

diff -r 657d446001da -r b98eb205a37b src/njs_encoding.c
--- a/src/njs_encoding.c Wed Aug 26 14:56:47 2020 +0000
+++ b/src/njs_encoding.c Wed Aug 26 21:05:46 2020 +0300
@@ -18,7 +18,6 @@ typedef struct {
njs_bool_t fatal;
njs_bool_t ignore_bom;

- uint32_t codepoint;
njs_unicode_decode_t ctx;
} njs_encoding_decode_t;

@@ -87,11 +86,10 @@ njs_text_encoder_encode(njs_vm_t *vm, nj
njs_index_t unused)
{
u_char *dst;
- int64_t size;
- uint32_t cp;
+ size_t size;
njs_int_t ret;
njs_value_t *this, *input, value;
- const u_char *p, *start, *end;
+ const u_char *start, *end;
njs_string_prop_t prop;
njs_typed_array_t *array;
njs_unicode_decode_t ctx;
@@ -126,30 +124,9 @@ njs_text_encoder_encode(njs_vm_t *vm, nj
end = start + prop.size;
}

- p = start;
-
- cp = 0;
- size = 0;
-
njs_utf8_decode_init(&ctx);

- while (p < end) {
- cp = njs_utf8_decode(&ctx, &p, end);
-
- if (cp > NJS_UNICODE_MAX_CODEPOINT) {
- if (cp == NJS_UNICODE_CONTINUE) {
- continue;
- }
-
- cp = NJS_UNICODE_REPLACEMENT;
- }
-
- size += njs_utf8_size(cp);
- }
-
- if (cp == NJS_UNICODE_CONTINUE) {
- size += njs_utf8_size(NJS_UNICODE_REPLACEMENT);
- }
+ (void) njs_utf8_stream_length(&ctx, start, end - start, 1, 0, &size);

njs_set_number(&value, size);

@@ -161,23 +138,7 @@ njs_text_encoder_encode(njs_vm_t *vm, nj
dst = njs_typed_array_buffer(array)->u.u8;
njs_utf8_decode_init(&ctx);

- while (start < end) {
- cp = njs_utf8_decode(&ctx, &start, end);
-
- if (cp > NJS_UNICODE_MAX_CODEPOINT) {
- if (cp == NJS_UNICODE_CONTINUE) {
- continue;
- }
-
- cp = NJS_UNICODE_REPLACEMENT;
- }
-
- dst = njs_utf8_encode(dst, cp);
- }
-
- if (cp == NJS_UNICODE_CONTINUE) {
- (void) njs_utf8_encode(dst, NJS_UNICODE_REPLACEMENT);
- }
+ (void) njs_utf8_stream_encode(&ctx, start, end, dst, 1, 0);

njs_set_typed_array(&vm->retval, array);

@@ -410,7 +371,6 @@ njs_text_decoder_constructor(njs_vm_t *v
return ret;
}

- data->codepoint = 0;
njs_utf8_decode_init(&data->ctx);

njs_set_data(&ov->value, data, NJS_DATA_TAG_TEXT_DECODER);
@@ -573,12 +533,12 @@ njs_text_decoder_decode(njs_vm_t *vm, nj
njs_index_t unused)
{
u_char *dst;
- uint32_t length, cp;
- uint64_t size;
+ size_t size;
+ ssize_t length;
njs_int_t ret;
njs_bool_t stream;
njs_value_t retval, *this, *typed_array, *options;
- const u_char *start, *end, *p;
+ const u_char *start, *end;
njs_unicode_decode_t ctx;
njs_encoding_decode_t *data;
const njs_typed_array_t *array;
@@ -632,52 +592,18 @@ njs_text_decoder_decode(njs_vm_t *vm, nj
data = njs_object_data(this);

ctx = data->ctx;
- cp = data->codepoint;
-
- size = 0;
- length = 0;
-
- p = start;

/* Looking for BOM. */

- if (!data->ignore_bom && p + 3 <= end) {
- cp = njs_utf8_decode(&ctx, &p, end);
-
- if (cp == NJS_UNICODE_BOM) {
- start = p;
-
- } else {
- p = start;
- }
+ if (!data->ignore_bom) {
+ start += njs_utf8_bom(start, end);
}

- while (p < end) {
- cp = njs_utf8_decode(&ctx, &p, end);
-
- if (njs_slow_path(cp > NJS_UNICODE_MAX_CODEPOINT)) {
- if (cp == NJS_UNICODE_CONTINUE) {
- break;
- }
-
- if (data->fatal) {
- goto fatal;
- }
-
- cp = NJS_UNICODE_REPLACEMENT;
- }
-
- size += njs_utf8_size(cp);
- length++;
- }
-
- if (cp == NJS_UNICODE_CONTINUE && !stream) {
- if (data->fatal) {
- goto fatal;
- }
-
- size += njs_utf8_size(NJS_UNICODE_REPLACEMENT);
- length++;
+ length = njs_utf8_stream_length(&ctx, start, end - start, !stream,
+ data->fatal, &size);
+ if (length == -1) {
+ njs_type_error(vm, "The encoded data was not valid");
+ return NJS_ERROR;
}

dst = njs_string_alloc(vm, &vm->retval, size, length);
@@ -685,40 +611,13 @@ njs_text_decoder_decode(njs_vm_t *vm, nj
return NJS_ERROR;
}

- while (start < end) {
- cp = njs_utf8_decode(&data->ctx, &start, end);
+ (void) njs_utf8_stream_encode(&data->ctx, start, end, dst, !stream, 0);

- if (cp > NJS_UNICODE_MAX_CODEPOINT) {
- if (cp == NJS_UNICODE_CONTINUE) {
- break;
- }
-
- cp = NJS_UNICODE_REPLACEMENT;
- }
-
- dst = njs_utf8_encode(dst, cp);
+ if (!stream) {
+ njs_utf8_decode_init(&data->ctx);
}

- if (stream) {
- data->codepoint = cp;
- return NJS_OK;
- }
-
- if (cp == NJS_UNICODE_CONTINUE) {
- (void) njs_utf8_encode(dst, NJS_UNICODE_REPLACEMENT);
- }
-
- data->codepoint = 0;
-
- njs_utf8_decode_init(&data->ctx);
-
return NJS_OK;
-
-fatal:
-
- njs_type_error(vm, "The encoded data was not valid");
-
- return NJS_ERROR;
}


diff -r 657d446001da -r b98eb205a37b src/njs_parser.c
--- a/src/njs_parser.c Wed Aug 26 14:56:47 2020 +0000
+++ b/src/njs_parser.c Wed Aug 26 21:05:46 2020 +0300
@@ -7897,15 +7897,16 @@ njs_parser_string_create(njs_vm_t *vm, n
njs_value_t *value)
{
u_char *dst;
- ssize_t size, length;
- uint32_t cp;
+ size_t size, length;
njs_str_t *src;
const u_char *p, *end;
njs_unicode_decode_t ctx;

src = &token->text;

- length = njs_utf8_safe_length(src->start, src->length, &size);
+ njs_utf8_decode_init(&ctx);
+
+ length = njs_utf8_stream_length(&ctx, src->start, src->length, 1, 0, &size);

dst = njs_string_alloc(vm, value, size, length);
if (njs_slow_path(dst == NULL)) {
@@ -7917,16 +7918,7 @@ njs_parser_string_create(njs_vm_t *vm, n

njs_utf8_decode_init(&ctx);

- while (p < end) {
- cp = njs_utf8_decode(&ctx, &p, end);
-
- if (cp <= NJS_UNICODE_MAX_CODEPOINT) {
- dst = njs_utf8_encode(dst, cp);
-
- } else {
- dst = njs_utf8_encode(dst, NJS_UNICODE_REPLACEMENT);
- }
- }
+ (void) njs_utf8_stream_encode(&ctx, p, end, dst, 1, 0);

if (length > NJS_STRING_MAP_STRIDE && size != length) {
njs_string_offset_map_init(value->long_string.data->start, size);
diff -r 657d446001da -r b98eb205a37b src/njs_utf8.c
--- a/src/njs_utf8.c Wed Aug 26 14:56:47 2020 +0000
+++ b/src/njs_utf8.c Wed Aug 26 21:05:46 2020 +0300
@@ -213,6 +213,43 @@ failed:
return NJS_UNICODE_ERROR;
}

+
+u_char *
+njs_utf8_stream_encode(njs_unicode_decode_t *ctx, const u_char *start,
+ const u_char *end, u_char *dst, njs_bool_t last, njs_bool_t fatal)
+{
+ uint32_t cp;
+
+ while (start < end) {
+ cp = njs_utf8_decode(ctx, &start, end);
+
+ if (cp > NJS_UNICODE_MAX_CODEPOINT) {
+ if (cp == NJS_UNICODE_CONTINUE) {
+ break;
+ }
+
+ if (fatal) {
+ return NULL;
+ }
+
+ cp = NJS_UNICODE_REPLACEMENT;
+ }
+
+ dst = njs_utf8_encode(dst, cp);
+ }
+
+ if (last && ctx->need != 0x00) {
+ if (fatal) {
+ return NULL;
+ }
+
+ dst = njs_utf8_encode(dst, NJS_UNICODE_REPLACEMENT);
+ }
+
+ return dst;
+}
+
+
/*
* njs_utf8_casecmp() tests only up to the minimum of given lengths, but
* requires lengths of both strings because otherwise njs_utf8_decode()
@@ -314,57 +351,43 @@ njs_utf8_upper_case(const u_char **start


ssize_t
-njs_utf8_length(const u_char *p, size_t len)
+njs_utf8_stream_length(njs_unicode_decode_t *ctx, const u_char *p, size_t len,
+ njs_bool_t last, njs_bool_t fatal, size_t *out_size)
{
- ssize_t length;
- const u_char *end;
- njs_unicode_decode_t ctx;
-
- length = 0;
-
- end = p + len;
-
- njs_utf8_decode_init(&ctx);
-
- while (p < end) {
- if (njs_slow_path(njs_utf8_decode(&ctx, &p, end)
- > NJS_UNICODE_MAX_CODEPOINT))
- {
- return -1;
- }
-
- length++;
- }
-
- return length;
-}
-
-
-ssize_t
-njs_utf8_safe_length(const u_char *p, size_t len, ssize_t *out_size)
-{
- ssize_t size, length;
- uint32_t codepoint;
- const u_char *end;
- njs_unicode_decode_t ctx;
+ size_t size, length;
+ uint32_t codepoint;
+ const u_char *end;

size = 0;
length = 0;

end = p + len;

- njs_utf8_decode_init(&ctx);
-
while (p < end) {
- codepoint = njs_utf8_decode(&ctx, &p, end);
+ codepoint = njs_utf8_decode(ctx, &p, end);

- if (codepoint <= NJS_UNICODE_MAX_CODEPOINT) {
- size += njs_utf8_size(codepoint);
+ if (codepoint > NJS_UNICODE_MAX_CODEPOINT) {
+ if (codepoint == NJS_UNICODE_CONTINUE) {
+ break;
+ }

- } else {
- size += njs_utf8_size(NJS_UNICODE_REPLACEMENT);
+ if (fatal) {
+ return -1;
+ }
+
+ codepoint = NJS_UNICODE_REPLACEMENT;
}

+ size += njs_utf8_size(codepoint);
+ length++;
+ }
+
+ if (last && ctx->need != 0x00) {
+ if (fatal) {
+ return -1;
+ }
+
+ size += njs_utf8_size(NJS_UNICODE_REPLACEMENT);
length++;
}

diff -r 657d446001da -r b98eb205a37b src/njs_utf8.h
--- a/src/njs_utf8.h Wed Aug 26 14:56:47 2020 +0000
+++ b/src/njs_utf8.h Wed Aug 26 21:05:46 2020 +0300
@@ -8,18 +8,21 @@
#define _NJS_UTF8_H_INCLUDED_


-NJS_EXPORT u_char *njs_utf8_encode(u_char *p, uint32_t u);
NJS_EXPORT uint32_t njs_utf8_decode(njs_unicode_decode_t *ctx,
const u_char **data, const u_char *end);
+NJS_EXPORT u_char *njs_utf8_encode(u_char *p, uint32_t u);
+NJS_EXPORT u_char *njs_utf8_stream_encode(njs_unicode_decode_t *ctx,
+ const u_char *start, const u_char *end, u_char *dst, njs_bool_t last,
+ njs_bool_t fatal);
NJS_EXPORT njs_int_t njs_utf8_casecmp(const u_char *start1,
const u_char *start2, size_t len1, size_t len2);
NJS_EXPORT uint32_t njs_utf8_lower_case(const u_char **start,
const u_char *end);
NJS_EXPORT uint32_t njs_utf8_upper_case(const u_char **start,
const u_char *end);
-NJS_EXPORT ssize_t njs_utf8_length(const u_char *p, size_t len);
-NJS_EXPORT ssize_t njs_utf8_safe_length(const u_char *p, size_t len,
- ssize_t *out_size);
+NJS_EXPORT ssize_t njs_utf8_stream_length(njs_unicode_decode_t *ctx,
+ const u_char *p, size_t len, njs_bool_t last, njs_bool_t fatal,
+ size_t *out_size);
NJS_EXPORT njs_bool_t njs_utf8_is_valid(const u_char *p, size_t len);


@@ -119,6 +122,32 @@ njs_utf8_decode_init(njs_unicode_decode_
}


+njs_inline ssize_t
+njs_utf8_length(const u_char *p, size_t len)
+{
+ njs_unicode_decode_t ctx;
+
+ njs_utf8_decode_init(&ctx);
+
+ return njs_utf8_stream_length(&ctx, p, len, 1, 1, NULL);
+}
+
+
+njs_inline size_t
+njs_utf8_bom(const u_char *start, const u_char *end)
+{
+ if (start + 3 > end) {
+ return 0;
+ }
+
+ if (start[0] == 0xEF && start[1] == 0xBB && start[2] == 0xBF) {
+ return 3;
+ }
+
+ return 0;
+}
+
+
njs_inline size_t
njs_utf8_size(uint32_t cp)
{
_______________________________________________
nginx-devel mailing list
nginx-devel@nginx.org
http://mailman.nginx.org/mailman/listinfo/nginx-devel
Subject Author Views Posted

[njs] Improved UTF-8 encoding/decoding.

Alexander Borisov 263 August 26, 2020 02:28PM



Sorry, you do not have permission to post/reply in this forum.

Online Users

Guests: 211
Record Number of Users: 8 on April 13, 2023
Record Number of Guests: 421 on December 02, 2018
Powered by nginx | Powered by FreeBSD | PHP Powered | Powered by MariaDB | IPv6 ready