Welcome! Log In Create A New Profile

Advanced

[njs] Improved processing of invalid surrogate pairs in strings.

Alexander Borisov
May 29, 2019 10:56AM
details: https://hg.nginx.org/njs/rev/96dc9de9f92c
branches:
changeset: 991:96dc9de9f92c
user: Alexander Borisov <alexander.borisov@nginx.com>
date: Tue May 28 20:49:58 2019 +0300
description:
Improved processing of invalid surrogate pairs in strings.

Previously, an exception was thrown on invalid surrogate pairs.
Now, all such pairs are converted to the replacement character (U+FFFD).

This closes issue #170 on GitHub.

diffstat:

njs/njs_parser_terminal.c | 51 +++++++++++++++++++++++++++++++---------------
njs/test/njs_unit_test.c | 20 +++++++++++++----
nxt/nxt_utf8.h | 2 +
3 files changed, 51 insertions(+), 22 deletions(-)

diffs (129 lines):

diff -r 8e7e7ba29c71 -r 96dc9de9f92c njs/njs_parser_terminal.c
--- a/njs/njs_parser_terminal.c Thu May 23 18:03:46 2019 +0300
+++ b/njs/njs_parser_terminal.c Tue May 28 20:49:58 2019 +0300
@@ -1049,12 +1049,27 @@ njs_parser_escape_string_create(njs_vm_t
}

if (cp_pair != 0) {
- cp = njs_string_surrogate_pair(cp_pair, cp);
+ if (nxt_fast_path(cp >= 0xdc00 && cp <= 0xdfff)) {
+ cp = njs_string_surrogate_pair(cp_pair, cp);
+
+ } else if (nxt_slow_path(cp >= 0xd800 && cp <= 0xdbff)) {
+ cp = NXT_UTF8_REPLACEMENT;
+
+ dst = nxt_utf8_encode(dst, (uint32_t) cp);
+
+ } else {
+ dst = nxt_utf8_encode(dst, NXT_UTF8_REPLACEMENT);
+ }
+
cp_pair = 0;

} else if (cp >= 0xd800 && cp <= 0xdfff) {
- cp_pair = cp;
- continue;
+ if (cp <= 0xdbff && src[0] == '\\' && src[1] == 'u') {
+ cp_pair = cp;
+ continue;
+ }
+
+ cp = NXT_UTF8_REPLACEMENT;
}

dst = nxt_utf8_encode(dst, (uint32_t) cp);
@@ -1183,20 +1198,29 @@ njs_parser_escape_string_calc_length(njs
}

if (cp_pair != 0) {
- if (nxt_slow_path(cp < 0xdc00 || cp > 0xdfff)) {
- goto invalid_pair;
+ if (nxt_fast_path(cp >= 0xdc00 && cp <= 0xdfff)) {
+ cp = njs_string_surrogate_pair(cp_pair, cp);
+
+ } else if (nxt_slow_path(cp >= 0xd800 && cp <= 0xdbff)) {
+ cp = NXT_UTF8_REPLACEMENT;
+
+ size += nxt_utf8_size(cp);
+ length++;
+
+ } else {
+ size += nxt_utf8_size(NXT_UTF8_REPLACEMENT);
+ length++;
}

- cp = njs_string_surrogate_pair(cp_pair, cp);
cp_pair = 0;

} else if (cp >= 0xd800 && cp <= 0xdfff) {
- if (nxt_slow_path(cp > 0xdbff || src[0] != '\\' || src[1] != 'u')) {
- goto invalid_pair;
+ if (cp <= 0xdbff && src[0] == '\\' && src[1] == 'u') {
+ cp_pair = cp;
+ continue;
}

- cp_pair = cp;
- continue;
+ cp = NXT_UTF8_REPLACEMENT;
}

size += nxt_utf8_size(cp);
@@ -1214,11 +1238,4 @@ invalid:
njs_parser_text(parser));

return NJS_ERROR;
-
-invalid_pair:
-
- njs_parser_syntax_error(vm, parser, "Invalid surrogate pair \"%V\"",
- njs_parser_text(parser));
-
- return NJS_ERROR;
}
diff -r 8e7e7ba29c71 -r 96dc9de9f92c njs/test/njs_unit_test.c
--- a/njs/test/njs_unit_test.c Thu May 23 18:03:46 2019 +0300
+++ b/njs/test/njs_unit_test.c Tue May 28 20:49:58 2019 +0300
@@ -4448,15 +4448,25 @@ static njs_unit_test_t njs_test[] =
nxt_string("1") },

{ nxt_string("'\\ud83d abc \\udc4d'"),
- nxt_string("SyntaxError: Invalid surrogate pair "
- "\"\\ud83d abc \\udc4d\" in 1") },
+ nxt_string("� abc �") },

{ nxt_string("'\\ud83d'"),
- nxt_string("SyntaxError: Invalid surrogate pair \"\\ud83d\" in 1") },
+ nxt_string("�") },

{ nxt_string("'\\ud83d\\uabcd'"),
- nxt_string("SyntaxError: Invalid surrogate pair "
- "\"\\ud83d\\uabcd\" in 1") },
+ nxt_string("�ꯍ") },
+
+ { nxt_string("'\\u{d800}\\u{dB00}'"),
+ nxt_string("��") },
+
+ { nxt_string("'\\u{d800}\\u{d7ff}'"),
+ nxt_string("�퟿") },
+
+ { nxt_string("'\\u{d800}['"),
+ nxt_string("�[") },
+
+ { nxt_string("'\\u{D800}\\u{'"),
+ nxt_string("SyntaxError: Invalid Unicode code point \"\\u{D800}\\u{\" in 1") },

{ nxt_string("''.hasOwnProperty('length')"),
nxt_string("true") },
diff -r 8e7e7ba29c71 -r 96dc9de9f92c nxt/nxt_utf8.h
--- a/nxt/nxt_utf8.h Thu May 23 18:03:46 2019 +0300
+++ b/nxt/nxt_utf8.h Tue May 28 20:49:58 2019 +0300
@@ -15,6 +15,8 @@
*/
#define NXT_UTF8_SORT_INVALID 0x0EEE0EEE

+#define NXT_UTF8_REPLACEMENT 0xFFFD
+

NXT_EXPORT u_char *nxt_utf8_encode(u_char *p, uint32_t u);
NXT_EXPORT uint32_t nxt_utf8_decode(const u_char **start, const u_char *end);
_______________________________________________
nginx-devel mailing list
nginx-devel@nginx.org
http://mailman.nginx.org/mailman/listinfo/nginx-devel
Subject Author Views Posted

[njs] Improved processing of invalid surrogate pairs in strings.

Alexander Borisov 337 May 29, 2019 10:56AM



Sorry, you do not have permission to post/reply in this forum.

Online Users

Guests: 233
Record Number of Users: 8 on April 13, 2023
Record Number of Guests: 421 on December 02, 2018
Powered by nginx      Powered by FreeBSD      PHP Powered      Powered by MariaDB      ipv6 ready