Welcome! Log In Create A New Profile

Advanced

[njs] Fixed RegExpBuiltinExec() with UTF-8 only regexps.

Dmitry Volyntsev
June 25, 2021 01:34PM
details: https://hg.nginx.org/njs/rev/f10d5c38f098
branches:
changeset: 1667:f10d5c38f098
user: Dmitry Volyntsev <xeioex@nginx.com>
date: Fri Jun 25 17:00:12 2021 +0000
description:
Fixed RegExpBuiltinExec() with UTF-8 only regexps.

The original issue was introduced in f9082cd59ba6 (0.4.2) when
RegExpBuiltinExec() was added, but after de64420d0f2b (0.6.0) it also started
to affect RegExp.prototype.test(), because test() was rewritten according to
the spec.

diffstat:

src/njs_regexp.c | 24 ++++++++++++++----------
src/test/njs_unit_test.c | 13 +++++++++++++
2 files changed, 27 insertions(+), 10 deletions(-)

diffs (123 lines):

diff -r 7717b6523cd4 -r f10d5c38f098 src/njs_regexp.c
--- a/src/njs_regexp.c Fri Jun 18 15:01:48 2021 +0000
+++ b/src/njs_regexp.c Fri Jun 25 17:00:12 2021 +0000
@@ -26,8 +26,7 @@ static u_char *njs_regexp_compile_trace_
static u_char *njs_regexp_match_trace_handler(njs_trace_t *trace,
njs_trace_data_t *td, u_char *start);
static njs_array_t *njs_regexp_exec_result(njs_vm_t *vm, njs_value_t *r,
- njs_regexp_utf8_t type, njs_string_prop_t *string,
- njs_regex_match_data_t *data);
+ njs_utf8_t utf8, njs_string_prop_t *string, njs_regex_match_data_t *data);
static njs_int_t njs_regexp_string_create(njs_vm_t *vm, njs_value_t *value,
u_char *start, uint32_t size, int32_t length);

@@ -946,6 +945,7 @@ njs_regexp_builtin_exec(njs_vm_t *vm, nj
size_t length, offset;
int64_t last_index;
njs_int_t ret;
+ njs_utf8_t utf8;
njs_value_t value;
njs_array_t *result;
njs_regexp_t *regexp;
@@ -979,11 +979,15 @@ njs_regexp_builtin_exec(njs_vm_t *vm, nj
goto not_found;
}

+ utf8 = NJS_STRING_BYTE;
type = NJS_REGEXP_BYTE;

- if (length != string.size) {
- /* UTF-8 string. */
+ if (string.length != 0) {
type = NJS_REGEXP_UTF8;
+
+ if (string.length != string.size) {
+ utf8 = NJS_STRING_UTF8;
+ }
}

pattern = regexp->pattern;
@@ -998,7 +1002,7 @@ njs_regexp_builtin_exec(njs_vm_t *vm, nj
return NJS_ERROR;
}

- if (type != NJS_REGEXP_UTF8) {
+ if (utf8 != NJS_STRING_UTF8) {
offset = last_index;

} else {
@@ -1010,7 +1014,7 @@ njs_regexp_builtin_exec(njs_vm_t *vm, nj
ret = njs_regexp_match(vm, &pattern->regex[type], string.start, offset,
string.size, match_data);
if (ret >= 0) {
- result = njs_regexp_exec_result(vm, r, type, &string, match_data);
+ result = njs_regexp_exec_result(vm, r, utf8, &string, match_data);
if (njs_slow_path(result == NULL)) {
return NJS_ERROR;
}
@@ -1043,7 +1047,7 @@ not_found:


static njs_array_t *
-njs_regexp_exec_result(njs_vm_t *vm, njs_value_t *r, njs_regexp_utf8_t type,
+njs_regexp_exec_result(njs_vm_t *vm, njs_value_t *r, njs_utf8_t utf8,
njs_string_prop_t *string, njs_regex_match_data_t *match_data)
{
int *captures;
@@ -1081,7 +1085,7 @@ njs_regexp_exec_result(njs_vm_t *vm, njs
start = &string->start[captures[n]];
size = captures[n + 1] - captures[n];

- if (type == NJS_REGEXP_UTF8) {
+ if (utf8 == NJS_STRING_UTF8) {
length = njs_max(njs_utf8_length(start, size), 0);

} else {
@@ -1105,7 +1109,7 @@ njs_regexp_exec_result(njs_vm_t *vm, njs
goto fail;
}

- if (type == NJS_REGEXP_UTF8) {
+ if (utf8 == NJS_STRING_UTF8) {
index = njs_string_index(string, captures[0]);

} else {
@@ -1115,7 +1119,7 @@ njs_regexp_exec_result(njs_vm_t *vm, njs
njs_set_number(&prop->value, index);

if (pattern->global || pattern->sticky) {
- if (type == NJS_REGEXP_UTF8) {
+ if (utf8 == NJS_STRING_UTF8) {
index = njs_string_index(string, captures[1]);

} else {
diff -r 7717b6523cd4 -r f10d5c38f098 src/test/njs_unit_test.c
--- a/src/test/njs_unit_test.c Fri Jun 18 15:01:48 2021 +0000
+++ b/src/test/njs_unit_test.c Fri Jun 25 17:00:12 2021 +0000
@@ -10763,6 +10763,12 @@ static njs_unit_test_t njs_test[] =
{ njs_str("/α/.test('\\u00CE\\u00B1'.toBytes())"),
njs_str("true") },

+ { njs_str("/[A-Za-z]/.test('S')"),
+ njs_str("true") },
+
+ { njs_str("/[A-Za-z]/.test('ø')"),
+ njs_str("false") },
+
{ njs_str("var r = /abc/y; r.test('abc'); r.lastIndex"),
njs_str("3") },

@@ -21004,6 +21010,13 @@ static njs_unit_test_t njs_regexp_test[

{ njs_str("RegExp('[\0]').test('\0')"),
njs_str("true") },
+
+ { njs_str("/[A-Za-z\\u00F8-\\u02FF]/.test('S')"),
+ njs_str("true") },
+
+ { njs_str("/[A-Za-z\\u00F8-\\u02FF]/.test('ø')"),
+ njs_str("true") },
+
};


_______________________________________________
nginx-devel mailing list
nginx-devel@nginx.org
http://mailman.nginx.org/mailman/listinfo/nginx-devel
Subject Author Views Posted

[njs] Fixed RegExpBuiltinExec() with UTF-8 only regexps.

Dmitry Volyntsev 106 June 25, 2021 01:34PM



Sorry, you do not have permission to post/reply in this forum.

Online Users

Guests: 59
Record Number of Users: 6 on February 13, 2018
Record Number of Guests: 421 on December 02, 2018
Powered by nginx      Powered by FreeBSD      PHP Powered      Powered by MariaDB      ipv6 ready