Welcome! Log In Create A New Profile

Advanced

[njs] Fixed String.prototype.split() according to the specification.

Dmitry Volyntsev
June 09, 2021 01:56PM
details: https://hg.nginx.org/njs/rev/f2d02c3b5f8a
branches:
changeset: 1656:f2d02c3b5f8a
user: Dmitry Volyntsev <xeioex@nginx.com>
date: Wed Jun 09 17:14:10 2021 +0000
description:
Fixed String.prototype.split() according to the specification.

This closes issue #359 on GitHub.

diffstat:

src/njs_regexp.c | 252 ++++++++++++++++++++++++++++++++++++++++++
src/njs_string.c | 278 ++++++++++++++++++++--------------------------
src/njs_string.h | 2 +
src/test/njs_benchmark.c | 6 +-
src/test/njs_unit_test.c | 38 ++++++-
5 files changed, 418 insertions(+), 158 deletions(-)

diffs (697 lines):

diff -r 5516f717a8c9 -r f2d02c3b5f8a src/njs_regexp.c
--- a/src/njs_regexp.c Tue Jun 08 18:01:25 2021 +0000
+++ b/src/njs_regexp.c Wed Jun 09 17:14:10 2021 +0000
@@ -1612,6 +1612,250 @@ exception:
}


+static njs_int_t
+njs_regexp_prototype_symbol_split(njs_vm_t *vm, njs_value_t *args,
+ njs_uint_t nargs, njs_index_t unused)
+{
+ u_char *dst;
+ int64_t e, i, p, q, ncaptures, length;
+ uint32_t limit;
+ njs_int_t ret;
+ njs_bool_t sticky;
+ njs_utf8_t utf8;
+ njs_array_t *array;
+ njs_value_t *rx, *string, *value;
+ njs_value_t r, z, this, s_lvalue, retval, setval, constructor;
+ njs_object_t *object;
+ const u_char *start, *end;
+ njs_string_prop_t s;
+ njs_value_t arguments[2];
+
+ static const njs_value_t string_lindex = njs_string("lastIndex");
+ static const njs_value_t string_flags = njs_string("flags");
+
+ rx = njs_argument(args, 0);
+
+ if (njs_slow_path(!njs_is_object(rx))) {
+ njs_type_error(vm, "\"this\" is not object");
+ return NJS_ERROR;
+ }
+
+ string = njs_lvalue_arg(&s_lvalue, args, nargs, 1);
+
+ ret = njs_value_to_string(vm, string, string);
+ if (njs_slow_path(ret != NJS_OK)) {
+ return ret;
+ }
+
+ njs_set_function(&constructor, &vm->constructors[NJS_OBJ_TYPE_REGEXP]);
+
+ ret = njs_value_species_constructor(vm, rx, &constructor, &constructor);
+ if (njs_slow_path(ret != NJS_OK)) {
+ return ret;
+ }
+
+ ret = njs_value_property(vm, rx, njs_value_arg(&string_flags), &retval);
+ if (njs_slow_path(ret == NJS_ERROR)) {
+ return NJS_ERROR;
+ }
+
+ ret = njs_value_to_string(vm, &retval, &retval);
+ if (njs_slow_path(ret != NJS_OK)) {
+ return ret;
+ }
+
+ (void) njs_string_prop(&s, &retval);
+
+ sticky = memchr(s.start, 'y', s.size) != NULL;
+
+ object = njs_function_new_object(vm, &constructor);
+ if (njs_slow_path(object == NULL)) {
+ return NJS_ERROR;
+ }
+
+ njs_set_object(&this, object);
+
+ arguments[0] = *rx;
+
+ if (!sticky) {
+ length = njs_is_byte_string(&s) ? 0 : s.length + 1;
+
+ dst = njs_string_alloc(vm, &arguments[1], s.size + 1, length);
+ if (njs_slow_path(dst == NULL)) {
+ return NJS_ERROR;
+ }
+
+ dst = njs_cpymem(dst, s.start, s.size);
+ *dst++ = 'y';
+
+ } else {
+ arguments[1] = retval;
+ }
+
+ ret = njs_function_call2(vm, njs_function(&constructor), &this,
+ njs_value_arg(&arguments), 2, &r, 1);
+ if (njs_slow_path(ret != NJS_OK)) {
+ return NJS_ERROR;
+ }
+
+ rx = &r;
+
+ array = njs_array_alloc(vm, 0, 0, NJS_ARRAY_SPARE);
+ if (njs_slow_path(array == NULL)) {
+ return NJS_ERROR;
+ }
+
+ value = njs_arg(args, nargs, 2);
+ limit = UINT32_MAX;
+
+ if (njs_is_defined(value)) {
+ ret = njs_value_to_uint32(vm, value, &limit);
+ if (njs_slow_path(ret != NJS_OK)) {
+ return ret;
+ }
+ }
+
+ if (njs_slow_path(limit == 0)) {
+ goto done;
+ }
+
+ length = njs_string_prop(&s, string);
+
+ if (njs_slow_path(s.size == 0)) {
+ ret = njs_regexp_exec(vm, rx, string, &z);
+ if (njs_slow_path(ret != NJS_OK)) {
+ return NJS_ERROR;
+ }
+
+ if (!njs_is_null(&z)) {
+ goto done;
+ }
+
+ goto single;
+ }
+
+ utf8 = NJS_STRING_BYTE;
+
+ if (s.length != 0 && s.length != s.size) {
+ utf8 = NJS_STRING_UTF8;
+ }
+
+ p = 0;
+ q = 0;
+
+ while (q < length) {
+ njs_set_number(&setval, q);
+ ret = njs_value_property_set(vm, rx, njs_value_arg(&string_lindex),
+ &setval);
+ if (njs_slow_path(ret != NJS_OK)) {
+ return NJS_ERROR;
+ }
+
+ ret = njs_regexp_exec(vm, rx, string, &z);
+ if (njs_slow_path(ret != NJS_OK)) {
+ return NJS_ERROR;
+ }
+
+ if (njs_is_null(&z)) {
+ q = q + 1;
+ continue;
+ }
+
+ ret = njs_value_property(vm, rx, njs_value_arg(&string_lindex),
+ &retval);
+ if (njs_slow_path(ret == NJS_ERROR)) {
+ return NJS_ERROR;
+ }
+
+ ret = njs_value_to_length(vm, &retval, &e);
+ if (njs_slow_path(ret != NJS_OK)) {
+ return NJS_ERROR;
+ }
+
+ e = njs_min(e, length);
+
+ if (e == p) {
+ q = q + 1;
+ continue;
+ }
+
+ if (utf8 == NJS_STRING_UTF8) {
+ start = njs_string_offset(s.start, s.start + s.size, p);
+ end = njs_string_offset(s.start, s.start + s.size, q);
+
+ } else {
+ start = &s.start[p];
+ end = &s.start[q];
+ }
+
+ ret = njs_string_split_part_add(vm, array, utf8, start, end - start);
+ if (njs_slow_path(ret != NJS_OK)) {
+ return ret;
+ }
+
+ if (array->length == limit) {
+ goto done;
+ }
+
+ p = e;
+
+ ret = njs_object_length(vm, &z, &ncaptures);
+ if (njs_slow_path(ret != NJS_OK)) {
+ return NJS_ERROR;
+ }
+
+ ncaptures = njs_max(ncaptures - 1, 0);
+
+ for (i = 1; i <= ncaptures; i++) {
+ value = njs_array_push(vm, array);
+ if (njs_slow_path(value == NULL)) {
+ return NJS_ERROR;
+ }
+
+ ret = njs_value_property_i64(vm, &z, i, value);
+ if (njs_slow_path(ret == NJS_ERROR)) {
+ return NJS_ERROR;
+ }
+
+ if (array->length == limit) {
+ goto done;
+ }
+ }
+
+ q = p;
+ }
+
+ end = &s.start[s.size];
+
+ if (utf8 == NJS_STRING_UTF8) {
+ start = njs_string_offset(s.start, s.start + s.size, p);
+
+ } else {
+ start = &s.start[p];
+ }
+
+ ret = njs_string_split_part_add(vm, array, utf8, start, end - start);
+ if (njs_slow_path(ret != NJS_OK)) {
+ return ret;
+ }
+
+ goto done;
+
+single:
+
+ value = njs_array_push(vm, array);
+ if (njs_slow_path(value == NULL)) {
+ return NJS_ERROR;
+ }
+
+ *value = *string;
+
+done:
+
+ njs_set_array(&vm->retval, array);
+
+ return NJS_OK;
+}


static const njs_object_prop_t njs_regexp_constructor_properties[] =
@@ -1755,6 +1999,14 @@ static const njs_object_prop_t njs_rege
.writable = 1,
.configurable = 1,
},
+
+ {
+ .type = NJS_PROPERTY,
+ .name = njs_wellknown_symbol(NJS_SYMBOL_SPLIT),
+ .value = njs_native_function(njs_regexp_prototype_symbol_split, 2),
+ .writable = 1,
+ .configurable = 1,
+ },
};


diff -r 5516f717a8c9 -r f2d02c3b5f8a src/njs_string.c
--- a/src/njs_string.c Tue Jun 08 18:01:25 2021 +0000
+++ b/src/njs_string.c Wed Jun 09 17:14:10 2021 +0000
@@ -72,8 +72,6 @@ static njs_int_t njs_string_bytes_from_s
const njs_value_t *string, const njs_value_t *encoding);
static njs_int_t njs_string_match_multiple(njs_vm_t *vm, njs_value_t *args,
njs_regexp_pattern_t *pattern);
-static njs_int_t njs_string_split_part_add(njs_vm_t *vm, njs_array_t *array,
- njs_utf8_t utf8, const u_char *start, size_t size);


#define njs_base64_encoded_length(len) (((len + 2) / 3) * 4)
@@ -3338,19 +3336,49 @@ static njs_int_t
njs_string_prototype_split(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs,
njs_index_t unused)
{
- int *captures;
- size_t size;
- uint32_t limit;
- njs_int_t ret;
- njs_utf8_t utf8;
- njs_value_t *value;
- njs_array_t *array;
- const u_char *p, *start, *next, *last, *end;
- njs_regexp_utf8_t type;
- njs_string_prop_t string, split;
- njs_regexp_pattern_t *pattern;
-
- ret = njs_string_object_validate(vm, njs_arg(args, nargs, 0));
+ size_t size;
+ uint32_t limit;
+ njs_int_t ret;
+ njs_utf8_t utf8;
+ njs_bool_t undefined;
+ njs_value_t *this, *separator, *value;
+ njs_value_t separator_lvalue, limit_lvalue, splitter;
+ njs_array_t *array;
+ const u_char *p, *start, *next, *last, *end;
+ njs_string_prop_t string, split;
+ njs_value_t arguments[3];
+
+ static const njs_value_t split_key =
+ njs_wellknown_symbol(NJS_SYMBOL_SPLIT);
+
+ this = njs_argument(args, 0);
+
+ if (njs_slow_path(njs_is_null_or_undefined(this))) {
+ njs_type_error(vm, "cannot convert \"%s\"to object",
+ njs_type_string(this->type));
+ return NJS_ERROR;
+ }
+
+ separator = njs_lvalue_arg(&separator_lvalue, args, nargs, 1);
+ value = njs_lvalue_arg(&limit_lvalue, args, nargs, 2);
+
+ if (!njs_is_null_or_undefined(separator)) {
+ ret = njs_value_method(vm, separator, njs_value_arg(&split_key),
+ &splitter);
+ if (njs_slow_path(ret != NJS_OK)) {
+ return ret;
+ }
+
+ if (njs_is_defined(&splitter)) {
+ arguments[0] = *this;
+ arguments[1] = *value;
+
+ return njs_function_call(vm, njs_function(&splitter), separator,
+ arguments, 2, &vm->retval);
+ }
+ }
+
+ ret = njs_value_to_string(vm, this, this);
if (njs_slow_path(ret != NJS_OK)) {
return ret;
}
@@ -3360,159 +3388,99 @@ njs_string_prototype_split(njs_vm_t *vm,
return NJS_ERROR;
}

- if (nargs > 1) {
-
- if (nargs > 2) {
- value = njs_argument(args, 2);
-
- if (njs_slow_path(!njs_is_number(value))) {
- ret = njs_value_to_uint32(vm, value, &limit);
- if (njs_slow_path(ret != NJS_OK)) {
- return ret;
- }
-
- } else {
- limit = njs_number_to_uint32(njs_number(value));
- }
-
- if (limit == 0) {
- goto done;
- }
-
- } else {
- limit = (uint32_t) -1;
+ limit = UINT32_MAX;
+
+ if (njs_is_defined(value)) {
+ ret = njs_value_to_uint32(vm, value, &limit);
+ if (njs_slow_path(ret != NJS_OK)) {
+ return ret;
}
-
- (void) njs_string_prop(&string, &args[0]);
-
- if (string.size == 0) {
+ }
+
+ undefined = njs_is_undefined(separator);
+
+ ret = njs_value_to_string(vm, separator, separator);
+ if (njs_slow_path(ret != NJS_OK)) {
+ return ret;
+ }
+
+ if (njs_slow_path(limit == 0)) {
+ goto done;
+ }
+
+ if (njs_slow_path(undefined)) {
+ goto single;
+ }
+
+ (void) njs_string_prop(&string, this);
+ (void) njs_string_prop(&split, separator);
+
+ if (njs_slow_path(string.size == 0)) {
+ if (split.size != 0) {
goto single;
}

- utf8 = NJS_STRING_BYTE;
- type = NJS_REGEXP_BYTE;
-
- if (string.length != 0) {
- utf8 = NJS_STRING_ASCII;
- type = NJS_REGEXP_UTF8;
-
- if (string.length != string.size) {
- utf8 = NJS_STRING_UTF8;
+ goto done;
+ }
+
+ utf8 = NJS_STRING_BYTE;
+
+ if (string.length != 0) {
+ utf8 = NJS_STRING_ASCII;
+
+ if (string.length != string.size) {
+ utf8 = NJS_STRING_UTF8;
+ }
+ }
+
+ start = string.start;
+ end = string.start + string.size;
+ last = end - split.size;
+
+ do {
+
+ for (p = start; p <= last; p++) {
+ if (memcmp(p, split.start, split.size) == 0) {
+ goto found;
}
}

- switch (args[1].type) {
-
- case NJS_REGEXP:
- pattern = njs_regexp_pattern(&args[1]);
-
- if (!njs_regex_is_valid(&pattern->regex[type])) {
- goto single;
- }
-
- start = string.start;
- end = string.start + string.size;
-
- do {
- ret = njs_regexp_match(vm, &pattern->regex[type], start, 0,
- end - start, vm->single_match_data);
- if (ret >= 0) {
- captures = njs_regex_captures(vm->single_match_data);
-
- p = start + captures[0];
- next = start + captures[1];
-
- } else if (ret == NJS_REGEX_NOMATCH) {
- p = (u_char *) end;
- next = (u_char *) end + 1;
-
- } else {
- return NJS_ERROR;
- }
-
- /* Empty split regexp. */
- if (p == next) {
- p = (utf8 != NJS_STRING_BYTE) ? njs_utf8_next(p, end)
- : p + 1;
- next = p;
- }
-
- size = p - start;
-
- ret = njs_string_split_part_add(vm, array, utf8, start, size);
- if (njs_slow_path(ret != NJS_OK)) {
- return ret;
- }
-
- start = next;
- limit--;
-
- } while (limit != 0 && p < end);
-
- goto done;
-
- case NJS_UNDEFINED:
- break;
-
- default:
- if (njs_slow_path(!njs_is_string(&args[1]))) {
- ret = njs_value_to_string(vm, &args[1], &args[1]);
- if (njs_slow_path(ret != NJS_OK)) {
- return ret;
- }
- }
-
- (void) njs_string_prop(&split, &args[1]);
-
- if (string.size < split.size) {
- goto single;
- }
-
- start = string.start;
- end = string.start + string.size;
- last = end - split.size;
-
- do {
- for (p = start; p <= last; p++) {
- if (memcmp(p, split.start, split.size) == 0) {
- goto found;
- }
- }
-
- p = end;
+ p = end;

found:

- next = p + split.size;
-
- /* Empty split string. */
- if (p == next) {
- p = (utf8 != NJS_STRING_BYTE) ? njs_utf8_next(p, end)
- : p + 1;
- next = p;
- }
-
- size = p - start;
-
- ret = njs_string_split_part_add(vm, array, utf8, start, size);
- if (njs_slow_path(ret != NJS_OK)) {
- return ret;
- }
-
- start = next;
- limit--;
-
- } while (limit != 0 && p < end);
-
- goto done;
+ next = p + split.size;
+
+ /* Empty split string. */
+
+ if (p == next) {
+ p = (utf8 != NJS_STRING_BYTE) ? njs_utf8_next(p, end)
+ : p + 1;
+ next = p;
}
- }
+
+ size = p - start;
+
+ ret = njs_string_split_part_add(vm, array, utf8, start, size);
+ if (njs_slow_path(ret != NJS_OK)) {
+ return ret;
+ }
+
+ start = next;
+ limit--;
+
+ } while (limit != 0 && p < end);
+
+ goto done;

single:

- /* GC: retain. */
- array->start[0] = args[0];
- array->length = 1;
+ value = njs_array_push(vm, array);
+ if (njs_slow_path(value == NULL)) {
+ return NJS_ERROR;
+ }
+
+ *value = *this;

done:

@@ -3522,7 +3490,7 @@ done:
}


-static njs_int_t
+njs_int_t
njs_string_split_part_add(njs_vm_t *vm, njs_array_t *array, njs_utf8_t utf8,
const u_char *start, size_t size)
{
diff -r 5516f717a8c9 -r f2d02c3b5f8a src/njs_string.h
--- a/src/njs_string.h Tue Jun 08 18:01:25 2021 +0000
+++ b/src/njs_string.h Wed Jun 09 17:14:10 2021 +0000
@@ -239,6 +239,8 @@ njs_int_t njs_string_decode_uri(njs_vm_t

njs_int_t njs_string_prototype_concat(njs_vm_t *vm, njs_value_t *args,
njs_uint_t nargs, njs_index_t unused);
+njs_int_t njs_string_split_part_add(njs_vm_t *vm, njs_array_t *array,
+ njs_utf8_t utf8, const u_char *start, size_t size);
njs_int_t njs_string_get_substitution(njs_vm_t *vm, njs_value_t *matched,
njs_value_t *string, int64_t pos, njs_value_t *captures, int64_t ncaptures,
njs_value_t *groups, njs_value_t *replacement, njs_value_t *retval);
diff -r 5516f717a8c9 -r f2d02c3b5f8a src/test/njs_benchmark.c
--- a/src/test/njs_benchmark.c Tue Jun 08 18:01:25 2021 +0000
+++ b/src/test/njs_benchmark.c Wed Jun 09 17:14:10 2021 +0000
@@ -317,8 +317,10 @@ static njs_benchmark_test_t njs_test[]
1 },

{ "regexp split",
- njs_str("'a a'.split(/ /).length"),
- njs_str("2"),
+ njs_str("var s = Array(26).fill(0).map((v,i)=> {"
+ " var u = String.fromCodePoint(65+i), l = u.toLowerCase(); return u+l+l;}).join('');"
+ "s.split(/(?=[A-Z])/).length"),
+ njs_str("26"),
100 },

{ "regexp 10K split",
diff -r 5516f717a8c9 -r f2d02c3b5f8a src/test/njs_unit_test.c
--- a/src/test/njs_unit_test.c Tue Jun 08 18:01:25 2021 +0000
+++ b/src/test/njs_unit_test.c Wed Jun 09 17:14:10 2021 +0000
@@ -8635,6 +8635,9 @@ static njs_unit_test_t njs_test[] =
{ njs_str("/[\\"),
njs_str("SyntaxError: Unterminated RegExp \"/[\\\" in 1") },

+ { njs_str("/\\s*;\\s*/"),
+ njs_str("/\\s*;\\s*/") },
+
{ njs_str("RegExp(']')"),
njs_str("/\\]/") },

@@ -8802,7 +8805,7 @@ static njs_unit_test_t njs_test[] =
njs_str("abc") },

{ njs_str("''.split('').length"),
- njs_str("1") },
+ njs_str("0") },

{ njs_str("'abc'.split('')"),
njs_str("a,b,c") },
@@ -8858,9 +8861,40 @@ static njs_unit_test_t njs_test[] =
{ njs_str("'abc'.split(/abc/)"),
njs_str(",") },

+ { njs_str("'AbcDefGhi'.split(/([A-Z][a-z]+)/)"),
+ njs_str(",Abc,,Def,,Ghi,") },
+
+ { njs_str("'myCamelCaseString'.split(/(?=[A-Z])/)"),
+ njs_str("my,Camel,Case,String") },
+
+ { njs_str("'мояВерблюжьяСтрока'.split(/(?=[А-Я])/)"),
+ njs_str("моя,Верблюжья,Строка") },
+
+ { njs_str("'Harry Trump ;Fred Barney; Helen Rigby ; Bill Abel ;Chris Hand '.split( /\\s*(?:;|$)\\s*/)"),
+ njs_str("Harry Trump,Fred Barney,Helen Rigby,Bill Abel,Chris Hand,") },
+
+ { njs_str("'Гарри Трамп ;Фрэд Барни; Хелен Ригби ; Билл Абель'.split(/\\s*;\\s*/)"),
+ njs_str("Гарри Трамп,Фрэд Барни,Хелен Ригби,Билл Абель") },
+
+ { njs_str("'Hello 1 world. Sentence number 2.'.split(/(\\d)/)"),
+ njs_str("Hello ,1, world. Sentence number ,2,.") },
+
+ { njs_str("'Привет 1 мир. Предложение номер 2.'.split(/(\\d)/)"),
+ njs_str("Привет ,1, мир. Предложение номер ,2,.") },
+
{ njs_str("'0123456789'.split('').reverse().join('')"),
njs_str("9876543210") },

+ { njs_str("/-/[Symbol.split]('a-b-c')"),
+ njs_str("a,b,c") },
+
+ { njs_str("var O = RegExp.prototype[Symbol.split];"
+ "RegExp.prototype[Symbol.split] = function (s, limit) { "
+ " return O.call(this, s, limit).map(v => `@${v}#`); "
+ "};"
+ "'2016-01-02'.split(/-/)"),
+ njs_str("@2016#,@01#,@02#") },
+
{ njs_str("'abc'.repeat(3)"),
njs_str("abcabcabc") },

@@ -17006,11 +17040,13 @@ static njs_unit_test_t njs_test[] =
{ njs_str("var a = [1]; a[2] = 'x'; JSON.stringify(a)"),
njs_str("[1,null,\"x\"]") },

+#if (!NJS_HAVE_MEMORY_SANITIZER) /* very long test under MSAN */
{ njs_str(njs_declare_sparse_array("a", 32769)
"a[32] = 'a'; a[64] = 'b';"
"var s = JSON.stringify(a); "
"[s.length,s.substring(162,163),s.match(/null/g).length]"),
njs_str("163844,a,32767") },
+#endif

{ njs_str(njs_declare_sparse_array("a", 8)
"a[2] = 'a'; a[4] = 'b'; a.length = 3;"
_______________________________________________
nginx-devel mailing list
nginx-devel@nginx.org
http://mailman.nginx.org/mailman/listinfo/nginx-devel
Subject Author Views Posted

[njs] Fixed String.prototype.split() according to the specification.

Dmitry Volyntsev 125 June 09, 2021 01:56PM



Sorry, you do not have permission to post/reply in this forum.

Online Users

Guests: 64
Record Number of Users: 6 on February 13, 2018
Record Number of Guests: 421 on December 02, 2018
Powered by nginx      Powered by FreeBSD      PHP Powered      Powered by MariaDB      ipv6 ready