Dmitry Volyntsev
May 20, 2023 12:32AM
details: https://hg.nginx.org/njs/rev/3ec3e7d2ce5f
branches:
changeset: 2124:3ec3e7d2ce5f
user: Dmitry Volyntsev <xeioex@nginx.com>
date: Fri May 19 20:22:14 2023 -0700
description:
Added support of regular expressions not supported directly by PCRE2.

The following patterns were fixed:
`[]` - matches nothing; previously it was rejected as an invalid expression.
`[^]` - matches any character; unlike `.`, this syntax also matches a
newline. Previously it was rejected as an invalid expression.
`++`, `*+`, `?+` - are rejected now, whereas in PCRE2 they are considered
valid possessive quantifiers.

diffstat:

external/njs_regex.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++
src/njs_regexp.c | 45 +++++++++++++++++++++++++++++++-
src/test/njs_unit_test.c | 41 +++++++++++++++++++++++++++++
3 files changed, 152 insertions(+), 1 deletions(-)

diffs (206 lines):

diff -r 4d26300ddc64 -r 3ec3e7d2ce5f external/njs_regex.c
--- a/external/njs_regex.c Thu May 18 18:33:36 2023 -0700
+++ b/external/njs_regex.c Fri May 19 20:22:14 2023 -0700
@@ -94,6 +94,73 @@ njs_int_t
njs_regex_escape(njs_mp_t *mp, njs_str_t *text)
{
#ifdef NJS_HAVE_PCRE2
+ size_t anychars, nomatches;
+ u_char *p, *dst, *start, *end;
+
+ /*
+ * 1) [^] is a valid regexp expression in JavaScript, but PCRE2
+ * rejects it as invalid, replacing it with equivalent PCRE2 [\s\S]
+ * expression.
+ * 2) [] is a valid regexp expression in JavaScript, but PCRE2
+ * rejects it as invalid, replacing it with equivalent PCRE2 (?!)
+ * expression which matches nothing.
+ */
+
+ start = text->start;
+ end = text->start + text->length;
+
+ anychars = 0;
+ nomatches = 0;
+
+ for (p = start; p < end; p++) {
+ switch (*p) {
+ case '[':
+ if (p + 1 < end && p[1] == ']') {
+ p += 1;
+ nomatches += 1;
+
+ } else if (p + 2 < end && p[1] == '^' && p[2] == ']') {
+ p += 2;
+ anychars += 1;
+ }
+
+ break;
+ }
+ }
+
+ if (!anychars && !nomatches) {
+ return NJS_OK;
+ }
+
+ text->length = text->length
+ + anychars * (njs_length("\\s\\S") - njs_length("^"))
+ + nomatches * (njs_length("?!"));
+
+ text->start = njs_mp_alloc(mp, text->length);
+ if (njs_slow_path(text->start == NULL)) {
+ return NJS_ERROR;
+ }
+
+ dst = text->start;
+
+ for (p = start; p < end; p++) {
+
+ switch (*p) {
+ case '[':
+ if (p + 1 < end && p[1] == ']') {
+ p += 1;
+ dst = njs_cpymem(dst, "(?!)", 4);
+ continue;
+
+ } else if (p + 2 < end && p[1] == '^' && p[2] == ']') {
+ p += 2;
+ dst = njs_cpymem(dst, "[\\s\\S]", 6);
+ continue;
+ }
+ }
+
+ *dst++ = *p;
+ }

return NJS_OK;

diff -r 4d26300ddc64 -r 3ec3e7d2ce5f src/njs_regexp.c
--- a/src/njs_regexp.c Thu May 18 18:33:36 2023 -0700
+++ b/src/njs_regexp.c Fri May 19 20:22:14 2023 -0700
@@ -263,9 +263,10 @@ njs_regexp_pattern_create(njs_vm_t *vm,
njs_regex_flags_t flags)
{
int ret;
- u_char *p;
+ u_char *p, *end;
size_t size;
njs_str_t text;
+ njs_bool_t in;
njs_uint_t n;
njs_regex_t *regex;
njs_regexp_group_t *group;
@@ -274,6 +275,42 @@ njs_regexp_pattern_create(njs_vm_t *vm,
text.start = start;
text.length = length;

+ in = 0;
+ end = start + length;
+
+ for (p = start; p < end; p++) {
+
+ switch (*p) {
+ case '[':
+ in = 1;
+ break;
+
+ case ']':
+ in = 0;
+ break;
+
+ case '\\':
+ p++;
+ break;
+
+ case '+':
+ if (njs_slow_path(!in
+ && (p - 1 > start)
+ && (p[-1] == '+'|| p[-1] == '*' || p[-1] == '?'))
+ && (p - 2 >= start && p[-2] != '\\'))
+ {
+ /**
+ * PCRE possessive quantifiers `++`, `*+`, `?+`
+ * are not allowed in JavaScript. Whereas `[++]` or `\?+` are
+ * allowed.
+ */
+ goto nothing_to_repeat;
+ }
+
+ break;
+ }
+ }
+
ret = njs_regex_escape(vm->mem_pool, &text);
if (njs_slow_path(ret != NJS_OK)) {
njs_memory_error(vm);
@@ -370,6 +407,12 @@ fail:

njs_mp_free(vm->mem_pool, pattern);
return NULL;
+
+nothing_to_repeat:
+
+ njs_syntax_error(vm, "Invalid regular expression \"%V\" nothing to repeat",
+ &text);
+ return NULL;
}


diff -r 4d26300ddc64 -r 3ec3e7d2ce5f src/test/njs_unit_test.c
--- a/src/test/njs_unit_test.c Thu May 18 18:33:36 2023 -0700
+++ b/src/test/njs_unit_test.c Fri May 19 20:22:14 2023 -0700
@@ -11810,6 +11810,38 @@ static njs_unit_test_t njs_test[] =
{ njs_str("var r = /./; r"),
njs_str("/./") },

+ { njs_str("/[^]+|[^]+/.test('\\n| ')"),
+ njs_str("true") },
+
+ { njs_str("/[^]+|[^][^]/.test('|aa')"),
+ njs_str("true") },
+
+ { njs_str("/a[]/.test('a')"),
+ njs_str("false") },
+
+ { njs_str("/[]a/.test('a')"),
+ njs_str("false") },
+
+#ifdef NJS_HAVE_PCRE2
+ { njs_str("/[]*a/.test('a')"),
+ njs_str("true") },
+#endif
+
+ { njs_str("/Ca++BB/"),
+ njs_str("SyntaxError: Invalid regular expression \"Ca++BB\" nothing to repeat in 1") },
+
+ { njs_str("/a*+/"),
+ njs_str("SyntaxError: Invalid regular expression \"a*+\" nothing to repeat in 1") },
+
+ { njs_str("/a?+/"),
+ njs_str("SyntaxError: Invalid regular expression \"a?+\" nothing to repeat in 1") },
+
+ { njs_str(" /\\[[]++\\]/"),
+ njs_str("SyntaxError: Invalid regular expression \"\\[[]++\\]\" nothing to repeat in 1") },
+
+ { njs_str("/\\?+/"),
+ njs_str("/\\?+/") },
+
{ njs_str("var r = new RegExp(); r"),
njs_str("/(?:)/") },

@@ -11870,6 +11902,15 @@ static njs_unit_test_t njs_test[] =
{ njs_str("RegExp(new RegExp('expr'))"),
njs_str("/expr/") },

+ { njs_str("RegExp(RegExp('[^]+|[^][^]')).test('| \\na')"),
+ njs_str("true") },
+
+ { njs_str("RegExp('a++')"),
+ njs_str("SyntaxError: Invalid regular expression \"a++\" nothing to repeat") },
+
+ { njs_str("RegExp('[a++]')"),
+ njs_str("/[a++]/") },
+
{ njs_str("RegExp(new RegExp('expr')).multiline"),
njs_str("false") },

_______________________________________________
nginx-devel mailing list
nginx-devel@nginx.org
https://mailman.nginx.org/mailman/listinfo/nginx-devel
Subject Author Views Posted

[njs] Added support of regular expressions not supported directly by PCRE2.

Dmitry Volyntsev 266 May 20, 2023 12:32AM



Sorry, you do not have permission to post/reply in this forum.

Online Users

Guests: 205
Record Number of Users: 8 on April 13, 2023
Record Number of Guests: 421 on December 02, 2018
Powered by nginx      Powered by FreeBSD      PHP Powered      Powered by MariaDB      ipv6 ready