GDevelop Core
Core library for developing platforms and tools compatible with GDevelop.
regex.h
1 // Tencent is pleased to support the open source community by making RapidJSON available.
2 //
3 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
4 //
5 // Licensed under the MIT License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // http://opensource.org/licenses/MIT
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
15 #ifndef RAPIDJSON_INTERNAL_REGEX_H_
16 #define RAPIDJSON_INTERNAL_REGEX_H_
17 
18 #include "../allocators.h"
19 #include "../stream.h"
20 #include "stack.h"
21 
22 #ifdef __clang__
23 RAPIDJSON_DIAG_PUSH
24 RAPIDJSON_DIAG_OFF(padded)
25 RAPIDJSON_DIAG_OFF(switch-enum)
26 RAPIDJSON_DIAG_OFF(implicit-fallthrough)
27 #endif
28 
29 #ifdef __GNUC__
30 RAPIDJSON_DIAG_PUSH
31 RAPIDJSON_DIAG_OFF(effc++)
32 #endif
33 
34 #ifdef _MSC_VER
35 RAPIDJSON_DIAG_PUSH
36 RAPIDJSON_DIAG_OFF(4512) // assignment operator could not be generated
37 #endif
38 
39 #ifndef RAPIDJSON_REGEX_VERBOSE
40 #define RAPIDJSON_REGEX_VERBOSE 0
41 #endif
42 
44 namespace internal {
45 
47 // GenericRegex
48 
49 static const SizeType kRegexInvalidState = ~SizeType(0);
50 static const SizeType kRegexInvalidRange = ~SizeType(0);
51 
53 
84 template <typename Encoding, typename Allocator = CrtAllocator>
85 class GenericRegex {
86 public:
87  typedef typename Encoding::Ch Ch;
88 
89  GenericRegex(const Ch* source, Allocator* allocator = 0) :
90  states_(allocator, 256), ranges_(allocator, 256), root_(kRegexInvalidState), stateCount_(), rangeCount_(),
91  stateSet_(), state0_(allocator, 0), state1_(allocator, 0), anchorBegin_(), anchorEnd_()
92  {
94  DecodedStream<GenericStringStream<Encoding> > ds(ss);
95  Parse(ds);
96  }
97 
98  ~GenericRegex() {
99  Allocator::Free(stateSet_);
100  }
101 
102  bool IsValid() const {
103  return root_ != kRegexInvalidState;
104  }
105 
106  template <typename InputStream>
107  bool Match(InputStream& is) const {
108  return SearchWithAnchoring(is, true, true);
109  }
110 
111  bool Match(const Ch* s) const {
113  return Match(is);
114  }
115 
116  template <typename InputStream>
117  bool Search(InputStream& is) const {
118  return SearchWithAnchoring(is, anchorBegin_, anchorEnd_);
119  }
120 
121  bool Search(const Ch* s) const {
123  return Search(is);
124  }
125 
126 private:
//! Operators handled by the pattern compiler.
/*!
    The enumerator order is significant: Parse() evaluates pending operators
    that compare less than kAlternation when it meets '|'.
*/
enum Operator {
    kZeroOrOne,         //!< '?'
    kZeroOrMore,        //!< '*'
    kOneOrMore,         //!< '+'
    kConcatenation,     //!< implicit, inserted between adjacent atoms
    kAlternation,       //!< '|'
    kLeftParenthesis    //!< '(' marker kept on the operator stack
};
135 
//! State codepoint value meaning "any character" (pattern '.').
static const unsigned kAnyCharacterClass = 0xFFFFFFFFu;
//! State codepoint value meaning "consult the range list at State::rangeStart" (pattern '[...]').
static const unsigned kRangeCharacterClass = 0xFFFFFFFEu;
//! Bit set in the first Range::start of a list to mark a negated class ('[^...]').
static const unsigned kRangeNegationFlag = 0x80000000u;
139 
140  struct Range {
141  unsigned start; //
142  unsigned end;
143  SizeType next;
144  };
145 
146  struct State {
147  SizeType out;
148  SizeType out1;
149  SizeType rangeStart;
150  unsigned codepoint;
151  };
152 
153  struct Frag {
154  Frag(SizeType s, SizeType o, SizeType m) : start(s), out(o), minIndex(m) {}
155  SizeType start;
156  SizeType out;
157  SizeType minIndex;
158  };
159 
160  template <typename SourceStream>
161  class DecodedStream {
162  public:
163  DecodedStream(SourceStream& ss) : ss_(ss), codepoint_() { Decode(); }
164  unsigned Peek() { return codepoint_; }
165  unsigned Take() {
166  unsigned c = codepoint_;
167  if (c) // No further decoding when '\0'
168  Decode();
169  return c;
170  }
171 
172  private:
173  void Decode() {
174  if (!Encoding::Decode(ss_, &codepoint_))
175  codepoint_ = 0;
176  }
177 
178  SourceStream& ss_;
179  unsigned codepoint_;
180  };
181 
182  State& GetState(SizeType index) {
183  RAPIDJSON_ASSERT(index < stateCount_);
184  return states_.template Bottom<State>()[index];
185  }
186 
187  const State& GetState(SizeType index) const {
188  RAPIDJSON_ASSERT(index < stateCount_);
189  return states_.template Bottom<State>()[index];
190  }
191 
192  Range& GetRange(SizeType index) {
193  RAPIDJSON_ASSERT(index < rangeCount_);
194  return ranges_.template Bottom<Range>()[index];
195  }
196 
197  const Range& GetRange(SizeType index) const {
198  RAPIDJSON_ASSERT(index < rangeCount_);
199  return ranges_.template Bottom<Range>()[index];
200  }
201 
202  template <typename InputStream>
203  void Parse(DecodedStream<InputStream>& ds) {
204  Allocator allocator;
205  Stack<Allocator> operandStack(&allocator, 256); // Frag
206  Stack<Allocator> operatorStack(&allocator, 256); // Operator
207  Stack<Allocator> atomCountStack(&allocator, 256); // unsigned (Atom per parenthesis)
208 
209  *atomCountStack.template Push<unsigned>() = 0;
210 
211  unsigned codepoint;
212  while (ds.Peek() != 0) {
213  switch (codepoint = ds.Take()) {
214  case '^':
215  anchorBegin_ = true;
216  break;
217 
218  case '$':
219  anchorEnd_ = true;
220  break;
221 
222  case '|':
223  while (!operatorStack.Empty() && *operatorStack.template Top<Operator>() < kAlternation)
224  if (!Eval(operandStack, *operatorStack.template Pop<Operator>(1)))
225  return;
226  *operatorStack.template Push<Operator>() = kAlternation;
227  *atomCountStack.template Top<unsigned>() = 0;
228  break;
229 
230  case '(':
231  *operatorStack.template Push<Operator>() = kLeftParenthesis;
232  *atomCountStack.template Push<unsigned>() = 0;
233  break;
234 
235  case ')':
236  while (!operatorStack.Empty() && *operatorStack.template Top<Operator>() != kLeftParenthesis)
237  if (!Eval(operandStack, *operatorStack.template Pop<Operator>(1)))
238  return;
239  if (operatorStack.Empty())
240  return;
241  operatorStack.template Pop<Operator>(1);
242  atomCountStack.template Pop<unsigned>(1);
243  ImplicitConcatenation(atomCountStack, operatorStack);
244  break;
245 
246  case '?':
247  if (!Eval(operandStack, kZeroOrOne))
248  return;
249  break;
250 
251  case '*':
252  if (!Eval(operandStack, kZeroOrMore))
253  return;
254  break;
255 
256  case '+':
257  if (!Eval(operandStack, kOneOrMore))
258  return;
259  break;
260 
261  case '{':
262  {
263  unsigned n, m;
264  if (!ParseUnsigned(ds, &n))
265  return;
266 
267  if (ds.Peek() == ',') {
268  ds.Take();
269  if (ds.Peek() == '}')
270  m = kInfinityQuantifier;
271  else if (!ParseUnsigned(ds, &m) || m < n)
272  return;
273  }
274  else
275  m = n;
276 
277  if (!EvalQuantifier(operandStack, n, m) || ds.Peek() != '}')
278  return;
279  ds.Take();
280  }
281  break;
282 
283  case '.':
284  PushOperand(operandStack, kAnyCharacterClass);
285  ImplicitConcatenation(atomCountStack, operatorStack);
286  break;
287 
288  case '[':
289  {
290  SizeType range;
291  if (!ParseRange(ds, &range))
292  return;
293  SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, kRangeCharacterClass);
294  GetState(s).rangeStart = range;
295  *operandStack.template Push<Frag>() = Frag(s, s, s);
296  }
297  ImplicitConcatenation(atomCountStack, operatorStack);
298  break;
299 
300  case '\\': // Escape character
301  if (!CharacterEscape(ds, &codepoint))
302  return; // Unsupported escape character
303  // fall through to default
304 
305  default: // Pattern character
306  PushOperand(operandStack, codepoint);
307  ImplicitConcatenation(atomCountStack, operatorStack);
308  }
309  }
310 
311  while (!operatorStack.Empty())
312  if (!Eval(operandStack, *operatorStack.template Pop<Operator>(1)))
313  return;
314 
315  // Link the operand to matching state.
316  if (operandStack.GetSize() == sizeof(Frag)) {
317  Frag* e = operandStack.template Pop<Frag>(1);
318  Patch(e->out, NewState(kRegexInvalidState, kRegexInvalidState, 0));
319  root_ = e->start;
320 
321 #if RAPIDJSON_REGEX_VERBOSE
322  printf("root: %d\n", root_);
323  for (SizeType i = 0; i < stateCount_ ; i++) {
324  State& s = GetState(i);
325  printf("[%2d] out: %2d out1: %2d c: '%c'\n", i, s.out, s.out1, (char)s.codepoint);
326  }
327  printf("\n");
328 #endif
329  }
330 
331  // Preallocate buffer for SearchWithAnchoring()
332  RAPIDJSON_ASSERT(stateSet_ == 0);
333  if (stateCount_ > 0) {
334  stateSet_ = static_cast<unsigned*>(states_.GetAllocator().Malloc(GetStateSetSize()));
335  state0_.template Reserve<SizeType>(stateCount_);
336  state1_.template Reserve<SizeType>(stateCount_);
337  }
338  }
339 
340  SizeType NewState(SizeType out, SizeType out1, unsigned codepoint) {
341  State* s = states_.template Push<State>();
342  s->out = out;
343  s->out1 = out1;
344  s->codepoint = codepoint;
345  s->rangeStart = kRegexInvalidRange;
346  return stateCount_++;
347  }
348 
349  void PushOperand(Stack<Allocator>& operandStack, unsigned codepoint) {
350  SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, codepoint);
351  *operandStack.template Push<Frag>() = Frag(s, s, s);
352  }
353 
354  void ImplicitConcatenation(Stack<Allocator>& atomCountStack, Stack<Allocator>& operatorStack) {
355  if (*atomCountStack.template Top<unsigned>())
356  *operatorStack.template Push<Operator>() = kConcatenation;
357  (*atomCountStack.template Top<unsigned>())++;
358  }
359 
360  SizeType Append(SizeType l1, SizeType l2) {
361  SizeType old = l1;
362  while (GetState(l1).out != kRegexInvalidState)
363  l1 = GetState(l1).out;
364  GetState(l1).out = l2;
365  return old;
366  }
367 
368  void Patch(SizeType l, SizeType s) {
369  for (SizeType next; l != kRegexInvalidState; l = next) {
370  next = GetState(l).out;
371  GetState(l).out = s;
372  }
373  }
374 
375  bool Eval(Stack<Allocator>& operandStack, Operator op) {
376  switch (op) {
377  case kConcatenation:
378  RAPIDJSON_ASSERT(operandStack.GetSize() >= sizeof(Frag) * 2);
379  {
380  Frag e2 = *operandStack.template Pop<Frag>(1);
381  Frag e1 = *operandStack.template Pop<Frag>(1);
382  Patch(e1.out, e2.start);
383  *operandStack.template Push<Frag>() = Frag(e1.start, e2.out, Min(e1.minIndex, e2.minIndex));
384  }
385  return true;
386 
387  case kAlternation:
388  if (operandStack.GetSize() >= sizeof(Frag) * 2) {
389  Frag e2 = *operandStack.template Pop<Frag>(1);
390  Frag e1 = *operandStack.template Pop<Frag>(1);
391  SizeType s = NewState(e1.start, e2.start, 0);
392  *operandStack.template Push<Frag>() = Frag(s, Append(e1.out, e2.out), Min(e1.minIndex, e2.minIndex));
393  return true;
394  }
395  return false;
396 
397  case kZeroOrOne:
398  if (operandStack.GetSize() >= sizeof(Frag)) {
399  Frag e = *operandStack.template Pop<Frag>(1);
400  SizeType s = NewState(kRegexInvalidState, e.start, 0);
401  *operandStack.template Push<Frag>() = Frag(s, Append(e.out, s), e.minIndex);
402  return true;
403  }
404  return false;
405 
406  case kZeroOrMore:
407  if (operandStack.GetSize() >= sizeof(Frag)) {
408  Frag e = *operandStack.template Pop<Frag>(1);
409  SizeType s = NewState(kRegexInvalidState, e.start, 0);
410  Patch(e.out, s);
411  *operandStack.template Push<Frag>() = Frag(s, s, e.minIndex);
412  return true;
413  }
414  return false;
415 
416  default:
417  RAPIDJSON_ASSERT(op == kOneOrMore);
418  if (operandStack.GetSize() >= sizeof(Frag)) {
419  Frag e = *operandStack.template Pop<Frag>(1);
420  SizeType s = NewState(kRegexInvalidState, e.start, 0);
421  Patch(e.out, s);
422  *operandStack.template Push<Frag>() = Frag(e.start, s, e.minIndex);
423  return true;
424  }
425  return false;
426  }
427  }
428 
429  bool EvalQuantifier(Stack<Allocator>& operandStack, unsigned n, unsigned m) {
430  RAPIDJSON_ASSERT(n <= m);
431  RAPIDJSON_ASSERT(operandStack.GetSize() >= sizeof(Frag));
432 
433  if (n == 0) {
434  if (m == 0) // a{0} not support
435  return false;
436  else if (m == kInfinityQuantifier)
437  Eval(operandStack, kZeroOrMore); // a{0,} -> a*
438  else {
439  Eval(operandStack, kZeroOrOne); // a{0,5} -> a?
440  for (unsigned i = 0; i < m - 1; i++)
441  CloneTopOperand(operandStack); // a{0,5} -> a? a? a? a? a?
442  for (unsigned i = 0; i < m - 1; i++)
443  Eval(operandStack, kConcatenation); // a{0,5} -> a?a?a?a?a?
444  }
445  return true;
446  }
447 
448  for (unsigned i = 0; i < n - 1; i++) // a{3} -> a a a
449  CloneTopOperand(operandStack);
450 
451  if (m == kInfinityQuantifier)
452  Eval(operandStack, kOneOrMore); // a{3,} -> a a a+
453  else if (m > n) {
454  CloneTopOperand(operandStack); // a{3,5} -> a a a a
455  Eval(operandStack, kZeroOrOne); // a{3,5} -> a a a a?
456  for (unsigned i = n; i < m - 1; i++)
457  CloneTopOperand(operandStack); // a{3,5} -> a a a a? a?
458  for (unsigned i = n; i < m; i++)
459  Eval(operandStack, kConcatenation); // a{3,5} -> a a aa?a?
460  }
461 
462  for (unsigned i = 0; i < n - 1; i++)
463  Eval(operandStack, kConcatenation); // a{3} -> aaa, a{3,} -> aaa+, a{3.5} -> aaaa?a?
464 
465  return true;
466  }
467 
468  static SizeType Min(SizeType a, SizeType b) { return a < b ? a : b; }
469 
470  void CloneTopOperand(Stack<Allocator>& operandStack) {
471  const Frag src = *operandStack.template Top<Frag>(); // Copy constructor to prevent invalidation
472  SizeType count = stateCount_ - src.minIndex; // Assumes top operand contains states in [src->minIndex, stateCount_)
473  State* s = states_.template Push<State>(count);
474  memcpy(s, &GetState(src.minIndex), count * sizeof(State));
475  for (SizeType j = 0; j < count; j++) {
476  if (s[j].out != kRegexInvalidState)
477  s[j].out += count;
478  if (s[j].out1 != kRegexInvalidState)
479  s[j].out1 += count;
480  }
481  *operandStack.template Push<Frag>() = Frag(src.start + count, src.out + count, src.minIndex + count);
482  stateCount_ += count;
483  }
484 
485  template <typename InputStream>
486  bool ParseUnsigned(DecodedStream<InputStream>& ds, unsigned* u) {
487  unsigned r = 0;
488  if (ds.Peek() < '0' || ds.Peek() > '9')
489  return false;
490  while (ds.Peek() >= '0' && ds.Peek() <= '9') {
491  if (r >= 429496729 && ds.Peek() > '5') // 2^32 - 1 = 4294967295
492  return false; // overflow
493  r = r * 10 + (ds.Take() - '0');
494  }
495  *u = r;
496  return true;
497  }
498 
499  template <typename InputStream>
500  bool ParseRange(DecodedStream<InputStream>& ds, SizeType* range) {
501  bool isBegin = true;
502  bool negate = false;
503  int step = 0;
504  SizeType start = kRegexInvalidRange;
505  SizeType current = kRegexInvalidRange;
506  unsigned codepoint;
507  while ((codepoint = ds.Take()) != 0) {
508  if (isBegin) {
509  isBegin = false;
510  if (codepoint == '^') {
511  negate = true;
512  continue;
513  }
514  }
515 
516  switch (codepoint) {
517  case ']':
518  if (start == kRegexInvalidRange)
519  return false; // Error: nothing inside []
520  if (step == 2) { // Add trailing '-'
521  SizeType r = NewRange('-');
522  RAPIDJSON_ASSERT(current != kRegexInvalidRange);
523  GetRange(current).next = r;
524  }
525  if (negate)
526  GetRange(start).start |= kRangeNegationFlag;
527  *range = start;
528  return true;
529 
530  case '\\':
531  if (ds.Peek() == 'b') {
532  ds.Take();
533  codepoint = 0x0008; // Escape backspace character
534  }
535  else if (!CharacterEscape(ds, &codepoint))
536  return false;
537  // fall through to default
538 
539  default:
540  switch (step) {
541  case 1:
542  if (codepoint == '-') {
543  step++;
544  break;
545  }
546  // fall through to step 0 for other characters
547 
548  case 0:
549  {
550  SizeType r = NewRange(codepoint);
551  if (current != kRegexInvalidRange)
552  GetRange(current).next = r;
553  if (start == kRegexInvalidRange)
554  start = r;
555  current = r;
556  }
557  step = 1;
558  break;
559 
560  default:
561  RAPIDJSON_ASSERT(step == 2);
562  GetRange(current).end = codepoint;
563  step = 0;
564  }
565  }
566  }
567  return false;
568  }
569 
570  SizeType NewRange(unsigned codepoint) {
571  Range* r = ranges_.template Push<Range>();
572  r->start = r->end = codepoint;
573  r->next = kRegexInvalidRange;
574  return rangeCount_++;
575  }
576 
577  template <typename InputStream>
578  bool CharacterEscape(DecodedStream<InputStream>& ds, unsigned* escapedCodepoint) {
579  unsigned codepoint;
580  switch (codepoint = ds.Take()) {
581  case '^':
582  case '$':
583  case '|':
584  case '(':
585  case ')':
586  case '?':
587  case '*':
588  case '+':
589  case '.':
590  case '[':
591  case ']':
592  case '{':
593  case '}':
594  case '\\':
595  *escapedCodepoint = codepoint; return true;
596  case 'f': *escapedCodepoint = 0x000C; return true;
597  case 'n': *escapedCodepoint = 0x000A; return true;
598  case 'r': *escapedCodepoint = 0x000D; return true;
599  case 't': *escapedCodepoint = 0x0009; return true;
600  case 'v': *escapedCodepoint = 0x000B; return true;
601  default:
602  return false; // Unsupported escape character
603  }
604  }
605 
606  template <typename InputStream>
607  bool SearchWithAnchoring(InputStream& is, bool anchorBegin, bool anchorEnd) const {
608  RAPIDJSON_ASSERT(IsValid());
609  DecodedStream<InputStream> ds(is);
610 
611  state0_.Clear();
612  Stack<Allocator> *current = &state0_, *next = &state1_;
613  const size_t stateSetSize = GetStateSetSize();
614  std::memset(stateSet_, 0, stateSetSize);
615 
616  bool matched = AddState(*current, root_);
617  unsigned codepoint;
618  while (!current->Empty() && (codepoint = ds.Take()) != 0) {
619  std::memset(stateSet_, 0, stateSetSize);
620  next->Clear();
621  matched = false;
622  for (const SizeType* s = current->template Bottom<SizeType>(); s != current->template End<SizeType>(); ++s) {
623  const State& sr = GetState(*s);
624  if (sr.codepoint == codepoint ||
625  sr.codepoint == kAnyCharacterClass ||
626  (sr.codepoint == kRangeCharacterClass && MatchRange(sr.rangeStart, codepoint)))
627  {
628  matched = AddState(*next, sr.out) || matched;
629  if (!anchorEnd && matched)
630  return true;
631  }
632  if (!anchorBegin)
633  AddState(*next, root_);
634  }
635  internal::Swap(current, next);
636  }
637 
638  return matched;
639  }
640 
641  size_t GetStateSetSize() const {
642  return (stateCount_ + 31) / 32 * 4;
643  }
644 
645  // Return whether the added states is a match state
646  bool AddState(Stack<Allocator>& l, SizeType index) const {
647  RAPIDJSON_ASSERT(index != kRegexInvalidState);
648 
649  const State& s = GetState(index);
650  if (s.out1 != kRegexInvalidState) { // Split
651  bool matched = AddState(l, s.out);
652  return AddState(l, s.out1) || matched;
653  }
654  else if (!(stateSet_[index >> 5] & (1 << (index & 31)))) {
655  stateSet_[index >> 5] |= (1 << (index & 31));
656  *l.template PushUnsafe<SizeType>() = index;
657  }
658  return s.out == kRegexInvalidState; // by using PushUnsafe() above, we can ensure s is not validated due to reallocation.
659  }
660 
661  bool MatchRange(SizeType rangeIndex, unsigned codepoint) const {
662  bool yes = (GetRange(rangeIndex).start & kRangeNegationFlag) == 0;
663  while (rangeIndex != kRegexInvalidRange) {
664  const Range& r = GetRange(rangeIndex);
665  if (codepoint >= (r.start & ~kRangeNegationFlag) && codepoint <= r.end)
666  return yes;
667  rangeIndex = r.next;
668  }
669  return !yes;
670  }
671 
672  Stack<Allocator> states_;
673  Stack<Allocator> ranges_;
674  SizeType root_;
675  SizeType stateCount_;
676  SizeType rangeCount_;
677 
678  static const unsigned kInfinityQuantifier = ~0u;
679 
680  // For SearchWithAnchoring()
681  uint32_t* stateSet_; // allocated by states_.GetAllocator()
682  mutable Stack<Allocator> state0_;
683  mutable Stack<Allocator> state1_;
684  bool anchorBegin_;
685  bool anchorEnd_;
686 };
687 
688 typedef GenericRegex<UTF8<> > Regex;
689 
690 } // namespace internal
692 
693 #ifdef __clang__
694 RAPIDJSON_DIAG_POP
695 #endif
696 
697 #ifdef _MSC_VER
698 RAPIDJSON_DIAG_POP
699 #endif
700 
701 #endif // RAPIDJSON_INTERNAL_REGEX_H_
Regular expression engine supporting a subset of the ECMAScript grammar.
Definition: regex.h:85
A type-unsafe stack for storing different types of data.
Definition: stack.h:36
Concept for allocating, resizing and freeing memory block.
#define RAPIDJSON_ASSERT(x)
Assertion.
Definition: rapidjson.h:402
#define RAPIDJSON_NAMESPACE_BEGIN
provide custom rapidjson namespace (opening expression)
Definition: rapidjson.h:116
#define RAPIDJSON_NAMESPACE_END
provide custom rapidjson namespace (closing expression)
Definition: rapidjson.h:119
RAPIDJSON_NAMESPACE_BEGIN typedef unsigned SizeType
Size type (for string lengths, array sizes, etc.)
Definition: rapidjson.h:380
Read-only string stream.
Definition: stream.h:110