15 #ifndef RAPIDJSON_INTERNAL_REGEX_H_
16 #define RAPIDJSON_INTERNAL_REGEX_H_
18 #include "../allocators.h"
19 #include "../stream.h"
24 RAPIDJSON_DIAG_OFF(padded)
25 RAPIDJSON_DIAG_OFF(
switch-
enum)
26 RAPIDJSON_DIAG_OFF(implicit-fallthrough)
31 RAPIDJSON_DIAG_OFF(effc++)
36 RAPIDJSON_DIAG_OFF(4512)
39 #ifndef RAPIDJSON_REGEX_VERBOSE
40 #define RAPIDJSON_REGEX_VERBOSE 0
84 template <
typename Encoding,
typename Allocator = CrtAllocator>
87 typedef typename Encoding::Ch Ch;
90 states_(allocator, 256), ranges_(allocator, 256), root_(kRegexInvalidState), stateCount_(), rangeCount_(),
91 stateSet_(), state0_(allocator, 0), state1_(allocator, 0), anchorBegin_(), anchorEnd_()
94 DecodedStream<GenericStringStream<Encoding> > ds(ss);
99 Allocator::Free(stateSet_);
102 bool IsValid()
const {
103 return root_ != kRegexInvalidState;
106 template <
typename InputStream>
107 bool Match(InputStream& is)
const {
108 return SearchWithAnchoring(is,
true,
true);
111 bool Match(
const Ch* s)
const {
116 template <
typename InputStream>
117 bool Search(InputStream& is)
const {
118 return SearchWithAnchoring(is, anchorBegin_, anchorEnd_);
121 bool Search(
const Ch* s)
const {
136 static const unsigned kAnyCharacterClass = 0xFFFFFFFF;
137 static const unsigned kRangeCharacterClass = 0xFFFFFFFE;
138 static const unsigned kRangeNegationFlag = 0x80000000;
160 template <
typename SourceStream>
161 class DecodedStream {
163 DecodedStream(SourceStream& ss) : ss_(ss), codepoint_() { Decode(); }
164 unsigned Peek() {
return codepoint_; }
166 unsigned c = codepoint_;
174 if (!Encoding::Decode(ss_, &codepoint_))
184 return states_.template Bottom<State>()[index];
187 const State& GetState(
SizeType index)
const {
189 return states_.template Bottom<State>()[index];
194 return ranges_.template Bottom<Range>()[index];
197 const Range& GetRange(
SizeType index)
const {
199 return ranges_.template Bottom<Range>()[index];
202 template <
typename InputStream>
203 void Parse(DecodedStream<InputStream>& ds) {
209 *atomCountStack.template Push<unsigned>() = 0;
212 while (ds.Peek() != 0) {
213 switch (codepoint = ds.Take()) {
223 while (!operatorStack.Empty() && *operatorStack.template Top<Operator>() < kAlternation)
224 if (!Eval(operandStack, *operatorStack.template Pop<Operator>(1)))
226 *operatorStack.template Push<Operator>() = kAlternation;
227 *atomCountStack.template Top<unsigned>() = 0;
231 *operatorStack.template Push<Operator>() = kLeftParenthesis;
232 *atomCountStack.template Push<unsigned>() = 0;
236 while (!operatorStack.Empty() && *operatorStack.template Top<Operator>() != kLeftParenthesis)
237 if (!Eval(operandStack, *operatorStack.template Pop<Operator>(1)))
239 if (operatorStack.Empty())
241 operatorStack.template Pop<Operator>(1);
242 atomCountStack.template Pop<unsigned>(1);
243 ImplicitConcatenation(atomCountStack, operatorStack);
247 if (!Eval(operandStack, kZeroOrOne))
252 if (!Eval(operandStack, kZeroOrMore))
257 if (!Eval(operandStack, kOneOrMore))
264 if (!ParseUnsigned(ds, &n))
267 if (ds.Peek() ==
',') {
269 if (ds.Peek() ==
'}')
270 m = kInfinityQuantifier;
271 else if (!ParseUnsigned(ds, &m) || m < n)
277 if (!EvalQuantifier(operandStack, n, m) || ds.Peek() !=
'}')
284 PushOperand(operandStack, kAnyCharacterClass);
285 ImplicitConcatenation(atomCountStack, operatorStack);
291 if (!ParseRange(ds, &range))
293 SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, kRangeCharacterClass);
294 GetState(s).rangeStart = range;
295 *operandStack.template Push<Frag>() = Frag(s, s, s);
297 ImplicitConcatenation(atomCountStack, operatorStack);
301 if (!CharacterEscape(ds, &codepoint))
306 PushOperand(operandStack, codepoint);
307 ImplicitConcatenation(atomCountStack, operatorStack);
311 while (!operatorStack.Empty())
312 if (!Eval(operandStack, *operatorStack.template Pop<Operator>(1)))
316 if (operandStack.GetSize() ==
sizeof(Frag)) {
317 Frag* e = operandStack.template Pop<Frag>(1);
318 Patch(e->out, NewState(kRegexInvalidState, kRegexInvalidState, 0));
321 #if RAPIDJSON_REGEX_VERBOSE
322 printf(
"root: %d\n", root_);
323 for (
SizeType i = 0; i < stateCount_ ; i++) {
324 State& s = GetState(i);
325 printf(
"[%2d] out: %2d out1: %2d c: '%c'\n", i, s.out, s.out1, (
char)s.codepoint);
333 if (stateCount_ > 0) {
334 stateSet_ =
static_cast<unsigned*
>(states_.GetAllocator().Malloc(GetStateSetSize()));
335 state0_.template Reserve<SizeType>(stateCount_);
336 state1_.template Reserve<SizeType>(stateCount_);
341 State* s = states_.template Push<State>();
344 s->codepoint = codepoint;
345 s->rangeStart = kRegexInvalidRange;
346 return stateCount_++;
350 SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, codepoint);
351 *operandStack.template Push<Frag>() = Frag(s, s, s);
355 if (*atomCountStack.template Top<unsigned>())
356 *operatorStack.template Push<Operator>() = kConcatenation;
357 (*atomCountStack.template Top<unsigned>())++;
362 while (GetState(l1).out != kRegexInvalidState)
363 l1 = GetState(l1).out;
364 GetState(l1).out = l2;
369 for (
SizeType next; l != kRegexInvalidState; l = next) {
370 next = GetState(l).out;
380 Frag e2 = *operandStack.template Pop<Frag>(1);
381 Frag e1 = *operandStack.template Pop<Frag>(1);
382 Patch(e1.out, e2.start);
383 *operandStack.template Push<Frag>() = Frag(e1.start, e2.out, Min(e1.minIndex, e2.minIndex));
388 if (operandStack.GetSize() >=
sizeof(Frag) * 2) {
389 Frag e2 = *operandStack.template Pop<Frag>(1);
390 Frag e1 = *operandStack.template Pop<Frag>(1);
391 SizeType s = NewState(e1.start, e2.start, 0);
392 *operandStack.template Push<Frag>() = Frag(s, Append(e1.out, e2.out), Min(e1.minIndex, e2.minIndex));
398 if (operandStack.GetSize() >=
sizeof(Frag)) {
399 Frag e = *operandStack.template Pop<Frag>(1);
400 SizeType s = NewState(kRegexInvalidState, e.start, 0);
401 *operandStack.template Push<Frag>() = Frag(s, Append(e.out, s), e.minIndex);
407 if (operandStack.GetSize() >=
sizeof(Frag)) {
408 Frag e = *operandStack.template Pop<Frag>(1);
409 SizeType s = NewState(kRegexInvalidState, e.start, 0);
411 *operandStack.template Push<Frag>() = Frag(s, s, e.minIndex);
418 if (operandStack.GetSize() >=
sizeof(Frag)) {
419 Frag e = *operandStack.template Pop<Frag>(1);
420 SizeType s = NewState(kRegexInvalidState, e.start, 0);
422 *operandStack.template Push<Frag>() = Frag(e.start, s, e.minIndex);
429 bool EvalQuantifier(
Stack<Allocator>& operandStack,
unsigned n,
unsigned m) {
436 else if (m == kInfinityQuantifier)
437 Eval(operandStack, kZeroOrMore);
439 Eval(operandStack, kZeroOrOne);
440 for (
unsigned i = 0; i < m - 1; i++)
441 CloneTopOperand(operandStack);
442 for (
unsigned i = 0; i < m - 1; i++)
443 Eval(operandStack, kConcatenation);
448 for (
unsigned i = 0; i < n - 1; i++)
449 CloneTopOperand(operandStack);
451 if (m == kInfinityQuantifier)
452 Eval(operandStack, kOneOrMore);
454 CloneTopOperand(operandStack);
455 Eval(operandStack, kZeroOrOne);
456 for (
unsigned i = n; i < m - 1; i++)
457 CloneTopOperand(operandStack);
458 for (
unsigned i = n; i < m; i++)
459 Eval(operandStack, kConcatenation);
462 for (
unsigned i = 0; i < n - 1; i++)
463 Eval(operandStack, kConcatenation);
471 const Frag src = *operandStack.template Top<Frag>();
472 SizeType count = stateCount_ - src.minIndex;
473 State* s = states_.template Push<State>(count);
474 memcpy(s, &GetState(src.minIndex), count *
sizeof(State));
475 for (
SizeType j = 0; j < count; j++) {
476 if (s[j].out != kRegexInvalidState)
478 if (s[j].out1 != kRegexInvalidState)
481 *operandStack.template Push<Frag>() = Frag(src.start + count, src.out + count, src.minIndex + count);
482 stateCount_ += count;
485 template <
typename InputStream>
486 bool ParseUnsigned(DecodedStream<InputStream>& ds,
unsigned* u) {
488 if (ds.Peek() <
'0' || ds.Peek() >
'9')
490 while (ds.Peek() >=
'0' && ds.Peek() <=
'9') {
491 if (r >= 429496729 && ds.Peek() >
'5')
493 r = r * 10 + (ds.Take() -
'0');
499 template <
typename InputStream>
500 bool ParseRange(DecodedStream<InputStream>& ds,
SizeType* range) {
504 SizeType start = kRegexInvalidRange;
505 SizeType current = kRegexInvalidRange;
507 while ((codepoint = ds.Take()) != 0) {
510 if (codepoint ==
'^') {
518 if (start == kRegexInvalidRange)
523 GetRange(current).next = r;
526 GetRange(start).start |= kRangeNegationFlag;
531 if (ds.Peek() ==
'b') {
535 else if (!CharacterEscape(ds, &codepoint))
542 if (codepoint ==
'-') {
551 if (current != kRegexInvalidRange)
552 GetRange(current).next = r;
553 if (start == kRegexInvalidRange)
562 GetRange(current).end = codepoint;
570 SizeType NewRange(
unsigned codepoint) {
571 Range* r = ranges_.template Push<Range>();
572 r->start = r->end = codepoint;
573 r->next = kRegexInvalidRange;
574 return rangeCount_++;
577 template <
typename InputStream>
578 bool CharacterEscape(DecodedStream<InputStream>& ds,
unsigned* escapedCodepoint) {
580 switch (codepoint = ds.Take()) {
595 *escapedCodepoint = codepoint;
return true;
596 case 'f': *escapedCodepoint = 0x000C;
return true;
597 case 'n': *escapedCodepoint = 0x000A;
return true;
598 case 'r': *escapedCodepoint = 0x000D;
return true;
599 case 't': *escapedCodepoint = 0x0009;
return true;
600 case 'v': *escapedCodepoint = 0x000B;
return true;
606 template <
typename InputStream>
607 bool SearchWithAnchoring(InputStream& is,
bool anchorBegin,
bool anchorEnd)
const {
609 DecodedStream<InputStream> ds(is);
613 const size_t stateSetSize = GetStateSetSize();
614 std::memset(stateSet_, 0, stateSetSize);
616 bool matched = AddState(*current, root_);
618 while (!current->Empty() && (codepoint = ds.Take()) != 0) {
619 std::memset(stateSet_, 0, stateSetSize);
622 for (
const SizeType* s = current->template Bottom<SizeType>(); s != current->template End<SizeType>(); ++s) {
623 const State& sr = GetState(*s);
624 if (sr.codepoint == codepoint ||
625 sr.codepoint == kAnyCharacterClass ||
626 (sr.codepoint == kRangeCharacterClass && MatchRange(sr.rangeStart, codepoint)))
628 matched = AddState(*next, sr.out) || matched;
629 if (!anchorEnd && matched)
633 AddState(*next, root_);
635 internal::Swap(current, next);
641 size_t GetStateSetSize()
const {
642 return (stateCount_ + 31) / 32 * 4;
649 const State& s = GetState(index);
650 if (s.out1 != kRegexInvalidState) {
651 bool matched = AddState(l, s.out);
652 return AddState(l, s.out1) || matched;
654 else if (!(stateSet_[index >> 5] & (1 << (index & 31)))) {
655 stateSet_[index >> 5] |= (1 << (index & 31));
656 *l.template PushUnsafe<SizeType>() = index;
658 return s.out == kRegexInvalidState;
661 bool MatchRange(
SizeType rangeIndex,
unsigned codepoint)
const {
662 bool yes = (GetRange(rangeIndex).start & kRangeNegationFlag) == 0;
663 while (rangeIndex != kRegexInvalidRange) {
664 const Range& r = GetRange(rangeIndex);
665 if (codepoint >= (r.start & ~kRangeNegationFlag) && codepoint <= r.end)
678 static const unsigned kInfinityQuantifier = ~0u;
Regular expression engine with subset of ECMAscript grammar.
Definition: regex.h:85
A type-unsafe stack for storing different types of data.
Definition: stack.h:36
Concept for allocating, resizing and freeing memory block.
#define RAPIDJSON_ASSERT(x)
Assertion.
Definition: rapidjson.h:402
#define RAPIDJSON_NAMESPACE_BEGIN
provide custom rapidjson namespace (opening expression)
Definition: rapidjson.h:116
#define RAPIDJSON_NAMESPACE_END
provide custom rapidjson namespace (closing expression)
Definition: rapidjson.h:119
RAPIDJSON_NAMESPACE_BEGIN typedef unsigned SizeType
Size type (for string lengths, array sizes, etc.)
Definition: rapidjson.h:380
Read-only string stream.
Definition: stream.h:110