Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code | Sign in
(83)

Side by Side Diff: re2/parse.cc

Issue 1174041: code review 1174041: re2: memory diet (Closed)
Patch Set: code review 1174041: re2: memory diet Created 14 years, 11 months ago
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments. Please Sign in to add in-line comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | re2/re2.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2006 The RE2 Authors. All Rights Reserved. 1 // Copyright 2006 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style 2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file. 3 // license that can be found in the LICENSE file.
4 4
5 // Regular expression parser. 5 // Regular expression parser.
6 6
7 // The parser is a simple precedence-based parser with a 7 // The parser is a simple precedence-based parser with a
8 // manual stack. The parsing work is done by the methods 8 // manual stack. The parsing work is done by the methods
9 // of the ParseState class. The Regexp::Parse function is 9 // of the ParseState class. The Regexp::Parse function is
10 // essentially just a lexer that calls the ParseState method 10 // essentially just a lexer that calls the ParseState method
(...skipping 84 matching lines...) Expand 10 before | Expand all | Expand 10 after
95 bool DoLeftParenNoCapture(); 95 bool DoLeftParenNoCapture();
96 96
97 // Processes a vertical bar in the input. 97 // Processes a vertical bar in the input.
98 bool DoVerticalBar(); 98 bool DoVerticalBar();
99 99
100 // Processes a right parenthesis in the input. 100 // Processes a right parenthesis in the input.
101 bool DoRightParen(); 101 bool DoRightParen();
102 102
103 // Processes the end of input, returning the final regexp. 103 // Processes the end of input, returning the final regexp.
104 Regexp* DoFinish(); 104 Regexp* DoFinish();
105 ··
106 // Finishes the regexp if necessary, preparing it for use
107 // in a more complicated expression.
108 // If it is a CharClassBuilder, converts into a CharClass.
109 Regexp* FinishRegexp(Regexp*);
105 110
106 // These routines don't manipulate the parse stack 111 // These routines don't manipulate the parse stack
107 // directly, but they do need to look at flags_. 112 // directly, but they do need to look at flags_.
108 // ParseCharClass also manipulates the internals of Regexp 113 // ParseCharClass also manipulates the internals of Regexp
109 // while creating *out_re. 114 // while creating *out_re.
110 115
111 // Parse a character class into *out_re. 116 // Parse a character class into *out_re.
112 // Removes parsed text from s. 117 // Removes parsed text from s.
113 bool ParseCharClass(StringPiece* s, Regexp** out_re, 118 bool ParseCharClass(StringPiece* s, Regexp** out_re,
114 RegexpStatus* status); 119 RegexpStatus* status);
(...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after
167 // Cleans up by freeing all the regexps on the stack. 172 // Cleans up by freeing all the regexps on the stack.
168 Regexp::ParseState::~ParseState() { 173 Regexp::ParseState::~ParseState() {
169 Regexp* next; 174 Regexp* next;
170 for (Regexp* re = stacktop_; re != NULL; re = next) { 175 for (Regexp* re = stacktop_; re != NULL; re = next) {
171 next = re->down_; 176 next = re->down_;
172 re->down_ = NULL; 177 re->down_ = NULL;
173 re->Decref(); 178 re->Decref();
174 } 179 }
175 } 180 }
176 181
182 // Finishes the regexp if necessary, preparing it for use in
183 // a more complex expression.
184 // If it is a CharClassBuilder, converts into a CharClass.
185 Regexp* Regexp::ParseState::FinishRegexp(Regexp* re) {
186 if (re == NULL)
187 return NULL;
188 re->down_ = NULL;
189
190 if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) {
191 CharClassBuilder* ccb = re->ccb_;
192 re->ccb_ = NULL;
193 re->cc_ = ccb->GetCharClass();
194 delete ccb;
195 }
196
197 return re;
198 }
199
177 // Pushes the given regular expression onto the stack. 200 // Pushes the given regular expression onto the stack.
178 // Could check for too much memory used here. 201 // Could check for too much memory used here.
179 bool Regexp::ParseState::PushRegexp(Regexp* re) { 202 bool Regexp::ParseState::PushRegexp(Regexp* re) {
180 MaybeConcatString(-1, NoParseFlags); 203 MaybeConcatString(-1, NoParseFlags);
181 204
182 // Special case: a character class of one character is just 205 // Special case: a character class of one character is just
183 // a literal. This is a common idiom for escaping 206 // a literal. This is a common idiom for escaping
184 // single characters (e.g., [.] instead of \.), and some 207 // single characters (e.g., [.] instead of \.), and some
185 // analysis does better with fewer character classes. 208 // analysis does better with fewer character classes.
186 // Similarly, [Aa] can be rewritten as a literal A with ASCII case folding. 209 // Similarly, [Aa] can be rewritten as a literal A with ASCII case folding.
187 if (re->op_ == kRegexpCharClass) { 210 if (re->op_ == kRegexpCharClass) {
188 if (re->cc_->size() == 1) { 211 if (re->ccb_->size() == 1) {
189 Rune r = re->cc_->begin()->lo; 212 Rune r = re->ccb_->begin()->lo;
190 re->Decref(); 213 re->Decref();
191 re = new Regexp(kRegexpLiteral, flags_); 214 re = new Regexp(kRegexpLiteral, flags_);
192 re->rune_ = r; 215 re->rune_ = r;
193 } else if (re->cc_->size() == 2) { 216 } else if (re->ccb_->size() == 2) {
194 Rune r = re->cc_->begin()->lo; 217 Rune r = re->ccb_->begin()->lo;
195 if ('A' <= r && r <= 'Z' && re->cc_->Contains(r + 'a' - 'A')) { 218 if ('A' <= r && r <= 'Z' && re->ccb_->Contains(r + 'a' - 'A')) {
196 re->Decref(); 219 re->Decref();
197 re = new Regexp(kRegexpLiteral, flags_ | FoldCase); 220 re = new Regexp(kRegexpLiteral, flags_ | FoldCase);
198 re->rune_ = r + 'a' - 'A'; 221 re->rune_ = r + 'a' - 'A';
199 } 222 }
200 } 223 }
201 } 224 }
202 225
203 if (!IsMarker(re->op())) 226 if (!IsMarker(re->op()))
204 re->simple_ = re->ComputeSimple(); 227 re->simple_ = re->ComputeSimple();
205 re->down_ = stacktop_; 228 re->down_ = stacktop_;
(...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after
272 Rune CycleFoldRune(Rune r) { 295 Rune CycleFoldRune(Rune r) {
273 CaseFold* f = LookupCaseFold(r); 296 CaseFold* f = LookupCaseFold(r);
274 if (f == NULL || r < f->lo) 297 if (f == NULL || r < f->lo)
275 return r; 298 return r;
276 return ApplyFold(f, r); 299 return ApplyFold(f, r);
277 } 300 }
278 301
279 // Add lo-hi to the class, along with their fold-equivalent characters. 302 // Add lo-hi to the class, along with their fold-equivalent characters.
280 // If lo-hi is already in the class, assume that the fold-equivalent 303 // If lo-hi is already in the class, assume that the fold-equivalent
281 // chars are there too, so there's no work to do. 304 // chars are there too, so there's no work to do.
282 static void AddFoldedRange(CharClass* cc, Rune lo, Rune hi, int depth) { 305 static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) {
283 // AddFoldedRange calls itself recursively for each rune in the fold cycle. 306 // AddFoldedRange calls itself recursively for each rune in the fold cycle.
284 // Most folding cycles are small: there aren't any bigger than four in the 307 // Most folding cycles are small: there aren't any bigger than four in the
285 // current Unicode tables. make_unicode_casefold.py checks that 308 // current Unicode tables. make_unicode_casefold.py checks that
286 // the cycles are not too long, and we double-check here using depth. 309 // the cycles are not too long, and we double-check here using depth.
287 if (depth > 10) { 310 if (depth > 10) {
288 LOG(DFATAL) << "AddFoldedRange recurses too much."; 311 LOG(DFATAL) << "AddFoldedRange recurses too much.";
289 return; 312 return;
290 } 313 }
291 314
292 if (!cc->AddRange(lo, hi)) // lo-hi was already there? we're done 315 if (!cc->AddRange(lo, hi)) // lo-hi was already there? we're done
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after
328 // Pick up where this fold left off. 351 // Pick up where this fold left off.
329 lo = f->hi + 1; 352 lo = f->hi + 1;
330 } 353 }
331 } 354 }
332 355
333 // Pushes the literal rune r onto the stack. 356 // Pushes the literal rune r onto the stack.
334 bool Regexp::ParseState::PushLiteral(Rune r) { 357 bool Regexp::ParseState::PushLiteral(Rune r) {
335 // Do case folding if needed. 358 // Do case folding if needed.
336 if ((flags_ & FoldCase) && CycleFoldRune(r) != r) { 359 if ((flags_ & FoldCase) && CycleFoldRune(r) != r) {
337 Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); 360 Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
361 re->ccb_ = new CharClassBuilder;
338 Rune r1 = r; 362 Rune r1 = r;
339 do { 363 do {
340 if (!(flags_ & NeverNL) || r != '\n') { 364 if (!(flags_ & NeverNL) || r != '\n') {
341 re->cc()->AddRange(r, r); 365 re->ccb_->AddRange(r, r);
342 } 366 }
343 r = CycleFoldRune(r); 367 r = CycleFoldRune(r);
344 } while (r != r1); 368 } while (r != r1);
345 re->cc()->RemoveAbove(rune_max_); 369 re->ccb_->RemoveAbove(rune_max_);
346 return PushRegexp(re); 370 return PushRegexp(re);
347 } 371 }
348 372
349 // Exclude newline if applicable. 373 // Exclude newline if applicable.
350 if ((flags_ & NeverNL) && r == '\n') 374 if ((flags_ & NeverNL) && r == '\n')
351 return PushRegexp(new Regexp(kRegexpNoMatch, flags_)); 375 return PushRegexp(new Regexp(kRegexpNoMatch, flags_));
352 376
353 // No fancy stuff worked. Ordinary literal. 377 // No fancy stuff worked. Ordinary literal.
354 if (MaybeConcatString(r, flags_)) 378 if (MaybeConcatString(r, flags_))
355 return true; 379 return true;
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after
387 } 411 }
388 return PushSimpleOp(kRegexpEndLine); 412 return PushSimpleOp(kRegexpEndLine);
389 } 413 }
390 414
391 // Pushes a . onto the stack. 415 // Pushes a . onto the stack.
392 bool Regexp::ParseState::PushDot() { 416 bool Regexp::ParseState::PushDot() {
393 if ((flags_ & DotNL) && !(flags_ & NeverNL)) 417 if ((flags_ & DotNL) && !(flags_ & NeverNL))
394 return PushSimpleOp(kRegexpAnyChar); 418 return PushSimpleOp(kRegexpAnyChar);
395 // Rewrite . into [^\n] 419 // Rewrite . into [^\n]
396 Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); 420 Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
397 re->cc_->AddRange(0, '\n' - 1); 421 re->ccb_ = new CharClassBuilder;
398 re->cc_->AddRange('\n' + 1, rune_max_); 422 re->ccb_->AddRange(0, '\n' - 1);
423 re->ccb_->AddRange('\n' + 1, rune_max_);
399 return PushRegexp(re); 424 return PushRegexp(re);
400 } 425 }
401 426
402 // Pushes a regexp with the given op (and no args) onto the stack. 427 // Pushes a regexp with the given op (and no args) onto the stack.
403 bool Regexp::ParseState::PushSimpleOp(RegexpOp op) { 428 bool Regexp::ParseState::PushSimpleOp(RegexpOp op) {
404 Regexp* re = new Regexp(op, flags_); 429 Regexp* re = new Regexp(op, flags_);
405 return PushRegexp(re); 430 return PushRegexp(re);
406 } 431 }
407 432
408 // Pushes a repeat operator regexp onto the stack. 433 // Pushes a repeat operator regexp onto the stack.
409 // A valid argument for the operator must already be on the stack. 434 // A valid argument for the operator must already be on the stack.
410 // The char c is the name of the operator, for use in error messages. 435 // The char c is the name of the operator, for use in error messages.
411 bool Regexp::ParseState::PushRepeatOp(RegexpOp op, const StringPiece& s, 436 bool Regexp::ParseState::PushRepeatOp(RegexpOp op, const StringPiece& s,
412 bool nongreedy) { 437 bool nongreedy) {
413 if (stacktop_ == NULL || IsMarker(stacktop_->op())) { 438 if (stacktop_ == NULL || IsMarker(stacktop_->op())) {
414 status_->set_code(kRegexpRepeatArgument); 439 status_->set_code(kRegexpRepeatArgument);
415 status_->set_error_arg(s); 440 status_->set_error_arg(s);
416 return false; 441 return false;
417 } 442 }
418 Regexp::ParseFlags fl = flags_; 443 Regexp::ParseFlags fl = flags_;
419 if (nongreedy) 444 if (nongreedy)
420 fl = fl ^ NonGreedy; 445 fl = fl ^ NonGreedy;
421 Regexp* re = new Regexp(op, fl); 446 Regexp* re = new Regexp(op, fl);
422 re->AllocSub(1); 447 re->AllocSub(1);
423 re->sub_[0] = stacktop_; 448 re->down_ = stacktop_->down_;
449 re->sub()[0] = FinishRegexp(stacktop_);
424 re->simple_ = re->ComputeSimple(); 450 re->simple_ = re->ComputeSimple();
425 re->down_ = stacktop_->down_;
426 stacktop_ = re; 451 stacktop_ = re;
427 return true; 452 return true;
428 } 453 }
429 454
430 // Pushes a repetition regexp onto the stack. 455 // Pushes a repetition regexp onto the stack.
431 // A valid argument for the operator must already be on the stack. 456 // A valid argument for the operator must already be on the stack.
432 bool Regexp::ParseState::PushRepetition(int min, int max, 457 bool Regexp::ParseState::PushRepetition(int min, int max,
433 const StringPiece& s, 458 const StringPiece& s,
434 bool nongreedy) { 459 bool nongreedy) {
435 if ((max != -1 && max < min) || max > 1000) { 460 if ((max != -1 && max < min) || max > 1000) {
436 status_->set_code(kRegexpRepeatSize); 461 status_->set_code(kRegexpRepeatSize);
437 status_->set_error_arg(s); 462 status_->set_error_arg(s);
438 return false; 463 return false;
439 } 464 }
440 if (stacktop_ == NULL || IsMarker(stacktop_->op())) { 465 if (stacktop_ == NULL || IsMarker(stacktop_->op())) {
441 status_->set_code(kRegexpRepeatArgument); 466 status_->set_code(kRegexpRepeatArgument);
442 status_->set_error_arg(s); 467 status_->set_error_arg(s);
443 return false; 468 return false;
444 } 469 }
445 Regexp::ParseFlags fl = flags_; 470 Regexp::ParseFlags fl = flags_;
446 if (nongreedy) 471 if (nongreedy)
447 fl = fl ^ NonGreedy; 472 fl = fl ^ NonGreedy;
448 Regexp* re = new Regexp(kRegexpRepeat, fl); 473 Regexp* re = new Regexp(kRegexpRepeat, fl);
449 re->min_ = min; 474 re->min_ = min;
450 re->max_ = max; 475 re->max_ = max;
451 re->AllocSub(1); 476 re->AllocSub(1);
452 re->sub_[0] = stacktop_; 477 re->down_ = stacktop_->down_;
478 re->sub()[0] = FinishRegexp(stacktop_);
453 re->simple_ = re->ComputeSimple(); 479 re->simple_ = re->ComputeSimple();
454 480
455 re->down_ = stacktop_->down_;
456 stacktop_ = re; 481 stacktop_ = re;
457 return true; 482 return true;
458 } 483 }
459 484
460 // Pseudo-operators marking left paren and pipe operators 485 // Pseudo-operators - only on parse stack.
461 // in the regexp stack. 486
487 // Markers for left paren and pipe operators in the regexp stack.
462 const RegexpOp kLeftParen = static_cast<RegexpOp>(kMaxRegexpOp+1); 488 const RegexpOp kLeftParen = static_cast<RegexpOp>(kMaxRegexpOp+1);
463 const RegexpOp kVerticalBar = static_cast<RegexpOp>(kMaxRegexpOp+2); 489 const RegexpOp kVerticalBar = static_cast<RegexpOp>(kMaxRegexpOp+2);
464 490
465 // Checks whether a particular regexp op is a marker. 491 // Checks whether a particular regexp op is a marker.
466 bool Regexp::ParseState::IsMarker(RegexpOp op) { 492 bool Regexp::ParseState::IsMarker(RegexpOp op) {
467 return op >= kLeftParen; 493 return op >= kLeftParen;
468 } 494 }
469 495
470 // Processes a left parenthesis in the input. 496 // Processes a left parenthesis in the input.
471 // Pushes a marker onto the stack. 497 // Pushes a marker onto the stack.
472 bool Regexp::ParseState::DoLeftParen(const StringPiece& name) { 498 bool Regexp::ParseState::DoLeftParen(const StringPiece& name) {
473 Regexp* re = new Regexp(kLeftParen, flags_); 499 Regexp* re = new Regexp(kLeftParen, flags_);
474 re->cap_ = ++ncap_; 500 re->cap_ = ++ncap_;
475 if (name.data() != NULL) 501 if (name.data() != NULL)
476 re->name_ = new string(name.as_string()); 502 re->name_ = new string(name.as_string());
477 return PushRegexp(re); 503 return PushRegexp(re);
478 } 504 }
479 505
480 // Pushes a non-capturing marker onto the stack. 506 // Pushes a non-capturing marker onto the stack.
481 bool Regexp::ParseState::DoLeftParenNoCapture() { 507 bool Regexp::ParseState::DoLeftParenNoCapture() {
482 Regexp* re = new Regexp(kLeftParen, flags_); 508 Regexp* re = new Regexp(kLeftParen, flags_);
483 re->cap_ = -1; 509 re->cap_ = -1;
484 return PushRegexp(re); 510 return PushRegexp(re);
485 } 511 }
486 512
487 // Adds r to cc, along with r's upper case if foldascii is set. 513 // Adds r to cc, along with r's upper case if foldascii is set.
488 static void AddLiteral(CharClass* cc, Rune r, bool foldascii) { 514 static void AddLiteral(CharClassBuilder* cc, Rune r, bool foldascii) {
489 cc->AddRange(r, r); 515 cc->AddRange(r, r);
490 if (foldascii && 'a' <= r && r <= 'z') 516 if (foldascii && 'a' <= r && r <= 'z')
491 cc->AddRange(r + 'A' - 'a', r + 'A' - 'a'); 517 cc->AddRange(r + 'A' - 'a', r + 'A' - 'a');
492 } 518 }
493 519
494 // Processes a vertical bar in the input. 520 // Processes a vertical bar in the input.
495 bool Regexp::ParseState::DoVerticalBar() { 521 bool Regexp::ParseState::DoVerticalBar() {
496 MaybeConcatString(-1, NoParseFlags); 522 MaybeConcatString(-1, NoParseFlags);
497 DoConcatenation(); 523 DoConcatenation();
498 524
499 // Below the vertical bar is a list to alternate. 525 // Below the vertical bar is a list to alternate.
500 // Above the vertical bar is a list to concatenate. 526 // Above the vertical bar is a list to concatenate.
501 // We just did the concatenation, so either swap 527 // We just did the concatenation, so either swap
502 // the result below the vertical bar or push a new 528 // the result below the vertical bar or push a new
503 // vertical bar on the stack. 529 // vertical bar on the stack.
504 Regexp* r1; 530 Regexp* r1;
505 Regexp* r2; 531 Regexp* r2;
506 if ((r1 = stacktop_) != NULL && 532 if ((r1 = stacktop_) != NULL &&
507 (r2 = stacktop_->down_) != NULL && 533 (r2 = stacktop_->down_) != NULL &&
508 r2->op() == kVerticalBar) { 534 r2->op() == kVerticalBar) {
509 // If above and below vertical bar are literal or char class, 535 // If above and below vertical bar are literal or char class,
510 // can merge into a single char class. 536 // can merge into a single char class.
511 Regexp* r3; 537 Regexp* r3;
512 if ((r1->op() == kRegexpLiteral || 538 if ((r1->op() == kRegexpLiteral ||
513 r1->op() == kRegexpCharClass || 539 r1->op() == kRegexpCharClass ||
514 r1->op() == kRegexpAnyChar) && 540 r1->op() == kRegexpAnyChar) &&
515 (r3 = r2->down_) != NULL) { 541 (r3 = r2->down_) != NULL) {
542 Rune rune;
516 switch (r3->op()) { 543 switch (r3->op()) {
517 case kRegexpLiteral: // convert to char class 544 case kRegexpLiteral: // convert to char class
545 rune = r3->rune_;
518 r3->op_ = kRegexpCharClass; 546 r3->op_ = kRegexpCharClass;
519 r3->cc_ = new CharClass; 547 r3->cc_ = NULL;
520 AddLiteral(r3->cc_, r3->rune_, r3->parse_flags_ & Regexp::FoldCase); 548 r3->ccb_ = new CharClassBuilder;
549 AddLiteral(r3->ccb_, rune, r3->parse_flags_ & Regexp::FoldCase);
521 // fall through 550 // fall through
522 case kRegexpCharClass: 551 case kRegexpCharClass:
523 if (r1->op() == kRegexpLiteral) 552 if (r1->op() == kRegexpLiteral)
524 AddLiteral(r3->cc_, r1->rune_, r1->parse_flags_ & Regexp::FoldCase); 553 AddLiteral(r3->ccb_, r1->rune_, r1->parse_flags_ & Regexp::FoldCase) ;
525 else if (r1->op() == kRegexpCharClass) 554 else if (r1->op() == kRegexpCharClass)
526 r3->cc_->AddCharClass(r1->cc()); 555 r3->ccb_->AddCharClass(r1->ccb_);
527 if (r1->op() == kRegexpAnyChar || r3->cc_->full()) { 556 if (r1->op() == kRegexpAnyChar || r3->ccb_->full()) {
557 delete r3->ccb_;
558 r3->ccb_ = NULL;
528 r3->op_ = kRegexpAnyChar; 559 r3->op_ = kRegexpAnyChar;
529 delete r3->cc_;
530 r3->cc_ = NULL;
531 } 560 }
532 // fall through 561 // fall through
533 case kRegexpAnyChar: 562 case kRegexpAnyChar:
534 // pop r1 563 // pop r1
535 stacktop_ = r2; 564 stacktop_ = r2;
536 r1->Decref(); 565 r1->Decref();
537 return true; 566 return true;
538 default: 567 default:
539 break; 568 break;
540 } 569 }
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after
572 601
573 // Restore flags from when paren opened. 602 // Restore flags from when paren opened.
574 Regexp* re = r2; 603 Regexp* re = r2;
575 flags_ = re->parse_flags(); 604 flags_ = re->parse_flags();
576 605
577 // Rewrite LeftParen as capture if needed. 606 // Rewrite LeftParen as capture if needed.
578 if (re->cap_ > 0) { 607 if (re->cap_ > 0) {
579 re->op_ = kRegexpCapture; 608 re->op_ = kRegexpCapture;
580 // re->cap_ is already set 609 // re->cap_ is already set
581 re->AllocSub(1); 610 re->AllocSub(1);
582 re->sub_[0] = r1; 611 re->sub()[0] = FinishRegexp(r1);
583 re->simple_ = re->ComputeSimple(); 612 re->simple_ = re->ComputeSimple();
584 } else { 613 } else {
585 re->Decref(); 614 re->Decref();
586 re = r1; 615 re = r1;
587 } 616 }
588 return PushRegexp(re); 617 return PushRegexp(re);
589 } 618 }
590 619
591 // Processes the end of input, returning the final regexp. 620 // Processes the end of input, returning the final regexp.
592 Regexp* Regexp::ParseState::DoFinish() { 621 Regexp* Regexp::ParseState::DoFinish() {
593 DoAlternation(); 622 DoAlternation();
594 Regexp* re = stacktop_; 623 Regexp* re = stacktop_;
595 if (re != NULL && re->down_ != NULL) { 624 if (re != NULL && re->down_ != NULL) {
596 status_->set_code(kRegexpMissingParen); 625 status_->set_code(kRegexpMissingParen);
597 status_->set_error_arg(whole_regexp_); 626 status_->set_error_arg(whole_regexp_);
598 return NULL; 627 return NULL;
599 } 628 }
600 stacktop_ = NULL; 629 stacktop_ = NULL;
601 return re; 630 return FinishRegexp(re);
602 } 631 }
603 632
604 // Collapse the regexps on top of the stack, down to the 633 // Collapse the regexps on top of the stack, down to the
605 // first marker, into a new op node (op == kRegexpAlternate 634 // first marker, into a new op node (op == kRegexpAlternate
606 // or op == kRegexpConcat). 635 // or op == kRegexpConcat).
607 void Regexp::ParseState::DoCollapse(RegexpOp op) { 636 void Regexp::ParseState::DoCollapse(RegexpOp op) {
608 // Scan backward to marker, counting children of composite. 637 // Scan backward to marker, counting children of composite.
609 int n = 0; 638 int n = 0;
610 Regexp* next = NULL; 639 Regexp* next = NULL;
611 Regexp* sub; 640 Regexp* sub;
612 for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) { 641 for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) {
613 next = sub->down_; 642 next = sub->down_;
614 if (sub->op_ == op) 643 if (sub->op_ == op)
615 n += sub->nsub_; 644 n += sub->nsub_;
616 else 645 else
617 n++; 646 n++;
618 } 647 }
619 648
620 // If there's just one child, leave it alone. 649 // If there's just one child, leave it alone.
621 // (Concat of one thing is that one thing; alternate of one thing is same.) 650 // (Concat of one thing is that one thing; alternate of one thing is same.)
622 if (stacktop_ != NULL && stacktop_->down_ == next) 651 if (stacktop_ != NULL && stacktop_->down_ == next)
623 return; 652 return;
624 653
625 // Construct op (alternation or concatenation), flattening op of op. 654 // Construct op (alternation or concatenation), flattening op of op.
626 Regexp* re = new Regexp(op, flags_); 655 Regexp** subs = new Regexp*[n];
627 re->AllocSub(n);
628 next = NULL; 656 next = NULL;
657 int i = n;
629 for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) { 658 for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) {
630 next = sub->down_; // in case we sub->Decref(). 659 next = sub->down_;
631 if (sub->op_ == op) { 660 if (sub->op_ == op) {
661 Regexp** sub_subs = sub->sub();
632 for (int k = sub->nsub_ - 1; k >= 0; k--) 662 for (int k = sub->nsub_ - 1; k >= 0; k--)
633 re->sub_[--n] = sub->sub_[k]->Incref(); 663 subs[--i] = sub_subs[k]->Incref();
634 sub->Decref(); 664 sub->Decref();
635 } else { 665 } else {
636 re->sub_[--n] = sub; 666 subs[--i] = FinishRegexp(sub);
637 } 667 }
638 } 668 }
669 ··
670 Regexp* re = ConcatOrAlternate(op, subs, n, flags_);
671 delete[] subs;
639 re->simple_ = re->ComputeSimple(); 672 re->simple_ = re->ComputeSimple();
640 re->down_ = next; 673 re->down_ = next;
641 stacktop_ = re; 674 stacktop_ = re;
642 } 675 }
643 676
644 // Finishes the current concatenation, 677 // Finishes the current concatenation,
645 // collapsing it into a single regexp on the stack. 678 // collapsing it into a single regexp on the stack.
646 void Regexp::ParseState::DoConcatenation() { 679 void Regexp::ParseState::DoConcatenation() {
647 Regexp* r1 = stacktop_; 680 Regexp* r1 = stacktop_;
648 if (r1 == NULL || IsMarker(r1->op())) { 681 if (r1 == NULL || IsMarker(r1->op())) {
649 // empty concatenation is special case 682 // empty concatenation is special case
650 Regexp* re = new Regexp(kRegexpEmptyMatch, flags_); 683 Regexp* re = new Regexp(kRegexpEmptyMatch, flags_);
651 PushRegexp(re); 684 PushRegexp(re);
652 } 685 }
653 DoCollapse(kRegexpConcat); 686 DoCollapse(kRegexpConcat);
654 } 687 }
655 688
656 // Finishes the current alternation, 689 // Finishes the current alternation,
657 // collapsing it to a single regexp on the stack. 690 // collapsing it to a single regexp on the stack.
658 void Regexp::ParseState::DoAlternation() { 691 void Regexp::ParseState::DoAlternation() {
659 DoVerticalBar(); 692 DoVerticalBar();
660 // Now stack top is kVerticalBar. 693 // Now stack top is kVerticalBar.
661 Regexp* r1 = stacktop_; 694 Regexp* r1 = stacktop_;
662 stacktop_ = r1->down_; 695 stacktop_ = r1->down_;
663 r1->Decref(); 696 r1->Decref();
664 DoCollapse(kRegexpAlternate); 697 DoCollapse(kRegexpAlternate);
665 } 698 }
666 699
667 // Incremental conversion of concatenated literals into strings. 700 // Incremental conversion of concatenated literals into strings.
668 // If top of stack is literal, literal or literal, string or string, literal 701 // If top of stack is (literal, literal) or (literal, string) or (string, litera l)
669 // or string, string, collapse into single string. 702 // or (string, string), collapse into single string.
670 // Don't walk down the stack -- the parser calls this frequently 703 // Don't walk down the stack -- the parser calls this frequently
671 // enough that below the bottom two is known to be collapsed. 704 // enough that below the bottom two is known to be collapsed.
672 // Only called when another regexp is about to be pushed 705 // Only called when another regexp is about to be pushed
673 // on the stack, so that the topmost literal is not being considered. 706 // on the stack, so that the topmost literal is not being considered.
674 // (Otherwise ab* would turn into (ab)*.) 707 // (Otherwise ab* would turn into (ab)*.)
675 // If r >= 0, consider pushing a literal r on the stack. 708 // If r >= 0, consider pushing a literal r on the stack.
676 // Return whether that happened. 709 // Return whether that happened.
677 bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) { 710 bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) {
678 Regexp* re1; 711 Regexp* re1;
679 Regexp* re2; 712 Regexp* re2;
680 if ((re1 = stacktop_) == NULL || (re2 = re1->down_) == NULL) 713 if ((re1 = stacktop_) == NULL || (re2 = re1->down_) == NULL)
681 return false; 714 return false;
682 715
683 if (re1->op_ != kRegexpLiteral && re1->op_ != kRegexpLiteralString) 716 if (re1->op_ != kRegexpLiteral && re1->op_ != kRegexpLiteralString)
684 return false; 717 return false;
685 if (re2->op_ != kRegexpLiteral && re2->op_ != kRegexpLiteralString) 718 if (re2->op_ != kRegexpLiteral && re2->op_ != kRegexpLiteralString)
686 return false; 719 return false;
687 if ((re1->parse_flags_ & FoldCase) != (re2->parse_flags_ & FoldCase)) 720 if ((re1->parse_flags_ & FoldCase) != (re2->parse_flags_ & FoldCase))
688 return false; 721 return false;
689 722
690 if (re2->op_ == kRegexpLiteral) { 723 if (re2->op_ == kRegexpLiteral) {
691 // convert into string 724 // convert into string
725 Rune rune = re2->rune_;
692 re2->op_ = kRegexpLiteralString; 726 re2->op_ = kRegexpLiteralString;
693 re2->AddRuneToString(re2->rune_); 727 re2->nrunes_ = 0;
728 re2->runes_ = NULL;
729 re2->AddRuneToString(rune);
694 } 730 }
695 731
696 // push re1 into re2. 732 // push re1 into re2.
697 if (re1->op_ == kRegexpLiteral) { 733 if (re1->op_ == kRegexpLiteral) {
698 re2->AddRuneToString(re1->rune_); 734 re2->AddRuneToString(re1->rune_);
699 } else { 735 } else {
700 for (int i = 0; i < re1->nrunes_; i++) 736 for (int i = 0; i < re1->nrunes_; i++)
701 re2->AddRuneToString(re1->runes_[i]); 737 re2->AddRuneToString(re1->runes_[i]);
702 re1->nrunes_ = 0; 738 re1->nrunes_ = 0;
703 delete[] re1->runes_; 739 delete[] re1->runes_;
(...skipping 272 matching lines...) Expand 10 before | Expand all | Expand 10 after
976 1012
977 BadEscape: 1013 BadEscape:
978 // Unrecognized escape sequence. 1014 // Unrecognized escape sequence.
979 status->set_code(kRegexpBadEscape); 1015 status->set_code(kRegexpBadEscape);
980 status->set_error_arg(StringPiece(begin, s->data() - begin)); 1016 status->set_error_arg(StringPiece(begin, s->data() - begin));
981 return false; 1017 return false;
982 } 1018 }
983 1019
984 // Add a range to the character class, but exclude newline if asked. 1020 // Add a range to the character class, but exclude newline if asked.
985 // Also handle case folding. 1021 // Also handle case folding.
986 static void AddRange(CharClass* cc, Rune lo, Rune hi, 1022 static void AddRange(CharClassBuilder* cc, Rune lo, Rune hi,
987 Regexp::ParseFlags parse_flags) { 1023 Regexp::ParseFlags parse_flags) {
988 1024
989 // Take out \n if the flags say so. 1025 // Take out \n if the flags say so.
990 bool cutnl = !(parse_flags & Regexp::ClassNL) || 1026 bool cutnl = !(parse_flags & Regexp::ClassNL) ||
991 (parse_flags & Regexp::NeverNL); 1027 (parse_flags & Regexp::NeverNL);
992 if (cutnl && lo <= '\n' && '\n' <= hi) { 1028 if (cutnl && lo <= '\n' && '\n' <= hi) {
993 if (lo < '\n') 1029 if (lo < '\n')
994 AddRange(cc, lo, '\n' - 1, parse_flags); 1030 AddRange(cc, lo, '\n' - 1, parse_flags);
995 if (hi > '\n') 1031 if (hi > '\n')
996 AddRange(cc, '\n' + 1, hi, parse_flags); 1032 AddRange(cc, '\n' + 1, hi, parse_flags);
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after
1030 1066
1031 // Look for a Unicode group with the given name (e.g., "Han") 1067 // Look for a Unicode group with the given name (e.g., "Han")
1032 static UGroup* LookupUnicodeGroup(const StringPiece& name) { 1068 static UGroup* LookupUnicodeGroup(const StringPiece& name) {
1033 // Special case: "Any" means any. 1069 // Special case: "Any" means any.
1034 if (name == "Any") 1070 if (name == "Any")
1035 return &anygroup; 1071 return &anygroup;
1036 return LookupGroup(name, unicode_groups, num_unicode_groups); 1072 return LookupGroup(name, unicode_groups, num_unicode_groups);
1037 } 1073 }
1038 1074
1039 // Add a UGroup to the character class. 1075 // Add a UGroup to the character class.
1040 static void AddUGroup(CharClass *cc, UGroup *g, 1076 static void AddUGroup(CharClassBuilder *cc, UGroup *g,
1041 Regexp::ParseFlags parse_flags) { 1077 Regexp::ParseFlags parse_flags) {
1042 for (int i = 0; i < g->nr16; i++) { 1078 for (int i = 0; i < g->nr16; i++) {
1043 AddRange(cc, g->r16[i].lo, g->r16[i].hi, parse_flags); 1079 AddRange(cc, g->r16[i].lo, g->r16[i].hi, parse_flags);
1044 } 1080 }
1045 for (int i = 0; i < g->nr32; i++) { 1081 for (int i = 0; i < g->nr32; i++) {
1046 AddRange(cc, g->r32[i].lo, g->r32[i].hi, parse_flags); 1082 AddRange(cc, g->r32[i].lo, g->r32[i].hi, parse_flags);
1047 } 1083 }
1048 } 1084 }
1049 1085
1050 // Add the negation of a UGroup to the character class. 1086 // Add the negation of a UGroup to the character class.
1051 static void AddNegatedUGroup(CharClass* cc, UGroup *g, 1087 static void AddNegatedUGroup(CharClassBuilder* cc, UGroup *g,
1052 Regexp::ParseFlags parse_flags) { 1088 Regexp::ParseFlags parse_flags) {
1053 int next = 0; 1089 int next = 0;
1054 for (int i = 0; i < g->nr16; i++) { 1090 for (int i = 0; i < g->nr16; i++) {
1055 if (next < g->r16[i].lo) 1091 if (next < g->r16[i].lo)
1056 AddRange(cc, next, g->r16[i].lo - 1, parse_flags); 1092 AddRange(cc, next, g->r16[i].lo - 1, parse_flags);
1057 next = g->r16[i].hi + 1; 1093 next = g->r16[i].hi + 1;
1058 } 1094 }
1059 for (int i = 0; i < g->nr32; i++) { 1095 for (int i = 0; i < g->nr32; i++) {
1060 if (next < g->r32[i].lo) 1096 if (next < g->r32[i].lo)
1061 AddRange(cc, next, g->r32[i].lo - 1, parse_flags); 1097 AddRange(cc, next, g->r32[i].lo - 1, parse_flags);
(...skipping 26 matching lines...) Expand all
1088 1124
1089 enum ParseStatus { 1125 enum ParseStatus {
1090 kParseOk, // Did some parsing. 1126 kParseOk, // Did some parsing.
1091 kParseError, // Found an error. 1127 kParseError, // Found an error.
1092 kParseNothing, // Decided not to parse. 1128 kParseNothing, // Decided not to parse.
1093 }; 1129 };
1094 1130
1095 // Maybe parses a Unicode character group like \p{Han} or \P{Han} 1131 // Maybe parses a Unicode character group like \p{Han} or \P{Han}
1096 // (the latter is a negated group). 1132 // (the latter is a negated group).
1097 ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags, 1133 ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags,
1098 CharClass *cc, 1134 CharClassBuilder *cc,
1099 RegexpStatus* status) { 1135 RegexpStatus* status) {
1100 // Decide whether to parse. 1136 // Decide whether to parse.
1101 if (!(parse_flags & Regexp::UnicodeGroups)) 1137 if (!(parse_flags & Regexp::UnicodeGroups))
1102 return kParseNothing; 1138 return kParseNothing;
1103 if (s->size() < 2 || (*s)[0] != '\\') 1139 if (s->size() < 2 || (*s)[0] != '\\')
1104 return kParseNothing; 1140 return kParseNothing;
1105 Rune c = (*s)[1]; 1141 Rune c = (*s)[1];
1106 if (c != 'p' && c != 'P') 1142 if (c != 'p' && c != 'P')
1107 return kParseNothing; 1143 return kParseNothing;
1108 1144
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after
1153 AddUGroup(cc, g, parse_flags); 1189 AddUGroup(cc, g, parse_flags);
1154 else 1190 else
1155 AddNegatedUGroup(cc, g, parse_flags); 1191 AddNegatedUGroup(cc, g, parse_flags);
1156 return kParseOk; 1192 return kParseOk;
1157 } 1193 }
1158 1194
1159 // Parses a character class name like [:alnum:]. 1195 // Parses a character class name like [:alnum:].
1160 // Sets *s to span the remainder of the string. 1196 // Sets *s to span the remainder of the string.
1161 // Adds the ranges corresponding to the class to ranges. 1197 // Adds the ranges corresponding to the class to ranges.
1162 static ParseStatus ParseCCName(StringPiece* s, Regexp::ParseFlags parse_flags, 1198 static ParseStatus ParseCCName(StringPiece* s, Regexp::ParseFlags parse_flags,
1163 CharClass *cc, 1199 CharClassBuilder *cc,
1164 RegexpStatus* status) { 1200 RegexpStatus* status) {
1165 // Check begins with [: 1201 // Check begins with [:
1166 const char* p = s->data(); 1202 const char* p = s->data();
1167 const char* ep = s->data() + s->size(); 1203 const char* ep = s->data() + s->size();
1168 if (ep - p < 2 || p[0] != '[' || p[1] != ':') 1204 if (ep - p < 2 || p[0] != '[' || p[1] != ':')
1169 return kParseNothing; 1205 return kParseNothing;
1170 1206
1171 // Look for closing :]. 1207 // Look for closing :].
1172 const char* q; 1208 const char* q;
1173 for (q = p+2; q <= ep-2 && (*q != ':' || *(q+1) != ']'); q++) 1209 for (q = p+2; q <= ep-2 && (*q != ':' || *(q+1) != ']'); q++)
(...skipping 76 matching lines...) Expand 10 before | Expand all | Expand 10 after
1250 RegexpStatus* status) { 1286 RegexpStatus* status) {
1251 StringPiece whole_class = *s; 1287 StringPiece whole_class = *s;
1252 if (s->size() == 0 || (*s)[0] != '[') { 1288 if (s->size() == 0 || (*s)[0] != '[') {
1253 // Caller checked this. 1289 // Caller checked this.
1254 status->set_code(kRegexpInternalError); 1290 status->set_code(kRegexpInternalError);
1255 status->set_error_arg(NULL); 1291 status->set_error_arg(NULL);
1256 return false; 1292 return false;
1257 } 1293 }
1258 bool negated = false; 1294 bool negated = false;
1259 Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); 1295 Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
1296 re->ccb_ = new CharClassBuilder;
1260 s->remove_prefix(1); // '[' 1297 s->remove_prefix(1); // '['
1261 if (s->size() > 0 && (*s)[0] == '^') { 1298 if (s->size() > 0 && (*s)[0] == '^') {
1262 s->remove_prefix(1); // '^' 1299 s->remove_prefix(1); // '^'
1263 negated = true; 1300 negated = true;
1264 if (!(flags_ & ClassNL) || (flags_ & NeverNL)) { 1301 if (!(flags_ & ClassNL) || (flags_ & NeverNL)) {
1265 // If NL can't match implicitly, then pretend 1302 // If NL can't match implicitly, then pretend
1266 // negated classes include a leading \n. 1303 // negated classes include a leading \n.
1267 re->cc_->AddRange('\n', '\n'); 1304 re->ccb_->AddRange('\n', '\n');
1268 } 1305 }
1269 } 1306 }
1270 bool first = true; // ] is okay as first char in class 1307 bool first = true; // ] is okay as first char in class
1271 while (s->size() > 0 && ((*s)[0] != ']' || first)) { 1308 while (s->size() > 0 && ((*s)[0] != ']' || first)) {
1272 // - is only okay unescaped as first or last in class. 1309 // - is only okay unescaped as first or last in class.
1273 // Except that Perl allows - anywhere. 1310 // Except that Perl allows - anywhere.
1274 if ((*s)[0] == '-' && !first && !(flags_&PerlX) && 1311 if ((*s)[0] == '-' && !first && !(flags_&PerlX) &&
1275 (s->size() == 1 || (*s)[1] != ']')) { 1312 (s->size() == 1 || (*s)[1] != ']')) {
1276 StringPiece t = *s; 1313 StringPiece t = *s;
1277 t.remove_prefix(1); // '-' 1314 t.remove_prefix(1); // '-'
1278 Rune r; 1315 Rune r;
1279 int n = StringPieceToRune(&r, &t, status); 1316 int n = StringPieceToRune(&r, &t, status);
1280 if (n < 0) { 1317 if (n < 0) {
1281 re->Decref(); 1318 re->Decref();
1282 return false; 1319 return false;
1283 } 1320 }
1284 status->set_code(kRegexpBadCharRange); 1321 status->set_code(kRegexpBadCharRange);
1285 status->set_error_arg(StringPiece(s->data(), 1+n)); 1322 status->set_error_arg(StringPiece(s->data(), 1+n));
1286 re->Decref(); 1323 re->Decref();
1287 return false; 1324 return false;
1288 } 1325 }
1289 first = false; 1326 first = false;
1290 1327
1291 // Look for [:alnum:] etc. 1328 // Look for [:alnum:] etc.
1292 if (s->size() > 2 && (*s)[0] == '[' && (*s)[1] == ':') { 1329 if (s->size() > 2 && (*s)[0] == '[' && (*s)[1] == ':') {
1293 switch (ParseCCName(s, flags_, re->cc_, status)) { 1330 switch (ParseCCName(s, flags_, re->ccb_, status)) {
1294 case kParseOk: 1331 case kParseOk:
1295 continue; 1332 continue;
1296 case kParseError: 1333 case kParseError:
1297 re->Decref(); 1334 re->Decref();
1298 return false; 1335 return false;
1299 case kParseNothing: 1336 case kParseNothing:
1300 break; 1337 break;
1301 } 1338 }
1302 } 1339 }
1303 1340
1304 // Look for Unicode character group like \p{Han} 1341 // Look for Unicode character group like \p{Han}
1305 if (s->size() > 2 && 1342 if (s->size() > 2 &&
1306 (*s)[0] == '\\' && 1343 (*s)[0] == '\\' &&
1307 ((*s)[1] == 'p' || (*s)[1] == 'P')) { 1344 ((*s)[1] == 'p' || (*s)[1] == 'P')) {
1308 switch (ParseUnicodeGroup(s, flags_, re->cc_, status)) { 1345 switch (ParseUnicodeGroup(s, flags_, re->ccb_, status)) {
1309 case kParseOk: 1346 case kParseOk:
1310 continue; 1347 continue;
1311 case kParseError: 1348 case kParseError:
1312 re->Decref(); 1349 re->Decref();
1313 return false; 1350 return false;
1314 case kParseNothing: 1351 case kParseNothing:
1315 break; 1352 break;
1316 } 1353 }
1317 } 1354 }
1318 1355
1319 // Look for Perl character class symbols (extension). 1356 // Look for Perl character class symbols (extension).
1320 UGroup *g = MaybeParsePerlCCEscape(s, flags_); 1357 UGroup *g = MaybeParsePerlCCEscape(s, flags_);
1321 if (g != NULL) { 1358 if (g != NULL) {
1322 AddUGroup(re->cc_, g, flags_); 1359 AddUGroup(re->ccb_, g, flags_);
1323 continue; 1360 continue;
1324 } 1361 }
1325 1362
1326 // Otherwise assume single character or simple range. 1363 // Otherwise assume single character or simple range.
1327 RuneRange rr; 1364 RuneRange rr;
1328 if (!ParseCCRange(s, &rr, whole_class, status)) { 1365 if (!ParseCCRange(s, &rr, whole_class, status)) {
1329 re->Decref(); 1366 re->Decref();
1330 return false; 1367 return false;
1331 } 1368 }
1332 // AddRange is usually called in response to a class like 1369 // AddRange is usually called in response to a class like
1333 // \p{Foo} or [[:foo:]]; for those, it filters \n out unless 1370 // \p{Foo} or [[:foo:]]; for those, it filters \n out unless
1334 // Regexp::ClassNL is set. In an explicit range or singleton 1371 // Regexp::ClassNL is set. In an explicit range or singleton
1335 // like we just parsed, we do not filter \n out, so set ClassNL 1372 // like we just parsed, we do not filter \n out, so set ClassNL
1336 // in the flags. 1373 // in the flags.
1337 AddRange(re->cc_, rr.lo, rr.hi, flags_ | Regexp::ClassNL); 1374 AddRange(re->ccb_, rr.lo, rr.hi, flags_ | Regexp::ClassNL);
1338 } 1375 }
1339 if (s->size() == 0) { 1376 if (s->size() == 0) {
1340 status->set_code(kRegexpMissingBracket); 1377 status->set_code(kRegexpMissingBracket);
1341 status->set_error_arg(whole_class); 1378 status->set_error_arg(whole_class);
1342 re->Decref(); 1379 re->Decref();
1343 return false; 1380 return false;
1344 } 1381 }
1345 s->remove_prefix(1); // ']' 1382 s->remove_prefix(1); // ']'
1346 1383
1347 if (negated) 1384 if (negated)
1348 re->cc_->Negate(); 1385 re->ccb_->Negate();
1349 re->cc_->RemoveAbove(rune_max_); 1386 re->ccb_->RemoveAbove(rune_max_);
1350 1387
1351 *out_re = re; 1388 *out_re = re;
1352 return true; 1389 return true;
1353 } 1390 }
1354 1391
1355 // Is this a valid capture name? [A-Za-z0-9_]+ 1392 // Is this a valid capture name? [A-Za-z0-9_]+
1356 // PCRE limits names to 32 bytes. 1393 // PCRE limits names to 32 bytes.
1357 // Python rejects names starting with digits. 1394 // Python rejects names starting with digits.
1358 // We don't enforce either of those. 1395 // We don't enforce either of those.
1359 static bool IsValidCaptureName(const StringPiece& name) { 1396 static bool IsValidCaptureName(const StringPiece& name) {
(...skipping 385 matching lines...) Expand 10 before | Expand all | Expand 10 after
1745 return NULL; 1782 return NULL;
1746 if (!ps.PushLiteral(r)) 1783 if (!ps.PushLiteral(r))
1747 return NULL; 1784 return NULL;
1748 } 1785 }
1749 break; 1786 break;
1750 } 1787 }
1751 } 1788 }
1752 1789
1753 if (t[1] == 'p' || t[1] == 'P') { 1790 if (t[1] == 'p' || t[1] == 'P') {
1754 Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase); 1791 Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase);
1755 switch (ParseUnicodeGroup(&t, ps.flags(), re->cc_, status)) { 1792 re->ccb_ = new CharClassBuilder;
1793 switch (ParseUnicodeGroup(&t, ps.flags(), re->ccb_, status)) {
1756 case kParseOk: 1794 case kParseOk:
1757 if (!ps.PushRegexp(re)) 1795 if (!ps.PushRegexp(re))
1758 return NULL; 1796 return NULL;
1759 goto Break2; 1797 goto Break2;
1760 case kParseError: 1798 case kParseError:
1761 re->Decref(); 1799 re->Decref();
1762 return NULL; 1800 return NULL;
1763 case kParseNothing: 1801 case kParseNothing:
1764 re->Decref(); 1802 re->Decref();
1765 break; 1803 break;
1766 } 1804 }
1767 } 1805 }
1768 1806
1769 UGroup *g = MaybeParsePerlCCEscape(&t, ps.flags()); 1807 UGroup *g = MaybeParsePerlCCEscape(&t, ps.flags());
1770 if (g != NULL) { 1808 if (g != NULL) {
1771 Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase); 1809 Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase);
1772 AddUGroup(re->cc_, g, ps.flags()); 1810 re->ccb_ = new CharClassBuilder;
1811 AddUGroup(re->ccb_, g, ps.flags());
1773 if (!ps.PushRegexp(re)) 1812 if (!ps.PushRegexp(re))
1774 return NULL; 1813 return NULL;
1775 break; 1814 break;
1776 } 1815 }
1777 1816
1778 Rune r; 1817 Rune r;
1779 if (!ParseEscape(&t, &r, status, ps.rune_max())) 1818 if (!ParseEscape(&t, &r, status, ps.rune_max()))
1780 return NULL; 1819 return NULL;
1781 if (!ps.PushLiteral(r)) 1820 if (!ps.PushLiteral(r))
1782 return NULL; 1821 return NULL;
1783 break; 1822 break;
1784 } 1823 }
1785 } 1824 }
1786 Break2: 1825 Break2:
1787 lastunary = isunary; 1826 lastunary = isunary;
1788 } 1827 }
1789 return ps.DoFinish(); 1828 return ps.DoFinish();
1790 } 1829 }
1791 1830
1792 } // namespace re2 1831 } // namespace re2
OLDNEW
« no previous file with comments | « no previous file | re2/re2.h » ('j') | no next file with comments »

Powered by Google App Engine
RSS Feeds Recent Issues | This issue
This is Rietveld f62528b