OLD | NEW |
1 // Copyright 2006 The RE2 Authors. All Rights Reserved. | 1 // Copyright 2006 The RE2 Authors. All Rights Reserved. |
2 // Use of this source code is governed by a BSD-style | 2 // Use of this source code is governed by a BSD-style |
3 // license that can be found in the LICENSE file. | 3 // license that can be found in the LICENSE file. |
4 | 4 |
5 // Regular expression parser. | 5 // Regular expression parser. |
6 | 6 |
7 // The parser is a simple precedence-based parser with a | 7 // The parser is a simple precedence-based parser with a |
8 // manual stack. The parsing work is done by the methods | 8 // manual stack. The parsing work is done by the methods |
9 // of the ParseState class. The Regexp::Parse function is | 9 // of the ParseState class. The Regexp::Parse function is |
10 // essentially just a lexer that calls the ParseState method | 10 // essentially just a lexer that calls the ParseState method |
(...skipping 84 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
95 bool DoLeftParenNoCapture(); | 95 bool DoLeftParenNoCapture(); |
96 | 96 |
97 // Processes a vertical bar in the input. | 97 // Processes a vertical bar in the input. |
98 bool DoVerticalBar(); | 98 bool DoVerticalBar(); |
99 | 99 |
100 // Processes a right parenthesis in the input. | 100 // Processes a right parenthesis in the input. |
101 bool DoRightParen(); | 101 bool DoRightParen(); |
102 | 102 |
103 // Processes the end of input, returning the final regexp. | 103 // Processes the end of input, returning the final regexp. |
104 Regexp* DoFinish(); | 104 Regexp* DoFinish(); |
| 105 ·· |
| 106 // Finishes the regexp if necessary, preparing it for use |
| 107 // in a more complicated expression. |
| 108 // If it is a CharClassBuilder, converts into a CharClass. |
| 109 Regexp* FinishRegexp(Regexp*); |
105 | 110 |
106 // These routines don't manipulate the parse stack | 111 // These routines don't manipulate the parse stack |
107 // directly, but they do need to look at flags_. | 112 // directly, but they do need to look at flags_. |
108 // ParseCharClass also manipulates the internals of Regexp | 113 // ParseCharClass also manipulates the internals of Regexp |
109 // while creating *out_re. | 114 // while creating *out_re. |
110 | 115 |
111 // Parse a character class into *out_re. | 116 // Parse a character class into *out_re. |
112 // Removes parsed text from s. | 117 // Removes parsed text from s. |
113 bool ParseCharClass(StringPiece* s, Regexp** out_re, | 118 bool ParseCharClass(StringPiece* s, Regexp** out_re, |
114 RegexpStatus* status); | 119 RegexpStatus* status); |
(...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
167 // Cleans up by freeing all the regexps on the stack. | 172 // Cleans up by freeing all the regexps on the stack. |
168 Regexp::ParseState::~ParseState() { | 173 Regexp::ParseState::~ParseState() { |
169 Regexp* next; | 174 Regexp* next; |
170 for (Regexp* re = stacktop_; re != NULL; re = next) { | 175 for (Regexp* re = stacktop_; re != NULL; re = next) { |
171 next = re->down_; | 176 next = re->down_; |
172 re->down_ = NULL; | 177 re->down_ = NULL; |
173 re->Decref(); | 178 re->Decref(); |
174 } | 179 } |
175 } | 180 } |
176 | 181 |
| 182 // Finishes the regexp if necessary, preparing it for use in |
| 183 // a more complex expression. |
| 184 // If it is a CharClassBuilder, converts into a CharClass. |
| 185 Regexp* Regexp::ParseState::FinishRegexp(Regexp* re) { |
| 186 if (re == NULL) |
| 187 return NULL; |
| 188 re->down_ = NULL; |
| 189 |
| 190 if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) { |
| 191 CharClassBuilder* ccb = re->ccb_; |
| 192 re->ccb_ = NULL; |
| 193 re->cc_ = ccb->GetCharClass(); |
| 194 delete ccb; |
| 195 } |
| 196 |
| 197 return re; |
| 198 } |
| 199 |
177 // Pushes the given regular expression onto the stack. | 200 // Pushes the given regular expression onto the stack. |
178 // Could check for too much memory used here. | 201 // Could check for too much memory used here. |
179 bool Regexp::ParseState::PushRegexp(Regexp* re) { | 202 bool Regexp::ParseState::PushRegexp(Regexp* re) { |
180 MaybeConcatString(-1, NoParseFlags); | 203 MaybeConcatString(-1, NoParseFlags); |
181 | 204 |
182 // Special case: a character class of one character is just | 205 // Special case: a character class of one character is just |
183 // a literal. This is a common idiom for escaping | 206 // a literal. This is a common idiom for escaping |
184 // single characters (e.g., [.] instead of \.), and some | 207 // single characters (e.g., [.] instead of \.), and some |
185 // analysis does better with fewer character classes. | 208 // analysis does better with fewer character classes. |
186 // Similarly, [Aa] can be rewritten as a literal A with ASCII case folding. | 209 // Similarly, [Aa] can be rewritten as a literal A with ASCII case folding. |
187 if (re->op_ == kRegexpCharClass) { | 210 if (re->op_ == kRegexpCharClass) { |
188 if (re->cc_->size() == 1) { | 211 if (re->ccb_->size() == 1) { |
189 Rune r = re->cc_->begin()->lo; | 212 Rune r = re->ccb_->begin()->lo; |
190 re->Decref(); | 213 re->Decref(); |
191 re = new Regexp(kRegexpLiteral, flags_); | 214 re = new Regexp(kRegexpLiteral, flags_); |
192 re->rune_ = r; | 215 re->rune_ = r; |
193 } else if (re->cc_->size() == 2) { | 216 } else if (re->ccb_->size() == 2) { |
194 Rune r = re->cc_->begin()->lo; | 217 Rune r = re->ccb_->begin()->lo; |
195 if ('A' <= r && r <= 'Z' && re->cc_->Contains(r + 'a' - 'A')) { | 218 if ('A' <= r && r <= 'Z' && re->ccb_->Contains(r + 'a' - 'A')) { |
196 re->Decref(); | 219 re->Decref(); |
197 re = new Regexp(kRegexpLiteral, flags_ | FoldCase); | 220 re = new Regexp(kRegexpLiteral, flags_ | FoldCase); |
198 re->rune_ = r + 'a' - 'A'; | 221 re->rune_ = r + 'a' - 'A'; |
199 } | 222 } |
200 } | 223 } |
201 } | 224 } |
202 | 225 |
203 if (!IsMarker(re->op())) | 226 if (!IsMarker(re->op())) |
204 re->simple_ = re->ComputeSimple(); | 227 re->simple_ = re->ComputeSimple(); |
205 re->down_ = stacktop_; | 228 re->down_ = stacktop_; |
(...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
272 Rune CycleFoldRune(Rune r) { | 295 Rune CycleFoldRune(Rune r) { |
273 CaseFold* f = LookupCaseFold(r); | 296 CaseFold* f = LookupCaseFold(r); |
274 if (f == NULL || r < f->lo) | 297 if (f == NULL || r < f->lo) |
275 return r; | 298 return r; |
276 return ApplyFold(f, r); | 299 return ApplyFold(f, r); |
277 } | 300 } |
278 | 301 |
279 // Add lo-hi to the class, along with their fold-equivalent characters. | 302 // Add lo-hi to the class, along with their fold-equivalent characters. |
280 // If lo-hi is already in the class, assume that the fold-equivalent | 303 // If lo-hi is already in the class, assume that the fold-equivalent |
281 // chars are there too, so there's no work to do. | 304 // chars are there too, so there's no work to do. |
282 static void AddFoldedRange(CharClass* cc, Rune lo, Rune hi, int depth) { | 305 static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) { |
283 // AddFoldedRange calls itself recursively for each rune in the fold cycle. | 306 // AddFoldedRange calls itself recursively for each rune in the fold cycle. |
284 // Most folding cycles are small: there aren't any bigger than four in the | 307 // Most folding cycles are small: there aren't any bigger than four in the |
285 // current Unicode tables. make_unicode_casefold.py checks that | 308 // current Unicode tables. make_unicode_casefold.py checks that |
286 // the cycles are not too long, and we double-check here using depth. | 309 // the cycles are not too long, and we double-check here using depth. |
287 if (depth > 10) { | 310 if (depth > 10) { |
288 LOG(DFATAL) << "AddFoldedRange recurses too much."; | 311 LOG(DFATAL) << "AddFoldedRange recurses too much."; |
289 return; | 312 return; |
290 } | 313 } |
291 | 314 |
292 if (!cc->AddRange(lo, hi)) // lo-hi was already there? we're done | 315 if (!cc->AddRange(lo, hi)) // lo-hi was already there? we're done |
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
328 // Pick up where this fold left off. | 351 // Pick up where this fold left off. |
329 lo = f->hi + 1; | 352 lo = f->hi + 1; |
330 } | 353 } |
331 } | 354 } |
332 | 355 |
333 // Pushes the literal rune r onto the stack. | 356 // Pushes the literal rune r onto the stack. |
334 bool Regexp::ParseState::PushLiteral(Rune r) { | 357 bool Regexp::ParseState::PushLiteral(Rune r) { |
335 // Do case folding if needed. | 358 // Do case folding if needed. |
336 if ((flags_ & FoldCase) && CycleFoldRune(r) != r) { | 359 if ((flags_ & FoldCase) && CycleFoldRune(r) != r) { |
337 Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); | 360 Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); |
| 361 re->ccb_ = new CharClassBuilder; |
338 Rune r1 = r; | 362 Rune r1 = r; |
339 do { | 363 do { |
340 if (!(flags_ & NeverNL) || r != '\n') { | 364 if (!(flags_ & NeverNL) || r != '\n') { |
341 re->cc()->AddRange(r, r); | 365 re->ccb_->AddRange(r, r); |
342 } | 366 } |
343 r = CycleFoldRune(r); | 367 r = CycleFoldRune(r); |
344 } while (r != r1); | 368 } while (r != r1); |
345 re->cc()->RemoveAbove(rune_max_); | 369 re->ccb_->RemoveAbove(rune_max_); |
346 return PushRegexp(re); | 370 return PushRegexp(re); |
347 } | 371 } |
348 | 372 |
349 // Exclude newline if applicable. | 373 // Exclude newline if applicable. |
350 if ((flags_ & NeverNL) && r == '\n') | 374 if ((flags_ & NeverNL) && r == '\n') |
351 return PushRegexp(new Regexp(kRegexpNoMatch, flags_)); | 375 return PushRegexp(new Regexp(kRegexpNoMatch, flags_)); |
352 | 376 |
353 // No fancy stuff worked. Ordinary literal. | 377 // No fancy stuff worked. Ordinary literal. |
354 if (MaybeConcatString(r, flags_)) | 378 if (MaybeConcatString(r, flags_)) |
355 return true; | 379 return true; |
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
387 } | 411 } |
388 return PushSimpleOp(kRegexpEndLine); | 412 return PushSimpleOp(kRegexpEndLine); |
389 } | 413 } |
390 | 414 |
391 // Pushes a . onto the stack. | 415 // Pushes a . onto the stack. |
392 bool Regexp::ParseState::PushDot() { | 416 bool Regexp::ParseState::PushDot() { |
393 if ((flags_ & DotNL) && !(flags_ & NeverNL)) | 417 if ((flags_ & DotNL) && !(flags_ & NeverNL)) |
394 return PushSimpleOp(kRegexpAnyChar); | 418 return PushSimpleOp(kRegexpAnyChar); |
395 // Rewrite . into [^\n] | 419 // Rewrite . into [^\n] |
396 Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); | 420 Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); |
397 re->cc_->AddRange(0, '\n' - 1); | 421 re->ccb_ = new CharClassBuilder; |
398 re->cc_->AddRange('\n' + 1, rune_max_); | 422 re->ccb_->AddRange(0, '\n' - 1); |
| 423 re->ccb_->AddRange('\n' + 1, rune_max_); |
399 return PushRegexp(re); | 424 return PushRegexp(re); |
400 } | 425 } |
401 | 426 |
402 // Pushes a regexp with the given op (and no args) onto the stack. | 427 // Pushes a regexp with the given op (and no args) onto the stack. |
403 bool Regexp::ParseState::PushSimpleOp(RegexpOp op) { | 428 bool Regexp::ParseState::PushSimpleOp(RegexpOp op) { |
404 Regexp* re = new Regexp(op, flags_); | 429 Regexp* re = new Regexp(op, flags_); |
405 return PushRegexp(re); | 430 return PushRegexp(re); |
406 } | 431 } |
407 | 432 |
408 // Pushes a repeat operator regexp onto the stack. | 433 // Pushes a repeat operator regexp onto the stack. |
409 // A valid argument for the operator must already be on the stack. | 434 // A valid argument for the operator must already be on the stack. |
410 // The char c is the name of the operator, for use in error messages. | 435 // The char c is the name of the operator, for use in error messages. |
411 bool Regexp::ParseState::PushRepeatOp(RegexpOp op, const StringPiece& s, | 436 bool Regexp::ParseState::PushRepeatOp(RegexpOp op, const StringPiece& s, |
412 bool nongreedy) { | 437 bool nongreedy) { |
413 if (stacktop_ == NULL || IsMarker(stacktop_->op())) { | 438 if (stacktop_ == NULL || IsMarker(stacktop_->op())) { |
414 status_->set_code(kRegexpRepeatArgument); | 439 status_->set_code(kRegexpRepeatArgument); |
415 status_->set_error_arg(s); | 440 status_->set_error_arg(s); |
416 return false; | 441 return false; |
417 } | 442 } |
418 Regexp::ParseFlags fl = flags_; | 443 Regexp::ParseFlags fl = flags_; |
419 if (nongreedy) | 444 if (nongreedy) |
420 fl = fl ^ NonGreedy; | 445 fl = fl ^ NonGreedy; |
421 Regexp* re = new Regexp(op, fl); | 446 Regexp* re = new Regexp(op, fl); |
422 re->AllocSub(1); | 447 re->AllocSub(1); |
423 re->sub_[0] = stacktop_; | 448 re->down_ = stacktop_->down_; |
| 449 re->sub()[0] = FinishRegexp(stacktop_); |
424 re->simple_ = re->ComputeSimple(); | 450 re->simple_ = re->ComputeSimple(); |
425 re->down_ = stacktop_->down_; | |
426 stacktop_ = re; | 451 stacktop_ = re; |
427 return true; | 452 return true; |
428 } | 453 } |
429 | 454 |
430 // Pushes a repetition regexp onto the stack. | 455 // Pushes a repetition regexp onto the stack. |
431 // A valid argument for the operator must already be on the stack. | 456 // A valid argument for the operator must already be on the stack. |
432 bool Regexp::ParseState::PushRepetition(int min, int max, | 457 bool Regexp::ParseState::PushRepetition(int min, int max, |
433 const StringPiece& s, | 458 const StringPiece& s, |
434 bool nongreedy) { | 459 bool nongreedy) { |
435 if ((max != -1 && max < min) || max > 1000) { | 460 if ((max != -1 && max < min) || max > 1000) { |
436 status_->set_code(kRegexpRepeatSize); | 461 status_->set_code(kRegexpRepeatSize); |
437 status_->set_error_arg(s); | 462 status_->set_error_arg(s); |
438 return false; | 463 return false; |
439 } | 464 } |
440 if (stacktop_ == NULL || IsMarker(stacktop_->op())) { | 465 if (stacktop_ == NULL || IsMarker(stacktop_->op())) { |
441 status_->set_code(kRegexpRepeatArgument); | 466 status_->set_code(kRegexpRepeatArgument); |
442 status_->set_error_arg(s); | 467 status_->set_error_arg(s); |
443 return false; | 468 return false; |
444 } | 469 } |
445 Regexp::ParseFlags fl = flags_; | 470 Regexp::ParseFlags fl = flags_; |
446 if (nongreedy) | 471 if (nongreedy) |
447 fl = fl ^ NonGreedy; | 472 fl = fl ^ NonGreedy; |
448 Regexp* re = new Regexp(kRegexpRepeat, fl); | 473 Regexp* re = new Regexp(kRegexpRepeat, fl); |
449 re->min_ = min; | 474 re->min_ = min; |
450 re->max_ = max; | 475 re->max_ = max; |
451 re->AllocSub(1); | 476 re->AllocSub(1); |
452 re->sub_[0] = stacktop_; | 477 re->down_ = stacktop_->down_; |
| 478 re->sub()[0] = FinishRegexp(stacktop_); |
453 re->simple_ = re->ComputeSimple(); | 479 re->simple_ = re->ComputeSimple(); |
454 | 480 |
455 re->down_ = stacktop_->down_; | |
456 stacktop_ = re; | 481 stacktop_ = re; |
457 return true; | 482 return true; |
458 } | 483 } |
459 | 484 |
460 // Pseudo-operators marking left paren and pipe operators | 485 // Pseudo-operators - only on parse stack. |
461 // in the regexp stack. | 486 |
| 487 // Markers for left paren and pipe operators in the regexp stack. |
462 const RegexpOp kLeftParen = static_cast<RegexpOp>(kMaxRegexpOp+1); | 488 const RegexpOp kLeftParen = static_cast<RegexpOp>(kMaxRegexpOp+1); |
463 const RegexpOp kVerticalBar = static_cast<RegexpOp>(kMaxRegexpOp+2); | 489 const RegexpOp kVerticalBar = static_cast<RegexpOp>(kMaxRegexpOp+2); |
464 | 490 |
465 // Checks whether a particular regexp op is a marker. | 491 // Checks whether a particular regexp op is a marker. |
466 bool Regexp::ParseState::IsMarker(RegexpOp op) { | 492 bool Regexp::ParseState::IsMarker(RegexpOp op) { |
467 return op >= kLeftParen; | 493 return op >= kLeftParen; |
468 } | 494 } |
469 | 495 |
470 // Processes a left parenthesis in the input. | 496 // Processes a left parenthesis in the input. |
471 // Pushes a marker onto the stack. | 497 // Pushes a marker onto the stack. |
472 bool Regexp::ParseState::DoLeftParen(const StringPiece& name) { | 498 bool Regexp::ParseState::DoLeftParen(const StringPiece& name) { |
473 Regexp* re = new Regexp(kLeftParen, flags_); | 499 Regexp* re = new Regexp(kLeftParen, flags_); |
474 re->cap_ = ++ncap_; | 500 re->cap_ = ++ncap_; |
475 if (name.data() != NULL) | 501 if (name.data() != NULL) |
476 re->name_ = new string(name.as_string()); | 502 re->name_ = new string(name.as_string()); |
477 return PushRegexp(re); | 503 return PushRegexp(re); |
478 } | 504 } |
479 | 505 |
480 // Pushes a non-capturing marker onto the stack. | 506 // Pushes a non-capturing marker onto the stack. |
481 bool Regexp::ParseState::DoLeftParenNoCapture() { | 507 bool Regexp::ParseState::DoLeftParenNoCapture() { |
482 Regexp* re = new Regexp(kLeftParen, flags_); | 508 Regexp* re = new Regexp(kLeftParen, flags_); |
483 re->cap_ = -1; | 509 re->cap_ = -1; |
484 return PushRegexp(re); | 510 return PushRegexp(re); |
485 } | 511 } |
486 | 512 |
487 // Adds r to cc, along with r's upper case if foldascii is set. | 513 // Adds r to cc, along with r's upper case if foldascii is set. |
488 static void AddLiteral(CharClass* cc, Rune r, bool foldascii) { | 514 static void AddLiteral(CharClassBuilder* cc, Rune r, bool foldascii) { |
489 cc->AddRange(r, r); | 515 cc->AddRange(r, r); |
490 if (foldascii && 'a' <= r && r <= 'z') | 516 if (foldascii && 'a' <= r && r <= 'z') |
491 cc->AddRange(r + 'A' - 'a', r + 'A' - 'a'); | 517 cc->AddRange(r + 'A' - 'a', r + 'A' - 'a'); |
492 } | 518 } |
493 | 519 |
494 // Processes a vertical bar in the input. | 520 // Processes a vertical bar in the input. |
495 bool Regexp::ParseState::DoVerticalBar() { | 521 bool Regexp::ParseState::DoVerticalBar() { |
496 MaybeConcatString(-1, NoParseFlags); | 522 MaybeConcatString(-1, NoParseFlags); |
497 DoConcatenation(); | 523 DoConcatenation(); |
498 | 524 |
499 // Below the vertical bar is a list to alternate. | 525 // Below the vertical bar is a list to alternate. |
500 // Above the vertical bar is a list to concatenate. | 526 // Above the vertical bar is a list to concatenate. |
501 // We just did the concatenation, so either swap | 527 // We just did the concatenation, so either swap |
502 // the result below the vertical bar or push a new | 528 // the result below the vertical bar or push a new |
503 // vertical bar on the stack. | 529 // vertical bar on the stack. |
504 Regexp* r1; | 530 Regexp* r1; |
505 Regexp* r2; | 531 Regexp* r2; |
506 if ((r1 = stacktop_) != NULL && | 532 if ((r1 = stacktop_) != NULL && |
507 (r2 = stacktop_->down_) != NULL && | 533 (r2 = stacktop_->down_) != NULL && |
508 r2->op() == kVerticalBar) { | 534 r2->op() == kVerticalBar) { |
509 // If above and below vertical bar are literal or char class, | 535 // If above and below vertical bar are literal or char class, |
510 // can merge into a single char class. | 536 // can merge into a single char class. |
511 Regexp* r3; | 537 Regexp* r3; |
512 if ((r1->op() == kRegexpLiteral || | 538 if ((r1->op() == kRegexpLiteral || |
513 r1->op() == kRegexpCharClass || | 539 r1->op() == kRegexpCharClass || |
514 r1->op() == kRegexpAnyChar) && | 540 r1->op() == kRegexpAnyChar) && |
515 (r3 = r2->down_) != NULL) { | 541 (r3 = r2->down_) != NULL) { |
| 542 Rune rune; |
516 switch (r3->op()) { | 543 switch (r3->op()) { |
517 case kRegexpLiteral: // convert to char class | 544 case kRegexpLiteral: // convert to char class |
| 545 rune = r3->rune_; |
518 r3->op_ = kRegexpCharClass; | 546 r3->op_ = kRegexpCharClass; |
519 r3->cc_ = new CharClass; | 547 r3->cc_ = NULL; |
520 AddLiteral(r3->cc_, r3->rune_, r3->parse_flags_ & Regexp::FoldCase); | 548 r3->ccb_ = new CharClassBuilder; |
| 549 AddLiteral(r3->ccb_, rune, r3->parse_flags_ & Regexp::FoldCase); |
521 // fall through | 550 // fall through |
522 case kRegexpCharClass: | 551 case kRegexpCharClass: |
523 if (r1->op() == kRegexpLiteral) | 552 if (r1->op() == kRegexpLiteral) |
524 AddLiteral(r3->cc_, r1->rune_, r1->parse_flags_ & Regexp::FoldCase); | 553 AddLiteral(r3->ccb_, r1->rune_, r1->parse_flags_ & Regexp::FoldCase)
; |
525 else if (r1->op() == kRegexpCharClass) | 554 else if (r1->op() == kRegexpCharClass) |
526 r3->cc_->AddCharClass(r1->cc()); | 555 r3->ccb_->AddCharClass(r1->ccb_); |
527 if (r1->op() == kRegexpAnyChar || r3->cc_->full()) { | 556 if (r1->op() == kRegexpAnyChar || r3->ccb_->full()) { |
| 557 delete r3->ccb_; |
| 558 r3->ccb_ = NULL; |
528 r3->op_ = kRegexpAnyChar; | 559 r3->op_ = kRegexpAnyChar; |
529 delete r3->cc_; | |
530 r3->cc_ = NULL; | |
531 } | 560 } |
532 // fall through | 561 // fall through |
533 case kRegexpAnyChar: | 562 case kRegexpAnyChar: |
534 // pop r1 | 563 // pop r1 |
535 stacktop_ = r2; | 564 stacktop_ = r2; |
536 r1->Decref(); | 565 r1->Decref(); |
537 return true; | 566 return true; |
538 default: | 567 default: |
539 break; | 568 break; |
540 } | 569 } |
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
572 | 601 |
573 // Restore flags from when paren opened. | 602 // Restore flags from when paren opened. |
574 Regexp* re = r2; | 603 Regexp* re = r2; |
575 flags_ = re->parse_flags(); | 604 flags_ = re->parse_flags(); |
576 | 605 |
577 // Rewrite LeftParen as capture if needed. | 606 // Rewrite LeftParen as capture if needed. |
578 if (re->cap_ > 0) { | 607 if (re->cap_ > 0) { |
579 re->op_ = kRegexpCapture; | 608 re->op_ = kRegexpCapture; |
580 // re->cap_ is already set | 609 // re->cap_ is already set |
581 re->AllocSub(1); | 610 re->AllocSub(1); |
582 re->sub_[0] = r1; | 611 re->sub()[0] = FinishRegexp(r1); |
583 re->simple_ = re->ComputeSimple(); | 612 re->simple_ = re->ComputeSimple(); |
584 } else { | 613 } else { |
585 re->Decref(); | 614 re->Decref(); |
586 re = r1; | 615 re = r1; |
587 } | 616 } |
588 return PushRegexp(re); | 617 return PushRegexp(re); |
589 } | 618 } |
590 | 619 |
591 // Processes the end of input, returning the final regexp. | 620 // Processes the end of input, returning the final regexp. |
592 Regexp* Regexp::ParseState::DoFinish() { | 621 Regexp* Regexp::ParseState::DoFinish() { |
593 DoAlternation(); | 622 DoAlternation(); |
594 Regexp* re = stacktop_; | 623 Regexp* re = stacktop_; |
595 if (re != NULL && re->down_ != NULL) { | 624 if (re != NULL && re->down_ != NULL) { |
596 status_->set_code(kRegexpMissingParen); | 625 status_->set_code(kRegexpMissingParen); |
597 status_->set_error_arg(whole_regexp_); | 626 status_->set_error_arg(whole_regexp_); |
598 return NULL; | 627 return NULL; |
599 } | 628 } |
600 stacktop_ = NULL; | 629 stacktop_ = NULL; |
601 return re; | 630 return FinishRegexp(re); |
602 } | 631 } |
603 | 632 |
604 // Collapse the regexps on top of the stack, down to the | 633 // Collapse the regexps on top of the stack, down to the |
605 // first marker, into a new op node (op == kRegexpAlternate | 634 // first marker, into a new op node (op == kRegexpAlternate |
606 // or op == kRegexpConcat). | 635 // or op == kRegexpConcat). |
607 void Regexp::ParseState::DoCollapse(RegexpOp op) { | 636 void Regexp::ParseState::DoCollapse(RegexpOp op) { |
608 // Scan backward to marker, counting children of composite. | 637 // Scan backward to marker, counting children of composite. |
609 int n = 0; | 638 int n = 0; |
610 Regexp* next = NULL; | 639 Regexp* next = NULL; |
611 Regexp* sub; | 640 Regexp* sub; |
612 for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) { | 641 for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) { |
613 next = sub->down_; | 642 next = sub->down_; |
614 if (sub->op_ == op) | 643 if (sub->op_ == op) |
615 n += sub->nsub_; | 644 n += sub->nsub_; |
616 else | 645 else |
617 n++; | 646 n++; |
618 } | 647 } |
619 | 648 |
620 // If there's just one child, leave it alone. | 649 // If there's just one child, leave it alone. |
621 // (Concat of one thing is that one thing; alternate of one thing is same.) | 650 // (Concat of one thing is that one thing; alternate of one thing is same.) |
622 if (stacktop_ != NULL && stacktop_->down_ == next) | 651 if (stacktop_ != NULL && stacktop_->down_ == next) |
623 return; | 652 return; |
624 | 653 |
625 // Construct op (alternation or concatenation), flattening op of op. | 654 // Construct op (alternation or concatenation), flattening op of op. |
626 Regexp* re = new Regexp(op, flags_); | 655 Regexp** subs = new Regexp*[n]; |
627 re->AllocSub(n); | |
628 next = NULL; | 656 next = NULL; |
| 657 int i = n; |
629 for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) { | 658 for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) { |
630 next = sub->down_; // in case we sub->Decref(). | 659 next = sub->down_; |
631 if (sub->op_ == op) { | 660 if (sub->op_ == op) { |
| 661 Regexp** sub_subs = sub->sub(); |
632 for (int k = sub->nsub_ - 1; k >= 0; k--) | 662 for (int k = sub->nsub_ - 1; k >= 0; k--) |
633 re->sub_[--n] = sub->sub_[k]->Incref(); | 663 subs[--i] = sub_subs[k]->Incref(); |
634 sub->Decref(); | 664 sub->Decref(); |
635 } else { | 665 } else { |
636 re->sub_[--n] = sub; | 666 subs[--i] = FinishRegexp(sub); |
637 } | 667 } |
638 } | 668 } |
| 669 ·· |
| 670 Regexp* re = ConcatOrAlternate(op, subs, n, flags_); |
| 671 delete[] subs; |
639 re->simple_ = re->ComputeSimple(); | 672 re->simple_ = re->ComputeSimple(); |
640 re->down_ = next; | 673 re->down_ = next; |
641 stacktop_ = re; | 674 stacktop_ = re; |
642 } | 675 } |
643 | 676 |
644 // Finishes the current concatenation, | 677 // Finishes the current concatenation, |
645 // collapsing it into a single regexp on the stack. | 678 // collapsing it into a single regexp on the stack. |
646 void Regexp::ParseState::DoConcatenation() { | 679 void Regexp::ParseState::DoConcatenation() { |
647 Regexp* r1 = stacktop_; | 680 Regexp* r1 = stacktop_; |
648 if (r1 == NULL || IsMarker(r1->op())) { | 681 if (r1 == NULL || IsMarker(r1->op())) { |
649 // empty concatenation is special case | 682 // empty concatenation is special case |
650 Regexp* re = new Regexp(kRegexpEmptyMatch, flags_); | 683 Regexp* re = new Regexp(kRegexpEmptyMatch, flags_); |
651 PushRegexp(re); | 684 PushRegexp(re); |
652 } | 685 } |
653 DoCollapse(kRegexpConcat); | 686 DoCollapse(kRegexpConcat); |
654 } | 687 } |
655 | 688 |
656 // Finishes the current alternation, | 689 // Finishes the current alternation, |
657 // collapsing it to a single regexp on the stack. | 690 // collapsing it to a single regexp on the stack. |
658 void Regexp::ParseState::DoAlternation() { | 691 void Regexp::ParseState::DoAlternation() { |
659 DoVerticalBar(); | 692 DoVerticalBar(); |
660 // Now stack top is kVerticalBar. | 693 // Now stack top is kVerticalBar. |
661 Regexp* r1 = stacktop_; | 694 Regexp* r1 = stacktop_; |
662 stacktop_ = r1->down_; | 695 stacktop_ = r1->down_; |
663 r1->Decref(); | 696 r1->Decref(); |
664 DoCollapse(kRegexpAlternate); | 697 DoCollapse(kRegexpAlternate); |
665 } | 698 } |
666 | 699 |
667 // Incremental conversion of concatenated literals into strings. | 700 // Incremental conversion of concatenated literals into strings. |
668 // If top of stack is literal, literal or literal, string or string, literal | 701 // If top of stack is (literal, literal) or (literal, string) or (string, litera
l) |
669 // or string, string, collapse into single string. | 702 // or (string, string), collapse into single string. |
670 // Don't walk down the stack -- the parser calls this frequently | 703 // Don't walk down the stack -- the parser calls this frequently |
671 // enough that below the bottom two is known to be collapsed. | 704 // enough that below the bottom two is known to be collapsed. |
672 // Only called when another regexp is about to be pushed | 705 // Only called when another regexp is about to be pushed |
673 // on the stack, so that the topmost literal is not being considered. | 706 // on the stack, so that the topmost literal is not being considered. |
674 // (Otherwise ab* would turn into (ab)*.) | 707 // (Otherwise ab* would turn into (ab)*.) |
675 // If r >= 0, consider pushing a literal r on the stack. | 708 // If r >= 0, consider pushing a literal r on the stack. |
676 // Return whether that happened. | 709 // Return whether that happened. |
677 bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) { | 710 bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) { |
678 Regexp* re1; | 711 Regexp* re1; |
679 Regexp* re2; | 712 Regexp* re2; |
680 if ((re1 = stacktop_) == NULL || (re2 = re1->down_) == NULL) | 713 if ((re1 = stacktop_) == NULL || (re2 = re1->down_) == NULL) |
681 return false; | 714 return false; |
682 | 715 |
683 if (re1->op_ != kRegexpLiteral && re1->op_ != kRegexpLiteralString) | 716 if (re1->op_ != kRegexpLiteral && re1->op_ != kRegexpLiteralString) |
684 return false; | 717 return false; |
685 if (re2->op_ != kRegexpLiteral && re2->op_ != kRegexpLiteralString) | 718 if (re2->op_ != kRegexpLiteral && re2->op_ != kRegexpLiteralString) |
686 return false; | 719 return false; |
687 if ((re1->parse_flags_ & FoldCase) != (re2->parse_flags_ & FoldCase)) | 720 if ((re1->parse_flags_ & FoldCase) != (re2->parse_flags_ & FoldCase)) |
688 return false; | 721 return false; |
689 | 722 |
690 if (re2->op_ == kRegexpLiteral) { | 723 if (re2->op_ == kRegexpLiteral) { |
691 // convert into string | 724 // convert into string |
| 725 Rune rune = re2->rune_; |
692 re2->op_ = kRegexpLiteralString; | 726 re2->op_ = kRegexpLiteralString; |
693 re2->AddRuneToString(re2->rune_); | 727 re2->nrunes_ = 0; |
| 728 re2->runes_ = NULL; |
| 729 re2->AddRuneToString(rune); |
694 } | 730 } |
695 | 731 |
696 // push re1 into re2. | 732 // push re1 into re2. |
697 if (re1->op_ == kRegexpLiteral) { | 733 if (re1->op_ == kRegexpLiteral) { |
698 re2->AddRuneToString(re1->rune_); | 734 re2->AddRuneToString(re1->rune_); |
699 } else { | 735 } else { |
700 for (int i = 0; i < re1->nrunes_; i++) | 736 for (int i = 0; i < re1->nrunes_; i++) |
701 re2->AddRuneToString(re1->runes_[i]); | 737 re2->AddRuneToString(re1->runes_[i]); |
702 re1->nrunes_ = 0; | 738 re1->nrunes_ = 0; |
703 delete[] re1->runes_; | 739 delete[] re1->runes_; |
(...skipping 272 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
976 | 1012 |
977 BadEscape: | 1013 BadEscape: |
978 // Unrecognized escape sequence. | 1014 // Unrecognized escape sequence. |
979 status->set_code(kRegexpBadEscape); | 1015 status->set_code(kRegexpBadEscape); |
980 status->set_error_arg(StringPiece(begin, s->data() - begin)); | 1016 status->set_error_arg(StringPiece(begin, s->data() - begin)); |
981 return false; | 1017 return false; |
982 } | 1018 } |
983 | 1019 |
984 // Add a range to the character class, but exclude newline if asked. | 1020 // Add a range to the character class, but exclude newline if asked. |
985 // Also handle case folding. | 1021 // Also handle case folding. |
986 static void AddRange(CharClass* cc, Rune lo, Rune hi, | 1022 static void AddRange(CharClassBuilder* cc, Rune lo, Rune hi, |
987 Regexp::ParseFlags parse_flags) { | 1023 Regexp::ParseFlags parse_flags) { |
988 | 1024 |
989 // Take out \n if the flags say so. | 1025 // Take out \n if the flags say so. |
990 bool cutnl = !(parse_flags & Regexp::ClassNL) || | 1026 bool cutnl = !(parse_flags & Regexp::ClassNL) || |
991 (parse_flags & Regexp::NeverNL); | 1027 (parse_flags & Regexp::NeverNL); |
992 if (cutnl && lo <= '\n' && '\n' <= hi) { | 1028 if (cutnl && lo <= '\n' && '\n' <= hi) { |
993 if (lo < '\n') | 1029 if (lo < '\n') |
994 AddRange(cc, lo, '\n' - 1, parse_flags); | 1030 AddRange(cc, lo, '\n' - 1, parse_flags); |
995 if (hi > '\n') | 1031 if (hi > '\n') |
996 AddRange(cc, '\n' + 1, hi, parse_flags); | 1032 AddRange(cc, '\n' + 1, hi, parse_flags); |
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1030 | 1066 |
1031 // Look for a Unicode group with the given name (e.g., "Han") | 1067 // Look for a Unicode group with the given name (e.g., "Han") |
1032 static UGroup* LookupUnicodeGroup(const StringPiece& name) { | 1068 static UGroup* LookupUnicodeGroup(const StringPiece& name) { |
1033 // Special case: "Any" means any. | 1069 // Special case: "Any" means any. |
1034 if (name == "Any") | 1070 if (name == "Any") |
1035 return &anygroup; | 1071 return &anygroup; |
1036 return LookupGroup(name, unicode_groups, num_unicode_groups); | 1072 return LookupGroup(name, unicode_groups, num_unicode_groups); |
1037 } | 1073 } |
1038 | 1074 |
1039 // Add a UGroup to the character class. | 1075 // Add a UGroup to the character class. |
1040 static void AddUGroup(CharClass *cc, UGroup *g, | 1076 static void AddUGroup(CharClassBuilder *cc, UGroup *g, |
1041 Regexp::ParseFlags parse_flags) { | 1077 Regexp::ParseFlags parse_flags) { |
1042 for (int i = 0; i < g->nr16; i++) { | 1078 for (int i = 0; i < g->nr16; i++) { |
1043 AddRange(cc, g->r16[i].lo, g->r16[i].hi, parse_flags); | 1079 AddRange(cc, g->r16[i].lo, g->r16[i].hi, parse_flags); |
1044 } | 1080 } |
1045 for (int i = 0; i < g->nr32; i++) { | 1081 for (int i = 0; i < g->nr32; i++) { |
1046 AddRange(cc, g->r32[i].lo, g->r32[i].hi, parse_flags); | 1082 AddRange(cc, g->r32[i].lo, g->r32[i].hi, parse_flags); |
1047 } | 1083 } |
1048 } | 1084 } |
1049 | 1085 |
1050 // Add the negation of a UGroup to the character class. | 1086 // Add the negation of a UGroup to the character class. |
1051 static void AddNegatedUGroup(CharClass* cc, UGroup *g, | 1087 static void AddNegatedUGroup(CharClassBuilder* cc, UGroup *g, |
1052 Regexp::ParseFlags parse_flags) { | 1088 Regexp::ParseFlags parse_flags) { |
1053 int next = 0; | 1089 int next = 0; |
1054 for (int i = 0; i < g->nr16; i++) { | 1090 for (int i = 0; i < g->nr16; i++) { |
1055 if (next < g->r16[i].lo) | 1091 if (next < g->r16[i].lo) |
1056 AddRange(cc, next, g->r16[i].lo - 1, parse_flags); | 1092 AddRange(cc, next, g->r16[i].lo - 1, parse_flags); |
1057 next = g->r16[i].hi + 1; | 1093 next = g->r16[i].hi + 1; |
1058 } | 1094 } |
1059 for (int i = 0; i < g->nr32; i++) { | 1095 for (int i = 0; i < g->nr32; i++) { |
1060 if (next < g->r32[i].lo) | 1096 if (next < g->r32[i].lo) |
1061 AddRange(cc, next, g->r32[i].lo - 1, parse_flags); | 1097 AddRange(cc, next, g->r32[i].lo - 1, parse_flags); |
(...skipping 26 matching lines...) Expand all Loading... |
1088 | 1124 |
1089 enum ParseStatus { | 1125 enum ParseStatus { |
1090 kParseOk, // Did some parsing. | 1126 kParseOk, // Did some parsing. |
1091 kParseError, // Found an error. | 1127 kParseError, // Found an error. |
1092 kParseNothing, // Decided not to parse. | 1128 kParseNothing, // Decided not to parse. |
1093 }; | 1129 }; |
1094 | 1130 |
1095 // Maybe parses a Unicode character group like \p{Han} or \P{Han} | 1131 // Maybe parses a Unicode character group like \p{Han} or \P{Han} |
1096 // (the latter is a negated group). | 1132 // (the latter is a negated group). |
1097 ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags, | 1133 ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags, |
1098 CharClass *cc, | 1134 CharClassBuilder *cc, |
1099 RegexpStatus* status) { | 1135 RegexpStatus* status) { |
1100 // Decide whether to parse. | 1136 // Decide whether to parse. |
1101 if (!(parse_flags & Regexp::UnicodeGroups)) | 1137 if (!(parse_flags & Regexp::UnicodeGroups)) |
1102 return kParseNothing; | 1138 return kParseNothing; |
1103 if (s->size() < 2 || (*s)[0] != '\\') | 1139 if (s->size() < 2 || (*s)[0] != '\\') |
1104 return kParseNothing; | 1140 return kParseNothing; |
1105 Rune c = (*s)[1]; | 1141 Rune c = (*s)[1]; |
1106 if (c != 'p' && c != 'P') | 1142 if (c != 'p' && c != 'P') |
1107 return kParseNothing; | 1143 return kParseNothing; |
1108 | 1144 |
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1153 AddUGroup(cc, g, parse_flags); | 1189 AddUGroup(cc, g, parse_flags); |
1154 else | 1190 else |
1155 AddNegatedUGroup(cc, g, parse_flags); | 1191 AddNegatedUGroup(cc, g, parse_flags); |
1156 return kParseOk; | 1192 return kParseOk; |
1157 } | 1193 } |
1158 | 1194 |
1159 // Parses a character class name like [:alnum:]. | 1195 // Parses a character class name like [:alnum:]. |
1160 // Sets *s to span the remainder of the string. | 1196 // Sets *s to span the remainder of the string. |
1161 // Adds the ranges corresponding to the class to ranges. | 1197 // Adds the ranges corresponding to the class to ranges. |
1162 static ParseStatus ParseCCName(StringPiece* s, Regexp::ParseFlags parse_flags, | 1198 static ParseStatus ParseCCName(StringPiece* s, Regexp::ParseFlags parse_flags, |
1163 CharClass *cc, | 1199 CharClassBuilder *cc, |
1164 RegexpStatus* status) { | 1200 RegexpStatus* status) { |
1165 // Check begins with [: | 1201 // Check begins with [: |
1166 const char* p = s->data(); | 1202 const char* p = s->data(); |
1167 const char* ep = s->data() + s->size(); | 1203 const char* ep = s->data() + s->size(); |
1168 if (ep - p < 2 || p[0] != '[' || p[1] != ':') | 1204 if (ep - p < 2 || p[0] != '[' || p[1] != ':') |
1169 return kParseNothing; | 1205 return kParseNothing; |
1170 | 1206 |
1171 // Look for closing :]. | 1207 // Look for closing :]. |
1172 const char* q; | 1208 const char* q; |
1173 for (q = p+2; q <= ep-2 && (*q != ':' || *(q+1) != ']'); q++) | 1209 for (q = p+2; q <= ep-2 && (*q != ':' || *(q+1) != ']'); q++) |
(...skipping 76 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1250 RegexpStatus* status) { | 1286 RegexpStatus* status) { |
1251 StringPiece whole_class = *s; | 1287 StringPiece whole_class = *s; |
1252 if (s->size() == 0 || (*s)[0] != '[') { | 1288 if (s->size() == 0 || (*s)[0] != '[') { |
1253 // Caller checked this. | 1289 // Caller checked this. |
1254 status->set_code(kRegexpInternalError); | 1290 status->set_code(kRegexpInternalError); |
1255 status->set_error_arg(NULL); | 1291 status->set_error_arg(NULL); |
1256 return false; | 1292 return false; |
1257 } | 1293 } |
1258 bool negated = false; | 1294 bool negated = false; |
1259 Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); | 1295 Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); |
| 1296 re->ccb_ = new CharClassBuilder; |
1260 s->remove_prefix(1); // '[' | 1297 s->remove_prefix(1); // '[' |
1261 if (s->size() > 0 && (*s)[0] == '^') { | 1298 if (s->size() > 0 && (*s)[0] == '^') { |
1262 s->remove_prefix(1); // '^' | 1299 s->remove_prefix(1); // '^' |
1263 negated = true; | 1300 negated = true; |
1264 if (!(flags_ & ClassNL) || (flags_ & NeverNL)) { | 1301 if (!(flags_ & ClassNL) || (flags_ & NeverNL)) { |
1265 // If NL can't match implicitly, then pretend | 1302 // If NL can't match implicitly, then pretend |
1266 // negated classes include a leading \n. | 1303 // negated classes include a leading \n. |
1267 re->cc_->AddRange('\n', '\n'); | 1304 re->ccb_->AddRange('\n', '\n'); |
1268 } | 1305 } |
1269 } | 1306 } |
1270 bool first = true; // ] is okay as first char in class | 1307 bool first = true; // ] is okay as first char in class |
1271 while (s->size() > 0 && ((*s)[0] != ']' || first)) { | 1308 while (s->size() > 0 && ((*s)[0] != ']' || first)) { |
1272 // - is only okay unescaped as first or last in class. | 1309 // - is only okay unescaped as first or last in class. |
1273 // Except that Perl allows - anywhere. | 1310 // Except that Perl allows - anywhere. |
1274 if ((*s)[0] == '-' && !first && !(flags_&PerlX) && | 1311 if ((*s)[0] == '-' && !first && !(flags_&PerlX) && |
1275 (s->size() == 1 || (*s)[1] != ']')) { | 1312 (s->size() == 1 || (*s)[1] != ']')) { |
1276 StringPiece t = *s; | 1313 StringPiece t = *s; |
1277 t.remove_prefix(1); // '-' | 1314 t.remove_prefix(1); // '-' |
1278 Rune r; | 1315 Rune r; |
1279 int n = StringPieceToRune(&r, &t, status); | 1316 int n = StringPieceToRune(&r, &t, status); |
1280 if (n < 0) { | 1317 if (n < 0) { |
1281 re->Decref(); | 1318 re->Decref(); |
1282 return false; | 1319 return false; |
1283 } | 1320 } |
1284 status->set_code(kRegexpBadCharRange); | 1321 status->set_code(kRegexpBadCharRange); |
1285 status->set_error_arg(StringPiece(s->data(), 1+n)); | 1322 status->set_error_arg(StringPiece(s->data(), 1+n)); |
1286 re->Decref(); | 1323 re->Decref(); |
1287 return false; | 1324 return false; |
1288 } | 1325 } |
1289 first = false; | 1326 first = false; |
1290 | 1327 |
1291 // Look for [:alnum:] etc. | 1328 // Look for [:alnum:] etc. |
1292 if (s->size() > 2 && (*s)[0] == '[' && (*s)[1] == ':') { | 1329 if (s->size() > 2 && (*s)[0] == '[' && (*s)[1] == ':') { |
1293 switch (ParseCCName(s, flags_, re->cc_, status)) { | 1330 switch (ParseCCName(s, flags_, re->ccb_, status)) { |
1294 case kParseOk: | 1331 case kParseOk: |
1295 continue; | 1332 continue; |
1296 case kParseError: | 1333 case kParseError: |
1297 re->Decref(); | 1334 re->Decref(); |
1298 return false; | 1335 return false; |
1299 case kParseNothing: | 1336 case kParseNothing: |
1300 break; | 1337 break; |
1301 } | 1338 } |
1302 } | 1339 } |
1303 | 1340 |
1304 // Look for Unicode character group like \p{Han} | 1341 // Look for Unicode character group like \p{Han} |
1305 if (s->size() > 2 && | 1342 if (s->size() > 2 && |
1306 (*s)[0] == '\\' && | 1343 (*s)[0] == '\\' && |
1307 ((*s)[1] == 'p' || (*s)[1] == 'P')) { | 1344 ((*s)[1] == 'p' || (*s)[1] == 'P')) { |
1308 switch (ParseUnicodeGroup(s, flags_, re->cc_, status)) { | 1345 switch (ParseUnicodeGroup(s, flags_, re->ccb_, status)) { |
1309 case kParseOk: | 1346 case kParseOk: |
1310 continue; | 1347 continue; |
1311 case kParseError: | 1348 case kParseError: |
1312 re->Decref(); | 1349 re->Decref(); |
1313 return false; | 1350 return false; |
1314 case kParseNothing: | 1351 case kParseNothing: |
1315 break; | 1352 break; |
1316 } | 1353 } |
1317 } | 1354 } |
1318 | 1355 |
1319 // Look for Perl character class symbols (extension). | 1356 // Look for Perl character class symbols (extension). |
1320 UGroup *g = MaybeParsePerlCCEscape(s, flags_); | 1357 UGroup *g = MaybeParsePerlCCEscape(s, flags_); |
1321 if (g != NULL) { | 1358 if (g != NULL) { |
1322 AddUGroup(re->cc_, g, flags_); | 1359 AddUGroup(re->ccb_, g, flags_); |
1323 continue; | 1360 continue; |
1324 } | 1361 } |
1325 | 1362 |
1326 // Otherwise assume single character or simple range. | 1363 // Otherwise assume single character or simple range. |
1327 RuneRange rr; | 1364 RuneRange rr; |
1328 if (!ParseCCRange(s, &rr, whole_class, status)) { | 1365 if (!ParseCCRange(s, &rr, whole_class, status)) { |
1329 re->Decref(); | 1366 re->Decref(); |
1330 return false; | 1367 return false; |
1331 } | 1368 } |
1332 // AddRange is usually called in response to a class like | 1369 // AddRange is usually called in response to a class like |
1333 // \p{Foo} or [[:foo:]]; for those, it filters \n out unless | 1370 // \p{Foo} or [[:foo:]]; for those, it filters \n out unless |
1334 // Regexp::ClassNL is set. In an explicit range or singleton | 1371 // Regexp::ClassNL is set. In an explicit range or singleton |
1335 // like we just parsed, we do not filter \n out, so set ClassNL | 1372 // like we just parsed, we do not filter \n out, so set ClassNL |
1336 // in the flags. | 1373 // in the flags. |
1337 AddRange(re->cc_, rr.lo, rr.hi, flags_ | Regexp::ClassNL); | 1374 AddRange(re->ccb_, rr.lo, rr.hi, flags_ | Regexp::ClassNL); |
1338 } | 1375 } |
1339 if (s->size() == 0) { | 1376 if (s->size() == 0) { |
1340 status->set_code(kRegexpMissingBracket); | 1377 status->set_code(kRegexpMissingBracket); |
1341 status->set_error_arg(whole_class); | 1378 status->set_error_arg(whole_class); |
1342 re->Decref(); | 1379 re->Decref(); |
1343 return false; | 1380 return false; |
1344 } | 1381 } |
1345 s->remove_prefix(1); // ']' | 1382 s->remove_prefix(1); // ']' |
1346 | 1383 |
1347 if (negated) | 1384 if (negated) |
1348 re->cc_->Negate(); | 1385 re->ccb_->Negate(); |
1349 re->cc_->RemoveAbove(rune_max_); | 1386 re->ccb_->RemoveAbove(rune_max_); |
1350 | 1387 |
1351 *out_re = re; | 1388 *out_re = re; |
1352 return true; | 1389 return true; |
1353 } | 1390 } |
1354 | 1391 |
1355 // Is this a valid capture name? [A-Za-z0-9_]+ | 1392 // Is this a valid capture name? [A-Za-z0-9_]+ |
1356 // PCRE limits names to 32 bytes. | 1393 // PCRE limits names to 32 bytes. |
1357 // Python rejects names starting with digits. | 1394 // Python rejects names starting with digits. |
1358 // We don't enforce either of those. | 1395 // We don't enforce either of those. |
1359 static bool IsValidCaptureName(const StringPiece& name) { | 1396 static bool IsValidCaptureName(const StringPiece& name) { |
(...skipping 385 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1745 return NULL; | 1782 return NULL; |
1746 if (!ps.PushLiteral(r)) | 1783 if (!ps.PushLiteral(r)) |
1747 return NULL; | 1784 return NULL; |
1748 } | 1785 } |
1749 break; | 1786 break; |
1750 } | 1787 } |
1751 } | 1788 } |
1752 | 1789 |
1753 if (t[1] == 'p' || t[1] == 'P') { | 1790 if (t[1] == 'p' || t[1] == 'P') { |
1754 Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase); | 1791 Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase); |
1755 switch (ParseUnicodeGroup(&t, ps.flags(), re->cc_, status)) { | 1792 re->ccb_ = new CharClassBuilder; |
| 1793 switch (ParseUnicodeGroup(&t, ps.flags(), re->ccb_, status)) { |
1756 case kParseOk: | 1794 case kParseOk: |
1757 if (!ps.PushRegexp(re)) | 1795 if (!ps.PushRegexp(re)) |
1758 return NULL; | 1796 return NULL; |
1759 goto Break2; | 1797 goto Break2; |
1760 case kParseError: | 1798 case kParseError: |
1761 re->Decref(); | 1799 re->Decref(); |
1762 return NULL; | 1800 return NULL; |
1763 case kParseNothing: | 1801 case kParseNothing: |
1764 re->Decref(); | 1802 re->Decref(); |
1765 break; | 1803 break; |
1766 } | 1804 } |
1767 } | 1805 } |
1768 | 1806 |
1769 UGroup *g = MaybeParsePerlCCEscape(&t, ps.flags()); | 1807 UGroup *g = MaybeParsePerlCCEscape(&t, ps.flags()); |
1770 if (g != NULL) { | 1808 if (g != NULL) { |
1771 Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase); | 1809 Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase); |
1772 AddUGroup(re->cc_, g, ps.flags()); | 1810 re->ccb_ = new CharClassBuilder; |
| 1811 AddUGroup(re->ccb_, g, ps.flags()); |
1773 if (!ps.PushRegexp(re)) | 1812 if (!ps.PushRegexp(re)) |
1774 return NULL; | 1813 return NULL; |
1775 break; | 1814 break; |
1776 } | 1815 } |
1777 | 1816 |
1778 Rune r; | 1817 Rune r; |
1779 if (!ParseEscape(&t, &r, status, ps.rune_max())) | 1818 if (!ParseEscape(&t, &r, status, ps.rune_max())) |
1780 return NULL; | 1819 return NULL; |
1781 if (!ps.PushLiteral(r)) | 1820 if (!ps.PushLiteral(r)) |
1782 return NULL; | 1821 return NULL; |
1783 break; | 1822 break; |
1784 } | 1823 } |
1785 } | 1824 } |
1786 Break2: | 1825 Break2: |
1787 lastunary = isunary; | 1826 lastunary = isunary; |
1788 } | 1827 } |
1789 return ps.DoFinish(); | 1828 return ps.DoFinish(); |
1790 } | 1829 } |
1791 | 1830 |
1792 } // namespace re2 | 1831 } // namespace re2 |
OLD | NEW |