Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code | Sign in
(165)

Delta Between Two Patch Sets: src/pkg/exp/regexp/syntax/parse.go

Issue 4635082: code review 4635082: exp/regexp/syntax: finish Regexp manipulation (Closed)
Left Patch Set: diff -r e93aca7ce587 https://go.googlecode.com/hg/ Created 13 years, 8 months ago
Right Patch Set: diff -r 229a6b3d92e7 https://go.googlecode.com/hg/ Created 13 years, 8 months ago
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments. Please Sign in to add in-line comments.
Jump to:
Left: Side by side diff | Download
Right: Side by side diff | Download
« no previous file with change/comment | « no previous file | src/pkg/exp/regexp/syntax/parse_test.go » ('j') | no next file with change/comment »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
LEFTRIGHT
1 // Copyright 2011 The Go Authors. All rights reserved. 1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style 2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file. 3 // license that can be found in the LICENSE file.
4 4
5 package syntax 5 package syntax
6 6
7 import ( 7 import (
8 "os" 8 "os"
9 "sort" 9 "sort"
10 "strings" 10 "strings"
(...skipping 283 matching lines...) Expand 10 before | Expand all | Expand 10 after
294 } 294 }
295 if cap(re.Rune)-len(re.Rune) > 100 { 295 if cap(re.Rune)-len(re.Rune) > 100 {
296 // re.Rune will not grow any more. 296 // re.Rune will not grow any more.
297 // Make a copy or inline to reclaim storage. 297 // Make a copy or inline to reclaim storage.
298 re.Rune = append(re.Rune0[:0], re.Rune...) 298 re.Rune = append(re.Rune0[:0], re.Rune...)
299 } 299 }
300 } 300 }
301 } 301 }
302 302
303 // collapse returns the result of applying op to sub. 303 // collapse returns the result of applying op to sub.
304 // If sub contains op nodes, they all 304 // If sub contains op nodes, they all get hoisted up
305 // get flattened into a single node. 305 // so that there is never a concat of a concat or an
r 2011/06/30 03:06:02 odd line break
306 // alternate of an alternate.
306 func (p *parser) collapse(subs []*Regexp, op Op) *Regexp { 307 func (p *parser) collapse(subs []*Regexp, op Op) *Regexp {
307 if len(subs) == 1 { 308 if len(subs) == 1 {
308 return subs[0] 309 return subs[0]
309 } 310 }
310 re := p.newRegexp(op) 311 re := p.newRegexp(op)
311 re.Sub = re.Sub0[:0] 312 re.Sub = re.Sub0[:0]
312 for _, sub := range subs { 313 for _, sub := range subs {
313 if sub.Op == op { 314 if sub.Op == op {
314 re.Sub = append(re.Sub, sub.Sub...) 315 re.Sub = append(re.Sub, sub.Sub...)
315 p.reuse(sub) 316 p.reuse(sub)
(...skipping 57 matching lines...) Expand 10 before | Expand all | Expand 10 after
373 } 374 }
374 } 375 }
375 } 376 }
376 377
377 // Found end of a run with common leading literal string: 378 // Found end of a run with common leading literal string:
378 // sub[start:i] all begin with str[0:len(str)], but sub[i] 379 // sub[start:i] all begin with str[0:len(str)], but sub[i]
379 // does not even begin with str[0]. 380 // does not even begin with str[0].
380 // 381 //
381 // Factor out common string and append factored expression to ou t. 382 // Factor out common string and append factored expression to ou t.
382 if i == start { 383 if i == start {
383 » » » // Nothing to do - first iteration. 384 » » » // Nothing to do - run of length 0.
r 2011/06/30 03:06:02 first iteration has i == 0. you mean something els
384 } else if i == start+1 { 385 } else if i == start+1 {
385 // Just one: don't bother factoring. 386 // Just one: don't bother factoring.
386 out = append(out, sub[start]) 387 out = append(out, sub[start])
387 } else { 388 } else {
388 // Construct factored form: prefix(suffix1|suffix2|...) 389 // Construct factored form: prefix(suffix1|suffix2|...)
389 prefix := p.newRegexp(OpLiteral) 390 prefix := p.newRegexp(OpLiteral)
390 prefix.Flags = strflags 391 prefix.Flags = strflags
391 prefix.Rune = append(prefix.Rune[:0], str...) 392 prefix.Rune = append(prefix.Rune[:0], str...)
392 393
393 for j := start; j < i; j++ { 394 for j := start; j < i; j++ {
394 sub[j] = p.removeLeadingString(sub[j], len(str)) 395 sub[j] = p.removeLeadingString(sub[j], len(str))
395 } 396 }
396 suffix := p.collapse(sub[start:i], OpAlternate) // recur se 397 suffix := p.collapse(sub[start:i], OpAlternate) // recur se
397 398
398 re := p.newRegexp(OpConcat) 399 re := p.newRegexp(OpConcat)
399 re.Sub = append(re.Sub[:0], prefix, suffix) 400 re.Sub = append(re.Sub[:0], prefix, suffix)
400 out = append(out, re) 401 out = append(out, re)
401 } 402 }
402 403
403 » » // Prepare for next iteration (if there is one). 404 » » // Prepare for next iteration.
r 2011/06/30 03:06:02 delete parenthetical remark
404 start = i 405 start = i
405 str = istr 406 str = istr
406 strflags = iflags 407 strflags = iflags
407 } 408 }
408 sub = out 409 sub = out
409 410
410 // Round 2: Factor out common complex prefixes, 411 // Round 2: Factor out common complex prefixes,
411 // just the first piece of each concatenation, 412 // just the first piece of each concatenation,
412 // whatever it is. This is good enough a lot of the time. 413 // whatever it is. This is good enough a lot of the time.
413 start = 0 414 start = 0
(...skipping 12 matching lines...) Expand all
426 if first != nil && first.Equal(ifirst) { 427 if first != nil && first.Equal(ifirst) {
427 continue 428 continue
428 } 429 }
429 } 430 }
430 431
431 // Found end of a run with common leading regexp: 432 // Found end of a run with common leading regexp:
432 // sub[start:i] all begin with first but sub[i] does not. 433 // sub[start:i] all begin with first but sub[i] does not.
433 // 434 //
434 // Factor out common regexp and append factored expression to ou t. 435 // Factor out common regexp and append factored expression to ou t.
435 if i == start { 436 if i == start {
436 » » » // Nothing to do - first iteration. 437 » » » // Nothing to do - run of length 0.
r 2011/06/30 03:06:02 first iteration has i == 0. you mean something els
437 } else if i == start+1 { 438 } else if i == start+1 {
438 // Just one: don't bother factoring. 439 // Just one: don't bother factoring.
439 out = append(out, sub[start]) 440 out = append(out, sub[start])
440 } else { 441 } else {
441 // Construct factored form: prefix(suffix1|suffix2|...) 442 // Construct factored form: prefix(suffix1|suffix2|...)
442 prefix := first 443 prefix := first
443 444
444 for j := start; j < i; j++ { 445 for j := start; j < i; j++ {
445 reuse := j != start // prefix came from sub[star t]· 446 reuse := j != start // prefix came from sub[star t]·
446 sub[j] = p.removeLeadingRegexp(sub[j], reuse) 447 sub[j] = p.removeLeadingRegexp(sub[j], reuse)
447 } 448 }
448 suffix := p.collapse(sub[start:i], OpAlternate) // recur se 449 suffix := p.collapse(sub[start:i], OpAlternate) // recur se
449 450
450 re := p.newRegexp(OpConcat) 451 re := p.newRegexp(OpConcat)
451 re.Sub = append(re.Sub[:0], prefix, suffix) 452 re.Sub = append(re.Sub[:0], prefix, suffix)
452 out = append(out, re) 453 out = append(out, re)
453 } 454 }
454 455
455 » » // Prepare for next iteration (if there is one). 456 » » // Prepare for next iteration.
r 2011/06/30 03:06:02 ditto
456 start = i 457 start = i
457 first = ifirst 458 first = ifirst
458 } 459 }
459 sub = out 460 sub = out
460 461
461 // Round 3: Collapse runs of single literals into character classes. 462 // Round 3: Collapse runs of single literals into character classes.
462 start = 0 463 start = 0
463 out = sub[:0] 464 out = sub[:0]
464 for i := 0; i <= len(sub); i++ { 465 for i := 0; i <= len(sub); i++ {
465 // Invariant: the Regexps that were in sub[0:start] have been 466 // Invariant: the Regexps that were in sub[0:start] have been
466 // used or marked for reuse, and the slice space has been reused 467 // used or marked for reuse, and the slice space has been reused
467 // for out (len(out) <= start). 468 // for out (len(out) <= start).
468 // 469 //
469 // Invariant: sub[start:i] consists of regexps that are either 470 // Invariant: sub[start:i] consists of regexps that are either
470 // literal runes or character classes. 471 // literal runes or character classes.
471 if i < len(sub) && isCharClass(sub[i]) { 472 if i < len(sub) && isCharClass(sub[i]) {
472 continue 473 continue
473 } 474 }
474 475
475 // sub[i] is not a char or char class; 476 // sub[i] is not a char or char class;
476 // emit char class for sub[start:i]... 477 // emit char class for sub[start:i]...
477 if i == start { 478 if i == start {
478 » » » // Nothing to do 479 » » » // Nothing to do - run of length 0.
479 } else if i == start+1 { 480 } else if i == start+1 {
480 out = append(out, sub[start]) 481 out = append(out, sub[start])
481 } else { 482 } else {
482 // Make new char class. 483 // Make new char class.
483 // Start with most complex regexp in sub[start]. 484 // Start with most complex regexp in sub[start].
484 max := start 485 max := start
485 for j := start + 1; j < i; j++ { 486 for j := start + 1; j < i; j++ {
486 if sub[max].Op < sub[j].Op || sub[max].Op == sub [j].Op && len(sub[max].Rune) < len(sub[j].Rune) { 487 if sub[max].Op < sub[j].Op || sub[max].Op == sub [j].Op && len(sub[max].Rune) < len(sub[j].Rune) {
487 max = j 488 max = j
488 } 489 }
(...skipping 23 matching lines...) Expand all
512 if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch { 513 if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch {
513 continue 514 continue
514 } 515 }
515 out = append(out, sub[i]) 516 out = append(out, sub[i])
516 } 517 }
517 sub = out 518 sub = out
518 519
519 return sub 520 return sub
520 } 521 }
521 522
522 // leadingString returns the leading literal string that re starts with. 523 // leadingString returns the leading literal string that re begins with.
r 2011/06/30 03:06:02 s/re starts with/begins re/
523 // The string refers to storage in re or its children. 524 // The string refers to storage in re or its children.
524 func (p *parser) leadingString(re *Regexp) ([]int, Flags) { 525 func (p *parser) leadingString(re *Regexp) ([]int, Flags) {
525 if re.Op == OpConcat && len(re.Sub) > 0 { 526 if re.Op == OpConcat && len(re.Sub) > 0 {
526 re = re.Sub[0] 527 re = re.Sub[0]
527 } 528 }
528 if re.Op != OpLiteral { 529 if re.Op != OpLiteral {
529 return nil, 0 530 return nil, 0
530 } 531 }
531 return re.Rune, re.Flags & FoldCase 532 return re.Rune, re.Flags & FoldCase
532 } 533 }
(...skipping 28 matching lines...) Expand all
561 562
562 if re.Op == OpLiteral { 563 if re.Op == OpLiteral {
563 re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])] 564 re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])]
564 if len(re.Rune) == 0 { 565 if len(re.Rune) == 0 {
565 re.Op = OpEmptyMatch 566 re.Op = OpEmptyMatch
566 } 567 }
567 } 568 }
568 return re 569 return re
569 } 570 }
570 571
571 // leadingRegexp returns the leading regexp that re starts with. 572 // leadingRegexp returns the leading regexp that re begins with.
572 // The regexp refers to storage in re or its children. 573 // The regexp refers to storage in re or its children.
573 func (p *parser) leadingRegexp(re *Regexp) *Regexp { 574 func (p *parser) leadingRegexp(re *Regexp) *Regexp {
574 if re.Op == OpEmptyMatch { 575 if re.Op == OpEmptyMatch {
575 return nil 576 return nil
576 } 577 }
577 if re.Op == OpConcat && len(re.Sub) > 0 { 578 if re.Op == OpConcat && len(re.Sub) > 0 {
578 sub := re.Sub[0] 579 sub := re.Sub[0]
579 if sub.Op == OpEmptyMatch { 580 if sub.Op == OpEmptyMatch {
580 return nil 581 return nil
581 } 582 }
(...skipping 448 matching lines...) Expand 10 before | Expand all | Expand 10 after
1030 // If it sits above an opVerticalBar, swap it below 1031 // If it sits above an opVerticalBar, swap it below
1031 // (things below an opVerticalBar become an alternation). 1032 // (things below an opVerticalBar become an alternation).
1032 // Otherwise, push a new vertical bar. 1033 // Otherwise, push a new vertical bar.
1033 if !p.swapVerticalBar() { 1034 if !p.swapVerticalBar() {
1034 p.op(opVerticalBar) 1035 p.op(opVerticalBar)
1035 } 1036 }
1036 1037
1037 return nil 1038 return nil
1038 } 1039 }
1039 1040
1040 // mergeCharClass makes re1 = re1|re2. 1041 // mergeCharClass makes dst = dst|src.
1041 // The caller must ensure that re1.Op > re2.Op. 1042 // The caller must ensure that dst.Op >= src.Op,
r 2011/06/30 03:06:02 i'd expect re2.Op > re1.Op
1042 func mergeCharClass(re1, re2 *Regexp) { 1043 // to reduce the amount of copying.
1043 » switch re1.Op { 1044 func mergeCharClass(dst, src *Regexp) {
1045 » switch dst.Op {
1044 case OpAnyChar: 1046 case OpAnyChar:
1045 » » // re2 doesn't add anything. 1047 » » // src doesn't add anything.
1046 case OpAnyCharNotNL: 1048 case OpAnyCharNotNL:
1047 » » // re2 might add \n 1049 » » // src might add \n
1048 » » if matchRune(re2, '\n') { 1050 » » if matchRune(src, '\n') {
1049 » » » re1.Op = OpAnyChar 1051 » » » dst.Op = OpAnyChar
1050 } 1052 }
1051 case OpCharClass: 1053 case OpCharClass:
1052 » » // re2 is simpler, so either literal or char class 1054 » » // src is simpler, so either literal or char class
1053 » » if re2.Op == OpLiteral { 1055 » » if src.Op == OpLiteral {
1054 » » » re1.Rune = appendRange(re1.Rune, re2.Rune[0], re2.Rune[0 ]) 1056 » » » dst.Rune = appendRange(dst.Rune, src.Rune[0], src.Rune[0 ])
1055 } else { 1057 } else {
1056 » » » re1.Rune = appendClass(re1.Rune, re2.Rune) 1058 » » » dst.Rune = appendClass(dst.Rune, src.Rune)
1057 } 1059 }
1058 case OpLiteral: 1060 case OpLiteral:
1059 // both literal 1061 // both literal
1060 » » if re2.Rune[0] == re1.Rune[0] { 1062 » » if src.Rune[0] == dst.Rune[0] {
1061 break 1063 break
1062 } 1064 }
1063 » » re1.Op = OpCharClass 1065 » » dst.Op = OpCharClass
1064 » » re1.Rune = append(re1.Rune, re1.Rune[0]) 1066 » » dst.Rune = append(dst.Rune, dst.Rune[0])
1065 » » re1.Rune = appendRange(re1.Rune, re2.Rune[0], re2.Rune[0]) 1067 » » dst.Rune = appendRange(dst.Rune, src.Rune[0], src.Rune[0])
1066 } 1068 }
1067 } 1069 }
1068 1070
1069 // If the top of the stack is an element followed by an opVerticalBar 1071 // If the top of the stack is an element followed by an opVerticalBar
1070 // swapVerticalBar swaps the two and returns true. 1072 // swapVerticalBar swaps the two and returns true.
1071 // Otherwise it returns false. 1073 // Otherwise it returns false.
1072 func (p *parser) swapVerticalBar() bool { 1074 func (p *parser) swapVerticalBar() bool {
1073 // If above and below vertical bar are literal or char class, 1075 // If above and below vertical bar are literal or char class,
1074 // can merge into a single char class. 1076 // can merge into a single char class.
1075 n := len(p.stack) 1077 n := len(p.stack)
(...skipping 711 matching lines...) Expand 10 before | Expand all | Expand 10 after
1787 return c - '0' 1789 return c - '0'
1788 } 1790 }
1789 if 'a' <= c && c <= 'f' { 1791 if 'a' <= c && c <= 'f' {
1790 return c - 'a' + 10 1792 return c - 'a' + 10
1791 } 1793 }
1792 if 'A' <= c && c <= 'F' { 1794 if 'A' <= c && c <= 'F' {
1793 return c - 'A' + 10 1795 return c - 'A' + 10
1794 } 1796 }
1795 return -1 1797 return -1
1796 } 1798 }
LEFTRIGHT

Powered by Google App Engine
RSS Feeds Recent Issues | This issue
This is Rietveld f62528b