src/pkg/exp/regexp/syntax/parse.go - Issue 4635082: code review 4635082: exp/regexp/syntax: finish Regexp manipulation

Delta Between Two Patch Sets: src/pkg/exp/regexp/syntax/parse.go

Issue 4635082: code review 4635082: exp/regexp/syntax: finish Regexp manipulation (Closed)

Left Patch Set: diff -r e93aca7ce587 https://go.googlecode.com/hg/ Created 13 years, 8 months ago

Right Patch Set: diff -r 229a6b3d92e7 https://go.googlecode.com/hg/ Created 13 years, 8 months ago

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments. Please Sign in to add in-line comments.

Jump to:

Left: Side by side diff | Download
Right: Side by side diff | Download

LEFT	RIGHT
1 // Copyright 2011 The Go Authors. All rights reserved.	1 // Copyright 2011 The Go Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style	2 // Use of this source code is governed by a BSD-style

3 // license that can be found in the LICENSE file.	3 // license that can be found in the LICENSE file.

4	4

5 package syntax	5 package syntax

6	6

7 import (	7 import (

8 "os"	8 "os"

9 "sort"	9 "sort"

10 "strings"	10 "strings"

(...skipping 283 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
294 }	294 }

295 if cap(re.Rune)-len(re.Rune) > 100 {	295 if cap(re.Rune)-len(re.Rune) > 100 {

296 // re.Rune will not grow any more.	296 // re.Rune will not grow any more.

297 // Make a copy or inline to reclaim storage.	297 // Make a copy or inline to reclaim storage.

298 re.Rune = append(re.Rune0[:0], re.Rune...)	298 re.Rune = append(re.Rune0[:0], re.Rune...)

299 }	299 }

300 }	300 }

301 }	301 }

302	302

303 // collapse returns the result of applying op to sub.	303 // collapse returns the result of applying op to sub.

304 // If sub contains op nodes, they all	304 // If sub contains op nodes, they all get hoisted up

305 // get flattened into a single node.	305 // so that there is never a concat of a concat or an
r 2011/06/30 03:06:02 odd line break odd line break
	306 // alternate of an alternate.

306 func (p parser) collapse(subs []Regexp, op Op) *Regexp {	307 func (p parser) collapse(subs []Regexp, op Op) *Regexp {

307 if len(subs) == 1 {	308 if len(subs) == 1 {

308 return subs[0]	309 return subs[0]

309 }	310 }

310 re := p.newRegexp(op)	311 re := p.newRegexp(op)

311 re.Sub = re.Sub0[:0]	312 re.Sub = re.Sub0[:0]

312 for _, sub := range subs {	313 for _, sub := range subs {

313 if sub.Op == op {	314 if sub.Op == op {

314 re.Sub = append(re.Sub, sub.Sub...)	315 re.Sub = append(re.Sub, sub.Sub...)

315 p.reuse(sub)	316 p.reuse(sub)

(...skipping 57 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
373 }	374 }

374 }	375 }

375 }	376 }

376	377

377 // Found end of a run with common leading literal string:	378 // Found end of a run with common leading literal string:

378 // sub[start:i] all begin with str[0:len(str)], but sub[i]	379 // sub[start:i] all begin with str[0:len(str)], but sub[i]

379 // does not even begin with str[0].	380 // does not even begin with str[0].

380 //	381 //

381 // Factor out common string and append factored expression to ou t.	382 // Factor out common string and append factored expression to ou t.

382 if i == start {	383 if i == start {

383 » » » // Nothing to do - first iteration.	384 » » » // Nothing to do - run of length 0.
r 2011/06/30 03:06:02 first iteration has i == 0. you mean something els first iteration has i == 0. you mean something else.
384 } else if i == start+1 {	385 } else if i == start+1 {

385 // Just one: don't bother factoring.	386 // Just one: don't bother factoring.

386 out = append(out, sub[start])	387 out = append(out, sub[start])

387 } else {	388 } else {

388 // Construct factored form: prefix(suffix1\|suffix2\|...)	389 // Construct factored form: prefix(suffix1\|suffix2\|...)

389 prefix := p.newRegexp(OpLiteral)	390 prefix := p.newRegexp(OpLiteral)

390 prefix.Flags = strflags	391 prefix.Flags = strflags

391 prefix.Rune = append(prefix.Rune[:0], str...)	392 prefix.Rune = append(prefix.Rune[:0], str...)

392	393

393 for j := start; j < i; j++ {	394 for j := start; j < i; j++ {

394 sub[j] = p.removeLeadingString(sub[j], len(str))	395 sub[j] = p.removeLeadingString(sub[j], len(str))

395 }	396 }

396 suffix := p.collapse(sub[start:i], OpAlternate) // recur se	397 suffix := p.collapse(sub[start:i], OpAlternate) // recur se

397	398

398 re := p.newRegexp(OpConcat)	399 re := p.newRegexp(OpConcat)

399 re.Sub = append(re.Sub[:0], prefix, suffix)	400 re.Sub = append(re.Sub[:0], prefix, suffix)

400 out = append(out, re)	401 out = append(out, re)

401 }	402 }

402	403

403 » » // Prepare for next iteration (if there is one).	404 » » // Prepare for next iteration.
r 2011/06/30 03:06:02 delete parenthetical remark delete parenthetical remark
404 start = i	405 start = i

405 str = istr	406 str = istr

406 strflags = iflags	407 strflags = iflags

407 }	408 }

408 sub = out	409 sub = out

409	410

410 // Round 2: Factor out common complex prefixes,	411 // Round 2: Factor out common complex prefixes,

411 // just the first piece of each concatenation,	412 // just the first piece of each concatenation,

412 // whatever it is. This is good enough a lot of the time.	413 // whatever it is. This is good enough a lot of the time.

413 start = 0	414 start = 0

(...skipping 12 matching lines...) Expand all Loading...
426 if first != nil && first.Equal(ifirst) {	427 if first != nil && first.Equal(ifirst) {

427 continue	428 continue

428 }	429 }

429 }	430 }

430	431

431 // Found end of a run with common leading regexp:	432 // Found end of a run with common leading regexp:

432 // sub[start:i] all begin with first but sub[i] does not.	433 // sub[start:i] all begin with first but sub[i] does not.

433 //	434 //

434 // Factor out common regexp and append factored expression to ou t.	435 // Factor out common regexp and append factored expression to ou t.

435 if i == start {	436 if i == start {

436 » » » // Nothing to do - first iteration.	437 » » » // Nothing to do - run of length 0.
r 2011/06/30 03:06:02 first iteration has i == 0. you mean something els first iteration has i == 0. you mean something else.
437 } else if i == start+1 {	438 } else if i == start+1 {

438 // Just one: don't bother factoring.	439 // Just one: don't bother factoring.

439 out = append(out, sub[start])	440 out = append(out, sub[start])

440 } else {	441 } else {

441 // Construct factored form: prefix(suffix1\|suffix2\|...)	442 // Construct factored form: prefix(suffix1\|suffix2\|...)

442 prefix := first	443 prefix := first

443	444

444 for j := start; j < i; j++ {	445 for j := start; j < i; j++ {

445 reuse := j != start // prefix came from sub[star t]·	446 reuse := j != start // prefix came from sub[star t]·

446 sub[j] = p.removeLeadingRegexp(sub[j], reuse)	447 sub[j] = p.removeLeadingRegexp(sub[j], reuse)

447 }	448 }

448 suffix := p.collapse(sub[start:i], OpAlternate) // recur se	449 suffix := p.collapse(sub[start:i], OpAlternate) // recur se

449	450

450 re := p.newRegexp(OpConcat)	451 re := p.newRegexp(OpConcat)

451 re.Sub = append(re.Sub[:0], prefix, suffix)	452 re.Sub = append(re.Sub[:0], prefix, suffix)

452 out = append(out, re)	453 out = append(out, re)

453 }	454 }

454	455

455 » » // Prepare for next iteration (if there is one).	456 » » // Prepare for next iteration.
r 2011/06/30 03:06:02 ditto ditto
456 start = i	457 start = i

457 first = ifirst	458 first = ifirst

458 }	459 }

459 sub = out	460 sub = out

460	461

461 // Round 3: Collapse runs of single literals into character classes.	462 // Round 3: Collapse runs of single literals into character classes.

462 start = 0	463 start = 0

463 out = sub[:0]	464 out = sub[:0]

464 for i := 0; i <= len(sub); i++ {	465 for i := 0; i <= len(sub); i++ {

465 // Invariant: the Regexps that were in sub[0:start] have been	466 // Invariant: the Regexps that were in sub[0:start] have been

466 // used or marked for reuse, and the slice space has been reused	467 // used or marked for reuse, and the slice space has been reused

467 // for out (len(out) <= start).	468 // for out (len(out) <= start).

468 //	469 //

469 // Invariant: sub[start:i] consists of regexps that are either	470 // Invariant: sub[start:i] consists of regexps that are either

470 // literal runes or character classes.	471 // literal runes or character classes.

471 if i < len(sub) && isCharClass(sub[i]) {	472 if i < len(sub) && isCharClass(sub[i]) {

472 continue	473 continue

473 }	474 }

474	475

475 // sub[i] is not a char or char class;	476 // sub[i] is not a char or char class;

476 // emit char class for sub[start:i]...	477 // emit char class for sub[start:i]...

477 if i == start {	478 if i == start {

478 » » » // Nothing to do	479 » » » // Nothing to do - run of length 0.

479 } else if i == start+1 {	480 } else if i == start+1 {

480 out = append(out, sub[start])	481 out = append(out, sub[start])

481 } else {	482 } else {

482 // Make new char class.	483 // Make new char class.

483 // Start with most complex regexp in sub[start].	484 // Start with most complex regexp in sub[start].

484 max := start	485 max := start

485 for j := start + 1; j < i; j++ {	486 for j := start + 1; j < i; j++ {

486 if sub[max].Op < sub[j].Op \|\| sub[max].Op == sub [j].Op && len(sub[max].Rune) < len(sub[j].Rune) {	487 if sub[max].Op < sub[j].Op \|\| sub[max].Op == sub [j].Op && len(sub[max].Rune) < len(sub[j].Rune) {

487 max = j	488 max = j

488 }	489 }

(...skipping 23 matching lines...) Expand all Loading...
512 if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch {	513 if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch {

513 continue	514 continue

514 }	515 }

515 out = append(out, sub[i])	516 out = append(out, sub[i])

516 }	517 }

517 sub = out	518 sub = out

518	519

519 return sub	520 return sub

520 }	521 }

521	522

522 // leadingString returns the leading literal string that re starts with.	523 // leadingString returns the leading literal string that re begins with.
r 2011/06/30 03:06:02 s/re starts with/begins re/ s/re starts with/begins re/
523 // The string refers to storage in re or its children.	524 // The string refers to storage in re or its children.

524 func (p parser) leadingString(re Regexp) ([]int, Flags) {	525 func (p parser) leadingString(re Regexp) ([]int, Flags) {

525 if re.Op == OpConcat && len(re.Sub) > 0 {	526 if re.Op == OpConcat && len(re.Sub) > 0 {

526 re = re.Sub[0]	527 re = re.Sub[0]

527 }	528 }

528 if re.Op != OpLiteral {	529 if re.Op != OpLiteral {

529 return nil, 0	530 return nil, 0

530 }	531 }

531 return re.Rune, re.Flags & FoldCase	532 return re.Rune, re.Flags & FoldCase

532 }	533 }

(...skipping 28 matching lines...) Expand all Loading...
561	562

562 if re.Op == OpLiteral {	563 if re.Op == OpLiteral {

563 re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])]	564 re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])]

564 if len(re.Rune) == 0 {	565 if len(re.Rune) == 0 {

565 re.Op = OpEmptyMatch	566 re.Op = OpEmptyMatch

566 }	567 }

567 }	568 }

568 return re	569 return re

569 }	570 }

570	571

571 // leadingRegexp returns the leading regexp that re starts with.	572 // leadingRegexp returns the leading regexp that re begins with.

572 // The regexp refers to storage in re or its children.	573 // The regexp refers to storage in re or its children.

573 func (p parser) leadingRegexp(re Regexp) *Regexp {	574 func (p parser) leadingRegexp(re Regexp) *Regexp {

574 if re.Op == OpEmptyMatch {	575 if re.Op == OpEmptyMatch {

575 return nil	576 return nil

576 }	577 }

577 if re.Op == OpConcat && len(re.Sub) > 0 {	578 if re.Op == OpConcat && len(re.Sub) > 0 {

578 sub := re.Sub[0]	579 sub := re.Sub[0]

579 if sub.Op == OpEmptyMatch {	580 if sub.Op == OpEmptyMatch {

580 return nil	581 return nil

581 }	582 }

(...skipping 448 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1030 // If it sits above an opVerticalBar, swap it below	1031 // If it sits above an opVerticalBar, swap it below

1031 // (things below an opVerticalBar become an alternation).	1032 // (things below an opVerticalBar become an alternation).

1032 // Otherwise, push a new vertical bar.	1033 // Otherwise, push a new vertical bar.

1033 if !p.swapVerticalBar() {	1034 if !p.swapVerticalBar() {

1034 p.op(opVerticalBar)	1035 p.op(opVerticalBar)

1035 }	1036 }

1036	1037

1037 return nil	1038 return nil

1038 }	1039 }

1039	1040

1040 // mergeCharClass makes re1 = re1\|re2.	1041 // mergeCharClass makes dst = dst\|src.

1041 // The caller must ensure that re1.Op > re2.Op.	1042 // The caller must ensure that dst.Op >= src.Op,
r 2011/06/30 03:06:02 i'd expect re2.Op > re1.Op i'd expect re2.Op > re1.Op
1042 func mergeCharClass(re1, re2 *Regexp) {	1043 // to reduce the amount of copying.

1043 » switch re1.Op {	1044 func mergeCharClass(dst, src *Regexp) {

	1045 » switch dst.Op {

1044 case OpAnyChar:	1046 case OpAnyChar:

1045 » » // re2 doesn't add anything.	1047 » » // src doesn't add anything.

1046 case OpAnyCharNotNL:	1048 case OpAnyCharNotNL:

1047 » » // re2 might add \n	1049 » » // src might add \n

1048 » » if matchRune(re2, '\n') {	1050 » » if matchRune(src, '\n') {

1049 » » » re1.Op = OpAnyChar	1051 » » » dst.Op = OpAnyChar

1050 }	1052 }

1051 case OpCharClass:	1053 case OpCharClass:

1052 » » // re2 is simpler, so either literal or char class	1054 » » // src is simpler, so either literal or char class

1053 » » if re2.Op == OpLiteral {	1055 » » if src.Op == OpLiteral {

1054 » » » re1.Rune = appendRange(re1.Rune, re2.Rune[0], re2.Rune[0 ])	1056 » » » dst.Rune = appendRange(dst.Rune, src.Rune[0], src.Rune[0 ])

1055 } else {	1057 } else {

1056 » » » re1.Rune = appendClass(re1.Rune, re2.Rune)	1058 » » » dst.Rune = appendClass(dst.Rune, src.Rune)

1057 }	1059 }

1058 case OpLiteral:	1060 case OpLiteral:

1059 // both literal	1061 // both literal

1060 » » if re2.Rune[0] == re1.Rune[0] {	1062 » » if src.Rune[0] == dst.Rune[0] {

1061 break	1063 break

1062 }	1064 }

1063 » » re1.Op = OpCharClass	1065 » » dst.Op = OpCharClass

1064 » » re1.Rune = append(re1.Rune, re1.Rune[0])	1066 » » dst.Rune = append(dst.Rune, dst.Rune[0])

1065 » » re1.Rune = appendRange(re1.Rune, re2.Rune[0], re2.Rune[0])	1067 » » dst.Rune = appendRange(dst.Rune, src.Rune[0], src.Rune[0])

1066 }	1068 }

1067 }	1069 }

1068	1070

1069 // If the top of the stack is an element followed by an opVerticalBar	1071 // If the top of the stack is an element followed by an opVerticalBar

1070 // swapVerticalBar swaps the two and returns true.	1072 // swapVerticalBar swaps the two and returns true.

1071 // Otherwise it returns false.	1073 // Otherwise it returns false.

1072 func (p *parser) swapVerticalBar() bool {	1074 func (p *parser) swapVerticalBar() bool {

1073 // If above and below vertical bar are literal or char class,	1075 // If above and below vertical bar are literal or char class,

1074 // can merge into a single char class.	1076 // can merge into a single char class.

1075 n := len(p.stack)	1077 n := len(p.stack)

(...skipping 711 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1787 return c - '0'	1789 return c - '0'

1788 }	1790 }

1789 if 'a' <= c && c <= 'f' {	1791 if 'a' <= c && c <= 'f' {

1790 return c - 'a' + 10	1792 return c - 'a' + 10

1791 }	1793 }

1792 if 'A' <= c && c <= 'F' {	1794 if 'A' <= c && c <= 'F' {

1793 return c - 'A' + 10	1795 return c - 'A' + 10

1794 }	1796 }

1795 return -1	1797 return -1

1796 }	1798 }

LEFT	RIGHT