src/pkg/exp/regexp/syntax/parse.go - Issue 4538123: code review 4538123: exp/regexp/syntax: syntax data structures, parser

Delta Between Two Patch Sets: src/pkg/exp/regexp/syntax/parse.go

Issue 4538123: code review 4538123: exp/regexp/syntax: syntax data structures, parser (Closed)

Left Patch Set: diff -r 69f12bae0f09 https://go.googlecode.com/hg Created 13 years, 10 months ago

Right Patch Set: diff -r 881a0fc6528d https://go.googlecode.com/hg Created 13 years, 9 months ago

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments. Please Sign in to add in-line comments.

Jump to:

Left: Side by side diff | Download
Right: Side by side diff | Download

LEFT	RIGHT
1 // Copyright 2011 The Go Authors. All rights reserved.	1 // Copyright 2011 The Go Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style	2 // Use of this source code is governed by a BSD-style

3 // license that can be found in the LICENSE file.	3 // license that can be found in the LICENSE file.

4	4

5 package syntax	5 package syntax

6	6

7 import (	7 import (

8 "os"	8 "os"

9 "sort"	9 "sort"

10 "unicode"	10 "unicode"

(...skipping 42 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
53	53

54 const (	54 const (

55 FoldCase Flags = 1 << iota // case-insensitive match	55 FoldCase Flags = 1 << iota // case-insensitive match

56 Literal // treat pattern as literal string	56 Literal // treat pattern as literal string

57 ClassNL // allow character classes like [^a-z] and [[:space:]] to match newline	57 ClassNL // allow character classes like [^a-z] and [[:space:]] to match newline

58 DotNL // allow . to match newline	58 DotNL // allow . to match newline

59 OneLine // treat ^ and $ as only matching at beginning and end of text	59 OneLine // treat ^ and $ as only matching at beginning and end of text

60 NonGreedy // make repetition operators default to non-greedy	60 NonGreedy // make repetition operators default to non-greedy

61 PerlX // allow Perl extensions	61 PerlX // allow Perl extensions

62 UnicodeGroups // allow \p{Han}, \P{Han} for Unicode group and negation	62 UnicodeGroups // allow \p{Han}, \P{Han} for Unicode group and negation

63 WasDollar // regexp OpEndText was $, not \z	63 WasDollar // regexp OpEndText was $, not \z
Sam 2011/06/10 00:14:55: I'm kind of confused as to why you need this in this way, and don't need a parallel for ^ vs \a - it's clearly for use in a follow-up CL. I suspect you know what you're doing :)
rsc 2011/06/10 00:27:38: On 2011/06/10 00:14:55, Sam wrote: > I'm kind of confused as to why you need this in this way, and don't need a > parallel for ^ vs \a - it's clearly for use in a follow-up CL. I suspect you > know what you're doing :) In Perl, there is \Z (match end of text or right before \n at end of text) and then there is \z (end of text only). In Perl single-line mode $ means \Z, but we don't support \Z, so in our single-line mode we map $ to \z. If we ever want to run tests against PCRE to make sure we get the same answers, it's important to know if a particular OpEndText was a $, because then we should expect our library and PCRE to disagree once in a while about whether something matches. There is no equivalent for ^ and \A because Perl didn't screw up \A.
64 Simple // regexp contains no counted repetition	64 Simple // regexp contains no counted repetition

65	65

66 MatchNL = ClassNL \| DotNL	66 MatchNL = ClassNL \| DotNL

67	67

68 Perl = ClassNL \| OneLine \| PerlX \| UnicodeGroups // as close to Perl as possible	68 Perl = ClassNL \| OneLine \| PerlX \| UnicodeGroups // as close to Perl as possible

69 POSIX Flags = 0 // POSIX syntax	69 POSIX Flags = 0 // POSIX syntax

70

71 )	70 )

72	71

73 // Pseudo-ops for parsing stack.	72 // Pseudo-ops for parsing stack.

74 const (	73 const (

75 opLeftParen = opPseudo + iota	74 opLeftParen = opPseudo + iota

76 opVerticalBar	75 opVerticalBar

77 )	76 )

78	77

79 type parser struct {	78 type parser struct {

80 flags Flags // parse mode flags	79 flags Flags // parse mode flags

(...skipping 76 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
157 re = &Regexp{Op: OpConcat}	156 re = &Regexp{Op: OpConcat}

158 re.Sub = append(re.Sub0[:0], sub...)	157 re.Sub = append(re.Sub0[:0], sub...)

159 }	158 }

160 return p.push(re)	159 return p.push(re)

161 }	160 }

162	161

163 // alternate replaces the top of the stack (above the topmost '(') with its alternation.	162 // alternate replaces the top of the stack (above the topmost '(') with its alternation.

164 func (p parser) alternate() Regexp {	163 func (p parser) alternate() Regexp {

165 // TODO: Flatten alternates.	164 // TODO: Flatten alternates.

166	165

167 » // Scan down to find pseudo-operator \| or (.	166 » // Scan down to find pseudo-operator (.

	167 » // There are no \| above (.

168 i := len(p.stack)	168 i := len(p.stack)

169 for i > 0 && p.stack[i-1].Op < opPseudo {	169 for i > 0 && p.stack[i-1].Op < opPseudo {

170 i--	170 i--

171 }	171 }

172 sub := p.stack[i:]	172 sub := p.stack[i:]

173 p.stack = p.stack[:i]	173 p.stack = p.stack[:i]

174	174

175 var re *Regexp	175 var re *Regexp

176 switch len(sub) {	176 switch len(sub) {

177 case 0:	177 case 0:

(...skipping 24 matching lines...) Expand all Loading...
202 if len(re.Rune) >= cap(re.Rune) {	202 if len(re.Rune) >= cap(re.Rune) {

203 // string is too long to fit in Rune0. let Go handle it	203 // string is too long to fit in Rune0. let Go handle it

204 re.Rune = []int(s)	204 re.Rune = []int(s)

205 break	205 break

206 }	206 }

207 re.Rune = append(re.Rune, c)	207 re.Rune = append(re.Rune, c)

208 }	208 }

209 return re, nil	209 return re, nil

210 }	210 }

211	211

212 » // Otherwise, have to do real work.	212 » // Otherwise, must do real work.

213 var (	213 var (

214 p parser	214 p parser

215 err os.Error	215 err os.Error

216 c int	216 c int

217 op Op	217 op Op

218 )	218 )

219 p.flags = flags	219 p.flags = flags

220 p.wholeRegexp = s	220 p.wholeRegexp = s

221 t := s	221 t := s

222 for t != "" {	222 for t != "" {

(...skipping 173 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
396 t = t[1:]	396 t = t[1:]

397	397

398 // If character class does not match \n, add it here,	398 // If character class does not match \n, add it here,

399 // so that negation later will do the right thing.	399 // so that negation later will do the right thing.

400 if p.flags&ClassNL == 0 {	400 if p.flags&ClassNL == 0 {

401 re.Rune = append(re.Rune, '\n', '\n')	401 re.Rune = append(re.Rune, '\n', '\n')

402 }	402 }

403 }	403 }

404	404

405 class := re.Rune	405 class := re.Rune

406 » first := true // ] is okay as first char in class	406 » first := true // ] and - are okay as first char in class

407 for t == "" \|\| t[0] != ']' \|\| first {	407 for t == "" \|\| t[0] != ']' \|\| first {

408 // POSIX: - is only okay unescaped as first or last in class.	408 // POSIX: - is only okay unescaped as first or last in class.

409 // Perl: - is okay anywhere.	409 // Perl: - is okay anywhere.

410 if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 \|\| t[1] != ']') {	410 if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 \|\| t[1] != ']') {

411 _, size := utf8.DecodeRuneInString(t[1:])	411 _, size := utf8.DecodeRuneInString(t[1:])

412 return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]}	412 return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]}

413 }	413 }

414 first = false	414 first = false

415	415

416 // TODO: Look for [:alnum:]	416 // TODO: Look for [:alnum:]

(...skipping 135 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
552 return nil	552 return nil

553 }	553 }

554	554

555 func nextRune(s string) (c int, t string, err os.Error) {	555 func nextRune(s string) (c int, t string, err os.Error) {

556 c, size := utf8.DecodeRuneInString(s)	556 c, size := utf8.DecodeRuneInString(s)

557 if c == utf8.RuneError && size == 1 {	557 if c == utf8.RuneError && size == 1 {

558 return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s}	558 return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s}

559 }	559 }

560 return c, s[size:], nil	560 return c, s[size:], nil

561 }	561 }

LEFT	RIGHT