Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code | Sign in
(137)

Side by Side Diff: src/pkg/exp/regexp/syntax/parse_test.go

Issue 4538123: code review 4538123: exp/regexp/syntax: syntax data structures, parser (Closed)
Patch Set: diff -r 881a0fc6528d https://go.googlecode.com/hg Created 13 years, 9 months ago
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments. Please Sign in to add in-line comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/pkg/exp/regexp/syntax/parse.go ('k') | src/pkg/exp/regexp/syntax/regexp.go » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show/Hide Comments ('s')
OLD | NEW
(Empty)
1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 package syntax
6
7 import (
8 "bytes"
9 "fmt"
10 "testing"
11 "unicode"
12 )
13
// parseTests is the table driving TestParseDump. Each entry pairs a regular
// expression source string (Regexp) with the exact text that dump is expected
// to produce for its parse tree (Dump).
//
// Entries that are commented out are not exercised; they are kept alongside
// the active ones (presumably until the parser produces those forms — confirm
// before enabling).
var parseTests = []struct {
	Regexp, Dump string
}{
	// Base cases
	{Regexp: "a", Dump: "lit{a}"},
	{Regexp: "a.", Dump: "cat{lit{a}dot{}}"},
	{Regexp: "a.b", Dump: "cat{lit{a}dot{}lit{b}}"},
	// {Regexp: "ab", Dump: "str{ab}"},
	{Regexp: "ab", Dump: "cat{lit{a}lit{b}}"},
	{Regexp: "a.b.c", Dump: "cat{lit{a}dot{}lit{b}dot{}lit{c}}"},
	// {Regexp: "abc", Dump: "str{abc}"},
	{Regexp: "abc", Dump: "cat{lit{a}lit{b}lit{c}}"},
	{Regexp: "a|^", Dump: "alt{lit{a}bol{}}"},
	// {Regexp: "a|b", Dump: "cc{0x61-0x62}"},
	{Regexp: "a|b", Dump: "alt{lit{a}lit{b}}"},
	{Regexp: "(a)", Dump: "cap{lit{a}}"},
	{Regexp: "(a)|b", Dump: "alt{cap{lit{a}}lit{b}}"},
	{Regexp: "a*", Dump: "star{lit{a}}"},
	{Regexp: "a+", Dump: "plus{lit{a}}"},
	{Regexp: "a?", Dump: "que{lit{a}}"},
	// {Regexp: "a{2}", Dump: "rep{2,2 lit{a}}"},
	// {Regexp: "a{2,3}", Dump: "rep{2,3 lit{a}}"},
	// {Regexp: "a{2,}", Dump: "rep{2,-1 lit{a}}"},
	// {Regexp: "a*?", Dump: "nstar{lit{a}}"},
	// {Regexp: "a+?", Dump: "nplus{lit{a}}"},
	// {Regexp: "a??", Dump: "nque{lit{a}}"},
	// {Regexp: "a{2}?", Dump: "nrep{2,2 lit{a}}"},
	// {Regexp: "a{2,3}?", Dump: "nrep{2,3 lit{a}}"},
	// {Regexp: "a{2,}?", Dump: "nrep{2,-1 lit{a}}"},
	{Regexp: "", Dump: "emp{}"},
	// {Regexp: "|", Dump: "emp{}"}, // alt{emp{}emp{}} but got factored
	// {Regexp: "|", Dump: "alt{emp{}emp{}}"},
	{Regexp: "|x|", Dump: "alt{emp{}lit{x}emp{}}"},
	{Regexp: ".", Dump: "dot{}"},
	{Regexp: "^", Dump: "bol{}"},
	{Regexp: "$", Dump: "eol{}"},
	// {Regexp: "\\|", Dump: "lit{|}"},
	// {Regexp: "\\(", Dump: "lit{(}"},
	// {Regexp: "\\)", Dump: "lit{)}"},
	// {Regexp: "\\*", Dump: "lit{*}"},
	// {Regexp: "\\+", Dump: "lit{+}"},
	// {Regexp: "\\?", Dump: "lit{?}"},
	// {Regexp: "{", Dump: "lit{{}"},
	{Regexp: "}", Dump: "lit{}}"},
	// {Regexp: "\\.", Dump: "lit{.}"},
	// {Regexp: "\\^", Dump: "lit{^}"},
	// {Regexp: "\\$", Dump: "lit{$}"},
	// {Regexp: "\\\\", Dump: "lit{\\}"},
	{Regexp: "[ace]", Dump: "cc{0x61 0x63 0x65}"},
	{Regexp: "[abc]", Dump: "cc{0x61-0x63}"},
	{Regexp: "[a-z]", Dump: "cc{0x61-0x7a}"},
	// {Regexp: "[a]", Dump: "lit{a}"},
	{Regexp: "[a]", Dump: "cc{0x61}"},
	// {Regexp: "\\-", Dump: "lit{-}"},
	{Regexp: "-", Dump: "lit{-}"},
	// {Regexp: "\\_", Dump: "lit{_}"},

	// Posix and Perl extensions
	// {Regexp: "[[:lower:]]", Dump: "cc{0x61-0x7a}"},
	// {Regexp: "[a-z]", Dump: "cc{0x61-0x7a}"},
	// {Regexp: "[^[:lower:]]", Dump: "cc{0x0-0x60 0x7b-0x10ffff}"},
	// {Regexp: "[[:^lower:]]", Dump: "cc{0x0-0x60 0x7b-0x10ffff}"},
	// {Regexp: "(?i)[[:lower:]]", Dump: "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}"},
	// {Regexp: "(?i)[a-z]", Dump: "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}"},
	// {Regexp: "(?i)[^[:lower:]]", Dump: "cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}"},
	// {Regexp: "(?i)[[:^lower:]]", Dump: "cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}"},
	// {Regexp: "\\d", Dump: "cc{0x30-0x39}"},
	// {Regexp: "\\D", Dump: "cc{0x0-0x2f 0x3a-0x10ffff}"},
	// {Regexp: "\\s", Dump: "cc{0x9-0xa 0xc-0xd 0x20}"},
	// {Regexp: "\\S", Dump: "cc{0x0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}"},
	// {Regexp: "\\w", Dump: "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}"},
	// {Regexp: "\\W", Dump: "cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}"},
	// {Regexp: "(?i)\\w", Dump: "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}"},
	// {Regexp: "(?i)\\W", Dump: "cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}"},
	// {Regexp: "[^\\\\]", Dump: "cc{0x0-0x5b 0x5d-0x10ffff}"},
	// {Regexp: "\\C", Dump: "byte{}"},

	// Unicode, negatives, and a double negative.
	// {Regexp: "\\p{Braille}", Dump: "cc{0x2800-0x28ff}"},
	// {Regexp: "\\P{Braille}", Dump: "cc{0x0-0x27ff 0x2900-0x10ffff}"},
	// {Regexp: "\\p{^Braille}", Dump: "cc{0x0-0x27ff 0x2900-0x10ffff}"},
	// {Regexp: "\\P{^Braille}", Dump: "cc{0x2800-0x28ff}"},

	// More interesting regular expressions.
	// {Regexp: "a{,2}", Dump: "str{a{,2}}"},
	// {Regexp: "\\.\\^\\$\\\\", Dump: "str{.^$\\}"},
	{Regexp: "[a-zABC]", Dump: "cc{0x41-0x43 0x61-0x7a}"},
	{Regexp: "[^a]", Dump: "cc{0x0-0x60 0x62-0x10ffff}"},
	{Regexp: "[\xce\xb1-\xce\xb5\xe2\x98\xba]", Dump: "cc{0x3b1-0x3b5 0x263a}"}, // utf-8
	// {Regexp: "a*{", Dump: "cat{star{lit{a}}lit{{}}"},

	// Test precedences
	// {Regexp: "(?:ab)*", Dump: "star{str{ab}}"},
	// {Regexp: "(ab)*", Dump: "star{cap{str{ab}}}"},
	// {Regexp: "ab|cd", Dump: "alt{str{ab}str{cd}}"},
	// {Regexp: "a(b|c)d", Dump: "cat{lit{a}cap{cc{0x62-0x63}}lit{d}}"},
	{Regexp: "(?:ab)*", Dump: "star{cat{lit{a}lit{b}}}"},
	{Regexp: "(ab)*", Dump: "star{cap{cat{lit{a}lit{b}}}}"},
	{Regexp: "ab|cd", Dump: "alt{cat{lit{a}lit{b}}cat{lit{c}lit{d}}}"},
	{Regexp: "a(b|c)d", Dump: "cat{lit{a}cap{alt{lit{b}lit{c}}}lit{d}}"},

	// Test flattening.
	// {Regexp: "(?:a)", Dump: "lit{a}"},
	// {Regexp: "(?:ab)(?:cd)", Dump: "str{abcd}"},
	// {Regexp: "(?:a|b)|(?:c|d)", Dump: "cc{0x61-0x64}"},
	// {Regexp: "a|.", Dump: "dot{}"},
	// {Regexp: ".|a", Dump: "dot{}"},

	// Test Perl quoted literals
	// {Regexp: "\\Q+|*?{[\\E", Dump: "str{+|*?{[}"},
	// {Regexp: "\\Q+\\E+", Dump: "plus{lit{+}}"},
	// {Regexp: "\\Q\\\\E", Dump: "lit{\\}"},
	// {Regexp: "\\Q\\\\\\E", Dump: "str{\\\\}"},

	// Test Perl \A and \z
	// {Regexp: "(?m)^", Dump: "bol{}"},
	// {Regexp: "(?m)$", Dump: "eol{}"},
	// {Regexp: "(?-m)^", Dump: "bot{}"},
	// {Regexp: "(?-m)$", Dump: "eot{}"},
	// {Regexp: "(?m)\\A", Dump: "bot{}"},
	// {Regexp: "(?m)\\z", Dump: "eot{\\z}"},
	// {Regexp: "(?-m)\\A", Dump: "bot{}"},
	// {Regexp: "(?-m)\\z", Dump: "eot{\\z}"},

	// Test named captures
	// {Regexp: "(?P<name>a)", Dump: "cap{name:lit{a}}"},

	// Case-folded literals
	// {Regexp: "[Aa]", Dump: "litfold{a}"},

	// Strings
	// {Regexp: "abcde", Dump: "str{abcde}"},
	// {Regexp: "[Aa][Bb]cd", Dump: "cat{strfold{ab}str{cd}}"},
}
149
150 const testFlags = MatchNL | PerlX | UnicodeGroups
151
152 // Test Parse -> Dump.
153 func TestParseDump(t *testing.T) {
154 for _, tt := range parseTests {
155 re, err := Parse(tt.Regexp, testFlags)
156 if err != nil {
157 t.Errorf("Parse(%#q): %v", tt.Regexp, err)
158 continue
159 }
160 d := dump(re)
161 if d != tt.Dump {
162 t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump)
163 }
164 }
165 }
166
167 // dump prints a string representation of the regexp showing
168 // the structure explicitly.
169 func dump(re *Regexp) string {
170 var b bytes.Buffer
171 dumpRegexp(&b, re)
172 return b.String()
173 }
174
175 var opNames = []string{
176 OpNoMatch: "no",
177 OpEmptyMatch: "emp",
178 OpLiteral: "lit",
179 OpCharClass: "cc",
180 OpAnyCharNotNL: "dnl",
181 OpAnyChar: "dot",
182 OpBeginLine: "bol",
183 OpEndLine: "eol",
184 OpBeginText: "bot",
185 OpEndText: "eot",
186 OpWordBoundary: "wb",
187 OpNoWordBoundary: "nwb",
188 OpCapture: "cap",
189 OpStar: "star",
190 OpPlus: "plus",
191 OpQuest: "que",
192 OpRepeat: "rep",
193 OpConcat: "cat",
194 OpAlternate: "alt",
195 }
196
197 // dumpRegexp writes an encoding of the syntax tree for the regexp re to b.
198 // It is used during testing to distinguish between parses that might print
199 // the same using re's String method.
200 func dumpRegexp(b *bytes.Buffer, re *Regexp) {
201 if int(re.Op) >= len(opNames) || opNames[re.Op] == "" {
202 fmt.Fprintf(b, "op%d", re.Op)
203 } else {
204 switch re.Op {
205 default:
206 b.WriteString(opNames[re.Op])
207 case OpStar, OpPlus, OpQuest, OpRepeat:
208 if re.Flags&NonGreedy != 0 {
209 b.WriteByte('n')
210 }
211 b.WriteString(opNames[re.Op])
212 case OpLiteral:
213 if len(re.Rune) > 1 {
214 b.WriteString("str")
215 } else {
216 b.WriteString("lit")
217 }
218 if re.Flags&FoldCase != 0 {
219 for _, r := range re.Rune {
220 if unicode.ToUpper(r) != r {
221 b.WriteString("fold")
222 }
223 }
224 }
225 }
226 }
227 b.WriteByte('{')
228 switch re.Op {
229 case OpEndText:
230 if re.Flags&WasDollar == 0 {
231 b.WriteString(`\z`)
232 }
233 case OpLiteral:
234 for _, r := range re.Rune {
235 b.WriteRune(r)
236 }
237 case OpConcat, OpAlternate:
238 for _, sub := range re.Sub {
239 dumpRegexp(b, sub)
240 }
241 case OpStar, OpPlus, OpQuest:
242 dumpRegexp(b, re.Sub[0])
243 case OpRepeat:
244 fmt.Fprintf(b, "%d,%d ", re.Min, re.Max)
245 dumpRegexp(b, re.Sub[0])
246 case OpCapture:
247 if re.Name != "" {
248 b.WriteString(re.Name)
249 b.WriteByte(':')
250 }
251 dumpRegexp(b, re.Sub[0])
252 case OpCharClass:
253 sep := ""
254 for i := 0; i < len(re.Rune); i += 2 {
255 b.WriteString(sep)
256 sep = " "
257 lo, hi := re.Rune[i], re.Rune[i+1]
258 if lo == hi {
259 fmt.Fprintf(b, "%#x", lo)
260 } else {
261 fmt.Fprintf(b, "%#x-%#x", lo, hi)
262 }
263 }
264 }
265 b.WriteByte('}')
266 }
OLD | NEW
« no previous file with comments | « src/pkg/exp/regexp/syntax/parse.go ('k') | src/pkg/exp/regexp/syntax/regexp.go » ('j') | no next file with comments »

Powered by Google App Engine
RSS Feeds Recent Issues | This issue
This is Rietveld f62528b