OLD | NEW |
(Empty) | |
| 1 // Copyright 2011 The Go Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style |
| 3 // license that can be found in the LICENSE file. |
| 4 |
| 5 package syntax |
| 6 |
| 7 import ( |
| 8 "bytes" |
| 9 "fmt" |
| 10 "testing" |
| 11 "unicode" |
| 12 ) |
| 13 |
// parseTests pairs each regular expression with the dump string that
// TestParseDump expects for its parsed syntax tree.
//
// Entries that are commented out are not run.
// NOTE(review): they presumably describe syntax or simplifications the
// parser does not produce yet; confirm before enabling any of them.
var parseTests = []struct {
	Regexp string
	Dump   string
}{
	// Base cases
	{"a", "lit{a}"},
	{"a.", "cat{lit{a}dot{}}"},
	{"a.b", "cat{lit{a}dot{}lit{b}}"},
	// { "ab", "str{ab}" },
	{"ab", "cat{lit{a}lit{b}}"},
	{"a.b.c", "cat{lit{a}dot{}lit{b}dot{}lit{c}}"},
	// { "abc", "str{abc}" },
	{"abc", "cat{lit{a}lit{b}lit{c}}"},
	{"a|^", "alt{lit{a}bol{}}"},
	// { "a|b", "cc{0x61-0x62}" },
	{"a|b", "alt{lit{a}lit{b}}"},
	{"(a)", "cap{lit{a}}"},
	{"(a)|b", "alt{cap{lit{a}}lit{b}}"},
	{"a*", "star{lit{a}}"},
	{"a+", "plus{lit{a}}"},
	{"a?", "que{lit{a}}"},
	// { "a{2}", "rep{2,2 lit{a}}" },
	// { "a{2,3}", "rep{2,3 lit{a}}" },
	// { "a{2,}", "rep{2,-1 lit{a}}" },
	// { "a*?", "nstar{lit{a}}" },
	// { "a+?", "nplus{lit{a}}" },
	// { "a??", "nque{lit{a}}" },
	// { "a{2}?", "nrep{2,2 lit{a}}" },
	// { "a{2,3}?", "nrep{2,3 lit{a}}" },
	// { "a{2,}?", "nrep{2,-1 lit{a}}" },
	{"", "emp{}"},
	// { "|", "emp{}" }, // alt{emp{}emp{}} but got factored
	// { "|", "alt{emp{}emp{}}" },
	{"|x|", "alt{emp{}lit{x}emp{}}"},
	{".", "dot{}"},
	{"^", "bol{}"},
	{"$", "eol{}"},
	// { "\\|", "lit{|}" },
	// { "\\(", "lit{(}" },
	// { "\\)", "lit{)}" },
	// { "\\*", "lit{*}" },
	// { "\\+", "lit{+}" },
	// { "\\?", "lit{?}" },
	// { "{", "lit{{}" },
	{"}", "lit{}}"},
	// { "\\.", "lit{.}" },
	// { "\\^", "lit{^}" },
	// { "\\$", "lit{$}" },
	// { "\\\\", "lit{\\}" },
	{"[ace]", "cc{0x61 0x63 0x65}"},
	{"[abc]", "cc{0x61-0x63}"},
	{"[a-z]", "cc{0x61-0x7a}"},
	// { "[a]", "lit{a}" },
	{"[a]", "cc{0x61}"},
	// { "\\-", "lit{-}" },
	{"-", "lit{-}"},
	// { "\\_", "lit{_}" },

	// Posix and Perl extensions
	// { "[[:lower:]]", "cc{0x61-0x7a}" },
	// { "[a-z]", "cc{0x61-0x7a}" },
	// { "[^[:lower:]]", "cc{0x0-0x60 0x7b-0x10ffff}" },
	// { "[[:^lower:]]", "cc{0x0-0x60 0x7b-0x10ffff}" },
	// { "(?i)[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
	// { "(?i)[a-z]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
	// { "(?i)[^[:lower:]]", "cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
	// { "(?i)[[:^lower:]]", "cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
	// { "\\d", "cc{0x30-0x39}" },
	// { "\\D", "cc{0x0-0x2f 0x3a-0x10ffff}" },
	// { "\\s", "cc{0x9-0xa 0xc-0xd 0x20}" },
	// { "\\S", "cc{0x0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}" },
	// { "\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}" },
	// { "\\W", "cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}" },
	// { "(?i)\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}" },
	// { "(?i)\\W", "cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
	// { "[^\\\\]", "cc{0x0-0x5b 0x5d-0x10ffff}" },
	// { "\\C", "byte{}" },

	// Unicode, negatives, and a double negative.
	// { "\\p{Braille}", "cc{0x2800-0x28ff}" },
	// { "\\P{Braille}", "cc{0x0-0x27ff 0x2900-0x10ffff}" },
	// { "\\p{^Braille}", "cc{0x0-0x27ff 0x2900-0x10ffff}" },
	// { "\\P{^Braille}", "cc{0x2800-0x28ff}" },

	// More interesting regular expressions.
	// { "a{,2}", "str{a{,2}}" },
	// { "\\.\\^\\$\\\\", "str{.^$\\}" },
	{"[a-zABC]", "cc{0x41-0x43 0x61-0x7a}"},
	{"[^a]", "cc{0x0-0x60 0x62-0x10ffff}"},
	{"[\xce\xb1-\xce\xb5\xe2\x98\xba]", "cc{0x3b1-0x3b5 0x263a}"}, // utf-8
	// { "a*{", "cat{star{lit{a}}lit{{}}" },

	// Test precedences
	// { "(?:ab)*", "star{str{ab}}" },
	// { "(ab)*", "star{cap{str{ab}}}" },
	// { "ab|cd", "alt{str{ab}str{cd}}" },
	// { "a(b|c)d", "cat{lit{a}cap{cc{0x62-0x63}}lit{d}}" },
	{"(?:ab)*", "star{cat{lit{a}lit{b}}}"},
	{"(ab)*", "star{cap{cat{lit{a}lit{b}}}}"},
	{"ab|cd", "alt{cat{lit{a}lit{b}}cat{lit{c}lit{d}}}"},
	{"a(b|c)d", "cat{lit{a}cap{alt{lit{b}lit{c}}}lit{d}}"},

	// Test flattening.
	// { "(?:a)", "lit{a}" },
	// { "(?:ab)(?:cd)", "str{abcd}" },
	// { "(?:a|b)|(?:c|d)", "cc{0x61-0x64}" },
	// { "a|.", "dot{}" },
	// { ".|a", "dot{}" },

	// Test Perl quoted literals
	// { "\\Q+|*?{[\\E", "str{+|*?{[}" },
	// { "\\Q+\\E+", "plus{lit{+}}" },
	// { "\\Q\\\\E", "lit{\\}" },
	// { "\\Q\\\\\\E", "str{\\\\}" },

	// Test Perl \A and \z
	// { "(?m)^", "bol{}" },
	// { "(?m)$", "eol{}" },
	// { "(?-m)^", "bot{}" },
	// { "(?-m)$", "eot{}" },
	// { "(?m)\\A", "bot{}" },
	// { "(?m)\\z", "eot{\\z}" },
	// { "(?-m)\\A", "bot{}" },
	// { "(?-m)\\z", "eot{\\z}" },

	// Test named captures
	// { "(?P<name>a)", "cap{name:lit{a}}" },

	// Case-folded literals
	// { "[Aa]", "litfold{a}" },

	// Strings
	// { "abcde", "str{abcde}" },
	// { "[Aa][Bb]cd", "cat{strfold{ab}str{cd}}" },
}
| 149 |
| 150 const testFlags = MatchNL | PerlX | UnicodeGroups |
| 151 |
| 152 // Test Parse -> Dump. |
| 153 func TestParseDump(t *testing.T) { |
| 154 for _, tt := range parseTests { |
| 155 re, err := Parse(tt.Regexp, testFlags) |
| 156 if err != nil { |
| 157 t.Errorf("Parse(%#q): %v", tt.Regexp, err) |
| 158 continue |
| 159 } |
| 160 d := dump(re) |
| 161 if d != tt.Dump { |
| 162 t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp,
d, tt.Dump) |
| 163 } |
| 164 } |
| 165 } |
| 166 |
| 167 // dump prints a string representation of the regexp showing |
| 168 // the structure explicitly. |
| 169 func dump(re *Regexp) string { |
| 170 var b bytes.Buffer |
| 171 dumpRegexp(&b, re) |
| 172 return b.String() |
| 173 } |
| 174 |
| 175 var opNames = []string{ |
| 176 OpNoMatch: "no", |
| 177 OpEmptyMatch: "emp", |
| 178 OpLiteral: "lit", |
| 179 OpCharClass: "cc", |
| 180 OpAnyCharNotNL: "dnl", |
| 181 OpAnyChar: "dot", |
| 182 OpBeginLine: "bol", |
| 183 OpEndLine: "eol", |
| 184 OpBeginText: "bot", |
| 185 OpEndText: "eot", |
| 186 OpWordBoundary: "wb", |
| 187 OpNoWordBoundary: "nwb", |
| 188 OpCapture: "cap", |
| 189 OpStar: "star", |
| 190 OpPlus: "plus", |
| 191 OpQuest: "que", |
| 192 OpRepeat: "rep", |
| 193 OpConcat: "cat", |
| 194 OpAlternate: "alt", |
| 195 } |
| 196 |
| 197 // dumpRegexp writes an encoding of the syntax tree for the regexp re to b. |
| 198 // It is used during testing to distinguish between parses that might print |
| 199 // the same using re's String method. |
| 200 func dumpRegexp(b *bytes.Buffer, re *Regexp) { |
| 201 if int(re.Op) >= len(opNames) || opNames[re.Op] == "" { |
| 202 fmt.Fprintf(b, "op%d", re.Op) |
| 203 } else { |
| 204 switch re.Op { |
| 205 default: |
| 206 b.WriteString(opNames[re.Op]) |
| 207 case OpStar, OpPlus, OpQuest, OpRepeat: |
| 208 if re.Flags&NonGreedy != 0 { |
| 209 b.WriteByte('n') |
| 210 } |
| 211 b.WriteString(opNames[re.Op]) |
| 212 case OpLiteral: |
| 213 if len(re.Rune) > 1 { |
| 214 b.WriteString("str") |
| 215 } else { |
| 216 b.WriteString("lit") |
| 217 } |
| 218 if re.Flags&FoldCase != 0 { |
| 219 for _, r := range re.Rune { |
| 220 if unicode.ToUpper(r) != r { |
| 221 b.WriteString("fold") |
| 222 } |
| 223 } |
| 224 } |
| 225 } |
| 226 } |
| 227 b.WriteByte('{') |
| 228 switch re.Op { |
| 229 case OpEndText: |
| 230 if re.Flags&WasDollar == 0 { |
| 231 b.WriteString(`\z`) |
| 232 } |
| 233 case OpLiteral: |
| 234 for _, r := range re.Rune { |
| 235 b.WriteRune(r) |
| 236 } |
| 237 case OpConcat, OpAlternate: |
| 238 for _, sub := range re.Sub { |
| 239 dumpRegexp(b, sub) |
| 240 } |
| 241 case OpStar, OpPlus, OpQuest: |
| 242 dumpRegexp(b, re.Sub[0]) |
| 243 case OpRepeat: |
| 244 fmt.Fprintf(b, "%d,%d ", re.Min, re.Max) |
| 245 dumpRegexp(b, re.Sub[0]) |
| 246 case OpCapture: |
| 247 if re.Name != "" { |
| 248 b.WriteString(re.Name) |
| 249 b.WriteByte(':') |
| 250 } |
| 251 dumpRegexp(b, re.Sub[0]) |
| 252 case OpCharClass: |
| 253 sep := "" |
| 254 for i := 0; i < len(re.Rune); i += 2 { |
| 255 b.WriteString(sep) |
| 256 sep = " " |
| 257 lo, hi := re.Rune[i], re.Rune[i+1] |
| 258 if lo == hi { |
| 259 fmt.Fprintf(b, "%#x", lo) |
| 260 } else { |
| 261 fmt.Fprintf(b, "%#x-%#x", lo, hi) |
| 262 } |
| 263 } |
| 264 } |
| 265 b.WriteByte('}') |
| 266 } |
OLD | NEW |