src/pkg/exp/regexp/syntax/parse_test.go - Issue 4538123: code review 4538123: exp/regexp/syntax: syntax data structures, parser

Side by Side Diff: src/pkg/exp/regexp/syntax/parse_test.go

Issue 4538123: code review 4538123: exp/regexp/syntax: syntax data structures, parser (Closed)

Patch Set: diff -r 881a0fc6528d https://go.googlecode.com/hg Created 13 years, 9 months ago

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments. Please Sign in to add in-line comments.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 // Copyright 2011 The Go Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style

	3 // license that can be found in the LICENSE file.

	4

	5 package syntax

	6

	7 import (

	8 "bytes"

	9 "fmt"

	10 "testing"

	11 "unicode"

	12 )

	13

	// parseTests pairs each regular expression with the dump-format encoding
	// (see dumpRegexp) that TestParseDump expects Parse to produce for it.
	//
	// Commented-out entries are preserved as written; presumably they cover
	// syntax or simplifications the parser does not handle yet - confirm
	// against parse.go before enabling them.
	var parseTests = []struct {
		Regexp string
		Dump   string
	}{
		// Base cases
		{"a", "lit{a}"},
		{"a.", "cat{lit{a}dot{}}"},
		{"a.b", "cat{lit{a}dot{}lit{b}}"},
		// { "ab", "str{ab}" },
		{"ab", "cat{lit{a}lit{b}}"},
		{"a.b.c", "cat{lit{a}dot{}lit{b}dot{}lit{c}}"},
		// { "abc", "str{abc}" },
		{"abc", "cat{lit{a}lit{b}lit{c}}"},
		{"a|^", "alt{lit{a}bol{}}"},
		// { "a|b", "cc{0x61-0x62}" },
		{"a|b", "alt{lit{a}lit{b}}"},
		{"(a)", "cap{lit{a}}"},
		{"(a)|b", "alt{cap{lit{a}}lit{b}}"},
		{"a*", "star{lit{a}}"},
		{"a+", "plus{lit{a}}"},
		{"a?", "que{lit{a}}"},
		// { "a{2}", "rep{2,2 lit{a}}" },
		// { "a{2,3}", "rep{2,3 lit{a}}" },
		// { "a{2,}", "rep{2,-1 lit{a}}" },
		// { "a*?", "nstar{lit{a}}" },
		// { "a+?", "nplus{lit{a}}" },
		// { "a??", "nque{lit{a}}" },
		// { "a{2}?", "nrep{2,2 lit{a}}" },
		// { "a{2,3}?", "nrep{2,3 lit{a}}" },
		// { "a{2,}?", "nrep{2,-1 lit{a}}" },
		{"", "emp{}"},
		// { "|", "emp{}" }, // alt{emp{}emp{}} but got factored
		// { "|", "alt{emp{}emp{}}" },
		{"|x|", "alt{emp{}lit{x}emp{}}"},
		{".", "dot{}"},
		{"^", "bol{}"},
		{"$", "eol{}"},
		// { "\\|", "lit{|}" },
		// { "\\(", "lit{(}" },
		// { "\\)", "lit{)}" },
		// { "\\", "lit{}" },
		// { "\\+", "lit{+}" },
		// { "\\?", "lit{?}" },
		// { "{", "lit{{}" },
		{"}", "lit{}}"},
		// { "\\.", "lit{.}" },
		// { "\\^", "lit{^}" },
		// { "\\$", "lit{$}" },
		// { "\\\\", "lit{\\}" },
		{"[ace]", "cc{0x61 0x63 0x65}"},
		{"[abc]", "cc{0x61-0x63}"},
		{"[a-z]", "cc{0x61-0x7a}"},
		// { "[a]", "lit{a}" },
		{"[a]", "cc{0x61}"},
		// { "\\-", "lit{-}" },
		{"-", "lit{-}"},
		// { "\\_", "lit{_}" },

		// Posix and Perl extensions
		// { "[[:lower:]]", "cc{0x61-0x7a}" },
		// { "[a-z]", "cc{0x61-0x7a}" },
		// { "[^[:lower:]]", "cc{0x0-0x60 0x7b-0x10ffff}" },
		// { "[[:^lower:]]", "cc{0x0-0x60 0x7b-0x10ffff}" },
		// { "(?i)[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
		// { "(?i)[a-z]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
		// { "(?i)[^[:lower:]]", "cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
		// { "(?i)[[:^lower:]]", "cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
		// { "\\d", "cc{0x30-0x39}" },
		// { "\\D", "cc{0x0-0x2f 0x3a-0x10ffff}" },
		// { "\\s", "cc{0x9-0xa 0xc-0xd 0x20}" },
		// { "\\S", "cc{0x0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}" },
		// { "\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}" },
		// { "\\W", "cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}" },
		// { "(?i)\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}" },
		// { "(?i)\\W", "cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
		// { "[^\\\\]", "cc{0x0-0x5b 0x5d-0x10ffff}" },
		// { "\\C", "byte{}" },

		// Unicode, negatives, and a double negative.
		// { "\\p{Braille}", "cc{0x2800-0x28ff}" },
		// { "\\P{Braille}", "cc{0x0-0x27ff 0x2900-0x10ffff}" },
		// { "\\p{^Braille}", "cc{0x0-0x27ff 0x2900-0x10ffff}" },
		// { "\\P{^Braille}", "cc{0x2800-0x28ff}" },

		// More interesting regular expressions.
		// { "a{,2}", "str{a{,2}}" },
		// { "\\.\\^\\$\\\\", "str{.^$\\}" },
		{"[a-zABC]", "cc{0x41-0x43 0x61-0x7a}"},
		{"[^a]", "cc{0x0-0x60 0x62-0x10ffff}"},
		{"[\xce\xb1-\xce\xb5\xe2\x98\xba]", "cc{0x3b1-0x3b5 0x263a}"}, // utf-8
		// { "a*{", "cat{star{lit{a}}lit{{}}" },

		// Test precedences
		// { "(?:ab)*", "star{str{ab}}" },
		// { "(ab)*", "star{cap{str{ab}}}" },
		// { "ab|cd", "alt{str{ab}str{cd}}" },
		// { "a(b|c)d", "cat{lit{a}cap{cc{0x62-0x63}}lit{d}}" },
		{"(?:ab)*", "star{cat{lit{a}lit{b}}}"},
		{"(ab)*", "star{cap{cat{lit{a}lit{b}}}}"},
		{"ab|cd", "alt{cat{lit{a}lit{b}}cat{lit{c}lit{d}}}"},
		{"a(b|c)d", "cat{lit{a}cap{alt{lit{b}lit{c}}}lit{d}}"},

		// Test flattening.
		// { "(?:a)", "lit{a}" },
		// { "(?:ab)(?:cd)", "str{abcd}" },
		// { "(?:a|b)|(?:c|d)", "cc{0x61-0x64}" },
		// { "a|.", "dot{}" },
		// { ".|a", "dot{}" },

		// Test Perl quoted literals
		// { "\\Q+|?{[\\E", "str{+|?{[}" },
		// { "\\Q+\\E+", "plus{lit{+}}" },
		// { "\\Q\\\\E", "lit{\\}" },
		// { "\\Q\\\\\\E", "str{\\\\}" },

		// Test Perl \A and \z
		// { "(?m)^", "bol{}" },
		// { "(?m)$", "eol{}" },
		// { "(?-m)^", "bot{}" },
		// { "(?-m)$", "eot{}" },
		// { "(?m)\\A", "bot{}" },
		// { "(?m)\\z", "eot{\\z}" },
		// { "(?-m)\\A", "bot{}" },
		// { "(?-m)\\z", "eot{\\z}" },

		// Test named captures
		// { "(?P<name>a)", "cap{name:lit{a}}" },

		// Case-folded literals
		// { "[Aa]", "litfold{a}" },

		// Strings
		// { "abcde", "str{abcde}" },
		// { "[Aa][Bb]cd", "cat{strfold{ab}str{cd}}" },
	}

	149

	150 const testFlags = MatchNL \| PerlX \| UnicodeGroups

	151

	152 // Test Parse -> Dump.

	153 func TestParseDump(t *testing.T) {

	154 for _, tt := range parseTests {

	155 re, err := Parse(tt.Regexp, testFlags)

	156 if err != nil {

	157 t.Errorf("Parse(%#q): %v", tt.Regexp, err)

	158 continue

	159 }

	160 d := dump(re)

	161 if d != tt.Dump {

	162 t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump)

	163 }

	164 }

	165 }

	166

	167 // dump prints a string representation of the regexp showing

	168 // the structure explicitly.

	169 func dump(re *Regexp) string {

	170 var b bytes.Buffer

	171 dumpRegexp(&b, re)

	172 return b.String()

	173 }

	174

	175 var opNames = []string{

	176 OpNoMatch: "no",

	177 OpEmptyMatch: "emp",

	178 OpLiteral: "lit",

	179 OpCharClass: "cc",

	180 OpAnyCharNotNL: "dnl",

	181 OpAnyChar: "dot",

	182 OpBeginLine: "bol",

	183 OpEndLine: "eol",

	184 OpBeginText: "bot",

	185 OpEndText: "eot",

	186 OpWordBoundary: "wb",

	187 OpNoWordBoundary: "nwb",

	188 OpCapture: "cap",

	189 OpStar: "star",

	190 OpPlus: "plus",

	191 OpQuest: "que",

	192 OpRepeat: "rep",

	193 OpConcat: "cat",

	194 OpAlternate: "alt",

	195 }

	196

	197 // dumpRegexp writes an encoding of the syntax tree for the regexp re to b.

	198 // It is used during testing to distinguish between parses that might print

	199 // the same using re's String method.

	200 func dumpRegexp(b bytes.Buffer, re Regexp) {

	201 if int(re.Op) >= len(opNames) \|\| opNames[re.Op] == "" {

	202 fmt.Fprintf(b, "op%d", re.Op)

	203 } else {

	204 switch re.Op {

	205 default:

	206 b.WriteString(opNames[re.Op])

	207 case OpStar, OpPlus, OpQuest, OpRepeat:

	208 if re.Flags&NonGreedy != 0 {

	209 b.WriteByte('n')

	210 }

	211 b.WriteString(opNames[re.Op])

	212 case OpLiteral:

	213 if len(re.Rune) > 1 {

	214 b.WriteString("str")

	215 } else {

	216 b.WriteString("lit")

	217 }

	218 if re.Flags&FoldCase != 0 {

	219 for _, r := range re.Rune {

	220 if unicode.ToUpper(r) != r {

	221 b.WriteString("fold")

	222 }

	223 }

	224 }

	225 }

	226 }

	227 b.WriteByte('{')

	228 switch re.Op {

	229 case OpEndText:

	230 if re.Flags&WasDollar == 0 {

	231 b.WriteString(`\z`)

	232 }

	233 case OpLiteral:

	234 for _, r := range re.Rune {

	235 b.WriteRune(r)

	236 }

	237 case OpConcat, OpAlternate:

	238 for _, sub := range re.Sub {

	239 dumpRegexp(b, sub)

	240 }

	241 case OpStar, OpPlus, OpQuest:

	242 dumpRegexp(b, re.Sub[0])

	243 case OpRepeat:

	244 fmt.Fprintf(b, "%d,%d ", re.Min, re.Max)

	245 dumpRegexp(b, re.Sub[0])

	246 case OpCapture:

	247 if re.Name != "" {

	248 b.WriteString(re.Name)

	249 b.WriteByte(':')

	250 }

	251 dumpRegexp(b, re.Sub[0])

	252 case OpCharClass:

	253 sep := ""

	254 for i := 0; i < len(re.Rune); i += 2 {

	255 b.WriteString(sep)

	256 sep = " "

	257 lo, hi := re.Rune[i], re.Rune[i+1]

	258 if lo == hi {

	259 fmt.Fprintf(b, "%#x", lo)

	260 } else {

	261 fmt.Fprintf(b, "%#x-%#x", lo, hi)

	262 }

	263 }

	264 }

	265 b.WriteByte('}')

	266 }

OLD	NEW

« no previous file with comments | « src/pkg/exp/regexp/syntax/parse.go ('k') | src/pkg/exp/regexp/syntax/regexp.go » ('j') | no next file with comments »