Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code | Sign in
(1137)

Delta Between Two Patch Sets: src/pkg/strings/replace.go

Issue 6492076: code review 6492076: strings: implement a faster generic Replacer (Closed)
Left Patch Set: diff -r df382f6986cf https://code.google.com/p/go Created 11 years, 6 months ago
Right Patch Set: diff -r cdee8bf43694 https://code.google.com/p/go Created 11 years, 6 months ago
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments. Please Sign in to add in-line comments.
Jump to:
Left: Side by side diff | Download
Right: Side by side diff | Download
LEFTRIGHT
1 // Copyright 2011 The Go Authors. All rights reserved. 1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style 2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file. 3 // license that can be found in the LICENSE file.
4 4
5 package strings 5 package strings
6 6
7 import "io" 7 import "io"
8 8
9 // A Replacer replaces a list of strings with replacements. 9 // A Replacer replaces a list of strings with replacements.
10 type Replacer struct { 10 type Replacer struct {
(...skipping 15 matching lines...) Expand all
26 m[b>>5] |= uint32(1 << (b & 31)) 26 m[b>>5] |= uint32(1 << (b & 31))
27 } 27 }
28 28
29 // NewReplacer returns a new Replacer from a list of old, new string pairs. 29 // NewReplacer returns a new Replacer from a list of old, new string pairs.
30 // Replacements are performed in order, without overlapping matches. 30 // Replacements are performed in order, without overlapping matches.
31 func NewReplacer(oldnew ...string) *Replacer { 31 func NewReplacer(oldnew ...string) *Replacer {
32 if len(oldnew)%2 == 1 { 32 if len(oldnew)%2 == 1 {
33 panic("strings.NewReplacer: odd argument count") 33 panic("strings.NewReplacer: odd argument count")
34 } 34 }
35 35
36 // Possible implementations.
37 var (
38 bb byteReplacer
39 bs byteStringReplacer
40 )
41
42 allNewBytes := true 36 allNewBytes := true
43 for i := 0; i < len(oldnew); i += 2 { 37 for i := 0; i < len(oldnew); i += 2 {
44 » » old, new := oldnew[i], oldnew[i+1] 38 » » if len(oldnew[i]) != 1 {
45 » » if len(old) != 1 { 39 » » » return &Replacer{r: makeGenericReplacer(oldnew)}
46 » » » return &Replacer{r: buildGenReplacer(oldnew)} 40 » » }
47 » » } 41 » » if len(oldnew[i+1]) != 1 {
48 » » if len(new) != 1 {
49 allNewBytes = false 42 allNewBytes = false
50 } 43 }
51
52 // byte -> string
53 bs.old.set(old[0])
54 bs.new[old[0]] = []byte(new)
55
56 // byte -> byte
57 if allNewBytes {
58 bb.old.set(old[0])
59 bb.new[old[0]] = new[0]
60 }
61 } 44 }
62 45
63 if allNewBytes { 46 if allNewBytes {
64 » » return &Replacer{r: &bb} 47 » » bb := &byteReplacer{}
65 » } 48 » » for i := 0; i < len(oldnew); i += 2 {
66 » return &Replacer{r: &bs} 49 » » » o, n := oldnew[i][0], oldnew[i+1][0]
50 » » » if bb.old[o>>5]&uint32(1<<(o&31)) != 0 {
51 » » » » // Later old->new maps do not override previous ones with the same old string.
52 » » » » continue
53 » » » }
54 » » » bb.old.set(o)
55 » » » bb.new[o] = n
56 » » }
57 » » return &Replacer{r: bb}
58 » }
59
60 » bs := &byteStringReplacer{}
61 » for i := 0; i < len(oldnew); i += 2 {
62 » » o, new := oldnew[i][0], oldnew[i+1]
63 » » if bs.old[o>>5]&uint32(1<<(o&31)) != 0 {
64 » » » // Later old->new maps do not override previous ones wit h the same old string.
65 » » » continue
66 » » }
67 » » bs.old.set(o)
68 » » bs.new[o] = []byte(new)
69 » }
70 » return &Replacer{r: bs}
67 } 71 }
68 72
69 // Replace returns a copy of s with all replacements performed. 73 // Replace returns a copy of s with all replacements performed.
70 func (r *Replacer) Replace(s string) string { 74 func (r *Replacer) Replace(s string) string {
71 return r.r.Replace(s) 75 return r.r.Replace(s)
72 } 76 }
73 77
74 // WriteString writes s to w with all replacements performed. 78 // WriteString writes s to w with all replacements performed.
75 func (r *Replacer) WriteString(w io.Writer, s string) (n int, err error) { 79 func (r *Replacer) WriteString(w io.Writer, s string) (n int, err error) {
76 return r.r.WriteString(w, s) 80 return r.r.WriteString(w, s)
77 } 81 }
78 82
79 type trie struct { 83 // trieNode is a node in a lookup trie for prioritized key/value pairs. Keys
80 » // The value at this node. 84 // and values may be empty. For example, the trie containing keys "ax", "ay",
85 // "bcbc", "x" and "xy" could have eight nodes:
86 //
87 // n0 -
88 // n1 a-
89 // n2 .x+
90 // n3 .y+
91 // n4 b-
92 // n5 .cbc+
93 // n6 x+
94 // n7 .y+
95 //
96 // n0 is the root node, and its children are n1, n4 and n6; n1's children are
97 // n2 and n3; n4's child is n5; n6's child is n7. Nodes n0, n1 and n4 (marked
98 // with a trailing "-") are partial keys, and nodes n2, n3, n5, n6 and n7
99 // (marked with a trailing "+") are complete keys.
100 type trieNode struct {
101 » // value is the value of the trie node's key/value pair. It is empty if
102 » // this node is not a complete key.
81 value string 103 value string
82 » // Priority of this match vs superstrings. Higher is better, and 0 means 104 » // priority is the priority (higher is more important) of the trie node' s
83 » // this node does not have a value. 105 » // key/value pair; keys are not necessarily matched shortest- or longest -
106 » // first. Priority is positive if this node is a complete key, and zero
107 » // otherwise. In the example above, positive/zero priorities are marked
108 » // with a trailing "+" or "-".
84 priority int 109 priority int
85 » // If this node has multiple children (and thus with multiple different 110
86 » // initial bytes) this will point to a lookup table based on the first 111 » // A trie node may have zero, one or more child nodes:
87 » // byte. 112 » // * if the remaining fields are zero, there are no children.
88 » table *[256]*trie 113 » // * if prefix and next are non-zero, there is one child in next
nigeltao 2012/09/17 01:49:24 Trailing full stop, and ditto on the next line.
89 » // If this node has exactly one child, this will be nonempty. 114 » // * if table is non-zero, it defines all the children
115 » //
116 » // Prefixes are preferred over tables when there is one child, but the
117 » // root node always uses a table for lookup efficiency.
118
119 » // prefix is the difference in keys between this trie node and the next.
120 » // In the example above, node n4 has prefix "cbc" and n4's next node is n5.
121 » // Node n5 has no children and so has zero prefix, next and table fields .
90 prefix string 122 prefix string
91 » // The prefixed node. 123 » next *trieNode
92 » next *trie 124
93 » // If this node has no children, the previous 3 fields will be nil. 125 » // table is a lookup table indexed by the next byte in the key, after
94 } 126 » // remapping that byte through genericReplacer.mapping to create a dense
95 127 » // index. In the example above, the keys only use 'a', 'b', 'c', 'x' and
96 func (t *trie) add(key, val string, priority int) { 128 » // 'y', which remap to 0, 1, 2, 3 and 4. All other bytes remap to 5, and
97 » if len(key) == 0 && t.priority == 0 { 129 » // genericReplacer.tableSize will be 5. Node n0's table will be
98 » » t.value = val 130 » // []*trieNode{ 0:n1, 1:n4, 3:n6 }, where the 0, 1 and 3 are the remappe d
99 » » t.priority = priority 131 » // 'a', 'b' and 'x'.
132 » table []*trieNode
133 }
134
135 func (t *trieNode) add(key, val string, priority int, r *genericReplacer) {
136 » if key == "" {
137 » » if t.priority == 0 {
138 » » » t.value = val
139 » » » t.priority = priority
140 » » }
100 return 141 return
101 } 142 }
102 143
103 » if t.next != nil { 144 » if t.prefix != "" {
104 // Need to split the prefix among multiple nodes. 145 // Need to split the prefix among multiple nodes.
105 var n int // length of the longest common prefix 146 var n int // length of the longest common prefix
106 » » for n = 0; n < len(t.prefix) && n < len(key); n++ { 147 » » for ; n < len(t.prefix) && n < len(key); n++ {
107 if t.prefix[n] != key[n] { 148 if t.prefix[n] != key[n] {
108 break 149 break
109 } 150 }
110 } 151 }
111 if n == len(t.prefix) { 152 if n == len(t.prefix) {
112 » » » t.next.add(key[n:], val, priority) 153 » » » t.next.add(key[n:], val, priority, r)
113 } else if n == 0 { 154 } else if n == 0 {
114 » » » // First byte differs, start a new lookup table here. 155 » » » // First byte differs, start a new lookup table here. L ooking up
115 » » » var prefnode *trie 156 » » » // what is crrently t.prefix[0] will lead to prefixNode, and
nigeltao 2012/09/17 01:49:24 Typo in currently.
157 » » » // looking up key[0] will lead to keyNode.
158 » » » var prefixNode *trieNode
116 if len(t.prefix) == 1 { 159 if len(t.prefix) == 1 {
117 » » » » prefnode = t.next 160 » » » » prefixNode = t.next
118 } else { 161 } else {
119 » » » » prefnode = &trie{ 162 » » » » prefixNode = &trieNode{
120 prefix: t.prefix[1:], 163 prefix: t.prefix[1:],
121 next: t.next, 164 next: t.next,
122 } 165 }
123 } 166 }
124 » » » keynode := new(trie) 167 » » » keyNode := new(trieNode)
125 » » » t.table = new([256]*trie) 168 » » » t.table = make([]*trieNode, r.tableSize)
126 » » » t.table[t.prefix[0]] = prefnode 169 » » » t.table[r.mapping[t.prefix[0]]] = prefixNode
127 » » » t.table[key[0]] = keynode 170 » » » t.table[r.mapping[key[0]]] = keyNode
128 t.prefix = "" 171 t.prefix = ""
129 t.next = nil 172 t.next = nil
130 » » » keynode.add(key[1:], val, priority) 173 » » » keyNode.add(key[1:], val, priority, r)
131 } else { 174 } else {
132 // Insert new node after the common section of the prefi x. 175 // Insert new node after the common section of the prefi x.
133 » » » next := &trie{ 176 » » » next := &trieNode{
134 prefix: t.prefix[n:], 177 prefix: t.prefix[n:],
135 next: t.next, 178 next: t.next,
136 } 179 }
137 t.prefix = t.prefix[:n] 180 t.prefix = t.prefix[:n]
138 t.next = next 181 t.next = next
139 » » » next.add(key[n:], val, priority) 182 » » » next.add(key[n:], val, priority, r)
140 } 183 }
141 } else if t.table != nil { 184 } else if t.table != nil {
142 // Insert into existing table. 185 // Insert into existing table.
143 » » b := key[0] 186 » » m := r.mapping[key[0]]
144 » » if t.table[b] == nil { 187 » » if t.table[m] == nil {
145 » » » t.table[b] = new(trie) 188 » » » t.table[m] = new(trieNode)
146 » » } 189 » » }
147 » » next := t.table[b] 190 » » t.table[m].add(key[1:], val, priority, r)
148
149 » » next.add(key[1:], val, priority)
150 } else { 191 } else {
151 t.prefix = key 192 t.prefix = key
152 » » t.next = new(trie) 193 » » t.next = new(trieNode)
153 » » t.next.add("", val, priority) 194 » » t.next.add("", val, priority, r)
154 » } 195 » }
155 } 196 }
156 197
157 func (t *trie) lookup(s string) (val string, keylen int, found bool) { 198 func (r *genericReplacer) lookup(s string, ignoreRoot bool) (val string, keylen int, found bool) {
158 // Iterate down the trie to the end, and grab the value and keylen with 199 // Iterate down the trie to the end, and grab the value and keylen with
159 // the highest priority. 200 // the highest priority.
160 bestPriority := 0 201 bestPriority := 0
161 » node := t 202 » node := &r.root
162 n := 0 203 n := 0
163 for node != nil { 204 for node != nil {
164 » » if node.priority > bestPriority { 205 » » if node.priority > bestPriority && !(ignoreRoot && node == &r.ro ot) {
165 bestPriority = node.priority 206 bestPriority = node.priority
166 val = node.value 207 val = node.value
167 keylen = n 208 keylen = n
168 found = true 209 found = true
169 } 210 }
170 211
171 » » if len(s) == 0 { 212 » » if s == "" {
172 break 213 break
173 } 214 }
174 if node.table != nil { 215 if node.table != nil {
175 » » » node = node.table[s[0]] 216 » » » index := r.mapping[s[0]]
217 » » » if int(index) == r.tableSize {
218 » » » » break
219 » » » }
220 » » » node = node.table[index]
176 s = s[1:] 221 s = s[1:]
177 n++ 222 n++
178 } else if node.prefix != "" && HasPrefix(s, node.prefix) { 223 } else if node.prefix != "" && HasPrefix(s, node.prefix) {
179 n += len(node.prefix) 224 n += len(node.prefix)
180 s = s[len(node.prefix):] 225 s = s[len(node.prefix):]
181 node = node.next 226 node = node.next
182 } else { 227 } else {
183 break 228 break
184 } 229 }
185 } 230 }
186 return 231 return
187 } 232 }
188 233
189 // genericReplacer is the fully generic algorithm. 234 // genericReplacer is the fully generic algorithm.
190 // It's used as a fallback when nothing faster can be used. 235 // It's used as a fallback when nothing faster can be used.
191 type genericReplacer struct { 236 type genericReplacer struct {
192 » trie *trie 237 » root trieNode
193 » // handle empty match separately 238 » // tableSize is the size of a trie node's lookup table. It is the number
194 » emptyMatch string 239 » // of unique key bytes.
195 } 240 » tableSize int
196 241 » // mapping maps from key bytes to a dense index for trieNode.table.
197 func buildGenReplacer(oldnew []string) *genericReplacer { 242 » mapping [256]byte
198 » t := new(trie) 243 }
199 » r := &genericReplacer{trie: t} 244
245 func makeGenericReplacer(oldnew []string) *genericReplacer {
246 » r := new(genericReplacer)
247 » // Find each byte used, then assign them each an index.
200 for i := 0; i < len(oldnew); i += 2 { 248 for i := 0; i < len(oldnew); i += 2 {
201 » » key, val := oldnew[i], oldnew[i+1] 249 » » key := oldnew[i]
202 » » if key == "" { 250 » » for j := 0; j < len(key); j++ {
203 » » » if r.emptyMatch == "" { 251 » » » r.mapping[key[j]] = 1
204 » » » » r.emptyMatch = val 252 » » }
205 » » » } 253 » }
206 » » } else { 254
207 » » » t.add(key, val, len(oldnew)-i) 255 » for _, b := range r.mapping {
208 » » } 256 » » r.tableSize += int(b)
257 » }
258
259 » var index byte
260 » for i, b := range r.mapping {
261 » » if b == 0 {
262 » » » r.mapping[i] = byte(r.tableSize)
263 » » } else {
264 » » » r.mapping[i] = index
265 » » » index++
266 » » }
267 » }
268 » // Ensure root node uses a lookup table (for performance).
269 » r.root.table = make([]*trieNode, r.tableSize)
270
271 » for i := 0; i < len(oldnew); i += 2 {
272 » » r.root.add(oldnew[i], oldnew[i+1], len(oldnew)-i, r)
209 } 273 }
210 return r 274 return r
211 } 275 }
212 276
277 type appendSliceWriter []byte
278
279 // Write writes to the buffer to satisfy io.Writer.
280 func (w *appendSliceWriter) Write(p []byte) (int, error) {
281 *w = append(*w, p...)
282 return len(p), nil
283 }
284
285 // WriteString writes to the buffer without string->[]byte->string allocations.
286 func (w *appendSliceWriter) WriteString(s string) (int, error) {
287 *w = append(*w, s...)
288 return len(s), nil
289 }
290
291 type stringWriter struct {
292 w io.Writer
293 }
294
295 func (w stringWriter) WriteString(s string) (int, error) {
296 return w.w.Write([]byte(s))
297 }
298
213 func (r *genericReplacer) Replace(s string) string { 299 func (r *genericReplacer) Replace(s string) string {
214 » var buf []byte 300 » buf := make(appendSliceWriter, 0, len(s))
215 » last := 0 301 » r.WriteString(&buf, s)
216 » for i := 0; i < len(s); { 302 » return string(buf)
217 » » if r.emptyMatch != "" { 303 }
218 » » » buf = append(buf, r.emptyMatch...) 304
219 » » } 305 func (r *genericReplacer) WriteString(w io.Writer, s string) (n int, err error) {
220 306 » sw, ok := w.(interface {
221 » » val, keylen, match := r.trie.lookup(s[i:]) 307 » » WriteString(string) (int, error)
308 » })
309 » if !ok {
310 » » sw = stringWriter{w}
311 » }
312
313 » var last, wn int
314 » var prevMatchEmpty bool
315 » for i := 0; i <= len(s); {
316 » » // Ignore the empty match iff the previous loop found the empty match.
317 » » val, keylen, match := r.lookup(s[i:], prevMatchEmpty)
318 » » prevMatchEmpty = match && keylen == 0
222 if match { 319 if match {
223 » » » if r.emptyMatch == "" { 320 » » » wn, err = sw.WriteString(s[last:i])
224 » » » » // can only aggregate when there's no empty matc h 321 » » » n += wn
225 » » » » buf = append(buf, s[last:i]...) 322 » » » if err != nil {
226 » » » } 323 » » » » return
227 » » » buf = append(buf, val...) 324 » » » }
325 » » » wn, err = sw.WriteString(val)
326 » » » n += wn
327 » » » if err != nil {
328 » » » » return
329 » » » }
228 i += keylen 330 i += keylen
229 last = i 331 last = i
230 continue 332 continue
231 } 333 }
232
233 if r.emptyMatch != "" {
234 buf = append(buf, s[i])
235 }
236 i++ 334 i++
237 } 335 }
238 » if r.emptyMatch == "" { 336 » if last != len(s) {
239 » » if buf == nil { 337 » » wn, err = sw.WriteString(s[last:])
240 » » » // If nothing got replaced, avoid any allocation.
241 » » » return s
242 » » }
243 » » buf = append(buf, s[last:]...)
244 » } else {
245 » » buf = append(buf, r.emptyMatch...)
246 » }
247 » return string(buf)
248 }
249
250 // WriteString writes the replaced version of s to w, using the same logic as
251 // Replace. If you want to avoid potentially many small writes, write to a
252 // bufio.Writer.
253 func (r *genericReplacer) WriteString(w io.Writer, s string) (n int, err error) {
254 » var last, wn int
255 » for i := 0; i < len(s); {
256 » » if r.emptyMatch != "" {
257 » » » wn, err = io.WriteString(w, r.emptyMatch)
258 » » » n += wn
259 » » » if err != nil {
260 » » » » return
261 » » » }
262 » » }
263
264 » » val, keylen, match := r.trie.lookup(s[i:])
265 » » if match {
266 » » » if r.emptyMatch == "" {
267 » » » » // can only aggregate when there's no empty matc h
268 » » » » wn, err = io.WriteString(w, s[last:i])
269 » » » » n += wn
270 » » » » if err != nil {
271 » » » » » return
272 » » » » }
273 » » » }
274 » » » wn, err = io.WriteString(w, val)
275 » » » n += wn
276 » » » if err != nil {
277 » » » » return
278 » » » }
279 » » » i += keylen
280 » » » last = i
281 » » » continue
282 » » }
283
284 » » if r.emptyMatch != "" {
285 » » » wn, err = io.WriteString(w, s[i:i+1])
286 » » » n += wn
287 » » » if err != nil {
288 » » » » return
289 » » » }
290 » » }
291 » » i++
292 » }
293 » if r.emptyMatch == "" {
294 » » wn, err = io.WriteString(w, s[last:])
295 » » n += wn
296 » } else {
297 » » wn, err = io.WriteString(w, r.emptyMatch)
298 n += wn 338 n += wn
299 } 339 }
300 return 340 return
301 } 341 }
302 342
303 // byteReplacer is the implementation that's used when all the "old" 343 // byteReplacer is the implementation that's used when all the "old"
304 // and "new" values are single ASCII bytes. 344 // and "new" values are single ASCII bytes.
305 type byteReplacer struct { 345 type byteReplacer struct {
306 // old has a bit set for each old byte that should be replaced. 346 // old has a bit set for each old byte that should be replaced.
307 old byteBitmap 347 old byteBitmap
(...skipping 125 matching lines...) Expand 10 before | Expand all | Expand 10 after
433 } 473 }
434 if len(bi) > 0 { 474 if len(bi) > 0 {
435 nw, err := w.Write(bi) 475 nw, err := w.Write(bi)
436 n += nw 476 n += nw
437 if err != nil { 477 if err != nil {
438 return n, err 478 return n, err
439 } 479 }
440 } 480 }
441 return n, nil 481 return n, nil
442 } 482 }
443
444 // strings is too low-level to import io/ioutil
445 var discard io.Writer = devNull(0)
446
447 type devNull int
448
449 func (devNull) Write(p []byte) (int, error) {
450 return len(p), nil
451 }
LEFTRIGHT

Powered by Google App Engine
RSS Feeds Recent Issues | This issue
This is Rietveld f62528b