Left: | ||
Right: |
LEFT | RIGHT |
---|---|
1 // Copyright 2011 The Go Authors. All rights reserved. | 1 // Copyright 2011 The Go Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style | 2 // Use of this source code is governed by a BSD-style |
3 // license that can be found in the LICENSE file. | 3 // license that can be found in the LICENSE file. |
4 | 4 |
5 package strings | 5 package strings |
6 | 6 |
7 import "io" | 7 import "io" |
8 | 8 |
9 // A Replacer replaces a list of strings with replacements. | 9 // A Replacer replaces a list of strings with replacements. |
10 type Replacer struct { | 10 type Replacer struct { |
(...skipping 15 matching lines...) Expand all Loading... | |
26 m[b>>5] |= uint32(1 << (b & 31)) | 26 m[b>>5] |= uint32(1 << (b & 31)) |
27 } | 27 } |
28 | 28 |
29 // NewReplacer returns a new Replacer from a list of old, new string pairs. | 29 // NewReplacer returns a new Replacer from a list of old, new string pairs. |
30 // Replacements are performed in order, without overlapping matches. | 30 // Replacements are performed in order, without overlapping matches. |
31 func NewReplacer(oldnew ...string) *Replacer { | 31 func NewReplacer(oldnew ...string) *Replacer { |
32 if len(oldnew)%2 == 1 { | 32 if len(oldnew)%2 == 1 { |
33 panic("strings.NewReplacer: odd argument count") | 33 panic("strings.NewReplacer: odd argument count") |
34 } | 34 } |
35 | 35 |
36 // Possible implementations. | |
37 var ( | |
38 bb byteReplacer | |
39 bs byteStringReplacer | |
40 ) | |
41 | |
42 allNewBytes := true | 36 allNewBytes := true |
43 for i := 0; i < len(oldnew); i += 2 { | 37 for i := 0; i < len(oldnew); i += 2 { |
44 » » old, new := oldnew[i], oldnew[i+1] | 38 » » if len(oldnew[i]) != 1 { |
45 » » if len(old) != 1 { | 39 » » » return &Replacer{r: makeGenericReplacer(oldnew)} |
46 » » » return &Replacer{r: buildGenReplacer(oldnew)} | 40 » » } |
47 » » } | 41 » » if len(oldnew[i+1]) != 1 { |
48 » » if len(new) != 1 { | |
49 allNewBytes = false | 42 allNewBytes = false |
50 } | 43 } |
51 | |
52 // byte -> string | |
53 bs.old.set(old[0]) | |
54 bs.new[old[0]] = []byte(new) | |
55 | |
56 // byte -> byte | |
57 if allNewBytes { | |
58 bb.old.set(old[0]) | |
59 bb.new[old[0]] = new[0] | |
60 } | |
61 } | 44 } |
62 | 45 |
63 if allNewBytes { | 46 if allNewBytes { |
64 » » return &Replacer{r: &bb} | 47 » » bb := &byteReplacer{} |
65 » } | 48 » » for i := 0; i < len(oldnew); i += 2 { |
66 » return &Replacer{r: &bs} | 49 » » » o, n := oldnew[i][0], oldnew[i+1][0] |
50 » » » if bb.old[o>>5]&uint32(1<<(o&31)) != 0 { | |
51 » » » » // Later old->new maps do not override previous ones with the same old string. | |
52 » » » » continue | |
53 » » » } | |
54 » » » bb.old.set(o) | |
55 » » » bb.new[o] = n | |
56 » » } | |
57 » » return &Replacer{r: bb} | |
58 » } | |
59 | |
60 » bs := &byteStringReplacer{} | |
61 » for i := 0; i < len(oldnew); i += 2 { | |
62 » » o, new := oldnew[i][0], oldnew[i+1] | |
63 » » if bs.old[o>>5]&uint32(1<<(o&31)) != 0 { | |
64 » » » // Later old->new maps do not override previous ones wit h the same old string. | |
65 » » » continue | |
66 » » } | |
67 » » bs.old.set(o) | |
68 » » bs.new[o] = []byte(new) | |
69 » } | |
70 » return &Replacer{r: bs} | |
67 } | 71 } |
68 | 72 |
69 // Replace returns a copy of s with all replacements performed. | 73 // Replace returns a copy of s with all replacements performed. |
70 func (r *Replacer) Replace(s string) string { | 74 func (r *Replacer) Replace(s string) string { |
71 return r.r.Replace(s) | 75 return r.r.Replace(s) |
72 } | 76 } |
73 | 77 |
74 // WriteString writes s to w with all replacements performed. | 78 // WriteString writes s to w with all replacements performed. |
75 func (r *Replacer) WriteString(w io.Writer, s string) (n int, err error) { | 79 func (r *Replacer) WriteString(w io.Writer, s string) (n int, err error) { |
76 return r.r.WriteString(w, s) | 80 return r.r.WriteString(w, s) |
77 } | 81 } |
78 | 82 |
79 type trie struct { | 83 // trieNode is a node in a lookup trie for prioritized key/value pairs. Keys |
80 » // The value at this node. | 84 // and values may be empty. For example, the trie containing keys "ax", "ay", |
85 // "bcbc", "x" and "xy" could have eight nodes: | |
86 // | |
87 // n0 - | |
88 // n1 a- | |
89 // n2 .x+ | |
90 // n3 .y+ | |
91 // n4 b- | |
92 // n5 .cbc+ | |
93 // n6 x+ | |
94 // n7 .y+ | |
95 // | |
96 // n0 is the root node, and its children are n1, n4 and n6; n1's children are | |
97 // n2 and n3; n4's child is n5; n6's child is n7. Nodes n0, n1 and n4 (marked | |
98 // with a trailing "-") are partial keys, and nodes n2, n3, n5, n6 and n7 | |
99 // (marked with a trailing "+") are complete keys. | |
100 type trieNode struct { | |
101 » // value is the value of the trie node's key/value pair. It is empty if | |
102 » // this node is not a complete key. | |
81 value string | 103 value string |
82 » // Priority of this match vs superstrings. Higher is better, and 0 means | 104 » // priority is the priority (higher is more important) of the trie node' s |
83 » // this node does not have a value. | 105 » // key/value pair; keys are not necessarily matched shortest- or longest - |
106 » // first. Priority is positive if this node is a complete key, and zero | |
107 » // otherwise. In the example above, positive/zero priorities are marked | |
108 » // with a trailing "+" or "-". | |
84 priority int | 109 priority int |
85 » // If this node has multiple children (and thus with multiple different | 110 |
86 » // initial bytes) this will point to a lookup table based on the first | 111 » // A trie node may have zero, one or more child nodes: |
87 » // byte. | 112 » // * if the remaining fields are zero, there are no children. |
88 » table *[256]*trie | 113 » // * if prefix and next are non-zero, there is one child in next |
nigeltao
2012/09/17 01:49:24
Trailing full stop, and ditto on the next line.
| |
89 » // If this node has exactly one child, this will be nonempty. | 114 » // * if table is non-zero, it defines all the children |
115 » // | |
116 » // Prefixes are preferred over tables when there is one child, but the | |
117 » // root node always uses a table for lookup efficiency. | |
118 | |
119 » // prefix is the difference in keys between this trie node and the next. | |
120 » // In the example above, node n4 has prefix "cbc" and n4's next node is n5. | |
121 » // Node n5 has no children and so has zero prefix, next and table fields . | |
90 prefix string | 122 prefix string |
91 » // The prefixed node. | 123 » next *trieNode |
92 » next *trie | 124 |
93 » // If this node has no children, the previous 3 fields will be nil. | 125 » // table is a lookup table indexed by the next byte in the key, after |
94 } | 126 » // remapping that byte through genericReplacer.mapping to create a dense |
95 | 127 » // index. In the example above, the keys only use 'a', 'b', 'c', 'x' and |
96 func (t *trie) add(key, val string, priority int) { | 128 » // 'y', which remap to 0, 1, 2, 3 and 4. All other bytes remap to 5, and |
97 » if len(key) == 0 && t.priority == 0 { | 129 » // genericReplacer.tableSize will be 5. Node n0's table will be |
98 » » t.value = val | 130 » // []*trieNode{ 0:n1, 1:n4, 3:n6 }, where the 0, 1 and 3 are the remappe d |
99 » » t.priority = priority | 131 » // 'a', 'b' and 'x'. |
132 » table []*trieNode | |
133 } | |
134 | |
135 func (t *trieNode) add(key, val string, priority int, r *genericReplacer) { | |
136 » if key == "" { | |
137 » » if t.priority == 0 { | |
138 » » » t.value = val | |
139 » » » t.priority = priority | |
140 » » } | |
100 return | 141 return |
101 } | 142 } |
102 | 143 |
103 » if t.next != nil { | 144 » if t.prefix != "" { |
104 // Need to split the prefix among multiple nodes. | 145 // Need to split the prefix among multiple nodes. |
105 var n int // length of the longest common prefix | 146 var n int // length of the longest common prefix |
106 » » for n = 0; n < len(t.prefix) && n < len(key); n++ { | 147 » » for ; n < len(t.prefix) && n < len(key); n++ { |
107 if t.prefix[n] != key[n] { | 148 if t.prefix[n] != key[n] { |
108 break | 149 break |
109 } | 150 } |
110 } | 151 } |
111 if n == len(t.prefix) { | 152 if n == len(t.prefix) { |
112 » » » t.next.add(key[n:], val, priority) | 153 » » » t.next.add(key[n:], val, priority, r) |
113 } else if n == 0 { | 154 } else if n == 0 { |
114 » » » // First byte differs, start a new lookup table here. | 155 » » » // First byte differs, start a new lookup table here. L ooking up |
115 » » » var prefnode *trie | 156 » » » // what is crrently t.prefix[0] will lead to prefixNode, and |
nigeltao
2012/09/17 01:49:24
Typo in currently.
| |
157 » » » // looking up key[0] will lead to keyNode. | |
158 » » » var prefixNode *trieNode | |
116 if len(t.prefix) == 1 { | 159 if len(t.prefix) == 1 { |
117 » » » » prefnode = t.next | 160 » » » » prefixNode = t.next |
118 } else { | 161 } else { |
119 » » » » prefnode = &trie{ | 162 » » » » prefixNode = &trieNode{ |
120 prefix: t.prefix[1:], | 163 prefix: t.prefix[1:], |
121 next: t.next, | 164 next: t.next, |
122 } | 165 } |
123 } | 166 } |
124 » » » keynode := new(trie) | 167 » » » keyNode := new(trieNode) |
125 » » » t.table = new([256]*trie) | 168 » » » t.table = make([]*trieNode, r.tableSize) |
126 » » » t.table[t.prefix[0]] = prefnode | 169 » » » t.table[r.mapping[t.prefix[0]]] = prefixNode |
127 » » » t.table[key[0]] = keynode | 170 » » » t.table[r.mapping[key[0]]] = keyNode |
128 t.prefix = "" | 171 t.prefix = "" |
129 t.next = nil | 172 t.next = nil |
130 » » » keynode.add(key[1:], val, priority) | 173 » » » keyNode.add(key[1:], val, priority, r) |
131 } else { | 174 } else { |
132 // Insert new node after the common section of the prefi x. | 175 // Insert new node after the common section of the prefi x. |
133 » » » next := &trie{ | 176 » » » next := &trieNode{ |
134 prefix: t.prefix[n:], | 177 prefix: t.prefix[n:], |
135 next: t.next, | 178 next: t.next, |
136 } | 179 } |
137 t.prefix = t.prefix[:n] | 180 t.prefix = t.prefix[:n] |
138 t.next = next | 181 t.next = next |
139 » » » next.add(key[n:], val, priority) | 182 » » » next.add(key[n:], val, priority, r) |
140 } | 183 } |
141 } else if t.table != nil { | 184 } else if t.table != nil { |
142 // Insert into existing table. | 185 // Insert into existing table. |
143 » » b := key[0] | 186 » » m := r.mapping[key[0]] |
144 » » if t.table[b] == nil { | 187 » » if t.table[m] == nil { |
145 » » » t.table[b] = new(trie) | 188 » » » t.table[m] = new(trieNode) |
146 » » } | 189 » » } |
147 » » next := t.table[b] | 190 » » t.table[m].add(key[1:], val, priority, r) |
148 | |
149 » » next.add(key[1:], val, priority) | |
150 } else { | 191 } else { |
151 t.prefix = key | 192 t.prefix = key |
152 » » t.next = new(trie) | 193 » » t.next = new(trieNode) |
153 » » t.next.add("", val, priority) | 194 » » t.next.add("", val, priority, r) |
154 » } | 195 » } |
155 } | 196 } |
156 | 197 |
157 func (t *trie) lookup(s string) (val string, keylen int, found bool) { | 198 func (r *genericReplacer) lookup(s string, ignoreRoot bool) (val string, keylen int, found bool) { |
158 // Iterate down the trie to the end, and grab the value and keylen with | 199 // Iterate down the trie to the end, and grab the value and keylen with |
159 // the highest priority. | 200 // the highest priority. |
160 bestPriority := 0 | 201 bestPriority := 0 |
161 » node := t | 202 » node := &r.root |
162 n := 0 | 203 n := 0 |
163 for node != nil { | 204 for node != nil { |
164 » » if node.priority > bestPriority { | 205 » » if node.priority > bestPriority && !(ignoreRoot && node == &r.ro ot) { |
165 bestPriority = node.priority | 206 bestPriority = node.priority |
166 val = node.value | 207 val = node.value |
167 keylen = n | 208 keylen = n |
168 found = true | 209 found = true |
169 } | 210 } |
170 | 211 |
171 » » if len(s) == 0 { | 212 » » if s == "" { |
172 break | 213 break |
173 } | 214 } |
174 if node.table != nil { | 215 if node.table != nil { |
175 » » » node = node.table[s[0]] | 216 » » » index := r.mapping[s[0]] |
217 » » » if int(index) == r.tableSize { | |
218 » » » » break | |
219 » » » } | |
220 » » » node = node.table[index] | |
176 s = s[1:] | 221 s = s[1:] |
177 n++ | 222 n++ |
178 } else if node.prefix != "" && HasPrefix(s, node.prefix) { | 223 } else if node.prefix != "" && HasPrefix(s, node.prefix) { |
179 n += len(node.prefix) | 224 n += len(node.prefix) |
180 s = s[len(node.prefix):] | 225 s = s[len(node.prefix):] |
181 node = node.next | 226 node = node.next |
182 } else { | 227 } else { |
183 break | 228 break |
184 } | 229 } |
185 } | 230 } |
186 return | 231 return |
187 } | 232 } |
188 | 233 |
189 // genericReplacer is the fully generic algorithm. | 234 // genericReplacer is the fully generic algorithm. |
190 // It's used as a fallback when nothing faster can be used. | 235 // It's used as a fallback when nothing faster can be used. |
191 type genericReplacer struct { | 236 type genericReplacer struct { |
192 » trie *trie | 237 » root trieNode |
193 » // handle empty match separately | 238 » // tableSize is the size of a trie node's lookup table. It is the number |
194 » emptyMatch string | 239 » // of unique key bytes. |
195 } | 240 » tableSize int |
196 | 241 » // mapping maps from key bytes to a dense index for trieNode.table. |
197 func buildGenReplacer(oldnew []string) *genericReplacer { | 242 » mapping [256]byte |
198 » t := new(trie) | 243 } |
199 » r := &genericReplacer{trie: t} | 244 |
245 func makeGenericReplacer(oldnew []string) *genericReplacer { | |
246 » r := new(genericReplacer) | |
247 » // Find each byte used, then assign them each an index. | |
200 for i := 0; i < len(oldnew); i += 2 { | 248 for i := 0; i < len(oldnew); i += 2 { |
201 » » key, val := oldnew[i], oldnew[i+1] | 249 » » key := oldnew[i] |
202 » » if key == "" { | 250 » » for j := 0; j < len(key); j++ { |
203 » » » if r.emptyMatch == "" { | 251 » » » r.mapping[key[j]] = 1 |
204 » » » » r.emptyMatch = val | 252 » » } |
205 » » » } | 253 » } |
206 » » } else { | 254 |
207 » » » t.add(key, val, len(oldnew)-i) | 255 » for _, b := range r.mapping { |
208 » » } | 256 » » r.tableSize += int(b) |
257 » } | |
258 | |
259 » var index byte | |
260 » for i, b := range r.mapping { | |
261 » » if b == 0 { | |
262 » » » r.mapping[i] = byte(r.tableSize) | |
263 » » } else { | |
264 » » » r.mapping[i] = index | |
265 » » » index++ | |
266 » » } | |
267 » } | |
268 » // Ensure root node uses a lookup table (for performance). | |
269 » r.root.table = make([]*trieNode, r.tableSize) | |
270 | |
271 » for i := 0; i < len(oldnew); i += 2 { | |
272 » » r.root.add(oldnew[i], oldnew[i+1], len(oldnew)-i, r) | |
209 } | 273 } |
210 return r | 274 return r |
211 } | 275 } |
212 | 276 |
277 type appendSliceWriter []byte | |
278 | |
279 // Write writes to the buffer to satisfy io.Writer. | |
280 func (w *appendSliceWriter) Write(p []byte) (int, error) { | |
281 *w = append(*w, p...) | |
282 return len(p), nil | |
283 } | |
284 | |
285 // WriteString writes to the buffer without string->[]byte->string allocations. | |
286 func (w *appendSliceWriter) WriteString(s string) (int, error) { | |
287 *w = append(*w, s...) | |
288 return len(s), nil | |
289 } | |
290 | |
291 type stringWriter struct { | |
292 w io.Writer | |
293 } | |
294 | |
295 func (w stringWriter) WriteString(s string) (int, error) { | |
296 return w.w.Write([]byte(s)) | |
297 } | |
298 | |
213 func (r *genericReplacer) Replace(s string) string { | 299 func (r *genericReplacer) Replace(s string) string { |
214 » var buf []byte | 300 » buf := make(appendSliceWriter, 0, len(s)) |
215 » last := 0 | 301 » r.WriteString(&buf, s) |
216 » for i := 0; i < len(s); { | 302 » return string(buf) |
217 » » if r.emptyMatch != "" { | 303 } |
218 » » » buf = append(buf, r.emptyMatch...) | 304 |
219 » » } | 305 func (r *genericReplacer) WriteString(w io.Writer, s string) (n int, err error) { |
220 | 306 » sw, ok := w.(interface { |
221 » » val, keylen, match := r.trie.lookup(s[i:]) | 307 » » WriteString(string) (int, error) |
308 » }) | |
309 » if !ok { | |
310 » » sw = stringWriter{w} | |
311 » } | |
312 | |
313 » var last, wn int | |
314 » var prevMatchEmpty bool | |
315 » for i := 0; i <= len(s); { | |
316 » » // Ignore the empty match iff the previous loop found the empty match. | |
317 » » val, keylen, match := r.lookup(s[i:], prevMatchEmpty) | |
318 » » prevMatchEmpty = match && keylen == 0 | |
222 if match { | 319 if match { |
223 » » » if r.emptyMatch == "" { | 320 » » » wn, err = sw.WriteString(s[last:i]) |
224 » » » » // can only aggregate when there's no empty matc h | 321 » » » n += wn |
225 » » » » buf = append(buf, s[last:i]...) | 322 » » » if err != nil { |
226 » » » } | 323 » » » » return |
227 » » » buf = append(buf, val...) | 324 » » » } |
325 » » » wn, err = sw.WriteString(val) | |
326 » » » n += wn | |
327 » » » if err != nil { | |
328 » » » » return | |
329 » » » } | |
228 i += keylen | 330 i += keylen |
229 last = i | 331 last = i |
230 continue | 332 continue |
231 } | 333 } |
232 | |
233 if r.emptyMatch != "" { | |
234 buf = append(buf, s[i]) | |
235 } | |
236 i++ | 334 i++ |
237 } | 335 } |
238 » if r.emptyMatch == "" { | 336 » if last != len(s) { |
239 » » if buf == nil { | 337 » » wn, err = sw.WriteString(s[last:]) |
240 » » » // If nothing got replaced, avoid any allocation. | |
241 » » » return s | |
242 » » } | |
243 » » buf = append(buf, s[last:]...) | |
244 » } else { | |
245 » » buf = append(buf, r.emptyMatch...) | |
246 » } | |
247 » return string(buf) | |
248 } | |
249 | |
250 // WriteString writes the replaced version of s to w, using the same logic as | |
251 // Replace. If you want to avoid potentially many small writes, write to a | |
252 // bufio.Writer. | |
253 func (r *genericReplacer) WriteString(w io.Writer, s string) (n int, err error) { | |
254 » var last, wn int | |
255 » for i := 0; i < len(s); { | |
256 » » if r.emptyMatch != "" { | |
257 » » » wn, err = io.WriteString(w, r.emptyMatch) | |
258 » » » n += wn | |
259 » » » if err != nil { | |
260 » » » » return | |
261 » » » } | |
262 » » } | |
263 | |
264 » » val, keylen, match := r.trie.lookup(s[i:]) | |
265 » » if match { | |
266 » » » if r.emptyMatch == "" { | |
267 » » » » // can only aggregate when there's no empty matc h | |
268 » » » » wn, err = io.WriteString(w, s[last:i]) | |
269 » » » » n += wn | |
270 » » » » if err != nil { | |
271 » » » » » return | |
272 » » » » } | |
273 » » » } | |
274 » » » wn, err = io.WriteString(w, val) | |
275 » » » n += wn | |
276 » » » if err != nil { | |
277 » » » » return | |
278 » » » } | |
279 » » » i += keylen | |
280 » » » last = i | |
281 » » » continue | |
282 » » } | |
283 | |
284 » » if r.emptyMatch != "" { | |
285 » » » wn, err = io.WriteString(w, s[i:i+1]) | |
286 » » » n += wn | |
287 » » » if err != nil { | |
288 » » » » return | |
289 » » » } | |
290 » » } | |
291 » » i++ | |
292 » } | |
293 » if r.emptyMatch == "" { | |
294 » » wn, err = io.WriteString(w, s[last:]) | |
295 » » n += wn | |
296 » } else { | |
297 » » wn, err = io.WriteString(w, r.emptyMatch) | |
298 n += wn | 338 n += wn |
299 } | 339 } |
300 return | 340 return |
301 } | 341 } |
302 | 342 |
303 // byteReplacer is the implementation that's used when all the "old" | 343 // byteReplacer is the implementation that's used when all the "old" |
304 // and "new" values are single ASCII bytes. | 344 // and "new" values are single ASCII bytes. |
305 type byteReplacer struct { | 345 type byteReplacer struct { |
306 // old has a bit set for each old byte that should be replaced. | 346 // old has a bit set for each old byte that should be replaced. |
307 old byteBitmap | 347 old byteBitmap |
(...skipping 125 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
433 } | 473 } |
434 if len(bi) > 0 { | 474 if len(bi) > 0 { |
435 nw, err := w.Write(bi) | 475 nw, err := w.Write(bi) |
436 n += nw | 476 n += nw |
437 if err != nil { | 477 if err != nil { |
438 return n, err | 478 return n, err |
439 } | 479 } |
440 } | 480 } |
441 return n, nil | 481 return n, nil |
442 } | 482 } |
443 | |
444 // strings is too low-level to import io/ioutil | |
445 var discard io.Writer = devNull(0) | |
446 | |
447 type devNull int | |
448 | |
449 func (devNull) Write(p []byte) (int, error) { | |
450 return len(p), nil | |
451 } | |
LEFT | RIGHT |