LEFT | RIGHT |
(no file at all) | |
1 // Copyright 2010 The Go Authors. All rights reserved. | 1 // Copyright 2010 The Go Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style | 2 // Use of this source code is governed by a BSD-style |
3 // license that can be found in the LICENSE file. | 3 // license that can be found in the LICENSE file. |
4 | 4 |
5 // Package suffixarray implements substring search in logarithmic time using | 5 // Package suffixarray implements substring search in logarithmic time using |
6 // an in-memory suffix array. | 6 // an in-memory suffix array. |
7 // | 7 // |
8 // Example use: | 8 // Example use: |
9 // | 9 // |
10 // // create index for some data | 10 // // create index for some data |
11 // index := suffixarray.New(data) | 11 // index := suffixarray.New(data) |
12 // | 12 // |
13 // // lookup byte slice s | 13 // // lookup byte slice s |
14 // offsets1 := index.Lookup(s, -1) // the list of all indices where s occur
s in data | 14 // offsets1 := index.Lookup(s, -1) // the list of all indices where s occur
s in data |
15 // offsets2 := index.Lookup(s, 3) // the list of at most 3 indices where s
occurs in data | 15 // offsets2 := index.Lookup(s, 3) // the list of at most 3 indices where s
occurs in data |
16 // | 16 // |
17 package suffixarray | 17 package suffixarray |
18 | 18 |
19 import ( | 19 import ( |
20 "bytes" | 20 "bytes" |
21 "regexp" | 21 "regexp" |
22 "sort" | 22 "sort" |
23 ) | 23 ) |
24 | 24 |
25 | |
26 // Index implements a suffix array for fast substring search. | 25 // Index implements a suffix array for fast substring search. |
27 type Index struct { | 26 type Index struct { |
28 data []byte | 27 data []byte |
29 sa []int // suffix array for data | 28 sa []int // suffix array for data |
30 } | 29 } |
31 | 30 |
32 | |
33 // New creates a new Index for data. | 31 // New creates a new Index for data. |
34 // Index creation time is O(N*log(N)) for N = len(data). | 32 // Index creation time is O(N*log(N)) for N = len(data). |
35 func New(data []byte) *Index { | 33 func New(data []byte) *Index { |
36 return &Index{data, qsufsort(data)} | 34 return &Index{data, qsufsort(data)} |
37 } | 35 } |
38 | |
39 | 36 |
40 // Bytes returns the data over which the index was created. | 37 // Bytes returns the data over which the index was created. |
41 // It must not be modified. | 38 // It must not be modified. |
42 // | 39 // |
43 func (x *Index) Bytes() []byte { | 40 func (x *Index) Bytes() []byte { |
44 return x.data | 41 return x.data |
45 } | 42 } |
46 | 43 |
47 | |
48 func (x *Index) at(i int) []byte { | 44 func (x *Index) at(i int) []byte { |
49 return x.data[x.sa[i]:] | 45 return x.data[x.sa[i]:] |
50 } | 46 } |
51 | |
52 | 47 |
53 // lookupAll returns a slice into the matching region of the index. | 48 // lookupAll returns a slice into the matching region of the index. |
54 // The runtime is O(log(N)*len(s)). | 49 // The runtime is O(log(N)*len(s)). |
55 func (x *Index) lookupAll(s []byte) []int { | 50 func (x *Index) lookupAll(s []byte) []int { |
56 // find matching suffix index range [i:j] | 51 // find matching suffix index range [i:j] |
57 // find the first index where s would be the prefix | 52 // find the first index where s would be the prefix |
58 i := sort.Search(len(x.sa), func(i int) bool { return bytes.Compare(x.at
(i), s) >= 0 }) | 53 i := sort.Search(len(x.sa), func(i int) bool { return bytes.Compare(x.at
(i), s) >= 0 }) |
59 // starting at i, find the first index at which s is not a prefix | 54 // starting at i, find the first index at which s is not a prefix |
60 j := i + sort.Search(len(x.sa)-i, func(j int) bool { return !bytes.HasPr
efix(x.at(j+i), s) }) | 55 j := i + sort.Search(len(x.sa)-i, func(j int) bool { return !bytes.HasPr
efix(x.at(j+i), s) }) |
61 return x.sa[i:j] | 56 return x.sa[i:j] |
62 } | 57 } |
63 | |
64 | 58 |
65 // Lookup returns an unsorted list of at most n indices where the byte string s | 59 // Lookup returns an unsorted list of at most n indices where the byte string s |
66 // occurs in the indexed data. If n < 0, all occurrences are returned. | 60 // occurs in the indexed data. If n < 0, all occurrences are returned. |
67 // The result is nil if s is empty, s is not found, or n == 0. | 61 // The result is nil if s is empty, s is not found, or n == 0. |
68 // Lookup time is O(log(N)*len(s) + len(result)) where N is the | 62 // Lookup time is O(log(N)*len(s) + len(result)) where N is the |
69 // size of the indexed data. | 63 // size of the indexed data. |
70 // | 64 // |
71 func (x *Index) Lookup(s []byte, n int) (result []int) { | 65 func (x *Index) Lookup(s []byte, n int) (result []int) { |
72 if len(s) > 0 && n != 0 { | 66 if len(s) > 0 && n != 0 { |
73 matches := x.lookupAll(s) | 67 matches := x.lookupAll(s) |
74 if len(matches) < n || n < 0 { | 68 if len(matches) < n || n < 0 { |
75 n = len(matches) | 69 n = len(matches) |
76 } | 70 } |
77 if n > 0 { | 71 if n > 0 { |
78 result = make([]int, n) | 72 result = make([]int, n) |
79 copy(result, matches) | 73 copy(result, matches) |
80 } | 74 } |
81 } | 75 } |
82 return | 76 return |
83 } | 77 } |
84 | |
85 | 78 |
86 // FindAllIndex returns a sorted list of non-overlapping matches of the | 79 // FindAllIndex returns a sorted list of non-overlapping matches of the |
87 // regular expression r, where a match is a pair of indices specifying | 80 // regular expression r, where a match is a pair of indices specifying |
88 // the matched slice of x.Bytes(). If n < 0, all matches are returned | 81 // the matched slice of x.Bytes(). If n < 0, all matches are returned |
89 // in successive order. Otherwise, at most n matches are returned and | 82 // in successive order. Otherwise, at most n matches are returned and |
90 // they may not be successive. The result is nil if there are no matches, | 83 // they may not be successive. The result is nil if there are no matches, |
91 // or if n == 0. | 84 // or if n == 0. |
92 // | 85 // |
93 func (x *Index) FindAllIndex(r *regexp.Regexp, n int) (result [][]int) { | 86 func (x *Index) FindAllIndex(r *regexp.Regexp, n int) (result [][]int) { |
94 // a non-empty literal prefix is used to determine possible | 87 // a non-empty literal prefix is used to determine possible |
(...skipping 84 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
179 // found all matches or there's no chance to find more | 172 // found all matches or there's no chance to find more |
180 // (n and n1 can be negative) | 173 // (n and n1 can be negative) |
181 break | 174 break |
182 } | 175 } |
183 } | 176 } |
184 if len(result) == 0 { | 177 if len(result) == 0 { |
185 result = nil | 178 result = nil |
186 } | 179 } |
187 return | 180 return |
188 } | 181 } |
LEFT | RIGHT |