LEFT | RIGHT |
1 #! /usr/bin/env python3 | 1 #! /usr/bin/env python3 |
2 | 2 |
3 """ | 3 """ |
4 Module difflib -- helpers for computing deltas between objects. | 4 Module difflib -- helpers for computing deltas between objects. |
5 | 5 |
6 Function get_close_matches(word, possibilities, n=3, cutoff=0.6): | 6 Function get_close_matches(word, possibilities, n=3, cutoff=0.6): |
7 Use SequenceMatcher to return list of the best "good enough" matches. | 7 Use SequenceMatcher to return list of the best "good enough" matches. |
8 | 8 |
9 Function context_diff(a, b): | 9 Function context_diff(a, b): |
10 For two lists of strings, return a delta in context diff format. | 10 For two lists of strings, return a delta in context diff format. |
(...skipping 14 matching lines...) Expand all Loading... |
25 For producing human-readable deltas from sequences of lines of text. | 25 For producing human-readable deltas from sequences of lines of text. |
26 | 26 |
27 Class HtmlDiff: | 27 Class HtmlDiff: |
28 For producing HTML side by side comparison with change highlights. | 28 For producing HTML side by side comparison with change highlights. |
29 """ | 29 """ |
30 | 30 |
31 __all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher', | 31 __all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher', |
32 'Differ','IS_CHARACTER_JUNK', 'IS_LINE_JUNK', 'context_diff', | 32 'Differ','IS_CHARACTER_JUNK', 'IS_LINE_JUNK', 'context_diff', |
33 'unified_diff', 'HtmlDiff', 'Match'] | 33 'unified_diff', 'HtmlDiff', 'Match'] |
34 | 34 |
| 35 import warnings |
35 import heapq | 36 import heapq |
36 from collections import namedtuple as _namedtuple | 37 from collections import namedtuple as _namedtuple |
37 | 38 |
38 Match = _namedtuple('Match', 'a b size') | 39 Match = _namedtuple('Match', 'a b size') |
39 | 40 |
40 def _calculate_ratio(matches, length): | 41 def _calculate_ratio(matches, length): |
41 if length: | 42 if length: |
42 return 2.0 * matches / length | 43 return 2.0 * matches / length |
43 return 1.0 | 44 return 1.0 |
44 | 45 |
(...skipping 98 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
143 ratio() | 144 ratio() |
144 Return a measure of the sequences' similarity (float in [0,1]). | 145 Return a measure of the sequences' similarity (float in [0,1]). |
145 | 146 |
146 quick_ratio() | 147 quick_ratio() |
147 Return an upper bound on .ratio() relatively quickly. | 148 Return an upper bound on .ratio() relatively quickly. |
148 | 149 |
149 real_quick_ratio() | 150 real_quick_ratio() |
150 Return an upper bound on ratio() very quickly. | 151 Return an upper bound on ratio() very quickly. |
151 """ | 152 """ |
152 | 153 |
153 def __init__(self, isjunk=None, a='', b=''): | 154 def __init__(self, isjunk=None, a='', b='', autojunk=True): |
154 """Construct a SequenceMatcher. | 155 """Construct a SequenceMatcher. |
155 | 156 |
156 Optional arg isjunk is None (the default), or a one-argument | 157 Optional arg isjunk is None (the default), or a one-argument |
157 function that takes a sequence element and returns true iff the | 158 function that takes a sequence element and returns true iff the |
158 element is junk. None is equivalent to passing "lambda x: 0", i.e. | 159 element is junk. None is equivalent to passing "lambda x: 0", i.e. |
159 no elements are considered to be junk. For example, pass | 160 no elements are considered to be junk. For example, pass |
160 lambda x: x in " \\t" | 161 lambda x: x in " \\t" |
161 if you're comparing lines as sequences of characters, and don't | 162 if you're comparing lines as sequences of characters, and don't |
162 want to synch up on blanks or hard tabs. | 163 want to synch up on blanks or hard tabs. |
163 | 164 |
164 Optional arg a is the first of two sequences to be compared. By | 165 Optional arg a is the first of two sequences to be compared. By |
165 default, an empty string. The elements of a must be hashable. See | 166 default, an empty string. The elements of a must be hashable. See |
166 also .set_seqs() and .set_seq1(). | 167 also .set_seqs() and .set_seq1(). |
167 | 168 |
168 Optional arg b is the second of two sequences to be compared. By | 169 Optional arg b is the second of two sequences to be compared. By |
169 default, an empty string. The elements of b must be hashable. See | 170 default, an empty string. The elements of b must be hashable. See |
170 also .set_seqs() and .set_seq2(). | 171 also .set_seqs() and .set_seq2(). |
| 172 |
| 173 Optional arg autojunk should be set to False to disable the |
| 174 "automatic junk heuristic" that treats popular elements as junk |
| 175 (see module documentation for more information). |
171 """ | 176 """ |
172 | 177 |
173 # Members: | 178 # Members: |
174 # a | 179 # a |
175 # first sequence | 180 # first sequence |
176 # b | 181 # b |
177 # second sequence; differences are computed as "what do | 182 # second sequence; differences are computed as "what do |
178 # we need to do to 'a' to change it into 'b'?" | 183 # we need to do to 'a' to change it into 'b'?" |
179 # b2j | 184 # b2j |
180 # for x in b, b2j[x] is a list of the indices (into b) | 185 # for x in b, b2j[x] is a list of the indices (into b) |
181 # at which x appears; junk elements do not appear | 186 # at which x appears; junk and popular elements do not appear |
182 # fullbcount | 187 # fullbcount |
183 # for x in b, fullbcount[x] == the number of times x | 188 # for x in b, fullbcount[x] == the number of times x |
184 # appears in b; only materialized if really needed (used | 189 # appears in b; only materialized if really needed (used |
185 # only for computing quick_ratio()) | 190 # only for computing quick_ratio()) |
186 # matching_blocks | 191 # matching_blocks |
187 # a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k]; | 192 # a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k]; |
188 # ascending & non-overlapping in i and in j; terminated by | 193 # ascending & non-overlapping in i and in j; terminated by |
189 # a dummy (len(a), len(b), 0) sentinel | 194 # a dummy (len(a), len(b), 0) sentinel |
190 # opcodes | 195 # opcodes |
191 # a list of (tag, i1, i2, j1, j2) tuples, where tag is | 196 # a list of (tag, i1, i2, j1, j2) tuples, where tag is |
192 # one of | 197 # one of |
193 # 'replace' a[i1:i2] should be replaced by b[j1:j2] | 198 # 'replace' a[i1:i2] should be replaced by b[j1:j2] |
194 # 'delete' a[i1:i2] should be deleted | 199 # 'delete' a[i1:i2] should be deleted |
195 # 'insert' b[j1:j2] should be inserted | 200 # 'insert' b[j1:j2] should be inserted |
196 # 'equal' a[i1:i2] == b[j1:j2] | 201 # 'equal' a[i1:i2] == b[j1:j2] |
197 # isjunk | 202 # isjunk |
198 # a user-supplied function taking a sequence element and | 203 # a user-supplied function taking a sequence element and |
199 # returning true iff the element is "junk" -- this has | 204 # returning true iff the element is "junk" -- this has |
200 # subtle but helpful effects on the algorithm, which I'll | 205 # subtle but helpful effects on the algorithm, which I'll |
201 # get around to writing up someday <0.9 wink>. | 206 # get around to writing up someday <0.9 wink>. |
202 # DON'T USE! Only __chain_b uses this. Use isbjunk. | 207 # DON'T USE! Only __chain_b uses this. Use isbjunk. |
203 # isbjunk | 208 # bjunk |
204 # for x in b, isbjunk(x) == isjunk(x) but much faster; | 209 # the items in b for which isjunk is True. |
205 # it's really the __contains__ method of a hidden dict. | 210 # bpopular |
206 # DOES NOT WORK for x in a! | 211 # nonjunk items in b treated as junk by the heuristic (if used). |
207 # isbpopular | |
208 # for x in b, isbpopular(x) is true iff b is reasonably long | |
209 # (at least 200 elements) and x accounts for more than 1% of | |
210 # its elements. DOES NOT WORK for x in a! | |
211 | 212 |
212 self.isjunk = isjunk | 213 self.isjunk = isjunk |
213 self.a = self.b = None | 214 self.a = self.b = None |
| 215 self.autojunk = autojunk |
214 self.set_seqs(a, b) | 216 self.set_seqs(a, b) |
215 | 217 |
216 def set_seqs(self, a, b): | 218 def set_seqs(self, a, b): |
217 """Set the two sequences to be compared. | 219 """Set the two sequences to be compared. |
218 | 220 |
219 >>> s = SequenceMatcher() | 221 >>> s = SequenceMatcher() |
220 >>> s.set_seqs("abcd", "bcde") | 222 >>> s.set_seqs("abcd", "bcde") |
221 >>> s.ratio() | 223 >>> s.ratio() |
222 0.75 | 224 0.75 |
223 """ | 225 """ |
(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
280 self.__chain_b() | 282 self.__chain_b() |
281 | 283 |
282 # For each element x in b, set b2j[x] to a list of the indices in | 284 # For each element x in b, set b2j[x] to a list of the indices in |
283 # b where x appears; the indices are in increasing order; note that | 285 # b where x appears; the indices are in increasing order; note that |
284 # the number of times x appears in b is len(b2j[x]) ... | 286 # the number of times x appears in b is len(b2j[x]) ... |
285 # when self.isjunk is defined, junk elements don't show up in this | 287 # when self.isjunk is defined, junk elements don't show up in this |
286 # map at all, which stops the central find_longest_match method | 288 # map at all, which stops the central find_longest_match method |
287 # from starting any matching block at a junk element ... | 289 # from starting any matching block at a junk element ... |
288 # also creates the fast isbjunk function ... | 290 # also creates the fast isbjunk function ... |
289 # b2j also does not contain entries for "popular" elements, meaning | 291 # b2j also does not contain entries for "popular" elements, meaning |
290 # elements that account for more than 1% of the total elements, and | 292 # elements that account for more than 1 + 1% of the total elements, and |
291 # when the sequence is reasonably large (>= 200 elements); this can | 293 # when the sequence is reasonably large (>= 200 elements); this can |
292 # be viewed as an adaptive notion of semi-junk, and yields an enormous | 294 # be viewed as an adaptive notion of semi-junk, and yields an enormous |
293 # speedup when, e.g., comparing program files with hundreds of | 295 # speedup when, e.g., comparing program files with hundreds of |
294 # instances of "return NULL;" ... | 296 # instances of "return NULL;" ... |
295 # note that this is only called when b changes; so for cross-product | 297 # note that this is only called when b changes; so for cross-product |
296 # kinds of matches, it's best to call set_seq2 once, then set_seq1 | 298 # kinds of matches, it's best to call set_seq2 once, then set_seq1 |
297 # repeatedly | 299 # repeatedly |
298 | 300 |
299 def __chain_b(self): | 301 def __chain_b(self): |
300 # Because isjunk is a user-defined (not C) function, and we test | 302 # Because isjunk is a user-defined (not C) function, and we test |
301 # for junk a LOT, it's important to minimize the number of calls. | 303 # for junk a LOT, it's important to minimize the number of calls. |
302 # Before the tricks described here, __chain_b was by far the most | 304 # Before the tricks described here, __chain_b was by far the most |
303 # time-consuming routine in the whole module! If anyone sees | 305 # time-consuming routine in the whole module! If anyone sees |
304 # Jim Roskind, thank him again for profile.py -- I never would | 306 # Jim Roskind, thank him again for profile.py -- I never would |
305 # have guessed that. | 307 # have guessed that. |
306 # The first trick is to build b2j ignoring the possibility | 308 # The first trick is to build b2j ignoring the possibility |
307 # of junk. I.e., we don't call isjunk at all yet. Throwing | 309 # of junk. I.e., we don't call isjunk at all yet. Throwing |
308 # out the junk later is much cheaper than building b2j "right" | 310 # out the junk later is much cheaper than building b2j "right" |
309 # from the start. | 311 # from the start. |
310 b = self.b | 312 b = self.b |
| 313 self.b2j = b2j = {} |
| 314 |
| 315 for i, elt in enumerate(b): |
| 316 indices = b2j.setdefault(elt, []) |
| 317 indices.append(i) |
| 318 |
| 319 # Purge junk elements |
| 320 self.bjunk = junk = set() |
| 321 isjunk = self.isjunk |
| 322 if isjunk: |
| 323 for elt in b2j.keys(): |
| 324 if isjunk(elt): |
| 325 junk.add(elt) |
| 326 for elt in junk: # separate loop avoids separate list of keys |
| 327 del b2j[elt] |
| 328 |
| 329 # Purge popular elements that are not junk |
| 330 self.bpopular = popular = set() |
311 n = len(b) | 331 n = len(b) |
312 self.b2j = b2j = {} | 332 if self.autojunk and n >= 200: |
313 populardict = {} | 333 ntest = n // 100 + 1 |
314 for i, elt in enumerate(b): | 334 for elt, idxs in b2j.items(): |
315 if elt in b2j: | 335 if len(idxs) > ntest: |
316 indices = b2j[elt] | 336 popular.add(elt) |
317 if n >= 200 and len(indices) * 100 > n: | 337 for elt in popular: # ditto; as fast for 1% deletion |
318 populardict[elt] = 1 | 338 del b2j[elt] |
319 del indices[:] | 339 |
320 else: | 340 def isbjunk(self, item): |
321 indices.append(i) | 341 "Deprecated; use 'item in SequenceMatcher().bjunk'." |
322 else: | 342 warnings.warn("'SequenceMatcher().isbjunk(item)' is deprecated;\n" |
323 b2j[elt] = [i] | 343 "use 'item in SMinstance.bjunk' instead.", |
324 | 344 DeprecationWarning, 2) |
325 # Purge leftover indices for popular elements. | 345 return item in self.bjunk |
326 for elt in populardict: | 346 |
327 del b2j[elt] | 347 def isbpopular(self, item): |
328 | 348 "Deprecated; use 'item in SequenceMatcher().bpopular'." |
329 # Now b2j.keys() contains elements uniquely, and especially when | 349 warnings.warn("'SequenceMatcher().isbpopular(item)' is deprecated;\n" |
330 # the sequence is a string, that's usually a good deal smaller | 350 "use 'item in SMinstance.bpopular' instead.", |
331 # than len(string). The difference is the number of isjunk calls | 351 DeprecationWarning, 2) |
332 # saved. | 352 return item in self.bpopular |
333 isjunk = self.isjunk | |
334 junkdict = {} | |
335 if isjunk: | |
336 for d in populardict, b2j: | |
337 for elt in list(d.keys()): | |
338 if isjunk(elt): | |
339 junkdict[elt] = 1 | |
340 del d[elt] | |
341 | |
342 # Now for x in b, isjunk(x) == x in junkdict, but the | |
343 # latter is much faster. Note too that while there may be a | |
344 # lot of junk in the sequence, the number of *unique* junk | |
345 # elements is probably small. So the memory burden of keeping | |
346 # this dict alive is likely trivial compared to the size of b2j. | |
347 self.isbjunk = junkdict.__contains__ | |
348 self.isbpopular = populardict.__contains__ | |
349 | 353 |
350 def find_longest_match(self, alo, ahi, blo, bhi): | 354 def find_longest_match(self, alo, ahi, blo, bhi): |
351 """Find longest matching block in a[alo:ahi] and b[blo:bhi]. | 355 """Find longest matching block in a[alo:ahi] and b[blo:bhi]. |
352 | 356 |
353 If isjunk is not defined: | 357 If isjunk is not defined: |
354 | 358 |
355 Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where | 359 Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where |
356 alo <= i <= i+k <= ahi | 360 alo <= i <= i+k <= ahi |
357 blo <= j <= j+k <= bhi | 361 blo <= j <= j+k <= bhi |
358 and for all (i',j',k') meeting those conditions, | 362 and for all (i',j',k') meeting those conditions, |
(...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
396 # ab | 400 # ab |
397 # acab | 401 # acab |
398 # Longest matching block is "ab", but if common prefix is | 402 # Longest matching block is "ab", but if common prefix is |
399 # stripped, it's "a" (tied with "b"). UNIX(tm) diff does so | 403 # stripped, it's "a" (tied with "b"). UNIX(tm) diff does so |
400 # strip, so ends up claiming that ab is changed to acab by | 404 # strip, so ends up claiming that ab is changed to acab by |
401 # inserting "ca" in the middle. That's minimal but unintuitive: | 405 # inserting "ca" in the middle. That's minimal but unintuitive: |
402 # "it's obvious" that someone inserted "ac" at the front. | 406 # "it's obvious" that someone inserted "ac" at the front. |
403 # Windiff ends up at the same place as diff, but by pairing up | 407 # Windiff ends up at the same place as diff, but by pairing up |
404 # the unique 'b's and then matching the first two 'a's. | 408 # the unique 'b's and then matching the first two 'a's. |
405 | 409 |
406 a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk | 410 a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.bjunk.__contains__ |
407 besti, bestj, bestsize = alo, blo, 0 | 411 besti, bestj, bestsize = alo, blo, 0 |
408 # find longest junk-free match | 412 # find longest junk-free match |
409 # during an iteration of the loop, j2len[j] = length of longest | 413 # during an iteration of the loop, j2len[j] = length of longest |
410 # junk-free match ending with a[i-1] and b[j] | 414 # junk-free match ending with a[i-1] and b[j] |
411 j2len = {} | 415 j2len = {} |
412 nothing = [] | 416 nothing = [] |
413 for i in range(alo, ahi): | 417 for i in range(alo, ahi): |
414 # look at all instances of a[i] in b; note that because | 418 # look at all instances of a[i] in b; note that because |
415 # b2j has no junk keys, the loop is skipped if a[i] is junk | 419 # b2j has no junk keys, the loop is skipped if a[i] is junk |
416 j2lenget = j2len.get | 420 j2lenget = j2len.get |
(...skipping 1621 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2038 for line in delta: | 2042 for line in delta: |
2039 if line[:2] in prefixes: | 2043 if line[:2] in prefixes: |
2040 yield line[2:] | 2044 yield line[2:] |
2041 | 2045 |
2042 def _test(): | 2046 def _test(): |
2043 import doctest, difflib | 2047 import doctest, difflib |
2044 return doctest.testmod(difflib) | 2048 return doctest.testmod(difflib) |
2045 | 2049 |
2046 if __name__ == "__main__": | 2050 if __name__ == "__main__": |
2047 _test() | 2051 _test() |
LEFT | RIGHT |