Package translate :: Package misc :: Module diff_match_patch
[hide private]
[frames] | no frames]

Source Code for Module translate.misc.diff_match_patch

   1  #!/usr/bin/python2.4 
   2   
   3  """Diff Match and Patch 
   4   
   5  Copyright 2006 Google Inc. 
   6  http://code.google.com/p/google-diff-match-patch/ 
   7   
   8  Licensed under the Apache License, Version 2.0 (the "License"); 
   9  you may not use this file except in compliance with the License. 
  10  You may obtain a copy of the License at 
  11   
  12    http://www.apache.org/licenses/LICENSE-2.0 
  13   
  14  Unless required by applicable law or agreed to in writing, software 
  15  distributed under the License is distributed on an "AS IS" BASIS, 
  16  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
  17  See the License for the specific language governing permissions and 
  18  limitations under the License. 
  19  """ 
  20   
  21  """Functions for diff, match and patch. 
  22   
  23  Computes the difference between two texts to create a patch. 
  24  Applies the patch onto another text, allowing for errors. 
  25  """ 
  26   
  27  __author__ = 'fraser@google.com (Neil Fraser)' 
  28   
  29  import math 
  30  import time 
  31  import urllib 
  32  import re 
  33   
34 -class diff_match_patch:
35 """Class containing the diff, match and patch methods. 36 37 Also contains the behaviour settings. 38 """ 39
40 - def __init__(self):
41 """Inits a diff_match_patch object with default settings. 42 Redefine these in your program to override the defaults. 43 """ 44 45 # Number of seconds to map a diff before giving up (0 for infinity). 46 self.Diff_Timeout = 1.0 47 # Cost of an empty edit operation in terms of edit characters. 48 self.Diff_EditCost = 4 49 # The size beyond which the double-ended diff activates. 50 # Double-ending is twice as fast, but less accurate. 51 self.Diff_DualThreshold = 32 52 # At what point is no match declared (0.0 = perfection, 1.0 = very loose). 53 self.Match_Threshold = 0.5 54 # How far to search for a match (0 = exact location, 1000+ = broad match). 55 # A match this many characters away from the expected location will add 56 # 1.0 to the score (0.0 is a perfect match). 57 self.Match_Distance = 1000 58 # When deleting a large block of text (over ~64 characters), how close does 59 # the contents have to match the expected contents. (0.0 = perfection, 60 # 1.0 = very loose). Note that Match_Threshold controls how closely the 61 # end points of a delete need to match. 62 self.Patch_DeleteThreshold = 0.5 63 # Chunk size for context length. 64 self.Patch_Margin = 4 65 66 # How many bits in a number? 67 # Python has no maximum, thus to disable patch splitting set to 0. 68 # However to avoid long patches in certain pathological cases, use 32. 69 # Multiple short patches (using native ints) are much faster than long ones. 70 self.Match_MaxBits = 32
71 72 # DIFF FUNCTIONS 73 74 # The data structure representing a diff is an array of tuples: 75 # [(DIFF_DELETE, "Hello"), (DIFF_INSERT, "Goodbye"), (DIFF_EQUAL, " world.")] 76 # which means: delete "Hello", add "Goodbye" and keep " world." 77 DIFF_DELETE = -1 78 DIFF_INSERT = 1 79 DIFF_EQUAL = 0 80
81 - def diff_main(self, text1, text2, checklines=True):
82 """Find the differences between two texts. Simplifies the problem by 83 stripping any common prefix or suffix off the texts before diffing. 84 85 Args: 86 text1: Old string to be diffed. 87 text2: New string to be diffed. 88 checklines: Optional speedup flag. If present and false, then don't run 89 a line-level diff first to identify the changed areas. 90 Defaults to true, which does a faster, slightly less optimal diff. 91 92 Returns: 93 Array of changes. 94 """ 95 96 # Check for null inputs. 97 if text1 == None or text2 == None: 98 raise ValueError("Null inputs. (diff_main)") 99 100 # Check for equality (speedup). 101 if text1 == text2: 102 return [(self.DIFF_EQUAL, text1)] 103 104 # Trim off common prefix (speedup). 105 commonlength = self.diff_commonPrefix(text1, text2) 106 commonprefix = text1[:commonlength] 107 text1 = text1[commonlength:] 108 text2 = text2[commonlength:] 109 110 # Trim off common suffix (speedup). 111 commonlength = self.diff_commonSuffix(text1, text2) 112 if commonlength == 0: 113 commonsuffix = '' 114 else: 115 commonsuffix = text1[-commonlength:] 116 text1 = text1[:-commonlength] 117 text2 = text2[:-commonlength] 118 119 # Compute the diff on the middle block. 120 diffs = self.diff_compute(text1, text2, checklines) 121 122 # Restore the prefix and suffix. 123 if commonprefix: 124 diffs[:0] = [(self.DIFF_EQUAL, commonprefix)] 125 if commonsuffix: 126 diffs.append((self.DIFF_EQUAL, commonsuffix)) 127 self.diff_cleanupMerge(diffs) 128 return diffs
129
130 - def diff_compute(self, text1, text2, checklines):
131 """Find the differences between two texts. Assumes that the texts do not 132 have any common prefix or suffix. 133 134 Args: 135 text1: Old string to be diffed. 136 text2: New string to be diffed. 137 checklines: Speedup flag. If false, then don't run a line-level diff 138 first to identify the changed areas. 139 If true, then run a faster, slightly less optimal diff. 140 141 Returns: 142 Array of changes. 143 """ 144 if not text1: 145 # Just add some text (speedup). 146 return [(self.DIFF_INSERT, text2)] 147 148 if not text2: 149 # Just delete some text (speedup). 150 return [(self.DIFF_DELETE, text1)] 151 152 if len(text1) > len(text2): 153 (longtext, shorttext) = (text1, text2) 154 else: 155 (shorttext, longtext) = (text1, text2) 156 i = longtext.find(shorttext) 157 if i != -1: 158 # Shorter text is inside the longer text (speedup). 159 diffs = [(self.DIFF_INSERT, longtext[:i]), (self.DIFF_EQUAL, shorttext), 160 (self.DIFF_INSERT, longtext[i + len(shorttext):])] 161 # Swap insertions for deletions if diff is reversed. 162 if len(text1) > len(text2): 163 diffs[0] = (self.DIFF_DELETE, diffs[0][1]) 164 diffs[2] = (self.DIFF_DELETE, diffs[2][1]) 165 return diffs 166 longtext = shorttext = None # Garbage collect. 167 168 # Check to see if the problem can be split in two. 169 hm = self.diff_halfMatch(text1, text2) 170 if hm: 171 # A half-match was found, sort out the return data. 172 (text1_a, text1_b, text2_a, text2_b, mid_common) = hm 173 # Send both pairs off for separate processing. 174 diffs_a = self.diff_main(text1_a, text2_a, checklines) 175 diffs_b = self.diff_main(text1_b, text2_b, checklines) 176 # Merge the results. 177 return diffs_a + [(self.DIFF_EQUAL, mid_common)] + diffs_b 178 179 # Perform a real diff. 180 if checklines and (len(text1) < 100 or len(text2) < 100): 181 checklines = False # Too trivial for the overhead. 182 if checklines: 183 # Scan the text on a line-by-line basis first. 184 (text1, text2, linearray) = self.diff_linesToChars(text1, text2) 185 186 diffs = self.diff_map(text1, text2) 187 if not diffs: # No acceptable result. 188 diffs = [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)] 189 if checklines: 190 # Convert the diff back to original text. 191 self.diff_charsToLines(diffs, linearray) 192 # Eliminate freak matches (e.g. blank lines) 193 self.diff_cleanupSemantic(diffs) 194 195 # Rediff any replacement blocks, this time character-by-character. 196 # Add a dummy entry at the end. 197 diffs.append((self.DIFF_EQUAL, '')) 198 pointer = 0 199 count_delete = 0 200 count_insert = 0 201 text_delete = '' 202 text_insert = '' 203 while pointer < len(diffs): 204 if diffs[pointer][0] == self.DIFF_INSERT: 205 count_insert += 1 206 text_insert += diffs[pointer][1] 207 elif diffs[pointer][0] == self.DIFF_DELETE: 208 count_delete += 1 209 text_delete += diffs[pointer][1] 210 elif diffs[pointer][0] == self.DIFF_EQUAL: 211 # Upon reaching an equality, check for prior redundancies. 212 if count_delete >= 1 and count_insert >= 1: 213 # Delete the offending records and add the merged ones. 214 a = self.diff_main(text_delete, text_insert, False) 215 diffs[pointer - count_delete - count_insert : pointer] = a 216 pointer = pointer - count_delete - count_insert + len(a) 217 count_insert = 0 218 count_delete = 0 219 text_delete = '' 220 text_insert = '' 221 222 pointer += 1 223 224 diffs.pop() # Remove the dummy entry at the end. 225 return diffs
226
227 - def diff_linesToChars(self, text1, text2):
228 """Split two texts into an array of strings. Reduce the texts to a string 229 of hashes where each Unicode character represents one line. 230 231 Args: 232 text1: First string. 233 text2: Second string. 234 235 Returns: 236 Three element tuple, containing the encoded text1, the encoded text2 and 237 the array of unique strings. The zeroth element of the array of unique 238 strings is intentionally blank. 239 """ 240 lineArray = [] # e.g. lineArray[4] == "Hello\n" 241 lineHash = {} # e.g. lineHash["Hello\n"] == 4 242 243 # "\x00" is a valid character, but various debuggers don't like it. 244 # So we'll insert a junk entry to avoid generating a null character. 245 lineArray.append('') 246 247 def diff_linesToCharsMunge(text): 248 """Split a text into an array of strings. Reduce the texts to a string 249 of hashes where each Unicode character represents one line. 250 Modifies linearray and linehash through being a closure. 251 252 Args: 253 text: String to encode. 254 255 Returns: 256 Encoded string. 257 """ 258 chars = [] 259 # Walk the text, pulling out a substring for each line. 260 # text.split('\n') would would temporarily double our memory footprint. 261 # Modifying text would create many large strings to garbage collect. 262 lineStart = 0 263 lineEnd = -1 264 while lineEnd < len(text) - 1: 265 lineEnd = text.find('\n', lineStart) 266 if lineEnd == -1: 267 lineEnd = len(text) - 1 268 line = text[lineStart:lineEnd + 1] 269 lineStart = lineEnd + 1 270 271 if line in lineHash: 272 chars.append(unichr(lineHash[line])) 273 else: 274 lineArray.append(line) 275 lineHash[line] = len(lineArray) - 1 276 chars.append(unichr(len(lineArray) - 1)) 277 return "".join(chars)
278 279 chars1 = diff_linesToCharsMunge(text1) 280 chars2 = diff_linesToCharsMunge(text2) 281 return (chars1, chars2, lineArray)
282
283 - def diff_charsToLines(self, diffs, lineArray):
284 """Rehydrate the text in a diff from a string of line hashes to real lines 285 of text. 286 287 Args: 288 diffs: Array of diff tuples. 289 lineArray: Array of unique strings. 290 """ 291 for x in xrange(len(diffs)): 292 text = [] 293 for char in diffs[x][1]: 294 text.append(lineArray[ord(char)]) 295 diffs[x] = (diffs[x][0], "".join(text))
296
297 - def diff_map(self, text1, text2):
298 """Explore the intersection points between the two texts. 299 300 Args: 301 text1: Old string to be diffed. 302 text2: New string to be diffed. 303 304 Returns: 305 Array of diff tuples or None if no diff available. 306 """ 307 308 # Unlike in most languages, Python counts time in seconds. 309 s_end = time.time() + self.Diff_Timeout # Don't run for too long. 310 # Cache the text lengths to prevent multiple calls. 311 text1_length = len(text1) 312 text2_length = len(text2) 313 max_d = text1_length + text2_length - 1 314 doubleEnd = self.Diff_DualThreshold * 2 < max_d 315 # Python efficiency note: (x << 32) + y is the fastest way to combine 316 # x and y into a single hashable value. Tested in Python 2.5. 317 # It is unclear why it is faster for v_map[d] to be indexed with an 318 # integer whereas footsteps is indexed with a string. 319 v_map1 = [] 320 v_map2 = [] 321 v1 = {} 322 v2 = {} 323 v1[1] = 0 324 v2[1] = 0 325 footsteps = {} 326 done = False 327 # If the total number of characters is odd, then the front path will 328 # collide with the reverse path. 329 front = (text1_length + text2_length) % 2 330 for d in xrange(max_d): 331 # Bail out if timeout reached. 332 if self.Diff_Timeout > 0 and time.time() > s_end: 333 return None 334 335 # Walk the front path one step. 336 v_map1.append({}) 337 for k in xrange(-d, d + 1, 2): 338 if k == -d or k != d and v1[k - 1] < v1[k + 1]: 339 x = v1[k + 1] 340 else: 341 x = v1[k - 1] + 1 342 y = x - k 343 if doubleEnd: 344 footstep = str((x << 32) + y) 345 if front and footstep in footsteps: 346 done = True 347 if not front: 348 footsteps[footstep] = d 349 350 while (not done and x < text1_length and y < text2_length and 351 text1[x] == text2[y]): 352 x += 1 353 y += 1 354 if doubleEnd: 355 footstep = str((x << 32) + y) 356 if front and footstep in footsteps: 357 done = True 358 if not front: 359 footsteps[footstep] = d 360 361 v1[k] = x 362 v_map1[d][(x << 32) + y] = True 363 if x == text1_length and y == text2_length: 364 # Reached the end in single-path mode. 365 return self.diff_path1(v_map1, text1, text2) 366 elif done: 367 # Front path ran over reverse path. 368 v_map2 = v_map2[:footsteps[footstep] + 1] 369 a = self.diff_path1(v_map1, text1[:x], text2[:y]) 370 b = self.diff_path2(v_map2, text1[x:], text2[y:]) 371 return a + b 372 373 if doubleEnd: 374 # Walk the reverse path one step. 375 v_map2.append({}) 376 for k in xrange(-d, d + 1, 2): 377 if k == -d or k != d and v2[k - 1] < v2[k + 1]: 378 x = v2[k + 1] 379 else: 380 x = v2[k - 1] + 1 381 y = x - k 382 footstep = str((text1_length - x << 32) + text2_length - y) 383 if not front and footstep in footsteps: 384 done = True 385 if front: 386 footsteps[footstep] = d 387 while (not done and x < text1_length and y < text2_length and 388 text1[-x - 1] == text2[-y - 1]): 389 x += 1 390 y += 1 391 footstep = str((text1_length - x << 32) + text2_length - y) 392 if not front and footstep in footsteps: 393 done = True 394 if front: 395 footsteps[footstep] = d 396 397 v2[k] = x 398 v_map2[d][(x << 32) + y] = True 399 if done: 400 # Reverse path ran over front path. 401 v_map1 = v_map1[:footsteps[footstep] + 1] 402 a = self.diff_path1(v_map1, text1[:text1_length - x], 403 text2[:text2_length - y]) 404 b = self.diff_path2(v_map2, text1[text1_length - x:], 405 text2[text2_length - y:]) 406 return a + b 407 408 # Number of diffs equals number of characters, no commonality at all. 409 return None
410
411 - def diff_path1(self, v_map, text1, text2):
412 """Work from the middle back to the start to determine the path. 413 414 Args: 415 v_map: Array of paths. 416 text1: Old string fragment to be diffed. 417 text2: New string fragment to be diffed. 418 419 Returns: 420 Array of diff tuples. 421 """ 422 path = [] 423 x = len(text1) 424 y = len(text2) 425 last_op = None 426 for d in xrange(len(v_map) - 2, -1, -1): 427 while True: 428 if (x - 1 << 32) + y in v_map[d]: 429 x -= 1 430 if last_op == self.DIFF_DELETE: 431 path[0] = (self.DIFF_DELETE, text1[x] + path[0][1]) 432 else: 433 path[:0] = [(self.DIFF_DELETE, text1[x])] 434 last_op = self.DIFF_DELETE 435 break 436 elif (x << 32) + y - 1 in v_map[d]: 437 y -= 1 438 if last_op == self.DIFF_INSERT: 439 path[0] = (self.DIFF_INSERT, text2[y] + path[0][1]) 440 else: 441 path[:0] = [(self.DIFF_INSERT, text2[y])] 442 last_op = self.DIFF_INSERT 443 break 444 else: 445 x -= 1 446 y -= 1 447 assert text1[x] == text2[y], ("No diagonal. " + 448 "Can't happen. (diff_path1)") 449 if last_op == self.DIFF_EQUAL: 450 path[0] = (self.DIFF_EQUAL, text1[x] + path[0][1]) 451 else: 452 path[:0] = [(self.DIFF_EQUAL, text1[x])] 453 last_op = self.DIFF_EQUAL 454 return path
455
456 - def diff_path2(self, v_map, text1, text2):
457 """Work from the middle back to the end to determine the path. 458 459 Args: 460 v_map: Array of paths. 461 text1: Old string fragment to be diffed. 462 text2: New string fragment to be diffed. 463 464 Returns: 465 Array of diff tuples. 466 """ 467 path = [] 468 x = len(text1) 469 y = len(text2) 470 last_op = None 471 for d in xrange(len(v_map) - 2, -1, -1): 472 while True: 473 if (x - 1 << 32) + y in v_map[d]: 474 x -= 1 475 if last_op == self.DIFF_DELETE: 476 path[-1] = (self.DIFF_DELETE, path[-1][1] + text1[-x - 1]) 477 else: 478 path.append((self.DIFF_DELETE, text1[-x - 1])) 479 last_op = self.DIFF_DELETE 480 break 481 elif (x << 32) + y - 1 in v_map[d]: 482 y -= 1 483 if last_op == self.DIFF_INSERT: 484 path[-1] = (self.DIFF_INSERT, path[-1][1] + text2[-y - 1]) 485 else: 486 path.append((self.DIFF_INSERT, text2[-y - 1])) 487 last_op = self.DIFF_INSERT 488 break 489 else: 490 x -= 1 491 y -= 1 492 assert text1[-x - 1] == text2[-y - 1], ("No diagonal. " + 493 "Can't happen. (diff_path2)") 494 if last_op == self.DIFF_EQUAL: 495 path[-1] = (self.DIFF_EQUAL, path[-1][1] + text1[-x - 1]) 496 else: 497 path.append((self.DIFF_EQUAL, text1[-x - 1])) 498 last_op = self.DIFF_EQUAL 499 return path
500
501 - def diff_commonPrefix(self, text1, text2):
502 """Determine the common prefix of two strings. 503 504 Args: 505 text1: First string. 506 text2: Second string. 507 508 Returns: 509 The number of characters common to the start of each string. 510 """ 511 # Quick check for common null cases. 512 if not text1 or not text2 or text1[0] != text2[0]: 513 return 0 514 # Binary search. 515 # Performance analysis: http://neil.fraser.name/news/2007/10/09/ 516 pointermin = 0 517 pointermax = min(len(text1), len(text2)) 518 pointermid = pointermax 519 pointerstart = 0 520 while pointermin < pointermid: 521 if text1[pointerstart:pointermid] == text2[pointerstart:pointermid]: 522 pointermin = pointermid 523 pointerstart = pointermin 524 else: 525 pointermax = pointermid 526 pointermid = int((pointermax - pointermin) / 2 + pointermin) 527 return pointermid
528
529 - def diff_commonSuffix(self, text1, text2):
530 """Determine the common suffix of two strings. 531 532 Args: 533 text1: First string. 534 text2: Second string. 535 536 Returns: 537 The number of characters common to the end of each string. 538 """ 539 # Quick check for common null cases. 540 if not text1 or not text2 or text1[-1] != text2[-1]: 541 return 0 542 # Binary search. 543 # Performance analysis: http://neil.fraser.name/news/2007/10/09/ 544 pointermin = 0 545 pointermax = min(len(text1), len(text2)) 546 pointermid = pointermax 547 pointerend = 0 548 while pointermin < pointermid: 549 if (text1[-pointermid:len(text1) - pointerend] == 550 text2[-pointermid:len(text2) - pointerend]): 551 pointermin = pointermid 552 pointerend = pointermin 553 else: 554 pointermax = pointermid 555 pointermid = int((pointermax - pointermin) / 2 + pointermin) 556 return pointermid
557
558 - def diff_halfMatch(self, text1, text2):
559 """Do the two texts share a substring which is at least half the length of 560 the longer text? 561 562 Args: 563 text1: First string. 564 text2: Second string. 565 566 Returns: 567 Five element Array, containing the prefix of text1, the suffix of text1, 568 the prefix of text2, the suffix of text2 and the common middle. Or None 569 if there was no match. 570 """ 571 if len(text1) > len(text2): 572 (longtext, shorttext) = (text1, text2) 573 else: 574 (shorttext, longtext) = (text1, text2) 575 if len(longtext) < 10 or len(shorttext) < 1: 576 return None # Pointless. 577 578 def diff_halfMatchI(longtext, shorttext, i): 579 """Does a substring of shorttext exist within longtext such that the 580 substring is at least half the length of longtext? 581 Closure, but does not reference any external variables. 582 583 Args: 584 longtext: Longer string. 585 shorttext: Shorter string. 586 i: Start index of quarter length substring within longtext. 587 588 Returns: 589 Five element Array, containing the prefix of longtext, the suffix of 590 longtext, the prefix of shorttext, the suffix of shorttext and the 591 common middle. Or None if there was no match. 592 """ 593 seed = longtext[i:i + len(longtext) / 4] 594 best_common = '' 595 j = shorttext.find(seed) 596 while j != -1: 597 prefixLength = self.diff_commonPrefix(longtext[i:], shorttext[j:]) 598 suffixLength = self.diff_commonSuffix(longtext[:i], shorttext[:j]) 599 if len(best_common) < suffixLength + prefixLength: 600 best_common = (shorttext[j - suffixLength:j] + 601 shorttext[j:j + prefixLength]) 602 best_longtext_a = longtext[:i - suffixLength] 603 best_longtext_b = longtext[i + prefixLength:] 604 best_shorttext_a = shorttext[:j - suffixLength] 605 best_shorttext_b = shorttext[j + prefixLength:] 606 j = shorttext.find(seed, j + 1) 607 608 if len(best_common) >= len(longtext) / 2: 609 return (best_longtext_a, best_longtext_b, 610 best_shorttext_a, best_shorttext_b, best_common) 611 else: 612 return None
613 614 # First check if the second quarter is the seed for a half-match. 615 hm1 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 3) / 4) 616 # Check again based on the third quarter. 617 hm2 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 1) / 2) 618 if not hm1 and not hm2: 619 return None 620 elif not hm2: 621 hm = hm1 622 elif not hm1: 623 hm = hm2 624 else: 625 # Both matched. Select the longest. 626 if len(hm1[4]) > len(hm2[4]): 627 hm = hm1 628 else: 629 hm = hm2 630 631 # A half-match was found, sort out the return data. 632 if len(text1) > len(text2): 633 (text1_a, text1_b, text2_a, text2_b, mid_common) = hm 634 else: 635 (text2_a, text2_b, text1_a, text1_b, mid_common) = hm 636 return (text1_a, text1_b, text2_a, text2_b, mid_common) 637
638 - def diff_cleanupSemantic(self, diffs):
639 """Reduce the number of edits by eliminating semantically trivial 640 equalities. 641 642 Args: 643 diffs: Array of diff tuples. 644 """ 645 changes = False 646 equalities = [] # Stack of indices where equalities are found. 647 lastequality = None # Always equal to equalities[-1][1] 648 pointer = 0 # Index of current position. 649 length_changes1 = 0 # Number of chars that changed prior to the equality. 650 length_changes2 = 0 # Number of chars that changed after the equality. 651 while pointer < len(diffs): 652 if diffs[pointer][0] == self.DIFF_EQUAL: # equality found 653 equalities.append(pointer) 654 length_changes1 = length_changes2 655 length_changes2 = 0 656 lastequality = diffs[pointer][1] 657 else: # an insertion or deletion 658 length_changes2 += len(diffs[pointer][1]) 659 if (lastequality != None and (len(lastequality) <= length_changes1) and 660 (len(lastequality) <= length_changes2)): 661 # Duplicate record 662 diffs.insert(equalities[-1], (self.DIFF_DELETE, lastequality)) 663 # Change second copy to insert. 664 diffs[equalities[-1] + 1] = (self.DIFF_INSERT, 665 diffs[equalities[-1] + 1][1]) 666 # Throw away the equality we just deleted. 667 equalities.pop() 668 # Throw away the previous equality (it needs to be reevaluated). 669 if len(equalities) != 0: 670 equalities.pop() 671 if len(equalities): 672 pointer = equalities[-1] 673 else: 674 pointer = -1 675 length_changes1 = 0 # Reset the counters. 676 length_changes2 = 0 677 lastequality = None 678 changes = True 679 pointer += 1 680 681 if changes: 682 self.diff_cleanupMerge(diffs) 683 684 self.diff_cleanupSemanticLossless(diffs)
685
686 - def diff_cleanupSemanticLossless(self, diffs):
687 """Look for single edits surrounded on both sides by equalities 688 which can be shifted sideways to align the edit to a word boundary. 689 e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came. 690 691 Args: 692 diffs: Array of diff tuples. 693 """ 694 695 def diff_cleanupSemanticScore(one, two): 696 """Given two strings, compute a score representing whether the 697 internal boundary falls on logical boundaries. 698 Scores range from 5 (best) to 0 (worst). 699 Closure, but does not reference any external variables. 700 701 Args: 702 one: First string. 703 two: Second string. 704 705 Returns: 706 The score. 707 """ 708 if not one or not two: 709 # Edges are the best. 710 return 5 711 712 # Each port of this function behaves slightly differently due to 713 # subtle differences in each language's definition of things like 714 # 'whitespace'. Since this function's purpose is largely cosmetic, 715 # the choice has been made to use each language's native features 716 # rather than force total conformity. 717 score = 0 718 # One point for non-alphanumeric. 719 if not one[-1].isalnum() or not two[0].isalnum(): 720 score += 1 721 # Two points for whitespace. 722 if one[-1].isspace() or two[0].isspace(): 723 score += 1 724 # Three points for line breaks. 725 if (one[-1] == "\r" or one[-1] == "\n" or 726 two[0] == "\r" or two[0] == "\n"): 727 score += 1 728 # Four points for blank lines. 729 if (re.search("\\n\\r?\\n$", one) or 730 re.match("^\\r?\\n\\r?\\n", two)): 731 score += 1 732 return score
733 734 pointer = 1 735 # Intentionally ignore the first and last element (don't need checking). 736 while pointer < len(diffs) - 1: 737 if (diffs[pointer - 1][0] == self.DIFF_EQUAL and 738 diffs[pointer + 1][0] == self.DIFF_EQUAL): 739 # This is a single edit surrounded by equalities. 740 equality1 = diffs[pointer - 1][1] 741 edit = diffs[pointer][1] 742 equality2 = diffs[pointer + 1][1] 743 744 # First, shift the edit as far left as possible. 745 commonOffset = self.diff_commonSuffix(equality1, edit) 746 if commonOffset: 747 commonString = edit[-commonOffset:] 748 equality1 = equality1[:-commonOffset] 749 edit = commonString + edit[:-commonOffset] 750 equality2 = commonString + equality2 751 752 # Second, step character by character right, looking for the best fit. 753 bestEquality1 = equality1 754 bestEdit = edit 755 bestEquality2 = equality2 756 bestScore = (diff_cleanupSemanticScore(equality1, edit) + 757 diff_cleanupSemanticScore(edit, equality2)) 758 while edit and equality2 and edit[0] == equality2[0]: 759 equality1 += edit[0] 760 edit = edit[1:] + equality2[0] 761 equality2 = equality2[1:] 762 score = (diff_cleanupSemanticScore(equality1, edit) + 763 diff_cleanupSemanticScore(edit, equality2)) 764 # The >= encourages trailing rather than leading whitespace on edits. 765 if score >= bestScore: 766 bestScore = score 767 bestEquality1 = equality1 768 bestEdit = edit 769 bestEquality2 = equality2 770 771 if diffs[pointer - 1][1] != bestEquality1: 772 # We have an improvement, save it back to the diff. 773 if bestEquality1: 774 diffs[pointer - 1] = (diffs[pointer - 1][0], bestEquality1) 775 else: 776 del diffs[pointer - 1] 777 pointer -= 1 778 diffs[pointer] = (diffs[pointer][0], bestEdit) 779 if bestEquality2: 780 diffs[pointer + 1] = (diffs[pointer + 1][0], bestEquality2) 781 else: 782 del diffs[pointer + 1] 783 pointer -= 1 784 pointer += 1 785
786 - def diff_cleanupEfficiency(self, diffs):
787 """Reduce the number of edits by eliminating operationally trivial 788 equalities. 789 790 Args: 791 diffs: Array of diff tuples. 792 """ 793 changes = False 794 equalities = [] # Stack of indices where equalities are found. 795 lastequality = '' # Always equal to equalities[-1][1] 796 pointer = 0 # Index of current position. 797 pre_ins = False # Is there an insertion operation before the last equality. 798 pre_del = False # Is there a deletion operation before the last equality. 799 post_ins = False # Is there an insertion operation after the last equality. 800 post_del = False # Is there a deletion operation after the last equality. 801 while pointer < len(diffs): 802 if diffs[pointer][0] == self.DIFF_EQUAL: # equality found 803 if (len(diffs[pointer][1]) < self.Diff_EditCost and 804 (post_ins or post_del)): 805 # Candidate found. 806 equalities.append(pointer) 807 pre_ins = post_ins 808 pre_del = post_del 809 lastequality = diffs[pointer][1] 810 else: 811 # Not a candidate, and can never become one. 812 equalities = [] 813 lastequality = '' 814 815 post_ins = post_del = False 816 else: # an insertion or deletion 817 if diffs[pointer][0] == self.DIFF_DELETE: 818 post_del = True 819 else: 820 post_ins = True 821 822 # Five types to be split: 823 # <ins>A</ins><del>B</del>XY<ins>C</ins><del>D</del> 824 # <ins>A</ins>X<ins>C</ins><del>D</del> 825 # <ins>A</ins><del>B</del>X<ins>C</ins> 826 # <ins>A</del>X<ins>C</ins><del>D</del> 827 # <ins>A</ins><del>B</del>X<del>C</del> 828 829 if lastequality and ((pre_ins and pre_del and post_ins and post_del) or 830 ((len(lastequality) < self.Diff_EditCost / 2) and 831 (pre_ins + pre_del + post_ins + post_del) == 3)): 832 # Duplicate record 833 diffs.insert(equalities[-1], (self.DIFF_DELETE, lastequality)) 834 # Change second copy to insert. 835 diffs[equalities[-1] + 1] = (self.DIFF_INSERT, 836 diffs[equalities[-1] + 1][1]) 837 equalities.pop() # Throw away the equality we just deleted 838 lastequality = '' 839 if pre_ins and pre_del: 840 # No changes made which could affect previous entry, keep going. 841 post_ins = post_del = True 842 equalities = [] 843 else: 844 if len(equalities): 845 equalities.pop() # Throw away the previous equality 846 if len(equalities): 847 pointer = equalities[-1] 848 else: 849 pointer = -1 850 post_ins = post_del = False 851 changes = True 852 pointer += 1 853 854 if changes: 855 self.diff_cleanupMerge(diffs)
856
857 - def diff_cleanupMerge(self, diffs):
858 """Reorder and merge like edit sections. Merge equalities. 859 Any edit section can move as long as it doesn't cross an equality. 860 861 Args: 862 diffs: Array of diff tuples. 863 """ 864 diffs.append((self.DIFF_EQUAL, '')) # Add a dummy entry at the end. 865 pointer = 0 866 count_delete = 0 867 count_insert = 0 868 text_delete = '' 869 text_insert = '' 870 while pointer < len(diffs): 871 if diffs[pointer][0] == self.DIFF_INSERT: 872 count_insert += 1 873 text_insert += diffs[pointer][1] 874 pointer += 1 875 elif diffs[pointer][0] == self.DIFF_DELETE: 876 count_delete += 1 877 text_delete += diffs[pointer][1] 878 pointer += 1 879 elif diffs[pointer][0] == self.DIFF_EQUAL: 880 # Upon reaching an equality, check for prior redundancies. 881 if count_delete != 0 or count_insert != 0: 882 if count_delete != 0 and count_insert != 0: 883 # Factor out any common prefixies. 884 commonlength = self.diff_commonPrefix(text_insert, text_delete) 885 if commonlength != 0: 886 x = pointer - count_delete - count_insert - 1 887 if x >= 0 and diffs[x][0] == self.DIFF_EQUAL: 888 diffs[x] = (diffs[x][0], diffs[x][1] + 889 text_insert[:commonlength]) 890 else: 891 diffs.insert(0, (self.DIFF_EQUAL, text_insert[:commonlength])) 892 pointer += 1 893 text_insert = text_insert[commonlength:] 894 text_delete = text_delete[commonlength:] 895 # Factor out any common suffixies. 896 commonlength = self.diff_commonSuffix(text_insert, text_delete) 897 if commonlength != 0: 898 diffs[pointer] = (diffs[pointer][0], text_insert[-commonlength:] + 899 diffs[pointer][1]) 900 text_insert = text_insert[:-commonlength] 901 text_delete = text_delete[:-commonlength] 902 # Delete the offending records and add the merged ones. 903 if count_delete == 0: 904 diffs[pointer - count_insert : pointer] = [ 905 (self.DIFF_INSERT, text_insert)] 906 elif count_insert == 0: 907 diffs[pointer - count_delete : pointer] = [ 908 (self.DIFF_DELETE, text_delete)] 909 else: 910 diffs[pointer - count_delete - count_insert : pointer] = [ 911 (self.DIFF_DELETE, text_delete), 912 (self.DIFF_INSERT, text_insert)] 913 pointer = pointer - count_delete - count_insert + 1 914 if count_delete != 0: 915 pointer += 1 916 if count_insert != 0: 917 pointer += 1 918 elif pointer != 0 and diffs[pointer - 1][0] == self.DIFF_EQUAL: 919 # Merge this equality with the previous one. 920 diffs[pointer - 1] = (diffs[pointer - 1][0], 921 diffs[pointer - 1][1] + diffs[pointer][1]) 922 del diffs[pointer] 923 else: 924 pointer += 1 925 926 count_insert = 0 927 count_delete = 0 928 text_delete = '' 929 text_insert = '' 930 931 if diffs[-1][1] == '': 932 diffs.pop() # Remove the dummy entry at the end. 933 934 # Second pass: look for single edits surrounded on both sides by equalities 935 # which can be shifted sideways to eliminate an equality. 936 # e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC 937 changes = False 938 pointer = 1 939 # Intentionally ignore the first and last element (don't need checking). 940 while pointer < len(diffs) - 1: 941 if (diffs[pointer - 1][0] == self.DIFF_EQUAL and 942 diffs[pointer + 1][0] == self.DIFF_EQUAL): 943 # This is a single edit surrounded by equalities. 944 if diffs[pointer][1].endswith(diffs[pointer - 1][1]): 945 # Shift the edit over the previous equality. 946 diffs[pointer] = (diffs[pointer][0], 947 diffs[pointer - 1][1] + 948 diffs[pointer][1][:-len(diffs[pointer - 1][1])]) 949 diffs[pointer + 1] = (diffs[pointer + 1][0], 950 diffs[pointer - 1][1] + diffs[pointer + 1][1]) 951 del diffs[pointer - 1] 952 changes = True 953 elif diffs[pointer][1].startswith(diffs[pointer + 1][1]): 954 # Shift the edit over the next equality. 955 diffs[pointer - 1] = (diffs[pointer - 1][0], 956 diffs[pointer - 1][1] + diffs[pointer + 1][1]) 957 diffs[pointer] = (diffs[pointer][0], 958 diffs[pointer][1][len(diffs[pointer + 1][1]):] + 959 diffs[pointer + 1][1]) 960 del diffs[pointer + 1] 961 changes = True 962 pointer += 1 963 964 # If shifts were made, the diff needs reordering and another shift sweep. 965 if changes: 966 self.diff_cleanupMerge(diffs)
967
968 - def diff_xIndex(self, diffs, loc):
969 """loc is a location in text1, compute and return the equivalent location 970 in text2. e.g. "The cat" vs "The big cat", 1->1, 5->8 971 972 Args: 973 diffs: Array of diff tuples. 974 loc: Location within text1. 975 976 Returns: 977 Location within text2. 978 """ 979 chars1 = 0 980 chars2 = 0 981 last_chars1 = 0 982 last_chars2 = 0 983 for x in xrange(len(diffs)): 984 (op, text) = diffs[x] 985 if op != self.DIFF_INSERT: # Equality or deletion. 986 chars1 += len(text) 987 if op != self.DIFF_DELETE: # Equality or insertion. 988 chars2 += len(text) 989 if chars1 > loc: # Overshot the location. 990 break 991 last_chars1 = chars1 992 last_chars2 = chars2 993 994 if len(diffs) != x and diffs[x][0] == self.DIFF_DELETE: 995 # The location was deleted. 996 return last_chars2 997 # Add the remaining len(character). 998 return last_chars2 + (loc - last_chars1)
999
1000 - def diff_prettyHtml(self, diffs):
1001 """Convert a diff array into a pretty HTML report. 1002 1003 Args: 1004 diffs: Array of diff tuples. 1005 1006 Returns: 1007 HTML representation. 1008 """ 1009 html = [] 1010 i = 0 1011 for (op, data) in diffs: 1012 text = (data.replace("&", "&amp;").replace("<", "&lt;") 1013 .replace(">", "&gt;").replace("\n", "&para;<BR>")) 1014 if op == self.DIFF_INSERT: 1015 html.append("<INS STYLE=\"background:#E6FFE6;\" TITLE=\"i=%i\">%s</INS>" 1016 % (i, text)) 1017 elif op == self.DIFF_DELETE: 1018 html.append("<DEL STYLE=\"background:#FFE6E6;\" TITLE=\"i=%i\">%s</DEL>" 1019 % (i, text)) 1020 elif op == self.DIFF_EQUAL: 1021 html.append("<SPAN TITLE=\"i=%i\">%s</SPAN>" % (i, text)) 1022 if op != self.DIFF_DELETE: 1023 i += len(data) 1024 return "".join(html)
1025
1026 - def diff_text1(self, diffs):
1027 """Compute and return the source text (all equalities and deletions). 1028 1029 Args: 1030 diffs: Array of diff tuples. 1031 1032 Returns: 1033 Source text. 1034 """ 1035 text = [] 1036 for (op, data) in diffs: 1037 if op != self.DIFF_INSERT: 1038 text.append(data) 1039 return "".join(text)
1040
1041 - def diff_text2(self, diffs):
1042 """Compute and return the destination text (all equalities and insertions). 1043 1044 Args: 1045 diffs: Array of diff tuples. 1046 1047 Returns: 1048 Destination text. 1049 """ 1050 text = [] 1051 for (op, data) in diffs: 1052 if op != self.DIFF_DELETE: 1053 text.append(data) 1054 return "".join(text)
1055
1056 - def diff_levenshtein(self, diffs):
1057 """Compute the Levenshtein distance; the number of inserted, deleted or 1058 substituted characters. 1059 1060 Args: 1061 diffs: Array of diff tuples. 1062 1063 Returns: 1064 Number of changes. 1065 """ 1066 levenshtein = 0 1067 insertions = 0 1068 deletions = 0 1069 for (op, data) in diffs: 1070 if op == self.DIFF_INSERT: 1071 insertions += len(data) 1072 elif op == self.DIFF_DELETE: 1073 deletions += len(data) 1074 elif op == self.DIFF_EQUAL: 1075 # A deletion and an insertion is one substitution. 1076 levenshtein += max(insertions, deletions) 1077 insertions = 0 1078 deletions = 0 1079 levenshtein += max(insertions, deletions) 1080 return levenshtein
1081
1082 - def diff_toDelta(self, diffs):
1083 """Crush the diff into an encoded string which describes the operations 1084 required to transform text1 into text2. 1085 E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'. 1086 Operations are tab-separated. Inserted text is escaped using %xx notation. 1087 1088 Args: 1089 diffs: Array of diff tuples. 1090 1091 Returns: 1092 Delta text. 1093 """ 1094 text = [] 1095 for (op, data) in diffs: 1096 if op == self.DIFF_INSERT: 1097 # High ascii will raise UnicodeDecodeError. Use Unicode instead. 1098 data = data.encode("utf-8") 1099 text.append("+" + urllib.quote(data, "!~*'();/?:@&=+$,# ")) 1100 elif op == self.DIFF_DELETE: 1101 text.append("-%d" % len(data)) 1102 elif op == self.DIFF_EQUAL: 1103 text.append("=%d" % len(data)) 1104 return "\t".join(text)
1105
1106 - def diff_fromDelta(self, text1, delta):
1107 """Given the original text1, and an encoded string which describes the 1108 operations required to transform text1 into text2, compute the full diff. 1109 1110 Args: 1111 text1: Source string for the diff. 1112 delta: Delta text. 1113 1114 Returns: 1115 Array of diff tuples. 1116 1117 Raises: 1118 ValueError: If invalid input. 1119 """ 1120 if type(delta) == unicode: 1121 # Deltas should be composed of a subset of ascii chars, Unicode not 1122 # required. If this encode raises UnicodeEncodeError, delta is invalid. 1123 delta = delta.encode("ascii") 1124 diffs = [] 1125 pointer = 0 # Cursor in text1 1126 tokens = delta.split("\t") 1127 for token in tokens: 1128 if token == "": 1129 # Blank tokens are ok (from a trailing \t). 1130 continue 1131 # Each token begins with a one character parameter which specifies the 1132 # operation of this token (delete, insert, equality). 1133 param = token[1:] 1134 if token[0] == "+": 1135 param = urllib.unquote(param).decode("utf-8") 1136 diffs.append((self.DIFF_INSERT, param)) 1137 elif token[0] == "-" or token[0] == "=": 1138 try: 1139 n = int(param) 1140 except ValueError: 1141 raise ValueError("Invalid number in diff_fromDelta: " + param) 1142 if n < 0: 1143 raise ValueError("Negative number in diff_fromDelta: " + param) 1144 text = text1[pointer : pointer + n] 1145 pointer += n 1146 if token[0] == "=": 1147 diffs.append((self.DIFF_EQUAL, text)) 1148 else: 1149 diffs.append((self.DIFF_DELETE, text)) 1150 else: 1151 # Anything else is an error. 1152 raise ValueError("Invalid diff operation in diff_fromDelta: " + 1153 token[0]) 1154 if pointer != len(text1): 1155 raise ValueError( 1156 "Delta length (%d) does not equal source text length (%d)." % 1157 (pointer, len(text1))) 1158 return diffs
1159 1160 # MATCH FUNCTIONS 1161
1162 - def match_main(self, text, pattern, loc):
1163 """Locate the best instance of 'pattern' in 'text' near 'loc'. 1164 1165 Args: 1166 text: The text to search. 1167 pattern: The pattern to search for. 1168 loc: The location to search around. 1169 1170 Returns: 1171 Best match index or -1. 1172 """ 1173 # Check for null inputs. 1174 if text == None or pattern == None: 1175 raise ValueError("Null inputs. (match_main)") 1176 1177 loc = max(0, min(loc, len(text))) 1178 if text == pattern: 1179 # Shortcut (potentially not guaranteed by the algorithm) 1180 return 0 1181 elif not text: 1182 # Nothing to match. 1183 return -1 1184 elif text[loc:loc + len(pattern)] == pattern: 1185 # Perfect match at the perfect spot! (Includes case of null pattern) 1186 return loc 1187 else: 1188 # Do a fuzzy compare. 1189 match = self.match_bitap(text, pattern, loc) 1190 return match
1191
1192 - def match_bitap(self, text, pattern, loc):
1193 """Locate the best instance of 'pattern' in 'text' near 'loc' using the 1194 Bitap algorithm. 1195 1196 Args: 1197 text: The text to search. 1198 pattern: The pattern to search for. 1199 loc: The location to search around. 1200 1201 Returns: 1202 Best match index or -1. 1203 """ 1204 # Python doesn't have a maxint limit, so ignore this check. 1205 #if self.Match_MaxBits != 0 and len(pattern) > self.Match_MaxBits: 1206 # raise ValueError("Pattern too long for this application.") 1207 1208 # Initialise the alphabet. 1209 s = self.match_alphabet(pattern) 1210 1211 def match_bitapScore(e, x): 1212 """Compute and return the score for a match with e errors and x location. 1213 Accesses loc and pattern through being a closure. 1214 1215 Args: 1216 e: Number of errors in match. 1217 x: Location of match. 1218 1219 Returns: 1220 Overall score for match (0.0 = good, 1.0 = bad). 1221 """ 1222 accuracy = float(e) / len(pattern) 1223 proximity = abs(loc - x) 1224 if not self.Match_Distance: 1225 # Dodge divide by zero error. 1226 return proximity and 1.0 or accuracy 1227 return accuracy + (proximity / float(self.Match_Distance))
1228 1229 # Highest score beyond which we give up. 1230 score_threshold = self.Match_Threshold 1231 # Is there a nearby exact match? (speedup) 1232 best_loc = text.find(pattern, loc) 1233 if best_loc != -1: 1234 score_threshold = min(match_bitapScore(0, best_loc), score_threshold) 1235 # What about in the other direction? (speedup) 1236 best_loc = text.rfind(pattern, loc + len(pattern)) 1237 if best_loc != -1: 1238 score_threshold = min(match_bitapScore(0, best_loc), score_threshold) 1239 1240 # Initialise the bit arrays. 1241 matchmask = 1 << (len(pattern) - 1) 1242 best_loc = -1 1243 1244 bin_max = len(pattern) + len(text) 1245 # Empty initialization added to appease pychecker. 1246 last_rd = None 1247 for d in xrange(len(pattern)): 1248 # Scan for the best match each iteration allows for one more error. 1249 # Run a binary search to determine how far from 'loc' we can stray at 1250 # this error level. 1251 bin_min = 0 1252 bin_mid = bin_max 1253 while bin_min < bin_mid: 1254 if match_bitapScore(d, loc + bin_mid) <= score_threshold: 1255 bin_min = bin_mid 1256 else: 1257 bin_max = bin_mid 1258 bin_mid = (bin_max - bin_min) / 2 + bin_min 1259 1260 # Use the result from this iteration as the maximum for the next. 1261 bin_max = bin_mid 1262 start = max(1, loc - bin_mid + 1) 1263 finish = min(loc + bin_mid, len(text)) + len(pattern) 1264 1265 rd = range(finish + 1) 1266 rd.append((1 << d) - 1) 1267 for j in xrange(finish, start - 1, -1): 1268 if len(text) <= j - 1: 1269 # Out of range. 1270 charMatch = 0 1271 else: 1272 charMatch = s.get(text[j - 1], 0) 1273 if d == 0: # First pass: exact match. 1274 rd[j] = ((rd[j + 1] << 1) | 1) & charMatch 1275 else: # Subsequent passes: fuzzy match. 1276 rd[j] = ((rd[j + 1] << 1) | 1) & charMatch | ( 1277 ((last_rd[j + 1] | last_rd[j]) << 1) | 1) | last_rd[j + 1] 1278 if rd[j] & matchmask: 1279 score = match_bitapScore(d, j - 1) 1280 # This match will almost certainly be better than any existing match. 1281 # But check anyway. 1282 if score <= score_threshold: 1283 # Told you so. 1284 score_threshold = score 1285 best_loc = j - 1 1286 if best_loc > loc: 1287 # When passing loc, don't exceed our current distance from loc. 1288 start = max(1, 2 * loc - best_loc) 1289 else: 1290 # Already passed loc, downhill from here on in. 1291 break 1292 # No hope for a (better) match at greater error levels. 1293 if match_bitapScore(d + 1, loc) > score_threshold: 1294 break 1295 last_rd = rd 1296 return best_loc 1297
1298 - def match_alphabet(self, pattern):
1299 """Initialise the alphabet for the Bitap algorithm. 1300 1301 Args: 1302 pattern: The text to encode. 1303 1304 Returns: 1305 Hash of character locations. 1306 """ 1307 s = {} 1308 for char in pattern: 1309 s[char] = 0 1310 for i in xrange(len(pattern)): 1311 s[pattern[i]] |= 1 << (len(pattern) - i - 1) 1312 return s
1313 1314 # PATCH FUNCTIONS 1315
1316 - def patch_addContext(self, patch, text):
1317 """Increase the context until it is unique, 1318 but don't let the pattern expand beyond Match_MaxBits. 1319 1320 Args: 1321 patch: The patch to grow. 1322 text: Source text. 1323 """ 1324 if len(text) == 0: 1325 return 1326 pattern = text[patch.start2 : patch.start2 + patch.length1] 1327 padding = 0 1328 1329 # Look for the first and last matches of pattern in text. If two different 1330 # matches are found, increase the pattern length. 1331 while (text.find(pattern) != text.rfind(pattern) and (self.Match_MaxBits == 1332 0 or len(pattern) < self.Match_MaxBits - self.Patch_Margin - 1333 self.Patch_Margin)): 1334 padding += self.Patch_Margin 1335 pattern = text[max(0, patch.start2 - padding) : 1336 patch.start2 + patch.length1 + padding] 1337 # Add one chunk for good luck. 1338 padding += self.Patch_Margin 1339 1340 # Add the prefix. 1341 prefix = text[max(0, patch.start2 - padding) : patch.start2] 1342 if prefix: 1343 patch.diffs[:0] = [(self.DIFF_EQUAL, prefix)] 1344 # Add the suffix. 1345 suffix = text[patch.start2 + patch.length1 : 1346 patch.start2 + patch.length1 + padding] 1347 if suffix: 1348 patch.diffs.append((self.DIFF_EQUAL, suffix)) 1349 1350 # Roll back the start points. 1351 patch.start1 -= len(prefix) 1352 patch.start2 -= len(prefix) 1353 # Extend lengths. 1354 patch.length1 += len(prefix) + len(suffix) 1355 patch.length2 += len(prefix) + len(suffix)
1356
1357 - def patch_make(self, a, b=None, c=None):
1358 """Compute a list of patches to turn text1 into text2. 1359 Use diffs if provided, otherwise compute it ourselves. 1360 There are four ways to call this function, depending on what data is 1361 available to the caller: 1362 Method 1: 1363 a = text1, b = text2 1364 Method 2: 1365 a = diffs 1366 Method 3 (optimal): 1367 a = text1, b = diffs 1368 Method 4 (deprecated, use method 3): 1369 a = text1, b = text2, c = diffs 1370 1371 Args: 1372 a: text1 (methods 1,3,4) or Array of diff tuples for text1 to 1373 text2 (method 2). 1374 b: text2 (methods 1,4) or Array of diff tuples for text1 to 1375 text2 (method 3) or undefined (method 2). 1376 c: Array of diff tuples for text1 to text2 (method 4) or 1377 undefined (methods 1,2,3). 1378 1379 Returns: 1380 Array of patch objects. 1381 """ 1382 text1 = None 1383 diffs = None 1384 # Note that texts may arrive as 'str' or 'unicode'. 1385 if isinstance(a, basestring) and isinstance(b, basestring) and c is None: 1386 # Method 1: text1, text2 1387 # Compute diffs from text1 and text2. 1388 text1 = a 1389 diffs = self.diff_main(text1, b, True) 1390 if len(diffs) > 2: 1391 self.diff_cleanupSemantic(diffs) 1392 self.diff_cleanupEfficiency(diffs) 1393 elif isinstance(a, list) and b is None and c is None: 1394 # Method 2: diffs 1395 # Compute text1 from diffs. 1396 diffs = a 1397 text1 = self.diff_text1(diffs) 1398 elif isinstance(a, basestring) and isinstance(b, list) and c is None: 1399 # Method 3: text1, diffs 1400 text1 = a 1401 diffs = b 1402 elif (isinstance(a, basestring) and isinstance(b, basestring) and 1403 isinstance(c, list)): 1404 # Method 4: text1, text2, diffs 1405 # text2 is not used. 1406 text1 = a 1407 diffs = c 1408 else: 1409 raise ValueError("Unknown call format to patch_make.") 1410 1411 if not diffs: 1412 return [] # Get rid of the None case. 1413 patches = [] 1414 patch = patch_obj() 1415 char_count1 = 0 # Number of characters into the text1 string. 1416 char_count2 = 0 # Number of characters into the text2 string. 1417 prepatch_text = text1 # Recreate the patches to determine context info. 1418 postpatch_text = text1 1419 for x in xrange(len(diffs)): 1420 (diff_type, diff_text) = diffs[x] 1421 if len(patch.diffs) == 0 and diff_type != self.DIFF_EQUAL: 1422 # A new patch starts here. 1423 patch.start1 = char_count1 1424 patch.start2 = char_count2 1425 if diff_type == self.DIFF_INSERT: 1426 # Insertion 1427 patch.diffs.append(diffs[x]) 1428 patch.length2 += len(diff_text) 1429 postpatch_text = (postpatch_text[:char_count2] + diff_text + 1430 postpatch_text[char_count2:]) 1431 elif diff_type == self.DIFF_DELETE: 1432 # Deletion. 1433 patch.length1 += len(diff_text) 1434 patch.diffs.append(diffs[x]) 1435 postpatch_text = (postpatch_text[:char_count2] + 1436 postpatch_text[char_count2 + len(diff_text):]) 1437 elif (diff_type == self.DIFF_EQUAL and 1438 len(diff_text) <= 2 * self.Patch_Margin and 1439 len(patch.diffs) != 0 and len(diffs) != x + 1): 1440 # Small equality inside a patch. 1441 patch.diffs.append(diffs[x]) 1442 patch.length1 += len(diff_text) 1443 patch.length2 += len(diff_text) 1444 1445 if (diff_type == self.DIFF_EQUAL and 1446 len(diff_text) >= 2 * self.Patch_Margin): 1447 # Time for a new patch. 1448 if len(patch.diffs) != 0: 1449 self.patch_addContext(patch, prepatch_text) 1450 patches.append(patch) 1451 patch = patch_obj() 1452 # Unlike Unidiff, our patch lists have a rolling context. 1453 # http://code.google.com/p/google-diff-match-patch/wiki/Unidiff 1454 # Update prepatch text & pos to reflect the application of the 1455 # just completed patch. 1456 prepatch_text = postpatch_text 1457 char_count1 = char_count2 1458 1459 # Update the current character count. 1460 if diff_type != self.DIFF_INSERT: 1461 char_count1 += len(diff_text) 1462 if diff_type != self.DIFF_DELETE: 1463 char_count2 += len(diff_text) 1464 1465 # Pick up the leftover patch if not empty. 1466 if len(patch.diffs) != 0: 1467 self.patch_addContext(patch, prepatch_text) 1468 patches.append(patch) 1469 return patches
1470
1471 - def patch_deepCopy(self, patches):
1472 """Given an array of patches, return another array that is identical. 1473 1474 Args: 1475 patches: Array of patch objects. 1476 1477 Returns: 1478 Array of patch objects. 1479 """ 1480 patchesCopy = [] 1481 for patch in patches: 1482 patchCopy = patch_obj() 1483 # No need to deep copy the tuples since they are immutable. 1484 patchCopy.diffs = patch.diffs[:] 1485 patchCopy.start1 = patch.start1 1486 patchCopy.start2 = patch.start2 1487 patchCopy.length1 = patch.length1 1488 patchCopy.length2 = patch.length2 1489 patchesCopy.append(patchCopy) 1490 return patchesCopy
1491
1492 - def patch_apply(self, patches, text):
1493 """Merge a set of patches onto the text. Return a patched text, as well 1494 as a list of true/false values indicating which patches were applied. 1495 1496 Args: 1497 patches: Array of patch objects. 1498 text: Old text. 1499 1500 Returns: 1501 Two element Array, containing the new text and an array of boolean values. 1502 """ 1503 if not patches: 1504 return (text, []) 1505 1506 # Deep copy the patches so that no changes are made to originals. 1507 patches = self.patch_deepCopy(patches) 1508 1509 nullPadding = self.patch_addPadding(patches) 1510 text = nullPadding + text + nullPadding 1511 self.patch_splitMax(patches) 1512 1513 # delta keeps track of the offset between the expected and actual location 1514 # of the previous patch. If there are patches expected at positions 10 and 1515 # 20, but the first patch was found at 12, delta is 2 and the second patch 1516 # has an effective expected position of 22. 1517 delta = 0 1518 results = [] 1519 for patch in patches: 1520 expected_loc = patch.start2 + delta 1521 text1 = self.diff_text1(patch.diffs) 1522 end_loc = -1 1523 if len(text1) > self.Match_MaxBits: 1524 # patch_splitMax will only provide an oversized pattern in the case of 1525 # a monster delete. 1526 start_loc = self.match_main(text, text1[:self.Match_MaxBits], 1527 expected_loc) 1528 if start_loc != -1: 1529 end_loc = self.match_main(text, text1[-self.Match_MaxBits:], 1530 expected_loc + len(text1) - self.Match_MaxBits) 1531 if end_loc == -1 or start_loc >= end_loc: 1532 # Can't find valid trailing context. Drop this patch. 1533 start_loc = -1 1534 else: 1535 start_loc = self.match_main(text, text1, expected_loc) 1536 if start_loc == -1: 1537 # No match found. :( 1538 results.append(False) 1539 # Subtract the delta for this failed patch from subsequent patches. 1540 delta -= patch.length2 - patch.length1 1541 else: 1542 # Found a match. :) 1543 results.append(True) 1544 delta = start_loc - expected_loc 1545 if end_loc == -1: 1546 text2 = text[start_loc : start_loc + len(text1)] 1547 else: 1548 text2 = text[start_loc : end_loc + self.Match_MaxBits] 1549 if text1 == text2: 1550 # Perfect match, just shove the replacement text in. 1551 text = (text[:start_loc] + self.diff_text2(patch.diffs) + 1552 text[start_loc + len(text1):]) 1553 else: 1554 # Imperfect match. 1555 # Run a diff to get a framework of equivalent indices. 1556 diffs = self.diff_main(text1, text2, False) 1557 if (len(text1) > self.Match_MaxBits and 1558 self.diff_levenshtein(diffs) / float(len(text1)) > 1559 self.Patch_DeleteThreshold): 1560 # The end points match, but the content is unacceptably bad. 1561 results[-1] = False 1562 else: 1563 self.diff_cleanupSemanticLossless(diffs) 1564 index1 = 0 1565 for (op, data) in patch.diffs: 1566 if op != self.DIFF_EQUAL: 1567 index2 = self.diff_xIndex(diffs, index1) 1568 if op == self.DIFF_INSERT: # Insertion 1569 text = text[:start_loc + index2] + data + text[start_loc + 1570 index2:] 1571 elif op == self.DIFF_DELETE: # Deletion 1572 text = text[:start_loc + index2] + text[start_loc + 1573 self.diff_xIndex(diffs, index1 + len(data)):] 1574 if op != self.DIFF_DELETE: 1575 index1 += len(data) 1576 # Strip the padding off. 1577 text = text[len(nullPadding):-len(nullPadding)] 1578 return (text, results)
1579
1580 - def patch_addPadding(self, patches):
1581 """Add some padding on text start and end so that edges can match 1582 something. Intended to be called only from within patch_apply. 1583 1584 Args: 1585 patches: Array of patch objects. 1586 1587 Returns: 1588 The padding string added to each side. 1589 """ 1590 paddingLength = self.Patch_Margin 1591 nullPadding = "" 1592 for x in xrange(1, paddingLength + 1): 1593 nullPadding += chr(x) 1594 1595 # Bump all the patches forward. 1596 for patch in patches: 1597 patch.start1 += paddingLength 1598 patch.start2 += paddingLength 1599 1600 # Add some padding on start of first diff. 1601 patch = patches[0] 1602 diffs = patch.diffs 1603 if not diffs or diffs[0][0] != self.DIFF_EQUAL: 1604 # Add nullPadding equality. 1605 diffs.insert(0, (self.DIFF_EQUAL, nullPadding)) 1606 patch.start1 -= paddingLength # Should be 0. 1607 patch.start2 -= paddingLength # Should be 0. 1608 patch.length1 += paddingLength 1609 patch.length2 += paddingLength 1610 elif paddingLength > len(diffs[0][1]): 1611 # Grow first equality. 1612 extraLength = paddingLength - len(diffs[0][1]) 1613 newText = nullPadding[len(diffs[0][1]):] + diffs[0][1] 1614 diffs[0] = (diffs[0][0], newText) 1615 patch.start1 -= extraLength 1616 patch.start2 -= extraLength 1617 patch.length1 += extraLength 1618 patch.length2 += extraLength 1619 1620 # Add some padding on end of last diff. 1621 patch = patches[-1] 1622 diffs = patch.diffs 1623 if not diffs or diffs[-1][0] != self.DIFF_EQUAL: 1624 # Add nullPadding equality. 1625 diffs.append((self.DIFF_EQUAL, nullPadding)) 1626 patch.length1 += paddingLength 1627 patch.length2 += paddingLength 1628 elif paddingLength > len(diffs[-1][1]): 1629 # Grow last equality. 1630 extraLength = paddingLength - len(diffs[-1][1]) 1631 newText = diffs[-1][1] + nullPadding[:extraLength] 1632 diffs[-1] = (diffs[-1][0], newText) 1633 patch.length1 += extraLength 1634 patch.length2 += extraLength 1635 1636 return nullPadding
1637
1638 - def patch_splitMax(self, patches):
1639 """Look through the patches and break up any which are longer than the 1640 maximum limit of the match algorithm. 1641 1642 Args: 1643 patches: Array of patch objects. 1644 """ 1645 if self.Match_MaxBits == 0: 1646 return 1647 for x in xrange(len(patches)): 1648 if patches[x].length1 > self.Match_MaxBits: 1649 bigpatch = patches[x] 1650 # Remove the big old patch. 1651 del patches[x] 1652 x -= 1 1653 patch_size = self.Match_MaxBits 1654 start1 = bigpatch.start1 1655 start2 = bigpatch.start2 1656 precontext = '' 1657 while len(bigpatch.diffs) != 0: 1658 # Create one of several smaller patches. 1659 patch = patch_obj() 1660 empty = True 1661 patch.start1 = start1 - len(precontext) 1662 patch.start2 = start2 - len(precontext) 1663 if precontext: 1664 patch.length1 = patch.length2 = len(precontext) 1665 patch.diffs.append((self.DIFF_EQUAL, precontext)) 1666 1667 while (len(bigpatch.diffs) != 0 and 1668 patch.length1 < patch_size - self.Patch_Margin): 1669 (diff_type, diff_text) = bigpatch.diffs[0] 1670 if diff_type == self.DIFF_INSERT: 1671 # Insertions are harmless. 1672 patch.length2 += len(diff_text) 1673 start2 += len(diff_text) 1674 patch.diffs.append(bigpatch.diffs.pop(0)) 1675 empty = False 1676 elif (diff_type == self.DIFF_DELETE and len(patch.diffs) == 1 and 1677 patch.diffs[0][0] == self.DIFF_EQUAL and 1678 len(diff_text) > 2 * patch_size): 1679 # This is a large deletion. Let it pass in one chunk. 1680 patch.length1 += len(diff_text) 1681 start1 += len(diff_text) 1682 empty = False 1683 patch.diffs.append((diff_type, diff_text)) 1684 del bigpatch.diffs[0] 1685 else: 1686 # Deletion or equality. Only take as much as we can stomach. 1687 diff_text = diff_text[:patch_size - patch.length1 - 1688 self.Patch_Margin] 1689 patch.length1 += len(diff_text) 1690 start1 += len(diff_text) 1691 if diff_type == self.DIFF_EQUAL: 1692 patch.length2 += len(diff_text) 1693 start2 += len(diff_text) 1694 else: 1695 empty = False 1696 1697 patch.diffs.append((diff_type, diff_text)) 1698 if diff_text == bigpatch.diffs[0][1]: 1699 del bigpatch.diffs[0] 1700 else: 1701 bigpatch.diffs[0] = (bigpatch.diffs[0][0], 1702 bigpatch.diffs[0][1][len(diff_text):]) 1703 1704 # Compute the head context for the next patch. 1705 precontext = self.diff_text2(patch.diffs) 1706 precontext = precontext[-self.Patch_Margin:] 1707 # Append the end context for this patch. 1708 postcontext = self.diff_text1(bigpatch.diffs)[:self.Patch_Margin] 1709 if postcontext: 1710 patch.length1 += len(postcontext) 1711 patch.length2 += len(postcontext) 1712 if len(patch.diffs) != 0 and patch.diffs[-1][0] == self.DIFF_EQUAL: 1713 patch.diffs[-1] = (self.DIFF_EQUAL, patch.diffs[-1][1] + 1714 postcontext) 1715 else: 1716 patch.diffs.append((self.DIFF_EQUAL, postcontext)) 1717 1718 if not empty: 1719 x += 1 1720 patches.insert(x, patch)
1721
1722 - def patch_toText(self, patches):
1723 """Take a list of patches and return a textual representation. 1724 1725 Args: 1726 patches: Array of patch objects. 1727 1728 Returns: 1729 Text representation of patches. 1730 """ 1731 text = [] 1732 for patch in patches: 1733 text.append(str(patch)) 1734 return "".join(text)
1735
1736 - def patch_fromText(self, textline):
1737 """Parse a textual representation of patches and return a list of patch 1738 objects. 1739 1740 Args: 1741 textline: Text representation of patches. 1742 1743 Returns: 1744 Array of patch objects. 1745 1746 Raises: 1747 ValueError: If invalid input. 1748 """ 1749 if type(textline) == unicode: 1750 # Patches should be composed of a subset of ascii chars, Unicode not 1751 # required. If this encode raises UnicodeEncodeError, patch is invalid. 1752 textline = textline.encode("ascii") 1753 patches = [] 1754 if not textline: 1755 return patches 1756 text = textline.split('\n') 1757 while len(text) != 0: 1758 m = re.match("^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$", text[0]) 1759 if not m: 1760 raise ValueError("Invalid patch string: " + text[0]) 1761 patch = patch_obj() 1762 patches.append(patch) 1763 patch.start1 = int(m.group(1)) 1764 if m.group(2) == '': 1765 patch.start1 -= 1 1766 patch.length1 = 1 1767 elif m.group(2) == '0': 1768 patch.length1 = 0 1769 else: 1770 patch.start1 -= 1 1771 patch.length1 = int(m.group(2)) 1772 1773 patch.start2 = int(m.group(3)) 1774 if m.group(4) == '': 1775 patch.start2 -= 1 1776 patch.length2 = 1 1777 elif m.group(4) == '0': 1778 patch.length2 = 0 1779 else: 1780 patch.start2 -= 1 1781 patch.length2 = int(m.group(4)) 1782 1783 del text[0] 1784 1785 while len(text) != 0: 1786 if text[0]: 1787 sign = text[0][0] 1788 else: 1789 sign = '' 1790 line = urllib.unquote(text[0][1:]) 1791 line = line.decode("utf-8") 1792 if sign == '+': 1793 # Insertion. 1794 patch.diffs.append((self.DIFF_INSERT, line)) 1795 elif sign == '-': 1796 # Deletion. 1797 patch.diffs.append((self.DIFF_DELETE, line)) 1798 elif sign == ' ': 1799 # Minor equality. 1800 patch.diffs.append((self.DIFF_EQUAL, line)) 1801 elif sign == '@': 1802 # Start of next patch. 1803 break 1804 elif sign == '': 1805 # Blank line? Whatever. 1806 pass 1807 else: 1808 # WTF? 1809 raise ValueError("Invalid patch mode: '%s'\n%s" % (sign, line)) 1810 del text[0] 1811 return patches
1812 1813
1814 -class patch_obj:
1815 """Class representing one patch operation. 1816 """ 1817
1818 - def __init__(self):
1819 """Initializes with an empty list of diffs. 1820 """ 1821 self.diffs = [] 1822 self.start1 = None 1823 self.start2 = None 1824 self.length1 = 0 1825 self.length2 = 0
1826
1827 - def __str__(self):
1828 """Emmulate GNU diff's format. 1829 Header: @@ -382,8 +481,9 @@ 1830 Indicies are printed as 1-based, not 0-based. 1831 1832 Returns: 1833 The GNU diff string. 1834 """ 1835 if self.length1 == 0: 1836 coords1 = str(self.start1) + ",0" 1837 elif self.length1 == 1: 1838 coords1 = str(self.start1 + 1) 1839 else: 1840 coords1 = str(self.start1 + 1) + "," + str(self.length1) 1841 if self.length2 == 0: 1842 coords2 = str(self.start2) + ",0" 1843 elif self.length2 == 1: 1844 coords2 = str(self.start2 + 1) 1845 else: 1846 coords2 = str(self.start2 + 1) + "," + str(self.length2) 1847 text = ["@@ -", coords1, " +", coords2, " @@\n"] 1848 # Escape the body of the patch with %xx notation. 1849 for (op, data) in self.diffs: 1850 if op == diff_match_patch.DIFF_INSERT: 1851 text.append("+") 1852 elif op == diff_match_patch.DIFF_DELETE: 1853 text.append("-") 1854 elif op == diff_match_patch.DIFF_EQUAL: 1855 text.append(" ") 1856 # High ascii will raise UnicodeDecodeError. Use Unicode instead. 1857 data = data.encode("utf-8") 1858 text.append(urllib.quote(data, "!~*'();/?:@&=+$,# ") + "\n") 1859 return "".join(text)
1860