Browse Source

diff_tree: Consider dissimilar modifies to be delete/add pairs.

Modifies are split into delete/add pairs if their similarity is below
a given threshold, and are rejoined at the end of the rename detection
process.

Change-Id: I0f11549cc1117be7b8bb67cb584e4101e72a4913
Signed-off-by: Jelmer Vernooij <jelmer@samba.org>
Dave Borowitz 14 years ago
parent
commit
8599f045fd
2 changed files with 101 additions and 3 deletions
  1. 44 2
      dulwich/diff_tree.py
  2. 57 1
      dulwich/tests/test_diff_tree.py

+ 44 - 2
dulwich/diff_tree.py

@@ -44,6 +44,7 @@ _NULL_ENTRY = TreeEntry(None, None, None)
 _MAX_SCORE = 100
 _MAX_SCORE = 100
 _RENAME_THRESHOLD = 60
 _RENAME_THRESHOLD = 60
 _MAX_FILES = 200
 _MAX_FILES = 200
+_REWRITE_THRESHOLD = None
 
 
 
 
 class TreeChange(TreeChangeTuple):
 class TreeChange(TreeChangeTuple):
@@ -268,7 +269,8 @@ class RenameDetector(object):
     """Object for handling rename detection between two trees."""
     """Object for handling rename detection between two trees."""
 
 
     def __init__(self, store, tree1_id, tree2_id,
     def __init__(self, store, tree1_id, tree2_id,
-                 rename_threshold=_RENAME_THRESHOLD, max_files=_MAX_FILES):
+                 rename_threshold=_RENAME_THRESHOLD, max_files=_MAX_FILES,
+                 rewrite_threshold=_REWRITE_THRESHOLD):
         """Initialize the rename detector.
         """Initialize the rename detector.
 
 
         :param store: An ObjectStore for looking up objects.
         :param store: An ObjectStore for looking up objects.
@@ -281,23 +283,38 @@ class RenameDetector(object):
             than max_files ** 2 add/delete pairs. This limit is provided because
             than max_files ** 2 add/delete pairs. This limit is provided because
             rename detection can be quadratic in the project size. If the limit
             rename detection can be quadratic in the project size. If the limit
             is exceeded, no content rename detection is attempted.
             is exceeded, no content rename detection is attempted.
+        :param rewrite_threshold: The threshold similarity score below which a
+            modify should be considered a delete/add, or None to not break
+            modifies; see _similarity_score.
         """
         """
         self._tree1_id = tree1_id
         self._tree1_id = tree1_id
         self._tree2_id = tree2_id
         self._tree2_id = tree2_id
         self._store = store
         self._store = store
         self._rename_threshold = rename_threshold
         self._rename_threshold = rename_threshold
+        self._rewrite_threshold = rewrite_threshold
         self._max_files = max_files
         self._max_files = max_files
 
 
         self._adds = []
         self._adds = []
         self._deletes = []
         self._deletes = []
         self._changes = []
         self._changes = []
 
 
+    def _should_split(self, change):
+        if (self._rewrite_threshold is None or change.type != CHANGE_MODIFY or
+            change.old.sha == change.new.sha):
+            return False
+        old_obj = self._store[change.old.sha]
+        new_obj = self._store[change.new.sha]
+        return _similarity_score(old_obj, new_obj) < self._rewrite_threshold
+
     def _collect_changes(self):
     def _collect_changes(self):
         for change in tree_changes(self._store, self._tree1_id, self._tree2_id):
         for change in tree_changes(self._store, self._tree1_id, self._tree2_id):
             if change.type == CHANGE_ADD:
             if change.type == CHANGE_ADD:
                 self._adds.append(change)
                 self._adds.append(change)
             elif change.type == CHANGE_DELETE:
             elif change.type == CHANGE_DELETE:
                 self._deletes.append(change)
                 self._deletes.append(change)
+            elif self._should_split(change):
+                self._deletes.append(TreeChange.delete(change.old))
+                self._adds.append(TreeChange.add(change.new))
             else:
             else:
                 self._changes.append(change)
                 self._changes.append(change)
 
 
@@ -344,6 +361,7 @@ class RenameDetector(object):
         if len(self._adds) * len(self._deletes) > self._max_files ** 2:
         if len(self._adds) * len(self._deletes) > self._max_files ** 2:
             return
             return
 
 
+        check_paths = self._rename_threshold is not None
         candidates = []
         candidates = []
         for delete in self._deletes:
         for delete in self._deletes:
             if S_ISGITLINK(delete.old.mode):
             if S_ISGITLINK(delete.old.mode):
@@ -358,7 +376,13 @@ class RenameDetector(object):
                 score = _similarity_score(old_obj, new_obj,
                 score = _similarity_score(old_obj, new_obj,
                                           block_cache={old_sha: old_blocks})
                                           block_cache={old_sha: old_blocks})
                 if score > self._rename_threshold:
                 if score > self._rename_threshold:
-                    rename = TreeChange(CHANGE_RENAME, delete.old, add.new)
+                    if check_paths and delete.old.path == add.new.path:
+                        # If the paths match, this must be a split modify, so
+                        # make sure it comes out as a modify.
+                        new_type = CHANGE_MODIFY
+                    else:
+                        new_type = CHANGE_RENAME
+                    rename = TreeChange(new_type, delete.old, add.new)
                     candidates.append((-score, rename))
                     candidates.append((-score, rename))
 
 
         # Sort scores from highest to lowest, but keep names in ascending order.
         # Sort scores from highest to lowest, but keep names in ascending order.
@@ -378,6 +402,23 @@ class RenameDetector(object):
             self._changes.append(change)
             self._changes.append(change)
         self._prune(add_paths, delete_paths)
         self._prune(add_paths, delete_paths)
 
 
+    def _join_modifies(self):
+        if self._rewrite_threshold is None:
+            return
+
+        modifies = {}
+        delete_map = dict((d.old.path, d) for d in self._deletes)
+        for add in self._adds:
+            path = add.new.path
+            delete = delete_map.get(path)
+            if (delete is not None and
+              stat.S_IFMT(delete.old.mode) == stat.S_IFMT(add.new.mode)):
+                modifies[path] = TreeChange(CHANGE_MODIFY, delete.old, add.new)
+
+        self._adds = [a for a in self._adds if a.new.path not in modifies]
+        self._deletes = [a for a in self._deletes if a.new.path not in modifies]
+        self._changes += modifies.values()
+
     def _sorted_changes(self):
     def _sorted_changes(self):
         result = []
         result = []
         result.extend(self._adds)
         result.extend(self._adds)
@@ -391,6 +432,7 @@ class RenameDetector(object):
         self._collect_changes()
         self._collect_changes()
         self._find_exact_renames()
         self._find_exact_renames()
         self._find_content_renames()
         self._find_content_renames()
+        self._join_modifies()
         return self._sorted_changes()
         return self._sorted_changes()
 
 
 
 

+ 57 - 1
dulwich/tests/test_diff_tree.py

@@ -376,6 +376,16 @@ class RenameDetectionTest(DiffTestCase):
         detector = RenameDetector(self.store, tree1.id, tree2.id, **kwargs)
         detector = RenameDetector(self.store, tree1.id, tree2.id, **kwargs)
         return detector.changes_with_renames()
         return detector.changes_with_renames()
 
 
+    def test_no_renames(self):
+        blob1 = make_object(Blob, data='a\nb\nc\nd\n')
+        blob2 = make_object(Blob, data='a\nb\ne\nf\n')
+        blob3 = make_object(Blob, data='a\nb\ng\nh\n')
+        tree1 = self.commit_tree([('a', blob1), ('b', blob2)])
+        tree2 = self.commit_tree([('a', blob1), ('b', blob3)])
+        self.assertEqual(
+          [TreeChange(CHANGE_MODIFY, ('b', F, blob2.id), ('b', F, blob3.id))],
+          self.detect_renames(tree1, tree2))
+
     def test_exact_rename_one_to_one(self):
     def test_exact_rename_one_to_one(self):
         blob1 = make_object(Blob, data='1')
         blob1 = make_object(Blob, data='1')
         blob2 = make_object(Blob, data='2')
         blob2 = make_object(Blob, data='2')
@@ -433,7 +443,7 @@ class RenameDetectionTest(DiffTestCase):
            TreeChange(CHANGE_RENAME, ('b', F, blob.id), ('d', F, blob.id))],
            TreeChange(CHANGE_RENAME, ('b', F, blob.id), ('d', F, blob.id))],
           self.detect_renames(tree1, tree2))
           self.detect_renames(tree1, tree2))
 
 
-    def test_content_rename_threshold(self):
+    def test_rename_threshold(self):
         blob1 = make_object(Blob, data='a\nb\nc\n')
         blob1 = make_object(Blob, data='a\nb\nc\n')
         blob2 = make_object(Blob, data='a\nb\nd\n')
         blob2 = make_object(Blob, data='a\nb\nd\n')
         tree1 = self.commit_tree([('a', blob1)])
         tree1 = self.commit_tree([('a', blob1)])
@@ -545,3 +555,49 @@ class RenameDetectionTest(DiffTestCase):
            TreeChange.add(('c', 0100644, blob2.id)),
            TreeChange.add(('c', 0100644, blob2.id)),
            TreeChange.add(('d', 0160000, link2))],
            TreeChange.add(('d', 0160000, link2))],
           self.detect_renames(tree1, tree2))
           self.detect_renames(tree1, tree2))
+
+    def test_exact_rename_swap(self):
+        blob1 = make_object(Blob, data='1')
+        blob2 = make_object(Blob, data='2')
+        tree1 = self.commit_tree([('a', blob1), ('b', blob2)])
+        tree2 = self.commit_tree([('a', blob2), ('b', blob1)])
+        self.assertEqual(
+          [TreeChange(CHANGE_MODIFY, ('a', F, blob1.id), ('a', F, blob2.id)),
+           TreeChange(CHANGE_MODIFY, ('b', F, blob2.id), ('b', F, blob1.id))],
+          self.detect_renames(tree1, tree2))
+        self.assertEqual(
+          [TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('b', F, blob1.id)),
+           TreeChange(CHANGE_RENAME, ('b', F, blob2.id), ('a', F, blob2.id))],
+          self.detect_renames(tree1, tree2, rewrite_threshold=50))
+
+    def test_content_rename_swap(self):
+        blob1 = make_object(Blob, data='a\nb\nc\nd\n')
+        blob2 = make_object(Blob, data='e\nf\ng\nh\n')
+        blob3 = make_object(Blob, data='a\nb\nc\ne\n')
+        blob4 = make_object(Blob, data='e\nf\ng\ni\n')
+        tree1 = self.commit_tree([('a', blob1), ('b', blob2)])
+        tree2 = self.commit_tree([('a', blob4), ('b', blob3)])
+        self.assertEqual(
+          [TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('b', F, blob3.id)),
+           TreeChange(CHANGE_RENAME, ('b', F, blob2.id), ('a', F, blob4.id))],
+          self.detect_renames(tree1, tree2, rewrite_threshold=60))
+
+    def test_rewrite_threshold(self):
+        blob1 = make_object(Blob, data='a\nb\nc\nd\n')
+        blob2 = make_object(Blob, data='a\nb\nc\ne\n')
+        blob3 = make_object(Blob, data='a\nb\nf\ng\n')
+
+        tree1 = self.commit_tree([('a', blob1)])
+        tree2 = self.commit_tree([('a', blob3), ('b', blob2)])
+
+        no_renames = [
+          TreeChange(CHANGE_MODIFY, ('a', F, blob1.id), ('a', F, blob3.id)),
+          TreeChange.add(('b', F, blob2.id))]
+        self.assertEqual(
+          no_renames, self.detect_renames(tree1, tree2))
+        self.assertEqual(
+          no_renames, self.detect_renames(tree1, tree2, rewrite_threshold=40))
+        self.assertEqual(
+          [TreeChange.add(('a', F, blob3.id)),
+           TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('b', F, blob2.id))],
+          self.detect_renames(tree1, tree2, rewrite_threshold=80))