Browse Source

Bug 562676: Push efficiency - report missing objects only.

Fix with tests.
Artem Tikhomirov 12 years ago
parent
commit
c1787f51f4
3 changed files with 309 additions and 20 deletions
  1. 112 20
      dulwich/object_store.py
  2. 1 0
      dulwich/tests/__init__.py
  3. 196 0
      dulwich/tests/test_missing_obj_finder.py

+ 112 - 20
dulwich/object_store.py

@@ -1,5 +1,6 @@
 # object_store.py -- Object store for git objects
-# Copyright (C) 2008-2009 Jelmer Vernooij <jelmer@samba.org>
+# Copyright (C) 2008-2012 Jelmer Vernooij <jelmer@samba.org>
+#                         and others
 #
 # This program is free software; you can redistribute it and/or
 # modify it under the terms of the GNU General Public License
@@ -220,6 +221,28 @@ class BaseObjectStore(object):
             obj = self[sha]
         return obj
 
+    def _collect_ancestors(self, heads, common = set()):
+        """Collect all ancestors of heads up to (excluding) those in common
+        :param heads: commits to start from
+        :param common: commits to end at, or empty set to walk repository completely
+        :return: a tuple (A, B) where A - all commits reachable
+            from heads but not present in common, B - common (shared) elements
+            that are directly reachable from heads
+        """
+        bases = set()
+        commits = set()
+        queue = []
+        queue.extend(heads)
+        while queue:
+            e = queue.pop(0)
+            if e in common:
+                bases.add(e)
+            elif e not in commits:
+                commits.add(e)
+                cmt = self[e]
+                queue.extend(cmt.parents)
+        return (commits, bases)
+
 
 class PackBasedObjectStore(BaseObjectStore):
 
@@ -792,6 +815,52 @@ def tree_lookup_path(lookup_obj, root_sha, path):
         raise NotTreeError(root_sha)
     return tree.lookup_path(lookup_obj, path)
 
+def _collect_filetree_revs(obj_store, tree_sha, kset):
+    """Collect SHA1s of files and directories for specified tree
+        (identified by SHA1)
+    :param obj_store: Object store to get objects by SHA from
+    :param tree_sha: tree reference to walk
+    :param kset: set to fill with references to files and directories
+    """
+    filetree = obj_store[tree_sha]
+    for name,mode,sha in filetree.iteritems():
+       if not S_ISGITLINK(mode) and sha not in kset:
+           kset.add(sha)
+           if stat.S_ISDIR(mode):
+               _collect_filetree_revs(obj_store, sha, kset)
+
+def _split_commits_and_tags(obj_store, lst, ignore_unknown = False):
+    """Split lst into two lists, one with commit SHA1s, another with
+        tag SHA1s. Commits referenced by tags are included into commits
+        list as well. Only SHA1s known in this repository will get
+        through, and unless ignore_unknown argument is True, KeyError
+        is thrown for SHA1 missing in the repository
+    :param obj_store: Object store to get objects by SHA1 from
+    :param lst: Collection of commit and tag SHAs
+    :param ignore_unknown: True to skip SHA1 missing in the 
+        repository silently.
+    :return: A tuple of (commits, tags) SHA1s
+    """
+    commits = set()
+    tags = set()
+    for e in lst:
+        try:
+            o = obj_store[e]
+        except KeyError:
+            if ignore_unknown:
+                pass
+            else:
+                raise
+        else:
+            if isinstance(o, Commit):
+                commits.add(e)
+            elif isinstance(o, Tag):
+                tags.add(e)
+                commits.add(o.object[1])
+            else:
+                raise KeyError('Not a commit or a tag: %s' % e)
+    return (commits, tags)
+
 
 class MissingObjectFinder(object):
     """Find the objects missing from another object store.
@@ -808,11 +877,44 @@ class MissingObjectFinder(object):
 
     def __init__(self, object_store, haves, wants, progress=None,
                  get_tagged=None):
-        haves = set(haves)
-        self.sha_done = haves
-        self.objects_to_send = set([(w, None, False) for w in wants
-                                    if w not in haves])
         self.object_store = object_store
+        # process Commits and Tags differently
+        # Note, while haves may list commits/tags not available locally,
+        # and such SHAs would get filtered out by _split_commits_and_tags,
+        # wants shall list only known SHAs, and otherwise
+        # _split_commits_and_tags fails with KeyError
+        have_commits, have_tags = \
+                _split_commits_and_tags(object_store, haves, True)
+        want_commits, want_tags = \
+                _split_commits_and_tags(object_store, wants, False)
+        # all_ancestors is a set of commits that shall not be sent
+        # (complete repository up to 'haves')
+        all_ancestors = object_store._collect_ancestors(have_commits)[0]
+        # all_missing - complete set of commits between haves and wants
+        # common - commits from all_ancestors we hit into while
+        # traversing parent hierarchy of wants
+        missing_commits, common_commits = \
+            object_store._collect_ancestors(want_commits, all_ancestors)
+        self.sha_done = set()
+        # Now, fill sha_done with commits and revisions of
+        # files and directories known to be both locally
+        # and on target. Thus these commits and files
+        # won't get selected for fetch
+        for h in common_commits:
+            self.sha_done.add(h)
+            cmt = object_store[h]
+            _collect_filetree_revs(object_store, cmt.tree, self.sha_done)
+        # record tags we have as visited, too
+        for t in have_tags:
+            self.sha_done.add(t)
+
+        missing_tags = want_tags.difference(have_tags)
+        # in fact, what we 'want' is commits and tags
+        # we've found missing
+        wants = missing_commits.union(missing_tags)
+
+        self.objects_to_send = set([(w, None, False) for w in wants])
+
         if progress is None:
             self.progress = lambda x: None
         else:
@@ -823,18 +925,6 @@ class MissingObjectFinder(object):
         self.objects_to_send.update([e for e in entries
                                      if not e[0] in self.sha_done])
 
-    def parse_tree(self, tree):
-        self.add_todo([(sha, name, not stat.S_ISDIR(mode))
-                       for name, mode, sha in tree.iteritems()
-                       if not S_ISGITLINK(mode)])
-
-    def parse_commit(self, commit):
-        self.add_todo([(commit.tree, "", False)])
-        self.add_todo([(p, None, False) for p in commit.parents])
-
-    def parse_tag(self, tag):
-        self.add_todo([(tag.object[1], None, False)])
-
     def next(self):
         while True:
             if not self.objects_to_send:
@@ -845,11 +935,13 @@ class MissingObjectFinder(object):
         if not leaf:
             o = self.object_store[sha]
             if isinstance(o, Commit):
-                self.parse_commit(o)
+                self.add_todo([(o.tree, "", False)])
             elif isinstance(o, Tree):
-                self.parse_tree(o)
+                self.add_todo([(s, n, not stat.S_ISDIR(m))
+                               for n, m, s in o.iteritems()
+                               if not S_ISGITLINK(m)])
             elif isinstance(o, Tag):
-                self.parse_tag(o)
+                self.add_todo([(o.object[1], None, False)])
         if sha in self._tagged:
             self.add_todo([(self._tagged[sha], None, True)])
         self.sha_done.add(sha)

+ 1 - 0
dulwich/tests/__init__.py

@@ -121,6 +121,7 @@ def self_test_suite():
         'lru_cache',
         'objects',
         'object_store',
+        'missing_obj_finder',
         'pack',
         'patch',
         'protocol',

+ 196 - 0
dulwich/tests/test_missing_obj_finder.py

@@ -0,0 +1,196 @@
+# test_missing_obj_finder.py -- tests for MissingObjectFinder
+# Copyright (C) 2012 syntevo GmbH
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; version 2
+# or (at your option) any later version of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+# MA  02110-1301, USA.
+
+from dulwich.errors import (
+    MissingCommitError,
+    )
+from dulwich.object_store import (
+    MemoryObjectStore,
+    )
+from dulwich.objects import (
+    Commit,
+    Blob,
+    )
+from dulwich.tests import TestCase
+from utils import (
+    F,
+    make_object,
+    build_commit_graph,
+    )
+
+class MissingObjectFinderTest(TestCase):
+
+    def setUp(self):
+        super(MissingObjectFinderTest, self).setUp()
+        self.store = MemoryObjectStore()
+        self.commits = []
+
+    def __getitem__(self, n):
+        # rename for brevity
+        return self.commits[n-1]
+
+    def cmt(self, n):
+        return self[n]
+
+    def assertMissingMatch(self, haves, wants, expected):
+        for sha,path in self.store.find_missing_objects(haves, wants):
+            self.assertTrue(sha in expected, "FAILURE: (%s,%s) erroneously reported as missing" % (sha,path))
+            expected.remove(sha)
+
+        self.assertFalse(len(expected) > 0, "FAILURE: some objects are not reported as missing: %s" % (expected))
+
+class MOFLinearRepoTest(MissingObjectFinderTest):
+    def setUp(self):
+        super(MOFLinearRepoTest, self).setUp()
+        f1_1 = make_object(Blob, data='f1') # present in 1, removed in 3
+        f2_1 = make_object(Blob, data='f2') # present in all revisions, changed in 2 and 3
+        f2_2 = make_object(Blob, data='f2-changed')
+        f2_3 = make_object(Blob, data='f2-changed-again')
+        f3_2 = make_object(Blob, data='f3') # added in 2, left unmodified in 3
+
+        commit_spec = [[1], [2,1], [3,2]]
+        trees = {1: [('f1', f1_1), ('f2',f2_1)],
+                2: [('f1',f1_1), ('f2', f2_2), ('f3', f3_2)],
+                3: [('f2', f2_3), ('f3', f3_2)] }
+        # commit 1: f1 and f2
+        # commit 2: f3 added, f2 changed. Missing shall report commit id and a tree referenced by commit
+        # commit 3: f1 removed, f2 changed. Commit sha and root tree sha shall be reported as modified
+        self.commits = build_commit_graph(self.store, commit_spec, trees)
+        self.missing_1_2 = [self.cmt(2).id, self.cmt(2).tree, f2_2.id, f3_2.id]
+        self.missing_2_3 = [self.cmt(3).id, self.cmt(3).tree, f2_3.id]
+        self.missing_1_3 = [
+            self.cmt(2).id, self.cmt(3).id,
+            self.cmt(2).tree, self.cmt(3).tree,
+            f2_2.id, f3_2.id, f2_3.id]
+
+
+    def test_1_to_2(self):
+        self.assertMissingMatch([self.cmt(1).id], [self.cmt(2).id], self.missing_1_2)
+
+    def test_2_to_3(self):
+        self.assertMissingMatch([self.cmt(2).id], [self.cmt(3).id], self.missing_2_3)
+
+    def test_1_to_3(self):
+        self.assertMissingMatch([self.cmt(1).id], [self.cmt(3).id], self.missing_1_3)
+
+    def test_bogus_haves_failure(self):
+        """Ensure non-existent SHA in haves are not tolerated"""
+        bogus_sha = self.cmt(2).id[::-1]
+        haves = [self.cmt(1).id, bogus_sha]
+        wants = [self.cmt(3).id]
+        self.assertRaises(KeyError, self.store.find_missing_objects, self.store, haves, wants)
+
+    def test_bogus_wants_failure(self):
+        """Ensure non-existent SHA in wants are not tolerated"""
+        bogus_sha = self.cmt(2).id[::-1]
+        haves = [self.cmt(1).id]
+        wants = [self.cmt(3).id, bogus_sha]
+        self.assertRaises(KeyError, self.store.find_missing_objects, self.store, haves, wants)
+
+    def test_no_changes(self):
+        self.assertMissingMatch([self.cmt(3).id], [self.cmt(3).id], [])
+
+class MOFMergeForkRepoTest(MissingObjectFinderTest):
+    """ 1 --- 2 --- 4 --- 6 --- 7
+               \        /
+                3  ---
+                 \
+                  5
+    """
+
+    def setUp(self):
+        super(MOFMergeForkRepoTest, self).setUp()
+        f1_1 = make_object(Blob, data='f1')
+        f1_2 = make_object(Blob, data='f1-2')
+        f1_4 = make_object(Blob, data='f1-4')
+        f1_7 = make_object(Blob, data='f1-2') # same data as in rev 2
+        f2_1 = make_object(Blob, data='f2')
+        f2_3 = make_object(Blob, data='f2-3')
+        f3_3 = make_object(Blob, data='f3')
+        f3_5 = make_object(Blob, data='f3-5')
+        commit_spec = [[1], [2,1], [3,2], [4,2], [5,3], [6,3,4], [7,6]]
+        trees = {1: [('f1', f1_1), ('f2',f2_1)],
+                2: [('f1',f1_2), ('f2', f2_1)], # f1 changed
+                3: [('f1',f1_2), ('f2', f2_3), ('f3', f3_3)], # f3 added, f2 changed
+                4: [('f1',f1_4), ('f2',f2_1)],  # f1 changed
+                5: [('f1',f1_2), ('f3', f3_5)], # f2 removed, f3 changed
+                6: [('f1',f1_4), ('f2',f2_3), ('f3', f3_3)], # merged 3 and 4
+                7: [('f1',f1_7), ('f2',f2_3)]} # f1 changed to match rev2. f3 removed
+        self.commits = build_commit_graph(self.store, commit_spec, trees)
+
+        self.f1_2_id = f1_2.id
+        self.f1_4_id = f1_4.id
+        self.f1_7_id = f1_7.id
+        self.f2_3_id = f2_3.id
+        self.f3_3_id = f3_3.id
+
+        self.assertEquals(f1_2.id, f1_7.id, "[sanity]")
+
+    def test_have6_want7(self):
+        """
+          have 6, want 7. Ideally, shall not report f1_7 as it's the same as f1_2,
+          however, to do so, MissingObjectFinder shall not record trees of common commits
+          only, but also all parent trees and tree items, which is an overkill
+          (i.e. in sha_done it records f1_4 as known, and doesn't record f1_2 was known
+          prior to that, hence can't detect f1_7 is in fact f1_2 and shall not be reported)
+        """
+        self.assertMissingMatch([self.cmt(6).id], [self.cmt(7).id], [self.cmt(7).id, self.cmt(7).tree, self.f1_7_id])
+
+    def test_have4_want7(self):
+        """
+          have 4, want 7. Shall not include rev5 as it is not in the tree between 4 and 7
+          (well, it is, but its SHA's are irrelevant for 4..7 commit hierarchy)
+        """
+        self.assertMissingMatch([self.cmt(4).id], [self.cmt(7).id], [
+            self.cmt(7).id, self.cmt(6).id, self.cmt(3).id,
+            self.cmt(7).tree, self.cmt(6).tree, self.cmt(3).tree,
+            self.f2_3_id, self.f3_3_id])
+
+    def test_have1_want6(self):
+        """
+          have 1, want 6. Shall not include rev5
+        """
+        self.assertMissingMatch([self.cmt(1).id], [self.cmt(6).id], [
+            self.cmt(6).id, self.cmt(4).id, self.cmt(3).id, self.cmt(2).id,
+            self.cmt(6).tree, self.cmt(4).tree, self.cmt(3).tree, self.cmt(2).tree,
+            self.f1_2_id, self.f1_4_id, self.f2_3_id, self.f3_3_id])
+
+    def test_have3_want6(self):
+        """
+          have 3, want 7. Shall not report rev2 and its tree, because
+          haves(3) means has parents, i.e. rev2, too
+          BUT shall report any changes descending rev2 (excluding rev3)
+          Shall NOT report f1_7 as it's techically == f1_2
+        """
+        self.assertMissingMatch([self.cmt(3).id], [self.cmt(7).id], [
+              self.cmt(7).id, self.cmt(6).id, self.cmt(4).id,
+              self.cmt(7).tree, self.cmt(6).tree, self.cmt(4).tree,
+              self.f1_4_id])
+
+    def test_have5_want7(self):
+        """
+          have 5, want 7. Common parent is rev2, hence children of rev2 from
+          a descent line other than rev5 shall be reported
+        """
+        # expects f1_4 from rev6. f3_5 is known in rev5; 
+        # f1_7 shall be the same as f1_2 (known, too)
+        self.assertMissingMatch([self.cmt(5).id], [self.cmt(7).id], [
+              self.cmt(7).id, self.cmt(6).id, self.cmt(4).id,
+              self.cmt(7).tree, self.cmt(6).tree, self.cmt(4).tree,
+              self.f1_4_id]) 
+