Explorar o código

Simplify and fix thin pack resolving.

This change cuts down on passing around callbacks to resolve external
references by creating a ThinPackData class that has different semantics
for resolving ref deltas. In all cases except thin packs, PackData
objects have associated pack objects (added as a member variable in
this change), so we can use the index to resolve ref deltas if they
exist. ThinPackDatas, on the other hand, are constructed with an
ObjectStore that is used for external ref resolving. Since we now
make use of the pack index, we only have to deal with postponing shas
in ThinPackData.iterentries. Since this is the only method used for
constructing an index, we still have to be able to handle ref deltas to
objects later in the file.

Modified the compat tests (and one small error in server.py) to
successfully exercise this code, but still missing unit tests.

Change-Id: I244b92c00cbae8e30883c5de4dbf47a9195bd174
Dave Borowitz hai 15 anos
pai
achega
36a682557f

+ 4 - 4
dulwich/object_store.py

@@ -45,6 +45,7 @@ from dulwich.objects import (
 from dulwich.pack import (
 from dulwich.pack import (
     Pack,
     Pack,
     PackData,
     PackData,
+    ThinPackData,
     iter_sha1,
     iter_sha1,
     load_pack_index,
     load_pack_index,
     write_pack,
     write_pack,
@@ -390,19 +391,18 @@ class DiskObjectStore(PackBasedObjectStore):
 
 
         :param path: Path to the pack file.
         :param path: Path to the pack file.
         """
         """
-        data = PackData(path)
+        data = ThinPackData(self, path)
 
 
         # Write index for the thin pack (do we really need this?)
         # Write index for the thin pack (do we really need this?)
         temppath = os.path.join(self.pack_dir, 
         temppath = os.path.join(self.pack_dir, 
             sha_to_hex(urllib2.randombytes(20))+".tempidx")
             sha_to_hex(urllib2.randombytes(20))+".tempidx")
-        data.create_index_v2(temppath, self.get_raw)
+        data.create_index_v2(temppath)
         p = Pack.from_objects(data, load_pack_index(temppath))
         p = Pack.from_objects(data, load_pack_index(temppath))
 
 
         # Write a full pack version
         # Write a full pack version
         temppath = os.path.join(self.pack_dir, 
         temppath = os.path.join(self.pack_dir, 
             sha_to_hex(urllib2.randombytes(20))+".temppack")
             sha_to_hex(urllib2.randombytes(20))+".temppack")
-        write_pack(temppath, ((o, None) for o in p.iterobjects(self.get_raw)), 
-                len(p))
+        write_pack(temppath, ((o, None) for o in p.iterobjects()), len(p))
         pack_sha = load_pack_index(temppath+".idx").objects_sha1()
         pack_sha = load_pack_index(temppath+".idx").objects_sha1()
         newbasename = os.path.join(self.pack_dir, "pack-%s" % pack_sha)
         newbasename = os.path.join(self.pack_dir, "pack-%s" % pack_sha)
         os.rename(temppath+".pack", newbasename+".pack")
         os.rename(temppath+".pack", newbasename+".pack")

+ 127 - 77
dulwich/pack.py

@@ -536,6 +536,7 @@ class PackData(object):
         (version, self._num_objects) = read_pack_header(self._file.read)
         (version, self._num_objects) = read_pack_header(self._file.read)
         self._offset_cache = LRUSizeCache(1024*1024*20,
         self._offset_cache = LRUSizeCache(1024*1024*20,
             compute_size=_compute_object_size)
             compute_size=_compute_object_size)
+        self.pack = None
 
 
     @classmethod
     @classmethod
     def from_file(cls, file, size):
     def from_file(cls, file, size):
@@ -579,7 +580,18 @@ class PackData(object):
             todo -= len(x)
             todo -= len(x)
         return s.digest()
         return s.digest()
 
 
-    def resolve_object(self, offset, type, obj, get_ref, get_offset=None):
+    def get_ref(self, sha):
+        """Get the object for a ref SHA, only looking in this pack."""
+        # TODO: cache these results
+        if self.pack is None:
+            raise KeyError(sha)
+        offset = self.pack.index.object_index(sha)
+        if not offset:
+            raise KeyError(sha)
+        type, obj = self.get_object_at(offset)
+        return offset, type, obj
+
+    def resolve_object(self, offset, type, obj, get_ref=None):
         """Resolve an object, possibly resolving deltas when necessary.
         """Resolve an object, possibly resolving deltas when necessary.
 
 
         :return: Tuple with object type and contents.
         :return: Tuple with object type and contents.
@@ -587,28 +599,30 @@ class PackData(object):
         if type not in DELTA_TYPES:
         if type not in DELTA_TYPES:
             return type, obj
             return type, obj
 
 
-        if get_offset is None:
-            get_offset = self.get_object_at
-
+        if get_ref is None:
+            get_ref = self.get_ref
         if type == OFS_DELTA:
         if type == OFS_DELTA:
             (delta_offset, delta) = obj
             (delta_offset, delta) = obj
+            # TODO: clean up asserts and replace with nicer error messages
+            assert isinstance(offset, int)
             assert isinstance(delta_offset, int)
             assert isinstance(delta_offset, int)
             base_offset = offset-delta_offset
             base_offset = offset-delta_offset
-            type, base_obj = get_offset(base_offset)
+            type, base_obj = self.get_object_at(base_offset)
             assert isinstance(type, int)
             assert isinstance(type, int)
         elif type == REF_DELTA:
         elif type == REF_DELTA:
             (basename, delta) = obj
             (basename, delta) = obj
             assert isinstance(basename, str) and len(basename) == 20
             assert isinstance(basename, str) and len(basename) == 20
-            type, base_obj = get_ref(basename)
+            base_offset, type, base_obj = get_ref(basename)
             assert isinstance(type, int)
             assert isinstance(type, int)
-            # Can't be a ofs delta, as we wouldn't know the base offset
-            assert type != OFS_DELTA
-            base_offset = None
-        type, base_chunks = self.resolve_object(base_offset, type, base_obj,
-                                                get_ref)
-        if base_offset is not None:
-            self._offset_cache[base_offset] = type, base_chunks
-        return (type, apply_delta(base_chunks, delta))
+        type, base_chunks = self.resolve_object(base_offset, type, base_obj)
+        chunks = apply_delta(base_chunks, delta)
+        # TODO(dborowitz): This can result in poor performance if large base
+        # objects are separated from deltas in the pack. We should reorganize
+        # so that we apply deltas to all objects in a chain one after the other
+        # to optimize cache performance.
+        if offset is not None:
+            self._offset_cache[offset] = type, chunks
+        return type, chunks
 
 
     def iterobjects(self, progress=None):
     def iterobjects(self, progress=None):
 
 
@@ -641,98 +655,64 @@ class PackData(object):
                 return ret
                 return ret
         return ObjectIterator(self)
         return ObjectIterator(self)
 
 
-    def iterentries(self, ext_resolve_ref=None, progress=None):
+    def iterentries(self, progress=None):
         """Yield entries summarizing the contents of this pack.
         """Yield entries summarizing the contents of this pack.
 
 
-        :param ext_resolve_ref: Optional function to resolve base
-            objects (in case this is a thin pack)
         :param progress: Progress function, called with current and
         :param progress: Progress function, called with current and
             total object count.
             total object count.
 
 
         This will yield tuples with (sha, offset, crc32)
         This will yield tuples with (sha, offset, crc32)
         """
         """
-        found = {}
-        postponed = defaultdict(list)
-        class Postpone(Exception):
-            """Raised to postpone delta resolving."""
-
-        def get_ref_text(sha):
-            assert len(sha) == 20
-            if sha in found:
-                return self.get_object_at(found[sha])
-            if ext_resolve_ref:
-                try:
-                    return ext_resolve_ref(sha)
-                except KeyError:
-                    pass
-            raise Postpone, (sha, )
-        extra = []
-        todo = chain(self.iterobjects(progress=progress), extra)
-        for (offset, type, obj, crc32) in todo:
+        for offset, type, obj, crc32 in self.iterobjects(progress=progress):
             assert isinstance(offset, int)
             assert isinstance(offset, int)
             assert isinstance(type, int)
             assert isinstance(type, int)
-            try:
-                type, obj = self.resolve_object(offset, type, obj,
-                    get_ref_text)
-            except Postpone, (sha, ):
-                postponed[sha].append((offset, type, obj))
-            else:
-                shafile = ShaFile.from_raw_chunks(type, obj)
-                sha = shafile.sha().digest()
-                found[sha] = offset
-                yield sha, offset, crc32
-                extra.extend(postponed.get(sha, []))
-        if postponed:
-            raise KeyError([sha_to_hex(h) for h in postponed.keys()])
-
-    def sorted_entries(self, resolve_ext_ref=None, progress=None):
+            assert isinstance(obj, list) or isinstance(obj, tuple)
+            type, obj = self.resolve_object(offset, type, obj)
+            # TODO: hash the data directly to avoid object parsing overhead.
+            shafile = ShaFile.from_raw_chunks(type, obj)
+            sha = shafile.sha().digest()
+            yield sha, offset, crc32
+
+    def sorted_entries(self, progress=None):
         """Return entries in this pack, sorted by SHA.
         """Return entries in this pack, sorted by SHA.
 
 
-        :param resolve_ext_ref: Optional function to resolve base
-            objects (in case this is a thin pack)
         :param progress: Progress function, called with current and
         :param progress: Progress function, called with current and
             total object count
             total object count
         :return: List of tuples with (sha, offset, crc32)
         :return: List of tuples with (sha, offset, crc32)
         """
         """
-        ret = list(self.iterentries(resolve_ext_ref, progress=progress))
+        ret = list(self.iterentries(progress=progress))
         ret.sort()
         ret.sort()
         return ret
         return ret
 
 
-    def create_index_v1(self, filename, resolve_ext_ref=None, progress=None):
+    def create_index_v1(self, filename, progress=None):
         """Create a version 1 file for this data file.
         """Create a version 1 file for this data file.
 
 
         :param filename: Index filename.
         :param filename: Index filename.
-        :param resolve_ext_ref: Function to use for resolving externally
-            referenced SHA1s (for thin packs)
         :param progress: Progress report function
         :param progress: Progress report function
         """
         """
-        entries = self.sorted_entries(resolve_ext_ref, progress=progress)
+        entries = self.sorted_entries(progress=progress)
         write_pack_index_v1(filename, entries, self.calculate_checksum())
         write_pack_index_v1(filename, entries, self.calculate_checksum())
 
 
-    def create_index_v2(self, filename, resolve_ext_ref=None, progress=None):
+    def create_index_v2(self, filename, progress=None):
         """Create a version 2 index file for this data file.
         """Create a version 2 index file for this data file.
 
 
         :param filename: Index filename.
         :param filename: Index filename.
-        :param resolve_ext_ref: Function to use for resolving externally
-            referenced SHA1s (for thin packs)
         :param progress: Progress report function
         :param progress: Progress report function
         """
         """
-        entries = self.sorted_entries(resolve_ext_ref, progress=progress)
+        entries = self.sorted_entries(progress=progress)
         write_pack_index_v2(filename, entries, self.calculate_checksum())
         write_pack_index_v2(filename, entries, self.calculate_checksum())
 
 
-    def create_index(self, filename, resolve_ext_ref=None, progress=None,
+    def create_index(self, filename, progress=None,
                      version=2):
                      version=2):
         """Create an  index file for this data file.
         """Create an  index file for this data file.
 
 
         :param filename: Index filename.
         :param filename: Index filename.
-        :param resolve_ext_ref: Function to use for resolving externally
-            referenced SHA1s (for thin packs)
         :param progress: Progress report function
         :param progress: Progress report function
         """
         """
         if version == 1:
         if version == 1:
-            self.create_index_v1(filename, resolve_ext_ref, progress)
+            self.create_index_v1(filename, progress)
         elif version == 2:
         elif version == 2:
-            self.create_index_v2(filename, resolve_ext_ref, progress)
+            self.create_index_v2(filename, progress)
         else:
         else:
             raise ValueError("unknown index format %d" % version)
             raise ValueError("unknown index format %d" % version)
 
 
@@ -761,6 +741,79 @@ class PackData(object):
         return unpack_object(self._file.read)[:2]
         return unpack_object(self._file.read)[:2]
 
 
 
 
+class ThinPackData(PackData):
+    """PackData for thin packs, which require an ObjectStore for resolving."""
+
+    def __init__(self, store, *args, **kwargs):
+        super(ThinPackData, self).__init__(*args, **kwargs)
+        self.store = store
+
+    def get_ref(self, sha):
+        """Resolve a reference looking in both this pack and the store."""
+        try:
+            # As part of completing a pack we create a Pack object with a
+            # ThinPackData and a full PackIndex, so check in the index first if
+            # possible.
+            # TODO(dborowitz): reevaluate this when the pack completion code is
+            # rewritten.
+            return super(ThinPackData, self).get_ref(sha)
+        except KeyError:
+            type, obj = self.store.get_raw(sha)
+            return None, type, obj
+
+    def iterentries(self, progress=None):
+        """Yield entries summarizing the contents of this pack.
+
+        :param progress: Progress function, called with current and
+            total object count.
+
+        This will yield tuples with (sha, offset, crc32)
+        """
+        found = {}
+        postponed = defaultdict(list)
+
+        class Postpone(Exception):
+            """Raised to postpone delta resolving."""
+
+            def __init__(self, sha):
+                self.sha = sha
+
+        def get_ref_text(sha):
+            assert len(sha) == 20
+            if sha in found:
+                offset = found[sha]
+                type, obj = self.get_object_at(offset)
+                return offset, type, obj
+            try:
+                return self.get_ref(sha)
+            except KeyError:
+                raise Postpone(sha)
+
+        extra = []
+        todo = chain(self.iterobjects(progress=progress), extra)
+        for (offset, type, obj, crc32) in todo:
+            assert isinstance(offset, int)
+            if obj is None:
+                # Inflate postponed delta
+                obj, type = self.get_object_at(offset)
+            assert isinstance(type, int)
+            assert isinstance(obj, list) or isinstance(obj, tuple)
+            try:
+                type, obj = self.resolve_object(offset, type, obj, get_ref_text)
+            except Postpone, e:
+                # Save memory by not storing the inflated obj in postponed
+                postponed[e.sha].append((offset, type, None, crc32))
+            else:
+                # TODO: hash the data directly to avoid object parsing overhead.
+                shafile = ShaFile.from_raw_chunks(type, obj)
+                sha = shafile.sha().digest()
+                found[sha] = offset
+                yield sha, offset, crc32
+                extra.extend(postponed.pop(sha, []))
+        if postponed:
+            raise KeyError([sha_to_hex(h) for h in postponed.keys()])
+
+
 class SHA1Reader(object):
 class SHA1Reader(object):
     """Wrapper around a file-like object that remembers the SHA1 of its data."""
     """Wrapper around a file-like object that remembers the SHA1 of its data."""
 
 
@@ -1113,6 +1166,7 @@ class Pack(object):
         ret = Pack("")
         ret = Pack("")
         ret._data = data
         ret._data = data
         ret._idx = idx
         ret._idx = idx
+        data.pack = ret
         return ret
         return ret
 
 
     def name(self):
     def name(self):
@@ -1124,6 +1178,7 @@ class Pack(object):
         """The pack data object being used."""
         """The pack data object being used."""
         if self._data is None:
         if self._data is None:
             self._data = PackData(self._data_path)
             self._data = PackData(self._data_path)
+            self._data.pack = self
             assert len(self.index) == len(self._data)
             assert len(self.index) == len(self._data)
             idx_stored_checksum = self.index.get_pack_checksum()
             idx_stored_checksum = self.index.get_pack_checksum()
             data_stored_checksum = self._data.get_stored_checksum()
             data_stored_checksum = self._data.get_stored_checksum()
@@ -1180,30 +1235,25 @@ class Pack(object):
         except KeyError:
         except KeyError:
             return False
             return False
 
 
-    def get_raw(self, sha1, resolve_ref=None):
+    def get_raw(self, sha1):
         offset = self.index.object_index(sha1)
         offset = self.index.object_index(sha1)
         obj_type, obj = self.data.get_object_at(offset)
         obj_type, obj = self.data.get_object_at(offset)
         if type(offset) is long:
         if type(offset) is long:
           offset = int(offset)
           offset = int(offset)
-        if resolve_ref is None:
-            resolve_ref = self.get_raw
-        kind, chunks = self.data.resolve_object(offset, obj_type, obj,
-            resolve_ref)
-        return kind, "".join(chunks)
+        type_num, chunks = self.data.resolve_object(offset, obj_type, obj)
+        return type_num, "".join(chunks)
 
 
     def __getitem__(self, sha1):
     def __getitem__(self, sha1):
         """Retrieve the specified SHA1."""
         """Retrieve the specified SHA1."""
         type, uncomp = self.get_raw(sha1)
         type, uncomp = self.get_raw(sha1)
         return ShaFile.from_raw_string(type, uncomp)
         return ShaFile.from_raw_string(type, uncomp)
 
 
-    def iterobjects(self, get_raw=None):
+    def iterobjects(self):
         """Iterate over the objects in this pack."""
         """Iterate over the objects in this pack."""
-        if get_raw is None:
-            get_raw = self.get_raw
         for offset, type, obj, crc32 in self.data.iterobjects():
         for offset, type, obj, crc32 in self.data.iterobjects():
             assert isinstance(offset, int)
             assert isinstance(offset, int)
-            type, obj = self.data.resolve_object(offset, type, obj, get_raw)
-            yield ShaFile.from_raw_chunks(type, obj)
+            yield ShaFile.from_raw_chunks(
+                    *self.data.resolve_object(offset, type, obj))
 
 
 
 
 try:
 try:

+ 2 - 1
dulwich/server.py

@@ -642,7 +642,8 @@ class ReceivePackHandler(Handler):
 
 
     def _apply_pack(self, refs):
     def _apply_pack(self, refs):
         f, commit = self.repo.object_store.add_thin_pack()
         f, commit = self.repo.object_store.add_thin_pack()
-        all_exceptions = (IOError, OSError, ChecksumMismatch, ApplyDeltaError)
+        all_exceptions = (IOError, OSError, ChecksumMismatch, ApplyDeltaError,
+                          AssertionError, socket.error, zlib.error)
         status = []
         status = []
         unpack_error = None
         unpack_error = None
         # TODO: more informative error messages than just the exception string
         # TODO: more informative error messages than just the exception string

+ 2 - 1
dulwich/tests/compat/server_utils.py

@@ -53,7 +53,8 @@ class ServerTests(object):
 
 
     def assertReposEqual(self, repo1, repo2):
     def assertReposEqual(self, repo1, repo2):
         self.assertEqual(repo1.get_refs(), repo2.get_refs())
         self.assertEqual(repo1.get_refs(), repo2.get_refs())
-        self.assertEqual(set(repo1.object_store), set(repo2.object_store))
+        self.assertEqual(sorted(set(repo1.object_store)),
+                         sorted(set(repo2.object_store)))
 
 
     def assertReposNotEqual(self, repo1, repo2):
     def assertReposNotEqual(self, repo1, repo2):
         refs1 = repo1.get_refs()
         refs1 = repo1.get_refs()

+ 0 - 4
dulwich/tests/compat/test_server.py

@@ -76,7 +76,3 @@ class GitServerTestCase(ServerTests, CompatTestCase):
         self._server = dul_server
         self._server = dul_server
         _, port = self._server.socket.getsockname()
         _, port = self._server.socket.getsockname()
         return port
         return port
-
-    def test_push_to_dulwich(self):
-        # TODO(dborowitz): enable after merging thin pack fixes.
-        raise TestSkipped('Skipping push test due to known deadlock bug.')

+ 0 - 4
dulwich/tests/compat/test_web.py

@@ -96,10 +96,6 @@ class SmartWebTestCase(WebTests, CompatTestCase):
     def _make_app(self, backend):
     def _make_app(self, backend):
         return HTTPGitApplication(backend)
         return HTTPGitApplication(backend)
 
 
-    def test_push_to_dulwich(self):
-        # TODO(dborowitz): enable after merging thin pack fixes.
-        raise TestSkipped('Skipping push test due to known pack bug.')
-
 
 
 class DumbWebTestCase(WebTests, CompatTestCase):
 class DumbWebTestCase(WebTests, CompatTestCase):
     """Test cases for dumb HTTP server."""
     """Test cases for dumb HTTP server."""