
Significant rewrite of pack handling.

We now reuse local deltas and send them, significantly reducing the
bandwidth required for most pushes and improving performance by at
least twofold.
Jelmer Vernooij, 2 years ago
parent
commit 16b026f6d1
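
The change is container-based at its core: pack generation now works from (object id, path) pairs against a PackedObjectContainer, so deltas that already exist in local pack files can be copied into the outgoing pack instead of being recomputed. A minimal sketch of the new write_pack_from_container() entry point, assuming a hypothetical local repository path and packing everything reachable from HEAD:

    from dulwich.repo import Repo
    from dulwich.object_store import MissingObjectFinder
    from dulwich.pack import write_pack_from_container

    repo = Repo("/path/to/repo")               # hypothetical repository path
    head = repo.refs[b"HEAD"]

    # (sha, path) pairs for every object reachable from HEAD; haves=[] models
    # a receiver that has nothing yet. MissingObjectFinder now yields these
    # pairs directly instead of being wrapped in an ObjectStoreIterator.
    object_ids = list(MissingObjectFinder(repo.object_store, haves=[], wants=[head]))

    with open("/tmp/demo.pack", "wb") as f:    # hypothetical output path
        # reuse_deltas defaults to True: deltas already stored in the
        # repository's packs are sent as-is instead of being recompressed.
        write_pack_from_container(f.write, repo.object_store, object_ids)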

+ 6 - 3
dulwich/cli.py

@@ -37,7 +37,7 @@ import signal
 from typing import Dict, Type, Optional
 
 from dulwich import porcelain
-from dulwich.client import get_transport_and_path
+from dulwich.client import get_transport_and_path, GitProtocolError
 from dulwich.errors import ApplyDeltaError
 from dulwich.index import Index
 from dulwich.objectspec import parse_commit
@@ -263,8 +263,11 @@ class cmd_clone(Command):
         else:
             target = None
 
-        porcelain.clone(source, target, bare=options.bare, depth=options.depth,
-                        branch=options.branch)
+        try:
+            porcelain.clone(source, target, bare=options.bare, depth=options.depth,
+                            branch=options.branch)
+        except GitProtocolError as e:
+            print("%s" % e)
 
 
 class cmd_commit(Command):

+ 9 - 6
dulwich/client.py

@@ -51,6 +51,7 @@ from typing import (
     Callable,
     Dict,
     List,
+    Iterator,
     Optional,
     Set,
     Tuple,
@@ -117,7 +118,8 @@ from dulwich.protocol import (
     pkt_line,
 )
 from dulwich.pack import (
-    write_pack_objects,
+    write_pack_from_container,
+    UnpackedObject,
     PackChunkGenerator,
 )
 from dulwich.refs import (
@@ -699,7 +701,7 @@ class GitClient:
         """
         raise NotImplementedError(cls.from_parsedurl)
 
-    def send_pack(self, path, update_refs, generate_pack_data, progress=None):
+    def send_pack(self, path, update_refs, generate_pack_data: Callable[[Set[bytes], Set[bytes], bool], Tuple[int, Iterator[UnpackedObject]]], progress=None):
         """Upload a pack to a remote repository.
 
         Args:
@@ -1106,10 +1108,11 @@ class TraditionalGitClient(GitClient):
                 header_handler.have,
                 header_handler.want,
                 ofs_delta=(CAPABILITY_OFS_DELTA in negotiated_capabilities),
+                progress=progress,
             )
 
             if self._should_send_pack(new_refs):
-                for chunk in PackChunkGenerator(pack_data_count, pack_data):
+                for chunk in PackChunkGenerator(pack_data_count, pack_data, progress=progress):
                     proto.write(chunk)
 
             ref_status = self._handle_receive_pack_tail(
@@ -1533,7 +1536,7 @@ class LocalGitClient(GitClient):
 
         """
         with self._open_repo(path) as r:
-            objects_iter = r.fetch_objects(
+            object_ids = r.fetch_objects(
                 determine_wants, graph_walker, progress=progress, depth=depth
             )
             symrefs = r.refs.get_symrefs()
@@ -1541,9 +1544,9 @@ class LocalGitClient(GitClient):
 
             # Did the process short-circuit (e.g. in a stateless RPC call)?
             # Note that the client still expects a 0-object pack in most cases.
-            if objects_iter is None:
+            if object_ids is None:
                 return FetchPackResult(None, symrefs, agent)
-            write_pack_objects(pack_data, objects_iter, reuse_pack=r.object_store)
+            write_pack_from_container(pack_data, r.object_store, object_ids)
             return FetchPackResult(r.get_refs(), symrefs, agent)
 
     def get_refs(self, path):
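
Note on the send_pack() change above: the generate_pack_data callback is now typed as returning a count plus an iterator of UnpackedObject records, and it is invoked with a progress keyword (the compat tests near the end of this diff are updated accordingly). A stub callback satisfying the new contract, here producing an empty pack, might look like:

    def generate_pack_data(have, want, ofs_delta=False, progress=None):
        # Normally Repo.generate_pack_data() is passed here; this stub simply
        # reports zero objects and yields no UnpackedObject records.
        return 0, iter([])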

+ 0 - 19
dulwich/contrib/swift.py

@@ -40,7 +40,6 @@ from geventhttpclient import HTTPClient
 
 from dulwich.greenthreads import (
     GreenThreadsMissingObjectFinder,
-    GreenThreadsObjectStoreIterator,
 )
 
 from dulwich.lru_cache import LRUSizeCache
@@ -119,15 +118,6 @@ cache_length = 20
 """
 
 
-class PackInfoObjectStoreIterator(GreenThreadsObjectStoreIterator):
-    def __len__(self):
-        while self.finder.objects_to_send:
-            for _ in range(0, len(self.finder.objects_to_send)):
-                sha = self.finder.next()
-                self._shas.append(sha)
-        return len(self._shas)
-
-
 class PackInfoMissingObjectFinder(GreenThreadsMissingObjectFinder):
     def next(self):
         while True:
@@ -681,15 +671,6 @@ class SwiftObjectStore(PackBasedObjectStore):
         """Loose objects are not supported by this repository"""
         return []
 
-    def iter_shas(self, finder):
-        """An iterator over pack's ObjectStore.
-
-        Returns: a `ObjectStoreIterator` or `GreenThreadsObjectStoreIterator`
-                 instance if gevent is enabled
-        """
-        shas = iter(finder.next, None)
-        return PackInfoObjectStoreIterator(self, shas, finder, self.scon.concurrency)
-
     def pack_info_get(self, sha):
         for pack in self.packs:
             if sha in pack:

+ 0 - 34
dulwich/greenthreads.py

@@ -33,7 +33,6 @@ from dulwich.object_store import (
     MissingObjectFinder,
     _collect_ancestors,
     _collect_filetree_revs,
-    ObjectStoreIterator,
 )
 
 
@@ -111,36 +110,3 @@ class GreenThreadsMissingObjectFinder(MissingObjectFinder):
         else:
             self.progress = progress
         self._tagged = get_tagged and get_tagged() or {}
-
-
-class GreenThreadsObjectStoreIterator(ObjectStoreIterator):
-    """ObjectIterator that works on top of an ObjectStore.
-
-    Same implementation as object_store.ObjectStoreIterator
-    except we use gevent to parallelize object retrieval.
-    """
-
-    def __init__(self, store, shas, finder, concurrency=1):
-        self.finder = finder
-        self.p = pool.Pool(size=concurrency)
-        super().__init__(store, shas)
-
-    def retrieve(self, args):
-        sha, path = args
-        return self.store[sha], path
-
-    def __iter__(self):
-        yield from self.p.imap_unordered(self.retrieve, self.itershas())
-
-    def __len__(self):
-        if len(self._shas) > 0:
-            return len(self._shas)
-        while self.finder.objects_to_send:
-            jobs = []
-            for _ in range(0, len(self.finder.objects_to_send)):
-                jobs.append(self.p.spawn(self.finder.next))
-            gevent.joinall(jobs)
-            for j in jobs:
-                if j.value is not None:
-                    self._shas.append(j.value)
-        return len(self._shas)

+ 189 - 180
dulwich/object_store.py

@@ -28,7 +28,7 @@ import stat
 import sys
 import warnings
 
-from typing import Callable, Dict, List, Optional, Tuple, Protocol, Union, Iterator, Set
+from typing import Callable, Dict, List, Optional, Tuple, Protocol, Union, Iterator, Set, Iterable, Sequence, cast
 
 from dulwich.errors import (
     NotTreeError,
@@ -40,6 +40,7 @@ from dulwich.objects import (
     ShaFile,
     Tag,
     Tree,
+    Blob,
     ZERO_SHA,
     hex_to_sha,
     sha_to_hex,
@@ -55,8 +56,11 @@ from dulwich.pack import (
     PackData,
     PackInflater,
     PackFileDisappeared,
+    UnpackedObject,
     load_pack_index_file,
     iter_sha1,
+    full_unpacked_object,
+    generate_unpacked_objects,
     pack_objects_to_data,
     write_pack_header,
     write_pack_index_v2,
@@ -65,6 +69,7 @@ from dulwich.pack import (
     compute_file_sha,
     PackIndexer,
     PackStreamCopier,
+    PackedObjectContainer,
 )
 from dulwich.protocol import DEPTH_INFINITE
 from dulwich.refs import ANNOTATED_TAG_SUFFIX, Ref
@@ -109,29 +114,16 @@ class BaseObjectStore:
             and not sha == ZERO_SHA
         ]
 
-    def iter_shas(self, shas):
-        """Iterate over the objects for the specified shas.
-
-        Args:
-          shas: Iterable object with SHAs
-        Returns: Object iterator
-        """
-        return ObjectStoreIterator(self, shas)
-
     def contains_loose(self, sha):
         """Check if a particular object is present by SHA1 and is loose."""
         raise NotImplementedError(self.contains_loose)
 
-    def contains_packed(self, sha):
-        """Check if a particular object is present by SHA1 and is packed."""
-        raise NotImplementedError(self.contains_packed)
-
-    def __contains__(self, sha):
+    def __contains__(self, sha1: bytes) -> bool:
         """Check if a particular object is present by SHA1.
 
         This method makes no distinction between loose and packed objects.
         """
-        return self.contains_packed(sha) or self.contains_loose(sha)
+        return self.contains_loose(sha1)
 
     @property
     def packs(self):
@@ -147,21 +139,15 @@ class BaseObjectStore:
         """
         raise NotImplementedError(self.get_raw)
 
-    def __getitem__(self, sha: ObjectID):
+    def __getitem__(self, sha1: ObjectID) -> ShaFile:
         """Obtain an object by SHA1."""
-        type_num, uncomp = self.get_raw(sha)
-        return ShaFile.from_raw_string(type_num, uncomp, sha=sha)
+        type_num, uncomp = self.get_raw(sha1)
+        return ShaFile.from_raw_string(type_num, uncomp, sha=sha1)
 
     def __iter__(self):
         """Iterate over the SHAs that are present in this store."""
         raise NotImplementedError(self.__iter__)
 
-    def add_pack(
-        self
-    ) -> Tuple[BytesIO, Callable[[], None], Callable[[], None]]:
-        """Add a new pack to this object store."""
-        raise NotImplementedError(self.add_pack)
-
     def add_object(self, obj):
         """Add a single object to this object store."""
         raise NotImplementedError(self.add_object)
@@ -174,31 +160,6 @@ class BaseObjectStore:
         """
         raise NotImplementedError(self.add_objects)
 
-    def add_pack_data(self, count, pack_data, progress=None):
-        """Add pack data to this object store.
-
-        Args:
-          count: Number of items to add
-          pack_data: Iterator over pack data tuples
-        """
-        if count == 0:
-            # Don't bother writing an empty pack file
-            return
-        f, commit, abort = self.add_pack()
-        try:
-            write_pack_data(
-                f.write,
-                pack_data,
-                num_records=count,
-                progress=progress,
-                compression_level=self.pack_compression_level,
-            )
-        except BaseException:
-            abort()
-            raise
-        else:
-            return commit()
-
     def tree_changes(
         self,
         source,
@@ -253,6 +214,14 @@ class BaseObjectStore:
             DeprecationWarning, stacklevel=2)
         return iter_tree_contents(self, tree_id, include_trees=include_trees)
 
+    def iterobjects_subset(self, shas: Iterable[bytes], *, allow_missing: bool = False) -> Iterator[ShaFile]:
+        for sha in shas:
+            try:
+                yield self[sha]
+            except KeyError:
+                if not allow_missing:
+                    raise
+
     def find_common_revisions(self, graphwalker):
         """Find which revisions this store has in common using graphwalker.
 
@@ -269,22 +238,10 @@ class BaseObjectStore:
             sha = next(graphwalker)
         return haves
 
-    def generate_pack_contents(self, have, want, shallow=None, progress=None):
-        """Iterate over the contents of a pack file.
-
-        Args:
-          have: List of SHA1s of objects that should not be sent
-          want: List of SHA1s of objects that should be sent
-          shallow: Set of shallow commit SHA1s to skip
-          progress: Optional progress reporting method
-        """
-        missing = MissingObjectFinder(
-            self, haves=have, wants=want, shallow=shallow, progress=progress)
-        return self.iter_shas(missing)
-
     def generate_pack_data(
-        self, have, want, shallow=None, progress=None, ofs_delta=True
-    ):
+        self, have, want, shallow=None, progress=None,
+        ofs_delta=True
+    ) -> Tuple[int, Iterator[UnpackedObject]]:
         """Generate pack data objects for a set of wants/haves.
 
         Args:
@@ -294,10 +251,12 @@ class BaseObjectStore:
           ofs_delta: Whether OFS deltas can be included
           progress: Optional progress reporting method
         """
-        # TODO(jelmer): More efficient implementation
+        # Note that the pack-specific implementation in PackBasedObjectStore
+        # is more efficient, as it can reuse existing deltas
+        object_ids = list(MissingObjectFinder(self, have, want, shallow, progress))
         return pack_objects_to_data(
-            self.generate_pack_contents(have, want, shallow, progress)
-        )
+            [(self[oid], path) for oid, path in object_ids], ofs_delta=ofs_delta,
+            progress=progress)
 
     def peel_sha(self, sha):
         """Peel all tags from a SHA.
@@ -353,6 +312,37 @@ class PackBasedObjectStore(BaseObjectStore):
         self._pack_cache = {}
         self.pack_compression_level = pack_compression_level
 
+    def add_pack(
+        self
+    ) -> Tuple[BytesIO, Callable[[], None], Callable[[], None]]:
+        """Add a new pack to this object store."""
+        raise NotImplementedError(self.add_pack)
+
+    def add_pack_data(self, count: int, unpacked_objects: Iterator[UnpackedObject], progress=None) -> None:
+        """Add pack data to this object store.
+
+        Args:
+          count: Number of items to add
+          unpacked_objects: Iterator over UnpackedObject instances
+        """
+        if count == 0:
+            # Don't bother writing an empty pack file
+            return
+        f, commit, abort = self.add_pack()
+        try:
+            write_pack_data(
+                f.write,
+                unpacked_objects,
+                num_records=count,
+                progress=progress,
+                compression_level=self.pack_compression_level,
+            )
+        except BaseException:
+            abort()
+            raise
+        else:
+            return commit()
+
     @property
     def alternates(self):
         return []
@@ -390,6 +380,26 @@ class PackBasedObjectStore(BaseObjectStore):
             if prev_pack:
                 prev_pack.close()
 
+    def generate_pack_data(
+        self, have, want, shallow=None, progress=None,
+        ofs_delta=True
+    ) -> Tuple[int, Iterator[UnpackedObject]]:
+        """Generate pack data objects for a set of wants/haves.
+
+        Args:
+          have: List of SHA1s of objects that should not be sent
+          want: List of SHA1s of objects that should be sent
+          shallow: Set of shallow commit SHA1s to skip
+          ofs_delta: Whether OFS deltas can be included
+          progress: Optional progress reporting method
+        """
+        object_ids = list(MissingObjectFinder(self, have, want, shallow, progress))
+        return len(object_ids), generate_unpacked_objects(
+            cast(PackedObjectContainer, self),
+            object_ids,
+            progress=progress,
+            ofs_delta=ofs_delta)
+
     def _clear_cached_packs(self):
         pack_cache = self._pack_cache
         self._pack_cache = {}
@@ -529,47 +539,89 @@ class PackBasedObjectStore(BaseObjectStore):
                 pass
         raise KeyError(hexsha)
 
-    def get_raw_unresolved(self, name: bytes) -> Tuple[int, Union[bytes, None], List[bytes]]:
-        """Obtain the unresolved data for an object.
+    def iter_unpacked_subset(self, shas, *, include_comp=False, allow_missing: bool = False, convert_ofs_delta: bool = True) -> Iterator[UnpackedObject]:
+        todo: Set[bytes] = set(shas)
+        for p in self._iter_cached_packs():
+            for unpacked in p.iter_unpacked_subset(todo, include_comp=include_comp, allow_missing=True, convert_ofs_delta=convert_ofs_delta):
+                yield unpacked
+                hexsha = sha_to_hex(unpacked.sha())
+                todo.remove(hexsha)
+        # Maybe something else has added a pack with the object
+        # in the mean time?
+        for p in self._update_pack_cache():
+            for unpacked in p.iter_unpacked_subset(todo, include_comp=include_comp, allow_missing=True, convert_ofs_delta=convert_ofs_delta):
+                yield unpacked
+                hexsha = sha_to_hex(unpacked.sha())
+                todo.remove(hexsha)
+        for alternate in self.alternates:
+            for unpacked in alternate.iter_unpacked_subset(todo, include_comp=include_comp, allow_missing=True, convert_ofs_delta=convert_ofs_delta):
+                yield unpacked
+                hexsha = sha_to_hex(unpacked.sha())
+                todo.remove(hexsha)
+
+    def iterobjects_subset(self, shas: Iterable[bytes], *, allow_missing: bool = False) -> Iterator[ShaFile]:
+        todo: Set[bytes] = set(shas)
+        for p in self._iter_cached_packs():
+            for o in p.iterobjects_subset(todo, allow_missing=True):
+                yield o
+                todo.remove(o.id)
+        # Maybe something else has added a pack with the object
+        # in the mean time?
+        for p in self._update_pack_cache():
+            for o in p.iterobjects_subset(todo, allow_missing=True):
+                yield o
+                todo.remove(o.id)
+        for alternate in self.alternates:
+            for o in alternate.iterobjects_subset(todo, allow_missing=True):
+                yield o
+                todo.remove(o.id)
+        for oid in todo:
+            o = self._get_loose_object(oid)
+            if o is not None:
+                yield o
+            elif not allow_missing:
+                raise KeyError(oid)
+
+    def get_unpacked_object(self, sha1: bytes, *, include_comp: bool = False) -> UnpackedObject:
+        """Obtain the unpacked object.
 
         Args:
-          name: sha for the object.
+          sha1: sha for the object.
         """
-        if name == ZERO_SHA:
-            raise KeyError(name)
-        if len(name) == 40:
-            sha = hex_to_sha(name)
-            hexsha = name
-        elif len(name) == 20:
-            sha = name
+        if sha1 == ZERO_SHA:
+            raise KeyError(sha1)
+        if len(sha1) == 40:
+            sha = hex_to_sha(sha1)
+            hexsha = sha1
+        elif len(sha1) == 20:
+            sha = sha1
             hexsha = None
         else:
-            raise AssertionError("Invalid object name {!r}".format(name))
+            raise AssertionError("Invalid object sha1 {!r}".format(sha1))
         for pack in self._iter_cached_packs():
             try:
-                return pack.get_raw_unresolved(sha)
+                return pack.get_unpacked_object(sha, include_comp=include_comp)
             except (KeyError, PackFileDisappeared):
                 pass
         if hexsha is None:
-            hexsha = sha_to_hex(name)
-        ret = self._get_loose_object(hexsha)
-        if ret is not None:
-            return ret.type_num, None, ret.as_raw_chunks()
+            hexsha = sha_to_hex(sha1)
         # Maybe something else has added a pack with the object
         # in the mean time?
         for pack in self._update_pack_cache():
             try:
-                return pack.get_raw_unresolved(sha)
+                return pack.get_unpacked_object(sha, include_comp=include_comp)
             except KeyError:
                 pass
         for alternate in self.alternates:
             try:
-                return alternate.get_raw_unresolved(hexsha)
+                return alternate.get_unpacked_object(hexsha, include_comp=include_comp)
             except KeyError:
                 pass
         raise KeyError(hexsha)
 
-    def add_objects(self, objects, progress=None):
+    def add_objects(
+            self, objects: Sequence[Tuple[ShaFile, Optional[str]]],
+            progress: Optional[Callable[[str], None]] = None) -> None:
         """Add a set of objects to this object store.
 
         Args:
@@ -577,7 +629,9 @@ class PackBasedObjectStore(BaseObjectStore):
             __len__.
         Returns: Pack object of the objects written.
         """
-        return self.add_pack_data(*pack_objects_to_data(objects), progress=progress)
+        count = len(objects)
+        record_iter = (full_unpacked_object(o) for (o, p) in objects)
+        return self.add_pack_data(count, record_iter, progress=progress)
 
 
 class DiskObjectStore(PackBasedObjectStore):
@@ -1067,79 +1121,6 @@ class ObjectIterator(Protocol):
         raise NotImplementedError(self.iterobjects)
 
 
-class ObjectStoreIterator(ObjectIterator):
-    """ObjectIterator that works on top of an ObjectStore."""
-
-    def __init__(self, store, sha_iter):
-        """Create a new ObjectIterator.
-
-        Args:
-          store: Object store to retrieve from
-          sha_iter: Iterator over (sha, path) tuples
-        """
-        self.store = store
-        self.sha_iter = sha_iter
-        self._shas = []
-
-    def __iter__(self):
-        """Yield tuple with next object and path."""
-        for sha, path in self.itershas():
-            yield self.store[sha], path
-
-    def iterobjects(self):
-        """Iterate over just the objects."""
-        for o, path in self:
-            yield o
-
-    def itershas(self):
-        """Iterate over the SHAs."""
-        for sha in self._shas:
-            yield sha
-        for sha in self.sha_iter:
-            self._shas.append(sha)
-            yield sha
-
-    def __contains__(self, needle):
-        """Check if an object is present.
-
-        Note: This checks if the object is present in
-            the underlying object store, not if it would
-            be yielded by the iterator.
-
-        Args:
-          needle: SHA1 of the object to check for
-        """
-        if needle == ZERO_SHA:
-            return False
-        return needle in self.store
-
-    def __getitem__(self, key):
-        """Find an object by SHA1.
-
-        Note: This retrieves the object from the underlying
-            object store. It will also succeed if the object would
-            not be returned by the iterator.
-        """
-        return self.store[key]
-
-    def __len__(self):
-        """Return the number of objects."""
-        return len(list(self.itershas()))
-
-    def _empty(self):
-        it = self.itershas()
-        try:
-            next(it)
-        except StopIteration:
-            return True
-        else:
-            return False
-
-    def __bool__(self):
-        """Indicate whether this object has contents."""
-        return not self._empty()
-
-
 def tree_lookup_path(lookup_obj, root_sha, path):
     """Look up an object in a Git tree.
 
@@ -1270,27 +1251,33 @@ class MissingObjectFinder:
             shallow=shallow,
             get_parents=self._get_parents,
         )
-        self.sha_done = set()
+        self.remote_has: Set[bytes] = set()
         # Now, fill sha_done with commits and revisions of
         # files and directories known to be both locally
         # and on target. Thus these commits and files
         # won't get selected for fetch
         for h in common_commits:
-            self.sha_done.add(h)
+            self.remote_has.add(h)
             cmt = object_store[h]
-            _collect_filetree_revs(object_store, cmt.tree, self.sha_done)
+            _collect_filetree_revs(object_store, cmt.tree, self.remote_has)
         # record tags we have as visited, too
         for t in have_tags:
-            self.sha_done.add(t)
+            self.remote_has.add(t)
+        self.sha_done = set(self.remote_has)
 
-        missing_tags = want_tags.difference(have_tags)
-        missing_others = want_others.difference(have_others)
         # in fact, what we 'want' is commits, tags, and others
         # we've found missing
-        wants = missing_commits.union(missing_tags)
-        wants = wants.union(missing_others)
-
-        self.objects_to_send = {(w, None, False) for w in wants}
+        self.objects_to_send = {
+            (w, None, Commit.type_num, False)
+            for w in missing_commits}
+        missing_tags = want_tags.difference(have_tags)
+        self.objects_to_send.update(
+            {(w, None, Tag.type_num, False)
+            for w in missing_tags})
+        missing_others = want_others.difference(have_others)
+        self.objects_to_send.update(
+            {(w, None, None, False)
+            for w in missing_others})
 
         if progress is None:
             self.progress = lambda x: None
@@ -1301,35 +1288,39 @@ class MissingObjectFinder:
     def add_todo(self, entries):
         self.objects_to_send.update([e for e in entries if not e[0] in self.sha_done])
 
-    def __next__(self):
+    def __next__(self) -> Tuple[bytes, Optional[bytes]]:
         while True:
             if not self.objects_to_send:
-                return None
-            (sha, name, leaf) = self.objects_to_send.pop()
+                self.progress(("counting objects: %d, done.\n" % len(self.sha_done)).encode("ascii"))
+                raise StopIteration
+            (sha, name, type_num, leaf) = self.objects_to_send.pop()
             if sha not in self.sha_done:
                 break
         if not leaf:
             o = self.object_store[sha]
             if isinstance(o, Commit):
-                self.add_todo([(o.tree, b"", False)])
+                self.add_todo([(o.tree, b"", Tree.type_num, False)])
             elif isinstance(o, Tree):
                 self.add_todo(
                     [
-                        (s, n, not stat.S_ISDIR(m))
+                        (s, n, (Blob.type_num if stat.S_ISREG(m) else Tree.type_num),
+                         not stat.S_ISDIR(m))
                         for n, m, s in o.iteritems()
                         if not S_ISGITLINK(m)
                     ]
                 )
             elif isinstance(o, Tag):
-                self.add_todo([(o.object[1], None, False)])
+                self.add_todo([(o.object[1], None, o.object[0].type_num, False)])
         if sha in self._tagged:
-            self.add_todo([(self._tagged[sha], None, True)])
+            self.add_todo([(self._tagged[sha], None, None, True)])
         self.sha_done.add(sha)
-        self.progress(("counting objects: %d\r" % len(self.sha_done)).encode("ascii"))
+        if len(self.sha_done) % 1000 == 0:
+            self.progress(("counting objects: %d\r" % len(self.sha_done)).encode("ascii"))
+        # TODO (type_num, name, -length, sha)
         return (sha, name)
 
     def __iter__(self):
-        return iter(self.__next__, None)
+        return self
 
 
 class ObjectStoreGraphWalker:
@@ -1476,6 +1467,24 @@ class OverlayObjectStore(BaseObjectStore):
                     yield o_id
                     done.add(o_id)
 
+    def iterobjects_subset(self, shas: Iterable[bytes], *, allow_missing: bool = False) -> Iterator[ShaFile]:
+        todo = set(shas)
+        for b in self.bases:
+            for o in b.iterobjects_subset(todo, allow_missing=True):
+                yield o
+                todo.remove(o.id)
+        if todo and not allow_missing:
+            raise KeyError(todo.pop())
+
+    def iter_unpacked_subset(self, shas: Iterable[bytes], *, include_comp=False, allow_missing: bool = False, convert_ofs_delta=True) -> Iterator[UnpackedObject]:
+        todo = set(shas)
+        for b in self.bases:
+            for o in b.iter_unpacked_subset(todo, include_comp=include_comp, allow_missing=True, convert_ofs_delta=convert_ofs_delta):
+                yield o
+                todo.remove(o.id)
+        if todo and not allow_missing:
+            raise KeyError(todo.pop())
+
     def get_raw(self, sha_id):
         for b in self.bases:
             try:
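
The object_store.py changes replace the removed ObjectStoreIterator with iterobjects_subset() and iter_unpacked_subset(), which walk cached packs, newly appeared packs, alternates and finally loose objects. A small sketch of the new helper, assuming a hypothetical repository path; allow_missing=True skips absent ids instead of raising KeyError:

    from dulwich.repo import Repo

    repo = Repo("/path/to/repo")           # hypothetical repository path
    wanted = [repo.refs[b"HEAD"]]          # e.g. just the HEAD commit object

    for obj in repo.object_store.iterobjects_subset(wanted, allow_missing=True):
        print(obj.id.decode("ascii"), obj.type_name.decode("ascii"))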

+ 1 - 4
dulwich/objects.py

@@ -528,10 +528,7 @@ class ShaFile:
 
     def raw_length(self) -> int:
         """Returns the length of the raw string of this object."""
-        ret = 0
-        for chunk in self.as_raw_chunks():
-            ret += len(chunk)
-        return ret
+        return sum(map(len, self.as_raw_chunks()))
 
     def sha(self):
         """The SHA1 object that is the name of this object."""

+ 211 - 142
dulwich/pack.py

@@ -50,7 +50,7 @@ from itertools import chain
 
 import os
 import sys
-from typing import Optional, Callable, Tuple, List, Deque, Union, Protocol, Iterable, Iterator, Dict, TypeVar, Generic
+from typing import Optional, Callable, Tuple, List, Deque, Union, Protocol, Iterable, Iterator, Dict, TypeVar, Generic, Sequence, Set
 import warnings
 
 from hashlib import sha1
@@ -108,7 +108,7 @@ class ObjectContainer(Protocol):
         """Add a single object to this object store."""
 
     def add_objects(
-            self, objects: Iterable[Tuple[ShaFile, Optional[str]]],
+            self, objects: Sequence[Tuple[ShaFile, Optional[str]]],
             progress: Optional[Callable[[str], None]] = None) -> None:
         """Add a set of objects to this object store.
 
@@ -129,6 +129,14 @@ class PackedObjectContainer(ObjectContainer):
         """Get a raw unresolved object."""
         raise NotImplementedError(self.get_unpacked_object)
 
+    def iterobjects_subset(self, shas: Iterable[bytes], *, allow_missing: bool = False) -> Iterator[ShaFile]:
+        raise NotImplementedError(self.iterobjects_subset)
+
+    def iter_unpacked_subset(
+            self, shas: Set[bytes], include_comp: bool = False, allow_missing: bool = False,
+            convert_ofs_delta: bool = True) -> Iterator["UnpackedObject"]:
+        raise NotImplementedError(self.iter_unpacked_subset)
+
 
 def take_msb_bytes(read: Callable[[int], bytes], crc32: Optional[int] = None) -> Tuple[List[int], Optional[int]]:
     """Read bytes marked with most significant bit.
@@ -176,6 +184,8 @@ class UnpackedObject:
 
     obj_type_num: Optional[int]
     obj_chunks: Optional[List[bytes]]
+    delta_base: Union[None, bytes, int]
+    decomp_chunks: List[bytes]
 
     # TODO(dborowitz): read_zlib_chunks and unpack_object could very well be
     # methods of this object.
@@ -456,7 +466,6 @@ class PackIndex:
 
     def object_sha1(self, index: int) -> bytes:
         """Return the SHA1 corresponding to the index in the pack file."""
-        # PERFORMANCE/TODO(jelmer): Avoid scanning entire index
         for (name, offset, crc32) in self.iterentries():
             if offset == index:
                 return name
@@ -531,6 +540,10 @@ class MemoryPackIndex(PackIndex):
     def for_pack(cls, pack):
         return MemoryPackIndex(pack.sorted_entries(), pack.calculate_checksum())
 
+    @classmethod
+    def clone(cls, other_index):
+        return cls(other_index.iterentries(), other_index.get_pack_checksum())
+
 
 class FilePackIndex(PackIndex):
     """Pack index that is based on a file.
@@ -1326,34 +1339,6 @@ class PackData:
         unpacked.offset = offset
         return unpacked
 
-    def get_compressed_data_at(self, offset):
-        """Given offset in the packfile return compressed data that is there.
-
-        Using the associated index the location of an object can be looked up,
-        and then the packfile can be asked directly for that object using this
-        function.
-        """
-        unpacked = self.get_unpacked_object_at(offset, include_comp=True)
-        return (
-            unpacked.pack_type_num,
-            unpacked.delta_base,
-            unpacked.comp_chunks,
-        )
-
-    def get_decompressed_data_at(self, offset: int) -> Tuple[int, Optional[bytes], List[bytes]]:
-        """Given an offset in the packfile, decompress the data that is there.
-
-        Using the associated index the location of an object can be looked up,
-        and then the packfile can be asked directly for that object using this
-        function.
-        """
-        unpacked = self.get_unpacked_object_at(offset, include_comp=False)
-        return (
-            unpacked.pack_type_num,
-            unpacked.delta_base,
-            unpacked.decomp_chunks,
-        )
-
     def get_object_at(self, offset: int) -> Tuple[int, OldUnpackedObject]:
         """Given an offset in to the packfile return the object that is there.
 
@@ -1408,7 +1393,7 @@ class DeltaChainIterator(Generic[T]):
     def for_pack_data(cls, pack_data: PackData, resolve_ext_ref=None):
         walker = cls(None, resolve_ext_ref=resolve_ext_ref)
         walker.set_pack_data(pack_data)
-        for unpacked in pack_data.iter_unpacked():
+        for unpacked in pack_data.iter_unpacked(include_comp=False):
             walker.record(unpacked)
         return walker
 
@@ -1438,6 +1423,7 @@ class DeltaChainIterator(Generic[T]):
                 base_ofs = unpacked.offset - unpacked.delta_base
             elif unpacked.pack_type_num == REF_DELTA:
                 with suppress(KeyError):
+                    assert isinstance(unpacked.delta_base, bytes)
                     base_ofs = pack.index.object_index(unpacked.delta_base)
             if base_ofs is not None and base_ofs not in done:
                 todo.add(base_ofs)
@@ -1450,6 +1436,7 @@ class DeltaChainIterator(Generic[T]):
             base_offset = offset - unpacked.delta_base
             self._pending_ofs[base_offset].append(offset)
         elif type_num == REF_DELTA:
+            assert isinstance(unpacked.delta_base, bytes)
             self._pending_ref[unpacked.delta_base].append(offset)
         else:
             self._full_ofs.append((offset, type_num))
@@ -1675,11 +1662,6 @@ def write_pack_object(write, type, object, sha=None, compression_level=-1):
       compression_level: the zlib compression level
     Returns: Tuple with offset at which the object was written, and crc32
     """
-    if hasattr(write, 'write'):
-        warnings.warn(
-            'write_pack_object() now takes a write rather than file argument',
-            DeprecationWarning, stacklevel=2)
-        write = write.write
     crc32 = 0
     for chunk in pack_object_chunks(
             type, object, compression_level=compression_level):
@@ -1692,17 +1674,17 @@ def write_pack_object(write, type, object, sha=None, compression_level=-1):
 
 def write_pack(
         filename,
-        objects,
+        objects: Sequence[Tuple[ShaFile, Optional[bytes]]],
         *,
         deltify: Optional[bool] = None,
         delta_window_size: Optional[int] = None,
-        compression_level: int = -1,
-        reuse_pack: Optional[PackedObjectContainer] = None):
+        compression_level: int = -1):
     """Write a new pack data file.
 
     Args:
       filename: Path to the new pack file (without .pack extension)
-      objects: (object, path) tuple iterable to write. Should provide __len__
+      objects: Sequence of (object, path) tuples to write
       delta_window_size: Delta window size
       deltify: Whether to deltify pack objects
       compression_level: the zlib compression level
@@ -1715,7 +1697,6 @@ def write_pack(
             delta_window_size=delta_window_size,
             deltify=deltify,
             compression_level=compression_level,
-            reuse_pack=reuse_pack,
         )
     entries = sorted([(k, v[0], v[1]) for (k, v) in entries.items()])
     with GitFile(filename + ".idx", "wb") as f:
@@ -1740,60 +1721,72 @@ def write_pack_header(write, num_objects):
         write(chunk)
 
 
+def find_reusable_deltas(
+        container: PackedObjectContainer,
+        object_ids: Set[bytes],
+        *, other_haves: Optional[Set[bytes]] = None, progress=None) -> Iterator[UnpackedObject]:
+    if other_haves is None:
+        other_haves = set()
+    reused = 0
+    for i, unpacked in enumerate(container.iter_unpacked_subset(object_ids, allow_missing=True, convert_ofs_delta=True)):
+        if progress is not None and i % 1000 == 0:
+            progress(("checking for reusable deltas: %d/%d\r" % (i, len(object_ids))).encode('utf-8'))
+        if unpacked.pack_type_num == REF_DELTA:
+            hexsha = sha_to_hex(unpacked.delta_base)
+            if hexsha in object_ids or hexsha in other_haves:
+                yield unpacked
+                reused += 1
+    if progress is not None:
+        progress(("found %d deltas to reuse\n" % (reused, )).encode('utf-8'))
+
+
 def deltify_pack_objects(
-        objects: Iterable[Tuple[ShaFile, str]],
-        window_size: Optional[int] = None,
-        reuse_pack: Optional[PackedObjectContainer] = None) -> Iterator[UnpackedObject]:
+        objects,
+        *, window_size: Optional[int] = None,
+        progress=None) -> Iterator[UnpackedObject]:
     """Generate deltas for pack objects.
 
     Args:
       objects: An iterable of (object, path) tuples to deltify.
       window_size: Window size; None for default
-      reuse_pack: Pack object we can search for objects to reuse
     Returns: Iterator over type_num, object id, delta_base, content
         delta_base is None for full text entries
     """
-    # TODO(jelmer): Use threads
-    if window_size is None:
-        window_size = DEFAULT_PACK_DELTA_WINDOW_SIZE
+    yield from deltas_from_sorted_objects(
+        sort_objects_for_delta(objects),
+        window_size=window_size,
+        progress=progress)
 
-    reused_deltas = set()
-    if reuse_pack:
-        # Build a set of SHA1 IDs which will be part of this pack file.
-        # We can only reuse a delta if its base will be present in the
-        # generated pack file.
-        objects_to_pack = set()
-        for obj, path in objects:
-            objects_to_pack.add(sha_to_hex(obj.sha().digest()))
-        for o, _ in objects:
-            sha_digest = o.sha().digest()
-            # get_raw_unresolved() translates OFS_DELTA into REF_DELTA for us
-            try:
-                unpacked = reuse_pack.get_unpacked_object(sha_digest, convert_ofs_delta=True)
-            except KeyError:
-                continue
-            if unpacked.pack_type_num == REF_DELTA and unpacked.delta_base in objects_to_pack:
-                yield unpacked
-                reused_deltas.add(sha_digest)
 
-    # Build a list of objects ordered by the magic Linus heuristic
-    # This helps us find good objects to diff against us
+def sort_objects_for_delta(objects: Union[Iterator[ShaFile], Iterator[Tuple[ShaFile, Optional[bytes]]]]) -> Iterator[ShaFile]:
     magic = []
-    for obj, path in objects:
-        if obj.sha().digest() in reused_deltas:
-            continue
+    for entry in objects:
+        if isinstance(entry, tuple):
+            obj, path = entry
+        else:
+            obj, path = entry, None
         magic.append((obj.type_num, path, -obj.raw_length(), obj))
+    # Build a list of objects ordered by the magic Linus heuristic
+    # This helps us find good objects to diff against us
     magic.sort()
+    return (x[3] for x in magic)
 
-    possible_bases: Deque[Tuple[bytes, int, List[bytes]]] = deque()
 
-    for type_num, path, neg_length, o in magic:
+def deltas_from_sorted_objects(objects, window_size: Optional[int] = None, progress=None):
+    # TODO(jelmer): Use threads
+    if window_size is None:
+        window_size = DEFAULT_PACK_DELTA_WINDOW_SIZE
+
+    possible_bases: Deque[Tuple[bytes, int, List[bytes]]] = deque()
+    for i, o in enumerate(objects):
+        if progress is not None and i % 1000 == 0:
+            progress(("generating deltas: %d\r" % (i, )).encode('utf-8'))
         raw = o.as_raw_chunks()
         winner = raw
         winner_len = sum(map(len, winner))
         winner_base = None
         for base_id, base_type_num, base in possible_bases:
-            if base_type_num != type_num:
+            if base_type_num != o.type_num:
                 continue
             delta_len = 0
             delta = []
@@ -1806,40 +1799,80 @@ def deltify_pack_objects(
                 winner_base = base_id
                 winner = delta
                 winner_len = sum(map(len, winner))
-        unpacked = UnpackedObject(type_num, sha=o.sha().digest(), delta_base=winner_base, decomp_len=winner_len, decomp_chunks=winner)
-        yield unpacked
-        possible_bases.appendleft((o.sha().digest(), type_num, raw))
+        yield UnpackedObject(o.type_num, sha=o.sha().digest(), delta_base=winner_base, decomp_len=winner_len, decomp_chunks=winner)
+        possible_bases.appendleft((o.sha().digest(), o.type_num, raw))
         while len(possible_bases) > window_size:
             possible_bases.pop()
 
 
 def pack_objects_to_data(
-        objects,
-        *, delta_window_size: Optional[int] = None,
+        objects: Sequence[Tuple[ShaFile, Optional[bytes]]],
+        *,
         deltify: Optional[bool] = None,
-        reuse_pack: Optional[PackedObjectContainer] = None) -> Tuple[int, Iterator[UnpackedObject]]:
+        delta_window_size: Optional[int] = None,
+        ofs_delta: bool = True,
+        progress=None) -> Tuple[int, Iterator[UnpackedObject]]:
     """Create pack data from objects
 
     Args:
       objects: Pack objects
     Returns: Tuples with (type_num, hexdigest, delta base, object chunks)
     """
+    # TODO(jelmer): support deltaifying
     count = len(objects)
     if deltify is None:
         # PERFORMANCE/TODO(jelmer): This should be enabled but is *much* too
         # slow at the moment.
         deltify = False
     if deltify:
-        pack_contents = deltify_pack_objects(
-            objects, window_size=delta_window_size, reuse_pack=reuse_pack)
-        return (count, pack_contents)
+        return (
+            count,
+            deltify_pack_objects(objects, window_size=delta_window_size, progress=progress))
     else:
         return (
             count,
-            (full_unpacked_object(o) for (o, path) in objects)
+            (full_unpacked_object(o) for o, path in objects)
         )
 
 
+def generate_unpacked_objects(
+        container: PackedObjectContainer,
+        object_ids: Sequence[Tuple[bytes, Optional[bytes]]],
+        delta_window_size: Optional[int] = None,
+        deltify: Optional[bool] = None,
+        reuse_deltas: bool = True,
+        ofs_delta: bool = True,
+        other_haves: Optional[Set[bytes]] = None,
+        progress=None) -> Iterator[UnpackedObject]:
+    """Create pack data from objects
+
+    Args:
+      container: PackedObjectContainer to read objects and deltas from
+      object_ids: Sequence of (sha, path) tuples to include in the pack
+    Returns: Iterator over UnpackedObject instances
+    """
+    todo = dict(object_ids)
+    if reuse_deltas:
+        for unpack in find_reusable_deltas(container, set(todo), other_haves=other_haves, progress=progress):
+            del todo[sha_to_hex(unpack.sha())]
+            yield unpack
+    if deltify is None:
+        # PERFORMANCE/TODO(jelmer): This should be enabled but is *much* too
+        # slow at the moment.
+        deltify = False
+    if deltify:
+        objects_to_delta = container.iterobjects_subset(todo.keys(), allow_missing=False)
+        yield from deltas_from_sorted_objects(
+            sort_objects_for_delta(
+                (o, todo[o.id])
+                for o in objects_to_delta),
+            window_size=delta_window_size,
+            progress=progress)
+    else:
+        for oid in todo:
+            yield full_unpacked_object(container[oid])
+
+
+
 def full_unpacked_object(o: ShaFile) -> UnpackedObject:
     return UnpackedObject(
         o.type_num, delta_base=None, crc32=None,
@@ -1847,36 +1880,62 @@ def full_unpacked_object(o: ShaFile) -> UnpackedObject:
         sha=o.sha().digest())
 
 
-def write_pack_objects(
-        write, objects,
+def write_pack_from_container(
+        write,
+        container: PackedObjectContainer,
+        object_ids: Sequence[Tuple[bytes, Optional[bytes]]],
         delta_window_size: Optional[int] = None,
         deltify: Optional[bool] = None,
-        reuse_pack: Optional[PackedObjectContainer] = None,
+        reuse_deltas: bool = True,
         compression_level: int = -1
 ):
     """Write a new pack data file.
 
     Args:
       write: write function to use
-      objects: Iterable of (object, path) tuples to write. Should provide
-         __len__
+      container: PackedObjectContainer
+      object_ids: Sequence of (object_id, path) tuples to write
       delta_window_size: Sliding window size for searching for deltas;
                          Set to None for default window size.
       deltify: Whether to deltify objects
-      reuse_pack: Pack object we can search for objects to reuse
       compression_level: the zlib compression level to use
     Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
     """
-    if hasattr(write, 'write'):
-        warnings.warn(
-            'write_pack_objects() now takes a write rather than file argument',
-            DeprecationWarning, stacklevel=2)
-        write = write.write
+    pack_contents_count = len(object_ids)
+    pack_contents = generate_unpacked_objects(
+        container, object_ids, delta_window_size=delta_window_size,
+        deltify=deltify,
+        reuse_deltas=reuse_deltas)
+
+    return write_pack_data(
+        write,
+        pack_contents,
+        num_records=pack_contents_count,
+        compression_level=compression_level,
+    )
+
+
+
+def write_pack_objects(
+        write,
+        objects: Sequence[Tuple[ShaFile, Optional[bytes]]],
+        delta_window_size: Optional[int] = None,
+        deltify: Optional[bool] = None,
+        compression_level: int = -1
+):
+    """Write a new pack data file.
 
+    Args:
+      write: write function to use
+      objects: Sequence of (object, path) tuples to write
+      delta_window_size: Sliding window size for searching for deltas;
+                         Set to None for default window size.
+      deltify: Whether to deltify objects
+      compression_level: the zlib compression level to use
+    Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
+    """
     pack_contents_count, pack_contents = pack_objects_to_data(
-        objects, delta_window_size=delta_window_size,
-        deltify=deltify,
-        reuse_pack=reuse_pack)
+        objects, deltify=deltify)
 
     return write_pack_data(
         write,
@@ -1888,11 +1947,11 @@ def write_pack_objects(
 
 class PackChunkGenerator:
 
-    def __init__(self, num_records=None, records=None, progress=None, compression_level=-1):
+    def __init__(self, num_records=None, records=None, progress=None, compression_level=-1, reuse_compressed=True):
         self.cs = sha1(b"")
         self.entries = {}
         self._it = self._pack_data_chunks(
-            num_records=num_records, records=records, progress=progress, compression_level=compression_level)
+            num_records=num_records, records=records, progress=progress, compression_level=compression_level, reuse_compressed=reuse_compressed)
 
     def sha1digest(self):
         return self.cs.digest()
@@ -1900,12 +1959,12 @@ class PackChunkGenerator:
     def __iter__(self):
         return self._it
 
-    def _pack_data_chunks(self, records: Iterator[UnpackedObject], *, num_records=None, progress=None, compression_level: int = -1) -> Iterator[bytes]:
+    def _pack_data_chunks(self, records: Iterator[UnpackedObject], *, num_records=None, progress=None, compression_level: int = -1, reuse_compressed: bool = True) -> Iterator[bytes]:
         """Iterate pack data file chunks.
 
         Args:
-          num_records: Number of records (defaults to len(records) if None)
-          records: Iterator over type_num, object_id, delta_base, raw
+          records: Iterator over UnpackedObject
+          num_records: Number of records (defaults to len(records) if not specified)
           progress: Function to report progress to
           compression_level: the zlib compression level
         Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
@@ -1921,23 +1980,28 @@ class PackChunkGenerator:
         actual_num_records = 0
         for i, unpacked in enumerate(records):
             type_num = unpacked.pack_type_num
-            if progress is not None:
+            if progress is not None and i % 1000 == 0:
                 progress(("writing pack data: %d/%d\r" % (i, num_records)).encode("ascii"))
+            raw: Union[List[bytes], Tuple[int, List[bytes]], Tuple[bytes, List[bytes]]]
             if unpacked.delta_base is not None:
                 try:
                     base_offset, base_crc32 = self.entries[unpacked.delta_base]
                 except KeyError:
                     type_num = REF_DELTA
+                    assert isinstance(unpacked.delta_base, bytes)
                     raw = (unpacked.delta_base, unpacked.decomp_chunks)
                 else:
                     type_num = OFS_DELTA
                     raw = (offset - base_offset, unpacked.decomp_chunks)
             else:
                 raw = unpacked.decomp_chunks
+            if unpacked.comp_chunks is not None and reuse_compressed:
+                chunks = unpacked.comp_chunks
+            else:
+                chunks = pack_object_chunks(type_num, raw, compression_level=compression_level)
             crc32 = 0
             object_size = 0
-            # TODO(jelmer,perf): if unpacked.comp_chunks is populated, use that?
-            for chunk in pack_object_chunks(type_num, raw, compression_level=compression_level):
+            for chunk in chunks:
                 yield chunk
                 crc32 = binascii.crc32(chunk, crc32)
                 self.cs.update(chunk)
@@ -2189,20 +2253,6 @@ def write_pack_index_v2(
 write_pack_index = write_pack_index_v2
 
 
-class _PackTupleIterable:
-    """Helper for Pack.pack_tuples."""
-
-    def __init__(self, iterobjects, length):
-        self._iterobjects = iterobjects
-        self._length = length
-
-    def __len__(self):
-        return self._length
-
-    def __iter__(self):
-        return ((o, None) for o in self._iterobjects())
-
-
 class Pack:
     """A Git pack object."""
 
@@ -2318,6 +2368,9 @@ class Pack:
     def get_stored_checksum(self) -> bytes:
         return self.data.get_stored_checksum()
 
+    def pack_tuples(self):
+        return [(o, None) for o in self.iterobjects()]
+
     def __contains__(self, sha1: bytes) -> bool:
         """Check whether this pack contains a particular SHA1."""
         try:
@@ -2326,21 +2379,6 @@ class Pack:
         except KeyError:
             return False
 
-    def get_raw_unresolved(self, sha1: bytes) -> Tuple[int, Optional[bytes], List[bytes]]:
-        """Get raw unresolved data for a SHA.
-
-        Args:
-          sha1: SHA to return data for
-        Returns: Tuple with pack object type, delta base (if applicable),
-            list of data chunks
-        """
-        offset = self.index.object_offset(sha1)
-        (obj_type, delta_base, chunks) = self.data.get_compressed_data_at(offset)
-        if obj_type == OFS_DELTA:
-            delta_base = sha_to_hex(self.index.object_sha1(offset - delta_base))
-            obj_type = REF_DELTA
-        return (obj_type, delta_base, chunks)
-
     def get_raw(self, sha1: bytes) -> Tuple[int, bytes]:
         offset = self.index.object_offset(sha1)
         obj_type, obj = self.data.get_object_at(offset)
@@ -2367,11 +2405,42 @@ class Pack:
                 resolve_ext_ref=self.resolve_ext_ref)
             if uo.sha() in shas)
 
-    def pack_tuples(self):
-        return _PackTupleIterable(self.iterobjects, len(self))
-
-    def iter_unpacked(self):
-        return self.data.iter_unpacked()
+    def iter_unpacked_subset(self, shas, *, include_comp: bool = False, allow_missing: bool = False, convert_ofs_delta: bool = False) -> Iterator[UnpackedObject]:
+        ofs_pending: Dict[int, List[UnpackedObject]] = defaultdict(list)
+        ofs: Dict[int, bytes] = {}
+        todo = set(shas)
+        for unpacked in self.iter_unpacked(include_comp=include_comp):
+            sha = unpacked.sha()
+            ofs[unpacked.offset] = sha
+            hexsha = sha_to_hex(sha)
+            if hexsha in todo:
+                if unpacked.pack_type_num == OFS_DELTA:
+                    assert isinstance(unpacked.delta_base, int)
+                    base_offset = unpacked.offset - unpacked.delta_base
+                    try:
+                        unpacked.delta_base = ofs[base_offset]
+                    except KeyError:
+                        ofs_pending[base_offset].append(unpacked)
+                        continue
+                    else:
+                        unpacked.pack_type_num = REF_DELTA
+                yield unpacked
+                todo.remove(hexsha)
+            for child in ofs_pending.pop(unpacked.offset, []):
+                child.pack_type_num = REF_DELTA
+                child.delta_base = sha
+                yield child
+        assert not ofs_pending
+        if not allow_missing and todo:
+            raise KeyError(todo.pop())
+
+    def iter_unpacked(self, include_comp=False):
+        ofs_to_entries = {ofs: (sha, crc32) for (sha, ofs, crc32) in self.index.iterentries()}
+        for unpacked in self.data.iter_unpacked(include_comp=include_comp):
+            (sha, crc32) = ofs_to_entries[unpacked.offset]
+            unpacked._sha = sha
+            unpacked.crc32 = crc32
+            yield unpacked
 
     def keep(self, msg: Optional[bytes] = None) -> str:
         """Add a .keep file for the pack, preventing git from garbage collecting it.
@@ -2477,12 +2546,12 @@ class Pack:
         offset = self.index.object_offset(sha)
         unpacked = self.data.get_unpacked_object_at(offset, include_comp=include_comp)
         if unpacked.pack_type_num == OFS_DELTA and convert_ofs_delta:
+            assert isinstance(unpacked.delta_base, int)
             unpacked.delta_base = self.index.object_sha1(offset - unpacked.delta_base)
             unpacked.pack_type_num = REF_DELTA
         return unpacked
 
 
-
 try:
     from dulwich._pack import (  # type: ignore # noqa: F811
         apply_delta,

+ 5 - 19
dulwich/porcelain.py

@@ -134,7 +134,7 @@ from dulwich.objectspec import (
 )
 from dulwich.pack import (
     write_pack_index,
-    write_pack_objects,
+    write_pack_from_container,
 )
 from dulwich.patch import write_tree_diff
 from dulwich.protocol import (
@@ -1753,18 +1753,6 @@ def repack(repo):
         r.object_store.pack_loose_objects()
 
 
-def find_pack_for_reuse(repo):
-    reuse_pack = None
-    max_pack_len = 0
-    # The pack file which contains the largest number of objects
-    # will be most suitable for object reuse.
-    for p in repo.object_store.packs:
-        if len(p) > max_pack_len:
-            reuse_pack = p
-            max_pack_len = len(reuse_pack)
-    return reuse_pack
-
-
 def pack_objects(repo, object_ids, packf, idxf, delta_window_size=None, deltify=None, reuse_deltas=True):
     """Pack objects into a file.
 
@@ -1779,15 +1767,13 @@ def pack_objects(repo, object_ids, packf, idxf, delta_window_size=None, deltify=
       reuse_deltas: Allow reuse of existing deltas while deltifying
     """
     with open_repo_closing(repo) as r:
-        reuse_pack = None
-        if deltify and reuse_deltas:
-            reuse_pack = find_pack_for_reuse(r)
-        entries, data_sum = write_pack_objects(
+        entries, data_sum = write_pack_from_container(
             packf.write,
-            r.object_store.iter_shas((oid, None) for oid in object_ids),
+            r.object_store,
+            [(oid, None) for oid in object_ids],
             deltify=deltify,
             delta_window_size=delta_window_size,
-            reuse_pack=reuse_pack
+            reuse_deltas=reuse_deltas,
         )
     if idxf is not None:
         entries = sorted([(k, v[0], v[1]) for (k, v) in entries.items()])
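
With find_pack_for_reuse() gone, porcelain.pack_objects() now exposes a reuse_deltas flag (default True) and leaves delta reuse to write_pack_from_container(), which considers all local packs rather than only the largest one. A hedged usage sketch with hypothetical paths:

    from dulwich import porcelain
    from dulwich.repo import Repo

    repo = Repo("/path/to/repo")                 # hypothetical repository path
    object_ids = list(repo.object_store)         # every object id in the store

    with open("/tmp/out.pack", "wb") as packf, open("/tmp/out.idx", "wb") as idxf:
        # reuse_deltas=True (the default) copies existing local deltas into
        # the new pack instead of recomputing them.
        porcelain.pack_objects(repo, object_ids, packf, idxf, reuse_deltas=True)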

+ 19 - 18
dulwich/repo.py

@@ -71,7 +71,7 @@ from dulwich.object_store import (
     DiskObjectStore,
     MemoryObjectStore,
     MissingObjectFinder,
-    BaseObjectStore,
+    PackBasedObjectStore,
     ObjectStoreGraphWalker,
     peel_sha,
     MissingObjectFinder,
@@ -87,7 +87,7 @@ from dulwich.objects import (
     ObjectID,
 )
 from dulwich.pack import (
-    pack_objects_to_data,
+    generate_unpacked_objects
 )
 
 from dulwich.hooks import (
@@ -363,7 +363,7 @@ class BaseRepo:
         repository
     """
 
-    def __init__(self, object_store: BaseObjectStore, refs: RefsContainer):
+    def __init__(self, object_store: PackBasedObjectStore, refs: RefsContainer):
         """Open a repository.
 
         This shouldn't be called directly, but rather through one of the
@@ -486,10 +486,12 @@ class BaseRepo:
         Returns: count and iterator over pack data
         """
         # TODO(jelmer): Fetch pack data directly, don't create objects first.
-        objects = self.fetch_objects(
+        object_ids = self.fetch_objects(
             determine_wants, graph_walker, progress, get_tagged, depth=depth
         )
-        return pack_objects_to_data(objects)
+        # TODO(jelmer): Set other_haves=missing_objects.remote_has
+        return len(object_ids), generate_unpacked_objects(
+            self.object_store, object_ids, progress=progress)
 
     def fetch_objects(
         self,
@@ -567,17 +569,14 @@ class BaseRepo:
         def get_parents(commit):
             return parents_provider.get_parents(commit.id, commit)
 
-        return self.object_store.iter_shas(
-            MissingObjectFinder(
-                self.object_store,
-                haves=haves,
-                wants=wants,
-                shallow=self.get_shallow(),
-                progress=progress,
-                get_tagged=get_tagged,
-                get_parents=get_parents,
-            )
-        )
+        return list(MissingObjectFinder(
+            self.object_store,
+            haves,
+            wants,
+            self.get_shallow(),
+            progress,
+            get_tagged,
+            get_parents=get_parents))
 
     def generate_pack_data(self, have: List[ObjectID], want: List[ObjectID],
                            progress: Optional[Callable[[str], None]] = None,
@@ -1117,7 +1116,7 @@ class Repo(BaseRepo):
     def __init__(
         self,
         root: str,
-        object_store: Optional[BaseObjectStore] = None,
+        object_store: Optional[PackBasedObjectStore] = None,
         bare: Optional[bool] = None
     ) -> None:
         self.symlink_fn = None
@@ -1434,7 +1433,9 @@ class Repo(BaseRepo):
         for fs_path in fs_paths:
             tree_path = _fs_to_tree_path(fs_path)
             try:
-                tree_entry = self.object_store[tree_id].lookup_path(
+                tree = self.object_store[tree_id]
+                assert isinstance(tree, Tree)
+                tree_entry = tree.lookup_path(
                     self.object_store.__getitem__, tree_path)
             except KeyError:
                 # if tree_entry didn't exist, this file was being added, so
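
The fetch_objects()/generate_pack_contents() changes above split fetching into two steps: collect plain (sha, path) ids first, then turn them into pack entries lazily. A rough sketch, with determine_wants and the empty graph walker as illustrative stand-ins for a real client:

    from dulwich.pack import generate_unpacked_objects
    from dulwich.repo import Repo

    r = Repo(".")                           # assumes an existing repository

    def determine_wants(refs):
        return [refs[b"HEAD"]]              # a client that only wants HEAD

    walker = r.get_graph_walker(heads=[])   # a client that has nothing yet
    object_ids = r.fetch_objects(determine_wants, walker, progress=None)
    count = len(object_ids)                 # (sha, path) pairs, not loaded objects
    unpacked = generate_unpacked_objects(r.object_store, object_ids)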

+ 4 - 4
dulwich/server.py

@@ -71,7 +71,7 @@ from dulwich.object_store import (
     peel_sha,
 )
 from dulwich.pack import (
-    write_pack_objects,
+    write_pack_from_container,
     ObjectContainer,
 )
 from dulwich.protocol import (
@@ -379,7 +379,7 @@ class UploadPackHandler(PackHandler):
             wants.extend(graph_walker.determine_wants(refs, **kwargs))
             return wants
 
-        objects_iter = self.repo.fetch_objects(
+        object_ids = self.repo.fetch_objects(
             wants_wrapper,
             graph_walker,
             self.progress,
@@ -410,9 +410,9 @@ class UploadPackHandler(PackHandler):
             return
 
         self.progress(
-            ("counting objects: %d, done.\n" % len(objects_iter)).encode("ascii")
+            ("counting objects: %d, done.\n" % len(object_ids)).encode("ascii")
         )
-        write_pack_objects(write, objects_iter)
+        write_pack_from_container(write, self.repo.object_store, object_ids)
         # we are done
         self.proto.write_pkt_line(None)
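
The handler now hands the object store straight to the pack writer. A sketch of the underlying call with a trivial one-object list; the in-memory stream and object choice are illustrative:

    import io
    from dulwich.pack import write_pack_from_container
    from dulwich.repo import Repo

    r = Repo(".")                       # assumes an existing repository
    object_ids = [(r.head(), None)]     # (sha, path) pairs to include in the pack
    out = io.BytesIO()
    # Having the container available lets the writer reuse existing deltas,
    # which is where most of the savings in this change come from.
    write_pack_from_container(out.write, r.object_store, object_ids)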
 

+ 2 - 2
dulwich/tests/compat/test_client.py

@@ -329,7 +329,7 @@ class DulwichClientTestBase:
             sendrefs[b"refs/heads/abranch"] = b"00" * 20
             del sendrefs[b"HEAD"]
 
-            def gen_pack(have, want, ofs_delta=False):
+            def gen_pack(have, want, ofs_delta=False, progress=None):
                 return 0, []
 
             c = self._client()
@@ -344,7 +344,7 @@ class DulwichClientTestBase:
             dest.refs[b"refs/heads/abranch"] = dummy_commit
             sendrefs = {b"refs/heads/bbranch": dummy_commit}
 
-            def gen_pack(have, want, ofs_delta=False):
+            def gen_pack(have, want, ofs_delta=False, progress=None):
                 return 0, []
 
             c = self._client()

+ 4 - 4
dulwich/tests/compat/test_pack.py

@@ -115,8 +115,8 @@ class TestPack(PackTests):
                 (new_blob, None),
                 (new_blob_2, None),
             ]
-        pack_path = os.path.join(self._tempdir, "pack_with_deltas")
-        write_pack(pack_path, all_to_pack, deltify=True)
+            pack_path = os.path.join(self._tempdir, "pack_with_deltas")
+            write_pack(pack_path, all_to_pack, deltify=True)
         output = run_git_or_fail(["verify-pack", "-v", pack_path])
         self.assertEqual(
             {x[0].id for x in all_to_pack},
@@ -154,8 +154,8 @@ class TestPack(PackTests):
                 (new_blob, None),
                 (new_blob_2, None),
             ]
-        pack_path = os.path.join(self._tempdir, "pack_with_deltas")
-        write_pack(pack_path, all_to_pack, deltify=True)
+            pack_path = os.path.join(self._tempdir, "pack_with_deltas")
+            write_pack(pack_path, all_to_pack, deltify=True)
         output = run_git_or_fail(["verify-pack", "-v", pack_path])
         self.assertEqual(
             {x[0].id for x in all_to_pack},

+ 13 - 11
dulwich/tests/test_client.py

@@ -200,7 +200,7 @@ class GitClientTests(TestCase):
         self.assertEqual({}, ret.symrefs)
         self.assertEqual(self.rout.getvalue(), b"0000")
 
-    def test_send_pack_no_sideband64k_with_update_ref_error(self):
+    def test_send_pack_no_sideband64k_with_update_ref_error(self) -> None:
         # No side-bank-64k reported by server shouldn't try to parse
         # side band data
         pkts = [
@@ -233,11 +233,11 @@ class GitClientTests(TestCase):
                 b"refs/foo/bar": commit.id,
             }
 
-        def generate_pack_data(have, want, ofs_delta=False):
+        def generate_pack_data(have, want, ofs_delta=False, progress=None):
             return pack_objects_to_data(
                 [
                     (commit, None),
-                    (tree, ""),
+                    (tree, b""),
                 ]
             )
 
@@ -260,7 +260,7 @@ class GitClientTests(TestCase):
         def update_refs(refs):
             return {b"refs/heads/master": b"310ca9477129b8586fa2afc779c1f57cf64bba6c"}
 
-        def generate_pack_data(have, want, ofs_delta=False):
+        def generate_pack_data(have, want, ofs_delta=False, progress=None):
             return 0, []
 
         self.client.send_pack(b"/", update_refs, generate_pack_data)
@@ -280,7 +280,7 @@ class GitClientTests(TestCase):
         def update_refs(refs):
             return {b"refs/heads/master": b"0" * 40}
 
-        def generate_pack_data(have, want, ofs_delta=False):
+        def generate_pack_data(have, want, ofs_delta=False, progress=None):
             return 0, []
 
         self.client.send_pack(b"/", update_refs, generate_pack_data)
@@ -304,7 +304,7 @@ class GitClientTests(TestCase):
         def update_refs(refs):
             return {b"refs/heads/master": b"0" * 40}
 
-        def generate_pack_data(have, want, ofs_delta=False):
+        def generate_pack_data(have, want, ofs_delta=False, progress=None):
             return 0, []
 
         self.client.send_pack(b"/", update_refs, generate_pack_data)
@@ -331,11 +331,11 @@ class GitClientTests(TestCase):
                 b"refs/heads/master": b"310ca9477129b8586fa2afc779c1f57cf64bba6c",
             }
 
-        def generate_pack_data(have, want, ofs_delta=False):
+        def generate_pack_data(have, want, ofs_delta=False, progress=None):
             return 0, []
 
         f = BytesIO()
-        write_pack_objects(f.write, {})
+        write_pack_objects(f.write, [])
         self.client.send_pack("/", update_refs, generate_pack_data)
         self.assertEqual(
             self.rout.getvalue(),
@@ -371,7 +371,7 @@ class GitClientTests(TestCase):
                 b"refs/heads/master": b"310ca9477129b8586fa2afc779c1f57cf64bba6c",
             }
 
-        def generate_pack_data(have, want, ofs_delta=False):
+        def generate_pack_data(have, want, ofs_delta=False, progress=None):
             return pack_objects_to_data(
                 [
                     (commit, None),
@@ -408,7 +408,7 @@ class GitClientTests(TestCase):
         def update_refs(refs):
             return {b"refs/heads/master": b"0" * 40}
 
-        def generate_pack_data(have, want, ofs_delta=False):
+        def generate_pack_data(have, want, ofs_delta=False, progress=None):
             return 0, []
 
         result = self.client.send_pack(b"/", update_refs, generate_pack_data)
@@ -862,7 +862,9 @@ class LocalGitClientTests(TestCase):
 
     def test_fetch_into_empty(self):
         c = LocalGitClient()
-        t = MemoryRepo()
+        target = tempfile.mkdtemp()
+        self.addCleanup(shutil.rmtree, target)
+        t = Repo.init_bare(target)
         s = open_repo("a.git")
         self.addCleanup(tear_down_repo, s)
         self.assertEqual(s.get_refs(), c.fetch(s.path, t).refs)
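
As the updated tests show, generate_pack_data callbacks handed to send_pack() now accept a progress keyword. A minimal conforming stub (names are illustrative):

    def generate_pack_data(have, want, ofs_delta=False, progress=None):
        # Must return (object count, iterator of UnpackedObject); an empty
        # pack is expressed as (0, []).
        return 0, []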

+ 0 - 37
dulwich/tests/test_greenthreads.py

@@ -46,7 +46,6 @@ except ImportError:
 
 if gevent_support:
     from dulwich.greenthreads import (
-        GreenThreadsObjectStoreIterator,
         GreenThreadsMissingObjectFinder,
     )
 
@@ -77,42 +76,6 @@ def init_store(store, count=1):
     return ret
 
 
-@skipIf(not gevent_support, skipmsg)
-class TestGreenThreadsObjectStoreIterator(TestCase):
-    def setUp(self):
-        super().setUp()
-        self.store = MemoryObjectStore()
-        self.cmt_amount = 10
-        self.objs = init_store(self.store, self.cmt_amount)
-
-    def test_len(self):
-        wants = [sha.id for sha in self.objs if isinstance(sha, Commit)]
-        finder = MissingObjectFinder(self.store, (), wants)
-        iterator = GreenThreadsObjectStoreIterator(
-            self.store, iter(finder.next, None), finder
-        )
-        # One commit refers one tree and one blob
-        self.assertEqual(len(iterator), self.cmt_amount * 3)
-        haves = wants[0 : self.cmt_amount - 1]
-        finder = MissingObjectFinder(self.store, haves, wants)
-        iterator = GreenThreadsObjectStoreIterator(
-            self.store, iter(finder.next, None), finder
-        )
-        self.assertEqual(len(iterator), 3)
-
-    def test_iter(self):
-        wants = [sha.id for sha in self.objs if isinstance(sha, Commit)]
-        finder = MissingObjectFinder(self.store, (), wants)
-        iterator = GreenThreadsObjectStoreIterator(
-            self.store, iter(finder.next, None), finder
-        )
-        objs = []
-        for sha, path in iterator:
-            self.assertIn(sha, self.objs)
-            objs.append(sha)
-        self.assertEqual(len(objs), len(self.objs))
-
-
 @skipIf(not gevent_support, skipmsg)
 class TestGreenThreadsMissingObjectFinder(TestCase):
     def setUp(self):

+ 20 - 16
dulwich/tests/test_pack.py

@@ -429,7 +429,7 @@ class TestPack(PackTests):
         with self.get_pack(pack1_sha) as origpack:
             self.assertSucceeds(origpack.index.check)
             basename = os.path.join(self.tempdir, "Elch")
-            write_pack(basename, list(origpack.pack_tuples()))
+            write_pack(basename, origpack.pack_tuples())
 
             with Pack(basename) as newpack:
                 self.assertEqual(origpack, newpack)
@@ -453,7 +453,7 @@ class TestPack(PackTests):
 
     def _copy_pack(self, origpack):
         basename = os.path.join(self.tempdir, "somepack")
-        write_pack(basename, list(origpack.pack_tuples()))
+        write_pack(basename, origpack.pack_tuples())
         return Pack(basename)
 
     def test_keep_no_message(self):
@@ -578,24 +578,28 @@ class TestThinPack(PackTests):
         with self.make_pack(True) as p:
             self.assertEqual((3, b"foo1234"), p.get_raw(self.blobs[b"foo1234"].id))
 
-    def test_get_raw_unresolved(self):
+    def test_get_unpacked_object(self):
+        self.maxDiff = None
         with self.make_pack(False) as p:
-            self.assertEqual(
-                (
-                    7,
-                    b"\x19\x10(\x15f=#\xf8\xb7ZG\xe7\xa0\x19e\xdc\xdc\x96F\x8c",
-                    [b"x\x9ccf\x9f\xc0\xccbhdl\x02\x00\x06f\x01l"],
-                ),
-                p.get_raw_unresolved(self.blobs[b"foo1234"].id),
+            expected = UnpackedObject(
+                7,
+                b"\x19\x10(\x15f=#\xf8\xb7ZG\xe7\xa0\x19e\xdc\xdc\x96F\x8c",
+                decomp_chunks=[b'\x03\x07\x90\x03\x041234'],
             )
+            expected.offset = 12
+            got = p.get_unpacked_object(self.blobs[b"foo1234"].id)
+            self.assertEqual(expected, got)
         with self.make_pack(True) as p:
+            expected = UnpackedObject(
+                7,
+                b"\x19\x10(\x15f=#\xf8\xb7ZG\xe7\xa0\x19e\xdc\xdc\x96F\x8c",
+                decomp_chunks=[b'\x03\x07\x90\x03\x041234'],
+            )
+            expected.offset = 12
+            got = p.get_unpacked_object(self.blobs[b"foo1234"].id)
             self.assertEqual(
-                (
-                    7,
-                    b"\x19\x10(\x15f=#\xf8\xb7ZG\xe7\xa0\x19e\xdc\xdc\x96F\x8c",
-                    [b"x\x9ccf\x9f\xc0\xccbhdl\x02\x00\x06f\x01l"],
-                ),
-                p.get_raw_unresolved(self.blobs[b"foo1234"].id),
+                expected,
+                got,
             )
 
     def test_iterobjects(self):
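
get_raw_unresolved() is replaced by get_unpacked_object(), which returns an UnpackedObject carrying the pack type, delta base and decompressed chunks. A sketch against a hypothetical pack; the basename and object id are placeholders:

    from dulwich.pack import Pack, REF_DELTA

    blob_id = b"0" * 40                 # placeholder; use a real sha from the pack
    with Pack("pack-1234") as p:        # placeholder pack basename
        unpacked = p.get_unpacked_object(blob_id)
        if unpacked.pack_type_num == REF_DELTA:
            # Stored as a delta: delta_base is the base object's sha and
            # decomp_chunks holds the decompressed delta instructions.
            print(unpacked.delta_base, unpacked.decomp_chunks)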

+ 4 - 1
dulwich/tests/test_server.py

@@ -165,7 +165,10 @@ class HandlerTestCase(TestCase):
 class UploadPackHandlerTestCase(TestCase):
     def setUp(self):
         super().setUp()
-        self._repo = MemoryRepo.init_bare([], {})
+        self.path = tempfile.mkdtemp()
+        self.addCleanup(shutil.rmtree, self.path)
+        self.repo = Repo.init(self.path)
+        self._repo = Repo.init_bare(self.path)
         backend = DictBackend({b"/": self._repo})
         self._handler = UploadPackHandler(
             backend, [b"/", b"host=lolcathost"], TestProto()