Significant rewrite of pack handling.

We now reuse local deltas and send them, significantly reducing the
bandwidth required for most pushes and improving performance by at
least twofold.
Jelmer Vernooij, 2 years ago
Parent
Commit 16b026f6d1
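
The key change is that pack generation now works from an object container plus a list of object ids, so deltas already stored in local packs can be copied straight into the outgoing pack instead of being recomputed. A minimal sketch of the new entry point, assuming a local repository at an illustrative path and packing every object in the store:

from dulwich.repo import Repo
from dulwich.pack import write_pack_from_container

repo = Repo("/path/to/repo")                              # assumption: any local repository
object_ids = [(sha, None) for sha in repo.object_store]   # (hex sha, optional path hint)

with open("/tmp/out.pack", "wb") as f:
    # write_pack_from_container() resolves objects through the store and, with
    # reuse_deltas=True, copies stored REF_DELTAs whose base is also in the pack.
    entries, data_sum = write_pack_from_container(
        f.write, repo.object_store, object_ids, reuse_deltas=True)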

+ 6 - 3
dulwich/cli.py

@@ -37,7 +37,7 @@ import signal
 from typing import Dict, Type, Optional
 
 from dulwich import porcelain
-from dulwich.client import get_transport_and_path
+from dulwich.client import get_transport_and_path, GitProtocolError
 from dulwich.errors import ApplyDeltaError
 from dulwich.index import Index
 from dulwich.objectspec import parse_commit
@@ -263,8 +263,11 @@ class cmd_clone(Command):
         else:
             target = None
 
-        porcelain.clone(source, target, bare=options.bare, depth=options.depth,
-                        branch=options.branch)
+        try:
+            porcelain.clone(source, target, bare=options.bare, depth=options.depth,
+                            branch=options.branch)
+        except GitProtocolError as e:
+            print("%s" % e)
 
 
 class cmd_commit(Command):

+ 9 - 6
dulwich/client.py

@@ -51,6 +51,7 @@ from typing import (
     Callable,
     Dict,
     List,
+    Iterator,
     Optional,
     Set,
     Tuple,
@@ -117,7 +118,8 @@ from dulwich.protocol import (
     pkt_line,
 )
 from dulwich.pack import (
-    write_pack_objects,
+    write_pack_from_container,
+    UnpackedObject,
     PackChunkGenerator,
 )
 from dulwich.refs import (
@@ -699,7 +701,7 @@ class GitClient:
         """
         raise NotImplementedError(cls.from_parsedurl)
 
-    def send_pack(self, path, update_refs, generate_pack_data, progress=None):
+    def send_pack(self, path, update_refs, generate_pack_data: Callable[[Set[bytes], Set[bytes], bool], Tuple[int, Iterator[UnpackedObject]]], progress=None):
         """Upload a pack to a remote repository.
 
         Args:
@@ -1106,10 +1108,11 @@ class TraditionalGitClient(GitClient):
                 header_handler.have,
                 header_handler.want,
                 ofs_delta=(CAPABILITY_OFS_DELTA in negotiated_capabilities),
+                progress=progress,
             )
 
             if self._should_send_pack(new_refs):
-                for chunk in PackChunkGenerator(pack_data_count, pack_data):
+                for chunk in PackChunkGenerator(pack_data_count, pack_data, progress=progress):
                     proto.write(chunk)
 
             ref_status = self._handle_receive_pack_tail(
@@ -1533,7 +1536,7 @@ class LocalGitClient(GitClient):
 
         """
         with self._open_repo(path) as r:
-            objects_iter = r.fetch_objects(
+            object_ids = r.fetch_objects(
                 determine_wants, graph_walker, progress=progress, depth=depth
             )
             symrefs = r.refs.get_symrefs()
@@ -1541,9 +1544,9 @@ class LocalGitClient(GitClient):
 
             # Did the process short-circuit (e.g. in a stateless RPC call)?
             # Note that the client still expects a 0-object pack in most cases.
-            if objects_iter is None:
+            if object_ids is None:
                 return FetchPackResult(None, symrefs, agent)
-            write_pack_objects(pack_data, objects_iter, reuse_pack=r.object_store)
+            write_pack_from_container(pack_data, r.object_store, object_ids)
             return FetchPackResult(r.get_refs(), symrefs, agent)
 
     def get_refs(self, path):
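
With the typed signature above, send_pack() expects generate_pack_data to accept the remote's haves and wants (plus ofs_delta and, now, a progress keyword) and return a (count, iterator of UnpackedObject) pair. Repo.generate_pack_data has that shape, so a push might be sketched as follows; the remote URL, branch name and repository path are assumptions for illustration:

from dulwich.repo import Repo
from dulwich.client import get_transport_and_path

repo = Repo("/path/to/repo")                 # assumption: local repository to push from
client, path = get_transport_and_path("git@example.com:example/repo.git")

def update_refs(refs):
    # Point the remote branch at our local HEAD (illustrative; no safety checks).
    refs[b"refs/heads/main"] = repo.refs[b"HEAD"]
    return refs

# repo.generate_pack_data(have, want, ofs_delta=..., progress=...) matches the
# callback type declared on send_pack() above.
client.send_pack(path, update_refs, repo.generate_pack_data)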

+ 0 - 19
dulwich/contrib/swift.py

@@ -40,7 +40,6 @@ from geventhttpclient import HTTPClient
 
 from dulwich.greenthreads import (
     GreenThreadsMissingObjectFinder,
-    GreenThreadsObjectStoreIterator,
 )
 
 from dulwich.lru_cache import LRUSizeCache
@@ -119,15 +118,6 @@ cache_length = 20
 """
 
 
-class PackInfoObjectStoreIterator(GreenThreadsObjectStoreIterator):
-    def __len__(self):
-        while self.finder.objects_to_send:
-            for _ in range(0, len(self.finder.objects_to_send)):
-                sha = self.finder.next()
-                self._shas.append(sha)
-        return len(self._shas)
-
-
 class PackInfoMissingObjectFinder(GreenThreadsMissingObjectFinder):
     def next(self):
         while True:
@@ -681,15 +671,6 @@ class SwiftObjectStore(PackBasedObjectStore):
         """Loose objects are not supported by this repository"""
         return []
 
-    def iter_shas(self, finder):
-        """An iterator over pack's ObjectStore.
-
-        Returns: a `ObjectStoreIterator` or `GreenThreadsObjectStoreIterator`
-                 instance if gevent is enabled
-        """
-        shas = iter(finder.next, None)
-        return PackInfoObjectStoreIterator(self, shas, finder, self.scon.concurrency)
-
     def pack_info_get(self, sha):
         for pack in self.packs:
             if sha in pack:

+ 0 - 34
dulwich/greenthreads.py

@@ -33,7 +33,6 @@ from dulwich.object_store import (
     MissingObjectFinder,
     _collect_ancestors,
     _collect_filetree_revs,
-    ObjectStoreIterator,
 )
 
 
@@ -111,36 +110,3 @@ class GreenThreadsMissingObjectFinder(MissingObjectFinder):
         else:
             self.progress = progress
         self._tagged = get_tagged and get_tagged() or {}
-
-
-class GreenThreadsObjectStoreIterator(ObjectStoreIterator):
-    """ObjectIterator that works on top of an ObjectStore.
-
-    Same implementation as object_store.ObjectStoreIterator
-    except we use gevent to parallelize object retrieval.
-    """
-
-    def __init__(self, store, shas, finder, concurrency=1):
-        self.finder = finder
-        self.p = pool.Pool(size=concurrency)
-        super().__init__(store, shas)
-
-    def retrieve(self, args):
-        sha, path = args
-        return self.store[sha], path
-
-    def __iter__(self):
-        yield from self.p.imap_unordered(self.retrieve, self.itershas())
-
-    def __len__(self):
-        if len(self._shas) > 0:
-            return len(self._shas)
-        while self.finder.objects_to_send:
-            jobs = []
-            for _ in range(0, len(self.finder.objects_to_send)):
-                jobs.append(self.p.spawn(self.finder.next))
-            gevent.joinall(jobs)
-            for j in jobs:
-                if j.value is not None:
-                    self._shas.append(j.value)
-        return len(self._shas)

+ 189 - 180
dulwich/object_store.py

@@ -28,7 +28,7 @@ import stat
 import sys
 import warnings
 
-from typing import Callable, Dict, List, Optional, Tuple, Protocol, Union, Iterator, Set
+from typing import Callable, Dict, List, Optional, Tuple, Protocol, Union, Iterator, Set, Iterable, Sequence, cast
 
 from dulwich.errors import (
     NotTreeError,
@@ -40,6 +40,7 @@ from dulwich.objects import (
     ShaFile,
     Tag,
     Tree,
+    Blob,
     ZERO_SHA,
     hex_to_sha,
     sha_to_hex,
@@ -55,8 +56,11 @@ from dulwich.pack import (
     PackData,
     PackInflater,
     PackFileDisappeared,
+    UnpackedObject,
     load_pack_index_file,
     iter_sha1,
+    full_unpacked_object,
+    generate_unpacked_objects,
     pack_objects_to_data,
     write_pack_header,
     write_pack_index_v2,
@@ -65,6 +69,7 @@ from dulwich.pack import (
     compute_file_sha,
     PackIndexer,
     PackStreamCopier,
+    PackedObjectContainer,
 )
 from dulwich.protocol import DEPTH_INFINITE
 from dulwich.refs import ANNOTATED_TAG_SUFFIX, Ref
@@ -109,29 +114,16 @@ class BaseObjectStore:
             and not sha == ZERO_SHA
         ]
 
-    def iter_shas(self, shas):
-        """Iterate over the objects for the specified shas.
-
-        Args:
-          shas: Iterable object with SHAs
-        Returns: Object iterator
-        """
-        return ObjectStoreIterator(self, shas)
-
     def contains_loose(self, sha):
         """Check if a particular object is present by SHA1 and is loose."""
         raise NotImplementedError(self.contains_loose)
 
-    def contains_packed(self, sha):
-        """Check if a particular object is present by SHA1 and is packed."""
-        raise NotImplementedError(self.contains_packed)
-
-    def __contains__(self, sha):
+    def __contains__(self, sha1: bytes) -> bool:
         """Check if a particular object is present by SHA1.
 
         This method makes no distinction between loose and packed objects.
         """
-        return self.contains_packed(sha) or self.contains_loose(sha)
+        return self.contains_loose(sha1)
 
     @property
     def packs(self):
@@ -147,21 +139,15 @@ class BaseObjectStore:
         """
         raise NotImplementedError(self.get_raw)
 
-    def __getitem__(self, sha: ObjectID):
+    def __getitem__(self, sha1: ObjectID) -> ShaFile:
         """Obtain an object by SHA1."""
-        type_num, uncomp = self.get_raw(sha)
-        return ShaFile.from_raw_string(type_num, uncomp, sha=sha)
+        type_num, uncomp = self.get_raw(sha1)
+        return ShaFile.from_raw_string(type_num, uncomp, sha=sha1)
 
     def __iter__(self):
         """Iterate over the SHAs that are present in this store."""
         raise NotImplementedError(self.__iter__)
 
-    def add_pack(
-        self
-    ) -> Tuple[BytesIO, Callable[[], None], Callable[[], None]]:
-        """Add a new pack to this object store."""
-        raise NotImplementedError(self.add_pack)
-
     def add_object(self, obj):
         """Add a single object to this object store."""
         raise NotImplementedError(self.add_object)
@@ -174,31 +160,6 @@ class BaseObjectStore:
         """
         raise NotImplementedError(self.add_objects)
 
-    def add_pack_data(self, count, pack_data, progress=None):
-        """Add pack data to this object store.
-
-        Args:
-          count: Number of items to add
-          pack_data: Iterator over pack data tuples
-        """
-        if count == 0:
-            # Don't bother writing an empty pack file
-            return
-        f, commit, abort = self.add_pack()
-        try:
-            write_pack_data(
-                f.write,
-                pack_data,
-                num_records=count,
-                progress=progress,
-                compression_level=self.pack_compression_level,
-            )
-        except BaseException:
-            abort()
-            raise
-        else:
-            return commit()
-
     def tree_changes(
         self,
         source,
@@ -253,6 +214,14 @@ class BaseObjectStore:
             DeprecationWarning, stacklevel=2)
         return iter_tree_contents(self, tree_id, include_trees=include_trees)
 
+    def iterobjects_subset(self, shas: Iterable[bytes], *, allow_missing: bool = False) -> Iterator[ShaFile]:
+        for sha in shas:
+            try:
+                yield self[sha]
+            except KeyError:
+                if not allow_missing:
+                    raise
+
     def find_common_revisions(self, graphwalker):
         """Find which revisions this store has in common using graphwalker.
 
@@ -269,22 +238,10 @@ class BaseObjectStore:
             sha = next(graphwalker)
         return haves
 
-    def generate_pack_contents(self, have, want, shallow=None, progress=None):
-        """Iterate over the contents of a pack file.
-
-        Args:
-          have: List of SHA1s of objects that should not be sent
-          want: List of SHA1s of objects that should be sent
-          shallow: Set of shallow commit SHA1s to skip
-          progress: Optional progress reporting method
-        """
-        missing = MissingObjectFinder(
-            self, haves=have, wants=want, shallow=shallow, progress=progress)
-        return self.iter_shas(missing)
-
     def generate_pack_data(
-        self, have, want, shallow=None, progress=None, ofs_delta=True
-    ):
+        self, have, want, shallow=None, progress=None,
+        ofs_delta=True
+    ) -> Tuple[int, Iterator[UnpackedObject]]:
         """Generate pack data objects for a set of wants/haves.
 
         Args:
@@ -294,10 +251,12 @@ class BaseObjectStore:
           ofs_delta: Whether OFS deltas can be included
           progress: Optional progress reporting method
         """
-        # TODO(jelmer): More efficient implementation
+        # Note that the pack-specific implementation below is more efficient,
+        # as it reuses deltas
+        object_ids = list(MissingObjectFinder(self, have, want, shallow, progress))
         return pack_objects_to_data(
-            self.generate_pack_contents(have, want, shallow, progress)
-        )
+            [(self[oid], path) for oid, path in object_ids], ofs_delta=ofs_delta,
+            progress=progress)
 
     def peel_sha(self, sha):
         """Peel all tags from a SHA.
@@ -353,6 +312,37 @@ class PackBasedObjectStore(BaseObjectStore):
         self._pack_cache = {}
         self.pack_compression_level = pack_compression_level
 
+    def add_pack(
+        self
+    ) -> Tuple[BytesIO, Callable[[], None], Callable[[], None]]:
+        """Add a new pack to this object store."""
+        raise NotImplementedError(self.add_pack)
+
+    def add_pack_data(self, count: int, unpacked_objects: Iterator[UnpackedObject], progress=None) -> None:
+        """Add pack data to this object store.
+
+        Args:
+          count: Number of items to add
+          pack_data: Iterator over pack data tuples
+        """
+        if count == 0:
+            # Don't bother writing an empty pack file
+            return
+        f, commit, abort = self.add_pack()
+        try:
+            write_pack_data(
+                f.write,
+                unpacked_objects,
+                num_records=count,
+                progress=progress,
+                compression_level=self.pack_compression_level,
+            )
+        except BaseException:
+            abort()
+            raise
+        else:
+            return commit()
+
     @property
     def alternates(self):
         return []
@@ -390,6 +380,26 @@ class PackBasedObjectStore(BaseObjectStore):
             if prev_pack:
                 prev_pack.close()
 
+    def generate_pack_data(
+        self, have, want, shallow=None, progress=None,
+        ofs_delta=True
+    ) -> Tuple[int, Iterator[UnpackedObject]]:
+        """Generate pack data objects for a set of wants/haves.
+
+        Args:
+          have: List of SHA1s of objects that should not be sent
+          want: List of SHA1s of objects that should be sent
+          shallow: Set of shallow commit SHA1s to skip
+          ofs_delta: Whether OFS deltas can be included
+          progress: Optional progress reporting method
+        """
+        object_ids = list(MissingObjectFinder(self, have, want, shallow, progress))
+        return len(object_ids), generate_unpacked_objects(
+            cast(PackedObjectContainer, self),
+            object_ids,
+            progress=progress,
+            ofs_delta=ofs_delta)
+
     def _clear_cached_packs(self):
         pack_cache = self._pack_cache
         self._pack_cache = {}
@@ -529,47 +539,89 @@ class PackBasedObjectStore(BaseObjectStore):
                 pass
         raise KeyError(hexsha)
 
-    def get_raw_unresolved(self, name: bytes) -> Tuple[int, Union[bytes, None], List[bytes]]:
-        """Obtain the unresolved data for an object.
+    def iter_unpacked_subset(self, shas, *, include_comp=False, allow_missing: bool = False, convert_ofs_delta: bool = True) -> Iterator[ShaFile]:
+        todo: Set[bytes] = set(shas)
+        for p in self._iter_cached_packs():
+            for unpacked in p.iter_unpacked_subset(todo, include_comp=include_comp, allow_missing=True, convert_ofs_delta=convert_ofs_delta):
+                yield unpacked
+                hexsha = sha_to_hex(unpacked.sha())
+                todo.remove(hexsha)
+        # Maybe something else has added a pack with the object
+        # in the mean time?
+        for p in self._update_pack_cache():
+            for unpacked in p.iter_unpacked_subset(todo, include_comp=include_comp, allow_missing=True, convert_ofs_delta=convert_ofs_delta):
+                yield unpacked
+                hexsha = sha_to_hex(unpacked.sha())
+                todo.remove(hexsha)
+        for alternate in self.alternates:
+            for unpacked in alternate.iter_unpacked_subset(todo, include_comp=include_comp, allow_missing=True, convert_ofs_delta=convert_ofs_delta):
+                yield unpacked
+                hexsha = sha_to_hex(unpacked.sha())
+                todo.remove(hexsha)
+
+    def iterobjects_subset(self, shas: Iterable[bytes], *, allow_missing: bool = False) -> Iterator[ShaFile]:
+        todo: Set[bytes] = set(shas)
+        for p in self._iter_cached_packs():
+            for o in p.iterobjects_subset(todo, allow_missing=True):
+                yield o
+                todo.remove(o.id)
+        # Maybe something else has added a pack with the object
+        # in the mean time?
+        for p in self._update_pack_cache():
+            for o in p.iterobjects_subset(todo, allow_missing=True):
+                yield o
+                todo.remove(o.id)
+        for alternate in self.alternates:
+            for o in alternate.iterobjects_subset(todo, allow_missing=True):
+                yield o
+                todo.remove(o.id)
+        for oid in todo:
+            o = self._get_loose_object(oid)
+            if o is not None:
+                yield o
+            elif not allow_missing:
+                raise KeyError(oid)
+
+    def get_unpacked_object(self, sha1: bytes, *, include_comp: bool = False) -> UnpackedObject:
+        """Obtain the unpacked object.
 
         Args:
-          name: sha for the object.
+          sha1: sha for the object.
         """
-        if name == ZERO_SHA:
-            raise KeyError(name)
-        if len(name) == 40:
-            sha = hex_to_sha(name)
-            hexsha = name
-        elif len(name) == 20:
-            sha = name
+        if sha1 == ZERO_SHA:
+            raise KeyError(sha1)
+        if len(sha1) == 40:
+            sha = hex_to_sha(sha1)
+            hexsha = sha1
+        elif len(sha1) == 20:
+            sha = sha1
             hexsha = None
         else:
-            raise AssertionError("Invalid object name {!r}".format(name))
+            raise AssertionError("Invalid object sha1 {!r}".format(sha1))
         for pack in self._iter_cached_packs():
             try:
-                return pack.get_raw_unresolved(sha)
+                return pack.get_unpacked_object(sha, include_comp=include_comp)
             except (KeyError, PackFileDisappeared):
                 pass
         if hexsha is None:
-            hexsha = sha_to_hex(name)
-        ret = self._get_loose_object(hexsha)
-        if ret is not None:
-            return ret.type_num, None, ret.as_raw_chunks()
+            hexsha = sha_to_hex(sha1)
         # Maybe something else has added a pack with the object
         # in the mean time?
         for pack in self._update_pack_cache():
             try:
-                return pack.get_raw_unresolved(sha)
+                return pack.get_unpacked_object(sha, include_comp=include_comp)
             except KeyError:
                 pass
         for alternate in self.alternates:
             try:
-                return alternate.get_raw_unresolved(hexsha)
+                return alternate.get_unpacked_object(hexsha, include_comp=include_comp)
             except KeyError:
                 pass
         raise KeyError(hexsha)
 
-    def add_objects(self, objects, progress=None):
+    def add_objects(
+            self, objects: Sequence[Tuple[ShaFile, Optional[str]]],
+            progress: Optional[Callable[[str], None]] = None) -> None:
         """Add a set of objects to this object store.
 
         Args:
@@ -577,7 +629,9 @@ class PackBasedObjectStore(BaseObjectStore):
             __len__.
         Returns: Pack object of the objects written.
         """
-        return self.add_pack_data(*pack_objects_to_data(objects), progress=progress)
+        count = len(objects)
+        record_iter = (full_unpacked_object(o) for (o, p) in objects)
+        return self.add_pack_data(count, record_iter, progress=progress)
 
 
 class DiskObjectStore(PackBasedObjectStore):
@@ -1067,79 +1121,6 @@ class ObjectIterator(Protocol):
         raise NotImplementedError(self.iterobjects)
 
 
-class ObjectStoreIterator(ObjectIterator):
-    """ObjectIterator that works on top of an ObjectStore."""
-
-    def __init__(self, store, sha_iter):
-        """Create a new ObjectIterator.
-
-        Args:
-          store: Object store to retrieve from
-          sha_iter: Iterator over (sha, path) tuples
-        """
-        self.store = store
-        self.sha_iter = sha_iter
-        self._shas = []
-
-    def __iter__(self):
-        """Yield tuple with next object and path."""
-        for sha, path in self.itershas():
-            yield self.store[sha], path
-
-    def iterobjects(self):
-        """Iterate over just the objects."""
-        for o, path in self:
-            yield o
-
-    def itershas(self):
-        """Iterate over the SHAs."""
-        for sha in self._shas:
-            yield sha
-        for sha in self.sha_iter:
-            self._shas.append(sha)
-            yield sha
-
-    def __contains__(self, needle):
-        """Check if an object is present.
-
-        Note: This checks if the object is present in
-            the underlying object store, not if it would
-            be yielded by the iterator.
-
-        Args:
-          needle: SHA1 of the object to check for
-        """
-        if needle == ZERO_SHA:
-            return False
-        return needle in self.store
-
-    def __getitem__(self, key):
-        """Find an object by SHA1.
-
-        Note: This retrieves the object from the underlying
-            object store. It will also succeed if the object would
-            not be returned by the iterator.
-        """
-        return self.store[key]
-
-    def __len__(self):
-        """Return the number of objects."""
-        return len(list(self.itershas()))
-
-    def _empty(self):
-        it = self.itershas()
-        try:
-            next(it)
-        except StopIteration:
-            return True
-        else:
-            return False
-
-    def __bool__(self):
-        """Indicate whether this object has contents."""
-        return not self._empty()
-
-
 def tree_lookup_path(lookup_obj, root_sha, path):
     """Look up an object in a Git tree.
 
@@ -1270,27 +1251,33 @@ class MissingObjectFinder:
             shallow=shallow,
             get_parents=self._get_parents,
         )
-        self.sha_done = set()
+        self.remote_has: Set[bytes] = set()
         # Now, fill sha_done with commits and revisions of
         # files and directories known to be both locally
         # and on target. Thus these commits and files
         # won't get selected for fetch
         for h in common_commits:
-            self.sha_done.add(h)
+            self.remote_has.add(h)
             cmt = object_store[h]
-            _collect_filetree_revs(object_store, cmt.tree, self.sha_done)
+            _collect_filetree_revs(object_store, cmt.tree, self.remote_has)
         # record tags we have as visited, too
         for t in have_tags:
-            self.sha_done.add(t)
+            self.remote_has.add(t)
+        self.sha_done = set(self.remote_has)
 
-        missing_tags = want_tags.difference(have_tags)
-        missing_others = want_others.difference(have_others)
         # in fact, what we 'want' is commits, tags, and others
         # we've found missing
-        wants = missing_commits.union(missing_tags)
-        wants = wants.union(missing_others)
-
-        self.objects_to_send = {(w, None, False) for w in wants}
+        self.objects_to_send = {
+            (w, None, Commit.type_num, False)
+            for w in missing_commits}
+        missing_tags = want_tags.difference(have_tags)
+        self.objects_to_send.update(
+            {(w, None, Tag.type_num, False)
+            for w in missing_tags})
+        missing_others = want_others.difference(have_others)
+        self.objects_to_send.update(
+            {(w, None, None, False)
+            for w in missing_others})
 
         if progress is None:
             self.progress = lambda x: None
@@ -1301,35 +1288,39 @@ class MissingObjectFinder:
     def add_todo(self, entries):
         self.objects_to_send.update([e for e in entries if not e[0] in self.sha_done])
 
-    def __next__(self):
+    def __next__(self) -> Tuple[bytes, Optional[bytes]]:
         while True:
             if not self.objects_to_send:
-                return None
-            (sha, name, leaf) = self.objects_to_send.pop()
+                self.progress(("counting objects: %d, done.\n" % len(self.sha_done)).encode("ascii"))
+                raise StopIteration
+            (sha, name, type_num, leaf) = self.objects_to_send.pop()
             if sha not in self.sha_done:
                 break
         if not leaf:
             o = self.object_store[sha]
             if isinstance(o, Commit):
-                self.add_todo([(o.tree, b"", False)])
+                self.add_todo([(o.tree, b"", Tree.type_num, False)])
             elif isinstance(o, Tree):
                 self.add_todo(
                     [
-                        (s, n, not stat.S_ISDIR(m))
+                        (s, n, (Blob.type_num if stat.S_ISREG(m) else Tree.type_num),
+                         not stat.S_ISDIR(m))
                         for n, m, s in o.iteritems()
                         if not S_ISGITLINK(m)
                     ]
                 )
             elif isinstance(o, Tag):
-                self.add_todo([(o.object[1], None, False)])
+                self.add_todo([(o.object[1], None, o.object[0].type_num, False)])
         if sha in self._tagged:
-            self.add_todo([(self._tagged[sha], None, True)])
+            self.add_todo([(self._tagged[sha], None, None, True)])
         self.sha_done.add(sha)
-        self.progress(("counting objects: %d\r" % len(self.sha_done)).encode("ascii"))
+        if len(self.sha_done) % 1000 == 0:
+            self.progress(("counting objects: %d\r" % len(self.sha_done)).encode("ascii"))
+        # TODO (type_num, name, -length, sha)
         return (sha, name)
 
     def __iter__(self):
-        return iter(self.__next__, None)
+        return self
 
 
 class ObjectStoreGraphWalker:
@@ -1476,6 +1467,24 @@ class OverlayObjectStore(BaseObjectStore):
                     yield o_id
                     done.add(o_id)
 
+    def iterobjects_subset(self, shas: Iterable[bytes], *, allow_missing: bool = False) -> Iterator[ShaFile]:
+        todo = set(shas)
+        for b in self.bases:
+            for o in b.iterobjects_subset(todo, allow_missing=True):
+                yield o
+                todo.remove(o.id)
+        if todo and not allow_missing:
+            raise KeyError(o.id)
+
+    def iter_unpacked_subset(self, shas: Iterable[bytes], *, include_comp=False, allow_missing: bool = False, convert_ofs_delta=True) -> Iterator[ShaFile]:
+        todo = set(shas)
+        for b in self.bases:
+            for o in b.iter_unpacked_subset(todo, include_comp=include_comp, allow_missing=True, convert_ofs_delta=convert_ofs_delta):
+                yield o
+                todo.remove(o.id)
+        if todo and not allow_missing:
+            raise KeyError(o.id)
+
     def get_raw(self, sha_id):
         for b in self.bases:
             try:
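
The two subset iterators added above take over from the removed ObjectStoreIterator: iterobjects_subset() yields parsed ShaFile objects and falls back to loose objects, while iter_unpacked_subset() yields raw UnpackedObject records straight from the packs, which is what delta reuse needs. A small sketch, where the repository path is illustrative and `wanted` is just a handful of SHAs taken from the store:

from dulwich.repo import Repo

repo = Repo("/path/to/repo")                 # assumption: any local repository
wanted = set(list(repo.object_store)[:10])   # a few hex SHAs, for illustration

# Parsed objects; raises KeyError for anything missing since allow_missing=False.
for obj in repo.object_store.iterobjects_subset(wanted, allow_missing=False):
    print(obj.id, obj.type_name)

# Raw pack records; per the code above this path only consults packs and alternates.
for unpacked in repo.object_store.iter_unpacked_subset(wanted, allow_missing=True):
    print(unpacked.sha(), unpacked.pack_type_num)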

+ 1 - 4
dulwich/objects.py

@@ -528,10 +528,7 @@ class ShaFile:
 
     def raw_length(self) -> int:
         """Returns the length of the raw string of this object."""
-        ret = 0
-        for chunk in self.as_raw_chunks():
-            ret += len(chunk)
-        return ret
+        return sum(map(len, self.as_raw_chunks()))
 
     def sha(self):
         """The SHA1 object that is the name of this object."""

+ 211 - 142
dulwich/pack.py

@@ -50,7 +50,7 @@ from itertools import chain
 
 import os
 import sys
-from typing import Optional, Callable, Tuple, List, Deque, Union, Protocol, Iterable, Iterator, Dict, TypeVar, Generic
+from typing import Optional, Callable, Tuple, List, Deque, Union, Protocol, Iterable, Iterator, Dict, TypeVar, Generic, Sequence, Set
 import warnings
 
 from hashlib import sha1
@@ -108,7 +108,7 @@ class ObjectContainer(Protocol):
         """Add a single object to this object store."""
 
     def add_objects(
-            self, objects: Iterable[Tuple[ShaFile, Optional[str]]],
+            self, objects: Sequence[Tuple[ShaFile, Optional[str]]],
             progress: Optional[Callable[[str], None]] = None) -> None:
         """Add a set of objects to this object store.
 
@@ -129,6 +129,14 @@ class PackedObjectContainer(ObjectContainer):
         """Get a raw unresolved object."""
         raise NotImplementedError(self.get_unpacked_object)
 
+    def iterobjects_subset(self, shas: Iterable[bytes], *, allow_missing: bool = False) -> Iterator[ShaFile]:
+        raise NotImplementedError(self.iterobjects_subset)
+
+    def iter_unpacked_subset(
+            self, shas: Set[bytes], include_comp: bool = False, allow_missing: bool = False,
+            convert_ofs_delta: bool = True) -> Iterator["UnpackedObject"]:
+        raise NotImplementedError(self.iter_unpacked_subset)
+
 
 def take_msb_bytes(read: Callable[[int], bytes], crc32: Optional[int] = None) -> Tuple[List[int], Optional[int]]:
     """Read bytes marked with most significant bit.
@@ -176,6 +184,8 @@ class UnpackedObject:
 
     obj_type_num: Optional[int]
     obj_chunks: Optional[List[bytes]]
+    delta_base: Union[None, bytes, int]
+    decomp_chunks: List[bytes]
 
     # TODO(dborowitz): read_zlib_chunks and unpack_object could very well be
     # methods of this object.
@@ -456,7 +466,6 @@ class PackIndex:
 
     def object_sha1(self, index: int) -> bytes:
         """Return the SHA1 corresponding to the index in the pack file."""
-        # PERFORMANCE/TODO(jelmer): Avoid scanning entire index
         for (name, offset, crc32) in self.iterentries():
             if offset == index:
                 return name
@@ -531,6 +540,10 @@ class MemoryPackIndex(PackIndex):
     def for_pack(cls, pack):
         return MemoryPackIndex(pack.sorted_entries(), pack.calculate_checksum())
 
+    @classmethod
+    def clone(cls, other_index):
+        return cls(other_index.iterentries(), other_index.get_pack_checksum())
+
 
 class FilePackIndex(PackIndex):
     """Pack index that is based on a file.
@@ -1326,34 +1339,6 @@ class PackData:
         unpacked.offset = offset
         return unpacked
 
-    def get_compressed_data_at(self, offset):
-        """Given offset in the packfile return compressed data that is there.
-
-        Using the associated index the location of an object can be looked up,
-        and then the packfile can be asked directly for that object using this
-        function.
-        """
-        unpacked = self.get_unpacked_object_at(offset, include_comp=True)
-        return (
-            unpacked.pack_type_num,
-            unpacked.delta_base,
-            unpacked.comp_chunks,
-        )
-
-    def get_decompressed_data_at(self, offset: int) -> Tuple[int, Optional[bytes], List[bytes]]:
-        """Given an offset in the packfile, decompress the data that is there.
-
-        Using the associated index the location of an object can be looked up,
-        and then the packfile can be asked directly for that object using this
-        function.
-        """
-        unpacked = self.get_unpacked_object_at(offset, include_comp=False)
-        return (
-            unpacked.pack_type_num,
-            unpacked.delta_base,
-            unpacked.decomp_chunks,
-        )
-
     def get_object_at(self, offset: int) -> Tuple[int, OldUnpackedObject]:
         """Given an offset in to the packfile return the object that is there.
 
@@ -1408,7 +1393,7 @@ class DeltaChainIterator(Generic[T]):
     def for_pack_data(cls, pack_data: PackData, resolve_ext_ref=None):
         walker = cls(None, resolve_ext_ref=resolve_ext_ref)
         walker.set_pack_data(pack_data)
-        for unpacked in pack_data.iter_unpacked():
+        for unpacked in pack_data.iter_unpacked(include_comp=False):
             walker.record(unpacked)
         return walker
 
@@ -1438,6 +1423,7 @@ class DeltaChainIterator(Generic[T]):
                 base_ofs = unpacked.offset - unpacked.delta_base
             elif unpacked.pack_type_num == REF_DELTA:
                 with suppress(KeyError):
+                    assert isinstance(unpacked.delta_base, bytes)
                     base_ofs = pack.index.object_index(unpacked.delta_base)
             if base_ofs is not None and base_ofs not in done:
                 todo.add(base_ofs)
@@ -1450,6 +1436,7 @@ class DeltaChainIterator(Generic[T]):
             base_offset = offset - unpacked.delta_base
             self._pending_ofs[base_offset].append(offset)
         elif type_num == REF_DELTA:
+            assert isinstance(unpacked.delta_base, bytes)
             self._pending_ref[unpacked.delta_base].append(offset)
         else:
             self._full_ofs.append((offset, type_num))
@@ -1675,11 +1662,6 @@ def write_pack_object(write, type, object, sha=None, compression_level=-1):
       compression_level: the zlib compression level
     Returns: Tuple with offset at which the object was written, and crc32
     """
-    if hasattr(write, 'write'):
-        warnings.warn(
-            'write_pack_object() now takes a write rather than file argument',
-            DeprecationWarning, stacklevel=2)
-        write = write.write
     crc32 = 0
     for chunk in pack_object_chunks(
             type, object, compression_level=compression_level):
@@ -1692,17 +1674,17 @@ def write_pack_object(write, type, object, sha=None, compression_level=-1):
 
 def write_pack(
         filename,
-        objects,
+        objects: Sequence[Tuple[ShaFile, Optional[bytes]]],
         *,
         deltify: Optional[bool] = None,
         delta_window_size: Optional[int] = None,
-        compression_level: int = -1,
-        reuse_pack: Optional[PackedObjectContainer] = None):
+        compression_level: int = -1):
     """Write a new pack data file.
 
     Args:
       filename: Path to the new pack file (without .pack extension)
-      objects: (object, path) tuple iterable to write. Should provide __len__
+      container: PackedObjectContainer
+      entries: Sequence of (object_id, path) tuples to write
       delta_window_size: Delta window size
       deltify: Whether to deltify pack objects
       compression_level: the zlib compression level
@@ -1715,7 +1697,6 @@ def write_pack(
             delta_window_size=delta_window_size,
             deltify=deltify,
             compression_level=compression_level,
-            reuse_pack=reuse_pack,
         )
     entries = sorted([(k, v[0], v[1]) for (k, v) in entries.items()])
     with GitFile(filename + ".idx", "wb") as f:
@@ -1740,60 +1721,72 @@ def write_pack_header(write, num_objects):
         write(chunk)
 
 
+def find_reusable_deltas(
+        container: PackedObjectContainer,
+        object_ids: Set[bytes],
+        *, other_haves: Optional[Set[bytes]] = None, progress=None) -> Iterator[UnpackedObject]:
+    if other_haves is None:
+        other_haves = set()
+    reused = 0
+    for i, unpacked in enumerate(container.iter_unpacked_subset(object_ids, allow_missing=True, convert_ofs_delta=True)):
+        if progress is not None and i % 1000 == 0:
+            progress(("checking for reusable deltas: %d/%d\r" % (i, len(object_ids))).encode('utf-8'))
+        if unpacked.pack_type_num == REF_DELTA:
+            hexsha = sha_to_hex(unpacked.delta_base)
+            if hexsha in object_ids or hexsha in other_haves:
+                yield unpacked
+                reused += 1
+    if progress is not None:
+        progress(("found %d deltas to reuse\n" % (reused, )).encode('utf-8'))
+
+
 def deltify_pack_objects(
-        objects: Iterable[Tuple[ShaFile, str]],
-        window_size: Optional[int] = None,
-        reuse_pack: Optional[PackedObjectContainer] = None) -> Iterator[UnpackedObject]:
+        objects,
+        *, window_size: Optional[int] = None,
+        progress=None) -> Iterator[UnpackedObject]:
     """Generate deltas for pack objects.
 
     Args:
       objects: An iterable of (object, path) tuples to deltify.
       window_size: Window size; None for default
-      reuse_pack: Pack object we can search for objects to reuse
     Returns: Iterator over type_num, object id, delta_base, content
         delta_base is None for full text entries
     """
-    # TODO(jelmer): Use threads
-    if window_size is None:
-        window_size = DEFAULT_PACK_DELTA_WINDOW_SIZE
+    yield from deltas_from_sorted_objects(
+        sort_objects_for_delta(objects),
+        window_size=window_size,
+        progress=progress)
 
-    reused_deltas = set()
-    if reuse_pack:
-        # Build a set of SHA1 IDs which will be part of this pack file.
-        # We can only reuse a delta if its base will be present in the
-        # generated pack file.
-        objects_to_pack = set()
-        for obj, path in objects:
-            objects_to_pack.add(sha_to_hex(obj.sha().digest()))
-        for o, _ in objects:
-            sha_digest = o.sha().digest()
-            # get_raw_unresolved() translates OFS_DELTA into REF_DELTA for us
-            try:
-                unpacked = reuse_pack.get_unpacked_object(sha_digest, convert_ofs_delta=True)
-            except KeyError:
-                continue
-            if unpacked.pack_type_num == REF_DELTA and unpacked.delta_base in objects_to_pack:
-                yield unpacked
-                reused_deltas.add(sha_digest)
 
-    # Build a list of objects ordered by the magic Linus heuristic
-    # This helps us find good objects to diff against us
+def sort_objects_for_delta(objects: Union[Iterator[ShaFile], Iterator[Tuple[ShaFile, Optional[bytes]]]]) -> Iterator[ShaFile]:
     magic = []
-    for obj, path in objects:
-        if obj.sha().digest() in reused_deltas:
-            continue
+    for entry in objects:
+        if isinstance(entry, tuple):
+            obj, path = entry
+        else:
+            obj = entry
         magic.append((obj.type_num, path, -obj.raw_length(), obj))
+    # Build a list of objects ordered by the magic Linus heuristic
+    # This helps us find good objects to diff against us
     magic.sort()
+    return (x[3] for x in magic)
 
-    possible_bases: Deque[Tuple[bytes, int, List[bytes]]] = deque()
 
-    for type_num, path, neg_length, o in magic:
+def deltas_from_sorted_objects(objects, window_size: Optional[int] = None, progress=None):
+    # TODO(jelmer): Use threads
+    if window_size is None:
+        window_size = DEFAULT_PACK_DELTA_WINDOW_SIZE
+
+    possible_bases: Deque[Tuple[bytes, int, List[bytes]]] = deque()
+    for i, o in enumerate(objects):
+        if progress is not None and i % 1000 == 0:
+            progress(("generating deltas: %d\r" % (i, )).encode('utf-8'))
         raw = o.as_raw_chunks()
         winner = raw
         winner_len = sum(map(len, winner))
         winner_base = None
         for base_id, base_type_num, base in possible_bases:
-            if base_type_num != type_num:
+            if base_type_num != o.type_num:
                 continue
             delta_len = 0
             delta = []
@@ -1806,40 +1799,80 @@ def deltify_pack_objects(
                 winner_base = base_id
                 winner = delta
                 winner_len = sum(map(len, winner))
-        unpacked = UnpackedObject(type_num, sha=o.sha().digest(), delta_base=winner_base, decomp_len=winner_len, decomp_chunks=winner)
-        yield unpacked
-        possible_bases.appendleft((o.sha().digest(), type_num, raw))
+        yield UnpackedObject(o.type_num, sha=o.sha().digest(), delta_base=winner_base, decomp_len=winner_len, decomp_chunks=winner)
+        possible_bases.appendleft((o.sha().digest(), o.type_num, raw))
         while len(possible_bases) > window_size:
             possible_bases.pop()
 
 
 def pack_objects_to_data(
-        objects,
-        *, delta_window_size: Optional[int] = None,
+        objects: Sequence[Tuple[ShaFile, Optional[bytes]]],
+        *,
         deltify: Optional[bool] = None,
-        reuse_pack: Optional[PackedObjectContainer] = None) -> Tuple[int, Iterator[UnpackedObject]]:
+        delta_window_size: Optional[int] = None,
+        ofs_delta: bool = True,
+        progress=None) -> Tuple[int, Iterator[UnpackedObject]]:
     """Create pack data from objects
 
     Args:
       objects: Pack objects
     Returns: Tuples with (type_num, hexdigest, delta base, object chunks)
     """
+    # TODO(jelmer): support deltaifying
     count = len(objects)
     if deltify is None:
         # PERFORMANCE/TODO(jelmer): This should be enabled but is *much* too
         # slow at the moment.
         deltify = False
     if deltify:
-        pack_contents = deltify_pack_objects(
-            objects, window_size=delta_window_size, reuse_pack=reuse_pack)
-        return (count, pack_contents)
+        return (
+            count,
+            deltify_pack_objects(objects, window_size=delta_window_size, progress=progress))
     else:
         return (
             count,
-            (full_unpacked_object(o) for (o, path) in objects)
+            (full_unpacked_object(o) for o, path in objects)
         )
 
 
+def generate_unpacked_objects(
+        container: PackedObjectContainer,
+        object_ids: Sequence[Tuple[bytes, Optional[bytes]]],
+        delta_window_size: Optional[int] = None,
+        deltify: Optional[bool] = None,
+        reuse_deltas: bool = True,
+        ofs_delta: bool = True,
+        other_haves: Optional[Set[bytes]] = None,
+        progress=None) -> Iterator[UnpackedObject]:
+    """Create pack data from objects
+
+    Args:
+      objects: Pack objects
+    Returns: Tuples with (type_num, hexdigest, delta base, object chunks)
+    """
+    todo = dict(object_ids)
+    if reuse_deltas:
+        for unpack in find_reusable_deltas(container, set(todo), other_haves=other_haves, progress=progress):
+            del todo[sha_to_hex(unpack.sha())]
+            yield unpack
+    if deltify is None:
+        # PERFORMANCE/TODO(jelmer): This should be enabled but is *much* too
+        # slow at the moment.
+        deltify = False
+    if deltify:
+        objects_to_delta = container.iterobjects_subset(todo.keys(), allow_missing=False)
+        yield from deltas_from_sorted_objects(
+            sort_objects_for_delta(
+                (o, todo[o.id])
+                for o in objects_to_delta),
+            window_size=delta_window_size,
+            progress=progress)
+    else:
+        for oid in todo:
+            yield full_unpacked_object(container[oid])
+
+
+
 def full_unpacked_object(o: ShaFile) -> UnpackedObject:
     return UnpackedObject(
         o.type_num, delta_base=None, crc32=None,
@@ -1847,36 +1880,62 @@ def full_unpacked_object(o: ShaFile) -> UnpackedObject:
         sha=o.sha().digest())
 
 
-def write_pack_objects(
-        write, objects,
+def write_pack_from_container(
+        write,
+        container: PackedObjectContainer,
+        object_ids: Sequence[Tuple[bytes, Optional[bytes]]],
         delta_window_size: Optional[int] = None,
         deltify: Optional[bool] = None,
-        reuse_pack: Optional[PackedObjectContainer] = None,
+        reuse_deltas: bool = True,
         compression_level: int = -1
 ):
     """Write a new pack data file.
 
     Args:
       write: write function to use
-      objects: Iterable of (object, path) tuples to write. Should provide
-         __len__
+      container: PackedObjectContainer
+      entries: Sequence of (object_id, path) tuples to write
       delta_window_size: Sliding window size for searching for deltas;
                          Set to None for default window size.
       deltify: Whether to deltify objects
-      reuse_pack: Pack object we can search for objects to reuse
       compression_level: the zlib compression level to use
     Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
     """
-    if hasattr(write, 'write'):
-        warnings.warn(
-            'write_pack_objects() now takes a write rather than file argument',
-            DeprecationWarning, stacklevel=2)
-        write = write.write
+    pack_contents_count = len(object_ids)
+    pack_contents = generate_unpacked_objects(
+        container, object_ids, delta_window_size=delta_window_size,
+        deltify=deltify,
+        reuse_deltas=reuse_deltas)
+
+    return write_pack_data(
+        write,
+        pack_contents,
+        num_records=pack_contents_count,
+        compression_level=compression_level,
+    )
+
+
+
+def write_pack_objects(
+        write,
+        objects: Sequence[Tuple[ShaFile, Optional[bytes]]],
+        delta_window_size: Optional[int] = None,
+        deltify: Optional[bool] = None,
+        compression_level: int = -1
+):
+    """Write a new pack data file.
 
+    Args:
+      write: write function to use
+      objects: Sequence of (object, path) tuples to write
+      delta_window_size: Sliding window size for searching for deltas;
+                         Set to None for default window size.
+      deltify: Whether to deltify objects
+      compression_level: the zlib compression level to use
+    Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
+    """
     pack_contents_count, pack_contents = pack_objects_to_data(
-        objects, delta_window_size=delta_window_size,
-        deltify=deltify,
-        reuse_pack=reuse_pack)
+        objects, deltify=deltify)
 
     return write_pack_data(
         write,
@@ -1888,11 +1947,11 @@ def write_pack_objects(
 
 class PackChunkGenerator:
 
-    def __init__(self, num_records=None, records=None, progress=None, compression_level=-1):
+    def __init__(self, num_records=None, records=None, progress=None, compression_level=-1, reuse_compressed=True):
         self.cs = sha1(b"")
         self.entries = {}
         self._it = self._pack_data_chunks(
-            num_records=num_records, records=records, progress=progress, compression_level=compression_level)
+            num_records=num_records, records=records, progress=progress, compression_level=compression_level, reuse_compressed=reuse_compressed)
 
     def sha1digest(self):
         return self.cs.digest()
@@ -1900,12 +1959,12 @@ class PackChunkGenerator:
     def __iter__(self):
         return self._it
 
-    def _pack_data_chunks(self, records: Iterator[UnpackedObject], *, num_records=None, progress=None, compression_level: int = -1) -> Iterator[bytes]:
+    def _pack_data_chunks(self, records: Iterator[UnpackedObject], *, num_records=None, progress=None, compression_level: int = -1, reuse_compressed: bool = True) -> Iterator[bytes]:
         """Iterate pack data file chunks.
 
         Args:
-          num_records: Number of records (defaults to len(records) if None)
-          records: Iterator over type_num, object_id, delta_base, raw
+          records: Iterator over UnpackedObject
+          num_records: Number of records (defaults to len(records) if not specified)
           progress: Function to report progress to
           compression_level: the zlib compression level
         Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
@@ -1921,23 +1980,28 @@ class PackChunkGenerator:
         actual_num_records = 0
         for i, unpacked in enumerate(records):
             type_num = unpacked.pack_type_num
-            if progress is not None:
+            if progress is not None and i % 1000 == 0:
                 progress(("writing pack data: %d/%d\r" % (i, num_records)).encode("ascii"))
+            raw: Union[List[bytes], Tuple[int, List[bytes]], Tuple[bytes, List[bytes]]]
             if unpacked.delta_base is not None:
                 try:
                     base_offset, base_crc32 = self.entries[unpacked.delta_base]
                 except KeyError:
                     type_num = REF_DELTA
+                    assert isinstance(unpacked.delta_base, bytes)
                     raw = (unpacked.delta_base, unpacked.decomp_chunks)
                 else:
                     type_num = OFS_DELTA
                     raw = (offset - base_offset, unpacked.decomp_chunks)
             else:
                 raw = unpacked.decomp_chunks
+            if unpacked.comp_chunks is not None and reuse_compressed:
+                chunks = unpacked.comp_chunks
+            else:
+                chunks = pack_object_chunks(type_num, raw, compression_level=compression_level)
             crc32 = 0
             object_size = 0
-            # TODO(jelmer,perf): if unpacked.comp_chunks is populated, use that?
-            for chunk in pack_object_chunks(type_num, raw, compression_level=compression_level):
+            for chunk in chunks:
                 yield chunk
                 crc32 = binascii.crc32(chunk, crc32)
                 self.cs.update(chunk)
@@ -2189,20 +2253,6 @@ def write_pack_index_v2(
 write_pack_index = write_pack_index_v2
 
 
-class _PackTupleIterable:
-    """Helper for Pack.pack_tuples."""
-
-    def __init__(self, iterobjects, length):
-        self._iterobjects = iterobjects
-        self._length = length
-
-    def __len__(self):
-        return self._length
-
-    def __iter__(self):
-        return ((o, None) for o in self._iterobjects())
-
-
 class Pack:
     """A Git pack object."""
 
@@ -2318,6 +2368,9 @@ class Pack:
     def get_stored_checksum(self) -> bytes:
         return self.data.get_stored_checksum()
 
+    def pack_tuples(self):
+        return [(o, None) for o in self.iterobjects()]
+
     def __contains__(self, sha1: bytes) -> bool:
         """Check whether this pack contains a particular SHA1."""
         try:
@@ -2326,21 +2379,6 @@ class Pack:
         except KeyError:
             return False
 
-    def get_raw_unresolved(self, sha1: bytes) -> Tuple[int, Optional[bytes], List[bytes]]:
-        """Get raw unresolved data for a SHA.
-
-        Args:
-          sha1: SHA to return data for
-        Returns: Tuple with pack object type, delta base (if applicable),
-            list of data chunks
-        """
-        offset = self.index.object_offset(sha1)
-        (obj_type, delta_base, chunks) = self.data.get_compressed_data_at(offset)
-        if obj_type == OFS_DELTA:
-            delta_base = sha_to_hex(self.index.object_sha1(offset - delta_base))
-            obj_type = REF_DELTA
-        return (obj_type, delta_base, chunks)
-
     def get_raw(self, sha1: bytes) -> Tuple[int, bytes]:
         offset = self.index.object_offset(sha1)
         obj_type, obj = self.data.get_object_at(offset)
@@ -2367,11 +2405,42 @@ class Pack:
                 resolve_ext_ref=self.resolve_ext_ref)
             if uo.sha() in shas)
 
-    def pack_tuples(self):
-        return _PackTupleIterable(self.iterobjects, len(self))
-
-    def iter_unpacked(self):
-        return self.data.iter_unpacked()
+    def iter_unpacked_subset(self, shas, *, include_comp: bool = False, allow_missing: bool = False, convert_ofs_delta: bool = False) -> Iterator[UnpackedObject]:
+        ofs_pending: Dict[int, List[UnpackedObject]] = defaultdict(list)
+        ofs: Dict[bytes, int] = {}
+        todo = set(shas)
+        for unpacked in self.iter_unpacked(include_comp=include_comp):
+            sha = unpacked.sha()
+            ofs[unpacked.offset] = sha
+            hexsha = sha_to_hex(sha)
+            if hexsha in todo:
+                if unpacked.pack_type_num == OFS_DELTA:
+                    assert isinstance(unpacked.delta_base, int)
+                    base_offset = unpacked.offset - unpacked.delta_base
+                    try:
+                        unpacked.delta_base = ofs[base_offset]
+                    except KeyError:
+                        ofs_pending[base_offset].append(unpacked)
+                        continue
+                    else:
+                        unpacked.pack_type_num = REF_DELTA
+                yield unpacked
+                todo.remove(hexsha)
+            for child in ofs_pending.pop(unpacked.offset, []):
+                child.pack_type_num = REF_DELTA
+                child.delta_base = sha
+                yield child
+        assert not ofs_pending
+        if not allow_missing and todo:
+            raise KeyError(todo.pop())
+
+    def iter_unpacked(self, include_comp=False):
+        ofs_to_entries = {ofs: (sha, crc32) for (sha, ofs, crc32) in self.index.iterentries()}
+        for unpacked in self.data.iter_unpacked(include_comp=include_comp):
+            (sha, crc32) = ofs_to_entries[unpacked.offset]
+            unpacked._sha = sha
+            unpacked.crc32 = crc32
+            yield unpacked
 
     def keep(self, msg: Optional[bytes] = None) -> str:
         """Add a .keep file for the pack, preventing git from garbage collecting it.
@@ -2477,12 +2546,12 @@ class Pack:
         offset = self.index.object_offset(sha)
         unpacked = self.data.get_unpacked_object_at(offset, include_comp=include_comp)
         if unpacked.pack_type_num == OFS_DELTA and convert_ofs_delta:
+            assert isinstance(unpacked.delta_base, int)
             unpacked.delta_base = self.index.object_sha1(offset - unpacked.delta_base)
             unpacked.pack_type_num = REF_DELTA
         return unpacked
 
 
-
 try:
     from dulwich._pack import (  # type: ignore # noqa: F811
         apply_delta,
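
The reuse rule in find_reusable_deltas() above is where the bandwidth saving comes from: a stored delta is sent as-is only when its base is itself part of the outgoing pack, or is listed in other_haves as already present on the receiving side. A minimal sketch of counting qualifying deltas, where `store` and `ids_to_send` stand in for a pack-based object store and a set of hex SHAs:

from dulwich.pack import find_reusable_deltas

def count_reusable(store, ids_to_send, remote_has=frozenset()):
    reused = 0
    for unpacked in find_reusable_deltas(store, set(ids_to_send),
                                         other_haves=set(remote_has)):
        # Each record is a REF_DELTA whose base passes the check above, so its
        # already-compressed chunks can later be streamed verbatim (see the
        # reuse_compressed handling in PackChunkGenerator in this diff).
        reused += 1
    return reused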

+ 5 - 19
dulwich/porcelain.py

@@ -134,7 +134,7 @@ from dulwich.objectspec import (
 )
 from dulwich.pack import (
     write_pack_index,
-    write_pack_objects,
+    write_pack_from_container,
 )
 from dulwich.patch import write_tree_diff
 from dulwich.protocol import (
@@ -1753,18 +1753,6 @@ def repack(repo):
         r.object_store.pack_loose_objects()
 
 
-def find_pack_for_reuse(repo):
-    reuse_pack = None
-    max_pack_len = 0
-    # The pack file which contains the largest number of objects
-    # will be most suitable for object reuse.
-    for p in repo.object_store.packs:
-        if len(p) > max_pack_len:
-            reuse_pack = p
-            max_pack_len = len(reuse_pack)
-    return reuse_pack
-
-
 def pack_objects(repo, object_ids, packf, idxf, delta_window_size=None, deltify=None, reuse_deltas=True):
     """Pack objects into a file.
 
@@ -1779,15 +1767,13 @@ def pack_objects(repo, object_ids, packf, idxf, delta_window_size=None, deltify=
       reuse_deltas: Allow reuse of existing deltas while deltifying
     """
     with open_repo_closing(repo) as r:
-        reuse_pack = None
-        if deltify and reuse_deltas:
-            reuse_pack = find_pack_for_reuse(r)
-        entries, data_sum = write_pack_objects(
+        entries, data_sum = write_pack_from_container(
             packf.write,
-            r.object_store.iter_shas((oid, None) for oid in object_ids),
+            repo.object_store,
+            [(oid, None) for oid in object_ids],
             deltify=deltify,
             delta_window_size=delta_window_size,
-            reuse_pack=reuse_pack
+            reuse_deltas=reuse_deltas,
         )
     if idxf is not None:
         entries = sorted([(k, v[0], v[1]) for (k, v) in entries.items()])
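
At the porcelain level the old single-pack heuristic is gone: pack_objects() now forwards reuse_deltas straight to write_pack_from_container(), which can pull reusable deltas from any local pack. A usage sketch with illustrative paths, packing every object in the repository:

from dulwich import porcelain
from dulwich.repo import Repo

repo = Repo("/path/to/repo")          # assumption: any local repository
object_ids = list(repo.object_store)  # hex SHAs of every object, for illustration

with open("/tmp/all.pack", "wb") as packf, open("/tmp/all.idx", "wb") as idxf:
    porcelain.pack_objects(repo, object_ids, packf, idxf, reuse_deltas=True)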

+ 19 - 18
dulwich/repo.py

@@ -71,7 +71,7 @@ from dulwich.object_store import (
     DiskObjectStore,
     MemoryObjectStore,
     MissingObjectFinder,
-    BaseObjectStore,
+    PackBasedObjectStore,
     ObjectStoreGraphWalker,
     peel_sha,
     MissingObjectFinder,
@@ -87,7 +87,7 @@ from dulwich.objects import (
     ObjectID,
 )
 from dulwich.pack import (
-    pack_objects_to_data,
+    generate_unpacked_objects
 )
 
 from dulwich.hooks import (
@@ -363,7 +363,7 @@ class BaseRepo:
         repository
     """
 
-    def __init__(self, object_store: BaseObjectStore, refs: RefsContainer):
+    def __init__(self, object_store: PackBasedObjectStore, refs: RefsContainer):
         """Open a repository.
 
         This shouldn't be called directly, but rather through one of the
@@ -486,10 +486,12 @@ class BaseRepo:
         Returns: count and iterator over pack data
         """
         # TODO(jelmer): Fetch pack data directly, don't create objects first.
-        objects = self.fetch_objects(
+        object_ids = self.fetch_objects(
             determine_wants, graph_walker, progress, get_tagged, depth=depth
         )
-        return pack_objects_to_data(objects)
+        # TODO(jelmer): Set other_has=missing_objects.remote_has
+        return len(object_ids), generate_unpacked_objects(
+            self.object_store, object_ids, progress=progress)
 
     def fetch_objects(
         self,
@@ -567,17 +569,14 @@ class BaseRepo:
         def get_parents(commit):
             return parents_provider.get_parents(commit.id, commit)
 
-        return self.object_store.iter_shas(
-            MissingObjectFinder(
-                self.object_store,
-                haves=haves,
-                wants=wants,
-                shallow=self.get_shallow(),
-                progress=progress,
-                get_tagged=get_tagged,
-                get_parents=get_parents,
-            )
-        )
+        return list(MissingObjectFinder(
+            self.object_store,
+            haves,
+            wants,
+            self.get_shallow(),
+            progress,
+            get_tagged,
+            get_parents=get_parents))
 
     def generate_pack_data(self, have: List[ObjectID], want: List[ObjectID],
                            progress: Optional[Callable[[str], None]] = None,
@@ -1117,7 +1116,7 @@ class Repo(BaseRepo):
     def __init__(
         self,
         root: str,
-        object_store: Optional[BaseObjectStore] = None,
+        object_store: Optional[PackBasedObjectStore] = None,
         bare: Optional[bool] = None
     ) -> None:
         self.symlink_fn = None
@@ -1434,7 +1433,9 @@ class Repo(BaseRepo):
         for fs_path in fs_paths:
             tree_path = _fs_to_tree_path(fs_path)
             try:
-                tree_entry = self.object_store[tree_id].lookup_path(
+                tree = self.object_store[tree_id]
+                assert isinstance(tree, Tree)
+                tree_entry = tree.lookup_path(
                     self.object_store.__getitem__, tree_path)
             except KeyError:
                 # if tree_entry didn't exist, this file was being added, so

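BaseRepo.fetch_objects now hands back the plain list of missing (sha, path) pairs instead of an object-store iterator, and fetch_pack_data turns that list into a lazy stream of UnpackedObject records via generate_unpacked_objects, so deltas that already exist locally can be copied into the outgoing pack without inflating full objects first. A hedged sketch of consuming the new (count, records) shape through the public generate_pack_data API, with /path/to/repo as a placeholder path:

    from dulwich.pack import PackChunkGenerator
    from dulwich.repo import Repo

    r = Repo("/path/to/repo")
    count, records = r.generate_pack_data(have=[], want=[r.head()])
    with open("everything.pack", "wb") as f:
        # PackChunkGenerator serializes the UnpackedObject stream into pack
        # chunks, the same way send_pack streams them to the remote.
        for chunk in PackChunkGenerator(count, records):
            f.write(chunk)
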
+ 4 - 4
dulwich/server.py

@@ -71,7 +71,7 @@ from dulwich.object_store import (
     peel_sha,
 )
 from dulwich.pack import (
-    write_pack_objects,
+    write_pack_from_container,
     ObjectContainer,
 )
 from dulwich.protocol import (
@@ -379,7 +379,7 @@ class UploadPackHandler(PackHandler):
             wants.extend(graph_walker.determine_wants(refs, **kwargs))
             return wants
 
-        objects_iter = self.repo.fetch_objects(
+        object_ids = self.repo.fetch_objects(
             wants_wrapper,
             graph_walker,
             self.progress,
@@ -410,9 +410,9 @@ class UploadPackHandler(PackHandler):
             return
 
         self.progress(
-            ("counting objects: %d, done.\n" % len(objects_iter)).encode("ascii")
+            ("counting objects: %d, done.\n" % len(object_ids)).encode("ascii")
         )
-        write_pack_objects(write, objects_iter)
+        write_pack_from_container(write, self.repo.object_store, object_ids)
         # we are done
         self.proto.write_pkt_line(None)
 

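The server takes the same route: once fetch_objects has produced the object ids, write_pack_from_container streams a pack for them straight out of the repository's object store. A direct call looks roughly like this sketch, assuming the (sha, path) pair form seen in the porcelain hunk and a placeholder repository path:

    from dulwich.pack import write_pack_from_container
    from dulwich.repo import Repo

    r = Repo("/path/to/repo")
    object_ids = [(r.head(), None)]   # (sha, path-hint) pairs
    with open("direct.pack", "wb") as f:
        write_pack_from_container(f.write, r.object_store, object_ids)
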
+ 2 - 2
dulwich/tests/compat/test_client.py

@@ -329,7 +329,7 @@ class DulwichClientTestBase:
             sendrefs[b"refs/heads/abranch"] = b"00" * 20
             del sendrefs[b"HEAD"]
 
-            def gen_pack(have, want, ofs_delta=False):
+            def gen_pack(have, want, ofs_delta=False, progress=None):
                 return 0, []
 
             c = self._client()
@@ -344,7 +344,7 @@ class DulwichClientTestBase:
             dest.refs[b"refs/heads/abranch"] = dummy_commit
             sendrefs = {b"refs/heads/bbranch": dummy_commit}
 
-            def gen_pack(have, want, ofs_delta=False):
+            def gen_pack(have, want, ofs_delta=False, progress=None):
                 return 0, []
 
             c = self._client()

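These compat tests change because send_pack now forwards its progress callback into the pack-data generator, so any custom generate_pack_data callable must accept the extra keyword. A conforming no-op stub, returning the (object count, record iterator) pair send_pack expects:

    def gen_pack(have, want, ofs_delta=False, progress=None):
        # Nothing to send: zero objects and an empty iterator of pack records.
        return 0, iter([])
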
+ 4 - 4
dulwich/tests/compat/test_pack.py

@@ -115,8 +115,8 @@ class TestPack(PackTests):
                 (new_blob, None),
                 (new_blob_2, None),
             ]
-        pack_path = os.path.join(self._tempdir, "pack_with_deltas")
-        write_pack(pack_path, all_to_pack, deltify=True)
+            pack_path = os.path.join(self._tempdir, "pack_with_deltas")
+            write_pack(pack_path, all_to_pack, deltify=True)
         output = run_git_or_fail(["verify-pack", "-v", pack_path])
         self.assertEqual(
             {x[0].id for x in all_to_pack},
@@ -154,8 +154,8 @@ class TestPack(PackTests):
                 (new_blob, None),
                 (new_blob_2, None),
             ]
-        pack_path = os.path.join(self._tempdir, "pack_with_deltas")
-        write_pack(pack_path, all_to_pack, deltify=True)
+            pack_path = os.path.join(self._tempdir, "pack_with_deltas")
+            write_pack(pack_path, all_to_pack, deltify=True)
         output = run_git_or_fail(["verify-pack", "-v", pack_path])
         self.assertEqual(
             {x[0].id for x in all_to_pack},

+ 13 - 11
dulwich/tests/test_client.py

@@ -200,7 +200,7 @@ class GitClientTests(TestCase):
         self.assertEqual({}, ret.symrefs)
         self.assertEqual(self.rout.getvalue(), b"0000")
 
-    def test_send_pack_no_sideband64k_with_update_ref_error(self):
+    def test_send_pack_no_sideband64k_with_update_ref_error(self) -> None:
         # No side-band-64k reported by server shouldn't try to parse
         # side band data
         pkts = [
@@ -233,11 +233,11 @@ class GitClientTests(TestCase):
                 b"refs/foo/bar": commit.id,
             }
 
-        def generate_pack_data(have, want, ofs_delta=False):
+        def generate_pack_data(have, want, ofs_delta=False, progress=None):
             return pack_objects_to_data(
                 [
                     (commit, None),
-                    (tree, ""),
+                    (tree, b""),
                 ]
             )
 
@@ -260,7 +260,7 @@ class GitClientTests(TestCase):
         def update_refs(refs):
             return {b"refs/heads/master": b"310ca9477129b8586fa2afc779c1f57cf64bba6c"}
 
-        def generate_pack_data(have, want, ofs_delta=False):
+        def generate_pack_data(have, want, ofs_delta=False, progress=None):
             return 0, []
 
         self.client.send_pack(b"/", update_refs, generate_pack_data)
@@ -280,7 +280,7 @@ class GitClientTests(TestCase):
         def update_refs(refs):
             return {b"refs/heads/master": b"0" * 40}
 
-        def generate_pack_data(have, want, ofs_delta=False):
+        def generate_pack_data(have, want, ofs_delta=False, progress=None):
             return 0, []
 
         self.client.send_pack(b"/", update_refs, generate_pack_data)
@@ -304,7 +304,7 @@ class GitClientTests(TestCase):
         def update_refs(refs):
             return {b"refs/heads/master": b"0" * 40}
 
-        def generate_pack_data(have, want, ofs_delta=False):
+        def generate_pack_data(have, want, ofs_delta=False, progress=None):
             return 0, []
 
         self.client.send_pack(b"/", update_refs, generate_pack_data)
@@ -331,11 +331,11 @@ class GitClientTests(TestCase):
                 b"refs/heads/master": b"310ca9477129b8586fa2afc779c1f57cf64bba6c",
             }
 
-        def generate_pack_data(have, want, ofs_delta=False):
+        def generate_pack_data(have, want, ofs_delta=False, progress=None):
             return 0, []
 
         f = BytesIO()
-        write_pack_objects(f.write, {})
+        write_pack_objects(f.write, [])
         self.client.send_pack("/", update_refs, generate_pack_data)
         self.assertEqual(
             self.rout.getvalue(),
@@ -371,7 +371,7 @@ class GitClientTests(TestCase):
                 b"refs/heads/master": b"310ca9477129b8586fa2afc779c1f57cf64bba6c",
             }
 
-        def generate_pack_data(have, want, ofs_delta=False):
+        def generate_pack_data(have, want, ofs_delta=False, progress=None):
             return pack_objects_to_data(
                 [
                     (commit, None),
@@ -408,7 +408,7 @@ class GitClientTests(TestCase):
         def update_refs(refs):
             return {b"refs/heads/master": b"0" * 40}
 
-        def generate_pack_data(have, want, ofs_delta=False):
+        def generate_pack_data(have, want, ofs_delta=False, progress=None):
             return 0, []
 
         result = self.client.send_pack(b"/", update_refs, generate_pack_data)
@@ -862,7 +862,9 @@ class LocalGitClientTests(TestCase):
 
     def test_fetch_into_empty(self):
         c = LocalGitClient()
-        t = MemoryRepo()
+        target = tempfile.mkdtemp()
+        self.addCleanup(shutil.rmtree, target)
+        t = Repo.init_bare(target)
         s = open_repo("a.git")
         self.addCleanup(tear_down_repo, s)
         self.assertEqual(s.get_refs(), c.fetch(s.path, t).refs)

+ 0 - 37
dulwich/tests/test_greenthreads.py

@@ -46,7 +46,6 @@ except ImportError:
 
 if gevent_support:
     from dulwich.greenthreads import (
-        GreenThreadsObjectStoreIterator,
         GreenThreadsMissingObjectFinder,
     )
 
@@ -77,42 +76,6 @@ def init_store(store, count=1):
     return ret
 
 
-@skipIf(not gevent_support, skipmsg)
-class TestGreenThreadsObjectStoreIterator(TestCase):
-    def setUp(self):
-        super().setUp()
-        self.store = MemoryObjectStore()
-        self.cmt_amount = 10
-        self.objs = init_store(self.store, self.cmt_amount)
-
-    def test_len(self):
-        wants = [sha.id for sha in self.objs if isinstance(sha, Commit)]
-        finder = MissingObjectFinder(self.store, (), wants)
-        iterator = GreenThreadsObjectStoreIterator(
-            self.store, iter(finder.next, None), finder
-        )
-        # One commit refers one tree and one blob
-        self.assertEqual(len(iterator), self.cmt_amount * 3)
-        haves = wants[0 : self.cmt_amount - 1]
-        finder = MissingObjectFinder(self.store, haves, wants)
-        iterator = GreenThreadsObjectStoreIterator(
-            self.store, iter(finder.next, None), finder
-        )
-        self.assertEqual(len(iterator), 3)
-
-    def test_iter(self):
-        wants = [sha.id for sha in self.objs if isinstance(sha, Commit)]
-        finder = MissingObjectFinder(self.store, (), wants)
-        iterator = GreenThreadsObjectStoreIterator(
-            self.store, iter(finder.next, None), finder
-        )
-        objs = []
-        for sha, path in iterator:
-            self.assertIn(sha, self.objs)
-            objs.append(sha)
-        self.assertEqual(len(objs), len(self.objs))
-
-
 @skipIf(not gevent_support, skipmsg)
 class TestGreenThreadsMissingObjectFinder(TestCase):
     def setUp(self):

+ 20 - 16
dulwich/tests/test_pack.py

@@ -429,7 +429,7 @@ class TestPack(PackTests):
         with self.get_pack(pack1_sha) as origpack:
             self.assertSucceeds(origpack.index.check)
             basename = os.path.join(self.tempdir, "Elch")
-            write_pack(basename, list(origpack.pack_tuples()))
+            write_pack(basename, origpack.pack_tuples())
 
             with Pack(basename) as newpack:
                 self.assertEqual(origpack, newpack)
@@ -453,7 +453,7 @@ class TestPack(PackTests):
 
     def _copy_pack(self, origpack):
         basename = os.path.join(self.tempdir, "somepack")
-        write_pack(basename, list(origpack.pack_tuples()))
+        write_pack(basename, origpack.pack_tuples())
         return Pack(basename)
 
     def test_keep_no_message(self):
@@ -578,24 +578,28 @@ class TestThinPack(PackTests):
         with self.make_pack(True) as p:
             self.assertEqual((3, b"foo1234"), p.get_raw(self.blobs[b"foo1234"].id))
 
-    def test_get_raw_unresolved(self):
+    def test_get_unpacked_object(self):
+        self.maxDiff = None
         with self.make_pack(False) as p:
-            self.assertEqual(
-                (
-                    7,
-                    b"\x19\x10(\x15f=#\xf8\xb7ZG\xe7\xa0\x19e\xdc\xdc\x96F\x8c",
-                    [b"x\x9ccf\x9f\xc0\xccbhdl\x02\x00\x06f\x01l"],
-                ),
-                p.get_raw_unresolved(self.blobs[b"foo1234"].id),
+            expected = UnpackedObject(
+                7,
+                b"\x19\x10(\x15f=#\xf8\xb7ZG\xe7\xa0\x19e\xdc\xdc\x96F\x8c",
+                decomp_chunks=[b'\x03\x07\x90\x03\x041234'],
             )
+            expected.offset = 12
+            got = p.get_unpacked_object(self.blobs[b"foo1234"].id)
+            self.assertEqual(expected, got)
         with self.make_pack(True) as p:
+            expected = UnpackedObject(
+                7,
+                b"\x19\x10(\x15f=#\xf8\xb7ZG\xe7\xa0\x19e\xdc\xdc\x96F\x8c",
+                decomp_chunks=[b'\x03\x07\x90\x03\x041234'],
+            )
+            expected.offset = 12
+            got = p.get_unpacked_object(self.blobs[b"foo1234"].id)
             self.assertEqual(
-                (
-                    7,
-                    b"\x19\x10(\x15f=#\xf8\xb7ZG\xe7\xa0\x19e\xdc\xdc\x96F\x8c",
-                    [b"x\x9ccf\x9f\xc0\xccbhdl\x02\x00\x06f\x01l"],
-                ),
-                p.get_raw_unresolved(self.blobs[b"foo1234"].id),
+                expected,
+                got,
             )
 
     def test_iterobjects(self):

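get_raw_unresolved returned the delta still zlib-compressed, while its replacement get_unpacked_object returns an UnpackedObject whose decomp_chunks are already decompressed, which is what lets another pack copy the delta verbatim. Reading the expected delta bytes above (my interpretation, as a sketch): b"\x03\x07" declares a 3-byte source and a 7-byte target, b"\x90\x03" copies the 3-byte base blob b"foo", and b"\x041234" inserts the literal b"1234":

    from dulwich.pack import apply_delta

    delta = b"\x03\x07\x90\x03\x041234"
    # Applying the delta to the 3-byte base reproduces the b"foo1234" blob.
    assert b"".join(apply_delta(b"foo", delta)) == b"foo1234"
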
+ 4 - 1
dulwich/tests/test_server.py

@@ -165,7 +165,10 @@ class HandlerTestCase(TestCase):
 class UploadPackHandlerTestCase(TestCase):
     def setUp(self):
         super().setUp()
-        self._repo = MemoryRepo.init_bare([], {})
+        self.path = tempfile.mkdtemp()
+        self.addCleanup(shutil.rmtree, self.path)
+        self.repo = Repo.init(self.path)
+        self._repo = Repo.init_bare(self.path)
         backend = DictBackend({b"/": self._repo})
         self._handler = UploadPackHandler(
             backend, [b"/", b"host=lolcathost"], TestProto()