Переглянути джерело

allow for reuse of existing deltas while creating pack files

This feature allows the ``dulwich pack-objects`` command to
complete in a somewhat reasonable amount of time on small
repositories, provided that a pack file which can supply deltas
already exists.

Reusing deltas from an existing pack file avoids deltification
overhead while creating a new pack file. This optimization is
especially important when serving clones and fetches from a server.

For testing purposes, add two options to ``dulwich pack-objects``:
  --deltify: enables deltification and delta-reuse
  --no-reuse-deltas: disables delta-reuse during deltification

The write_pack_objects() now accepts an optional Pack file object
argument which enables reuse of deltas.

If deltification is enabled and delta-reuse is not explicitly disabled,
the pack_objects() routine in porcelain.py picks a suitable pack file
for reuse, which is the pack file which contains the largest number of
objects. A delta found in this pack file can be reused if its delta-base
will also be part of the generated pack file.

There are some limitations which should be lifted eventually:

Reused deltas are stored as ref-deltas only. There is no logic
yet to figure out where the base of a reused offset-delta has
been placed in the new pack file.

We decompress deltas for reuse and compress them again when writing
them to the new pack file. More code refactoring would be needed to
support direct copying from the source pack file in order to avoid
re-compression overhead.
Stefan Sperling 2 роки тому
батько
коміт
89b43214b1
4 змінених файлів з 101 додано та 6 видалено
  1. 5 0
      NEWS
  2. 14 2
      dulwich/cli.py
  3. 60 3
      dulwich/pack.py
  4. 22 1
      dulwich/porcelain.py

+ 5 - 0
NEWS

@@ -2,6 +2,11 @@
 
 0.20.50	2022-10-30
 
+ * Add --deltify option to ``dulwich pack-objects`` which enables
+   deltification, and add initial support for reusing suitable
+   deltas found in an existing pack file.
+   (Stefan Sperling)
+
  * Fix Repo.reset_index.
    Previously, it instead took the union with the given tree.
    (Christian Sattler, #1072)

+ 14 - 2
dulwich/cli.py

@@ -523,12 +523,18 @@ class cmd_ls_tree(Command):
 
 class cmd_pack_objects(Command):
     def run(self, args):
-        opts, args = getopt(args, "", ["stdout"])
+        deltify = False
+        reuse_deltas = True
+        opts, args = getopt(args, "", ["stdout", "deltify", "no-reuse-deltas"])
         opts = dict(opts)
         if len(args) < 1 and "--stdout" not in opts.keys():
             print("Usage: dulwich pack-objects basename")
             sys.exit(1)
         object_ids = [line.strip() for line in sys.stdin.readlines()]
+        if "--deltify" in opts.keys():
+            deltify = True
+        if "--no-reuse-deltas" in opts.keys():
+            reuse_deltas = False
         if "--stdout" in opts.keys():
             packf = getattr(sys.stdout, "buffer", sys.stdout)
             idxf = None
@@ -538,7 +544,13 @@ class cmd_pack_objects(Command):
             packf = open(basename + ".pack", "wb")
             idxf = open(basename + ".idx", "wb")
             close = [packf, idxf]
-        porcelain.pack_objects(".", object_ids, packf, idxf)
+        porcelain.pack_objects(
+            ".",
+            object_ids,
+            packf,
+            idxf,
+            deltify=deltify,
+            reuse_deltas=reuse_deltas)
         for f in close:
             f.close()
 

+ 60 - 3
dulwich/pack.py

@@ -1270,6 +1270,22 @@ class PackData(object):
             unpacked.comp_chunks,
         )
 
+    def get_decompressed_data_at(self, offset):
+        """Given an offset in the packfile, decompress the data that is there.
+
+        Using the associated index the location of an object can be looked up,
+        and then the packfile can be asked directly for that object using this
+        function.
+        """
+        assert offset >= self._header_size
+        self._file.seek(offset)
+        unpacked, _ = unpack_object(self._file.read, include_comp=False)
+        return (
+            unpacked.pack_type_num,
+            unpacked.delta_base,
+            unpacked.decomp_chunks,
+        )
+
     def get_object_at(self, offset):
         """Given an offset in to the packfile return the object that is there.
 
@@ -1616,22 +1632,42 @@ def write_pack_header(write, num_objects):
         write(chunk)
 
 
-def deltify_pack_objects(objects, window_size=None):
+def deltify_pack_objects(objects, window_size=None, reuse_pack=None):
     """Generate deltas for pack objects.
 
     Args:
       objects: An iterable of (object, path) tuples to deltify.
       window_size: Window size; None for default
+      reuse_pack: Pack object we can search for objects to reuse
     Returns: Iterator over type_num, object id, delta_base, content
         delta_base is None for full text entries
     """
     # TODO(jelmer): Use threads
     if window_size is None:
         window_size = DEFAULT_PACK_DELTA_WINDOW_SIZE
+
+    reused_deltas = set()
+    if reuse_pack:
+        # Build a set of SHA1 IDs which will be part of this pack file.
+        # We can only reuse a delta if its base will be present in the
+        # generated pack file.
+        objects_to_pack = set()
+        for obj, path in objects:
+            objects_to_pack.add(sha_to_hex(obj.sha().digest()))
+        for o, _ in objects:
+            if not o.sha().digest() in reuse_pack:
+                continue
+            # get_raw_unresolved() translates OFS_DELTA into REF_DELTA for us
+            (obj_type, delta_base, _) = reuse_pack.get_raw_unresolved(o.sha().digest())
+            if obj_type == REF_DELTA and delta_base in objects_to_pack:
+                reused_deltas.add(o.sha().digest())
+
     # Build a list of objects ordered by the magic Linus heuristic
     # This helps us find good objects to diff against us
     magic = []
     for obj, path in objects:
+        if obj.sha().digest() in reused_deltas:
+            continue
         magic.append((obj.type_num, path, -obj.raw_length(), obj))
     magic.sort()
 
@@ -1661,6 +1697,10 @@ def deltify_pack_objects(objects, window_size=None):
         while len(possible_bases) > window_size:
             possible_bases.pop()
 
+    for sha_digest in reused_deltas:
+        (obj_type, delta_base, chunks) = reuse_pack.get_raw_delta(sha_digest)
+        yield obj_type, sha_digest, hex_to_sha(delta_base), chunks
+
 
 def pack_objects_to_data(objects):
     """Create pack data from objects
@@ -1680,7 +1720,7 @@ def pack_objects_to_data(objects):
 
 
 def write_pack_objects(
-    write, objects, delta_window_size=None, deltify=None, compression_level=-1
+    write, objects, delta_window_size=None, deltify=None, reuse_pack=None, compression_level=-1
 ):
     """Write a new pack data file.
 
@@ -1691,6 +1731,7 @@ def write_pack_objects(
       delta_window_size: Sliding window size for searching for deltas;
                          Set to None for default window size.
       deltify: Whether to deltify objects
+      reuse_pack: Pack object we can search for objects to reuse
       compression_level: the zlib compression level to use
     Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
     """
@@ -1705,7 +1746,7 @@ def write_pack_objects(
         # slow at the moment.
         deltify = False
     if deltify:
-        pack_contents = deltify_pack_objects(objects, delta_window_size)
+        pack_contents = deltify_pack_objects(objects, delta_window_size, reuse_pack)
         pack_contents_count = len(objects)
     else:
         pack_contents_count, pack_contents = pack_objects_to_data(objects)
@@ -2172,6 +2213,22 @@ class Pack(object):
         type_num, chunks = self.resolve_object(offset, obj_type, obj)
         return type_num, b"".join(chunks)
 
+    def get_raw_delta(self, sha1):
+        """Get raw decompressed delta data chunks for a given SHA1.
+        Convert OFS_DELTA objects to REF_DELTA objects, like get_raw_unresolved() does.
+
+        Args:
+          sha1: SHA to return data for
+        Returns: Tuple with pack object type, delta base (if applicable),
+            list of data chunks
+        """
+        offset = self.index.object_index(sha1)
+        (obj_type, delta_base, chunks) = self.data.get_decompressed_data_at(offset)
+        if obj_type == OFS_DELTA:
+            delta_base = sha_to_hex(self.index.object_sha1(offset - delta_base))
+            obj_type = REF_DELTA
+        return (obj_type, delta_base, chunks)
+
     def __getitem__(self, sha1):
         """Retrieve the specified SHA1."""
         type, uncomp = self.get_raw(sha1)

+ 22 - 1
dulwich/porcelain.py

@@ -1741,7 +1741,19 @@ def repack(repo):
         r.object_store.pack_loose_objects()
 
 
-def pack_objects(repo, object_ids, packf, idxf, delta_window_size=None):
+def find_pack_for_reuse(repo):
+    reuse_pack = None
+    max_pack_len = 0
+    # The pack file which contains the largest number of objects
+    # will be most suitable for object reuse.
+    for p in repo.object_store.packs:
+        if len(p) > max_pack_len:
+            reuse_pack = p
+            max_pack_len = len(reuse_pack)
+    return reuse_pack
+
+
+def pack_objects(repo, object_ids, packf, idxf, delta_window_size=None, deltify=None, reuse_deltas=True):
     """Pack objects into a file.
 
     Args:
@@ -1749,12 +1761,21 @@ def pack_objects(repo, object_ids, packf, idxf, delta_window_size=None):
       object_ids: List of object ids to write
       packf: File-like object to write to
       idxf: File-like object to write to (can be None)
+      delta_window_size: Sliding window size for searching for deltas;
+                         Set to None for default window size.
+      deltify: Whether to deltify objects
+      reuse_deltas: Allow reuse of existing deltas while deltifying
     """
     with open_repo_closing(repo) as r:
+        reuse_pack = None
+        if deltify and reuse_deltas:
+            reuse_pack = find_pack_for_reuse(r)
         entries, data_sum = write_pack_objects(
             packf.write,
             r.object_store.iter_shas((oid, None) for oid in object_ids),
+            deltify=deltify,
             delta_window_size=delta_window_size,
+            reuse_pack=reuse_pack
         )
     if idxf is not None:
         entries = sorted([(k, v[0], v[1]) for (k, v) in entries.items()])