Merge remote-tracking branch 'danchr/issue1107'

Jelmer Vernooij, 2 years ago (commit 7dac1cc553)
11 changed files with 291 additions and 25 deletions
  1. .github/workflows/pythontest.yml (+1, -8)
  2. NEWS (+5, -0)
  3. dulwich/cli.py (+32, -7)
  4. dulwich/file.py (+4, -1)
  5. dulwich/objects.py (+1, -1)
  6. dulwich/pack.py (+60, -3)
  7. dulwich/porcelain.py (+34, -1)
  8. dulwich/refs.py (+57, -3)
  9. dulwich/tests/test_porcelain.py (+37, -0)
  10. dulwich/tests/test_refs.py (+60, -0)
  11. setup.cfg (+0, -1)

+ 1 - 8
.github/workflows/pythontest.yml

@@ -13,14 +13,7 @@ jobs:
       matrix:
         os: [ubuntu-latest, macos-latest, windows-latest]
         python-version:
-          ["3.7", "3.8", "3.9", "3.10", "3.11", pypy3]
-        exclude:
-          # sqlite3 exit handling seems to get in the way
-          - os: macos-latest
-            python-version: pypy3
-          # doesn't support passing in bytestrings to os.scandir
-          - os: windows-latest
-            python-version: pypy3
+          ["3.7", "3.8", "3.9", "3.10", "3.11"]
       fail-fast: false
 
     steps:

+ 5 - 0
NEWS

@@ -2,6 +2,11 @@
 
 0.20.50	2022-10-30
 
+ * Add --deltify option to ``dulwich pack-objects`` which enables
+   deltification, and add initial support for reusing suitable
+   deltas found in an existing pack file.
+   (Stefan Sperling)
+
  * Fix Repo.reset_index.
    Previously, it instead took the union with the given tree.
    (Christian Sattler, #1072)

+ 32 - 7
dulwich/cli.py

@@ -300,6 +300,18 @@ class cmd_symbolic_ref(Command):
         porcelain.symbolic_ref(".", ref_name=ref_name, force="--force" in args)
 
 
+class cmd_pack_refs(Command):
+    def run(self, argv):
+        parser = argparse.ArgumentParser()
+        parser.add_argument('--all', action='store_true')
+        # ignored, we never prune
+        parser.add_argument('--no-prune', action='store_true')
+
+        args = parser.parse_args(argv)
+
+        porcelain.pack_refs(".", all=args.all)
+
+
 class cmd_show(Command):
     def run(self, argv):
         parser = argparse.ArgumentParser()
@@ -523,22 +535,34 @@ class cmd_ls_tree(Command):
 
 class cmd_pack_objects(Command):
     def run(self, args):
-        opts, args = getopt(args, "", ["stdout"])
+        deltify = False
+        reuse_deltas = True
+        opts, args = getopt(args, "", ["stdout", "deltify", "no-reuse-deltas"])
         opts = dict(opts)
-        if len(args) < 1 and "--stdout" not in args:
+        if len(args) < 1 and "--stdout" not in opts.keys():
             print("Usage: dulwich pack-objects basename")
             sys.exit(1)
         object_ids = [line.strip() for line in sys.stdin.readlines()]
-        basename = args[0]
-        if "--stdout" in opts:
+        if "--deltify" in opts.keys():
+            deltify = True
+        if "--no-reuse-deltas" in opts.keys():
+            reuse_deltas = False
+        if "--stdout" in opts.keys():
             packf = getattr(sys.stdout, "buffer", sys.stdout)
             idxf = None
             close = []
         else:
-            packf = open(basename + ".pack", "w")
-            idxf = open(basename + ".idx", "w")
+            basename = args[0]
+            packf = open(basename + ".pack", "wb")
+            idxf = open(basename + ".idx", "wb")
             close = [packf, idxf]
-        porcelain.pack_objects(".", object_ids, packf, idxf)
+        porcelain.pack_objects(
+            ".",
+            object_ids,
+            packf,
+            idxf,
+            deltify=deltify,
+            reuse_deltas=reuse_deltas)
         for f in close:
             f.close()
 
@@ -744,6 +768,7 @@ commands = {
     "ls-remote": cmd_ls_remote,
     "ls-tree": cmd_ls_tree,
     "pack-objects": cmd_pack_objects,
+    "pack-refs": cmd_pack_refs,
     "pull": cmd_pull,
     "push": cmd_push,
     "receive-pack": cmd_receive_pack,

+ 4 - 1
dulwich/file.py

@@ -209,7 +209,10 @@ class _GitFile(object):
         return self
 
     def __exit__(self, exc_type, exc_val, exc_tb):
-        self.close()
+        if exc_type is not None:
+            self.abort()
+        else:
+            self.close()
 
     def __getattr__(self, name):
         """Proxy property calls to the underlying file."""

+ 1 - 1
dulwich/objects.py

@@ -135,7 +135,7 @@ def hex_to_filename(path, hex):
     # os.path.join accepts bytes or unicode, but all args must be of the same
     # type. Make sure that hex which is expected to be bytes, is the same type
     # as path.
-    if getattr(path, "encode", None) is not None:
+    if type(path) != type(hex) and getattr(path, "encode", None) is not None:
         hex = hex.decode("ascii")
     dir = hex[:2]
     file = hex[2:]
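
With the added type check, ``hex`` is only decoded when its type differs from ``path``; previously a str ``hex`` alongside a str ``path`` would hit the non-existent ``str.decode``. For illustration:

    from dulwich.objects import hex_to_filename

    sha_hex = "ab" * 20
    # bytes hex next to a str path is still decoded to match:
    hex_to_filename("objects", sha_hex.encode("ascii"))
    # str hex next to a str path no longer attempts a decode:
    hex_to_filename("objects", sha_hex)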

+ 60 - 3
dulwich/pack.py

@@ -1270,6 +1270,22 @@ class PackData(object):
             unpacked.comp_chunks,
         )
 
+    def get_decompressed_data_at(self, offset):
+        """Given an offset in the packfile, decompress the data that is there.
+
+        Using the associated index the location of an object can be looked up,
+        and then the packfile can be asked directly for that object using this
+        function.
+        """
+        assert offset >= self._header_size
+        self._file.seek(offset)
+        unpacked, _ = unpack_object(self._file.read, include_comp=False)
+        return (
+            unpacked.pack_type_num,
+            unpacked.delta_base,
+            unpacked.decomp_chunks,
+        )
+
     def get_object_at(self, offset):
         """Given an offset in to the packfile return the object that is there.
 
@@ -1616,22 +1632,42 @@ def write_pack_header(write, num_objects):
         write(chunk)
 
 
-def deltify_pack_objects(objects, window_size=None):
+def deltify_pack_objects(objects, window_size=None, reuse_pack=None):
     """Generate deltas for pack objects.
 
     Args:
       objects: An iterable of (object, path) tuples to deltify.
       window_size: Window size; None for default
+      reuse_pack: Pack object we can search for objects to reuse
     Returns: Iterator over type_num, object id, delta_base, content
         delta_base is None for full text entries
     """
     # TODO(jelmer): Use threads
     if window_size is None:
         window_size = DEFAULT_PACK_DELTA_WINDOW_SIZE
+
+    reused_deltas = set()
+    if reuse_pack:
+        # Build a set of SHA1 IDs which will be part of this pack file.
+        # We can only reuse a delta if its base will be present in the
+        # generated pack file.
+        objects_to_pack = set()
+        for obj, path in objects:
+            objects_to_pack.add(sha_to_hex(obj.sha().digest()))
+        for o, _ in objects:
+            if not o.sha().digest() in reuse_pack:
+                continue
+            # get_raw_unresolved() translates OFS_DELTA into REF_DELTA for us
+            (obj_type, delta_base, _) = reuse_pack.get_raw_unresolved(o.sha().digest())
+            if obj_type == REF_DELTA and delta_base in objects_to_pack:
+                reused_deltas.add(o.sha().digest())
+
     # Build a list of objects ordered by the magic Linus heuristic
     # This helps us find good objects to diff against us
     magic = []
     for obj, path in objects:
+        if obj.sha().digest() in reused_deltas:
+            continue
         magic.append((obj.type_num, path, -obj.raw_length(), obj))
     magic.sort()
 
@@ -1661,6 +1697,10 @@ def deltify_pack_objects(objects, window_size=None):
         while len(possible_bases) > window_size:
             possible_bases.pop()
 
+    for sha_digest in reused_deltas:
+        (obj_type, delta_base, chunks) = reuse_pack.get_raw_delta(sha_digest)
+        yield obj_type, sha_digest, hex_to_sha(delta_base), chunks
+
 
 def pack_objects_to_data(objects):
     """Create pack data from objects
@@ -1680,7 +1720,7 @@ def pack_objects_to_data(objects):
 
 
 def write_pack_objects(
-    write, objects, delta_window_size=None, deltify=None, compression_level=-1
+    write, objects, delta_window_size=None, deltify=None, reuse_pack=None, compression_level=-1
 ):
     """Write a new pack data file.
 
@@ -1691,6 +1731,7 @@ def write_pack_objects(
       delta_window_size: Sliding window size for searching for deltas;
                          Set to None for default window size.
       deltify: Whether to deltify objects
+      reuse_pack: Pack object we can search for objects to reuse
       compression_level: the zlib compression level to use
     Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
     """
@@ -1705,7 +1746,7 @@ def write_pack_objects(
         # slow at the moment.
         deltify = False
     if deltify:
-        pack_contents = deltify_pack_objects(objects, delta_window_size)
+        pack_contents = deltify_pack_objects(objects, delta_window_size, reuse_pack)
         pack_contents_count = len(objects)
     else:
         pack_contents_count, pack_contents = pack_objects_to_data(objects)
@@ -2172,6 +2213,22 @@ class Pack(object):
         type_num, chunks = self.resolve_object(offset, obj_type, obj)
         return type_num, b"".join(chunks)
 
+    def get_raw_delta(self, sha1):
+        """Get raw decompressed delta data chunks for a given SHA1.
+        Convert OFS_DELTA objects to REF_DELTA objects, like get_raw_unresolved() does.
+
+        Args:
+          sha1: SHA to return data for
+        Returns: Tuple with pack object type, delta base (if applicable),
+            list of data chunks
+        """
+        offset = self.index.object_index(sha1)
+        (obj_type, delta_base, chunks) = self.data.get_decompressed_data_at(offset)
+        if obj_type == OFS_DELTA:
+            delta_base = sha_to_hex(self.index.object_sha1(offset - delta_base))
+            obj_type = REF_DELTA
+        return (obj_type, delta_base, chunks)
+
     def __getitem__(self, sha1):
         """Retrieve the specified SHA1."""
         type, uncomp = self.get_raw(sha1)
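
Taken together, ``get_decompressed_data_at`` and ``get_raw_delta`` let ``deltify_pack_objects`` copy an existing delta out of a source pack whenever its REF_DELTA base is also being packed. A rough sketch of the reuse path (the pack basename is illustrative, and the first index entry is used only for demonstration):

    from dulwich.pack import Pack, REF_DELTA

    pack = Pack("pack-0123abcd")  # basename, without .pack/.idx
    sha = next(iter(pack.index.iterentries()))[0]  # some binary SHA1
    obj_type, delta_base, chunks = pack.get_raw_delta(sha)
    if obj_type == REF_DELTA:
        # `chunks` is the decompressed delta payload; it can be written
        # into a new pack verbatim with `delta_base` as its base.
        print(len(b"".join(chunks)), "delta bytes against base", delta_base)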

+ 34 - 1
dulwich/porcelain.py

@@ -407,6 +407,18 @@ def symbolic_ref(repo, ref_name, force=False):
         repo_obj.refs.set_symbolic_ref(b"HEAD", ref_path)
 
 
+def pack_refs(repo, all=False):
+    with open_repo_closing(repo) as repo_obj:
+        refs = repo_obj.refs
+        packed_refs = {
+            ref: refs[ref]
+            for ref in refs
+            if (all or ref.startswith(LOCAL_TAG_PREFIX)) and ref != b"HEAD"
+        }
+
+        refs.add_packed_refs(packed_refs)
+
+
 def commit(
     repo=".",
     message=None,
@@ -1741,7 +1753,19 @@ def repack(repo):
         r.object_store.pack_loose_objects()
 
 
-def pack_objects(repo, object_ids, packf, idxf, delta_window_size=None):
+def find_pack_for_reuse(repo):
+    reuse_pack = None
+    max_pack_len = 0
+    # The pack file which contains the largest number of objects
+    # will be most suitable for object reuse.
+    for p in repo.object_store.packs:
+        if len(p) > max_pack_len:
+            reuse_pack = p
+            max_pack_len = len(reuse_pack)
+    return reuse_pack
+
+
+def pack_objects(repo, object_ids, packf, idxf, delta_window_size=None, deltify=None, reuse_deltas=True):
     """Pack objects into a file.
 
     Args:
@@ -1749,12 +1773,21 @@ def pack_objects(repo, object_ids, packf, idxf, delta_window_size=None):
       object_ids: List of object ids to write
       packf: File-like object to write to
       idxf: File-like object to write to (can be None)
+      delta_window_size: Sliding window size for searching for deltas;
+                         Set to None for default window size.
+      deltify: Whether to deltify objects
+      reuse_deltas: Allow reuse of existing deltas while deltifying
     """
     with open_repo_closing(repo) as r:
+        reuse_pack = None
+        if deltify and reuse_deltas:
+            reuse_pack = find_pack_for_reuse(r)
         entries, data_sum = write_pack_objects(
             packf.write,
             r.object_store.iter_shas((oid, None) for oid in object_ids),
+            deltify=deltify,
             delta_window_size=delta_window_size,
+            reuse_pack=reuse_pack
         )
     if idxf is not None:
         entries = sorted([(k, v[0], v[1]) for (k, v) in entries.items()])
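
A minimal sketch of driving the new options from Python (the repository path and the choice of object IDs are illustrative; a real caller would normally compute a reachability walk rather than packing everything):

    from dulwich import porcelain
    from dulwich.repo import Repo

    repo = Repo(".")
    object_ids = list(repo.object_store)  # every object, for illustration
    with open("out.pack", "wb") as packf, open("out.idx", "wb") as idxf:
        porcelain.pack_objects(
            ".", object_ids, packf, idxf,
            deltify=True,        # opt in to deltification
            reuse_deltas=True)   # reuse deltas from the largest existing pack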

+ 57 - 3
dulwich/refs.py

@@ -155,6 +155,15 @@ class RefsContainer(object):
         """
         raise NotImplementedError(self.get_packed_refs)
 
+    def add_packed_refs(self, new_refs: Dict[Ref, Optional[ObjectID]]):
+        """Add the given refs as packed refs.
+
+        Args:
+          new_refs: A mapping of ref names to targets; if a target is None that
+            means remove the ref
+        """
+        raise NotImplementedError(self.add_packed_refs)
+
     def get_peeled(self, name):
         """Return the cached peeled value of a ref, if available.
 
@@ -706,6 +715,44 @@ class DiskRefsContainer(RefsContainer):
                         self._packed_refs[name] = sha
         return self._packed_refs
 
+    def add_packed_refs(self, new_refs: Dict[Ref, Optional[ObjectID]]):
+        """Add the given refs as packed refs.
+
+        Args:
+          new_refs: A mapping of ref names to targets; if a target is None that
+            means remove the ref
+        """
+        if not new_refs:
+            return
+
+        path = os.path.join(self.path, b"packed-refs")
+
+        with GitFile(path, "wb") as f:
+            # reread cached refs from disk, while holding the lock
+            packed_refs = self.get_packed_refs().copy()
+
+            for ref, target in new_refs.items():
+                # sanity check
+                if ref == HEADREF:
+                    raise ValueError("cannot pack HEAD")
+
+                # remove any loose refs pointing to this one -- please
+                # note that this bypasses remove_if_equals as we don't
+                # want to affect packed refs in here
+                try:
+                    os.remove(self.refpath(ref))
+                except OSError:
+                    pass
+
+                if target is not None:
+                    packed_refs[ref] = target
+                else:
+                    packed_refs.pop(ref, None)
+
+            write_packed_refs(f, packed_refs, self._peeled_refs)
+
+            self._packed_refs = packed_refs
+
     def get_peeled(self, name):
         """Return the cached peeled value of a ref, if available.
 
@@ -748,7 +795,10 @@ class DiskRefsContainer(RefsContainer):
                 else:
                     # Read only the first 40 bytes
                     return header + f.read(40 - len(SYMREF))
-        except (FileNotFoundError, IsADirectoryError, NotADirectoryError):
+        except (OSError, UnicodeError):
+            # don't assume anything specific about the error; in
+            # particular, invalid or forbidden paths can raise weird
+            # errors depending on the specific operating system
             return None
 
     def _remove_packed_ref(self, name):
@@ -965,9 +1015,13 @@ class DiskRefsContainer(RefsContainer):
 
             # remove the reference file itself
             try:
+                found = os.path.lexists(filename)
+            except OSError:
+                # may only be packed, or otherwise unstorable
+                found = False
+
+            if found:
                 os.remove(filename)
-            except FileNotFoundError:
-                pass  # may only be packed
 
             self._remove_packed_ref(name)
             self._log(
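
For illustration, a sketch of the new ``add_packed_refs`` API against an on-disk repository (the path, ref name, and SHA are placeholders):

    from dulwich.repo import Repo

    refs = Repo(".").refs  # a DiskRefsContainer for an on-disk repo

    # Move a loose branch into packed-refs; the loose ref file is removed.
    refs.add_packed_refs({
        b"refs/heads/example": b"42d06bd4b77fed026b154d16493e5deab78f02ec",
    })

    # A None target drops the ref from packed-refs instead.
    refs.add_packed_refs({b"refs/heads/example": None})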

+ 37 - 0
dulwich/tests/test_porcelain.py

@@ -3140,6 +3140,43 @@ class FindUniqueAbbrevTests(PorcelainTestCase):
             porcelain.find_unique_abbrev(self.repo.object_store, c1.id))
 
 
+class PackRefsTests(PorcelainTestCase):
+    def test_all(self):
+        c1, c2, c3 = build_commit_graph(
+            self.repo.object_store, [[1], [2, 1], [3, 1, 2]]
+        )
+        self.repo.refs[b"HEAD"] = c3.id
+        self.repo.refs[b"refs/heads/master"] = c2.id
+        self.repo.refs[b"refs/tags/foo"] = c1.id
+
+        porcelain.pack_refs(self.repo, all=True)
+
+        self.assertEqual(
+            self.repo.refs.get_packed_refs(),
+            {
+                b"refs/heads/master": c2.id,
+                b"refs/tags/foo": c1.id,
+            },
+        )
+
+    def test_not_all(self):
+        c1, c2, c3 = build_commit_graph(
+            self.repo.object_store, [[1], [2, 1], [3, 1, 2]]
+        )
+        self.repo.refs[b"HEAD"] = c3.id
+        self.repo.refs[b"refs/heads/master"] = c2.id
+        self.repo.refs[b"refs/tags/foo"] = c1.id
+
+        porcelain.pack_refs(self.repo)
+
+        self.assertEqual(
+            self.repo.refs.get_packed_refs(),
+            {
+                b"refs/tags/foo": c1.id,
+            },
+        )
+
+
 class ServerTests(PorcelainTestCase):
     @contextlib.contextmanager
     def _serving(self):

+ 60 - 0
dulwich/tests/test_refs.py

@@ -447,6 +447,66 @@ class DiskRefsContainerTests(RefsContainerTests, TestCase):
             b"42d06bd4b77fed026b154d16493e5deab78f02ec",
         )
 
+        # this shouldn't overwrite the packed refs
+        self.assertEqual(
+            {b"refs/heads/packed": b"42d06bd4b77fed026b154d16493e5deab78f02ec"},
+            self._refs.get_packed_refs(),
+        )
+
+    def test_add_packed_refs(self):
+        # first, create a non-packed ref
+        self._refs[b"refs/heads/packed"] = b"3ec9c43c84ff242e3ef4a9fc5bc111fd780a76a8"
+
+        packed_ref_path = os.path.join(self._refs.path, b"refs", b"heads", b"packed")
+        self.assertTrue(os.path.exists(packed_ref_path))
+
+        # now overwrite that with a packed ref
+        packed_refs_file_path = os.path.join(self._refs.path, b"packed-refs")
+        self._refs.add_packed_refs(
+            {
+                b"refs/heads/packed": b"42d06bd4b77fed026b154d16493e5deab78f02ec",
+            }
+        )
+
+        # that should kill the file
+        self.assertFalse(os.path.exists(packed_ref_path))
+
+        # now delete the packed ref
+        self._refs.add_packed_refs(
+            {
+                b"refs/heads/packed": None,
+            }
+        )
+
+        # and it's gone!
+        self.assertFalse(os.path.exists(packed_ref_path))
+
+        self.assertRaises(
+            KeyError,
+            self._refs.__getitem__,
+            b"refs/heads/packed",
+        )
+
+        # just in case, make sure we can't pack HEAD
+        self.assertRaises(
+            ValueError,
+            self._refs.add_packed_refs,
+            {b"HEAD": "02ac81614bcdbd585a37b4b0edf8cb8a"},
+        )
+
+        # delete all packed refs
+        self._refs.add_packed_refs({ref: None for ref in self._refs.get_packed_refs()})
+
+        self.assertEqual({}, self._refs.get_packed_refs())
+
+        # remove the packed ref file, and check that adding nothing doesn't affect that
+        os.remove(packed_refs_file_path)
+
+        # adding nothing doesn't make it reappear
+        self._refs.add_packed_refs({})
+
+        self.assertFalse(os.path.exists(packed_refs_file_path))
+
     def test_setitem_symbolic(self):
         ones = b"1" * 40
         self._refs[b"HEAD"] = ones

+ 0 - 1
setup.cfg

@@ -14,7 +14,6 @@ keywords = vcs, git
 classifiers =
     Development Status :: 4 - Beta
     License :: OSI Approved :: Apache Software License
-    Programming Language :: Python :: 3.6
     Programming Language :: Python :: 3.7
     Programming Language :: Python :: 3.8
     Programming Language :: Python :: 3.9