浏览代码

Add prune method to clean up orphaned temporary pack files

This implements cleanup of orphaned temporary pack files left behind by
interrupted git operations. The implementation follows Git's design:

- Add prune() method to BaseObjectStore and DiskObjectStore
- Prune removes tmp_pack_* files and orphaned .pack files without .idx
- Default grace period of 2 weeks matches Git's behavior
- garbage_collect() now calls prune() to include cleanup in gc operations

Fixes #558
Jelmer Vernooij 1 月之前
父节点
当前提交
f4f96dde7d
共有 4 个文件被更改，包括 177 次插入和 0 次删除
  1. 4 0
      NEWS
  2. 6 0
      dulwich/gc.py
  3. 76 0
      dulwich/object_store.py
  4. 91 0
      tests/test_object_store.py

+ 4 - 0
NEWS

@@ -44,6 +44,10 @@
 
  * Support timeouts for HTTP client operations.  (Jelmer Vernooij)
 
+ * Add ``prune`` method to object stores for cleaning up orphaned temporary
+   pack files. This is now called by ``garbage_collect()`` to match Git's
+   behavior. (#558, Jelmer Vernooij)
+
 0.23.0	2025-06-21
 
  * Add basic ``rebase`` subcommand. (Jelmer Vernooij)

+ 6 - 0
dulwich/gc.py

@@ -285,6 +285,12 @@ def garbage_collect(
             # Normal repack
             object_store.repack()
 
+    # Prune orphaned temporary files
+    if progress:
+        progress("Pruning temporary files")
+    if not dry_run:
+        object_store.prune(grace_period=grace_period)
+
     # Count final state
     stats.packs_after = len(list(object_store.packs))
     # TODO: Count loose objects when we have a method for it

+ 76 - 0
dulwich/object_store.py

@@ -27,6 +27,7 @@ import binascii
 import os
 import stat
 import sys
+import time
 import warnings
 from collections.abc import Iterable, Iterator, Sequence
 from contextlib import suppress
@@ -89,6 +90,10 @@ PACKDIR = "pack"
 # would require some rather significant adjustments to the test suite
 PACK_MODE = 0o444 if sys.platform != "win32" else 0o644
 
+# Grace period for cleaning up temporary pack files (in seconds)
+# Matches git's default of 2 weeks
+DEFAULT_TEMPFILE_GRACE_PERIOD = 14 * 24 * 60 * 60  # 2 weeks
+
 
 def find_shallow(store, heads, depth):
     """Find shallow commits according to a given depth.
@@ -417,6 +422,18 @@ class BaseObjectStore:
         """Close any files opened by this object store."""
         # Default implementation is a NO-OP
 
+    def prune(self, grace_period: Optional[int] = None) -> None:
+        """Perform housekeeping on this object store.
+
+        Intended for removing orphaned temporary files and similar
+        clean-up work. The base implementation does nothing.
+
+        Args:
+          grace_period: Grace period in seconds for removing temporary files.
+                       If None, the default grace period is used.
+        """
+        return None
+
     def iter_prefix(self, prefix: bytes) -> Iterator[ObjectID]:
         """Iterate over all SHA1s that start with a given prefix.
 
@@ -852,6 +869,9 @@ class PackBasedObjectStore(BaseObjectStore, PackedObjectContainer):
 class DiskObjectStore(PackBasedObjectStore):
     """Git-style object store that exists on disk."""
 
+    path: Union[str, os.PathLike]
+    pack_dir: Union[str, os.PathLike]
+
     def __init__(
         self,
         path: Union[str, os.PathLike],
@@ -1326,6 +1346,62 @@ class DiskObjectStore(PackBasedObjectStore):
             # Clear cached commit graph so it gets reloaded
             self._commit_graph = None
 
+    def prune(self, grace_period: Optional[int] = None) -> None:
+        """Prune/clean up this object store.
+
+        Removes temporary files left behind by interrupted pack operations:
+
+        * files named ``tmp_pack_*`` directly inside ``self.path``
+        * ``.pack`` files in ``self.pack_dir`` with no matching ``.idx`` file
+
+        A file is only removed when its modification time lies more than the
+        grace period in the past. Errors while inspecting or removing an
+        individual file are ignored, and a missing pack directory is treated
+        as having nothing to clean up.
+
+        Args:
+          grace_period: Grace period in seconds for removing temporary files.
+                       If None, uses DEFAULT_TEMPFILE_GRACE_PERIOD.
+        """
+        import glob
+
+        if grace_period is None:
+            grace_period = DEFAULT_TEMPFILE_GRACE_PERIOD
+
+        # Anything last modified before this instant is considered stale
+        cutoff = time.time() - grace_period
+
+        def remove_if_stale(file_path) -> None:
+            # Best effort: the file may disappear or be inaccessible
+            try:
+                if os.path.getmtime(file_path) < cutoff:
+                    os.remove(file_path)
+            except OSError:
+                pass
+
+        # Escape the directory part so that glob metacharacters in the store
+        # path (e.g. "[" or "*") are matched literally instead of silently
+        # making the pattern match nothing
+        store_dir = glob.escape(os.fspath(self.path))
+        for tmp_file in glob.glob(os.path.join(store_dir, "tmp_pack_*")):
+            remove_if_stale(tmp_file)
+
+        # Clean up orphaned .pack files without corresponding .idx files
+        try:
+            pack_dir_contents = os.listdir(self.pack_dir)
+        except FileNotFoundError:
+            return
+
+        # Base names (without extension) of every index that is present
+        indexed = {
+            name[: -len(".idx")]
+            for name in pack_dir_contents
+            if name.endswith(".idx")
+        }
+
+        for name in pack_dir_contents:
+            if name.endswith(".pack") and name[: -len(".pack")] not in indexed:
+                remove_if_stale(os.path.join(self.pack_dir, name))
+
 
 class MemoryObjectStore(BaseObjectStore):
     """Object store that keeps all objects in memory."""

+ 91 - 0
tests/test_object_store.py

@@ -415,6 +415,97 @@ class DiskObjectStoreTests(PackBasedObjectStoreTests, TestCase):
         idx2 = load_pack_index(idx_path2)
         self.assertEqual(3, idx2.version)
 
+    def test_prune_orphaned_tempfiles(self) -> None:
+        """Stale orphaned files are pruned while a fresh temp file survives."""
+        import time
+
+        from dulwich.object_store import DEFAULT_TEMPFILE_GRACE_PERIOD
+
+        def make_file(path: str, payload: bytes) -> str:
+            with open(path, "wb") as handle:
+                handle.write(payload)
+            return path
+
+        # Orphaned temporary pack file directly in the store directory
+        stale_tmp = make_file(
+            os.path.join(self.store_dir, "tmp_pack_test123"),
+            b"temporary pack data",
+        )
+        # Orphaned .pack file with no matching .idx in the pack directory
+        stale_pack = make_file(
+            os.path.join(self.store_dir, "pack", "pack-orphaned.pack"),
+            b"orphaned pack data",
+        )
+
+        # Backdate both to one hour beyond the default grace period
+        backdated = time.time() - (DEFAULT_TEMPFILE_GRACE_PERIOD + 3600)
+        for stale in (stale_tmp, stale_pack):
+            os.utime(stale, (backdated, backdated))
+
+        # A freshly written temporary file must be left alone
+        fresh_tmp = make_file(
+            os.path.join(self.store_dir, "tmp_pack_recent"),
+            b"recent temp data",
+        )
+
+        self.store.prune()
+
+        for stale in (stale_tmp, stale_pack):
+            self.assertFalse(os.path.exists(stale))
+        self.assertTrue(os.path.exists(fresh_tmp))
+
+        # Cleanup the recent file
+        os.remove(fresh_tmp)
+
+    def test_prune_with_custom_grace_period(self) -> None:
+        """A caller-supplied grace period overrides the default."""
+        import time
+
+        target = os.path.join(self.store_dir, "tmp_pack_1hour")
+        with open(target, "wb") as handle:
+            handle.write(b"1 hour old data")
+
+        # Backdate the file by one hour
+        one_hour_ago = time.time() - 3600
+        os.utime(target, (one_hour_ago, one_hour_ago))
+
+        # One hour is well inside the default two-week grace period
+        self.store.prune()
+        self.assertTrue(os.path.exists(target))
+
+        # ...but it exceeds a 30 minute grace period
+        self.store.prune(grace_period=30 * 60)
+        self.assertFalse(os.path.exists(target))
+
+    def test_gc_prunes_tempfiles(self) -> None:
+        """Test that garbage collection prunes temporary files."""
+        import time
+
+        from dulwich.gc import garbage_collect
+        from dulwich.object_store import DEFAULT_TEMPFILE_GRACE_PERIOD
+        from dulwich.repo import Repo
+
+        # Create a repository with the store; release its resources even
+        # when an assertion below fails
+        repo = Repo.init(self.store_dir)
+        self.addCleanup(repo.close)
+
+        # Create an old orphaned temporary file in the objects directory
+        tmp_pack_path = os.path.join(repo.object_store.path, "tmp_pack_old")
+        with open(tmp_pack_path, "wb") as f:
+            f.write(b"old temporary data")
+
+        # Make it old: one hour beyond the default grace period
+        old_time = time.time() - (DEFAULT_TEMPFILE_GRACE_PERIOD + 3600)
+        os.utime(tmp_pack_path, (old_time, old_time))
+
+        # Run garbage collection
+        garbage_collect(repo)
+
+        # Verify the orphaned file was cleaned up
+        self.assertFalse(os.path.exists(tmp_pack_path))
+
 
 class TreeLookupPathTests(TestCase):
     def setUp(self) -> None: