فهرست منبع

Add prune method to clean up orphaned temporary pack files

This implements cleanup of orphaned temporary pack files left behind by
interrupted git operations. The implementation follows Git's design:

- Add prune() method to BaseObjectStore and DiskObjectStore
- Prune removes tmp_pack_* files and orphaned .pack files without .idx
- Default grace period of 2 weeks matches Git's behavior
- garbage_collect() now calls prune() to include cleanup in gc operations

Fixes #558
Jelmer Vernooij 1 ماه پیش
والد
کامیت
f4f96dde7d
4 فایلهای تغییر یافته به همراه 177 افزوده شده و 0 حذف شده
  1. 4 0
      NEWS
  2. 6 0
      dulwich/gc.py
  3. 76 0
      dulwich/object_store.py
  4. 91 0
      tests/test_object_store.py

+ 4 - 0
NEWS

@@ -44,6 +44,10 @@
 
  * Support timeouts for HTTP client operations.  (Jelmer Vernooij)
 
+ * Add ``prune`` method to object stores for cleaning up orphaned temporary
+   pack files. This is now called by ``garbage_collect()`` to match Git's
+   behavior. (#558, Jelmer Vernooij)
+
 0.23.0	2025-06-21
 
  * Add basic ``rebase`` subcommand. (Jelmer Vernooij)

+ 6 - 0
dulwich/gc.py

@@ -285,6 +285,12 @@ def garbage_collect(
             # Normal repack
             object_store.repack()
 
+    # Prune orphaned temporary files
+    if progress:
+        progress("Pruning temporary files")
+    if not dry_run:
+        object_store.prune(grace_period=grace_period)
+
     # Count final state
     stats.packs_after = len(list(object_store.packs))
     # TODO: Count loose objects when we have a method for it

+ 76 - 0
dulwich/object_store.py

@@ -27,6 +27,7 @@ import binascii
 import os
 import stat
 import sys
+import time
 import warnings
 from collections.abc import Iterable, Iterator, Sequence
 from contextlib import suppress
@@ -89,6 +90,10 @@ PACKDIR = "pack"
 # would require some rather significant adjustments to the test suite
 PACK_MODE = 0o444 if sys.platform != "win32" else 0o644
 
+# Grace period for cleaning up temporary pack files (in seconds)
+# Matches git's default of 2 weeks
+DEFAULT_TEMPFILE_GRACE_PERIOD = 14 * 24 * 60 * 60  # 2 weeks
+
 
 def find_shallow(store, heads, depth):
     """Find shallow commits according to a given depth.
@@ -417,6 +422,18 @@ class BaseObjectStore:
         """Close any files opened by this object store."""
         # Default implementation is a NO-OP
 
+    def prune(self, grace_period: Optional[int] = None) -> None:
+        """Prune/clean up this object store.
+
+        Intended for removing orphaned temporary files and other
+        housekeeping tasks. This base implementation does nothing;
+        stores that keep files on disk override it to do the actual
+        cleanup.
+
+        Args:
+          grace_period: Grace period in seconds for removing temporary files.
+                       If None, uses the default grace period.
+        """
+        # Default implementation is a NO-OP
+
     def iter_prefix(self, prefix: bytes) -> Iterator[ObjectID]:
         """Iterate over all SHA1s that start with a given prefix.
 
@@ -852,6 +869,9 @@ class PackBasedObjectStore(BaseObjectStore, PackedObjectContainer):
 class DiskObjectStore(PackBasedObjectStore):
     """Git-style object store that exists on disk."""
 
+    path: Union[str, os.PathLike]
+    pack_dir: Union[str, os.PathLike]
+
     def __init__(
         self,
         path: Union[str, os.PathLike],
@@ -1326,6 +1346,62 @@ class DiskObjectStore(PackBasedObjectStore):
             # Clear cached commit graph so it gets reloaded
             self._commit_graph = None
 
+    def prune(self, grace_period: Optional[int] = None) -> None:
+        """Prune/clean up this object store.
+
+        Removes stale files that were left behind by interrupted pack
+        operations:
+
+        * files whose name starts with ``tmp_pack_`` directly inside the
+          object store directory (``self.path``), and
+        * ``.pack`` files in the pack directory (``self.pack_dir``) that have
+          no ``.idx`` file with the same base name.
+
+        A file is only removed once its modification time lies more than
+        ``grace_period`` seconds in the past. Failures to stat or remove an
+        individual file are ignored so that one bad entry does not abort the
+        cleanup. A missing pack directory means there is nothing to prune
+        there.
+
+        Args:
+          grace_period: Grace period in seconds for removing temporary files.
+                       If None, uses DEFAULT_TEMPFILE_GRACE_PERIOD.
+        """
+        import glob
+
+        if grace_period is None:
+            grace_period = DEFAULT_TEMPFILE_GRACE_PERIOD
+
+        # Files last modified before this instant are old enough to remove.
+        cutoff = time.time() - grace_period
+
+        def remove_if_stale(file_path) -> None:
+            # Best effort: the file may vanish or be inaccessible meanwhile.
+            try:
+                if os.path.getmtime(file_path) < cutoff:
+                    os.remove(file_path)
+            except OSError:
+                pass
+
+        # Clean up tmp_pack_* files in the object store directory. The
+        # directory part is escaped so that glob metacharacters in the store
+        # path (e.g. "[" or "*") are matched literally instead of being
+        # interpreted as part of the pattern.
+        pattern = os.path.join(glob.escape(os.fspath(self.path)), "tmp_pack_*")
+        for tmp_file in glob.glob(pattern):
+            remove_if_stale(tmp_file)
+
+        # Clean up orphaned .pack files without corresponding .idx files
+        try:
+            pack_dir_contents = os.listdir(self.pack_dir)
+        except FileNotFoundError:
+            return
+
+        # Base names (without extension) of every index present.
+        idx_bases = {
+            name[: -len(".idx")]
+            for name in pack_dir_contents
+            if name.endswith(".idx")
+        }
+
+        for name in pack_dir_contents:
+            if not name.endswith(".pack"):
+                continue
+            if name[: -len(".pack")] not in idx_bases:
+                remove_if_stale(os.path.join(self.pack_dir, name))
+
 
 class MemoryObjectStore(BaseObjectStore):
     """Object store that keeps all objects in memory."""

+ 91 - 0
tests/test_object_store.py

@@ -415,6 +415,97 @@ class DiskObjectStoreTests(PackBasedObjectStoreTests, TestCase):
         idx2 = load_pack_index(idx_path2)
         self.assertEqual(3, idx2.version)
 
+    def test_prune_orphaned_tempfiles(self) -> None:
+        """Old orphaned pack files are pruned while a recent one is kept."""
+        import time
+
+        from dulwich.object_store import DEFAULT_TEMPFILE_GRACE_PERIOD
+
+        def write_file(target: str, payload: bytes) -> None:
+            with open(target, "wb") as fh:
+                fh.write(payload)
+
+        # Stale candidates: a tmp_pack_* file in the store directory and a
+        # .pack file without a matching .idx in the pack directory.
+        stale_tmp = os.path.join(self.store_dir, "tmp_pack_test123")
+        stale_pack = os.path.join(
+            os.path.join(self.store_dir, "pack"), "pack-orphaned.pack"
+        )
+        write_file(stale_tmp, b"temporary pack data")
+        write_file(stale_pack, b"orphaned pack data")
+
+        # Backdate both to one hour beyond the default grace period.
+        backdated = time.time() - (DEFAULT_TEMPFILE_GRACE_PERIOD + 3600)
+        for stale in (stale_tmp, stale_pack):
+            os.utime(stale, (backdated, backdated))
+
+        # A freshly written temporary file must survive the prune.
+        fresh_tmp = os.path.join(self.store_dir, "tmp_pack_recent")
+        write_file(fresh_tmp, b"recent temp data")
+
+        self.store.prune()
+
+        # The backdated files are gone ...
+        self.assertFalse(os.path.exists(stale_tmp))
+        self.assertFalse(os.path.exists(stale_pack))
+
+        # ... and the fresh one is still there.
+        self.assertTrue(os.path.exists(fresh_tmp))
+
+        # Cleanup the recent file
+        os.remove(fresh_tmp)
+
+    def test_prune_with_custom_grace_period(self) -> None:
+        """Test that prune honours an explicitly passed grace period."""
+        import time
+
+        one_hour = 3600
+        half_hour = 1800
+
+        # Write a temporary file and backdate it by one hour.
+        hour_old_tmp = os.path.join(self.store_dir, "tmp_pack_1hour")
+        with open(hour_old_tmp, "wb") as fh:
+            fh.write(b"1 hour old data")
+        stamp = time.time() - one_hour
+        os.utime(hour_old_tmp, (stamp, stamp))
+
+        # The default grace period (2 weeks) is far longer than one hour,
+        # so the file must be kept.
+        self.store.prune()
+        self.assertTrue(os.path.exists(hour_old_tmp))
+
+        # A 30 minute grace period is shorter than the file's age,
+        # so the file must be removed.
+        self.store.prune(grace_period=half_hour)
+        self.assertFalse(os.path.exists(hour_old_tmp))
+
+    def test_gc_prunes_tempfiles(self) -> None:
+        """Test that garbage collection prunes temporary files."""
+        import time
+
+        from dulwich.gc import garbage_collect
+        from dulwich.object_store import DEFAULT_TEMPFILE_GRACE_PERIOD
+        from dulwich.repo import Repo
+
+        # Create a repository with the store
+        repo = Repo.init(self.store_dir)
+        # Close the repository even if an assertion below fails, so no
+        # open file handles outlive the test.
+        self.addCleanup(repo.close)
+
+        # Create an old orphaned temporary file in the objects directory
+        tmp_pack_path = os.path.join(repo.object_store.path, "tmp_pack_old")
+        with open(tmp_pack_path, "wb") as f:
+            f.write(b"old temporary data")
+
+        # Make it old (older than grace period)
+        old_time = time.time() - (
+            DEFAULT_TEMPFILE_GRACE_PERIOD + 3600
+        )  # grace period + 1 hour
+        os.utime(tmp_pack_path, (old_time, old_time))
+
+        # Run garbage collection
+        garbage_collect(repo)
+
+        # Verify the orphaned file was cleaned up
+        self.assertFalse(os.path.exists(tmp_pack_path))
+
 
 class TreeLookupPathTests(TestCase):
     def setUp(self) -> None: