瀏覽代碼

Add prune command (#1622)

Fixes #558
Jelmer Vernooij 1 月之前
父節點
當前提交
10512297dc
共有 7 個文件被更改,包括 344 次插入和 0 次删除
  1. 5 0
      NEWS
  2. 63 0
      dulwich/cli.py
  3. 6 0
      dulwich/gc.py
  4. 76 0
      dulwich/object_store.py
  5. 26 0
      dulwich/porcelain.py
  6. 91 0
      tests/test_object_store.py
  7. 77 0
      tests/test_porcelain.py

+ 5 - 0
NEWS

@@ -44,6 +44,11 @@
 
  * Support timeouts for HTTP client operations.  (Jelmer Vernooij)
 
+ * Add ``prune`` method to object stores for cleaning up orphaned temporary
+   pack files. This is now called by ``garbage_collect()`` to match Git's
+   behavior. Also add a ``prune`` function to ``dulwich.porcelain`` and a
+   ``prune`` command to the command-line interface.
+   (Jelmer Vernooij, #558)
+
 0.23.0	2025-06-21
 
  * Add basic ``rebase`` subcommand. (Jelmer Vernooij)

+ 63 - 0
dulwich/cli.py

@@ -761,6 +761,68 @@ class cmd_unpack_objects(Command):
         print(f"Unpacked {count} objects")
 
 
+class cmd_prune(Command):
+    """CLI subcommand that prunes leftover temporary pack files."""
+
+    def run(self, args) -> Optional[int]:
+        """Parse the command line and prune the repository in ".".
+
+        Args:
+          args: Command-line arguments following the subcommand name.
+
+        Returns:
+          1 if the ``--expire`` value cannot be parsed or ``porcelain.prune``
+          raises ``porcelain.Error``; None otherwise.
+        """
+        import datetime
+        import time
+
+        from dulwich.object_store import DEFAULT_TEMPFILE_GRACE_PERIOD
+
+        parser = argparse.ArgumentParser(
+            description="Remove temporary pack files left behind by interrupted operations"
+        )
+        parser.add_argument(
+            "--expire",
+            nargs="?",
+            const="2.weeks.ago",
+            help="Only prune files older than the specified date (default: 2.weeks.ago)",
+        )
+        parser.add_argument(
+            "--dry-run", "-n", action="store_true", help="Only report what would be removed"
+        )
+        parser.add_argument(
+            "--verbose", "-v", action="store_true", help="Report all actions"
+        )
+        options = parser.parse_args(args)
+
+        # Work out the grace period in seconds. A relative expression is
+        # tried first; an absolute YYYY-MM-DD date is the fallback and is
+        # converted to "seconds between that date and now".
+        if not options.expire:
+            grace_period = DEFAULT_TEMPFILE_GRACE_PERIOD
+        else:
+            try:
+                grace_period = parse_relative_time(options.expire)
+            except ValueError:
+                try:
+                    expire_date = datetime.datetime.strptime(
+                        options.expire, "%Y-%m-%d"
+                    )
+                    grace_period = int(time.time() - expire_date.timestamp())
+                except ValueError:
+                    print(
+                        f"Error: Invalid expire date: {options.expire}",
+                        file=sys.stderr,
+                    )
+                    return 1
+
+        # Messages are only shown in verbose mode; otherwise no callback is
+        # handed to the porcelain layer at all.
+        def report(msg):
+            print(msg)
+
+        try:
+            porcelain.prune(
+                ".",
+                grace_period=grace_period,
+                dry_run=options.dry_run,
+                progress=report if options.verbose else None,
+            )
+        except porcelain.Error as e:
+            print(f"Error: {e}", file=sys.stderr)
+            return 1
+        return None
+
+
 class cmd_pull(Command):
     def run(self, args) -> None:
         parser = argparse.ArgumentParser()
@@ -1491,6 +1553,7 @@ commands = {
     "notes": cmd_notes,
     "pack-objects": cmd_pack_objects,
     "pack-refs": cmd_pack_refs,
+    "prune": cmd_prune,
     "pull": cmd_pull,
     "push": cmd_push,
     "rebase": cmd_rebase,

+ 6 - 0
dulwich/gc.py

@@ -285,6 +285,12 @@ def garbage_collect(
             # Normal repack
             object_store.repack()
 
+    # Prune orphaned temporary files
+    if progress:
+        progress("Pruning temporary files")
+    if not dry_run:
+        object_store.prune(grace_period=grace_period)
+
     # Count final state
     stats.packs_after = len(list(object_store.packs))
     # TODO: Count loose objects when we have a method for it

+ 76 - 0
dulwich/object_store.py

@@ -27,6 +27,7 @@ import binascii
 import os
 import stat
 import sys
+import time
 import warnings
 from collections.abc import Iterable, Iterator, Sequence
 from contextlib import suppress
@@ -89,6 +90,10 @@ PACKDIR = "pack"
 # would require some rather significant adjustments to the test suite
 PACK_MODE = 0o444 if sys.platform != "win32" else 0o644
 
+# Grace period for cleaning up temporary pack files (in seconds)
+# Matches git's default of 2 weeks
+DEFAULT_TEMPFILE_GRACE_PERIOD = 14 * 24 * 60 * 60  # 2 weeks
+
 
 def find_shallow(store, heads, depth):
     """Find shallow commits according to a given depth.
@@ -417,6 +422,18 @@ class BaseObjectStore:
         """Close any files opened by this object store."""
         # Default implementation is a NO-OP
 
+    def prune(self, grace_period: Optional[int] = None) -> None:
+        """Prune/clean up this object store.
+
+        Intended for removing orphaned temporary files and similar
+        housekeeping tasks. This base implementation does nothing; stores
+        that keep files on disk override it.
+
+        Args:
+          grace_period: Grace period in seconds for removing temporary files.
+                       If None, uses the default grace period.
+        """
+        # Default implementation is a NO-OP
+
     def iter_prefix(self, prefix: bytes) -> Iterator[ObjectID]:
         """Iterate over all SHA1s that start with a given prefix.
 
@@ -852,6 +869,9 @@ class PackBasedObjectStore(BaseObjectStore, PackedObjectContainer):
 class DiskObjectStore(PackBasedObjectStore):
     """Git-style object store that exists on disk."""
 
+    path: Union[str, os.PathLike]
+    pack_dir: Union[str, os.PathLike]
+
     def __init__(
         self,
         path: Union[str, os.PathLike],
@@ -1326,6 +1346,62 @@ class DiskObjectStore(PackBasedObjectStore):
             # Clear cached commit graph so it gets reloaded
             self._commit_graph = None
 
+    def prune(self, grace_period: Optional[int] = None) -> None:
+        """Prune/clean up this object store.
+
+        Removes files left behind by interrupted pack operations:
+
+        * entries whose name starts with ``tmp_pack_`` directly inside the
+          object store directory (``self.path``), and
+        * ``.pack`` files in ``self.pack_dir`` that have no ``.idx`` file
+          with the same base name.
+
+        A file is only removed when its modification time is more than
+        ``grace_period`` seconds in the past. Removal is best-effort: any
+        ``OSError`` while inspecting or deleting a file is ignored.
+
+        Args:
+          grace_period: Grace period in seconds for removing temporary files.
+                       If None, uses DEFAULT_TEMPFILE_GRACE_PERIOD.
+        """
+        if grace_period is None:
+            grace_period = DEFAULT_TEMPFILE_GRACE_PERIOD
+
+        # Use a single cutoff for the whole run so every file is judged
+        # against the same point in time.
+        cutoff = time.time() - grace_period
+
+        def remove_if_expired(path) -> None:
+            # Best-effort: the file may vanish concurrently or be
+            # unremovable; neither should abort the rest of the prune.
+            try:
+                if os.path.getmtime(path) < cutoff:
+                    os.remove(path)
+            except OSError:
+                pass
+
+        # Clean up tmp_pack_* files in the object store directory. The
+        # directory is listed and filtered by prefix rather than globbed, so
+        # that glob metacharacters ("[", "*", "?") in the store path are
+        # treated literally.
+        try:
+            store_entries = os.listdir(self.path)
+        except OSError:
+            store_entries = []
+        for name in store_entries:
+            if name.startswith("tmp_pack_"):
+                remove_if_expired(os.path.join(self.path, name))
+
+        # Clean up orphaned .pack files without corresponding .idx files
+        try:
+            pack_entries = os.listdir(self.pack_dir)
+        except FileNotFoundError:
+            return
+
+        indexed = {
+            name[: -len(".idx")] for name in pack_entries if name.endswith(".idx")
+        }
+        for name in pack_entries:
+            if name.endswith(".pack") and name[: -len(".pack")] not in indexed:
+                remove_if_expired(os.path.join(self.pack_dir, name))
+
 
 class MemoryObjectStore(BaseObjectStore):
     """Object store that keeps all objects in memory."""

+ 26 - 0
dulwich/porcelain.py

@@ -43,6 +43,7 @@ Currently implemented:
  * ls_tree
  * merge
  * merge_tree
+ * prune
  * pull
  * push
  * rm
@@ -3436,6 +3437,31 @@ def gc(
         )
 
 
+def prune(
+    repo,
+    grace_period: Optional[int] = None,
+    dry_run: bool = False,
+    progress=None,
+):
+    """Prune/clean up a repository's object store.
+
+    Delegates to the ``prune`` method of the repository's object store,
+    which removes temporary files left behind by interrupted pack
+    operations.
+
+    Args:
+      repo: Path to the repository or a Repo object
+      grace_period: Grace period in seconds for removing temporary files
+                    (default 2 weeks)
+      dry_run: If True, the object store is left untouched; only the
+               progress message is emitted
+      progress: Optional callback invoked with a status message
+    """
+    with open_repo_closing(repo) as r:
+        if progress:
+            progress("Pruning temporary files")
+        if dry_run:
+            # Nothing is modified in dry-run mode.
+            return
+        r.object_store.prune(grace_period=grace_period)
+
+
 def count_objects(repo=".", verbose=False) -> CountObjectsResult:
     """Count unpacked objects and their disk usage.
 

+ 91 - 0
tests/test_object_store.py

@@ -415,6 +415,97 @@ class DiskObjectStoreTests(PackBasedObjectStoreTests, TestCase):
         idx2 = load_pack_index(idx_path2)
         self.assertEqual(3, idx2.version)
 
+    def test_prune_orphaned_tempfiles(self) -> None:
+        """Old orphaned temp and pack files are pruned; a fresh temp file is kept."""
+        import time
+
+        from dulwich.object_store import DEFAULT_TEMPFILE_GRACE_PERIOD
+
+        # An mtime one hour beyond the default grace period
+        stale_mtime = time.time() - (DEFAULT_TEMPFILE_GRACE_PERIOD + 3600)
+
+        # Stale candidates: a tmp_pack_* file in the store directory and a
+        # .pack file without a matching .idx in the pack directory.
+        stale_tmp = os.path.join(self.store_dir, "tmp_pack_test123")
+        stale_pack = os.path.join(self.store_dir, "pack", "pack-orphaned.pack")
+        for path, payload in (
+            (stale_tmp, b"temporary pack data"),
+            (stale_pack, b"orphaned pack data"),
+        ):
+            with open(path, "wb") as f:
+                f.write(payload)
+            os.utime(path, (stale_mtime, stale_mtime))
+
+        # A freshly written temporary file that must survive pruning
+        fresh_tmp = os.path.join(self.store_dir, "tmp_pack_recent")
+        with open(fresh_tmp, "wb") as f:
+            f.write(b"recent temp data")
+
+        self.store.prune()
+
+        # The stale files are gone, the fresh one is still there
+        self.assertFalse(os.path.exists(stale_tmp))
+        self.assertFalse(os.path.exists(stale_pack))
+        self.assertTrue(os.path.exists(fresh_tmp))
+
+        # Remove the surviving file again
+        os.remove(fresh_tmp)
+
+    def test_prune_with_custom_grace_period(self) -> None:
+        """A one-hour-old temp file survives the default period but not 30 minutes."""
+        import time
+
+        hour_old_tmp = os.path.join(self.store_dir, "tmp_pack_1hour")
+        with open(hour_old_tmp, "wb") as f:
+            f.write(b"1 hour old data")
+
+        # Backdate the file by one hour
+        one_hour_ago = time.time() - 3600
+        os.utime(hour_old_tmp, (one_hour_ago, one_hour_ago))
+
+        # Default grace period (2 weeks): the file is too young to remove
+        self.store.prune()
+        self.assertTrue(os.path.exists(hour_old_tmp))
+
+        # 30-minute grace period: the file is now old enough
+        self.store.prune(grace_period=1800)
+        self.assertFalse(os.path.exists(hour_old_tmp))
+
+    def test_gc_prunes_tempfiles(self) -> None:
+        """Test that garbage collection prunes temporary files."""
+        import time
+
+        from dulwich.gc import garbage_collect
+        from dulwich.object_store import DEFAULT_TEMPFILE_GRACE_PERIOD
+        from dulwich.repo import Repo
+
+        # Create a repository with the store; make sure it is closed again
+        # even when an assertion below fails.
+        repo = Repo.init(self.store_dir)
+        self.addCleanup(repo.close)
+
+        # Create an old orphaned temporary file in the objects directory
+        tmp_pack_path = os.path.join(repo.object_store.path, "tmp_pack_old")
+        with open(tmp_pack_path, "wb") as f:
+            f.write(b"old temporary data")
+
+        # Make it old (grace period + 1 hour)
+        old_time = time.time() - (DEFAULT_TEMPFILE_GRACE_PERIOD + 3600)
+        os.utime(tmp_pack_path, (old_time, old_time))
+
+        # Run garbage collection
+        garbage_collect(repo)
+
+        # Verify the orphaned file was cleaned up
+        self.assertFalse(os.path.exists(tmp_pack_path))
+
 
 class TreeLookupPathTests(TestCase):
     def setUp(self) -> None:

+ 77 - 0
tests/test_porcelain.py

@@ -39,6 +39,7 @@ from unittest import skipIf
 from dulwich import porcelain
 from dulwich.diff_tree import tree_changes
 from dulwich.errors import CommitError
+from dulwich.object_store import DEFAULT_TEMPFILE_GRACE_PERIOD
 from dulwich.objects import ZERO_SHA, Blob, Tag, Tree
 from dulwich.porcelain import (
     CheckoutError,  # Hypothetical or real error class
@@ -5502,3 +5503,79 @@ class CountObjectsTests(PorcelainTestCase):
 
         # Verify it's the correct dataclass type
         self.assertIsInstance(stats, CountObjectsResult)
+
+
+class PruneTests(PorcelainTestCase):
+    """Tests for ``porcelain.prune``."""
+
+    def _make_tmp_pack(self, name, payload, age=None):
+        """Write ``payload`` to ``name`` in the objects directory.
+
+        When ``age`` is given, the file's access and modification times are
+        set ``age`` seconds into the past. Returns the file's path.
+        """
+        path = os.path.join(self.repo.path, ".git", "objects", name)
+        with open(path, "wb") as f:
+            f.write(payload)
+        if age is not None:
+            stamp = time.time() - age
+            os.utime(path, (stamp, stamp))
+        return path
+
+    def test_prune_removes_old_tempfiles(self):
+        """Test that prune removes old temporary files."""
+        # A temporary file one hour older than the default grace period
+        stale = self._make_tmp_pack(
+            "tmp_pack_test",
+            b"old temporary data",
+            age=DEFAULT_TEMPFILE_GRACE_PERIOD + 3600,
+        )
+
+        porcelain.prune(self.repo.path)
+
+        # The file must be gone
+        self.assertFalse(os.path.exists(stale))
+
+    def test_prune_keeps_recent_tempfiles(self):
+        """Test that prune keeps recent temporary files."""
+        # A temporary file with an untouched (current) mtime
+        fresh = self._make_tmp_pack("tmp_pack_recent", b"recent temporary data")
+
+        porcelain.prune(self.repo.path)
+
+        # The file must still exist
+        self.assertTrue(os.path.exists(fresh))
+
+        # Clean up
+        os.remove(fresh)
+
+    def test_prune_with_custom_grace_period(self):
+        """Test prune with custom grace period."""
+        # A temporary file backdated by one hour
+        hour_old = self._make_tmp_pack("tmp_pack_1hour", b"1 hour old data", age=3600)
+
+        # A 30-minute grace period makes the file eligible for removal
+        porcelain.prune(self.repo.path, grace_period=1800)
+
+        self.assertFalse(os.path.exists(hour_old))
+
+    def test_prune_dry_run(self):
+        """Test prune in dry-run mode."""
+        # A temporary file one hour older than the default grace period
+        stale = self._make_tmp_pack(
+            "tmp_pack_dryrun",
+            b"old temporary data",
+            age=DEFAULT_TEMPFILE_GRACE_PERIOD + 3600,
+        )
+
+        porcelain.prune(self.repo.path, dry_run=True)
+
+        # Dry run: the file must NOT have been removed
+        self.assertTrue(os.path.exists(stale))
+
+        # Clean up
+        os.remove(stale)