
Add support for auto garbage collection. (#1623)

... and invoke from some porcelain commands.

Fixes #1600
Jelmer Vernooij 1 month ago
parent commit ce12ad8304
7 changed files with 442 additions and 33 deletions
  1. NEWS (+3 -0)
  2. dulwich/gc.py (+134 -4)
  3. dulwich/object_store.py (+50 -16)
  4. dulwich/porcelain.py (+30 -3)
  5. dulwich/repo.py (+5 -0)
  6. tests/test_gc.py (+218 -4)
  7. tests/test_porcelain.py (+2 -6)

+ 3 - 0
NEWS

@@ -49,6 +49,9 @@
    behavior. Also added ``prune`` command to ``dulwich.porcelain``.
    (Jelmer Vernooij, #558)
 
+ * Add support for auto garbage collection, and invoke from
+   some porcelain commands. (Jelmer Vernooij, #1600)
+
 0.23.0	2025-06-21
 
  * Add basic ``rebase`` subcommand. (Jelmer Vernooij)

+ 134 - 4
dulwich/gc.py

@@ -1,14 +1,27 @@
 """Git garbage collection implementation."""
 
 import collections
+import os
 import time
 from dataclasses import dataclass, field
-from typing import Optional
+from typing import TYPE_CHECKING, Optional
 
-from dulwich.object_store import BaseObjectStore, PackBasedObjectStore
+from dulwich.object_store import (
+    BaseObjectStore,
+    DiskObjectStore,
+    PackBasedObjectStore,
+)
 from dulwich.objects import Commit, ObjectID, Tag, Tree
 from dulwich.refs import RefsContainer
 
+if TYPE_CHECKING:
+    from .config import Config
+    from .repo import BaseRepo
+
+
+DEFAULT_GC_AUTO = 6700
+DEFAULT_GC_AUTO_PACK_LIMIT = 50
+
 
 @dataclass
 class GCStats:
@@ -222,7 +235,7 @@ def garbage_collect(
 
     # Count initial state
     stats.packs_before = len(list(object_store.packs))
-    # TODO: Count loose objects when we have a method for it
+    stats.loose_objects_before = object_store.count_loose_objects()
 
     # Find unreachable objects to exclude from repacking
     unreachable_to_prune = set()
@@ -293,6 +306,123 @@ def garbage_collect(
 
     # Count final state
     stats.packs_after = len(list(object_store.packs))
-    # TODO: Count loose objects when we have a method for it
+    stats.loose_objects_after = object_store.count_loose_objects()
 
     return stats
+
+
+def should_run_gc(repo: "BaseRepo", config: Optional["Config"] = None) -> bool:
+    """Check if automatic garbage collection should run.
+
+    Args:
+        repo: Repository to check
+        config: Configuration to use (defaults to repo config)
+
+    Returns:
+        True if GC should run, False otherwise
+    """
+    if config is None:
+        config = repo.get_config()
+
+    # Check if auto GC is disabled
+    try:
+        gc_auto = config.get(b"gc", b"auto")
+        gc_auto_value = int(gc_auto)
+    except KeyError:
+        gc_auto_value = DEFAULT_GC_AUTO
+
+    if gc_auto_value == 0:
+        # Auto GC is disabled
+        return False
+
+    # Check loose object count
+    object_store = repo.object_store
+    if not isinstance(object_store, DiskObjectStore):
+        # Can't count loose objects on non-disk stores
+        return False
+
+    loose_count = object_store.count_loose_objects()
+    if loose_count >= gc_auto_value:
+        return True
+
+    # Check pack file count
+    try:
+        gc_auto_pack_limit = config.get(b"gc", b"autoPackLimit")
+        pack_limit = int(gc_auto_pack_limit)
+    except KeyError:
+        pack_limit = DEFAULT_GC_AUTO_PACK_LIMIT
+
+    if pack_limit > 0:
+        pack_count = object_store.count_pack_files()
+        if pack_count >= pack_limit:
+            return True
+
+    return False
+
+
+def maybe_auto_gc(repo: "BaseRepo", config: Optional["Config"] = None) -> bool:
+    """Run automatic garbage collection if needed.
+
+    Args:
+        repo: Repository to potentially GC
+        config: Configuration to use (defaults to repo config)
+
+    Returns:
+        True if GC was run, False otherwise
+    """
+    if not should_run_gc(repo, config):
+        return False
+
+    # Check for gc.log file - only for disk-based repos
+    if not hasattr(repo, "controldir"):
+        # For non-disk repos, just run GC without gc.log handling
+        garbage_collect(repo, auto=True)
+        return True
+
+    gc_log_path = os.path.join(repo.controldir(), "gc.log")
+    if os.path.exists(gc_log_path):
+        # Check gc.logExpiry
+        if config is None:
+            config = repo.get_config()
+        try:
+            log_expiry = config.get(b"gc", b"logExpiry")
+        except KeyError:
+            # Default to 1 day
+            expiry_seconds = 86400
+        else:
+            # Parse time value (simplified - just support days for now)
+            if log_expiry.endswith((b".days", b".day")):
+                days = int(log_expiry.split(b".")[0])
+                expiry_seconds = days * 86400
+            else:
+                # Default to 1 day
+                expiry_seconds = 86400
+
+        stat_info = os.stat(gc_log_path)
+        if time.time() - stat_info.st_mtime < expiry_seconds:
+            # gc.log exists and is not expired - skip GC
+            with open(gc_log_path, "rb") as f:
+                print(f.read().decode("utf-8", errors="replace"))
+            return False
+
+    # TODO: Support gc.autoDetach to run in background
+    # For now, run in foreground
+
+    try:
+        # Run GC with auto=True flag
+        garbage_collect(repo, auto=True)
+
+        # Remove gc.log on successful completion
+        if os.path.exists(gc_log_path):
+            try:
+                os.unlink(gc_log_path)
+            except FileNotFoundError:
+                pass
+
+        return True
+    except OSError as e:
+        # Write error to gc.log
+        with open(gc_log_path, "wb") as f:
+            f.write(f"Auto GC failed: {e}\n".encode())
+        # Don't propagate the error - auto GC failures shouldn't break operations
+        return False
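
A minimal usage sketch of the two entry points this file adds (the repository path is a placeholder):

    from dulwich.repo import Repo
    from dulwich.gc import maybe_auto_gc, should_run_gc

    repo = Repo("/path/to/repo")  # hypothetical path
    if should_run_gc(repo):
        print("gc.auto thresholds exceeded")
    ran = maybe_auto_gc(repo)  # calls garbage_collect(repo, auto=True) when needed
    print("auto GC ran" if ran else "auto GC skipped")

maybe_auto_gc honors gc.auto, gc.autoPackLimit, and a recent gc.log, mirroring git's own auto-GC behaviour.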

+ 50 - 16
dulwich/object_store.py

@@ -603,6 +603,20 @@ class PackBasedObjectStore(BaseObjectStore, PackedObjectContainer):
         """List with pack objects."""
         return list(self._iter_cached_packs()) + list(self._update_pack_cache())
 
+    def count_pack_files(self) -> int:
+        """Count the number of pack files.
+
+        Returns:
+            Number of pack files (excluding those with .keep files)
+        """
+        count = 0
+        for pack in self.packs:
+            # Check if there's a .keep file for this pack
+            keep_path = pack._basename + ".keep"
+            if not os.path.exists(keep_path):
+                count += 1
+        return count
+
     def _iter_alternate_objects(self):
         """Iterate over the SHAs of all the objects in alternate stores."""
         for alternate in self.alternates:
@@ -1020,6 +1034,32 @@ class DiskObjectStore(PackBasedObjectStore):
                     continue
                 yield sha
 
+    def count_loose_objects(self) -> int:
+        """Count the number of loose objects in the object store.
+
+        Returns:
+            Number of loose objects
+        """
+        count = 0
+        if not os.path.exists(self.path):
+            return 0
+
+        for i in range(256):
+            subdir = os.path.join(self.path, f"{i:02x}")
+            try:
+                count += len(
+                    [
+                        name
+                        for name in os.listdir(subdir)
+                        if len(name) == 38  # 40 - 2 for the prefix
+                    ]
+                )
+            except FileNotFoundError:
+                # Directory may have been removed or is inaccessible
+                continue
+
+        return count
+
     def _get_loose_object(self, sha):
         path = self._get_shafile_path(sha)
         try:
@@ -1047,7 +1087,7 @@ class DiskObjectStore(PackBasedObjectStore):
             path = self._get_shafile_path(sha)
             try:
                 return os.path.getmtime(path)
-            except (OSError, FileNotFoundError):
+            except FileNotFoundError:
                 pass
 
         # Check if it's in a pack file
@@ -1058,7 +1098,7 @@ class DiskObjectStore(PackBasedObjectStore):
                     pack_path = pack._data_path
                     try:
                         return os.path.getmtime(pack_path)
-                    except (OSError, FileNotFoundError, AttributeError):
+                    except (FileNotFoundError, AttributeError):
                         pass
             except PackFileDisappeared:
                 pass
@@ -1365,13 +1405,10 @@ class DiskObjectStore(PackBasedObjectStore):
 
         # Clean up tmp_pack_* files in the repository directory
         for tmp_file in glob.glob(os.path.join(self.path, "tmp_pack_*")):
-            try:
-                # Check if file is old enough (more than grace period)
-                mtime = os.path.getmtime(tmp_file)
-                if time.time() - mtime > grace_period:
-                    os.remove(tmp_file)
-            except OSError:
-                pass
+            # Check if file is old enough (more than grace period)
+            mtime = os.path.getmtime(tmp_file)
+            if time.time() - mtime > grace_period:
+                os.remove(tmp_file)
 
         # Clean up orphaned .pack files without corresponding .idx files
         try:
@@ -1394,13 +1431,10 @@ class DiskObjectStore(PackBasedObjectStore):
         for base_name, pack_name in pack_files.items():
             if base_name not in idx_files:
                 pack_path = os.path.join(self.pack_dir, pack_name)
-                try:
-                    # Check if file is old enough (more than grace period)
-                    mtime = os.path.getmtime(pack_path)
-                    if time.time() - mtime > grace_period:
-                        os.remove(pack_path)
-                except OSError:
-                    pass
+                # Check if file is old enough (more than grace period)
+                mtime = os.path.getmtime(pack_path)
+                if time.time() - mtime > grace_period:
+                    os.remove(pack_path)
 
 
 class MemoryObjectStore(BaseObjectStore):

+ 30 - 3
dulwich/porcelain.py

@@ -652,9 +652,11 @@ def add(repo: Union[str, os.PathLike, BaseRepo] = ".", paths=None):
 
             try:
                 relpath = str(resolved_path.relative_to(repo_path)).replace(os.sep, "/")
-            except ValueError:
+            except ValueError as e:
                 # Path is not within the repository
-                raise ValueError(f"Path {p} is not within repository {repo_path}")
+                raise ValueError(
+                    f"Path {p} is not within repository {repo_path}"
+                ) from e
 
             # Handle directories by scanning their contents
             if resolved_path.is_dir():
@@ -1537,6 +1539,12 @@ def push(
         if remote_name is not None:
             _import_remote_refs(r.refs, remote_name, remote_changed_refs)
 
+    # Trigger auto GC if needed
+    from .gc import maybe_auto_gc
+
+    with open_repo_closing(repo) as r:
+        maybe_auto_gc(r)
+
 
 def pull(
     repo,
@@ -1641,6 +1649,12 @@ def pull(
         if remote_name is not None:
             _import_remote_refs(r.refs, remote_name, fetch_result.refs)
 
+    # Trigger auto GC if needed
+    from .gc import maybe_auto_gc
+
+    with open_repo_closing(repo) as r:
+        maybe_auto_gc(r)
+
 
 def status(repo=".", ignored=False, untracked_files="normal"):
     """Returns staged, unstaged, and untracked changes relative to the HEAD.
@@ -2101,6 +2115,12 @@ def fetch(
                 prune_tags=prune_tags,
             )
 
+    # Trigger auto GC if needed
+    from .gc import maybe_auto_gc
+
+    with open_repo_closing(repo) as r:
+        maybe_auto_gc(r)
+
     return fetch_result
 
 
@@ -3040,10 +3060,17 @@ def merge(
         except KeyError:
             raise Error(f"Cannot find commit '{committish}'")
 
-        return _do_merge(
+        result = _do_merge(
             r, merge_commit_id, no_commit, no_ff, message, author, committer
         )
 
+        # Trigger auto GC if needed
+        from .gc import maybe_auto_gc
+
+        maybe_auto_gc(r)
+
+        return result
+
 
 def unpack_objects(pack_path, target="."):
     """Unpack objects from a pack file into the repository.

+ 5 - 0
dulwich/repo.py

@@ -1131,6 +1131,11 @@ class BaseRepo:
         except KeyError:  # no hook defined, silent fallthrough
             pass
 
+        # Trigger auto GC if needed
+        from .gc import maybe_auto_gc
+
+        maybe_auto_gc(self)
+
         return c.id
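
A sketch of the commit path: do_commit now ends with an auto-GC check (path and identity are placeholders):

    from dulwich.repo import Repo

    repo = Repo.init("/tmp/example")  # hypothetical path
    # ... stage changes with repo.stage([...]) ...
    repo.do_commit(
        b"commit message",
        committer=b"A U Thor <author@example.com>",
    )
    # maybe_auto_gc(repo) has run just before the commit id was returned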
 
 

+ 218 - 4
tests/test_gc.py

@@ -1,18 +1,24 @@
 """Tests for dulwich.gc."""
 
+import os
 import shutil
 import tempfile
+import time
 from unittest import TestCase
+from unittest.mock import patch
 
+from dulwich.config import ConfigDict
 from dulwich.gc import (
     GCStats,
     find_reachable_objects,
     find_unreachable_objects,
     garbage_collect,
+    maybe_auto_gc,
     prune_unreachable_objects,
+    should_run_gc,
 )
 from dulwich.objects import Blob, Commit, Tree
-from dulwich.repo import Repo
+from dulwich.repo import MemoryRepo, Repo
 
 
 class GCTestCase(TestCase):
@@ -159,6 +165,13 @@ class GCTestCase(TestCase):
         self.assertIsInstance(stats, GCStats)
         self.assertEqual({unreachable_blob.id}, stats.pruned_objects)
         self.assertGreater(stats.bytes_freed, 0)
+        # Check that loose objects were counted
+        self.assertGreaterEqual(
+            stats.loose_objects_before, 4
+        )  # At least blob, tree, commit, unreachable
+        self.assertLess(
+            stats.loose_objects_after, stats.loose_objects_before
+        )  # Should have fewer after GC
 
     def test_garbage_collect_no_prune(self):
         """Test garbage collection without pruning."""
@@ -217,9 +230,6 @@ class GCTestCase(TestCase):
 
     def test_grace_period_old_object(self):
         """Test that old objects are pruned even with grace period."""
-        import os
-        import time
-
         # Create an unreachable blob
         old_blob = Blob.from_string(b"old unreachable content")
         self.repo.object_store.add_object(old_blob)
@@ -260,3 +270,207 @@ class GCTestCase(TestCase):
         self.assertEqual({unreachable_blob.id}, stats.pruned_objects)
         self.assertGreater(stats.bytes_freed, 0)
         self.assertNotIn(unreachable_blob.id, self.repo.object_store)
+
+
+class AutoGCTestCase(TestCase):
+    """Tests for auto GC functionality."""
+
+    def test_should_run_gc_disabled(self):
+        """Test that auto GC doesn't run when gc.auto is 0."""
+        r = MemoryRepo()
+        config = ConfigDict()
+        config.set(b"gc", b"auto", b"0")
+
+        self.assertFalse(should_run_gc(r, config))
+
+    def test_should_run_gc_default_values(self):
+        """Test auto GC with default configuration values."""
+        r = MemoryRepo()
+        config = ConfigDict()
+
+        # Should not run with empty repo
+        self.assertFalse(should_run_gc(r, config))
+
+    def test_should_run_gc_with_loose_objects(self):
+        """Test that auto GC triggers based on loose object count."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            r = Repo.init(tmpdir)
+            config = ConfigDict()
+            config.set(b"gc", b"auto", b"10")  # Low threshold for testing
+
+            # Add some loose objects
+            for i in range(15):
+                blob = Blob()
+                blob.data = f"test blob {i}".encode()
+                r.object_store.add_object(blob)
+
+            self.assertTrue(should_run_gc(r, config))
+
+    def test_should_run_gc_with_pack_limit(self):
+        """Test that auto GC triggers based on pack file count."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            r = Repo.init(tmpdir)
+            config = ConfigDict()
+            config.set(b"gc", b"autoPackLimit", b"2")  # Low threshold for testing
+
+            # Create some pack files by repacking
+            for i in range(3):
+                blob = Blob()
+                blob.data = f"test blob {i}".encode()
+                r.object_store.add_object(blob)
+                r.object_store.pack_loose_objects()
+
+            # Force re-enumeration of packs
+            r.object_store._update_pack_cache()
+
+            self.assertTrue(should_run_gc(r, config))
+
+    def test_count_loose_objects(self):
+        """Test counting loose objects."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            r = Repo.init(tmpdir)
+
+            # Initially should have no loose objects
+            count = r.object_store.count_loose_objects()
+            self.assertEqual(0, count)
+
+            # Add some loose objects
+            for i in range(5):
+                blob = Blob()
+                blob.data = f"test blob {i}".encode()
+                r.object_store.add_object(blob)
+
+            count = r.object_store.count_loose_objects()
+            self.assertEqual(5, count)
+
+    def test_count_pack_files(self):
+        """Test counting pack files."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            r = Repo.init(tmpdir)
+
+            # Initially should have no packs
+            count = r.object_store.count_pack_files()
+            self.assertEqual(0, count)
+
+            # Create a pack
+            blob = Blob()
+            blob.data = b"test blob"
+            r.object_store.add_object(blob)
+            r.object_store.pack_loose_objects()
+
+            # Force re-enumeration of packs
+            r.object_store._update_pack_cache()
+
+            count = r.object_store.count_pack_files()
+            self.assertEqual(1, count)
+
+    def test_maybe_auto_gc_runs_when_needed(self):
+        """Test that auto GC runs when thresholds are exceeded."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            r = Repo.init(tmpdir)
+            config = ConfigDict()
+            config.set(b"gc", b"auto", b"5")  # Low threshold for testing
+
+            # Add enough loose objects to trigger GC
+            for i in range(10):
+                blob = Blob()
+                blob.data = f"test blob {i}".encode()
+                r.object_store.add_object(blob)
+
+            with patch("dulwich.gc.garbage_collect") as mock_gc:
+                result = maybe_auto_gc(r, config)
+
+            self.assertTrue(result)
+            mock_gc.assert_called_once_with(r, auto=True)
+
+    def test_maybe_auto_gc_skips_when_not_needed(self):
+        """Test that auto GC doesn't run when thresholds are not exceeded."""
+        r = MemoryRepo()
+        config = ConfigDict()
+
+        with patch("dulwich.gc.garbage_collect") as mock_gc:
+            result = maybe_auto_gc(r, config)
+
+        self.assertFalse(result)
+        mock_gc.assert_not_called()
+
+    def test_maybe_auto_gc_with_gc_log(self):
+        """Test that auto GC is skipped when gc.log exists and is recent."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            r = Repo.init(tmpdir)
+            config = ConfigDict()
+            config.set(b"gc", b"auto", b"1")  # Low threshold
+
+            # Create gc.log file
+            gc_log_path = os.path.join(r.controldir(), "gc.log")
+            with open(gc_log_path, "wb") as f:
+                f.write(b"Previous GC failed\n")
+
+            # Add objects to trigger GC
+            blob = Blob()
+            blob.data = b"test"
+            r.object_store.add_object(blob)
+
+            with patch("builtins.print") as mock_print:
+                result = maybe_auto_gc(r, config)
+
+            self.assertFalse(result)
+            # Verify gc.log contents were printed
+            mock_print.assert_called_once_with("Previous GC failed\n")
+
+    def test_maybe_auto_gc_with_expired_gc_log(self):
+        """Test that auto GC runs when gc.log exists but is expired."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            r = Repo.init(tmpdir)
+            config = ConfigDict()
+            config.set(b"gc", b"auto", b"1")  # Low threshold
+            config.set(b"gc", b"logExpiry", b"0.days")  # Expire immediately
+
+            # Create gc.log file
+            gc_log_path = os.path.join(r.controldir(), "gc.log")
+            with open(gc_log_path, "wb") as f:
+                f.write(b"Previous GC failed\n")
+
+            # Make the file old
+            old_time = time.time() - 86400  # 1 day ago
+            os.utime(gc_log_path, (old_time, old_time))
+
+            # Add objects to trigger GC
+            blob = Blob()
+            blob.data = b"test"
+            r.object_store.add_object(blob)
+
+            with patch("dulwich.gc.garbage_collect") as mock_gc:
+                result = maybe_auto_gc(r, config)
+
+            self.assertTrue(result)
+            mock_gc.assert_called_once_with(r, auto=True)
+            # gc.log should be removed after successful GC
+            self.assertFalse(os.path.exists(gc_log_path))
+
+    def test_maybe_auto_gc_handles_gc_failure(self):
+        """Test that auto GC handles failures gracefully."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            r = Repo.init(tmpdir)
+            config = ConfigDict()
+            config.set(b"gc", b"auto", b"1")  # Low threshold
+
+            # Add objects to trigger GC
+            blob = Blob()
+            blob.data = b"test"
+            r.object_store.add_object(blob)
+
+            with patch(
+                "dulwich.gc.garbage_collect", side_effect=OSError("GC failed")
+            ) as mock_gc:
+                result = maybe_auto_gc(r, config)
+
+            self.assertFalse(result)
+            mock_gc.assert_called_once_with(r, auto=True)
+
+            # Check that error was written to gc.log
+            gc_log_path = os.path.join(r.controldir(), "gc.log")
+            self.assertTrue(os.path.exists(gc_log_path))
+            with open(gc_log_path, "rb") as f:
+                content = f.read()
+                self.assertIn(b"Auto GC failed: GC failed", content)
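
The same pattern outside the test suite, mirroring test_should_run_gc_disabled above: gc.auto set to 0 switches automatic collection off entirely.

    from dulwich.config import ConfigDict
    from dulwich.gc import should_run_gc
    from dulwich.repo import MemoryRepo

    r = MemoryRepo()
    config = ConfigDict()
    config.set(b"gc", b"auto", b"0")
    assert should_run_gc(r, config) is False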

+ 2 - 6
tests/test_porcelain.py

@@ -5531,6 +5531,7 @@ class PruneTests(PorcelainTestCase):
         tmp_pack_path = os.path.join(objects_dir, "tmp_pack_recent")
         with open(tmp_pack_path, "wb") as f:
             f.write(b"recent temporary data")
+        self.addCleanup(os.remove, tmp_pack_path)
 
         # Run prune
         porcelain.prune(self.repo.path)
@@ -5538,9 +5539,6 @@ class PruneTests(PorcelainTestCase):
         # Verify the file was NOT removed
         self.assertTrue(os.path.exists(tmp_pack_path))
 
-        # Clean up
-        os.remove(tmp_pack_path)
-
     def test_prune_with_custom_grace_period(self):
         """Test prune with custom grace period."""
         # Create a 1-hour-old temporary file
@@ -5566,6 +5564,7 @@ class PruneTests(PorcelainTestCase):
         tmp_pack_path = os.path.join(objects_dir, "tmp_pack_dryrun")
         with open(tmp_pack_path, "wb") as f:
             f.write(b"old temporary data")
+        self.addCleanup(os.remove, tmp_pack_path)
 
         # Make it old
         old_time = time.time() - (DEFAULT_TEMPFILE_GRACE_PERIOD + 3600)
@@ -5576,6 +5575,3 @@ class PruneTests(PorcelainTestCase):
 
         # Verify the file was NOT removed (dry run)
         self.assertTrue(os.path.exists(tmp_pack_path))
-
-        # Clean up
-        os.remove(tmp_pack_path)
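
For context, the switch to addCleanup above means the temporary file is removed even when an assertion fails mid-test; a minimal illustration of the idiom:

    import os
    import tempfile
    import unittest

    class CleanupExample(unittest.TestCase):
        def test_tmpfile(self):
            fd, path = tempfile.mkstemp()
            os.close(fd)
            self.addCleanup(os.remove, path)  # runs on pass or fail
            self.assertTrue(os.path.exists(path))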