- """Git garbage collection implementation."""
- import collections
- import os
- import time
- from dataclasses import dataclass, field
- from typing import TYPE_CHECKING, Optional
- from dulwich.object_store import (
- BaseObjectStore,
- DiskObjectStore,
- PackBasedObjectStore,
- )
- from dulwich.objects import Commit, ObjectID, Tag, Tree
- from dulwich.refs import RefsContainer
- if TYPE_CHECKING:
- from .config import Config
- from .repo import BaseRepo
- DEFAULT_GC_AUTO = 6700
- DEFAULT_GC_AUTO_PACK_LIMIT = 50
- @dataclass
- class GCStats:
- """Statistics from garbage collection."""
- pruned_objects: set[bytes] = field(default_factory=set)
- bytes_freed: int = 0
- packs_before: int = 0
- packs_after: int = 0
- loose_objects_before: int = 0
- loose_objects_after: int = 0


def find_reachable_objects(
    object_store: BaseObjectStore,
    refs_container: RefsContainer,
    include_reflogs: bool = True,
    progress=None,
) -> set[bytes]:
    """Find all reachable objects in the repository.

    Args:
      object_store: Object store to search
      refs_container: Reference container
      include_reflogs: Whether to include reflog entries
      progress: Optional progress callback

    Returns:
      Set of reachable object SHAs
    """
    reachable = set()
    pending: collections.deque[ObjectID] = collections.deque()

    # Start with all refs
    for ref in refs_container.allkeys():
        try:
            sha = refs_container[ref]  # This follows symbolic refs
            if sha and sha not in reachable:
                pending.append(sha)
                reachable.add(sha)
        except KeyError:
            # Broken ref
            if progress:
                progress(f"Warning: Broken ref {ref.decode('utf-8', 'replace')}")
            continue

    # TODO: Add reflog support when reflog functionality is available

    # Walk all reachable objects
    while pending:
        sha = pending.popleft()

        if progress:
            progress(f"Checking object {sha.decode('ascii', 'replace')}")

        try:
            obj = object_store[sha]
        except KeyError:
            continue

        # Add referenced objects
        if isinstance(obj, Commit):
            # Tree
            if obj.tree not in reachable:
                pending.append(obj.tree)
                reachable.add(obj.tree)
            # Parents
            for parent in obj.parents:
                if parent not in reachable:
                    pending.append(parent)
                    reachable.add(parent)
        elif isinstance(obj, Tree):
            # Tree entries
            for entry in obj.items():
                if entry.sha not in reachable:
                    pending.append(entry.sha)
                    reachable.add(entry.sha)
        elif isinstance(obj, Tag):
            # Tagged object
            if obj.object[1] not in reachable:
                pending.append(obj.object[1])
                reachable.add(obj.object[1])

    return reachable
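
# Illustrative usage (a hedged sketch, not part of this module): walking
# reachability over an on-disk repository. `Repo` is dulwich.repo.Repo and
# the path "." is a placeholder for an existing repository.
#
#     from dulwich.repo import Repo
#
#     repo = Repo(".")
#     reachable = find_reachable_objects(repo.object_store, repo.refs)
#     print(f"{len(reachable)} reachable objects")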


def find_unreachable_objects(
    object_store: BaseObjectStore,
    refs_container: RefsContainer,
    include_reflogs: bool = True,
    progress=None,
) -> set[bytes]:
    """Find all unreachable objects in the repository.

    Args:
      object_store: Object store to search
      refs_container: Reference container
      include_reflogs: Whether to include reflog entries
      progress: Optional progress callback

    Returns:
      Set of unreachable object SHAs
    """
    reachable = find_reachable_objects(
        object_store, refs_container, include_reflogs, progress
    )

    unreachable = set()
    for sha in object_store:
        if sha not in reachable:
            unreachable.add(sha)

    return unreachable


def prune_unreachable_objects(
    object_store: PackBasedObjectStore,
    refs_container: RefsContainer,
    grace_period: Optional[int] = None,
    dry_run: bool = False,
    progress=None,
) -> tuple[set[bytes], int]:
    """Remove unreachable objects from the repository.

    Args:
      object_store: Object store to prune
      refs_container: Reference container
      grace_period: Grace period in seconds (objects newer than this are kept)
      dry_run: If True, only report what would be deleted
      progress: Optional progress callback

    Returns:
      Tuple of (set of pruned object SHAs, total bytes freed)
    """
    unreachable = find_unreachable_objects(
        object_store, refs_container, progress=progress
    )

    pruned = set()
    bytes_freed = 0

    for sha in unreachable:
        try:
            obj = object_store[sha]

            # Check grace period
            if grace_period is not None:
                try:
                    mtime = object_store.get_object_mtime(sha)
                    age = time.time() - mtime
                    if age < grace_period:
                        if progress:
                            progress(
                                f"Keeping {sha.decode('ascii', 'replace')} (age: {age:.0f}s < grace period: {grace_period}s)"
                            )
                        continue
                except KeyError:
                    # Object not found, skip it
                    continue

            if progress:
                progress(f"Pruning {sha.decode('ascii', 'replace')}")

            # Calculate size before attempting deletion
            obj_size = len(obj.as_raw_string())

            if not dry_run:
                object_store.delete_loose_object(sha)

            # Only count as pruned if we get here (deletion succeeded or dry run)
            pruned.add(sha)
            bytes_freed += obj_size

        except KeyError:
            # Object already gone
            pass
        except OSError as e:
            # File system errors during deletion
            if progress:
                progress(f"Error pruning {sha.decode('ascii', 'replace')}: {e}")

    return pruned, bytes_freed
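
# Illustrative usage (a hedged sketch): a dry run reports what would be pruned
# without deleting anything; the one-hour grace period is an arbitrary example
# value, and `repo` is assumed to be an open dulwich.repo.Repo.
#
#     pruned, freed = prune_unreachable_objects(
#         repo.object_store, repo.refs, grace_period=3600, dry_run=True
#     )
#     print(f"would prune {len(pruned)} objects, freeing {freed} bytes")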


def garbage_collect(
    repo,
    auto: bool = False,
    aggressive: bool = False,
    prune: bool = True,
    grace_period: Optional[int] = 1209600,  # 2 weeks default
    dry_run: bool = False,
    progress=None,
) -> GCStats:
    """Run garbage collection on a repository.

    Args:
      repo: Repository to garbage collect
      auto: Whether this is an automatic gc
      aggressive: Whether to use aggressive settings
      prune: Whether to prune unreachable objects
      grace_period: Grace period for pruning in seconds
      dry_run: If True, only report what would be done
      progress: Optional progress callback

    Returns:
      GCStats object with garbage collection statistics
    """
    stats = GCStats()

    object_store = repo.object_store
    refs_container = repo.refs

    # Count initial state
    stats.packs_before = len(list(object_store.packs))
    stats.loose_objects_before = object_store.count_loose_objects()

    # Find unreachable objects to exclude from repacking
    unreachable_to_prune = set()
    if prune:
        if progress:
            progress("Finding unreachable objects")
        unreachable = find_unreachable_objects(
            object_store, refs_container, progress=progress
        )

        # Apply grace period check
        for sha in unreachable:
            try:
                if grace_period is not None:
                    try:
                        mtime = object_store.get_object_mtime(sha)
                        age = time.time() - mtime
                        if age < grace_period:
                            if progress:
                                progress(
                                    f"Keeping {sha.decode('ascii', 'replace')} (age: {age:.0f}s < grace period: {grace_period}s)"
                                )
                            continue
                    except KeyError:
                        # Object not found, skip it
                        continue

                unreachable_to_prune.add(sha)
                obj = object_store[sha]
                stats.bytes_freed += len(obj.as_raw_string())
            except KeyError:
                pass

        stats.pruned_objects = unreachable_to_prune

    # Pack refs
    if progress:
        progress("Packing references")
    if not dry_run:
        repo.refs.pack_refs()

    # Delete loose unreachable objects
    if prune and not dry_run:
        for sha in unreachable_to_prune:
            if object_store.contains_loose(sha):
                try:
                    object_store.delete_loose_object(sha)
                except OSError:
                    pass

    # Repack everything, excluding unreachable objects
    # This handles both loose object packing and pack consolidation
    if progress:
        progress("Repacking repository")
    if not dry_run:
        if prune and unreachable_to_prune:
            # Repack excluding unreachable objects
            object_store.repack(exclude=unreachable_to_prune)
        else:
            # Normal repack
            object_store.repack()

    # Prune orphaned temporary files
    if progress:
        progress("Pruning temporary files")
    if not dry_run:
        object_store.prune(grace_period=grace_period)

    # Count final state
    stats.packs_after = len(list(object_store.packs))
    stats.loose_objects_after = object_store.count_loose_objects()

    return stats
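
# Illustrative usage (a hedged sketch): a full collection with the default
# two-week grace period, reading the before/after counts from the returned
# GCStats; `repo` is assumed to be an open dulwich.repo.Repo.
#
#     stats = garbage_collect(repo, prune=True, progress=print)
#     print(f"packs: {stats.packs_before} -> {stats.packs_after}")
#     print(f"loose: {stats.loose_objects_before} -> {stats.loose_objects_after}")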


def should_run_gc(repo: "BaseRepo", config: Optional["Config"] = None) -> bool:
    """Check if automatic garbage collection should run.

    Args:
      repo: Repository to check
      config: Configuration to use (defaults to repo config)

    Returns:
      True if GC should run, False otherwise
    """
    if config is None:
        config = repo.get_config()

    # Check if auto GC is disabled
    try:
        gc_auto = config.get(b"gc", b"auto")
        gc_auto_value = int(gc_auto)
    except KeyError:
        gc_auto_value = DEFAULT_GC_AUTO

    if gc_auto_value == 0:
        # Auto GC is disabled
        return False

    # Check loose object count
    object_store = repo.object_store
    if not isinstance(object_store, DiskObjectStore):
        # Can't count loose objects on non-disk stores
        return False

    loose_count = object_store.count_loose_objects()
    if loose_count >= gc_auto_value:
        return True

    # Check pack file count
    try:
        gc_auto_pack_limit = config.get(b"gc", b"autoPackLimit")
        pack_limit = int(gc_auto_pack_limit)
    except KeyError:
        pack_limit = DEFAULT_GC_AUTO_PACK_LIMIT

    if pack_limit > 0:
        pack_count = object_store.count_pack_files()
        if pack_count >= pack_limit:
            return True

    return False
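
# Illustrative usage (a hedged sketch): this mirrors the check behind
# `git gc --auto`, consulting gc.auto and gc.autoPackLimit from the repo config.
#
#     if should_run_gc(repo):
#         print("gc thresholds exceeded; a collection is due")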


def maybe_auto_gc(repo: "BaseRepo", config: Optional["Config"] = None) -> bool:
    """Run automatic garbage collection if needed.

    Args:
      repo: Repository to potentially GC
      config: Configuration to use (defaults to repo config)

    Returns:
      True if GC was run, False otherwise
    """
    if not should_run_gc(repo, config):
        return False

    # Check for gc.log file - only for disk-based repos
    if not hasattr(repo, "controldir"):
        # For non-disk repos, just run GC without gc.log handling
        garbage_collect(repo, auto=True)
        return True

    gc_log_path = os.path.join(repo.controldir(), "gc.log")
    if os.path.exists(gc_log_path):
        # Check gc.logExpiry
        if config is None:
            config = repo.get_config()
        try:
            log_expiry = config.get(b"gc", b"logExpiry")
        except KeyError:
            # Default to 1 day
            expiry_seconds = 86400
        else:
            # Parse time value (simplified - just support days for now)
            if log_expiry.endswith((b".days", b".day")):
                days = int(log_expiry.split(b".")[0])
                expiry_seconds = days * 86400
            else:
                # Default to 1 day
                expiry_seconds = 86400

        stat_info = os.stat(gc_log_path)
        if time.time() - stat_info.st_mtime < expiry_seconds:
            # gc.log exists and is not expired - skip GC
            with open(gc_log_path, "rb") as f:
                print(f.read().decode("utf-8", errors="replace"))
            return False

    # TODO: Support gc.autoDetach to run in background
    # For now, run in foreground
    try:
        # Run GC with auto=True flag
        garbage_collect(repo, auto=True)

        # Remove gc.log on successful completion
        if os.path.exists(gc_log_path):
            try:
                os.unlink(gc_log_path)
            except FileNotFoundError:
                pass

        return True
    except OSError as e:
        # Write error to gc.log
        with open(gc_log_path, "wb") as f:
            f.write(f"Auto GC failed: {e}\n".encode())
        # Don't propagate the error - auto GC failures shouldn't break operations
        return False
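
# Illustrative usage (a hedged sketch): callers that create many loose objects,
# e.g. after a large import, can trigger an opportunistic automatic collection.
#
#     ran = maybe_auto_gc(repo)
#     print("auto gc ran" if ran else "auto gc skipped")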