  1. """Git garbage collection implementation."""
  2. import collections
  3. import os
  4. import time
  5. from dataclasses import dataclass, field
  6. from typing import TYPE_CHECKING, Callable, Optional
  7. from dulwich.object_store import (
  8. BaseObjectStore,
  9. DiskObjectStore,
  10. )
  11. from dulwich.objects import Commit, ObjectID, Tag, Tree
  12. from dulwich.refs import RefsContainer
  13. if TYPE_CHECKING:
  14. from .config import Config
  15. from .repo import BaseRepo, Repo
  16. DEFAULT_GC_AUTO = 6700
  17. DEFAULT_GC_AUTO_PACK_LIMIT = 50


@dataclass
class GCStats:
    """Statistics from garbage collection."""

    pruned_objects: set[bytes] = field(default_factory=set)
    bytes_freed: int = 0
    packs_before: int = 0
    packs_after: int = 0
    loose_objects_before: int = 0
    loose_objects_after: int = 0


def find_reachable_objects(
    object_store: BaseObjectStore,
    refs_container: RefsContainer,
    include_reflogs: bool = True,
    progress: Optional[Callable[[str], None]] = None,
) -> set[bytes]:
    """Find all reachable objects in the repository.

    Args:
      object_store: Object store to search
      refs_container: Reference container
      include_reflogs: Whether to include reflog entries
      progress: Optional progress callback

    Returns:
      Set of reachable object SHAs
    """
    reachable = set()
    pending: collections.deque[ObjectID] = collections.deque()

    # Start with all refs
    for ref in refs_container.allkeys():
        try:
            sha = refs_container[ref]  # This follows symbolic refs
            if sha and sha not in reachable:
                pending.append(sha)
                reachable.add(sha)
        except KeyError:
            # Broken ref
            if progress:
                progress(f"Warning: Broken ref {ref.decode('utf-8', 'replace')}")
            continue

    # TODO: Add reflog support when reflog functionality is available

    # Walk all reachable objects
    while pending:
        sha = pending.popleft()
        if progress:
            progress(f"Checking object {sha.decode('ascii', 'replace')}")
        try:
            obj = object_store[sha]
        except KeyError:
            continue

        # Add referenced objects
        if isinstance(obj, Commit):
            # Tree
            if obj.tree not in reachable:
                pending.append(obj.tree)
                reachable.add(obj.tree)
            # Parents
            for parent in obj.parents:
                if parent not in reachable:
                    pending.append(parent)
                    reachable.add(parent)
        elif isinstance(obj, Tree):
            # Tree entries
            for entry in obj.items():
                if entry.sha not in reachable:
                    pending.append(entry.sha)
                    reachable.add(entry.sha)
        elif isinstance(obj, Tag):
            # Tagged object
            if obj.object[1] not in reachable:
                pending.append(obj.object[1])
                reachable.add(obj.object[1])

    return reachable
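
# A minimal usage sketch for find_reachable_objects (not part of the module's
# API; "/path/to/repo" is a placeholder for a local on-disk repository):
#
#     from dulwich.repo import Repo
#
#     repo = Repo("/path/to/repo")
#     reachable = find_reachable_objects(repo.object_store, repo.refs)
#     print(f"{len(reachable)} objects reachable from refs")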


def find_unreachable_objects(
    object_store: BaseObjectStore,
    refs_container: RefsContainer,
    include_reflogs: bool = True,
    progress: Optional[Callable[[str], None]] = None,
) -> set[bytes]:
    """Find all unreachable objects in the repository.

    Args:
      object_store: Object store to search
      refs_container: Reference container
      include_reflogs: Whether to include reflog entries
      progress: Optional progress callback

    Returns:
      Set of unreachable object SHAs
    """
    reachable = find_reachable_objects(
        object_store, refs_container, include_reflogs, progress
    )

    unreachable = set()
    for sha in object_store:
        if sha not in reachable:
            unreachable.add(sha)

    return unreachable
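
# A small sketch of the reachable/unreachable split, using dulwich's in-memory
# repository (the blob content is arbitrary). A blob that no ref points at is
# reported as unreachable:
#
#     from dulwich.objects import Blob
#     from dulwich.repo import MemoryRepo
#
#     repo = MemoryRepo()
#     blob = Blob.from_string(b"orphaned data")
#     repo.object_store.add_object(blob)
#     unreachable = find_unreachable_objects(repo.object_store, repo.refs)
#     assert blob.id in unreachable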


def prune_unreachable_objects(
    object_store: DiskObjectStore,
    refs_container: RefsContainer,
    grace_period: Optional[int] = None,
    dry_run: bool = False,
    progress: Optional[Callable[[str], None]] = None,
) -> tuple[set[bytes], int]:
    """Remove unreachable objects from the repository.

    Args:
      object_store: Object store to prune
      refs_container: Reference container
      grace_period: Grace period in seconds (objects newer than this are kept)
      dry_run: If True, only report what would be deleted
      progress: Optional progress callback

    Returns:
      Tuple of (set of pruned object SHAs, total bytes freed)
    """
    unreachable = find_unreachable_objects(
        object_store, refs_container, progress=progress
    )

    pruned = set()
    bytes_freed = 0

    for sha in unreachable:
        try:
            obj = object_store[sha]

            # Check grace period
            if grace_period is not None:
                try:
                    mtime = object_store.get_object_mtime(sha)
                    age = time.time() - mtime
                    if age < grace_period:
                        if progress:
                            progress(
                                f"Keeping {sha.decode('ascii', 'replace')} "
                                f"(age: {age:.0f}s < grace period: {grace_period}s)"
                            )
                        continue
                except KeyError:
                    # No mtime available for the object; skip it
                    continue

            if progress:
                progress(f"Pruning {sha.decode('ascii', 'replace')}")

            # Calculate size before attempting deletion
            obj_size = len(obj.as_raw_string())

            if not dry_run:
                object_store.delete_loose_object(sha)

            # Only count as pruned if we get here (deletion succeeded or dry run)
            pruned.add(sha)
            bytes_freed += obj_size
        except KeyError:
            # Object already gone
            pass
        except OSError as e:
            # File system errors during deletion
            if progress:
                progress(f"Error pruning {sha.decode('ascii', 'replace')}: {e}")

    return pruned, bytes_freed
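
# A dry-run sketch (placeholder path). dry_run=True reports what would be
# pruned without deleting anything, which is a safe way to preview the effect
# of a given grace period:
#
#     from dulwich.repo import Repo
#
#     repo = Repo("/path/to/repo")
#     pruned, freed = prune_unreachable_objects(
#         repo.object_store, repo.refs, grace_period=1209600, dry_run=True
#     )
#     print(f"would prune {len(pruned)} objects ({freed} bytes)")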


def garbage_collect(
    repo: "Repo",
    auto: bool = False,
    aggressive: bool = False,
    prune: bool = True,
    grace_period: Optional[int] = 1209600,  # 2 weeks default
    dry_run: bool = False,
    progress: Optional[Callable[[str], None]] = None,
) -> GCStats:
    """Run garbage collection on a repository.

    Args:
      repo: Repository to garbage collect
      auto: Whether this is an automatic gc
      aggressive: Whether to use aggressive settings
      prune: Whether to prune unreachable objects
      grace_period: Grace period for pruning in seconds
      dry_run: If True, only report what would be done
      progress: Optional progress callback

    Returns:
      GCStats object with garbage collection statistics
    """
    stats = GCStats()

    object_store = repo.object_store
    refs_container = repo.refs

    # Count initial state
    stats.packs_before = len(list(object_store.packs))
    stats.loose_objects_before = object_store.count_loose_objects()

    # Find unreachable objects to exclude from repacking
    unreachable_to_prune = set()
    if prune:
        if progress:
            progress("Finding unreachable objects")
        unreachable = find_unreachable_objects(
            object_store, refs_container, progress=progress
        )

        # Apply grace period check
        for sha in unreachable:
            try:
                if grace_period is not None:
                    try:
                        mtime = object_store.get_object_mtime(sha)
                        age = time.time() - mtime
                        if age < grace_period:
                            if progress:
                                progress(
                                    f"Keeping {sha.decode('ascii', 'replace')} "
                                    f"(age: {age:.0f}s < grace period: {grace_period}s)"
                                )
                            continue
                    except KeyError:
                        # No mtime available for the object; skip it
                        continue

                unreachable_to_prune.add(sha)
                obj = object_store[sha]
                stats.bytes_freed += len(obj.as_raw_string())
            except KeyError:
                pass

        stats.pruned_objects = unreachable_to_prune

    # Pack refs
    if progress:
        progress("Packing references")
    if not dry_run:
        repo.refs.pack_refs()

    # Delete loose unreachable objects
    if prune and not dry_run:
        for sha in unreachable_to_prune:
            if object_store.contains_loose(sha):
                try:
                    object_store.delete_loose_object(sha)
                except OSError:
                    pass

    # Repack everything, excluding unreachable objects.
    # This handles both loose object packing and pack consolidation.
    if progress:
        progress("Repacking repository")
    if not dry_run:
        if prune and unreachable_to_prune:
            # Repack excluding unreachable objects
            object_store.repack(exclude=unreachable_to_prune)
        else:
            # Normal repack
            object_store.repack()

    # Prune orphaned temporary files
    if progress:
        progress("Pruning temporary files")
    if not dry_run:
        object_store.prune(grace_period=grace_period)

    # Count final state
    stats.packs_after = len(list(object_store.packs))
    stats.loose_objects_after = object_store.count_loose_objects()

    return stats
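
# A usage sketch (placeholder path; any callable taking a str works as the
# progress callback):
#
#     from dulwich.repo import Repo
#
#     repo = Repo("/path/to/repo")
#     stats = garbage_collect(repo, grace_period=1209600, progress=print)
#     print(
#         f"packs: {stats.packs_before} -> {stats.packs_after}, "
#         f"loose: {stats.loose_objects_before} -> {stats.loose_objects_after}, "
#         f"freed: {stats.bytes_freed} bytes"
#     )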


def should_run_gc(repo: "BaseRepo", config: Optional["Config"] = None) -> bool:
    """Check if automatic garbage collection should run.

    Args:
      repo: Repository to check
      config: Configuration to use (defaults to repo config)

    Returns:
      True if GC should run, False otherwise
    """
    # Check environment variable first
    if os.environ.get("GIT_AUTO_GC") == "0":
        return False

    # Check programmatic disable flag
    if getattr(repo, "_autogc_disabled", False):
        return False

    if config is None:
        config = repo.get_config()

    # Check if auto GC is disabled
    try:
        gc_auto = config.get(b"gc", b"auto")
        gc_auto_value = int(gc_auto)
    except KeyError:
        gc_auto_value = DEFAULT_GC_AUTO

    if gc_auto_value == 0:
        # Auto GC is disabled
        return False

    # Check loose object count
    object_store = repo.object_store
    if not isinstance(object_store, DiskObjectStore):
        # Can't count loose objects on non-disk stores
        return False

    loose_count = object_store.count_loose_objects()
    if loose_count >= gc_auto_value:
        return True

    # Check pack file count
    try:
        gc_auto_pack_limit = config.get(b"gc", b"autoPackLimit")
        pack_limit = int(gc_auto_pack_limit)
    except KeyError:
        pack_limit = DEFAULT_GC_AUTO_PACK_LIMIT

    if pack_limit > 0:
        pack_count = object_store.count_pack_files()
        if pack_count >= pack_limit:
            return True

    return False
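
# The ways auto GC gets suppressed, sketched against an open repository
# ("repo" stands for any dulwich Repo instance):
#
#     import os
#
#     os.environ["GIT_AUTO_GC"] = "0"   # environment kill switch
#     assert should_run_gc(repo) is False
#     del os.environ["GIT_AUTO_GC"]
#
#     repo._autogc_disabled = True      # programmatic flag checked above
#     assert should_run_gc(repo) is False
#
#     # or persistently, via config: git config gc.auto 0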


def maybe_auto_gc(repo: "Repo", config: Optional["Config"] = None) -> bool:
    """Run automatic garbage collection if needed.

    Args:
      repo: Repository to potentially GC
      config: Configuration to use (defaults to repo config)

    Returns:
      True if GC was run, False otherwise
    """
    if not should_run_gc(repo, config):
        return False

    # Check for gc.log file - only for disk-based repos
    if not hasattr(repo, "controldir"):
        # For non-disk repos, just run GC without gc.log handling
        garbage_collect(repo, auto=True)
        return True

    gc_log_path = os.path.join(repo.controldir(), "gc.log")
    if os.path.exists(gc_log_path):
        # Check gc.logExpiry
        if config is None:
            config = repo.get_config()
        try:
            log_expiry = config.get(b"gc", b"logExpiry")
        except KeyError:
            # Default to 1 day
            expiry_seconds = 86400
        else:
            # Parse time value (simplified - just support days for now)
            if log_expiry.endswith((b".days", b".day")):
                days = int(log_expiry.split(b".")[0])
                expiry_seconds = days * 86400
            else:
                # Default to 1 day
                expiry_seconds = 86400

        stat_info = os.stat(gc_log_path)
        if time.time() - stat_info.st_mtime < expiry_seconds:
            # gc.log exists and is not expired - skip GC
            with open(gc_log_path, "rb") as f:
                print(f.read().decode("utf-8", errors="replace"))
            return False

    # TODO: Support gc.autoDetach to run in background
    # For now, run in foreground
    try:
        # Run GC with auto=True flag
        garbage_collect(repo, auto=True)
        # Remove gc.log on successful completion
        if os.path.exists(gc_log_path):
            try:
                os.unlink(gc_log_path)
            except FileNotFoundError:
                pass
        return True
    except OSError as e:
        # Write error to gc.log
        with open(gc_log_path, "wb") as f:
            f.write(f"Auto GC failed: {e}\n".encode())
        # Don't propagate the error - auto GC failures shouldn't break operations
        return False
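
# A typical call site sketch: invoke after an operation that may have created
# many loose objects (placeholder path):
#
#     from dulwich.repo import Repo
#
#     repo = Repo("/path/to/repo")
#     if maybe_auto_gc(repo):
#         print("auto gc ran")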