  1. """Git garbage collection implementation."""
  2. import logging
  3. import os
  4. import time
  5. from collections import deque
  6. from collections.abc import Callable
  7. from dataclasses import dataclass, field
  8. from typing import TYPE_CHECKING, Optional
  9. from dulwich.object_store import (
  10. BaseObjectStore,
  11. DiskObjectStore,
  12. )
  13. from dulwich.objects import Commit, ObjectID, Tag, Tree
  14. from dulwich.refs import RefsContainer
  15. if TYPE_CHECKING:
  16. from .config import Config
  17. from .repo import BaseRepo, Repo
  18. DEFAULT_GC_AUTO = 6700
  19. DEFAULT_GC_AUTO_PACK_LIMIT = 50
  20. @dataclass
  21. class GCStats:
  22. """Statistics from garbage collection."""
  23. pruned_objects: set[bytes] = field(default_factory=set)
  24. bytes_freed: int = 0
  25. packs_before: int = 0
  26. packs_after: int = 0
  27. loose_objects_before: int = 0
  28. loose_objects_after: int = 0
  29. def find_reachable_objects(
  30. object_store: BaseObjectStore,
  31. refs_container: RefsContainer,
  32. include_reflogs: bool = True,
  33. progress: Callable[[str], None] | None = None,
  34. ) -> set[bytes]:
  35. """Find all reachable objects in the repository.
  36. Args:
  37. object_store: Object store to search
  38. refs_container: Reference container
  39. include_reflogs: Whether to include reflog entries
  40. progress: Optional progress callback
  41. Returns:
  42. Set of reachable object SHAs
  43. """
  44. reachable = set()
  45. pending: deque[ObjectID] = deque()
  46. # Start with all refs
  47. for ref in refs_container.allkeys():
  48. try:
  49. sha = refs_container[ref] # This follows symbolic refs
  50. if sha and sha not in reachable:
  51. pending.append(sha)
  52. reachable.add(sha)
  53. except KeyError:
  54. # Broken ref
  55. if progress:
  56. progress(f"Warning: Broken ref {ref.decode('utf-8', 'replace')}")
  57. continue
  58. # TODO: Add reflog support when reflog functionality is available
  59. # Walk all reachable objects
  60. while pending:
  61. sha = pending.popleft()
  62. if progress:
  63. progress(f"Checking object {sha.decode('ascii', 'replace')}")
  64. try:
  65. obj = object_store[sha]
  66. except KeyError:
  67. continue
  68. # Add referenced objects
  69. if isinstance(obj, Commit):
  70. # Tree
  71. if obj.tree not in reachable:
  72. pending.append(obj.tree)
  73. reachable.add(obj.tree)
  74. # Parents
  75. for parent in obj.parents:
  76. if parent not in reachable:
  77. pending.append(parent)
  78. reachable.add(parent)
  79. elif isinstance(obj, Tree):
  80. # Tree entries
  81. for entry in obj.items():
  82. assert entry.sha is not None
  83. if entry.sha not in reachable:
  84. pending.append(entry.sha)
  85. reachable.add(entry.sha)
  86. elif isinstance(obj, Tag):
  87. # Tagged object
  88. if obj.object[1] not in reachable:
  89. pending.append(obj.object[1])
  90. reachable.add(obj.object[1])
  91. return reachable
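

# Usage sketch (not part of the original module; the repository path is a
# hypothetical placeholder): collect the reachable set for an on-disk repo
# and report its size.
#
#     from dulwich.repo import Repo
#
#     repo = Repo("/path/to/repo")
#     reachable = find_reachable_objects(repo.object_store, repo.refs,
#                                        progress=print)
#     print(f"{len(reachable)} reachable objects")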


def find_unreachable_objects(
    object_store: BaseObjectStore,
    refs_container: RefsContainer,
    include_reflogs: bool = True,
    progress: Callable[[str], None] | None = None,
) -> set[bytes]:
    """Find all unreachable objects in the repository.

    Args:
        object_store: Object store to search
        refs_container: Reference container
        include_reflogs: Whether to include reflog entries
        progress: Optional progress callback

    Returns:
        Set of unreachable object SHAs
    """
    reachable = find_reachable_objects(
        object_store, refs_container, include_reflogs, progress
    )

    unreachable = set()
    for sha in object_store:
        if sha not in reachable:
            unreachable.add(sha)

    return unreachable


def prune_unreachable_objects(
    object_store: DiskObjectStore,
    refs_container: RefsContainer,
    grace_period: int | None = None,
    dry_run: bool = False,
    progress: Callable[[str], None] | None = None,
) -> tuple[set[bytes], int]:
    """Remove unreachable objects from the repository.

    Args:
        object_store: Object store to prune
        refs_container: Reference container
        grace_period: Grace period in seconds (objects newer than this are kept)
        dry_run: If True, only report what would be deleted
        progress: Optional progress callback

    Returns:
        Tuple of (set of pruned object SHAs, total bytes freed)
    """
    unreachable = find_unreachable_objects(
        object_store, refs_container, progress=progress
    )

    pruned = set()
    bytes_freed = 0

    for sha in unreachable:
        try:
            obj = object_store[sha]

            # Check grace period
            if grace_period is not None:
                try:
                    mtime = object_store.get_object_mtime(sha)
                    age = time.time() - mtime
                    if age < grace_period:
                        if progress:
                            progress(
                                f"Keeping {sha.decode('ascii', 'replace')} (age: {age:.0f}s < grace period: {grace_period}s)"
                            )
                        continue
                except KeyError:
                    # Object not found, skip it
                    continue

            if progress:
                progress(f"Pruning {sha.decode('ascii', 'replace')}")

            # Calculate size before attempting deletion
            obj_size = len(obj.as_raw_string())

            if not dry_run:
                object_store.delete_loose_object(sha)

            # Only count as pruned if we get here (deletion succeeded or dry run)
            pruned.add(sha)
            bytes_freed += obj_size
        except KeyError:
            # Object already gone
            pass
        except OSError as e:
            # File system errors during deletion
            if progress:
                progress(f"Error pruning {sha.decode('ascii', 'replace')}: {e}")

    return pruned, bytes_freed
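

# Usage sketch (assumes `repo` is a dulwich Repo backed by a DiskObjectStore):
# a dry run that keeps anything younger than one hour. With dry_run=True the
# function only reports; nothing is deleted.
#
#     pruned, freed = prune_unreachable_objects(
#         repo.object_store, repo.refs, grace_period=3600, dry_run=True
#     )
#     print(f"would prune {len(pruned)} objects ({freed} bytes)")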


def garbage_collect(
    repo: "Repo",
    auto: bool = False,
    aggressive: bool = False,
    prune: bool = True,
    grace_period: int | None = 1209600,  # 2 weeks default
    dry_run: bool = False,
    progress: Callable[[str], None] | None = None,
) -> GCStats:
    """Run garbage collection on a repository.

    Args:
        repo: Repository to garbage collect
        auto: Whether this is an automatic gc
        aggressive: Whether to use aggressive settings
        prune: Whether to prune unreachable objects
        grace_period: Grace period for pruning in seconds
        dry_run: If True, only report what would be done
        progress: Optional progress callback

    Returns:
        GCStats object with garbage collection statistics
    """
    stats = GCStats()

    object_store = repo.object_store
    refs_container = repo.refs

    # Count initial state
    stats.packs_before = len(list(object_store.packs))
    stats.loose_objects_before = object_store.count_loose_objects()

    # Find unreachable objects to exclude from repacking
    unreachable_to_prune = set()
    if prune:
        if progress:
            progress("Finding unreachable objects")
        unreachable = find_unreachable_objects(
            object_store, refs_container, progress=progress
        )

        # Apply grace period check
        for sha in unreachable:
            try:
                if grace_period is not None:
                    try:
                        mtime = object_store.get_object_mtime(sha)
                        age = time.time() - mtime
                        if age < grace_period:
                            if progress:
                                progress(
                                    f"Keeping {sha.decode('ascii', 'replace')} (age: {age:.0f}s < grace period: {grace_period}s)"
                                )
                            continue
                    except KeyError:
                        # Object not found, skip it
                        continue

                unreachable_to_prune.add(sha)
                obj = object_store[sha]
                stats.bytes_freed += len(obj.as_raw_string())
            except KeyError:
                pass

        stats.pruned_objects = unreachable_to_prune

    # Pack refs
    if progress:
        progress("Packing references")
    if not dry_run:
        repo.refs.pack_refs()

    # Delete loose unreachable objects
    if prune and not dry_run:
        for sha in unreachable_to_prune:
            if object_store.contains_loose(sha):
                try:
                    object_store.delete_loose_object(sha)
                except OSError:
                    pass

    # Repack everything, excluding unreachable objects
    # This handles both loose object packing and pack consolidation
    if progress:
        progress("Repacking repository")
    if not dry_run:
        if prune and unreachable_to_prune:
            # Repack excluding unreachable objects
            object_store.repack(exclude=unreachable_to_prune, progress=progress)
        else:
            # Normal repack
            object_store.repack(progress=progress)

    # Prune orphaned temporary files
    if progress:
        progress("Pruning temporary files")
    if not dry_run:
        object_store.prune(grace_period=grace_period)

    # Count final state
    stats.packs_after = len(list(object_store.packs))
    stats.loose_objects_after = object_store.count_loose_objects()

    return stats
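

# Usage sketch (assumes `repo` is a dulwich Repo): a full manual collection
# with the default two-week grace period, reporting the before/after counts
# from the returned GCStats.
#
#     stats = garbage_collect(repo, prune=True, progress=print)
#     print(f"packs: {stats.packs_before} -> {stats.packs_after}")
#     print(f"loose: {stats.loose_objects_before} -> {stats.loose_objects_after}")
#     print(f"freed: {stats.bytes_freed} bytes")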


def should_run_gc(repo: "BaseRepo", config: Optional["Config"] = None) -> bool:
    """Check if automatic garbage collection should run.

    Args:
        repo: Repository to check
        config: Configuration to use (defaults to repo config)

    Returns:
        True if GC should run, False otherwise
    """
    # Check environment variable first
    if os.environ.get("GIT_AUTO_GC") == "0":
        return False

    # Check programmatic disable flag
    if getattr(repo, "_autogc_disabled", False):
        return False

    if config is None:
        config = repo.get_config()

    # Check if auto GC is disabled
    try:
        gc_auto = config.get(b"gc", b"auto")
        gc_auto_value = int(gc_auto)
    except KeyError:
        gc_auto_value = DEFAULT_GC_AUTO

    if gc_auto_value == 0:
        # Auto GC is disabled
        return False

    # Check loose object count
    object_store = repo.object_store
    if not isinstance(object_store, DiskObjectStore):
        # Can't count loose objects on non-disk stores
        return False

    loose_count = object_store.count_loose_objects()
    if loose_count >= gc_auto_value:
        return True

    # Check pack file count
    try:
        gc_auto_pack_limit = config.get(b"gc", b"autoPackLimit")
        pack_limit = int(gc_auto_pack_limit)
    except KeyError:
        pack_limit = DEFAULT_GC_AUTO_PACK_LIMIT

    if pack_limit > 0:
        pack_count = object_store.count_pack_files()
        if pack_count >= pack_limit:
            return True

    return False
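

# Usage sketch (assumes `repo` is a dulwich Repo): consult the auto-GC
# heuristics (gc.auto, default 6700 loose objects; gc.autoPackLimit, default
# 50 packs) before deciding to collect.
#
#     if should_run_gc(repo):
#         garbage_collect(repo, auto=True)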


def maybe_auto_gc(
    repo: "Repo",
    config: Optional["Config"] = None,
    progress: Callable[[str], None] | None = None,
) -> bool:
    """Run automatic garbage collection if needed.

    Args:
        repo: Repository to potentially GC
        config: Configuration to use (defaults to repo config)
        progress: Optional progress reporting callback

    Returns:
        True if GC was run, False otherwise
    """
    if not should_run_gc(repo, config):
        return False

    # Check for gc.log file - only for disk-based repos
    if not hasattr(repo, "controldir"):
        # For non-disk repos, just run GC without gc.log handling
        garbage_collect(repo, auto=True, progress=progress)
        return True

    gc_log_path = os.path.join(repo.controldir(), "gc.log")
    if os.path.exists(gc_log_path):
        # Check gc.logExpiry
        if config is None:
            config = repo.get_config()
        try:
            log_expiry = config.get(b"gc", b"logExpiry")
        except KeyError:
            # Default to 1 day
            expiry_seconds = 86400
        else:
            # Parse time value (simplified - just support days for now)
            if log_expiry.endswith((b".days", b".day")):
                days = int(log_expiry.split(b".")[0])
                expiry_seconds = days * 86400
            else:
                # Default to 1 day
                expiry_seconds = 86400

        stat_info = os.stat(gc_log_path)
        if time.time() - stat_info.st_mtime < expiry_seconds:
            # gc.log exists and is not expired - skip GC
            with open(gc_log_path, "rb") as f:
                logging.info(
                    "gc.log content: %s", f.read().decode("utf-8", errors="replace")
                )
            return False

    # TODO: Support gc.autoDetach to run in background
    # For now, run in foreground
    try:
        # Run GC with auto=True flag
        garbage_collect(repo, auto=True, progress=progress)
        # Remove gc.log on successful completion
        if os.path.exists(gc_log_path):
            try:
                os.unlink(gc_log_path)
            except FileNotFoundError:
                pass
        return True
    except OSError as e:
        # Write error to gc.log
        with open(gc_log_path, "wb") as f:
            f.write(f"Auto GC failed: {e}\n".encode())
        # Don't propagate the error - auto GC failures shouldn't break operations
        return False
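

# Minimal demonstration sketch, not part of the original module: run the
# auto-GC check against a repository path given on the command line.
if __name__ == "__main__":
    import sys

    from dulwich.repo import Repo

    demo_repo = Repo(sys.argv[1])  # hypothetical: path supplied by the caller
    if maybe_auto_gc(demo_repo, progress=print):
        print("auto GC ran")
    else:
        print("auto GC skipped")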