gc.py
  1. """Git garbage collection implementation."""
  2. import collections
  3. import logging
  4. import os
  5. import time
  6. from dataclasses import dataclass, field
  7. from typing import TYPE_CHECKING, Callable, Optional
  8. from dulwich.object_store import (
  9. BaseObjectStore,
  10. DiskObjectStore,
  11. )
  12. from dulwich.objects import Commit, ObjectID, Tag, Tree
  13. from dulwich.refs import RefsContainer
  14. if TYPE_CHECKING:
  15. from .config import Config
  16. from .repo import BaseRepo, Repo
  17. DEFAULT_GC_AUTO = 6700
  18. DEFAULT_GC_AUTO_PACK_LIMIT = 50
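# These defaults match C git: `gc.auto` defaults to 6700 loose objects and
# `gc.autoPackLimit` to 50 pack files before automatic gc kicks in.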


@dataclass
class GCStats:
    """Statistics from garbage collection."""

    pruned_objects: set[bytes] = field(default_factory=set)
    bytes_freed: int = 0
    packs_before: int = 0
    packs_after: int = 0
    loose_objects_before: int = 0
    loose_objects_after: int = 0


def find_reachable_objects(
    object_store: BaseObjectStore,
    refs_container: RefsContainer,
    include_reflogs: bool = True,
    progress: Optional[Callable[[str], None]] = None,
) -> set[bytes]:
    """Find all reachable objects in the repository.

    Args:
        object_store: Object store to search
        refs_container: Reference container
        include_reflogs: Whether to include reflog entries
        progress: Optional progress callback

    Returns:
        Set of reachable object SHAs
    """
    reachable = set()
    pending: collections.deque[ObjectID] = collections.deque()

    # Start with all refs
    for ref in refs_container.allkeys():
        try:
            sha = refs_container[ref]  # This follows symbolic refs
            if sha and sha not in reachable:
                pending.append(sha)
                reachable.add(sha)
        except KeyError:
            # Broken ref
            if progress:
                progress(f"Warning: Broken ref {ref.decode('utf-8', 'replace')}")
            continue

    # TODO: Add reflog support when reflog functionality is available

    # Walk all reachable objects (breadth-first over the object graph)
    while pending:
        sha = pending.popleft()
        if progress:
            progress(f"Checking object {sha.decode('ascii', 'replace')}")
        try:
            obj = object_store[sha]
        except KeyError:
            continue

        # Add referenced objects
        if isinstance(obj, Commit):
            # Tree
            if obj.tree not in reachable:
                pending.append(obj.tree)
                reachable.add(obj.tree)
            # Parents
            for parent in obj.parents:
                if parent not in reachable:
                    pending.append(parent)
                    reachable.add(parent)
        elif isinstance(obj, Tree):
            # Tree entries
            for entry in obj.items():
                assert entry.sha is not None
                if entry.sha not in reachable:
                    pending.append(entry.sha)
                    reachable.add(entry.sha)
        elif isinstance(obj, Tag):
            # Tagged object; Tag.object is a (type, sha) pair, so index 1
            # is the SHA of the object the tag points at
            if obj.object[1] not in reachable:
                pending.append(obj.object[1])
                reachable.add(obj.object[1])

    return reachable
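

# A minimal usage sketch (assumes an on-disk repository at "/tmp/repo"):
#
#     from dulwich.repo import Repo
#     repo = Repo("/tmp/repo")
#     reachable = find_reachable_objects(repo.object_store, repo.refs)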


def find_unreachable_objects(
    object_store: BaseObjectStore,
    refs_container: RefsContainer,
    include_reflogs: bool = True,
    progress: Optional[Callable[[str], None]] = None,
) -> set[bytes]:
    """Find all unreachable objects in the repository.

    Args:
        object_store: Object store to search
        refs_container: Reference container
        include_reflogs: Whether to include reflog entries
        progress: Optional progress callback

    Returns:
        Set of unreachable object SHAs
    """
    reachable = find_reachable_objects(
        object_store, refs_container, include_reflogs, progress
    )

    unreachable = set()
    for sha in object_store:
        if sha not in reachable:
            unreachable.add(sha)

    return unreachable


def prune_unreachable_objects(
    object_store: DiskObjectStore,
    refs_container: RefsContainer,
    grace_period: Optional[int] = None,
    dry_run: bool = False,
    progress: Optional[Callable[[str], None]] = None,
) -> tuple[set[bytes], int]:
    """Remove unreachable objects from the repository.

    Args:
        object_store: Object store to prune
        refs_container: Reference container
        grace_period: Grace period in seconds (objects newer than this are kept)
        dry_run: If True, only report what would be deleted
        progress: Optional progress callback

    Returns:
        Tuple of (set of pruned object SHAs, total bytes freed)
    """
    unreachable = find_unreachable_objects(
        object_store, refs_container, progress=progress
    )

    pruned = set()
    bytes_freed = 0

    for sha in unreachable:
        try:
            obj = object_store[sha]

            # Check grace period
            if grace_period is not None:
                try:
                    mtime = object_store.get_object_mtime(sha)
                    age = time.time() - mtime
                    if age < grace_period:
                        if progress:
                            progress(
                                f"Keeping {sha.decode('ascii', 'replace')} "
                                f"(age: {age:.0f}s < grace period: {grace_period}s)"
                            )
                        continue
                except KeyError:
                    # Object not found, skip it
                    continue

            if progress:
                progress(f"Pruning {sha.decode('ascii', 'replace')}")

            # Calculate size before attempting deletion
            obj_size = len(obj.as_raw_string())

            if not dry_run:
                object_store.delete_loose_object(sha)

            # Only count as pruned if we get here (deletion succeeded or dry run)
            pruned.add(sha)
            bytes_freed += obj_size
        except KeyError:
            # Object already gone
            pass
        except OSError as e:
            # File system errors during deletion
            if progress:
                progress(f"Error pruning {sha.decode('ascii', 'replace')}: {e}")

    return pruned, bytes_freed
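

# A dry-run sketch (hypothetical `repo`; reports what would be pruned without
# deleting anything):
#
#     pruned, freed = prune_unreachable_objects(
#         repo.object_store, repo.refs, grace_period=86400, dry_run=True,
#         progress=print,
#     )
#     print(f"would prune {len(pruned)} objects ({freed} bytes)")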


def garbage_collect(
    repo: "Repo",
    auto: bool = False,
    aggressive: bool = False,
    prune: bool = True,
    grace_period: Optional[int] = 1209600,  # 2 weeks default
    dry_run: bool = False,
    progress: Optional[Callable[[str], None]] = None,
) -> GCStats:
    """Run garbage collection on a repository.

    Args:
        repo: Repository to garbage collect
        auto: Whether this is an automatic gc
        aggressive: Whether to use aggressive settings
        prune: Whether to prune unreachable objects
        grace_period: Grace period for pruning in seconds
        dry_run: If True, only report what would be done
        progress: Optional progress callback

    Returns:
        GCStats object with garbage collection statistics
    """
    stats = GCStats()

    object_store = repo.object_store
    refs_container = repo.refs

    # Count initial state
    stats.packs_before = len(list(object_store.packs))
    stats.loose_objects_before = object_store.count_loose_objects()

    # Find unreachable objects to exclude from repacking
    unreachable_to_prune = set()
    if prune:
        if progress:
            progress("Finding unreachable objects")
        unreachable = find_unreachable_objects(
            object_store, refs_container, progress=progress
        )

        # Apply grace period check
        for sha in unreachable:
            try:
                if grace_period is not None:
                    try:
                        mtime = object_store.get_object_mtime(sha)
                        age = time.time() - mtime
                        if age < grace_period:
                            if progress:
                                progress(
                                    f"Keeping {sha.decode('ascii', 'replace')} "
                                    f"(age: {age:.0f}s < grace period: {grace_period}s)"
                                )
                            continue
                    except KeyError:
                        # Object not found, skip it
                        continue

                unreachable_to_prune.add(sha)
                obj = object_store[sha]
                stats.bytes_freed += len(obj.as_raw_string())
            except KeyError:
                pass

        stats.pruned_objects = unreachable_to_prune

    # Pack refs
    if progress:
        progress("Packing references")
    if not dry_run:
        repo.refs.pack_refs()

    # Delete loose unreachable objects
    if prune and not dry_run:
        for sha in unreachable_to_prune:
            if object_store.contains_loose(sha):
                try:
                    object_store.delete_loose_object(sha)
                except OSError:
                    pass

    # Repack everything, excluding unreachable objects
    # This handles both loose object packing and pack consolidation
    if progress:
        progress("Repacking repository")
    if not dry_run:
        if prune and unreachable_to_prune:
            # Repack excluding unreachable objects
            object_store.repack(exclude=unreachable_to_prune, progress=progress)
        else:
            # Normal repack
            object_store.repack(progress=progress)

    # Prune orphaned temporary files
    if progress:
        progress("Pruning temporary files")
    if not dry_run:
        object_store.prune(grace_period=grace_period)

    # Count final state
    stats.packs_after = len(list(object_store.packs))
    stats.loose_objects_after = object_store.count_loose_objects()

    return stats
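

# A manual full-gc sketch (assumes `repo` is an open Repo; grace_period=0
# prunes unreachable objects regardless of age):
#
#     stats = garbage_collect(repo, prune=True, grace_period=0, progress=print)
#     print(f"packs: {stats.packs_before} -> {stats.packs_after}, "
#           f"freed {stats.bytes_freed} bytes")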


def should_run_gc(repo: "BaseRepo", config: Optional["Config"] = None) -> bool:
    """Check if automatic garbage collection should run.

    Args:
        repo: Repository to check
        config: Configuration to use (defaults to repo config)

    Returns:
        True if GC should run, False otherwise
    """
    # Check environment variable first
    if os.environ.get("GIT_AUTO_GC") == "0":
        return False

    # Check programmatic disable flag
    if getattr(repo, "_autogc_disabled", False):
        return False

    if config is None:
        config = repo.get_config()

    # Check if auto GC is disabled
    try:
        gc_auto = config.get(b"gc", b"auto")
        gc_auto_value = int(gc_auto)
    except KeyError:
        gc_auto_value = DEFAULT_GC_AUTO

    if gc_auto_value == 0:
        # Auto GC is disabled
        return False

    # Check loose object count
    object_store = repo.object_store
    if not isinstance(object_store, DiskObjectStore):
        # Can't count loose objects on non-disk stores
        return False

    loose_count = object_store.count_loose_objects()
    if loose_count >= gc_auto_value:
        return True

    # Check pack file count
    try:
        gc_auto_pack_limit = config.get(b"gc", b"autoPackLimit")
        pack_limit = int(gc_auto_pack_limit)
    except KeyError:
        pack_limit = DEFAULT_GC_AUTO_PACK_LIMIT

    if pack_limit > 0:
        pack_count = object_store.count_pack_files()
        if pack_count >= pack_limit:
            return True

    return False


def maybe_auto_gc(
    repo: "Repo",
    config: Optional["Config"] = None,
    progress: Optional[Callable[[str], None]] = None,
) -> bool:
    """Run automatic garbage collection if needed.

    Args:
        repo: Repository to potentially GC
        config: Configuration to use (defaults to repo config)
        progress: Optional progress reporting callback

    Returns:
        True if GC was run, False otherwise
    """
    if not should_run_gc(repo, config):
        return False

    # Check for gc.log file - only for disk-based repos
    if not hasattr(repo, "controldir"):
        # For non-disk repos, just run GC without gc.log handling
        garbage_collect(repo, auto=True, progress=progress)
        return True

    gc_log_path = os.path.join(repo.controldir(), "gc.log")
    if os.path.exists(gc_log_path):
        # Check gc.logExpiry
        if config is None:
            config = repo.get_config()
        try:
            log_expiry = config.get(b"gc", b"logExpiry")
        except KeyError:
            # Default to 1 day
            expiry_seconds = 86400
        else:
            # Parse time value (simplified - just support days for now)
            if log_expiry.endswith((b".days", b".day")):
                days = int(log_expiry.split(b".")[0])
                expiry_seconds = days * 86400
            else:
                # Default to 1 day
                expiry_seconds = 86400

        stat_info = os.stat(gc_log_path)
        if time.time() - stat_info.st_mtime < expiry_seconds:
            # gc.log exists and is not expired - skip GC
            with open(gc_log_path, "rb") as f:
                logging.info(
                    "gc.log content: %s", f.read().decode("utf-8", errors="replace")
                )
            return False

    # TODO: Support gc.autoDetach to run in background
    # For now, run in foreground
    try:
        # Run GC with auto=True flag
        garbage_collect(repo, auto=True, progress=progress)
        # Remove gc.log on successful completion
        if os.path.exists(gc_log_path):
            try:
                os.unlink(gc_log_path)
            except FileNotFoundError:
                pass
        return True
    except OSError as e:
        # Write error to gc.log
        with open(gc_log_path, "wb") as f:
            f.write(f"Auto GC failed: {e}\n".encode())
        # Don't propagate the error - auto GC failures shouldn't break operations
        return False
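

# A minimal sketch of wiring auto-gc into a write path (hypothetical call
# site; real callers would invoke this after operations that create objects):
#
#     from dulwich.repo import Repo
#     repo = Repo("/tmp/repo")
#     if maybe_auto_gc(repo, progress=print):
#         print("auto gc ran")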