  1. """Git garbage collection implementation."""
  2. import collections
  3. import os
  4. import time
  5. from dataclasses import dataclass, field
  6. from typing import TYPE_CHECKING, Optional
  7. from dulwich.object_store import (
  8. BaseObjectStore,
  9. DiskObjectStore,
  10. PackBasedObjectStore,
  11. )
  12. from dulwich.objects import Commit, ObjectID, Tag, Tree
  13. from dulwich.refs import RefsContainer
  14. if TYPE_CHECKING:
  15. from .config import Config
  16. from .repo import BaseRepo
  17. DEFAULT_GC_AUTO = 6700
  18. DEFAULT_GC_AUTO_PACK_LIMIT = 50
  19. @dataclass
  20. class GCStats:
  21. """Statistics from garbage collection."""
  22. pruned_objects: set[bytes] = field(default_factory=set)
  23. bytes_freed: int = 0
  24. packs_before: int = 0
  25. packs_after: int = 0
  26. loose_objects_before: int = 0
  27. loose_objects_after: int = 0


def find_reachable_objects(
    object_store: BaseObjectStore,
    refs_container: RefsContainer,
    include_reflogs: bool = True,
    progress=None,
) -> set[bytes]:
    """Find all reachable objects in the repository.

    Args:
        object_store: Object store to search
        refs_container: Reference container
        include_reflogs: Whether to include reflog entries
        progress: Optional progress callback

    Returns:
        Set of reachable object SHAs
    """
    reachable = set()
    pending: collections.deque[ObjectID] = collections.deque()

    # Start with all refs
    for ref in refs_container.allkeys():
        try:
            sha = refs_container[ref]  # This follows symbolic refs
            if sha and sha not in reachable:
                pending.append(sha)
                reachable.add(sha)
        except KeyError:
            # Broken ref
            if progress:
                progress(f"Warning: Broken ref {ref.decode('utf-8', 'replace')}")
            continue

    # TODO: Add reflog support when reflog functionality is available

    # Walk all reachable objects
    while pending:
        sha = pending.popleft()
        if progress:
            progress(f"Checking object {sha.decode('ascii', 'replace')}")
        try:
            obj = object_store[sha]
        except KeyError:
            continue

        # Add referenced objects
        if isinstance(obj, Commit):
            # Tree
            if obj.tree not in reachable:
                pending.append(obj.tree)
                reachable.add(obj.tree)
            # Parents
            for parent in obj.parents:
                if parent not in reachable:
                    pending.append(parent)
                    reachable.add(parent)
        elif isinstance(obj, Tree):
            # Tree entries
            for entry in obj.items():
                if entry.sha not in reachable:
                    pending.append(entry.sha)
                    reachable.add(entry.sha)
        elif isinstance(obj, Tag):
            # Tagged object
            if obj.object[1] not in reachable:
                pending.append(obj.object[1])
                reachable.add(obj.object[1])

    return reachable
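
# Usage sketch (illustrative, not part of this module's API): collecting the
# live object set for an on-disk repository. The walk starts from every ref
# and follows commit -> tree -> blob and tag -> target edges.
# "/path/to/repo" is a placeholder.
#
#     from dulwich.repo import Repo
#
#     repo = Repo("/path/to/repo")
#     live = find_reachable_objects(repo.object_store, repo.refs, progress=print)
#     print(f"{len(live)} reachable objects")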


def find_unreachable_objects(
    object_store: BaseObjectStore,
    refs_container: RefsContainer,
    include_reflogs: bool = True,
    progress=None,
) -> set[bytes]:
    """Find all unreachable objects in the repository.

    Args:
        object_store: Object store to search
        refs_container: Reference container
        include_reflogs: Whether to include reflog entries
        progress: Optional progress callback

    Returns:
        Set of unreachable object SHAs
    """
    reachable = find_reachable_objects(
        object_store, refs_container, include_reflogs, progress
    )

    unreachable = set()
    for sha in object_store:
        if sha not in reachable:
            unreachable.add(sha)

    return unreachable
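
# Sketch: since this is simply "all SHAs in the store minus the reachable
# set", a freshly packed repository with no dangling history should report
# an empty set ("/path/to/repo" is a placeholder):
#
#     from dulwich.repo import Repo
#
#     repo = Repo("/path/to/repo")
#     dangling = find_unreachable_objects(repo.object_store, repo.refs)
#     print(f"{len(dangling)} unreachable objects")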


def prune_unreachable_objects(
    object_store: PackBasedObjectStore,
    refs_container: RefsContainer,
    grace_period: Optional[int] = None,
    dry_run: bool = False,
    progress=None,
) -> tuple[set[bytes], int]:
    """Remove unreachable objects from the repository.

    Args:
        object_store: Object store to prune
        refs_container: Reference container
        grace_period: Grace period in seconds (objects newer than this are kept)
        dry_run: If True, only report what would be deleted
        progress: Optional progress callback

    Returns:
        Tuple of (set of pruned object SHAs, total bytes freed)
    """
    unreachable = find_unreachable_objects(
        object_store, refs_container, progress=progress
    )

    pruned = set()
    bytes_freed = 0

    for sha in unreachable:
        try:
            obj = object_store[sha]

            # Check grace period
            if grace_period is not None:
                try:
                    mtime = object_store.get_object_mtime(sha)
                    age = time.time() - mtime
                    if age < grace_period:
                        if progress:
                            progress(
                                f"Keeping {sha.decode('ascii', 'replace')} (age: {age:.0f}s < grace period: {grace_period}s)"
                            )
                        continue
                except KeyError:
                    # Object not found, skip it
                    continue

            if progress:
                progress(f"Pruning {sha.decode('ascii', 'replace')}")

            # Calculate size before attempting deletion
            obj_size = len(obj.as_raw_string())

            if not dry_run:
                object_store.delete_loose_object(sha)

            # Only count as pruned if we get here (deletion succeeded or dry run)
            pruned.add(sha)
            bytes_freed += obj_size
        except KeyError:
            # Object already gone
            pass
        except OSError as e:
            # File system errors during deletion
            if progress:
                progress(f"Error pruning {sha.decode('ascii', 'replace')}: {e}")

    return pruned, bytes_freed
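
# Sketch of a safe dry run (path is a placeholder): with dry_run=True nothing
# is deleted, but the return value shows what a real prune with the same
# grace period would remove:
#
#     from dulwich.repo import Repo
#
#     repo = Repo("/path/to/repo")
#     pruned, freed = prune_unreachable_objects(
#         repo.object_store, repo.refs, grace_period=86400, dry_run=True
#     )
#     print(f"would prune {len(pruned)} objects, freeing {freed} bytes")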


def garbage_collect(
    repo,
    auto: bool = False,
    aggressive: bool = False,
    prune: bool = True,
    grace_period: Optional[int] = 1209600,  # 2 weeks default
    dry_run: bool = False,
    progress=None,
) -> GCStats:
    """Run garbage collection on a repository.

    Args:
        repo: Repository to garbage collect
        auto: Whether this is an automatic gc
        aggressive: Whether to use aggressive settings
        prune: Whether to prune unreachable objects
        grace_period: Grace period for pruning in seconds
        dry_run: If True, only report what would be done
        progress: Optional progress callback

    Returns:
        GCStats object with garbage collection statistics
    """
    stats = GCStats()

    object_store = repo.object_store
    refs_container = repo.refs

    # Count initial state
    stats.packs_before = len(list(object_store.packs))
    stats.loose_objects_before = object_store.count_loose_objects()

    # Find unreachable objects to exclude from repacking
    unreachable_to_prune = set()
    if prune:
        if progress:
            progress("Finding unreachable objects")
        unreachable = find_unreachable_objects(
            object_store, refs_container, progress=progress
        )

        # Apply grace period check
        for sha in unreachable:
            try:
                if grace_period is not None:
                    try:
                        mtime = object_store.get_object_mtime(sha)
                        age = time.time() - mtime
                        if age < grace_period:
                            if progress:
                                progress(
                                    f"Keeping {sha.decode('ascii', 'replace')} (age: {age:.0f}s < grace period: {grace_period}s)"
                                )
                            continue
                    except KeyError:
                        # Object not found, skip it
                        continue

                unreachable_to_prune.add(sha)
                obj = object_store[sha]
                stats.bytes_freed += len(obj.as_raw_string())
            except KeyError:
                pass

        stats.pruned_objects = unreachable_to_prune

    # Pack refs
    if progress:
        progress("Packing references")
    if not dry_run:
        repo.refs.pack_refs()

    # Delete loose unreachable objects
    if prune and not dry_run:
        for sha in unreachable_to_prune:
            if object_store.contains_loose(sha):
                try:
                    object_store.delete_loose_object(sha)
                except OSError:
                    pass

    # Repack everything, excluding unreachable objects
    # This handles both loose object packing and pack consolidation
    if progress:
        progress("Repacking repository")
    if not dry_run:
        if prune and unreachable_to_prune:
            # Repack excluding unreachable objects
            object_store.repack(exclude=unreachable_to_prune)
        else:
            # Normal repack
            object_store.repack()

    # Prune orphaned temporary files
    if progress:
        progress("Pruning temporary files")
    if not dry_run:
        object_store.prune(grace_period=grace_period)

    # Count final state
    stats.packs_after = len(list(object_store.packs))
    stats.loose_objects_after = object_store.count_loose_objects()

    return stats
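
# Sketch: a manual collection with the default two-week grace period,
# reporting before/after counts from the returned GCStats (path is a
# placeholder):
#
#     from dulwich.repo import Repo
#
#     repo = Repo("/path/to/repo")
#     stats = garbage_collect(repo, prune=True, progress=print)
#     print(f"packs: {stats.packs_before} -> {stats.packs_after}")
#     print(f"loose: {stats.loose_objects_before} -> {stats.loose_objects_after}")
#     print(f"freed {stats.bytes_freed} bytes in {len(stats.pruned_objects)} objects")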


def should_run_gc(repo: "BaseRepo", config: Optional["Config"] = None) -> bool:
    """Check if automatic garbage collection should run.

    Args:
        repo: Repository to check
        config: Configuration to use (defaults to repo config)

    Returns:
        True if GC should run, False otherwise
    """
    if config is None:
        config = repo.get_config()

    # Check if auto GC is disabled
    try:
        gc_auto = config.get(b"gc", b"auto")
        gc_auto_value = int(gc_auto)
    except KeyError:
        gc_auto_value = DEFAULT_GC_AUTO

    if gc_auto_value == 0:
        # Auto GC is disabled
        return False

    # Check loose object count
    object_store = repo.object_store
    if not isinstance(object_store, DiskObjectStore):
        # Can't count loose objects on non-disk stores
        return False

    loose_count = object_store.count_loose_objects()
    if loose_count >= gc_auto_value:
        return True

    # Check pack file count
    try:
        gc_auto_pack_limit = config.get(b"gc", b"autoPackLimit")
        pack_limit = int(gc_auto_pack_limit)
    except KeyError:
        pack_limit = DEFAULT_GC_AUTO_PACK_LIMIT

    if pack_limit > 0:
        pack_count = object_store.count_pack_files()
        if pack_count >= pack_limit:
            return True

    return False
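
# Sketch: the thresholds mirror git's gc.auto and gc.autoPackLimit settings,
# so lowering gc.auto makes the check trip sooner. Assumes dulwich's Config
# set()/write_to_path() API; the path is a placeholder:
#
#     from dulwich.repo import Repo
#
#     repo = Repo("/path/to/repo")
#     config = repo.get_config()
#     config.set((b"gc",), b"auto", b"100")  # trip after 100 loose objects
#     config.write_to_path()
#     print(should_run_gc(repo))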


def maybe_auto_gc(repo: "BaseRepo", config: Optional["Config"] = None) -> bool:
    """Run automatic garbage collection if needed.

    Args:
        repo: Repository to potentially GC
        config: Configuration to use (defaults to repo config)

    Returns:
        True if GC was run, False otherwise
    """
    if not should_run_gc(repo, config):
        return False

    # Check for gc.log file - only for disk-based repos
    if not hasattr(repo, "controldir"):
        # For non-disk repos, just run GC without gc.log handling
        garbage_collect(repo, auto=True)
        return True

    gc_log_path = os.path.join(repo.controldir(), "gc.log")
    if os.path.exists(gc_log_path):
        # Check gc.logExpiry
        if config is None:
            config = repo.get_config()
        try:
            log_expiry = config.get(b"gc", b"logExpiry")
        except KeyError:
            # Default to 1 day
            expiry_seconds = 86400
        else:
            # Parse time value (simplified - just support days for now)
            if log_expiry.endswith((b".days", b".day")):
                days = int(log_expiry.split(b".")[0])
                expiry_seconds = days * 86400
            else:
                # Default to 1 day
                expiry_seconds = 86400

        stat_info = os.stat(gc_log_path)
        if time.time() - stat_info.st_mtime < expiry_seconds:
            # gc.log exists and is not expired - skip GC
            with open(gc_log_path, "rb") as f:
                print(f.read().decode("utf-8", errors="replace"))
            return False

    # TODO: Support gc.autoDetach to run in background
    # For now, run in foreground
    try:
        # Run GC with auto=True flag
        garbage_collect(repo, auto=True)
        # Remove gc.log on successful completion
        if os.path.exists(gc_log_path):
            try:
                os.unlink(gc_log_path)
            except FileNotFoundError:
                pass
        return True
    except OSError as e:
        # Write error to gc.log
        with open(gc_log_path, "wb") as f:
            f.write(f"Auto GC failed: {e}\n".encode())
        # Don't propagate the error - auto GC failures shouldn't break operations
        return False
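
# Sketch: how a write-heavy operation might opt into auto-GC, mirroring
# git's habit of running "gc --auto" after commands that create objects
# (the commit call is only illustrative; the path is a placeholder):
#
#     from dulwich.repo import Repo
#
#     repo = Repo("/path/to/repo")
#     repo.do_commit(b"import data", committer=b"A U Thor <author@example.com>")
#     if maybe_auto_gc(repo):
#         print("auto GC ran")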