# commit_graph.py -- Git commit graph file format support
# Copyright (C) 2024 Jelmer Vernooij <jelmer@jelmer.uk>
#
# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
# General Public License as published by the Free Software Foundation; version 2.0
# or (at your option) any later version. You can redistribute it and/or
# modify it under the terms of either of these two licenses.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# You should have received a copy of the licenses; if not, see
# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
# License, Version 2.0.
#
  21. """Git commit graph file format support.
  22. Git's commit graph files store commit metadata and generation numbers
  23. for faster graph traversal operations like merge-base computation.
  24. The commit graph format is documented at:
  25. https://git-scm.com/docs/gitformat-commit-graph
  26. """

__all__ = [
    "CHUNK_BASE_GRAPHS_LIST",
    "CHUNK_BLOOM_FILTER_DATA",
    "CHUNK_BLOOM_FILTER_INDEX",
    "CHUNK_COMMIT_DATA",
    "CHUNK_EXTRA_EDGE_LIST",
    "CHUNK_GENERATION_DATA",
    "CHUNK_GENERATION_DATA_OVERFLOW",
    "CHUNK_OID_FANOUT",
    "CHUNK_OID_LOOKUP",
    "COMMIT_GRAPH_SIGNATURE",
    "COMMIT_GRAPH_VERSION",
    "GENERATION_NUMBER_INFINITY",
    "GENERATION_NUMBER_V1_MAX",
    "GENERATION_NUMBER_ZERO",
    "GRAPH_EXTRA_EDGES_NEEDED",
    "GRAPH_LAST_EDGE",
    "GRAPH_PARENT_MISSING",
    "GRAPH_PARENT_NONE",
    "HASH_VERSION_SHA1",
    "HASH_VERSION_SHA256",
    "CommitGraph",
    "CommitGraphChunk",
    "CommitGraphEntry",
    "find_commit_graph_file",
    "generate_commit_graph",
    "get_reachable_commits",
    "read_commit_graph",
    "write_commit_graph",
]

import os
import struct
from collections.abc import Iterator, Sequence
from typing import TYPE_CHECKING, BinaryIO

from .file import _GitFile

if TYPE_CHECKING:
    from .object_store import BaseObjectStore

from .objects import Commit, ObjectID, RawObjectID, hex_to_sha, sha_to_hex

# File format constants
COMMIT_GRAPH_SIGNATURE = b"CGPH"
COMMIT_GRAPH_VERSION = 1
HASH_VERSION_SHA1 = 1
HASH_VERSION_SHA256 = 2

# Chunk IDs
CHUNK_OID_FANOUT = b"OIDF"
CHUNK_OID_LOOKUP = b"OIDL"
CHUNK_COMMIT_DATA = b"CDAT"
CHUNK_GENERATION_DATA = b"GDA2"
CHUNK_GENERATION_DATA_OVERFLOW = b"GDO2"
CHUNK_EXTRA_EDGE_LIST = b"EDGE"
CHUNK_BLOOM_FILTER_INDEX = b"BIDX"
CHUNK_BLOOM_FILTER_DATA = b"BDAT"
CHUNK_BASE_GRAPHS_LIST = b"BASE"

# Generation number constants
GENERATION_NUMBER_INFINITY = 0xFFFFFFFF
GENERATION_NUMBER_ZERO = 0
GENERATION_NUMBER_V1_MAX = 0x3FFFFFFF

# Parent encoding constants
GRAPH_PARENT_MISSING = 0x70000000
GRAPH_PARENT_NONE = 0x70000000
GRAPH_EXTRA_EDGES_NEEDED = 0x80000000
GRAPH_LAST_EDGE = 0x80000000
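
# Parent fields in the CDAT chunk store positions into the OID lookup table.
# A value of 0x70000000 means "no parent in this slot"; a second-parent value
# with the 0x80000000 bit set means the remaining bits give the position (as
# an index of 4-byte entries) in the EDGE chunk where the additional parents
# of an octopus merge are listed, the last one flagged with the same high bit.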


class CommitGraphEntry:
    """Represents a single commit entry in the commit graph."""

    def __init__(
        self,
        commit_id: ObjectID,
        tree_id: ObjectID,
        parents: list[ObjectID],
        generation: int,
        commit_time: int,
    ) -> None:
        """Initialize CommitGraphEntry.

        Args:
          commit_id: The commit object ID
          tree_id: The tree object ID
          parents: List of parent commit IDs
          generation: Generation number
          commit_time: Commit timestamp
        """
        self.commit_id = commit_id
        self.tree_id = tree_id
        self.parents = parents
        self.generation = generation
        self.commit_time = commit_time

    def __repr__(self) -> str:
        """Return string representation of CommitGraphEntry."""
        return (
            f"CommitGraphEntry(commit_id={self.commit_id!r}, "
            f"tree_id={self.tree_id!r}, parents={self.parents!r}, "
            f"generation={self.generation}, commit_time={self.commit_time})"
        )


class CommitGraphChunk:
    """Represents a chunk in the commit graph file."""

    def __init__(self, chunk_id: bytes, data: bytes) -> None:
        """Initialize CommitGraphChunk.

        Args:
          chunk_id: Chunk identifier
          data: Chunk data
        """
        self.chunk_id = chunk_id
        self.data = data

    def __repr__(self) -> str:
        """Return string representation of CommitGraphChunk."""
        return f"CommitGraphChunk(chunk_id={self.chunk_id!r}, size={len(self.data)})"


class CommitGraph:
    """Git commit graph file reader/writer."""

    def __init__(self, hash_version: int = HASH_VERSION_SHA1) -> None:
        """Initialize CommitGraph.

        Args:
          hash_version: Hash version to use (SHA1 or SHA256)
        """
        self.hash_version = hash_version
        self.chunks: dict[bytes, CommitGraphChunk] = {}
        self.entries: list[CommitGraphEntry] = []
        self._oid_to_index: dict[ObjectID, int] = {}
        self._hash_size = 20 if hash_version == HASH_VERSION_SHA1 else 32

    @classmethod
    def from_file(cls, f: BinaryIO) -> "CommitGraph":
        """Read commit graph from file."""
        graph = cls()
        graph._read_from_file(f)
        return graph
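
    # On-disk layout handled below (see gitformat-commit-graph):
    #   header: 4-byte signature "CGPH", 1-byte version, 1-byte hash version,
    #           1-byte chunk count, 1-byte base-graph count
    #   table of contents: (chunk count + 1) entries of 4-byte chunk ID plus
    #           8-byte absolute offset; the final entry marks the end offset
    #   chunk payloads, addressed by those offsets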

    def _read_from_file(self, f: BinaryIO) -> None:
        """Read commit graph data from file."""
        # Read header
        signature = f.read(4)
        if signature != COMMIT_GRAPH_SIGNATURE:
            raise ValueError(f"Invalid commit graph signature: {signature!r}")

        version = struct.unpack(">B", f.read(1))[0]
        if version != COMMIT_GRAPH_VERSION:
            raise ValueError(f"Unsupported commit graph version: {version}")

        self.hash_version = struct.unpack(">B", f.read(1))[0]
        if self.hash_version not in (HASH_VERSION_SHA1, HASH_VERSION_SHA256):
            raise ValueError(f"Unsupported hash version: {self.hash_version}")
        self._hash_size = 20 if self.hash_version == HASH_VERSION_SHA1 else 32

        num_chunks = struct.unpack(">B", f.read(1))[0]
        struct.unpack(">B", f.read(1))[0]  # Number of base commit-graphs (unused here)

        # Read table of contents
        toc_entries = []
        for _ in range(num_chunks + 1):  # +1 for terminating entry
            chunk_id = f.read(4)
            offset = struct.unpack(">Q", f.read(8))[0]
            toc_entries.append((chunk_id, offset))

        # Read chunks
        # Offsets in TOC are absolute from start of file
        for i in range(num_chunks):
            chunk_id, offset = toc_entries[i]
            next_offset = toc_entries[i + 1][1]
            chunk_size = next_offset - offset

            f.seek(offset)
            chunk_data = f.read(chunk_size)
            self.chunks[chunk_id] = CommitGraphChunk(chunk_id, chunk_data)

        # Parse chunks
        self._parse_chunks()

    def _parse_chunks(self) -> None:
        """Parse chunk data into entries."""
        if CHUNK_OID_LOOKUP not in self.chunks:
            raise ValueError("Missing required OID lookup chunk")
        if CHUNK_COMMIT_DATA not in self.chunks:
            raise ValueError("Missing required commit data chunk")

        # Parse OID lookup chunk
        oid_lookup_data = self.chunks[CHUNK_OID_LOOKUP].data
        num_commits = len(oid_lookup_data) // self._hash_size
        oids = []
        for i in range(num_commits):
            start = i * self._hash_size
            end = start + self._hash_size
            oid = RawObjectID(oid_lookup_data[start:end])
            oids.append(oid)
            self._oid_to_index[sha_to_hex(oid)] = i

        # Parse commit data chunk
        commit_data = self.chunks[CHUNK_COMMIT_DATA].data
        expected_size = num_commits * (self._hash_size + 16)
        if len(commit_data) != expected_size:
            raise ValueError(
                f"Invalid commit data chunk size: {len(commit_data)}, expected {expected_size}"
            )

        self.entries = []
        for i in range(num_commits):
            offset = i * (self._hash_size + 16)

            # Tree OID
            tree_id = RawObjectID(commit_data[offset : offset + self._hash_size])
            offset += self._hash_size

            # Parent positions (2 x 4 bytes)
            parent1_pos, parent2_pos = struct.unpack(
                ">LL", commit_data[offset : offset + 8]
            )
            offset += 8

            # Generation number and commit time (2 x 4 bytes)
            gen_and_time = struct.unpack(">LL", commit_data[offset : offset + 8])
            generation = gen_and_time[0] >> 2  # Upper 30 bits
            commit_time = gen_and_time[1] | (
                (gen_and_time[0] & 0x3) << 32
            )  # 34 bits total

            # Parse parents
            parents = []
            if parent1_pos < GRAPH_PARENT_MISSING:
                if parent1_pos >= len(oids):
                    raise ValueError(f"Invalid parent1 position: {parent1_pos}")
                parents.append(oids[parent1_pos])

            if parent2_pos < GRAPH_PARENT_MISSING:
                if parent2_pos >= len(oids):
                    raise ValueError(f"Invalid parent2 position: {parent2_pos}")
                parents.append(oids[parent2_pos])
            elif parent2_pos >= GRAPH_EXTRA_EDGES_NEEDED:
                # Handle extra edges (3+ parents).  The stored value is the
                # position of a 4-byte entry in the EDGE chunk, so convert it
                # to a byte offset before parsing.
                edge_offset = (parent2_pos & ~GRAPH_EXTRA_EDGES_NEEDED) * 4
                parents.extend(self._parse_extra_edges(edge_offset, oids))

            entry = CommitGraphEntry(
                commit_id=sha_to_hex(oids[i]),
                tree_id=sha_to_hex(tree_id),
                parents=[sha_to_hex(p) for p in parents],
                generation=generation,
                commit_time=commit_time,
            )
            self.entries.append(entry)
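
    # The EDGE chunk is a flat list of 4-byte OID-table positions.  For an
    # octopus merge the second-parent field points at the first of these
    # entries; parents are read until one with the high bit (GRAPH_LAST_EDGE)
    # set is found.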

    def _parse_extra_edges(self, offset: int, oids: Sequence[bytes]) -> list[bytes]:
        """Parse extra parent edges for commits with 3+ parents."""
        if CHUNK_EXTRA_EDGE_LIST not in self.chunks:
            return []

        edge_data = self.chunks[CHUNK_EXTRA_EDGE_LIST].data
        parents = []

        while offset < len(edge_data):
            parent_pos = struct.unpack(">L", edge_data[offset : offset + 4])[0]
            offset += 4
            if parent_pos & GRAPH_LAST_EDGE:
                # Last extra parent for this commit
                parent_pos &= ~GRAPH_LAST_EDGE
                if parent_pos < len(oids):
                    parents.append(oids[parent_pos])
                break
            else:
                if parent_pos < len(oids):
                    parents.append(oids[parent_pos])

        return parents

    def get_entry_by_oid(self, oid: ObjectID) -> CommitGraphEntry | None:
        """Get commit graph entry by commit OID."""
        index = self._oid_to_index.get(oid)
        if index is not None:
            return self.entries[index]
        return None

    def get_generation_number(self, oid: ObjectID) -> int | None:
        """Get generation number for a commit."""
        entry = self.get_entry_by_oid(oid)
        return entry.generation if entry else None

    def get_parents(self, oid: ObjectID) -> list[ObjectID] | None:
        """Get parent commit IDs for a commit."""
        entry = self.get_entry_by_oid(oid)
        return entry.parents if entry else None
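
    # The lookups above answer "what are this commit's parents / generation?"
    # straight from the parsed graph, without touching the object store.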

    def write_to_file(self, f: BinaryIO | _GitFile) -> None:
        """Write commit graph to file."""
        if not self.entries:
            raise ValueError("Cannot write empty commit graph")

        # Sort entries by commit ID for consistent output
        sorted_entries = sorted(self.entries, key=lambda e: e.commit_id)

        # Build OID lookup chunk
        oid_lookup_data = b""
        for entry in sorted_entries:
            oid_lookup_data += hex_to_sha(entry.commit_id)

        # Build commit data chunk
        commit_data = b""
        # Create OID to index mapping for parent lookups
        oid_to_index = {entry.commit_id: i for i, entry in enumerate(sorted_entries)}

        for entry in sorted_entries:
            # Tree OID (20 bytes)
            commit_data += hex_to_sha(entry.tree_id)

            # Parent positions (2 x 4 bytes)
            if len(entry.parents) == 0:
                parent1_pos = GRAPH_PARENT_MISSING
                parent2_pos = GRAPH_PARENT_MISSING
            elif len(entry.parents) == 1:
                parent1_pos = oid_to_index.get(entry.parents[0], GRAPH_PARENT_MISSING)
                parent2_pos = GRAPH_PARENT_MISSING
            elif len(entry.parents) == 2:
                parent1_pos = oid_to_index.get(entry.parents[0], GRAPH_PARENT_MISSING)
                parent2_pos = oid_to_index.get(entry.parents[1], GRAPH_PARENT_MISSING)
            else:
                # More than 2 parents - would need extra edge list chunk
                # For now, just store first two parents
                parent1_pos = oid_to_index.get(entry.parents[0], GRAPH_PARENT_MISSING)
                parent2_pos = oid_to_index.get(entry.parents[1], GRAPH_PARENT_MISSING)

            commit_data += struct.pack(">LL", parent1_pos, parent2_pos)

            # Generation and commit time (2 x 4 bytes)
            gen_and_time = (entry.generation << 2) | (entry.commit_time >> 32)
            commit_time_lower = entry.commit_time & 0xFFFFFFFF
            commit_data += struct.pack(">LL", gen_and_time, commit_time_lower)

        # Build fanout table
        fanout_data = b""
        fanout_counts = [0] * 256
        for i, entry in enumerate(sorted_entries):
            commit_oid_bytes = hex_to_sha(entry.commit_id)
            fanout_counts[commit_oid_bytes[0]] = i + 1

        # Fill in gaps - each fanout entry should be cumulative
        for i in range(1, 256):
            if fanout_counts[i] == 0:
                fanout_counts[i] = fanout_counts[i - 1]

        for count in fanout_counts:
            fanout_data += struct.pack(">L", count)

        # Calculate chunk offsets
        header_size = (
            8  # signature + version + hash_version + num_chunks + base_graph_count
        )
        toc_size = 4 * 12  # 4 entries (3 chunks + terminator) * 12 bytes each

        chunk1_offset = header_size + toc_size  # OID Fanout
        chunk2_offset = chunk1_offset + len(fanout_data)  # OID Lookup
        chunk3_offset = chunk2_offset + len(oid_lookup_data)  # Commit Data
        terminator_offset = chunk3_offset + len(commit_data)

        # Write header
        f.write(COMMIT_GRAPH_SIGNATURE)
        f.write(struct.pack(">B", COMMIT_GRAPH_VERSION))
        f.write(struct.pack(">B", self.hash_version))
        f.write(struct.pack(">B", 3))  # 3 chunks
        f.write(struct.pack(">B", 0))  # 0 base graphs

        # Write table of contents
        f.write(CHUNK_OID_FANOUT + struct.pack(">Q", chunk1_offset))
        f.write(CHUNK_OID_LOOKUP + struct.pack(">Q", chunk2_offset))
        f.write(CHUNK_COMMIT_DATA + struct.pack(">Q", chunk3_offset))
        f.write(b"\x00\x00\x00\x00" + struct.pack(">Q", terminator_offset))

        # Write chunks
        f.write(fanout_data)
        f.write(oid_lookup_data)
        f.write(commit_data)
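
    # A graph produced by write_to_file therefore looks like:
    #   bytes 0-7    header ("CGPH", version 1, hash version, 3 chunks, 0 bases)
    #   bytes 8-55   table of contents (OIDF, OIDL, CDAT, terminator)
    #   followed by the fanout, OID lookup and commit data chunks in that order.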

    def __len__(self) -> int:
        """Return number of commits in the graph."""
        return len(self.entries)

    def __iter__(self) -> Iterator["CommitGraphEntry"]:
        """Iterate over commit graph entries."""
        return iter(self.entries)


def read_commit_graph(path: str | bytes) -> CommitGraph | None:
    """Read commit graph from file path."""
    if isinstance(path, str):
        path = path.encode()

    if not os.path.exists(path):
        return None

    with open(path, "rb") as f:
        return CommitGraph.from_file(f)


def find_commit_graph_file(git_dir: str | bytes) -> bytes | None:
    """Find commit graph file in a Git repository."""
    if isinstance(git_dir, str):
        git_dir = git_dir.encode()

    # Standard location: .git/objects/info/commit-graph
    commit_graph_path = os.path.join(git_dir, b"objects", b"info", b"commit-graph")
    if os.path.exists(commit_graph_path):
        return commit_graph_path

    # Chain files in .git/objects/info/commit-graphs/
    commit_graphs_dir = os.path.join(git_dir, b"objects", b"info", b"commit-graphs")
    if os.path.exists(commit_graphs_dir):
        # Look for graph-{hash}.graph files
        for filename in os.listdir(commit_graphs_dir):
            if filename.startswith(b"graph-") and filename.endswith(b".graph"):
                return os.path.join(commit_graphs_dir, filename)

    return None


def generate_commit_graph(
    object_store: "BaseObjectStore", commit_ids: Sequence[ObjectID]
) -> CommitGraph:
    """Generate a commit graph from a set of commits.

    Args:
      object_store: Object store to retrieve commits from
      commit_ids: List of commit IDs to include in the graph

    Returns:
      CommitGraph object containing the specified commits
    """
    graph = CommitGraph()
    if not commit_ids:
        return graph

    # Ensure all commit_ids are in the correct format for object store access
    # DiskObjectStore expects hex ObjectIDs (40-byte hex strings)
    normalized_commit_ids: list[ObjectID] = []
    for commit_id in commit_ids:
        if isinstance(commit_id, bytes) and len(commit_id) == 40:
            # Already hex ObjectID
            normalized_commit_ids.append(ObjectID(commit_id))
        elif isinstance(commit_id, bytes) and len(commit_id) == 20:
            # Binary SHA, convert to hex ObjectID
            normalized_commit_ids.append(sha_to_hex(RawObjectID(commit_id)))
        else:
            # Assume it's already correct format
            normalized_commit_ids.append(ObjectID(commit_id))

    # Build a map of all commits and their metadata
    commit_map: dict[ObjectID, Commit] = {}
    for commit_id in normalized_commit_ids:
        try:
            commit_obj = object_store[commit_id]
            if commit_obj.type_name != b"commit":
                continue
            assert isinstance(commit_obj, Commit)
            commit_map[commit_id] = commit_obj
        except KeyError:
            # Commit not found, skip
            continue

    # Calculate generation numbers using topological sort
    generation_map: dict[bytes, int] = {}

    def calculate_generation(commit_id: ObjectID) -> int:
        if commit_id in generation_map:
            return generation_map[commit_id]

        if commit_id not in commit_map:
            # Unknown commit, assume generation 0
            generation_map[commit_id] = 0
            return 0

        commit_obj = commit_map[commit_id]
        if not commit_obj.parents:
            # Root commit
            generation_map[commit_id] = 1
            return 1

        # Calculate based on parents
        max_parent_gen = 0
        for parent_id in commit_obj.parents:
            parent_gen = calculate_generation(parent_id)
            max_parent_gen = max(max_parent_gen, parent_gen)

        generation = max_parent_gen + 1
        generation_map[commit_id] = generation
        return generation
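
    # Every commit ends up with a generation strictly greater than all of its
    # known parents (roots start at 1, unknown parents count as 0), which is
    # the property commit-graph consumers rely on to cut off walks early.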

    # Calculate generation numbers for all commits
    for commit_id in commit_map:
        calculate_generation(commit_id)

    # Build commit graph entries
    for commit_id, commit_obj in commit_map.items():
        # commit_id is already hex ObjectID from normalized_commit_ids
        commit_hex: ObjectID = commit_id

        # Handle tree ID - might already be hex ObjectID
        tree_hex: ObjectID
        if isinstance(commit_obj.tree, bytes) and len(commit_obj.tree) == 40:
            tree_hex = ObjectID(commit_obj.tree)  # Already hex ObjectID
        else:
            tree_hex = sha_to_hex(commit_obj.tree)  # Binary, convert to hex

        # Handle parent IDs - might already be hex ObjectIDs
        parents_hex: list[ObjectID] = []
        for parent_id in commit_obj.parents:
            if isinstance(parent_id, bytes) and len(parent_id) == 40:
                parents_hex.append(ObjectID(parent_id))  # Already hex ObjectID
            else:
                parents_hex.append(sha_to_hex(parent_id))  # Binary, convert to hex

        entry = CommitGraphEntry(
            commit_id=commit_hex,
            tree_id=tree_hex,
            parents=parents_hex,
            generation=generation_map[commit_id],
            commit_time=commit_obj.commit_time,
        )
        graph.entries.append(entry)

    # Build the OID to index mapping for lookups
    graph._oid_to_index = {}
    for i, entry in enumerate(graph.entries):
        graph._oid_to_index[entry.commit_id] = i

    return graph


def write_commit_graph(
    git_dir: str | bytes,
    object_store: "BaseObjectStore",
    commit_ids: Sequence[ObjectID],
) -> None:
    """Write a commit graph file for the given commits.

    Args:
      git_dir: Git directory path
      object_store: Object store to retrieve commits from
      commit_ids: List of commit IDs to include in the graph
    """
    if isinstance(git_dir, str):
        git_dir = git_dir.encode()

    # Generate the commit graph
    graph = generate_commit_graph(object_store, commit_ids)

    if not graph.entries:
        return  # Nothing to write

    # Ensure the objects/info directory exists
    info_dir = os.path.join(git_dir, b"objects", b"info")
    os.makedirs(info_dir, exist_ok=True)

    # Write using GitFile for atomic operation
    from .file import GitFile

    graph_path = os.path.join(info_dir, b"commit-graph")
    with GitFile(graph_path, "wb") as f:
        graph.write_to_file(f)


def get_reachable_commits(
    object_store: "BaseObjectStore", start_commits: Sequence[ObjectID]
) -> list[ObjectID]:
    """Get all commits reachable from the given starting commits.

    Args:
      object_store: Object store to retrieve commits from
      start_commits: List of starting commit IDs

    Returns:
      List of all reachable commit IDs (including the starting commits)
    """
    visited: set[ObjectID] = set()
    reachable: list[ObjectID] = []
    stack: list[ObjectID] = []

    # Normalize commit IDs for object store access and tracking
    for commit_id in start_commits:
        if isinstance(commit_id, bytes) and len(commit_id) == 40:
            # Hex ObjectID - use directly for object store access
            oid = ObjectID(commit_id)
            if oid not in visited:
                stack.append(oid)
        elif isinstance(commit_id, bytes) and len(commit_id) == 20:
            # Binary SHA, convert to hex ObjectID for object store access
            hex_id = sha_to_hex(RawObjectID(commit_id))
            if hex_id not in visited:
                stack.append(hex_id)
        else:
            # Assume it's already correct format
            oid = ObjectID(commit_id)
            if oid not in visited:
                stack.append(oid)

    while stack:
        commit_id = stack.pop()
        if commit_id in visited:
            continue

        visited.add(commit_id)

        try:
            commit_obj = object_store[commit_id]
            if not isinstance(commit_obj, Commit):
                continue

            # Add to reachable list (commit_id is already hex ObjectID)
            reachable.append(commit_id)

            # Add parents to stack
            for parent_id in commit_obj.parents:
                if parent_id not in visited:
                    stack.append(parent_id)
        except KeyError:
            # Commit not found, skip
            continue

    return reachable
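

# Example end-to-end usage (illustrative; assumes a dulwich ``Repo`` object
# named ``repo`` with at least one local branch):
#
#     heads = list(repo.refs.as_dict(b"refs/heads").values())
#     commits = get_reachable_commits(repo.object_store, heads)
#     write_commit_graph(repo.controldir(), repo.object_store, commits)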