commit_graph.py 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693
  1. # commit_graph.py -- Git commit graph file format support
  2. # Copyright (C) 2024 Jelmer Vernooij <jelmer@jelmer.uk>
  3. #
  4. # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
  5. # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
  6. # General Public License as published by the Free Software Foundation; version 2.0
  7. # or (at your option) any later version. You can redistribute it and/or
  8. # modify it under the terms of either of these two licenses.
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. # You should have received a copy of the licenses; if not, see
  17. # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  18. # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  19. # License, Version 2.0.
  20. #
  21. """Git commit graph file format support.
  22. Git's commit graph files store commit metadata and generation numbers
  23. for faster graph traversal operations like merge-base computation.
  24. The commit graph format is documented at:
  25. https://git-scm.com/docs/gitformat-commit-graph
  26. """
  27. __all__ = [
  28. "CHUNK_BASE_GRAPHS_LIST",
  29. "CHUNK_BLOOM_FILTER_DATA",
  30. "CHUNK_BLOOM_FILTER_INDEX",
  31. "CHUNK_COMMIT_DATA",
  32. "CHUNK_EXTRA_EDGE_LIST",
  33. "CHUNK_GENERATION_DATA",
  34. "CHUNK_GENERATION_DATA_OVERFLOW",
  35. "CHUNK_OID_FANOUT",
  36. "CHUNK_OID_LOOKUP",
  37. "COMMIT_GRAPH_SIGNATURE",
  38. "COMMIT_GRAPH_VERSION",
  39. "GENERATION_NUMBER_INFINITY",
  40. "GENERATION_NUMBER_V1_MAX",
  41. "GENERATION_NUMBER_ZERO",
  42. "GRAPH_EXTRA_EDGES_NEEDED",
  43. "GRAPH_LAST_EDGE",
  44. "GRAPH_PARENT_MISSING",
  45. "GRAPH_PARENT_NONE",
  46. "HASH_VERSION_SHA1",
  47. "HASH_VERSION_SHA256",
  48. "CommitGraph",
  49. "CommitGraphChunk",
  50. "CommitGraphEntry",
  51. "find_commit_graph_file",
  52. "generate_commit_graph",
  53. "get_reachable_commits",
  54. "read_commit_graph",
  55. "write_commit_graph",
  56. ]
  57. import os
  58. import struct
  59. from collections.abc import Iterator, Sequence
  60. from typing import TYPE_CHECKING, BinaryIO
  61. from .file import _GitFile
  62. from .object_format import ObjectFormat
  63. if TYPE_CHECKING:
  64. from .object_store import BaseObjectStore
  65. from .objects import Commit, ObjectID, RawObjectID, hex_to_sha, sha_to_hex
  66. # File format constants
  67. COMMIT_GRAPH_SIGNATURE = b"CGPH"
  68. COMMIT_GRAPH_VERSION = 1
  69. HASH_VERSION_SHA1 = 1
  70. HASH_VERSION_SHA256 = 2
  71. # Chunk IDs
  72. CHUNK_OID_FANOUT = b"OIDF"
  73. CHUNK_OID_LOOKUP = b"OIDL"
  74. CHUNK_COMMIT_DATA = b"CDAT"
  75. CHUNK_GENERATION_DATA = b"GDA2"
  76. CHUNK_GENERATION_DATA_OVERFLOW = b"GDO2"
  77. CHUNK_EXTRA_EDGE_LIST = b"EDGE"
  78. CHUNK_BLOOM_FILTER_INDEX = b"BIDX"
  79. CHUNK_BLOOM_FILTER_DATA = b"BDAT"
  80. CHUNK_BASE_GRAPHS_LIST = b"BASE"
  81. # Generation number constants
  82. GENERATION_NUMBER_INFINITY = 0xFFFFFFFF
  83. GENERATION_NUMBER_ZERO = 0
  84. GENERATION_NUMBER_V1_MAX = 0x3FFFFFFF
  85. # Parent encoding constants
  86. GRAPH_PARENT_MISSING = 0x70000000
  87. GRAPH_PARENT_NONE = 0x70000000
  88. GRAPH_EXTRA_EDGES_NEEDED = 0x80000000
  89. GRAPH_LAST_EDGE = 0x80000000
  90. class CommitGraphEntry:
  91. """Represents a single commit entry in the commit graph."""
  92. def __init__(
  93. self,
  94. commit_id: ObjectID,
  95. tree_id: ObjectID,
  96. parents: list[ObjectID],
  97. generation: int,
  98. commit_time: int,
  99. ) -> None:
  100. """Initialize CommitGraphEntry.
  101. Args:
  102. commit_id: The commit object ID
  103. tree_id: The tree object ID
  104. parents: List of parent commit IDs
  105. generation: Generation number
  106. commit_time: Commit timestamp
  107. """
  108. self.commit_id = commit_id
  109. self.tree_id = tree_id
  110. self.parents = parents
  111. self.generation = generation
  112. self.commit_time = commit_time
  113. def __repr__(self) -> str:
  114. """Return string representation of CommitGraphEntry."""
  115. return (
  116. f"CommitGraphEntry(commit_id={self.commit_id!r}, "
  117. f"tree_id={self.tree_id!r}, parents={self.parents!r}, "
  118. f"generation={self.generation}, commit_time={self.commit_time})"
  119. )
  120. class CommitGraphChunk:
  121. """Represents a chunk in the commit graph file."""
  122. def __init__(self, chunk_id: bytes, data: bytes) -> None:
  123. """Initialize CommitGraphChunk.
  124. Args:
  125. chunk_id: Chunk identifier
  126. data: Chunk data
  127. """
  128. self.chunk_id = chunk_id
  129. self.data = data
  130. def __repr__(self) -> str:
  131. """Return string representation of CommitGraphChunk."""
  132. return f"CommitGraphChunk(chunk_id={self.chunk_id!r}, size={len(self.data)})"
  133. class CommitGraph:
  134. """Git commit graph file reader/writer."""
  135. def __init__(self, *, object_format: ObjectFormat | None = None) -> None:
  136. """Initialize CommitGraph.
  137. Args:
  138. object_format: Object format to use (defaults to SHA1)
  139. """
  140. import warnings
  141. from .object_format import DEFAULT_OBJECT_FORMAT, SHA256
  142. if object_format is None:
  143. warnings.warn(
  144. "CommitGraph() should be called with object_format parameter",
  145. DeprecationWarning,
  146. stacklevel=2,
  147. )
  148. object_format = DEFAULT_OBJECT_FORMAT
  149. self.object_format = object_format
  150. self.hash_version = (
  151. HASH_VERSION_SHA256 if object_format == SHA256 else HASH_VERSION_SHA1
  152. )
  153. self.chunks: dict[bytes, CommitGraphChunk] = {}
  154. self.entries: list[CommitGraphEntry] = []
  155. self._oid_to_index: dict[RawObjectID, int] = {}
  156. @classmethod
  157. def from_file(cls, f: BinaryIO) -> "CommitGraph":
  158. """Read commit graph from file."""
  159. return cls._read_from_file(f)
  160. @classmethod
  161. def _read_from_file(cls, f: BinaryIO) -> "CommitGraph":
  162. """Read commit graph data from file."""
  163. # Read header
  164. signature = f.read(4)
  165. if signature != COMMIT_GRAPH_SIGNATURE:
  166. raise ValueError(f"Invalid commit graph signature: {signature!r}")
  167. version = struct.unpack(">B", f.read(1))[0]
  168. if version != COMMIT_GRAPH_VERSION:
  169. raise ValueError(f"Unsupported commit graph version: {version}")
  170. hash_version = struct.unpack(">B", f.read(1))[0]
  171. # Set object_format based on hash_version from file
  172. from .object_format import SHA1, SHA256
  173. if hash_version == HASH_VERSION_SHA1:
  174. object_format = SHA1
  175. elif hash_version == HASH_VERSION_SHA256:
  176. object_format = SHA256
  177. else:
  178. raise ValueError(f"Unsupported hash version: {hash_version}")
  179. # Create instance with correct object_format
  180. graph = cls(object_format=object_format)
  181. graph.hash_version = hash_version
  182. num_chunks = struct.unpack(">B", f.read(1))[0]
  183. struct.unpack(">B", f.read(1))[0]
  184. # Read table of contents
  185. toc_entries = []
  186. for _ in range(num_chunks + 1): # +1 for terminating entry
  187. chunk_id = f.read(4)
  188. offset = struct.unpack(">Q", f.read(8))[0]
  189. toc_entries.append((chunk_id, offset))
  190. # Read chunks
  191. # Offsets in TOC are absolute from start of file
  192. for i in range(num_chunks):
  193. chunk_id, offset = toc_entries[i]
  194. next_offset = toc_entries[i + 1][1]
  195. chunk_size = next_offset - offset
  196. f.seek(offset)
  197. chunk_data = f.read(chunk_size)
  198. graph.chunks[chunk_id] = CommitGraphChunk(chunk_id, chunk_data)
  199. # Parse chunks
  200. graph._parse_chunks()
  201. return graph
  202. def _parse_chunks(self) -> None:
  203. """Parse chunk data into entries."""
  204. if CHUNK_OID_LOOKUP not in self.chunks:
  205. raise ValueError("Missing required OID lookup chunk")
  206. if CHUNK_COMMIT_DATA not in self.chunks:
  207. raise ValueError("Missing required commit data chunk")
  208. # Parse OID lookup chunk
  209. oid_lookup_data = self.chunks[CHUNK_OID_LOOKUP].data
  210. num_commits = len(oid_lookup_data) // self.object_format.oid_length
  211. oids = []
  212. for i in range(num_commits):
  213. start = i * self.object_format.oid_length
  214. end = start + self.object_format.oid_length
  215. oid = RawObjectID(oid_lookup_data[start:end])
  216. oids.append(oid)
  217. self._oid_to_index[oid] = i
  218. # Parse commit data chunk
  219. commit_data = self.chunks[CHUNK_COMMIT_DATA].data
  220. expected_size = num_commits * (self.object_format.oid_length + 16)
  221. if len(commit_data) != expected_size:
  222. raise ValueError(
  223. f"Invalid commit data chunk size: {len(commit_data)}, expected {expected_size}"
  224. )
  225. self.entries = []
  226. for i in range(num_commits):
  227. offset = i * (self.object_format.oid_length + 16)
  228. # Tree OID
  229. tree_id = commit_data[offset : offset + self.object_format.oid_length]
  230. offset += self.object_format.oid_length
  231. # Parent positions (2 x 4 bytes)
  232. parent1_pos, parent2_pos = struct.unpack(
  233. ">LL", commit_data[offset : offset + 8]
  234. )
  235. offset += 8
  236. # Generation number and commit time (2 x 4 bytes)
  237. gen_and_time = struct.unpack(">LL", commit_data[offset : offset + 8])
  238. generation = gen_and_time[0] >> 2 # Upper 30 bits
  239. commit_time = gen_and_time[1] | (
  240. (gen_and_time[0] & 0x3) << 32
  241. ) # 34 bits total
  242. # Parse parents
  243. parents = []
  244. if parent1_pos < GRAPH_PARENT_MISSING:
  245. if parent1_pos >= len(oids):
  246. raise ValueError(f"Invalid parent1 position: {parent1_pos}")
  247. parents.append(oids[parent1_pos])
  248. if parent2_pos < GRAPH_PARENT_MISSING:
  249. if parent2_pos >= len(oids):
  250. raise ValueError(f"Invalid parent2 position: {parent2_pos}")
  251. parents.append(oids[parent2_pos])
  252. elif parent2_pos >= GRAPH_EXTRA_EDGES_NEEDED:
  253. # Handle extra edges (3+ parents)
  254. edge_offset = parent2_pos & ~GRAPH_EXTRA_EDGES_NEEDED
  255. parents.extend(self._parse_extra_edges(edge_offset, oids))
  256. entry = CommitGraphEntry(
  257. commit_id=sha_to_hex(oids[i]),
  258. tree_id=sha_to_hex(RawObjectID(tree_id)),
  259. parents=[sha_to_hex(p) for p in parents],
  260. generation=generation,
  261. commit_time=commit_time,
  262. )
  263. self.entries.append(entry)
  264. def _parse_extra_edges(
  265. self, offset: int, oids: Sequence[RawObjectID]
  266. ) -> list[RawObjectID]:
  267. """Parse extra parent edges for commits with 3+ parents."""
  268. if CHUNK_EXTRA_EDGE_LIST not in self.chunks:
  269. return []
  270. edge_data = self.chunks[CHUNK_EXTRA_EDGE_LIST].data
  271. parents = []
  272. while offset < len(edge_data):
  273. parent_pos = struct.unpack(">L", edge_data[offset : offset + 4])[0]
  274. offset += 4
  275. if parent_pos & GRAPH_LAST_EDGE:
  276. parent_pos &= ~GRAPH_LAST_EDGE
  277. if parent_pos < len(oids):
  278. parents.append(oids[parent_pos])
  279. break
  280. else:
  281. if parent_pos < len(oids):
  282. parents.append(oids[parent_pos])
  283. return parents
  284. def get_entry_by_oid(self, oid: ObjectID) -> CommitGraphEntry | None:
  285. """Get commit graph entry by commit OID."""
  286. # Convert hex ObjectID to binary if needed for lookup
  287. if isinstance(oid, bytes) and len(oid) == self.object_format.hex_length:
  288. # Input is hex ObjectID, convert to binary for internal lookup
  289. lookup_oid: RawObjectID = hex_to_sha(oid)
  290. else:
  291. # Input is already binary
  292. lookup_oid = RawObjectID(oid)
  293. index = self._oid_to_index.get(lookup_oid)
  294. if index is not None:
  295. return self.entries[index]
  296. return None
  297. def get_generation_number(self, oid: ObjectID) -> int | None:
  298. """Get generation number for a commit."""
  299. entry = self.get_entry_by_oid(oid)
  300. return entry.generation if entry else None
  301. def get_parents(self, oid: ObjectID) -> list[ObjectID] | None:
  302. """Get parent commit IDs for a commit."""
  303. entry = self.get_entry_by_oid(oid)
  304. return entry.parents if entry else None
  305. def write_to_file(self, f: BinaryIO | _GitFile) -> None:
  306. """Write commit graph to file."""
  307. if not self.entries:
  308. raise ValueError("Cannot write empty commit graph")
  309. # Sort entries by commit ID for consistent output
  310. sorted_entries = sorted(self.entries, key=lambda e: e.commit_id)
  311. # Build OID lookup chunk
  312. oid_lookup_data = b""
  313. for entry in sorted_entries:
  314. oid_lookup_data += hex_to_sha(entry.commit_id)
  315. # Build commit data chunk
  316. commit_data = b""
  317. # Create OID to index mapping for parent lookups
  318. oid_to_index = {entry.commit_id: i for i, entry in enumerate(sorted_entries)}
  319. for entry in sorted_entries:
  320. # Tree OID (20 bytes)
  321. commit_data += hex_to_sha(entry.tree_id)
  322. # Parent positions (2 x 4 bytes)
  323. if len(entry.parents) == 0:
  324. parent1_pos = GRAPH_PARENT_MISSING
  325. parent2_pos = GRAPH_PARENT_MISSING
  326. elif len(entry.parents) == 1:
  327. parent1_pos = oid_to_index.get(entry.parents[0], GRAPH_PARENT_MISSING)
  328. parent2_pos = GRAPH_PARENT_MISSING
  329. elif len(entry.parents) == 2:
  330. parent1_pos = oid_to_index.get(entry.parents[0], GRAPH_PARENT_MISSING)
  331. parent2_pos = oid_to_index.get(entry.parents[1], GRAPH_PARENT_MISSING)
  332. else:
  333. # More than 2 parents - would need extra edge list chunk
  334. # For now, just store first two parents
  335. parent1_pos = oid_to_index.get(entry.parents[0], GRAPH_PARENT_MISSING)
  336. parent2_pos = oid_to_index.get(entry.parents[1], GRAPH_PARENT_MISSING)
  337. commit_data += struct.pack(">LL", parent1_pos, parent2_pos)
  338. # Generation and commit time (2 x 4 bytes)
  339. gen_and_time = (entry.generation << 2) | (entry.commit_time >> 32)
  340. commit_time_lower = entry.commit_time & 0xFFFFFFFF
  341. commit_data += struct.pack(">LL", gen_and_time, commit_time_lower)
  342. # Build fanout table
  343. fanout_data = b""
  344. fanout_counts = [0] * 256
  345. for i, entry in enumerate(sorted_entries):
  346. commit_oid_bytes = hex_to_sha(entry.commit_id)
  347. fanout_counts[commit_oid_bytes[0]] = i + 1
  348. # Fill in gaps - each fanout entry should be cumulative
  349. for i in range(1, 256):
  350. if fanout_counts[i] == 0:
  351. fanout_counts[i] = fanout_counts[i - 1]
  352. for count in fanout_counts:
  353. fanout_data += struct.pack(">L", count)
  354. # Calculate chunk offsets
  355. header_size = (
  356. 8 # signature + version + hash_version + num_chunks + base_graph_count
  357. )
  358. toc_size = 4 * 12 # 4 entries (3 chunks + terminator) * 12 bytes each
  359. chunk1_offset = header_size + toc_size # OID Fanout
  360. chunk2_offset = chunk1_offset + len(fanout_data) # OID Lookup
  361. chunk3_offset = chunk2_offset + len(oid_lookup_data) # Commit Data
  362. terminator_offset = chunk3_offset + len(commit_data)
  363. # Write header
  364. f.write(COMMIT_GRAPH_SIGNATURE)
  365. f.write(struct.pack(">B", COMMIT_GRAPH_VERSION))
  366. f.write(struct.pack(">B", self.hash_version))
  367. f.write(struct.pack(">B", 3)) # 3 chunks
  368. f.write(struct.pack(">B", 0)) # 0 base graphs
  369. # Write table of contents
  370. f.write(CHUNK_OID_FANOUT + struct.pack(">Q", chunk1_offset))
  371. f.write(CHUNK_OID_LOOKUP + struct.pack(">Q", chunk2_offset))
  372. f.write(CHUNK_COMMIT_DATA + struct.pack(">Q", chunk3_offset))
  373. f.write(b"\x00\x00\x00\x00" + struct.pack(">Q", terminator_offset))
  374. # Write chunks
  375. f.write(fanout_data)
  376. f.write(oid_lookup_data)
  377. f.write(commit_data)
  378. def __len__(self) -> int:
  379. """Return number of commits in the graph."""
  380. return len(self.entries)
  381. def __iter__(self) -> Iterator["CommitGraphEntry"]:
  382. """Iterate over commit graph entries."""
  383. return iter(self.entries)
  384. def read_commit_graph(path: str | bytes) -> CommitGraph | None:
  385. """Read commit graph from file path."""
  386. if isinstance(path, str):
  387. path = path.encode()
  388. if not os.path.exists(path):
  389. return None
  390. with open(path, "rb") as f:
  391. return CommitGraph.from_file(f)
  392. def find_commit_graph_file(git_dir: str | bytes) -> bytes | None:
  393. """Find commit graph file in a Git repository."""
  394. if isinstance(git_dir, str):
  395. git_dir = git_dir.encode()
  396. # Standard location: .git/objects/info/commit-graph
  397. commit_graph_path = os.path.join(git_dir, b"objects", b"info", b"commit-graph")
  398. if os.path.exists(commit_graph_path):
  399. return commit_graph_path
  400. # Chain files in .git/objects/info/commit-graphs/
  401. commit_graphs_dir = os.path.join(git_dir, b"objects", b"info", b"commit-graphs")
  402. if os.path.exists(commit_graphs_dir):
  403. # Look for graph-{hash}.graph files
  404. for filename in os.listdir(commit_graphs_dir):
  405. if filename.startswith(b"graph-") and filename.endswith(b".graph"):
  406. return os.path.join(commit_graphs_dir, filename)
  407. return None
  408. def generate_commit_graph(
  409. object_store: "BaseObjectStore", commit_ids: Sequence[ObjectID]
  410. ) -> CommitGraph:
  411. """Generate a commit graph from a set of commits.
  412. Args:
  413. object_store: Object store to retrieve commits from
  414. commit_ids: List of commit IDs to include in the graph
  415. Returns:
  416. CommitGraph object containing the specified commits
  417. """
  418. graph = CommitGraph(object_format=object_store.object_format)
  419. if not commit_ids:
  420. return graph
  421. # Ensure all commit_ids are in the correct format for object store access
  422. hex_length = object_store.object_format.hex_length
  423. oid_length = object_store.object_format.oid_length
  424. normalized_commit_ids = []
  425. for commit_id in commit_ids:
  426. if isinstance(commit_id, bytes) and len(commit_id) == hex_length:
  427. # Already hex ObjectID
  428. normalized_commit_ids.append(commit_id)
  429. elif isinstance(commit_id, bytes) and len(commit_id) == oid_length:
  430. # Binary SHA, convert to hex ObjectID
  431. normalized_commit_ids.append(sha_to_hex(RawObjectID(commit_id)))
  432. else:
  433. # Assume it's already correct format
  434. normalized_commit_ids.append(ObjectID(commit_id))
  435. # Build a map of all commits and their metadata
  436. commit_map: dict[ObjectID, Commit] = {}
  437. for commit_id in normalized_commit_ids:
  438. try:
  439. commit_obj = object_store[commit_id]
  440. if commit_obj.type_name != b"commit":
  441. continue
  442. assert isinstance(commit_obj, Commit)
  443. commit_map[commit_id] = commit_obj
  444. except KeyError:
  445. # Commit not found, skip
  446. continue
  447. # Calculate generation numbers using topological sort
  448. generation_map: dict[bytes, int] = {}
  449. def calculate_generation(commit_id: ObjectID) -> int:
  450. if commit_id in generation_map:
  451. return generation_map[commit_id]
  452. if commit_id not in commit_map:
  453. # Unknown commit, assume generation 0
  454. generation_map[commit_id] = 0
  455. return 0
  456. commit_obj = commit_map[commit_id]
  457. if not commit_obj.parents:
  458. # Root commit
  459. generation_map[commit_id] = 1
  460. return 1
  461. # Calculate based on parents
  462. max_parent_gen = 0
  463. for parent_id in commit_obj.parents:
  464. parent_gen = calculate_generation(parent_id)
  465. max_parent_gen = max(max_parent_gen, parent_gen)
  466. generation = max_parent_gen + 1
  467. generation_map[commit_id] = generation
  468. return generation
  469. # Calculate generation numbers for all commits
  470. for commit_id in commit_map:
  471. calculate_generation(commit_id)
  472. # Build commit graph entries
  473. for commit_id, commit_obj in commit_map.items():
  474. # commit_id is already hex ObjectID from normalized_commit_ids
  475. commit_hex: ObjectID = commit_id
  476. # commit_obj.tree and commit_obj.parents are already ObjectIDs
  477. tree_hex = commit_obj.tree
  478. parents_hex: list[ObjectID] = commit_obj.parents
  479. entry = CommitGraphEntry(
  480. commit_id=commit_hex,
  481. tree_id=tree_hex,
  482. parents=parents_hex,
  483. generation=generation_map[commit_id],
  484. commit_time=commit_obj.commit_time,
  485. )
  486. graph.entries.append(entry)
  487. # Build the OID to index mapping for lookups
  488. graph._oid_to_index = {}
  489. for i, entry in enumerate(graph.entries):
  490. # Convert hex ObjectID to binary RawObjectID for consistent lookup
  491. graph._oid_to_index[hex_to_sha(entry.commit_id)] = i
  492. return graph
  493. def write_commit_graph(
  494. git_dir: str | bytes,
  495. object_store: "BaseObjectStore",
  496. commit_ids: Sequence[ObjectID],
  497. ) -> None:
  498. """Write a commit graph file for the given commits.
  499. Args:
  500. git_dir: Git directory path
  501. object_store: Object store to retrieve commits from
  502. commit_ids: List of commit IDs to include in the graph
  503. """
  504. if isinstance(git_dir, str):
  505. git_dir = git_dir.encode()
  506. # Generate the commit graph
  507. graph = generate_commit_graph(object_store, commit_ids)
  508. if not graph.entries:
  509. return # Nothing to write
  510. # Ensure the objects/info directory exists
  511. info_dir = os.path.join(git_dir, b"objects", b"info")
  512. os.makedirs(info_dir, exist_ok=True)
  513. # Write using GitFile for atomic operation
  514. from .file import GitFile
  515. graph_path = os.path.join(info_dir, b"commit-graph")
  516. with GitFile(graph_path, "wb") as f:
  517. graph.write_to_file(f)
  518. def get_reachable_commits(
  519. object_store: "BaseObjectStore", start_commits: Sequence[ObjectID]
  520. ) -> list[ObjectID]:
  521. """Get all commits reachable from the given starting commits.
  522. Args:
  523. object_store: Object store to retrieve commits from
  524. start_commits: List of starting commit IDs
  525. Returns:
  526. List of all reachable commit IDs (including the starting commits)
  527. """
  528. visited: set[ObjectID] = set()
  529. reachable: list[ObjectID] = []
  530. stack: list[ObjectID] = []
  531. hex_length = object_store.object_format.hex_length
  532. oid_length = object_store.object_format.oid_length
  533. # Normalize commit IDs for object store access and tracking
  534. for commit_id in start_commits:
  535. if isinstance(commit_id, bytes) and len(commit_id) == hex_length:
  536. # Hex ObjectID - use directly for object store access
  537. if commit_id not in visited:
  538. stack.append(commit_id)
  539. elif isinstance(commit_id, bytes) and len(commit_id) == oid_length:
  540. # Binary SHA, convert to hex ObjectID for object store access
  541. hex_id = sha_to_hex(RawObjectID(commit_id))
  542. if hex_id not in visited:
  543. stack.append(hex_id)
  544. else:
  545. # Assume it's already correct format
  546. oid = ObjectID(commit_id)
  547. if oid not in visited:
  548. stack.append(oid)
  549. while stack:
  550. commit_id = stack.pop()
  551. if commit_id in visited:
  552. continue
  553. visited.add(commit_id)
  554. try:
  555. commit_obj = object_store[commit_id]
  556. if not isinstance(commit_obj, Commit):
  557. continue
  558. # Add to reachable list (commit_id is already hex ObjectID)
  559. reachable.append(commit_id)
  560. # Add parents to stack
  561. for parent_id in commit_obj.parents:
  562. if parent_id not in visited:
  563. stack.append(parent_id)
  564. except KeyError:
  565. # Commit not found, skip
  566. continue
  567. return reachable