commit_graph.py 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690
  1. # commit_graph.py -- Git commit graph file format support
  2. # Copyright (C) 2024 Jelmer Vernooij <jelmer@jelmer.uk>
  3. #
  4. # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
  5. # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
  6. # General Public License as published by the Free Software Foundation; version 2.0
  7. # or (at your option) any later version. You can redistribute it and/or
  8. # modify it under the terms of either of these two licenses.
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. # You should have received a copy of the licenses; if not, see
  17. # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  18. # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  19. # License, Version 2.0.
  20. #
  21. """Git commit graph file format support.
  22. Git's commit graph files store commit metadata and generation numbers
  23. for faster graph traversal operations like merge-base computation.
  24. The commit graph format is documented at:
  25. https://git-scm.com/docs/gitformat-commit-graph
  26. """
  27. __all__ = [
  28. "CHUNK_BASE_GRAPHS_LIST",
  29. "CHUNK_BLOOM_FILTER_DATA",
  30. "CHUNK_BLOOM_FILTER_INDEX",
  31. "CHUNK_COMMIT_DATA",
  32. "CHUNK_EXTRA_EDGE_LIST",
  33. "CHUNK_GENERATION_DATA",
  34. "CHUNK_GENERATION_DATA_OVERFLOW",
  35. "CHUNK_OID_FANOUT",
  36. "CHUNK_OID_LOOKUP",
  37. "COMMIT_GRAPH_SIGNATURE",
  38. "COMMIT_GRAPH_VERSION",
  39. "GENERATION_NUMBER_INFINITY",
  40. "GENERATION_NUMBER_V1_MAX",
  41. "GENERATION_NUMBER_ZERO",
  42. "GRAPH_EXTRA_EDGES_NEEDED",
  43. "GRAPH_LAST_EDGE",
  44. "GRAPH_PARENT_MISSING",
  45. "GRAPH_PARENT_NONE",
  46. "HASH_VERSION_SHA1",
  47. "HASH_VERSION_SHA256",
  48. "CommitGraph",
  49. "CommitGraphChunk",
  50. "CommitGraphEntry",
  51. "find_commit_graph_file",
  52. "generate_commit_graph",
  53. "get_reachable_commits",
  54. "read_commit_graph",
  55. "write_commit_graph",
  56. ]
  57. import os
  58. import struct
  59. from collections.abc import Iterator, Sequence
  60. from typing import TYPE_CHECKING, BinaryIO
  61. from .file import _GitFile
  62. if TYPE_CHECKING:
  63. from .object_store import BaseObjectStore
  64. from .objects import Commit, ObjectID, RawObjectID, hex_to_sha, sha_to_hex
  65. # File format constants
  66. COMMIT_GRAPH_SIGNATURE = b"CGPH"
  67. COMMIT_GRAPH_VERSION = 1
  68. HASH_VERSION_SHA1 = 1
  69. HASH_VERSION_SHA256 = 2
  70. # Chunk IDs
  71. CHUNK_OID_FANOUT = b"OIDF"
  72. CHUNK_OID_LOOKUP = b"OIDL"
  73. CHUNK_COMMIT_DATA = b"CDAT"
  74. CHUNK_GENERATION_DATA = b"GDA2"
  75. CHUNK_GENERATION_DATA_OVERFLOW = b"GDO2"
  76. CHUNK_EXTRA_EDGE_LIST = b"EDGE"
  77. CHUNK_BLOOM_FILTER_INDEX = b"BIDX"
  78. CHUNK_BLOOM_FILTER_DATA = b"BDAT"
  79. CHUNK_BASE_GRAPHS_LIST = b"BASE"
  80. # Generation number constants
  81. GENERATION_NUMBER_INFINITY = 0xFFFFFFFF
  82. GENERATION_NUMBER_ZERO = 0
  83. GENERATION_NUMBER_V1_MAX = 0x3FFFFFFF
  84. # Parent encoding constants
  85. GRAPH_PARENT_MISSING = 0x70000000
  86. GRAPH_PARENT_NONE = 0x70000000
  87. GRAPH_EXTRA_EDGES_NEEDED = 0x80000000
  88. GRAPH_LAST_EDGE = 0x80000000
  89. class CommitGraphEntry:
  90. """Represents a single commit entry in the commit graph."""
  91. def __init__(
  92. self,
  93. commit_id: ObjectID,
  94. tree_id: ObjectID,
  95. parents: list[ObjectID],
  96. generation: int,
  97. commit_time: int,
  98. ) -> None:
  99. """Initialize CommitGraphEntry.
  100. Args:
  101. commit_id: The commit object ID
  102. tree_id: The tree object ID
  103. parents: List of parent commit IDs
  104. generation: Generation number
  105. commit_time: Commit timestamp
  106. """
  107. self.commit_id = commit_id
  108. self.tree_id = tree_id
  109. self.parents = parents
  110. self.generation = generation
  111. self.commit_time = commit_time
  112. def __repr__(self) -> str:
  113. """Return string representation of CommitGraphEntry."""
  114. return (
  115. f"CommitGraphEntry(commit_id={self.commit_id!r}, "
  116. f"tree_id={self.tree_id!r}, parents={self.parents!r}, "
  117. f"generation={self.generation}, commit_time={self.commit_time})"
  118. )
  119. class CommitGraphChunk:
  120. """Represents a chunk in the commit graph file."""
  121. def __init__(self, chunk_id: bytes, data: bytes) -> None:
  122. """Initialize CommitGraphChunk.
  123. Args:
  124. chunk_id: Chunk identifier
  125. data: Chunk data
  126. """
  127. self.chunk_id = chunk_id
  128. self.data = data
  129. def __repr__(self) -> str:
  130. """Return string representation of CommitGraphChunk."""
  131. return f"CommitGraphChunk(chunk_id={self.chunk_id!r}, size={len(self.data)})"
  132. class CommitGraph:
  133. """Git commit graph file reader/writer."""
  134. def __init__(self, *, object_format=None) -> None:
  135. """Initialize CommitGraph.
  136. Args:
  137. object_format: Object format to use (defaults to SHA1)
  138. """
  139. import warnings
  140. from .object_format import DEFAULT_OBJECT_FORMAT, SHA256
  141. if object_format is None:
  142. warnings.warn(
  143. "CommitGraph() should be called with object_format parameter",
  144. DeprecationWarning,
  145. stacklevel=2,
  146. )
  147. object_format = DEFAULT_OBJECT_FORMAT
  148. self.object_format = object_format
  149. self.hash_version = HASH_VERSION_SHA256 if object_format == SHA256 else HASH_VERSION_SHA1
  150. self.chunks: dict[bytes, CommitGraphChunk] = {}
  151. self.entries: list[CommitGraphEntry] = []
  152. self._oid_to_index: dict[ObjectID, int] = {}
  153. @classmethod
  154. def from_file(cls, f: BinaryIO) -> "CommitGraph":
  155. """Read commit graph from file."""
  156. graph = cls()
  157. graph._read_from_file(f)
  158. return graph
  159. def _read_from_file(self, f: BinaryIO) -> None:
  160. """Read commit graph data from file."""
  161. # Read header
  162. signature = f.read(4)
  163. if signature != COMMIT_GRAPH_SIGNATURE:
  164. raise ValueError(f"Invalid commit graph signature: {signature!r}")
  165. version = struct.unpack(">B", f.read(1))[0]
  166. if version != COMMIT_GRAPH_VERSION:
  167. raise ValueError(f"Unsupported commit graph version: {version}")
  168. self.hash_version = struct.unpack(">B", f.read(1))[0]
  169. # Set object_format based on hash_version from file
  170. from .object_format import SHA1, SHA256
  171. if self.hash_version == HASH_VERSION_SHA1:
  172. self.object_format = SHA1
  173. elif self.hash_version == HASH_VERSION_SHA256:
  174. self.object_format = SHA256
  175. else:
  176. raise ValueError(f"Unsupported hash version: {self.hash_version}")
  177. num_chunks = struct.unpack(">B", f.read(1))[0]
  178. struct.unpack(">B", f.read(1))[0]
  179. # Read table of contents
  180. toc_entries = []
  181. for _ in range(num_chunks + 1): # +1 for terminating entry
  182. chunk_id = f.read(4)
  183. offset = struct.unpack(">Q", f.read(8))[0]
  184. toc_entries.append((chunk_id, offset))
  185. # Read chunks
  186. # Offsets in TOC are absolute from start of file
  187. for i in range(num_chunks):
  188. chunk_id, offset = toc_entries[i]
  189. next_offset = toc_entries[i + 1][1]
  190. chunk_size = next_offset - offset
  191. f.seek(offset)
  192. chunk_data = f.read(chunk_size)
  193. self.chunks[chunk_id] = CommitGraphChunk(chunk_id, chunk_data)
  194. # Parse chunks
  195. self._parse_chunks()
  196. def _parse_chunks(self) -> None:
  197. """Parse chunk data into entries."""
  198. if CHUNK_OID_LOOKUP not in self.chunks:
  199. raise ValueError("Missing required OID lookup chunk")
  200. if CHUNK_COMMIT_DATA not in self.chunks:
  201. raise ValueError("Missing required commit data chunk")
  202. # Parse OID lookup chunk
  203. oid_lookup_data = self.chunks[CHUNK_OID_LOOKUP].data
  204. num_commits = len(oid_lookup_data) // self.object_format.oid_length
  205. oids = []
  206. for i in range(num_commits):
  207. start = i * self.object_format.oid_length
  208. end = start + self.object_format.oid_length
  209. oid = oid_lookup_data[start:end]
  210. oids.append(oid)
  211. self._oid_to_index[sha_to_hex(oid)] = i
  212. # Parse commit data chunk
  213. commit_data = self.chunks[CHUNK_COMMIT_DATA].data
  214. expected_size = num_commits * (self.object_format.oid_length + 16)
  215. if len(commit_data) != expected_size:
  216. raise ValueError(
  217. f"Invalid commit data chunk size: {len(commit_data)}, expected {expected_size}"
  218. )
  219. self.entries = []
  220. for i in range(num_commits):
  221. offset = i * (self.object_format.oid_length + 16)
  222. # Tree OID
  223. tree_id = commit_data[offset : offset + self.object_format.oid_length]
  224. offset += self.object_format.oid_length
  225. # Parent positions (2 x 4 bytes)
  226. parent1_pos, parent2_pos = struct.unpack(
  227. ">LL", commit_data[offset : offset + 8]
  228. )
  229. offset += 8
  230. # Generation number and commit time (2 x 4 bytes)
  231. gen_and_time = struct.unpack(">LL", commit_data[offset : offset + 8])
  232. generation = gen_and_time[0] >> 2 # Upper 30 bits
  233. commit_time = gen_and_time[1] | (
  234. (gen_and_time[0] & 0x3) << 32
  235. ) # 34 bits total
  236. # Parse parents
  237. parents = []
  238. if parent1_pos < GRAPH_PARENT_MISSING:
  239. if parent1_pos >= len(oids):
  240. raise ValueError(f"Invalid parent1 position: {parent1_pos}")
  241. parents.append(oids[parent1_pos])
  242. if parent2_pos < GRAPH_PARENT_MISSING:
  243. if parent2_pos >= len(oids):
  244. raise ValueError(f"Invalid parent2 position: {parent2_pos}")
  245. parents.append(oids[parent2_pos])
  246. elif parent2_pos >= GRAPH_EXTRA_EDGES_NEEDED:
  247. # Handle extra edges (3+ parents)
  248. edge_offset = parent2_pos & ~GRAPH_EXTRA_EDGES_NEEDED
  249. parents.extend(self._parse_extra_edges(edge_offset, oids))
  250. entry = CommitGraphEntry(
  251. commit_id=sha_to_hex(oids[i]),
  252. tree_id=sha_to_hex(tree_id),
  253. parents=[sha_to_hex(p) for p in parents],
  254. generation=generation,
  255. commit_time=commit_time,
  256. )
  257. self.entries.append(entry)
  258. def _parse_extra_edges(self, offset: int, oids: Sequence[bytes]) -> list[bytes]:
  259. """Parse extra parent edges for commits with 3+ parents."""
  260. if CHUNK_EXTRA_EDGE_LIST not in self.chunks:
  261. return []
  262. edge_data = self.chunks[CHUNK_EXTRA_EDGE_LIST].data
  263. parents = []
  264. while offset < len(edge_data):
  265. parent_pos = struct.unpack(">L", edge_data[offset : offset + 4])[0]
  266. offset += 4
  267. if parent_pos & GRAPH_LAST_EDGE:
  268. parent_pos &= ~GRAPH_LAST_EDGE
  269. if parent_pos < len(oids):
  270. parents.append(oids[parent_pos])
  271. break
  272. else:
  273. if parent_pos < len(oids):
  274. parents.append(oids[parent_pos])
  275. return parents
  276. def get_entry_by_oid(self, oid: ObjectID) -> CommitGraphEntry | None:
  277. """Get commit graph entry by commit OID."""
  278. # Convert hex ObjectID to binary if needed for lookup
  279. if isinstance(oid, bytes) and len(oid) == self.object_format.hex_length:
  280. # Input is hex ObjectID, convert to binary for internal lookup
  281. lookup_oid = hex_to_sha(oid)
  282. else:
  283. # Input is already binary
  284. lookup_oid = oid
  285. index = self._oid_to_index.get(lookup_oid)
  286. if index is not None:
  287. return self.entries[index]
  288. return None
  289. def get_generation_number(self, oid: ObjectID) -> int | None:
  290. """Get generation number for a commit."""
  291. entry = self.get_entry_by_oid(oid)
  292. return entry.generation if entry else None
  293. def get_parents(self, oid: ObjectID) -> list[ObjectID] | None:
  294. """Get parent commit IDs for a commit."""
  295. entry = self.get_entry_by_oid(oid)
  296. return entry.parents if entry else None
  297. def write_to_file(self, f: BinaryIO | _GitFile) -> None:
  298. """Write commit graph to file."""
  299. if not self.entries:
  300. raise ValueError("Cannot write empty commit graph")
  301. # Sort entries by commit ID for consistent output
  302. sorted_entries = sorted(self.entries, key=lambda e: e.commit_id)
  303. # Build OID lookup chunk
  304. oid_lookup_data = b""
  305. for entry in sorted_entries:
  306. oid_lookup_data += hex_to_sha(entry.commit_id)
  307. # Build commit data chunk
  308. commit_data = b""
  309. # Create OID to index mapping for parent lookups
  310. oid_to_index = {entry.commit_id: i for i, entry in enumerate(sorted_entries)}
  311. for entry in sorted_entries:
  312. # Tree OID (20 bytes)
  313. commit_data += hex_to_sha(entry.tree_id)
  314. # Parent positions (2 x 4 bytes)
  315. if len(entry.parents) == 0:
  316. parent1_pos = GRAPH_PARENT_MISSING
  317. parent2_pos = GRAPH_PARENT_MISSING
  318. elif len(entry.parents) == 1:
  319. parent1_pos = oid_to_index.get(entry.parents[0], GRAPH_PARENT_MISSING)
  320. parent2_pos = GRAPH_PARENT_MISSING
  321. elif len(entry.parents) == 2:
  322. parent1_pos = oid_to_index.get(entry.parents[0], GRAPH_PARENT_MISSING)
  323. parent2_pos = oid_to_index.get(entry.parents[1], GRAPH_PARENT_MISSING)
  324. else:
  325. # More than 2 parents - would need extra edge list chunk
  326. # For now, just store first two parents
  327. parent1_pos = oid_to_index.get(entry.parents[0], GRAPH_PARENT_MISSING)
  328. parent2_pos = oid_to_index.get(entry.parents[1], GRAPH_PARENT_MISSING)
  329. commit_data += struct.pack(">LL", parent1_pos, parent2_pos)
  330. # Generation and commit time (2 x 4 bytes)
  331. gen_and_time = (entry.generation << 2) | (entry.commit_time >> 32)
  332. commit_time_lower = entry.commit_time & 0xFFFFFFFF
  333. commit_data += struct.pack(">LL", gen_and_time, commit_time_lower)
  334. # Build fanout table
  335. fanout_data = b""
  336. fanout_counts = [0] * 256
  337. for i, entry in enumerate(sorted_entries):
  338. commit_oid_bytes = hex_to_sha(entry.commit_id)
  339. fanout_counts[commit_oid_bytes[0]] = i + 1
  340. # Fill in gaps - each fanout entry should be cumulative
  341. for i in range(1, 256):
  342. if fanout_counts[i] == 0:
  343. fanout_counts[i] = fanout_counts[i - 1]
  344. for count in fanout_counts:
  345. fanout_data += struct.pack(">L", count)
  346. # Calculate chunk offsets
  347. header_size = (
  348. 8 # signature + version + hash_version + num_chunks + base_graph_count
  349. )
  350. toc_size = 4 * 12 # 4 entries (3 chunks + terminator) * 12 bytes each
  351. chunk1_offset = header_size + toc_size # OID Fanout
  352. chunk2_offset = chunk1_offset + len(fanout_data) # OID Lookup
  353. chunk3_offset = chunk2_offset + len(oid_lookup_data) # Commit Data
  354. terminator_offset = chunk3_offset + len(commit_data)
  355. # Write header
  356. f.write(COMMIT_GRAPH_SIGNATURE)
  357. f.write(struct.pack(">B", COMMIT_GRAPH_VERSION))
  358. f.write(struct.pack(">B", self.hash_version))
  359. f.write(struct.pack(">B", 3)) # 3 chunks
  360. f.write(struct.pack(">B", 0)) # 0 base graphs
  361. # Write table of contents
  362. f.write(CHUNK_OID_FANOUT + struct.pack(">Q", chunk1_offset))
  363. f.write(CHUNK_OID_LOOKUP + struct.pack(">Q", chunk2_offset))
  364. f.write(CHUNK_COMMIT_DATA + struct.pack(">Q", chunk3_offset))
  365. f.write(b"\x00\x00\x00\x00" + struct.pack(">Q", terminator_offset))
  366. # Write chunks
  367. f.write(fanout_data)
  368. f.write(oid_lookup_data)
  369. f.write(commit_data)
  370. def __len__(self) -> int:
  371. """Return number of commits in the graph."""
  372. return len(self.entries)
  373. def __iter__(self) -> Iterator["CommitGraphEntry"]:
  374. """Iterate over commit graph entries."""
  375. return iter(self.entries)
  376. def read_commit_graph(path: str | bytes) -> CommitGraph | None:
  377. """Read commit graph from file path."""
  378. if isinstance(path, str):
  379. path = path.encode()
  380. if not os.path.exists(path):
  381. return None
  382. with open(path, "rb") as f:
  383. return CommitGraph.from_file(f)
  384. def find_commit_graph_file(git_dir: str | bytes) -> bytes | None:
  385. """Find commit graph file in a Git repository."""
  386. if isinstance(git_dir, str):
  387. git_dir = git_dir.encode()
  388. # Standard location: .git/objects/info/commit-graph
  389. commit_graph_path = os.path.join(git_dir, b"objects", b"info", b"commit-graph")
  390. if os.path.exists(commit_graph_path):
  391. return commit_graph_path
  392. # Chain files in .git/objects/info/commit-graphs/
  393. commit_graphs_dir = os.path.join(git_dir, b"objects", b"info", b"commit-graphs")
  394. if os.path.exists(commit_graphs_dir):
  395. # Look for graph-{hash}.graph files
  396. for filename in os.listdir(commit_graphs_dir):
  397. if filename.startswith(b"graph-") and filename.endswith(b".graph"):
  398. return os.path.join(commit_graphs_dir, filename)
  399. return None
  400. def generate_commit_graph(
  401. object_store: "BaseObjectStore", commit_ids: Sequence[ObjectID]
  402. ) -> CommitGraph:
  403. """Generate a commit graph from a set of commits.
  404. Args:
  405. object_store: Object store to retrieve commits from
  406. commit_ids: List of commit IDs to include in the graph
  407. Returns:
  408. CommitGraph object containing the specified commits
  409. """
  410. graph = CommitGraph(object_format=object_store.object_format)
  411. if not commit_ids:
  412. return graph
  413. # Ensure all commit_ids are in the correct format for object store access
  414. hex_length = object_store.object_format.hex_length
  415. oid_length = object_store.object_format.oid_length
  416. normalized_commit_ids = []
  417. for commit_id in commit_ids:
  418. if isinstance(commit_id, bytes) and len(commit_id) == hex_length:
  419. # Already hex ObjectID
  420. normalized_commit_ids.append(commit_id)
  421. elif isinstance(commit_id, bytes) and len(commit_id) == oid_length:
  422. # Binary SHA, convert to hex ObjectID
  423. normalized_commit_ids.append(sha_to_hex(RawObjectID(commit_id)))
  424. else:
  425. # Assume it's already correct format
  426. normalized_commit_ids.append(ObjectID(commit_id))
  427. # Build a map of all commits and their metadata
  428. commit_map: dict[ObjectID, Commit] = {}
  429. for commit_id in normalized_commit_ids:
  430. try:
  431. commit_obj = object_store[commit_id]
  432. if commit_obj.type_name != b"commit":
  433. continue
  434. assert isinstance(commit_obj, Commit)
  435. commit_map[commit_id] = commit_obj
  436. except KeyError:
  437. # Commit not found, skip
  438. continue
  439. # Calculate generation numbers using topological sort
  440. generation_map: dict[bytes, int] = {}
  441. def calculate_generation(commit_id: ObjectID) -> int:
  442. if commit_id in generation_map:
  443. return generation_map[commit_id]
  444. if commit_id not in commit_map:
  445. # Unknown commit, assume generation 0
  446. generation_map[commit_id] = 0
  447. return 0
  448. commit_obj = commit_map[commit_id]
  449. if not commit_obj.parents:
  450. # Root commit
  451. generation_map[commit_id] = 1
  452. return 1
  453. # Calculate based on parents
  454. max_parent_gen = 0
  455. for parent_id in commit_obj.parents:
  456. parent_gen = calculate_generation(parent_id)
  457. max_parent_gen = max(max_parent_gen, parent_gen)
  458. generation = max_parent_gen + 1
  459. generation_map[commit_id] = generation
  460. return generation
  461. # Calculate generation numbers for all commits
  462. for commit_id in commit_map:
  463. calculate_generation(commit_id)
  464. # Build commit graph entries
  465. for commit_id, commit_obj in commit_map.items():
  466. # commit_id is already hex ObjectID from normalized_commit_ids
  467. commit_hex: ObjectID = commit_id
  468. # Handle tree ID - might already be hex ObjectID
  469. if isinstance(commit_obj.tree, bytes) and len(commit_obj.tree) == hex_length:
  470. tree_hex = commit_obj.tree # Already hex ObjectID
  471. else:
  472. tree_hex = sha_to_hex(commit_obj.tree) # Binary, convert to hex
  473. # Handle parent IDs - might already be hex ObjectIDs
  474. parents_hex: list[ObjectID] = []
  475. for parent_id in commit_obj.parents:
  476. if isinstance(parent_id, bytes) and len(parent_id) == hex_length:
  477. parents_hex.append(parent_id) # Already hex ObjectID
  478. else:
  479. parents_hex.append(sha_to_hex(parent_id)) # Binary, convert to hex
  480. entry = CommitGraphEntry(
  481. commit_id=commit_hex,
  482. tree_id=tree_hex,
  483. parents=parents_hex,
  484. generation=generation_map[commit_id],
  485. commit_time=commit_obj.commit_time,
  486. )
  487. graph.entries.append(entry)
  488. # Build the OID to index mapping for lookups
  489. graph._oid_to_index = {}
  490. for i, entry in enumerate(graph.entries):
  491. graph._oid_to_index[entry.commit_id] = i
  492. return graph
  493. def write_commit_graph(
  494. git_dir: str | bytes,
  495. object_store: "BaseObjectStore",
  496. commit_ids: Sequence[ObjectID],
  497. ) -> None:
  498. """Write a commit graph file for the given commits.
  499. Args:
  500. git_dir: Git directory path
  501. object_store: Object store to retrieve commits from
  502. commit_ids: List of commit IDs to include in the graph
  503. """
  504. if isinstance(git_dir, str):
  505. git_dir = git_dir.encode()
  506. # Generate the commit graph
  507. graph = generate_commit_graph(object_store, commit_ids)
  508. if not graph.entries:
  509. return # Nothing to write
  510. # Ensure the objects/info directory exists
  511. info_dir = os.path.join(git_dir, b"objects", b"info")
  512. os.makedirs(info_dir, exist_ok=True)
  513. # Write using GitFile for atomic operation
  514. from .file import GitFile
  515. graph_path = os.path.join(info_dir, b"commit-graph")
  516. with GitFile(graph_path, "wb") as f:
  517. graph.write_to_file(f)
  518. def get_reachable_commits(
  519. object_store: "BaseObjectStore", start_commits: Sequence[ObjectID]
  520. ) -> list[ObjectID]:
  521. """Get all commits reachable from the given starting commits.
  522. Args:
  523. object_store: Object store to retrieve commits from
  524. start_commits: List of starting commit IDs
  525. Returns:
  526. List of all reachable commit IDs (including the starting commits)
  527. """
  528. visited: set[ObjectID] = set()
  529. reachable: list[ObjectID] = []
  530. stack: list[ObjectID] = []
  531. hex_length = object_store.object_format.hex_length
  532. oid_length = object_store.object_format.oid_length
  533. # Normalize commit IDs for object store access and tracking
  534. for commit_id in start_commits:
  535. if isinstance(commit_id, bytes) and len(commit_id) == hex_length:
  536. # Hex ObjectID - use directly for object store access
  537. if commit_id not in visited:
  538. stack.append(commit_id)
  539. elif isinstance(commit_id, bytes) and len(commit_id) == oid_length:
  540. # Binary SHA, convert to hex ObjectID for object store access
  541. hex_id = sha_to_hex(RawObjectID(commit_id))
  542. if hex_id not in visited:
  543. stack.append(hex_id)
  544. else:
  545. # Assume it's already correct format
  546. oid = ObjectID(commit_id)
  547. if oid not in visited:
  548. stack.append(oid)
  549. while stack:
  550. commit_id = stack.pop()
  551. if commit_id in visited:
  552. continue
  553. visited.add(commit_id)
  554. try:
  555. commit_obj = object_store[commit_id]
  556. if not isinstance(commit_obj, Commit):
  557. continue
  558. # Add to reachable list (commit_id is already hex ObjectID)
  559. reachable.append(commit_id)
  560. # Add parents to stack
  561. for parent_id in commit_obj.parents:
  562. if parent_id not in visited:
  563. stack.append(parent_id)
  564. except KeyError:
  565. # Commit not found, skip
  566. continue
  567. return reachable