# commit_graph.py -- Git commit graph file format support # Copyright (C) 2024 Jelmer Vernooij # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as public by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. """Git commit graph file format support. Git's commit graph files store commit metadata and generation numbers for faster graph traversal operations like merge-base computation. The commit graph format is documented at: https://git-scm.com/docs/gitformat-commit-graph """ import os import struct from collections.abc import Iterator from typing import TYPE_CHECKING, BinaryIO, Optional, Union if TYPE_CHECKING: from .object_store import BaseObjectStore from .objects import Commit, ObjectID, hex_to_sha, sha_to_hex # File format constants COMMIT_GRAPH_SIGNATURE = b"CGPH" COMMIT_GRAPH_VERSION = 1 HASH_VERSION_SHA1 = 1 HASH_VERSION_SHA256 = 2 # Chunk IDs CHUNK_OID_FANOUT = b"OIDF" CHUNK_OID_LOOKUP = b"OIDL" CHUNK_COMMIT_DATA = b"CDAT" CHUNK_GENERATION_DATA = b"GDA2" CHUNK_GENERATION_DATA_OVERFLOW = b"GDO2" CHUNK_EXTRA_EDGE_LIST = b"EDGE" CHUNK_BLOOM_FILTER_INDEX = b"BIDX" CHUNK_BLOOM_FILTER_DATA = b"BDAT" CHUNK_BASE_GRAPHS_LIST = b"BASE" # Generation number constants GENERATION_NUMBER_INFINITY = 0xFFFFFFFF GENERATION_NUMBER_ZERO = 0 GENERATION_NUMBER_V1_MAX = 0x3FFFFFFF # Parent encoding constants GRAPH_PARENT_MISSING = 0x70000000 GRAPH_PARENT_NONE = 0x70000000 GRAPH_EXTRA_EDGES_NEEDED = 0x80000000 GRAPH_LAST_EDGE = 0x80000000 class CommitGraphEntry: """Represents a single commit entry in the commit graph.""" def __init__( self, commit_id: ObjectID, tree_id: ObjectID, parents: list[ObjectID], generation: int, commit_time: int, ) -> None: self.commit_id = commit_id self.tree_id = tree_id self.parents = parents self.generation = generation self.commit_time = commit_time def __repr__(self) -> str: return ( f"CommitGraphEntry(commit_id={self.commit_id!r}, " f"tree_id={self.tree_id!r}, parents={self.parents!r}, " f"generation={self.generation}, commit_time={self.commit_time})" ) class CommitGraphChunk: """Represents a chunk in the commit graph file.""" def __init__(self, chunk_id: bytes, data: bytes) -> None: self.chunk_id = chunk_id self.data = data def __repr__(self) -> str: return f"CommitGraphChunk(chunk_id={self.chunk_id!r}, size={len(self.data)})" class CommitGraph: """Git commit graph file reader/writer.""" def __init__(self, hash_version: int = HASH_VERSION_SHA1) -> None: self.hash_version = hash_version self.chunks: dict[bytes, CommitGraphChunk] = {} self.entries: list[CommitGraphEntry] = [] self._oid_to_index: dict[ObjectID, int] = {} self._hash_size = 20 if hash_version == HASH_VERSION_SHA1 else 32 @classmethod def from_file(cls, f: BinaryIO) -> "CommitGraph": """Read commit graph from file.""" graph = cls() graph._read_from_file(f) return graph def _read_from_file(self, f: BinaryIO) -> None: """Read commit graph data from file.""" # Read header signature = f.read(4) if signature != COMMIT_GRAPH_SIGNATURE: raise ValueError(f"Invalid commit graph signature: {signature!r}") version = struct.unpack(">B", f.read(1))[0] if version != COMMIT_GRAPH_VERSION: raise ValueError(f"Unsupported commit graph version: {version}") self.hash_version = struct.unpack(">B", f.read(1))[0] if self.hash_version not in (HASH_VERSION_SHA1, HASH_VERSION_SHA256): raise ValueError(f"Unsupported hash version: {self.hash_version}") self._hash_size = 20 if self.hash_version == HASH_VERSION_SHA1 else 32 num_chunks = struct.unpack(">B", f.read(1))[0] struct.unpack(">B", f.read(1))[0] # Read table of contents toc_entries = [] for _ in range(num_chunks + 1): # +1 for terminating entry chunk_id = f.read(4) offset = struct.unpack(">Q", f.read(8))[0] toc_entries.append((chunk_id, offset)) # Read chunks # Offsets in TOC are absolute from start of file for i in range(num_chunks): chunk_id, offset = toc_entries[i] next_offset = toc_entries[i + 1][1] chunk_size = next_offset - offset f.seek(offset) chunk_data = f.read(chunk_size) self.chunks[chunk_id] = CommitGraphChunk(chunk_id, chunk_data) # Parse chunks self._parse_chunks() def _parse_chunks(self) -> None: """Parse chunk data into entries.""" if CHUNK_OID_LOOKUP not in self.chunks: raise ValueError("Missing required OID lookup chunk") if CHUNK_COMMIT_DATA not in self.chunks: raise ValueError("Missing required commit data chunk") # Parse OID lookup chunk oid_lookup_data = self.chunks[CHUNK_OID_LOOKUP].data num_commits = len(oid_lookup_data) // self._hash_size oids = [] for i in range(num_commits): start = i * self._hash_size end = start + self._hash_size oid = oid_lookup_data[start:end] oids.append(oid) self._oid_to_index[oid] = i # Parse commit data chunk commit_data = self.chunks[CHUNK_COMMIT_DATA].data expected_size = num_commits * (self._hash_size + 16) if len(commit_data) != expected_size: raise ValueError( f"Invalid commit data chunk size: {len(commit_data)}, expected {expected_size}" ) self.entries = [] for i in range(num_commits): offset = i * (self._hash_size + 16) # Tree OID tree_id = commit_data[offset : offset + self._hash_size] offset += self._hash_size # Parent positions (2 x 4 bytes) parent1_pos, parent2_pos = struct.unpack( ">LL", commit_data[offset : offset + 8] ) offset += 8 # Generation number and commit time (2 x 4 bytes) gen_and_time = struct.unpack(">LL", commit_data[offset : offset + 8]) generation = gen_and_time[0] >> 2 # Upper 30 bits commit_time = gen_and_time[1] | ( (gen_and_time[0] & 0x3) << 32 ) # 34 bits total # Parse parents parents = [] if parent1_pos < GRAPH_PARENT_MISSING: if parent1_pos >= len(oids): raise ValueError(f"Invalid parent1 position: {parent1_pos}") parents.append(oids[parent1_pos]) if parent2_pos < GRAPH_PARENT_MISSING: if parent2_pos >= len(oids): raise ValueError(f"Invalid parent2 position: {parent2_pos}") parents.append(oids[parent2_pos]) elif parent2_pos >= GRAPH_EXTRA_EDGES_NEEDED: # Handle extra edges (3+ parents) edge_offset = parent2_pos & ~GRAPH_EXTRA_EDGES_NEEDED parents.extend(self._parse_extra_edges(edge_offset, oids)) entry = CommitGraphEntry( commit_id=sha_to_hex(oids[i]), tree_id=sha_to_hex(tree_id), parents=[sha_to_hex(p) for p in parents], generation=generation, commit_time=commit_time, ) self.entries.append(entry) def _parse_extra_edges(self, offset: int, oids: list[bytes]) -> list[bytes]: """Parse extra parent edges for commits with 3+ parents.""" if CHUNK_EXTRA_EDGE_LIST not in self.chunks: return [] edge_data = self.chunks[CHUNK_EXTRA_EDGE_LIST].data parents = [] while offset < len(edge_data): parent_pos = struct.unpack(">L", edge_data[offset : offset + 4])[0] offset += 4 if parent_pos & GRAPH_LAST_EDGE: parent_pos &= ~GRAPH_LAST_EDGE if parent_pos < len(oids): parents.append(oids[parent_pos]) break else: if parent_pos < len(oids): parents.append(oids[parent_pos]) return parents def get_entry_by_oid(self, oid: ObjectID) -> Optional[CommitGraphEntry]: """Get commit graph entry by commit OID.""" # Convert hex ObjectID to binary if needed for lookup if isinstance(oid, bytes) and len(oid) == 40: # Input is hex ObjectID, convert to binary for internal lookup lookup_oid = hex_to_sha(oid) else: # Input is already binary lookup_oid = oid index = self._oid_to_index.get(lookup_oid) if index is not None: return self.entries[index] return None def get_generation_number(self, oid: ObjectID) -> Optional[int]: """Get generation number for a commit.""" entry = self.get_entry_by_oid(oid) return entry.generation if entry else None def get_parents(self, oid: ObjectID) -> Optional[list[ObjectID]]: """Get parent commit IDs for a commit.""" entry = self.get_entry_by_oid(oid) return entry.parents if entry else None def write_to_file(self, f: BinaryIO) -> None: """Write commit graph to file.""" if not self.entries: raise ValueError("Cannot write empty commit graph") # Sort entries by commit ID for consistent output sorted_entries = sorted(self.entries, key=lambda e: e.commit_id) # Build OID lookup chunk oid_lookup_data = b"" for entry in sorted_entries: oid_lookup_data += hex_to_sha(entry.commit_id) # Build commit data chunk commit_data = b"" # Create OID to index mapping for parent lookups oid_to_index = {entry.commit_id: i for i, entry in enumerate(sorted_entries)} for entry in sorted_entries: # Tree OID (20 bytes) commit_data += hex_to_sha(entry.tree_id) # Parent positions (2 x 4 bytes) if len(entry.parents) == 0: parent1_pos = GRAPH_PARENT_MISSING parent2_pos = GRAPH_PARENT_MISSING elif len(entry.parents) == 1: parent1_pos = oid_to_index.get(entry.parents[0], GRAPH_PARENT_MISSING) parent2_pos = GRAPH_PARENT_MISSING elif len(entry.parents) == 2: parent1_pos = oid_to_index.get(entry.parents[0], GRAPH_PARENT_MISSING) parent2_pos = oid_to_index.get(entry.parents[1], GRAPH_PARENT_MISSING) else: # More than 2 parents - would need extra edge list chunk # For now, just store first two parents parent1_pos = oid_to_index.get(entry.parents[0], GRAPH_PARENT_MISSING) parent2_pos = oid_to_index.get(entry.parents[1], GRAPH_PARENT_MISSING) commit_data += struct.pack(">LL", parent1_pos, parent2_pos) # Generation and commit time (2 x 4 bytes) gen_and_time = (entry.generation << 2) | (entry.commit_time >> 32) commit_time_lower = entry.commit_time & 0xFFFFFFFF commit_data += struct.pack(">LL", gen_and_time, commit_time_lower) # Build fanout table fanout_data = b"" fanout_counts = [0] * 256 for i, entry in enumerate(sorted_entries): commit_oid_bytes = hex_to_sha(entry.commit_id) fanout_counts[commit_oid_bytes[0]] = i + 1 # Fill in gaps - each fanout entry should be cumulative for i in range(1, 256): if fanout_counts[i] == 0: fanout_counts[i] = fanout_counts[i - 1] for count in fanout_counts: fanout_data += struct.pack(">L", count) # Calculate chunk offsets header_size = ( 8 # signature + version + hash_version + num_chunks + base_graph_count ) toc_size = 4 * 12 # 4 entries (3 chunks + terminator) * 12 bytes each chunk1_offset = header_size + toc_size # OID Fanout chunk2_offset = chunk1_offset + len(fanout_data) # OID Lookup chunk3_offset = chunk2_offset + len(oid_lookup_data) # Commit Data terminator_offset = chunk3_offset + len(commit_data) # Write header f.write(COMMIT_GRAPH_SIGNATURE) f.write(struct.pack(">B", COMMIT_GRAPH_VERSION)) f.write(struct.pack(">B", self.hash_version)) f.write(struct.pack(">B", 3)) # 3 chunks f.write(struct.pack(">B", 0)) # 0 base graphs # Write table of contents f.write(CHUNK_OID_FANOUT + struct.pack(">Q", chunk1_offset)) f.write(CHUNK_OID_LOOKUP + struct.pack(">Q", chunk2_offset)) f.write(CHUNK_COMMIT_DATA + struct.pack(">Q", chunk3_offset)) f.write(b"\x00\x00\x00\x00" + struct.pack(">Q", terminator_offset)) # Write chunks f.write(fanout_data) f.write(oid_lookup_data) f.write(commit_data) def __len__(self) -> int: """Return number of commits in the graph.""" return len(self.entries) def __iter__(self) -> Iterator["CommitGraphEntry"]: """Iterate over commit graph entries.""" return iter(self.entries) def read_commit_graph(path: Union[str, bytes]) -> Optional[CommitGraph]: """Read commit graph from file path.""" if isinstance(path, str): path = path.encode() if not os.path.exists(path): return None with open(path, "rb") as f: return CommitGraph.from_file(f) def find_commit_graph_file(git_dir: Union[str, bytes]) -> Optional[bytes]: """Find commit graph file in a Git repository.""" if isinstance(git_dir, str): git_dir = git_dir.encode() # Standard location: .git/objects/info/commit-graph commit_graph_path = os.path.join(git_dir, b"objects", b"info", b"commit-graph") if os.path.exists(commit_graph_path): return commit_graph_path # Chain files in .git/objects/info/commit-graphs/ commit_graphs_dir = os.path.join(git_dir, b"objects", b"info", b"commit-graphs") if os.path.exists(commit_graphs_dir): # Look for graph-{hash}.graph files for filename in os.listdir(commit_graphs_dir): if filename.startswith(b"graph-") and filename.endswith(b".graph"): return os.path.join(commit_graphs_dir, filename) return None def generate_commit_graph( object_store: "BaseObjectStore", commit_ids: list[ObjectID] ) -> CommitGraph: """Generate a commit graph from a set of commits. Args: object_store: Object store to retrieve commits from commit_ids: List of commit IDs to include in the graph Returns: CommitGraph object containing the specified commits """ graph = CommitGraph() if not commit_ids: return graph # Ensure all commit_ids are in the correct format for object store access # DiskObjectStore expects hex ObjectIDs (40-byte hex strings) normalized_commit_ids = [] for commit_id in commit_ids: if isinstance(commit_id, bytes) and len(commit_id) == 40: # Already hex ObjectID normalized_commit_ids.append(commit_id) elif isinstance(commit_id, bytes) and len(commit_id) == 20: # Binary SHA, convert to hex ObjectID normalized_commit_ids.append(sha_to_hex(commit_id)) else: # Assume it's already correct format normalized_commit_ids.append(commit_id) # Build a map of all commits and their metadata commit_map: dict[bytes, Commit] = {} for commit_id in normalized_commit_ids: try: commit_obj = object_store[commit_id] if commit_obj.type_name != b"commit": continue assert isinstance(commit_obj, Commit) commit_map[commit_id] = commit_obj except KeyError: # Commit not found, skip continue # Calculate generation numbers using topological sort generation_map: dict[bytes, int] = {} def calculate_generation(commit_id: ObjectID) -> int: if commit_id in generation_map: return generation_map[commit_id] if commit_id not in commit_map: # Unknown commit, assume generation 0 generation_map[commit_id] = 0 return 0 commit_obj = commit_map[commit_id] if not commit_obj.parents: # Root commit generation_map[commit_id] = 1 return 1 # Calculate based on parents max_parent_gen = 0 for parent_id in commit_obj.parents: parent_gen = calculate_generation(parent_id) max_parent_gen = max(max_parent_gen, parent_gen) generation = max_parent_gen + 1 generation_map[commit_id] = generation return generation # Calculate generation numbers for all commits for commit_id in commit_map: calculate_generation(commit_id) # Build commit graph entries for commit_id, commit_obj in commit_map.items(): # commit_id is already hex ObjectID from normalized_commit_ids commit_hex = commit_id # Handle tree ID - might already be hex ObjectID if isinstance(commit_obj.tree, bytes) and len(commit_obj.tree) == 40: tree_hex = commit_obj.tree # Already hex ObjectID else: tree_hex = sha_to_hex(commit_obj.tree) # Binary, convert to hex # Handle parent IDs - might already be hex ObjectIDs parents_hex = [] for parent_id in commit_obj.parents: if isinstance(parent_id, bytes) and len(parent_id) == 40: parents_hex.append(parent_id) # Already hex ObjectID else: parents_hex.append(sha_to_hex(parent_id)) # Binary, convert to hex entry = CommitGraphEntry( commit_id=commit_hex, tree_id=tree_hex, parents=parents_hex, generation=generation_map[commit_id], commit_time=commit_obj.commit_time, ) graph.entries.append(entry) # Build the OID to index mapping for lookups graph._oid_to_index = {} for i, entry in enumerate(graph.entries): binary_oid = hex_to_sha(entry.commit_id.decode()) graph._oid_to_index[binary_oid] = i return graph def write_commit_graph( git_dir: Union[str, bytes], object_store: "BaseObjectStore", commit_ids: list[ObjectID], ) -> None: """Write a commit graph file for the given commits. Args: git_dir: Git directory path object_store: Object store to retrieve commits from commit_ids: List of commit IDs to include in the graph """ if isinstance(git_dir, str): git_dir = git_dir.encode() # Generate the commit graph graph = generate_commit_graph(object_store, commit_ids) if not graph.entries: return # Nothing to write # Ensure the objects/info directory exists info_dir = os.path.join(git_dir, b"objects", b"info") os.makedirs(info_dir, exist_ok=True) # Write using GitFile for atomic operation from .file import GitFile graph_path = os.path.join(info_dir, b"commit-graph") with GitFile(graph_path, "wb") as f: from typing import BinaryIO, cast graph.write_to_file(cast(BinaryIO, f)) def get_reachable_commits( object_store: "BaseObjectStore", start_commits: list[ObjectID] ) -> list[ObjectID]: """Get all commits reachable from the given starting commits. Args: object_store: Object store to retrieve commits from start_commits: List of starting commit IDs Returns: List of all reachable commit IDs (including the starting commits) """ visited = set() reachable = [] stack = [] # Normalize commit IDs for object store access and tracking for commit_id in start_commits: if isinstance(commit_id, bytes) and len(commit_id) == 40: # Hex ObjectID - use directly for object store access if commit_id not in visited: stack.append(commit_id) elif isinstance(commit_id, bytes) and len(commit_id) == 20: # Binary SHA, convert to hex ObjectID for object store access hex_id = sha_to_hex(commit_id) if hex_id not in visited: stack.append(hex_id) else: # Assume it's already correct format if commit_id not in visited: stack.append(commit_id) while stack: commit_id = stack.pop() if commit_id in visited: continue visited.add(commit_id) try: commit_obj = object_store[commit_id] if not isinstance(commit_obj, Commit): continue # Add to reachable list (commit_id is already hex ObjectID) reachable.append(commit_id) # Add parents to stack for parent_id in commit_obj.parents: if parent_id not in visited: stack.append(parent_id) except KeyError: # Commit not found, skip continue return reachable