# midx.py -- Multi-Pack-Index (MIDX) support
# Copyright (C) 2025 Jelmer Vernooij <jelmer@jelmer.uk>
#
# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
# General Public License as published by the Free Software Foundation; version 2.0
# or (at your option) any later version. You can redistribute it and/or
# modify it under the terms of either of these two licenses.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# You should have received a copy of the licenses; if not, see
# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
# License, Version 2.0.
#
- """Multi-Pack-Index (MIDX) support.
- A multi-pack-index (MIDX) provides a single index that covers multiple pack files,
- enabling fast object lookup across all packs without opening each pack index.
- The MIDX file format consists of:
- - A header with signature, version, and hash algorithm
- - A chunk lookup table
- - Multiple chunks containing pack names, OID fanout, OID lookup, and object offsets
- - A trailer with checksum
- This module provides:
- - Reading MIDX files
- - Writing MIDX files
- - Integration with pack-based object stores
- Limitations:
- - Incremental MIDX chains are not yet supported (base_midx_files must be 0)
- - BTMP (bitmapped packfiles) chunk is not yet implemented
- - RIDX (reverse index) chunk is not yet implemented
- Note: Incremental MIDX chains were introduced in Git 2.47 as an experimental
- feature, where multiple MIDX files can be chained together. The format includes
- a base_midx_files field in the header and uses a multi-pack-index.d/ directory
- with a multi-pack-index-chain file. This feature is not yet supported by Dulwich
- as the specification is still evolving.
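
Example usage (a sketch; the path below is illustrative):

    midx = load_midx(".git/objects/pack/multi-pack-index")
    try:
        for sha, pack_name, offset in midx.iterentries():
            print(pack_name, offset)
    finally:
        midx.close()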
- """

import os
import struct
from collections.abc import Iterator
from io import UnsupportedOperation
from typing import IO, Any

try:
    import mmap
except ImportError:
    has_mmap = False
else:
    has_mmap = True

from .file import GitFile, _GitFile
from .objects import ObjectID, RawObjectID
from .pack import SHA1Writer

# MIDX signature
MIDX_SIGNATURE = b"MIDX"

# MIDX version
MIDX_VERSION = 1

# Chunk identifiers (4 bytes each)
CHUNK_PNAM = b"PNAM"  # Packfile names
CHUNK_OIDF = b"OIDF"  # OID fanout table
CHUNK_OIDL = b"OIDL"  # OID lookup table
CHUNK_OOFF = b"OOFF"  # Object offsets
CHUNK_LOFF = b"LOFF"  # Large offsets (optional)
CHUNK_BTMP = b"BTMP"  # Bitmapped packfiles (optional)
CHUNK_RIDX = b"RIDX"  # Reverse index (optional)

# Hash algorithm identifiers
HASH_ALGORITHM_SHA1 = 1
HASH_ALGORITHM_SHA256 = 2


class MultiPackIndex:
    """Multi-pack-index for efficient object lookup across multiple pack files."""

    def __init__(
        self,
        filename: str | os.PathLike[str],
        file: IO[bytes] | _GitFile | None = None,
        contents: bytes | None = None,
        size: int | None = None,
    ) -> None:
        """Initialize a MultiPackIndex.

        Args:
            filename: Path to the MIDX file
            file: Optional file object
            contents: Optional mmap'd contents
            size: Optional size of the MIDX file
        """
        self._filename = os.fspath(filename)
        self._file = file
        self._size = size

        # Instance variables that will be set during parsing
        self.version: int
        self.hash_algorithm: int
        self.hash_size: int
        self.chunk_count: int
        self.base_midx_files: int
        self.pack_count: int
        self.pack_names: list[str]
        self.object_count: int
        self._chunks: dict[bytes, int]
        self._fanout_table: list[int]
        self._oidl_offset: int
        self._ooff_offset: int
        self._loff_offset: int

        # Load file contents
        if contents is None:
            if file is None:
                with GitFile(filename, "rb") as f:
                    self._contents, self._size = self._load_file_contents(f, size)
            else:
                self._contents, self._size = self._load_file_contents(file, size)
        else:
            self._contents = contents

        # Parse header
        self._parse_header()

        # Parse chunk lookup table
        self._parse_chunk_table()

    def _load_file_contents(
        self, f: IO[bytes] | _GitFile, size: int | None = None
    ) -> tuple[bytes | Any, int]:
        """Load contents from a file, preferring mmap when possible.

        Args:
            f: File-like object to load
            size: Expected size, or None to determine from file

        Returns:
            Tuple of (contents, size)
        """
        try:
            fd = f.fileno()
        except (UnsupportedOperation, AttributeError):
            fd = None

        # Attempt to use mmap if possible
        if fd is not None:
            if size is None:
                size = os.fstat(fd).st_size
            if has_mmap:
                try:
                    contents = mmap.mmap(fd, size, access=mmap.ACCESS_READ)
                except (OSError, ValueError):
                    # Can't mmap - perhaps a socket or invalid file descriptor
                    pass
                else:
                    return contents, size

        # Fall back to reading the entire file into memory
        contents_bytes = f.read()
        size = len(contents_bytes)
        return contents_bytes, size

    def _parse_header(self) -> None:
        """Parse the MIDX header."""
        if len(self._contents) < 12:
            raise ValueError("MIDX file too small")

        # Check signature
        signature = self._contents[0:4]
        if signature != MIDX_SIGNATURE:
            raise ValueError(f"Invalid MIDX signature: {signature!r}")

        # Read version
        self.version = self._contents[4]
        if self.version != MIDX_VERSION:
            raise ValueError(f"Unsupported MIDX version: {self.version}")

        # Read object ID version (hash algorithm)
        self.hash_algorithm = self._contents[5]
        if self.hash_algorithm == HASH_ALGORITHM_SHA1:
            self.hash_size = 20
        elif self.hash_algorithm == HASH_ALGORITHM_SHA256:
            self.hash_size = 32
        else:
            raise ValueError(f"Unknown hash algorithm: {self.hash_algorithm}")

        # Read chunk count
        self.chunk_count = self._contents[6]

        # Read base MIDX files count (currently always 0)
        self.base_midx_files = self._contents[7]
        if self.base_midx_files != 0:
            raise ValueError("Incremental MIDX not yet supported")

        # Read pack file count
        (self.pack_count,) = struct.unpack(">L", self._contents[8:12])

    def _parse_chunk_table(self) -> None:
        """Parse the chunk lookup table."""
        self._chunks = {}

        # The chunk table starts at offset 12; each entry is 12 bytes
        # (4-byte chunk ID + 8-byte offset).
        offset = 12
        for _ in range(self.chunk_count + 1):  # +1 for the terminator entry
            chunk_id = self._contents[offset : offset + 4]
            (chunk_offset,) = struct.unpack(
                ">Q", self._contents[offset + 4 : offset + 12]
            )
            if chunk_id == b"\x00\x00\x00\x00":
                # Terminator entry
                break
            self._chunks[chunk_id] = chunk_offset
            offset += 12

        # Parse required chunks
        self._parse_pnam_chunk()
        self._parse_oidf_chunk()
        self._parse_oidl_chunk()
        self._parse_ooff_chunk()

        # Parse optional chunks
        if CHUNK_LOFF in self._chunks:
            self._parse_loff_chunk()

    def _parse_pnam_chunk(self) -> None:
        """Parse the Packfile Names (PNAM) chunk."""
        if CHUNK_PNAM not in self._chunks:
            raise ValueError("Required PNAM chunk not found")

        offset = self._chunks[CHUNK_PNAM]
        self.pack_names = []

        # Find the end of the PNAM chunk (next chunk or end of chunks section)
        next_offset = min(
            (o for o in self._chunks.values() if o > offset),
            default=len(self._contents),
        )

        # Parse null-terminated pack names
        current = offset
        while current < next_offset:
            # Find the next null terminator
            null_pos = self._contents.find(b"\x00", current, next_offset)
            if null_pos == -1:
                break
            pack_name = self._contents[current:null_pos].decode("utf-8")
            if pack_name:  # Skip empty strings (padding)
                self.pack_names.append(pack_name)
            current = null_pos + 1

    def _parse_oidf_chunk(self) -> None:
        """Parse the OID Fanout (OIDF) chunk."""
        if CHUNK_OIDF not in self._chunks:
            raise ValueError("Required OIDF chunk not found")

        offset = self._chunks[CHUNK_OIDF]
        self._fanout_table = []

        # Read 256 4-byte entries; entry i holds the cumulative number of
        # objects whose first OID byte is <= i.
        for i in range(256):
            (count,) = struct.unpack(
                ">L", self._contents[offset + i * 4 : offset + i * 4 + 4]
            )
            self._fanout_table.append(count)

        # Total object count is the last entry
        self.object_count = self._fanout_table[255]

    def _parse_oidl_chunk(self) -> None:
        """Parse the OID Lookup (OIDL) chunk."""
        if CHUNK_OIDL not in self._chunks:
            raise ValueError("Required OIDL chunk not found")
        self._oidl_offset = self._chunks[CHUNK_OIDL]

    def _parse_ooff_chunk(self) -> None:
        """Parse the Object Offsets (OOFF) chunk."""
        if CHUNK_OOFF not in self._chunks:
            raise ValueError("Required OOFF chunk not found")
        self._ooff_offset = self._chunks[CHUNK_OOFF]

    def _parse_loff_chunk(self) -> None:
        """Parse the Large Offsets (LOFF) chunk."""
        self._loff_offset = self._chunks[CHUNK_LOFF]

    def __len__(self) -> int:
        """Return the number of objects in this MIDX."""
        return self.object_count

    def _get_oid(self, index: int) -> RawObjectID:
        """Get the object ID at the given index.

        Args:
            index: Index of the object

        Returns:
            Binary object ID
        """
        if index < 0 or index >= self.object_count:
            raise IndexError(f"Index {index} out of range")
        offset = self._oidl_offset + index * self.hash_size
        return RawObjectID(self._contents[offset : offset + self.hash_size])

    def _get_pack_info(self, index: int) -> tuple[int, int]:
        """Get pack ID and offset for object at the given index.

        Args:
            index: Index of the object

        Returns:
            Tuple of (pack_id, offset)
        """
        if index < 0 or index >= self.object_count:
            raise IndexError(f"Index {index} out of range")

        # Each entry is 8 bytes (4-byte pack ID + 4-byte offset)
        offset = self._ooff_offset + index * 8
        (pack_id,) = struct.unpack(">L", self._contents[offset : offset + 4])
        (pack_offset,) = struct.unpack(">L", self._contents[offset + 4 : offset + 8])

        # Check if this is a large offset (MSB set). In that case the lower
        # 31 bits are an index into the LOFF chunk, e.g. 0x80000002 refers to
        # the third 8-byte entry in the LOFF table.
        if pack_offset & 0x80000000:
            if CHUNK_LOFF not in self._chunks:
                raise ValueError("Large offset found but no LOFF chunk")
            large_index = pack_offset & 0x7FFFFFFF
            large_offset_pos = self._loff_offset + large_index * 8
            (pack_offset,) = struct.unpack(
                ">Q", self._contents[large_offset_pos : large_offset_pos + 8]
            )
        return pack_id, pack_offset

    def object_offset(self, sha: ObjectID | RawObjectID) -> tuple[str, int] | None:
        """Return the pack name and offset for the given object.

        Args:
            sha: Binary SHA-1 or SHA-256 hash

        Returns:
            Tuple of (pack_name, offset) or None if not found
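
        Example (a sketch; assumes ``midx`` is an open MultiPackIndex and the
        digest below is illustrative):

            raw = RawObjectID(bytes.fromhex("aa" * 20))
            found = midx.object_offset(raw)
            if found is not None:
                pack_name, offset = found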
- """
- if len(sha) != self.hash_size:
- raise ValueError(
- f"SHA size mismatch: expected {self.hash_size}, got {len(sha)}"
- )
- # Use fanout table to narrow search range
- first_byte = sha[0]
- start_idx = 0 if first_byte == 0 else self._fanout_table[first_byte - 1]
- end_idx = self._fanout_table[first_byte]
- # Binary search within the range
- while start_idx < end_idx:
- mid = (start_idx + end_idx) // 2
- mid_sha = self._get_oid(mid)
- if mid_sha == sha:
- # Found it!
- pack_id, offset = self._get_pack_info(mid)
- return self.pack_names[pack_id], offset
- elif mid_sha < sha:
- start_idx = mid + 1
- else:
- end_idx = mid
- return None

    def __contains__(self, sha: ObjectID | RawObjectID) -> bool:
        """Check if the given object SHA is in this MIDX.

        Args:
            sha: Binary SHA hash

        Returns:
            True if the object is in this MIDX
        """
        return self.object_offset(sha) is not None

    def iterentries(self) -> Iterator[tuple[RawObjectID, str, int]]:
        """Iterate over all entries in this MIDX.

        Yields:
            Tuples of (sha, pack_name, offset)
        """
        for i in range(self.object_count):
            sha = self._get_oid(i)
            pack_id, offset = self._get_pack_info(i)
            pack_name = self.pack_names[pack_id]
            yield sha, pack_name, offset

    def close(self) -> None:
        """Close the MIDX file and release mmap resources."""
        # Close mmap'd contents first if it's an mmap object
        if self._contents is not None and has_mmap:
            if isinstance(self._contents, mmap.mmap):
                self._contents.close()
            self._contents = None

        # Close the file handle
        if self._file is not None:
            self._file.close()
            self._file = None


def load_midx(path: str | os.PathLike[str]) -> MultiPackIndex:
    """Load a multi-pack-index file by path.

    Args:
        path: Path to the MIDX file

    Returns:
        A MultiPackIndex loaded from the given path
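
    Example (the path below is illustrative):

        midx = load_midx(".git/objects/pack/multi-pack-index")
        print(len(midx), "objects across", len(midx.pack_names), "packs")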
- """
- with GitFile(path, "rb") as f:
- return load_midx_file(path, f)
- def load_midx_file(
- path: str | os.PathLike[str], f: IO[bytes] | _GitFile
- ) -> MultiPackIndex:
- """Load a multi-pack-index from a file-like object.
- Args:
- path: Path for the MIDX file
- f: File-like object
- Returns:
- A MultiPackIndex loaded from the given file
- """
- return MultiPackIndex(path, file=f)


def write_midx(
    f: IO[bytes],
    pack_index_entries: list[tuple[str, list[tuple[RawObjectID, int, int | None]]]],
    hash_algorithm: int = HASH_ALGORITHM_SHA1,
) -> bytes:
    """Write a multi-pack-index file.

    Args:
        f: File-like object to write to
        pack_index_entries: List of (pack_name, entries) tuples where entries are
            (sha, offset, crc32) tuples, sorted by SHA
        hash_algorithm: Hash algorithm to use (1=SHA-1, 2=SHA-256)

    Returns:
        SHA-1 checksum of the written MIDX file
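
    Example (a sketch; the pack name and digests are illustrative):

        entries = [
            ("pack-1234abcd", [
                (RawObjectID(bytes.fromhex("aa" * 20)), 12, None),
                (RawObjectID(bytes.fromhex("bb" * 20)), 345, None),
            ]),
        ]
        with open("multi-pack-index", "wb") as out:
            checksum = write_midx(out, entries)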
- """
- if hash_algorithm == HASH_ALGORITHM_SHA1:
- hash_size = 20
- elif hash_algorithm == HASH_ALGORITHM_SHA256:
- hash_size = 32
- else:
- raise ValueError(f"Unknown hash algorithm: {hash_algorithm}")
- # Wrap file in SHA1Writer to compute checksum
- writer = SHA1Writer(f)
- # Sort pack entries by pack name (required by Git)
- pack_index_entries_sorted = sorted(pack_index_entries, key=lambda x: x[0])
- # Collect all objects from all packs
- all_objects: list[tuple[RawObjectID, int, int]] = [] # (sha, pack_id, offset)
- pack_names: list[str] = []
- for pack_id, (pack_name, entries) in enumerate(pack_index_entries_sorted):
- pack_names.append(pack_name)
- for sha, offset, _crc32 in entries:
- all_objects.append((sha, pack_id, offset))
- # Sort all objects by SHA
- all_objects.sort(key=lambda x: x[0])
- # Calculate offsets for chunks
- num_packs = len(pack_names)
- num_objects = len(all_objects)
- # Header: 12 bytes
- header_size = 12
- # Chunk count: PNAM, OIDF, OIDL, OOFF, and optionally LOFF
- # We'll determine if LOFF is needed later
- chunk_count = 4 # PNAM, OIDF, OIDL, OOFF
- # Check if we need LOFF chunk (for offsets >= 2^31)
- need_loff = any(offset >= 2**31 for _sha, _pack_id, offset in all_objects)
- if need_loff:
- chunk_count += 1
- # Chunk table: (chunk_count + 1) * 12 bytes (including terminator)
- chunk_table_size = (chunk_count + 1) * 12
- # Calculate chunk offsets
- current_offset = header_size + chunk_table_size
- # PNAM chunk: pack names as null-terminated strings, padded to 4-byte boundary
- pnam_data = b"".join(name.encode("utf-8") + b"\x00" for name in pack_names)
- # Pad to 4-byte boundary
- pnam_padding = (4 - len(pnam_data) % 4) % 4
- pnam_data += b"\x00" * pnam_padding
- pnam_offset = current_offset
- current_offset += len(pnam_data)
- # OIDF chunk: 256 * 4 bytes
- oidf_offset = current_offset
- oidf_size = 256 * 4
- current_offset += oidf_size
- # OIDL chunk: num_objects * hash_size bytes
- oidl_offset = current_offset
- oidl_size = num_objects * hash_size
- current_offset += oidl_size
- # OOFF chunk: num_objects * 8 bytes (4 for pack_id + 4 for offset)
- ooff_offset = current_offset
- ooff_size = num_objects * 8
- current_offset += ooff_size
- # LOFF chunk (if needed): variable size
- # We'll calculate the exact size when we know how many large offsets we have
- loff_offset = current_offset if need_loff else 0
- large_offsets: list[int] = []
- # Calculate trailer offset (where checksum starts)
- # We need to pre-calculate large offset count for accurate trailer offset
- if need_loff:
- # Count large offsets
- large_offset_count = sum(1 for _, _, offset in all_objects if offset >= 2**31)
- loff_size = large_offset_count * 8
- trailer_offset = current_offset + loff_size
- else:
- trailer_offset = current_offset

    # Write the header
    writer.write(MIDX_SIGNATURE)  # 4 bytes: signature
    writer.write(bytes([MIDX_VERSION]))  # 1 byte: version
    writer.write(bytes([hash_algorithm]))  # 1 byte: hash algorithm
    writer.write(bytes([chunk_count]))  # 1 byte: chunk count
    writer.write(bytes([0]))  # 1 byte: base MIDX files (always 0)
    writer.write(struct.pack(">L", num_packs))  # 4 bytes: pack count

    # Write the chunk table
    chunk_table = [
        (CHUNK_PNAM, pnam_offset),
        (CHUNK_OIDF, oidf_offset),
        (CHUNK_OIDL, oidl_offset),
        (CHUNK_OOFF, ooff_offset),
    ]
    if need_loff:
        chunk_table.append((CHUNK_LOFF, loff_offset))
    for chunk_id, chunk_offset in chunk_table:
        writer.write(chunk_id)  # 4 bytes
        writer.write(struct.pack(">Q", chunk_offset))  # 8 bytes

    # Write the terminator (its offset points to where the trailer/checksum starts)
    writer.write(b"\x00\x00\x00\x00")  # 4 bytes
    writer.write(struct.pack(">Q", trailer_offset))  # 8 bytes

    # Write the PNAM chunk
    writer.write(pnam_data)

    # Write the OIDF chunk (fanout table): count objects per first byte, then
    # emit cumulative totals so entry i covers all first bytes <= i
    fanout: list[int] = [0] * 256
    for sha, _pack_id, _offset in all_objects:
        first_byte = sha[0]
        fanout[first_byte] += 1
    cumulative = 0
    for i in range(256):
        cumulative += fanout[i]
        writer.write(struct.pack(">L", cumulative))

    # Write the OIDL chunk (object IDs)
    for sha, _pack_id, _offset in all_objects:
        writer.write(sha)

    # Write the OOFF chunk (pack ID and offset for each object)
    for _sha, pack_id, offset in all_objects:
        writer.write(struct.pack(">L", pack_id))
        if offset >= 2**31:
            # Too large for 31 bits: store the offset in the LOFF table and
            # write its index with the MSB set
            large_offset_index = len(large_offsets)
            large_offsets.append(offset)
            writer.write(struct.pack(">L", 0x80000000 | large_offset_index))
        else:
            writer.write(struct.pack(">L", offset))

    # Write the LOFF chunk if needed
    if need_loff:
        for large_offset in large_offsets:
            writer.write(struct.pack(">Q", large_offset))

    # Write the trailing checksum
    return writer.write_sha()


def write_midx_file(
    path: str | os.PathLike[str],
    pack_index_entries: list[tuple[str, list[tuple[RawObjectID, int, int | None]]]],
    hash_algorithm: int = HASH_ALGORITHM_SHA1,
) -> bytes:
    """Write a multi-pack-index file to disk.

    Args:
        path: Path where to write the MIDX file
        pack_index_entries: List of (pack_name, entries) tuples where entries are
            (sha, offset, crc32) tuples, sorted by SHA
        hash_algorithm: Hash algorithm to use (1=SHA-1, 2=SHA-256)

    Returns:
        SHA-1 checksum of the written MIDX file
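
    Example round-trip (a sketch; the pack name and digest are illustrative):

        entries = [("pack-1234abcd", [(RawObjectID(bytes.fromhex("aa" * 20)), 12, None)])]
        checksum = write_midx_file("multi-pack-index", entries)
        midx = load_midx("multi-pack-index")
        assert len(midx) == 1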
- """
- with GitFile(path, "wb") as f:
- return write_midx(f, pack_index_entries, hash_algorithm)
- # TODO: Add support for incremental MIDX chains
- # TODO: Add support for BTMP and RIDX chunks for bitmap integration
|