@@ -0,0 +1,418 @@
+# midx.py -- Multi-Pack-Index (MIDX) support
+# Copyright (C) 2025 Jelmer Vernooij <jelmer@jelmer.uk>
+#
+# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
+# General Public License as published by the Free Software Foundation; version 2.0
+# or (at your option) any later version. You can redistribute it and/or
+# modify it under the terms of either of these two licenses.
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# You should have received a copy of the licenses; if not, see
+# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
+# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
+# License, Version 2.0.
+#
+
+"""Multi-Pack-Index (MIDX) support.
+
+A multi-pack-index (MIDX) provides a single index that covers multiple pack files,
+enabling fast object lookup across all packs without opening each pack index.
+
+The MIDX file format consists of:
+- A header with signature, version, and hash algorithm
+- A chunk lookup table
+- Multiple chunks containing pack names, OID fanout, OID lookup, and object offsets
+- A trailer with checksum
+
+This module provides:
+- Reading MIDX files
+- Writing MIDX files (planned; not yet implemented, see the TODOs below)
+- Integration with pack-based object stores (planned; not yet implemented)
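+
+Example (a minimal sketch; the repository path is hypothetical, and C Git
+writes the index as ".git/objects/pack/multi-pack-index"):
+
+    >>> from dulwich.midx import load_midx
+    >>> midx = load_midx(".git/objects/pack/multi-pack-index")  # doctest: +SKIP
+    >>> len(midx)  # number of objects covered by the index  # doctest: +SKIP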
+"""
+
+import os
+import struct
+from collections.abc import Iterator
+from typing import IO, Any
+
+from .file import GitFile, _GitFile
+
+# MIDX signature
+MIDX_SIGNATURE = b"MIDX"
+
+# MIDX version
+MIDX_VERSION = 1
+
+# Chunk identifiers (4 bytes each)
+CHUNK_PNAM = b"PNAM"  # Packfile names
+CHUNK_OIDF = b"OIDF"  # OID fanout table
+CHUNK_OIDL = b"OIDL"  # OID lookup table
+CHUNK_OOFF = b"OOFF"  # Object offsets
+CHUNK_LOFF = b"LOFF"  # Large offsets (optional)
+CHUNK_BTMP = b"BTMP"  # Bitmapped packfiles (optional)
+CHUNK_RIDX = b"RIDX"  # Reverse index (optional)
+
+# Hash algorithm identifiers
+HASH_ALGORITHM_SHA1 = 1
+HASH_ALGORITHM_SHA256 = 2
+
+
+class MultiPackIndex:
+    """Multi-pack-index for efficient object lookup across multiple pack files."""
+
+    def __init__(
+        self,
+        filename: str | os.PathLike[str],
+        file: IO[bytes] | _GitFile | None = None,
+        contents: bytes | None = None,
+        size: int | None = None,
+    ) -> None:
+        """Initialize a MultiPackIndex.
+
+        Args:
+          filename: Path to the MIDX file
+          file: Optional file object
+          contents: Optional mmap'd contents
+          size: Optional size of the MIDX file
+        """
+        self._filename = os.fspath(filename)
+        self._file = file
+        self._size = size
+
+        # Instance variables that will be set during parsing
+        self.version: int
+        self.hash_algorithm: int
+        self.hash_size: int
+        self.chunk_count: int
+        self.base_midx_files: int
+        self.pack_count: int
+        self.pack_names: list[str]
+        self.object_count: int
+        self._chunks: dict[bytes, int]
+        self._fanout_table: list[int]
+        self._oidl_offset: int
+        self._ooff_offset: int
+        self._loff_offset: int
+
+        # Load file contents
+        if contents is None:
+            if file is None:
+                with GitFile(filename, "rb") as f:
+                    self._contents, self._size = self._load_file_contents(f, size)
+            else:
+                self._contents, self._size = self._load_file_contents(file, size)
+        else:
+            self._contents = contents
+
+        # Parse header
+        self._parse_header()
+
+        # Parse chunk lookup table
+        self._parse_chunk_table()
+
+    def _load_file_contents(
+        self, f: IO[bytes] | _GitFile, size: int | None = None
+    ) -> tuple[bytes | Any, int]:
+        """Load contents from a file into memory.
+
+        Args:
+          f: File-like object to load
+          size: Expected size, or None to determine from file
+
+        Returns:
+          Tuple of (contents, size)
+        """
+        # Simplified version - similar to pack.py's _load_file_contents
+        if size is None:
+            f.seek(0, 2)  # SEEK_END
+            size = f.tell()
+            f.seek(0)
+
+        # For now, just read the entire file into memory
+        # TODO: Add mmap support for large files
+        contents = f.read(size)
+        return contents, size
+
+    def _parse_header(self) -> None:
+        """Parse the MIDX header."""
+        if len(self._contents) < 12:
+            raise ValueError("MIDX file too small")
+
+        # Check signature
+        signature = self._contents[0:4]
+        if signature != MIDX_SIGNATURE:
+            raise ValueError(f"Invalid MIDX signature: {signature!r}")
+
+        # Read version
+        self.version = self._contents[4]
+        if self.version != MIDX_VERSION:
+            raise ValueError(f"Unsupported MIDX version: {self.version}")
+
+        # Read object ID version (hash algorithm)
+        self.hash_algorithm = self._contents[5]
+        if self.hash_algorithm == HASH_ALGORITHM_SHA1:
+            self.hash_size = 20
+        elif self.hash_algorithm == HASH_ALGORITHM_SHA256:
+            self.hash_size = 32
+        else:
+            raise ValueError(f"Unknown hash algorithm: {self.hash_algorithm}")
+
+        # Read chunk count
+        self.chunk_count = self._contents[6]
+
+        # Read base MIDX files count (currently always 0)
+        self.base_midx_files = self._contents[7]
+        if self.base_midx_files != 0:
+            raise ValueError("Incremental MIDX not yet supported")
+
+        # Read pack file count
+        (self.pack_count,) = struct.unpack(">L", self._contents[8:12])
+
+    def _parse_chunk_table(self) -> None:
+        """Parse the chunk lookup table."""
+        self._chunks = {}
+
+        # Chunk table starts at offset 12
+        offset = 12
+
+        # Each chunk entry is 12 bytes (4-byte ID + 8-byte offset)
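+        # Per the format, chunk offsets are absolute byte positions measured
+        # from the start of the MIDX file.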
+        for _ in range(self.chunk_count + 1):  # +1 for terminator
+            chunk_id = self._contents[offset : offset + 4]
+            (chunk_offset,) = struct.unpack(
+                ">Q", self._contents[offset + 4 : offset + 12]
+            )
+
+            if chunk_id == b"\x00\x00\x00\x00":
+                # Terminator entry
+                break
+
+            self._chunks[chunk_id] = chunk_offset
+            offset += 12
+
+        # Parse required chunks
+        self._parse_pnam_chunk()
+        self._parse_oidf_chunk()
+        self._parse_oidl_chunk()
+        self._parse_ooff_chunk()
+
+        # Parse optional chunks
+        if CHUNK_LOFF in self._chunks:
+            self._parse_loff_chunk()
+
+    def _parse_pnam_chunk(self) -> None:
+        """Parse the Packfile Names (PNAM) chunk."""
+        if CHUNK_PNAM not in self._chunks:
+            raise ValueError("Required PNAM chunk not found")
+
+        offset = self._chunks[CHUNK_PNAM]
+        self.pack_names = []
+
+        # Find the end of the PNAM chunk (next chunk or end of chunks section)
+        next_offset = min(
+            (o for o in self._chunks.values() if o > offset),
+            default=len(self._contents),
+        )
+
+        # Parse null-terminated pack names
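+        # Per the MIDX format, names appear in lexicographic order, each
+        # NUL-terminated; trailing NUL padding explains the empty strings
+        # skipped below.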
+        current = offset
+        while current < next_offset:
+            # Find the next null terminator
+            null_pos = self._contents.find(b"\x00", current, next_offset)
+            if null_pos == -1:
+                break
+
+            pack_name = self._contents[current:null_pos].decode("utf-8")
+            if pack_name:  # Skip empty strings (padding)
+                self.pack_names.append(pack_name)
+            current = null_pos + 1
+
+    def _parse_oidf_chunk(self) -> None:
+        """Parse the OID Fanout (OIDF) chunk."""
+        if CHUNK_OIDF not in self._chunks:
+            raise ValueError("Required OIDF chunk not found")
+
+        offset = self._chunks[CHUNK_OIDF]
+        self._fanout_table = []
+
+        # Read 256 4-byte entries
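+        # fanout[i] is the cumulative count of objects whose first OID byte
+        # is <= i; fanout[255] is therefore the total object count.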
+        for i in range(256):
+            (count,) = struct.unpack(
+                ">L", self._contents[offset + i * 4 : offset + i * 4 + 4]
+            )
+            self._fanout_table.append(count)
+
+        # Total object count is the last entry
+        self.object_count = self._fanout_table[255]
+
+    def _parse_oidl_chunk(self) -> None:
+        """Parse the OID Lookup (OIDL) chunk."""
+        if CHUNK_OIDL not in self._chunks:
+            raise ValueError("Required OIDL chunk not found")
+
+        self._oidl_offset = self._chunks[CHUNK_OIDL]
+
+    def _parse_ooff_chunk(self) -> None:
+        """Parse the Object Offsets (OOFF) chunk."""
+        if CHUNK_OOFF not in self._chunks:
+            raise ValueError("Required OOFF chunk not found")
+
+        self._ooff_offset = self._chunks[CHUNK_OOFF]
+
+    def _parse_loff_chunk(self) -> None:
+        """Parse the Large Offsets (LOFF) chunk."""
+        self._loff_offset = self._chunks[CHUNK_LOFF]
+
+    def __len__(self) -> int:
+        """Return the number of objects in this MIDX."""
+        return self.object_count
+
+    def _get_oid(self, index: int) -> bytes:
+        """Get the object ID at the given index.
+
+        Args:
+          index: Index of the object
+
+        Returns:
+          Binary object ID
+        """
+        if index < 0 or index >= self.object_count:
+            raise IndexError(f"Index {index} out of range")
+
+        offset = self._oidl_offset + index * self.hash_size
+        return self._contents[offset : offset + self.hash_size]
+
+    def _get_pack_info(self, index: int) -> tuple[int, int]:
+        """Get pack ID and offset for object at the given index.
+
+        Args:
+          index: Index of the object
+
+        Returns:
+          Tuple of (pack_id, offset)
+        """
+        if index < 0 or index >= self.object_count:
+            raise IndexError(f"Index {index} out of range")
+
+        # Each entry is 8 bytes (4-byte pack ID + 4-byte offset)
+        offset = self._ooff_offset + index * 8
+
+        pack_id, pack_offset = struct.unpack(
+            ">LL", self._contents[offset : offset + 8]
+        )
+
+        # Check if this is a large offset (MSB set)
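+        # Offsets that do not fit in 31 bits are stored out of line: the MSB
+        # flags that the low 31 bits index into the LOFF chunk's 8-byte entries.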
+        if pack_offset & 0x80000000:
+            # Look up in LOFF chunk
+            if CHUNK_LOFF not in self._chunks:
+                raise ValueError("Large offset found but no LOFF chunk")
+
+            large_index = pack_offset & 0x7FFFFFFF
+            large_offset_pos = self._loff_offset + large_index * 8
+            (pack_offset,) = struct.unpack(
+                ">Q", self._contents[large_offset_pos : large_offset_pos + 8]
+            )
+
+        return pack_id, pack_offset
+
+    def object_offset(self, sha: bytes) -> tuple[str, int] | None:
+        """Return the pack name and offset for the given object.
+
+        Args:
+          sha: Binary SHA-1 or SHA-256 hash
+
+        Returns:
+          Tuple of (pack_name, offset) or None if not found
+        """
+        if len(sha) != self.hash_size:
+            raise ValueError(
+                f"SHA size mismatch: expected {self.hash_size}, got {len(sha)}"
+            )
+
+        # Use fanout table to narrow search range
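+        # Objects whose OID begins with first_byte occupy the contiguous index
+        # range [fanout[first_byte - 1], fanout[first_byte]) in the sorted
+        # OIDL chunk.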
+        first_byte = sha[0]
+        start_idx = 0 if first_byte == 0 else self._fanout_table[first_byte - 1]
+        end_idx = self._fanout_table[first_byte]
+
+        # Binary search within the range
+        while start_idx < end_idx:
+            mid = (start_idx + end_idx) // 2
+            mid_sha = self._get_oid(mid)
+
+            if mid_sha == sha:
+                # Found it!
+                pack_id, offset = self._get_pack_info(mid)
+                return self.pack_names[pack_id], offset
+            elif mid_sha < sha:
+                start_idx = mid + 1
+            else:
+                end_idx = mid
+
+        return None
+
+    def __contains__(self, sha: bytes) -> bool:
+        """Check if the given object SHA is in this MIDX.
+
+        Args:
+          sha: Binary SHA hash
+
+        Returns:
+          True if the object is in this MIDX
+        """
+        return self.object_offset(sha) is not None
+
+    def iterentries(self) -> Iterator[tuple[bytes, str, int]]:
+        """Iterate over all entries in this MIDX.
+
+        Yields:
+          Tuples of (sha, pack_name, offset)
+        """
+        for i in range(self.object_count):
+            sha = self._get_oid(i)
+            pack_id, offset = self._get_pack_info(i)
+            pack_name = self.pack_names[pack_id]
+            yield sha, pack_name, offset
+
+    def close(self) -> None:
+        """Close the MIDX file."""
+        if self._file is not None:
+            self._file.close()
+            self._file = None
+
+
+def load_midx(path: str | os.PathLike[str]) -> MultiPackIndex:
+    """Load a multi-pack-index file by path.
+
+    Args:
+      path: Path to the MIDX file
+
+    Returns:
+      A MultiPackIndex loaded from the given path
+    """
+    with GitFile(path, "rb") as f:
+        return load_midx_file(path, f)
+
+
+def load_midx_file(
+    path: str | os.PathLike[str], f: IO[bytes] | _GitFile
+) -> MultiPackIndex:
+    """Load a multi-pack-index from a file-like object.
+
+    Args:
+      path: Path for the MIDX file
+      f: File-like object
+
+    Returns:
+      A MultiPackIndex loaded from the given file
+    """
+    return MultiPackIndex(path, file=f)
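+
+
+# Usage sketch (illustrative only, not part of the public API): resolve an
+# object to its containing pack via a repository's MIDX. The file name
+# "multi-pack-index" matches what C Git writes into the pack directory.
+def _example_lookup(
+    pack_dir: str | os.PathLike[str], sha: bytes
+) -> tuple[str, int] | None:
+    midx_path = os.path.join(os.fspath(pack_dir), "multi-pack-index")
+    if not os.path.exists(midx_path):
+        # No MIDX present; callers would fall back to per-pack indexes
+        return None
+    midx = load_midx(midx_path)
+    try:
+        # (pack_name, offset) of the object, or None if not indexed
+        return midx.object_offset(sha)
+    finally:
+        midx.close()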
+
+
+# TODO: Implement MIDX writing functionality
+# TODO: Implement integration with object_store.py
+# TODO: Add support for incremental MIDX chains
+# TODO: Add support for BTMP and RIDX chunks for bitmap integration