@@ -0,0 +1,644 @@
+# midx.py -- Multi-Pack-Index (MIDX) support
+# Copyright (C) 2025 Jelmer Vernooij <jelmer@jelmer.uk>
+#
+# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
+# General Public License as published by the Free Software Foundation; version 2.0
+# or (at your option) any later version. You can redistribute it and/or
+# modify it under the terms of either of these two licenses.
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# You should have received a copy of the licenses; if not, see
+# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
+# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
+# License, Version 2.0.
+#
+
+"""Multi-Pack-Index (MIDX) support.
|
|
|
|
|
+
|
|
|
|
|
+A multi-pack-index (MIDX) provides a single index that covers multiple pack files,
|
|
|
|
|
+enabling fast object lookup across all packs without opening each pack index.
|
|
|
|
|
+
|
|
|
|
|
+The MIDX file format consists of:
|
|
|
|
|
+- A header with signature, version, and hash algorithm
|
|
|
|
|
+- A chunk lookup table
|
|
|
|
|
+- Multiple chunks containing pack names, OID fanout, OID lookup, and object offsets
|
|
|
|
|
+- A trailer with checksum
|
|
|
|
|
+
|
|
|
|
|
+This module provides:
|
|
|
|
|
+- Reading MIDX files
|
|
|
|
|
+- Writing MIDX files
|
|
|
|
|
+- Integration with pack-based object stores
|
|
|
|
|
+
|
|
|
|
|
+Limitations:
|
|
|
|
|
+- Incremental MIDX chains are not yet supported (base_midx_files must be 0)
|
|
|
|
|
+- BTMP (bitmapped packfiles) chunk is not yet implemented
|
|
|
|
|
+- RIDX (reverse index) chunk is not yet implemented
|
|
|
|
|
+
|
|
|
|
|
+Note: Incremental MIDX chains were introduced in Git 2.47 as an experimental
|
|
|
|
|
+feature, where multiple MIDX files can be chained together. The format includes
|
|
|
|
|
+a base_midx_files field in the header and uses a multi-pack-index.d/ directory
|
|
|
|
|
+with a multi-pack-index-chain file. This feature is not yet supported by Dulwich
|
|
|
|
|
+as the specification is still evolving.
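+
+Example (an illustrative sketch; the repository layout and path below are
+assumed for illustration, not mandated by this module)::
+
+    from dulwich.midx import load_midx
+
+    midx = load_midx(".git/objects/pack/multi-pack-index")
+    try:
+        for sha, pack_name, offset in midx.iterentries():
+            print(pack_name, offset, sha)
+    finally:
+        midx.close()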
+"""
+
+import os
+import struct
+from collections.abc import Iterator
+from io import UnsupportedOperation
+from typing import IO, Any
+
+try:
+    import mmap
+except ImportError:
+    has_mmap = False
+else:
+    has_mmap = True
+
+from .file import GitFile, _GitFile
+from .objects import ObjectID, RawObjectID
+from .pack import SHA1Writer
+
+# MIDX signature
+MIDX_SIGNATURE = b"MIDX"
+
+# MIDX version
+MIDX_VERSION = 1
+
+# Chunk identifiers (4 bytes each)
+CHUNK_PNAM = b"PNAM"  # Packfile names
+CHUNK_OIDF = b"OIDF"  # OID fanout table
+CHUNK_OIDL = b"OIDL"  # OID lookup table
+CHUNK_OOFF = b"OOFF"  # Object offsets
+CHUNK_LOFF = b"LOFF"  # Large offsets (optional)
+CHUNK_BTMP = b"BTMP"  # Bitmapped packfiles (optional)
+CHUNK_RIDX = b"RIDX"  # Reverse index (optional)
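+# PNAM, OIDF, OIDL and OOFF are required by this implementation; LOFF is
+# consulted only when present. BTMP and RIDX are defined for completeness but
+# are not parsed yet (see the module docstring).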
+
+# Hash algorithm identifiers
+HASH_ALGORITHM_SHA1 = 1
+HASH_ALGORITHM_SHA256 = 2
+
+
+class MultiPackIndex:
+    """Multi-pack-index for efficient object lookup across multiple pack files."""
+
+    def __init__(
+        self,
+        filename: str | os.PathLike[str],
+        file: IO[bytes] | _GitFile | None = None,
+        contents: bytes | None = None,
+        size: int | None = None,
+    ) -> None:
+        """Initialize a MultiPackIndex.
+
+        Args:
+          filename: Path to the MIDX file
+          file: Optional file object
+          contents: Optional mmap'd contents
+          size: Optional size of the MIDX file
+        """
+        self._filename = os.fspath(filename)
+        self._file = file
+        self._size = size
+
+        # Instance variables that will be set during parsing
+        self.version: int
+        self.hash_algorithm: int
+        self.hash_size: int
+        self.chunk_count: int
+        self.base_midx_files: int
+        self.pack_count: int
+        self.pack_names: list[str]
+        self.object_count: int
+        self._chunks: dict[bytes, int]
+        self._fanout_table: list[int]
+        self._oidl_offset: int
+        self._ooff_offset: int
+        self._loff_offset: int
+
+        # Load file contents
+        if contents is None:
+            if file is None:
+                with GitFile(filename, "rb") as f:
+                    self._contents, self._size = self._load_file_contents(f, size)
+            else:
+                self._contents, self._size = self._load_file_contents(file, size)
+        else:
+            self._contents = contents
+
+        # Parse header
+        self._parse_header()
+
+        # Parse chunk lookup table
+        self._parse_chunk_table()
+
+    def _load_file_contents(
+        self, f: IO[bytes] | _GitFile, size: int | None = None
+    ) -> tuple[bytes | Any, int]:
+        """Load contents from a file, preferring mmap when possible.
+
+        Args:
+          f: File-like object to load
+          size: Expected size, or None to determine from file
+
+        Returns:
+          Tuple of (contents, size)
+        """
+        try:
+            fd = f.fileno()
+        except (UnsupportedOperation, AttributeError):
+            fd = None
+
+        # Attempt to use mmap if possible
+        if fd is not None:
+            if size is None:
+                size = os.fstat(fd).st_size
+            if has_mmap:
+                try:
+                    contents = mmap.mmap(fd, size, access=mmap.ACCESS_READ)
+                except (OSError, ValueError):
+                    # Can't mmap - perhaps a socket or invalid file descriptor
+                    pass
+                else:
+                    return contents, size
+
+        # Fall back to reading entire file into memory
+        contents_bytes = f.read()
+        size = len(contents_bytes)
+        return contents_bytes, size
+
+    def _parse_header(self) -> None:
+        """Parse the MIDX header."""
+        if len(self._contents) < 12:
+            raise ValueError("MIDX file too small")
+
+        # Check signature
+        signature = self._contents[0:4]
+        if signature != MIDX_SIGNATURE:
+            raise ValueError(f"Invalid MIDX signature: {signature!r}")
+
+        # Read version
+        self.version = self._contents[4]
+        if self.version != MIDX_VERSION:
+            raise ValueError(f"Unsupported MIDX version: {self.version}")
+
+        # Read object ID version (hash algorithm)
+        self.hash_algorithm = self._contents[5]
+        if self.hash_algorithm == HASH_ALGORITHM_SHA1:
+            self.hash_size = 20
+        elif self.hash_algorithm == HASH_ALGORITHM_SHA256:
+            self.hash_size = 32
+        else:
+            raise ValueError(f"Unknown hash algorithm: {self.hash_algorithm}")
+
+        # Read chunk count
+        self.chunk_count = self._contents[6]
+
+        # Read base MIDX files count (currently always 0)
+        self.base_midx_files = self._contents[7]
+        if self.base_midx_files != 0:
+            raise ValueError("Incremental MIDX not yet supported")
+
+        # Read pack file count
+        (self.pack_count,) = struct.unpack(">L", self._contents[8:12])
+
+    def _parse_chunk_table(self) -> None:
+        """Parse the chunk lookup table."""
+        self._chunks = {}
+
+        # Chunk table starts at offset 12
+        offset = 12
+
+        # Each chunk entry is 12 bytes (4-byte ID + 8-byte offset)
+        for i in range(self.chunk_count + 1):  # +1 for terminator
+            chunk_id = self._contents[offset : offset + 4]
+            (chunk_offset,) = struct.unpack(
+                ">Q", self._contents[offset + 4 : offset + 12]
+            )
+
+            if chunk_id == b"\x00\x00\x00\x00":
+                # Terminator entry
+                break
+
+            self._chunks[chunk_id] = chunk_offset
+            offset += 12
+
+        # Parse required chunks
+        self._parse_pnam_chunk()
+        self._parse_oidf_chunk()
+        self._parse_oidl_chunk()
+        self._parse_ooff_chunk()
+
+        # Parse optional chunks
+        if CHUNK_LOFF in self._chunks:
+            self._parse_loff_chunk()
+
+    def _parse_pnam_chunk(self) -> None:
+        """Parse the Packfile Names (PNAM) chunk."""
+        if CHUNK_PNAM not in self._chunks:
+            raise ValueError("Required PNAM chunk not found")
+
+        offset = self._chunks[CHUNK_PNAM]
+        self.pack_names = []
+
+        # Find the end of the PNAM chunk (next chunk or end of chunks section)
+        next_offset = min(
+            (o for o in self._chunks.values() if o > offset),
+            default=len(self._contents),
+        )
+
+        # Parse null-terminated pack names
+        current = offset
+        while current < next_offset:
+            # Find the next null terminator
+            null_pos = self._contents.find(b"\x00", current, next_offset)
+            if null_pos == -1:
+                break
+
+            pack_name = self._contents[current:null_pos].decode("utf-8")
+            if pack_name:  # Skip empty strings (padding)
+                self.pack_names.append(pack_name)
+            current = null_pos + 1
+
+    def _parse_oidf_chunk(self) -> None:
+        """Parse the OID Fanout (OIDF) chunk."""
+        if CHUNK_OIDF not in self._chunks:
+            raise ValueError("Required OIDF chunk not found")
+
+        offset = self._chunks[CHUNK_OIDF]
+        self._fanout_table = []
+
+        # Read 256 4-byte entries
+        for i in range(256):
+            (count,) = struct.unpack(
+                ">L", self._contents[offset + i * 4 : offset + i * 4 + 4]
+            )
+            self._fanout_table.append(count)
+
+        # Total object count is the last entry
+        self.object_count = self._fanout_table[255]
+
+    def _parse_oidl_chunk(self) -> None:
+        """Parse the OID Lookup (OIDL) chunk."""
+        if CHUNK_OIDL not in self._chunks:
+            raise ValueError("Required OIDL chunk not found")
+
+        self._oidl_offset = self._chunks[CHUNK_OIDL]
+
+    def _parse_ooff_chunk(self) -> None:
+        """Parse the Object Offsets (OOFF) chunk."""
+        if CHUNK_OOFF not in self._chunks:
+            raise ValueError("Required OOFF chunk not found")
+
+        self._ooff_offset = self._chunks[CHUNK_OOFF]
+
+    def _parse_loff_chunk(self) -> None:
+        """Parse the Large Offsets (LOFF) chunk."""
+        self._loff_offset = self._chunks[CHUNK_LOFF]
+
+    def __len__(self) -> int:
+        """Return the number of objects in this MIDX."""
+        return self.object_count
+
+    def _get_oid(self, index: int) -> RawObjectID:
+        """Get the object ID at the given index.
+
+        Args:
+          index: Index of the object
+
+        Returns:
+          Binary object ID
+        """
+        if index < 0 or index >= self.object_count:
+            raise IndexError(f"Index {index} out of range")
+
+        offset = self._oidl_offset + index * self.hash_size
+        return RawObjectID(self._contents[offset : offset + self.hash_size])
+
+    def _get_pack_info(self, index: int) -> tuple[int, int]:
+        """Get pack ID and offset for object at the given index.
+
+        Args:
+          index: Index of the object
+
+        Returns:
+          Tuple of (pack_id, offset)
+        """
+        if index < 0 or index >= self.object_count:
+            raise IndexError(f"Index {index} out of range")
+
+        # Each entry is 8 bytes (4-byte pack ID + 4-byte offset)
+        offset = self._ooff_offset + index * 8
+
+        (pack_id,) = struct.unpack(">L", self._contents[offset : offset + 4])
+        (pack_offset,) = struct.unpack(">L", self._contents[offset + 4 : offset + 8])
+
+        # Check if this is a large offset (MSB set)
+        if pack_offset & 0x80000000:
+            # Look up in LOFF chunk
+            if CHUNK_LOFF not in self._chunks:
+                raise ValueError("Large offset found but no LOFF chunk")
+
+            large_index = pack_offset & 0x7FFFFFFF
+            large_offset_pos = self._loff_offset + large_index * 8
+            (pack_offset,) = struct.unpack(
+                ">Q", self._contents[large_offset_pos : large_offset_pos + 8]
+            )
+
+        return pack_id, pack_offset
+
+    def object_offset(self, sha: ObjectID | RawObjectID) -> tuple[str, int] | None:
+        """Return the pack name and offset for the given object.
+
+        Args:
+          sha: Binary SHA-1 or SHA-256 hash
+
+        Returns:
+          Tuple of (pack_name, offset) or None if not found
+        """
+        if len(sha) != self.hash_size:
+            raise ValueError(
+                f"SHA size mismatch: expected {self.hash_size}, got {len(sha)}"
+            )
+
+        # Use fanout table to narrow search range
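+        # The fanout table is cumulative: fanout[b] is the number of objects
+        # whose first byte is <= b, so objects starting with first_byte occupy
+        # the index range [fanout[first_byte - 1], fanout[first_byte]).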
+        first_byte = sha[0]
+        start_idx = 0 if first_byte == 0 else self._fanout_table[first_byte - 1]
+        end_idx = self._fanout_table[first_byte]
+
+        # Binary search within the range
+        while start_idx < end_idx:
+            mid = (start_idx + end_idx) // 2
+            mid_sha = self._get_oid(mid)
+
+            if mid_sha == sha:
+                # Found it!
+                pack_id, offset = self._get_pack_info(mid)
+                return self.pack_names[pack_id], offset
+            elif mid_sha < sha:
+                start_idx = mid + 1
+            else:
+                end_idx = mid
+
+        return None
+
+    def __contains__(self, sha: ObjectID | RawObjectID) -> bool:
+        """Check if the given object SHA is in this MIDX.
+
+        Args:
+          sha: Binary SHA hash
+
+        Returns:
+          True if the object is in this MIDX
+        """
+        return self.object_offset(sha) is not None
+
+    def iterentries(self) -> Iterator[tuple[RawObjectID, str, int]]:
+        """Iterate over all entries in this MIDX.
+
+        Yields:
+          Tuples of (sha, pack_name, offset)
+        """
+        for i in range(self.object_count):
+            sha = self._get_oid(i)
+            pack_id, offset = self._get_pack_info(i)
+            pack_name = self.pack_names[pack_id]
+            yield sha, pack_name, offset
+
+    def close(self) -> None:
+        """Close the MIDX file and release mmap resources."""
+        # Close mmap'd contents first if it's an mmap object
+        if self._contents is not None and has_mmap:
+            if isinstance(self._contents, mmap.mmap):
+                self._contents.close()
+            self._contents = None
+
+        # Close file handle
+        if self._file is not None:
+            self._file.close()
+            self._file = None
+
+
+def load_midx(path: str | os.PathLike[str]) -> MultiPackIndex:
+    """Load a multi-pack-index file by path.
+
+    Args:
+      path: Path to the MIDX file
+
+    Returns:
+      A MultiPackIndex loaded from the given path
+    """
+    with GitFile(path, "rb") as f:
+        return load_midx_file(path, f)
+
+
+def load_midx_file(
+    path: str | os.PathLike[str], f: IO[bytes] | _GitFile
+) -> MultiPackIndex:
+    """Load a multi-pack-index from a file-like object.
+
+    Args:
+      path: Path for the MIDX file
+      f: File-like object
+
+    Returns:
+      A MultiPackIndex loaded from the given file
+    """
+    return MultiPackIndex(path, file=f)
+
+
+def write_midx(
+    f: IO[bytes],
+    pack_index_entries: list[tuple[str, list[tuple[RawObjectID, int, int | None]]]],
+    hash_algorithm: int = HASH_ALGORITHM_SHA1,
+) -> bytes:
+    """Write a multi-pack-index file.
+
+    Args:
+      f: File-like object to write to
+      pack_index_entries: List of (pack_name, entries) tuples where entries are
+        (sha, offset, crc32) tuples sorted by SHA; crc32 values may be None and
+        are not written to the MIDX
+      hash_algorithm: Hash algorithm to use (1=SHA-1, 2=SHA-256)
+
+    Returns:
+      SHA-1 checksum of the written MIDX file
+    """
+    if hash_algorithm == HASH_ALGORITHM_SHA1:
+        hash_size = 20
+    elif hash_algorithm == HASH_ALGORITHM_SHA256:
+        hash_size = 32
+    else:
+        raise ValueError(f"Unknown hash algorithm: {hash_algorithm}")
+
+    # Wrap file in SHA1Writer to compute checksum
+    writer = SHA1Writer(f)
+
+    # Sort pack entries by pack name (required by Git)
+    pack_index_entries_sorted = sorted(pack_index_entries, key=lambda x: x[0])
+
+    # Collect all objects from all packs
+    all_objects: list[tuple[RawObjectID, int, int]] = []  # (sha, pack_id, offset)
+    pack_names: list[str] = []
+
+    for pack_id, (pack_name, entries) in enumerate(pack_index_entries_sorted):
+        pack_names.append(pack_name)
+        for sha, offset, _crc32 in entries:
+            all_objects.append((sha, pack_id, offset))
+
+    # Sort all objects by SHA
+    all_objects.sort(key=lambda x: x[0])
+
+    # Calculate offsets for chunks
+    num_packs = len(pack_names)
+    num_objects = len(all_objects)
+
+    # Header: 12 bytes
+    header_size = 12
+
+    # Chunk count: PNAM, OIDF, OIDL, OOFF, and optionally LOFF
+    # We'll determine if LOFF is needed later
+    chunk_count = 4  # PNAM, OIDF, OIDL, OOFF
+
+    # Check if we need LOFF chunk (for offsets >= 2^31)
+    need_loff = any(offset >= 2**31 for _sha, _pack_id, offset in all_objects)
+    if need_loff:
+        chunk_count += 1
+
+    # Chunk table: (chunk_count + 1) * 12 bytes (including terminator)
+    chunk_table_size = (chunk_count + 1) * 12
+
+    # Calculate chunk offsets
+    current_offset = header_size + chunk_table_size
+
+    # PNAM chunk: pack names as null-terminated strings, padded to 4-byte boundary
+    pnam_data = b"".join(name.encode("utf-8") + b"\x00" for name in pack_names)
+    # Pad to 4-byte boundary
+    pnam_padding = (4 - len(pnam_data) % 4) % 4
+    pnam_data += b"\x00" * pnam_padding
+    pnam_offset = current_offset
+    current_offset += len(pnam_data)
+
+    # OIDF chunk: 256 * 4 bytes
+    oidf_offset = current_offset
+    oidf_size = 256 * 4
+    current_offset += oidf_size
+
+    # OIDL chunk: num_objects * hash_size bytes
+    oidl_offset = current_offset
+    oidl_size = num_objects * hash_size
+    current_offset += oidl_size
+
+    # OOFF chunk: num_objects * 8 bytes (4 for pack_id + 4 for offset)
+    ooff_offset = current_offset
+    ooff_size = num_objects * 8
+    current_offset += ooff_size
+
+    # LOFF chunk (if needed): variable size
+    # We'll calculate the exact size when we know how many large offsets we have
+    loff_offset = current_offset if need_loff else 0
+    large_offsets: list[int] = []
+
+    # Calculate trailer offset (where checksum starts)
+    # We need to pre-calculate large offset count for accurate trailer offset
+    if need_loff:
+        # Count large offsets
+        large_offset_count = sum(1 for _, _, offset in all_objects if offset >= 2**31)
+        loff_size = large_offset_count * 8
+        trailer_offset = current_offset + loff_size
+    else:
+        trailer_offset = current_offset
+
+    # Write header
+    writer.write(MIDX_SIGNATURE)  # 4 bytes: signature
+    writer.write(bytes([MIDX_VERSION]))  # 1 byte: version
+    writer.write(bytes([hash_algorithm]))  # 1 byte: hash algorithm
+    writer.write(bytes([chunk_count]))  # 1 byte: chunk count
+    writer.write(bytes([0]))  # 1 byte: base MIDX files (always 0)
+    writer.write(struct.pack(">L", num_packs))  # 4 bytes: pack count
+
+    # Write chunk table
+    chunk_table = [
+        (CHUNK_PNAM, pnam_offset),
+        (CHUNK_OIDF, oidf_offset),
+        (CHUNK_OIDL, oidl_offset),
+        (CHUNK_OOFF, ooff_offset),
+    ]
+    if need_loff:
+        chunk_table.append((CHUNK_LOFF, loff_offset))
+
+    for chunk_id, chunk_offset in chunk_table:
+        writer.write(chunk_id)  # 4 bytes
+        writer.write(struct.pack(">Q", chunk_offset))  # 8 bytes
+
+    # Write terminator (points to where trailer/checksum starts)
+    writer.write(b"\x00\x00\x00\x00")  # 4 bytes
+    writer.write(struct.pack(">Q", trailer_offset))  # 8 bytes
+
+    # Write PNAM chunk
+    writer.write(pnam_data)
+
+    # Write OIDF chunk (fanout table)
+    fanout: list[int] = [0] * 256
+    for sha, _pack_id, _offset in all_objects:
+        first_byte = sha[0]
+        fanout[first_byte] += 1
+
+    # Convert counts to cumulative
+    cumulative = 0
+    for i in range(256):
+        cumulative += fanout[i]
+        writer.write(struct.pack(">L", cumulative))
+
+    # Write OIDL chunk (object IDs)
+    for sha, _pack_id, _offset in all_objects:
+        writer.write(sha)
+
+    # Write OOFF chunk (pack ID and offset for each object)
+    for _sha, pack_id, offset in all_objects:
+        writer.write(struct.pack(">L", pack_id))
+
+        if offset >= 2**31:
+            # Use large offset table
+            large_offset_index = len(large_offsets)
+            large_offsets.append(offset)
+            # Set MSB to indicate large offset
+            writer.write(struct.pack(">L", 0x80000000 | large_offset_index))
+        else:
+            writer.write(struct.pack(">L", offset))
+
+    # Write LOFF chunk if needed
+    if need_loff:
+        for large_offset in large_offsets:
+            writer.write(struct.pack(">Q", large_offset))
+
+    # Write checksum
+    return writer.write_sha()
+
+
+def write_midx_file(
+    path: str | os.PathLike[str],
+    pack_index_entries: list[tuple[str, list[tuple[RawObjectID, int, int | None]]]],
+    hash_algorithm: int = HASH_ALGORITHM_SHA1,
+) -> bytes:
+    """Write a multi-pack-index file to disk.
+
+    Args:
+      path: Path to write the MIDX file to
+      pack_index_entries: List of (pack_name, entries) tuples where entries are
+        (sha, offset, crc32) tuples sorted by SHA; crc32 values may be None and
+        are not written to the MIDX
+      hash_algorithm: Hash algorithm to use (1=SHA-1, 2=SHA-256)
+
+    Returns:
+      SHA-1 checksum of the written MIDX file
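+
+    Example (an illustrative sketch; the pack names, 20-byte raw SHAs, offsets,
+    and target path below are hypothetical)::
+
+        entries = [
+            ("pack-1111.pack", [(b"\x01" * 20, 12, None)]),
+            ("pack-2222.pack", [(b"\x02" * 20, 12, None)]),
+        ]
+        checksum = write_midx_file("objects/pack/multi-pack-index", entries)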
+    """
+    with GitFile(path, "wb") as f:
+        return write_midx(f, pack_index_entries, hash_algorithm)
+
+
+# TODO: Add support for incremental MIDX chains
+# TODO: Add support for BTMP and RIDX chunks for bitmap integration