
Add multi-pack-index (MIDX) reading support

Jelmer Vernooij, 2 months ago
commit 12cfb78a07
2 files changed, 546 insertions(+), 0 deletions(-)
  1. dulwich/midx.py (+418, -0)
  2. dulwich/object_store.py (+128, -0)

dulwich/midx.py (+418, -0)

@@ -0,0 +1,418 @@
+# midx.py -- Multi-Pack-Index (MIDX) support
+# Copyright (C) 2025 Jelmer Vernooij <jelmer@jelmer.uk>
+#
+# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
+# General Public License as published by the Free Software Foundation; version 2.0
+# or (at your option) any later version. You can redistribute it and/or
+# modify it under the terms of either of these two licenses.
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# You should have received a copy of the licenses; if not, see
+# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
+# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
+# License, Version 2.0.
+#
+
+"""Multi-Pack-Index (MIDX) support.
+
+A multi-pack-index (MIDX) provides a single index that covers multiple pack files,
+enabling fast object lookup across all packs without opening each pack index.
+
+The MIDX file format consists of:
+- A header with signature, version, and hash algorithm
+- A chunk lookup table
+- Multiple chunks containing pack names, OID fanout, OID lookup, and object offsets
+- A trailer with checksum
+
+This module currently provides:
+- Reading MIDX files
+- Integration with pack-based object stores
+
+Writing MIDX files is not yet implemented; see the TODOs at the end of the
+module.
+"""
+
+import os
+import struct
+from collections.abc import Iterator
+from typing import IO, Any
+
+from .file import GitFile, _GitFile
+
+# MIDX signature
+MIDX_SIGNATURE = b"MIDX"
+
+# MIDX version
+MIDX_VERSION = 1
+
+# Chunk identifiers (4 bytes each)
+CHUNK_PNAM = b"PNAM"  # Packfile names
+CHUNK_OIDF = b"OIDF"  # OID fanout table
+CHUNK_OIDL = b"OIDL"  # OID lookup table
+CHUNK_OOFF = b"OOFF"  # Object offsets
+CHUNK_LOFF = b"LOFF"  # Large offsets (optional)
+CHUNK_BTMP = b"BTMP"  # Bitmapped packfiles (optional)
+CHUNK_RIDX = b"RIDX"  # Reverse index (optional)
+
+# Hash algorithm identifiers
+HASH_ALGORITHM_SHA1 = 1
+HASH_ALGORITHM_SHA256 = 2
+
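+# On-disk layout of the 12-byte header parsed below (byte offsets):
+#
+#   0-3    signature, always b"MIDX"
+#   4      format version (currently 1)
+#   5      object ID version (1 = SHA-1, 2 = SHA-256)
+#   6      number of chunks
+#   7      number of base multi-pack-index files (0; incremental chains
+#          are not supported here)
+#   8-11   number of pack files, big-endian 32-bit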
+
+class MultiPackIndex:
+    """Multi-pack-index for efficient object lookup across multiple pack files."""
+
+    def __init__(
+        self,
+        filename: str | os.PathLike[str],
+        file: IO[bytes] | _GitFile | None = None,
+        contents: bytes | None = None,
+        size: int | None = None,
+    ) -> None:
+        """Initialize a MultiPackIndex.
+
+        Args:
+            filename: Path to the MIDX file
+            file: Optional file object
+            contents: Optional mmap'd contents
+            size: Optional size of the MIDX file
+        """
+        self._filename = os.fspath(filename)
+        self._file = file
+        self._size = size
+
+        # Instance variables that will be set during parsing
+        self.version: int
+        self.hash_algorithm: int
+        self.hash_size: int
+        self.chunk_count: int
+        self.base_midx_files: int
+        self.pack_count: int
+        self.pack_names: list[str]
+        self.object_count: int
+        self._chunks: dict[bytes, int]
+        self._chunks_end: int
+        self._fanout_table: list[int]
+        self._oidl_offset: int
+        self._ooff_offset: int
+        self._loff_offset: int
+
+        # Load file contents
+        if contents is None:
+            if file is None:
+                with GitFile(filename, "rb") as f:
+                    self._contents, self._size = self._load_file_contents(f, size)
+            else:
+                self._contents, self._size = self._load_file_contents(file, size)
+        else:
+            self._contents = contents
+            if self._size is None:
+                self._size = len(contents)
+
+        # Parse header
+        self._parse_header()
+
+        # Parse chunk lookup table
+        self._parse_chunk_table()
+
+    def _load_file_contents(
+        self, f: IO[bytes] | _GitFile, size: int | None = None
+    ) -> tuple[bytes | Any, int]:
+        """Load contents from a file, preferring mmap when possible.
+
+        Args:
+            f: File-like object to load
+            size: Expected size, or None to determine from file
+
+        Returns:
+            Tuple of (contents, size)
+        """
+        # Simplified version - similar to pack.py's _load_file_contents
+        if size is None:
+            f.seek(0, 2)  # SEEK_END
+            size = f.tell()
+            f.seek(0)
+
+        # For now, just read the entire file into memory
+        # TODO: Add mmap support for large files
+        contents = f.read(size)
+        return contents, size
+
+    def _parse_header(self) -> None:
+        """Parse the MIDX header."""
+        if len(self._contents) < 12:
+            raise ValueError("MIDX file too small")
+
+        # Check signature
+        signature = self._contents[0:4]
+        if signature != MIDX_SIGNATURE:
+            raise ValueError(f"Invalid MIDX signature: {signature!r}")
+
+        # Read version
+        self.version = self._contents[4]
+        if self.version != MIDX_VERSION:
+            raise ValueError(f"Unsupported MIDX version: {self.version}")
+
+        # Read object ID version (hash algorithm)
+        self.hash_algorithm = self._contents[5]
+        if self.hash_algorithm == HASH_ALGORITHM_SHA1:
+            self.hash_size = 20
+        elif self.hash_algorithm == HASH_ALGORITHM_SHA256:
+            self.hash_size = 32
+        else:
+            raise ValueError(f"Unknown hash algorithm: {self.hash_algorithm}")
+
+        # Read chunk count
+        self.chunk_count = self._contents[6]
+
+        # Read base MIDX files count (currently always 0)
+        self.base_midx_files = self._contents[7]
+        if self.base_midx_files != 0:
+            raise ValueError("Incremental MIDX not yet supported")
+
+        # Read pack file count
+        (self.pack_count,) = struct.unpack(">L", self._contents[8:12])
+
+    def _parse_chunk_table(self) -> None:
+        """Parse the chunk lookup table."""
+        self._chunks = {}
+
+        # Chunk table starts at offset 12
+        offset = 12
+
+        # Each chunk entry is 12 bytes (4-byte ID + 8-byte offset). The
+        # table is terminated by an all-zero ID whose offset marks the end
+        # of the last chunk (the start of the trailer).
+        self._chunks_end = len(self._contents)
+        for _ in range(self.chunk_count + 1):  # +1 for terminator
+            chunk_id = self._contents[offset : offset + 4]
+            (chunk_offset,) = struct.unpack(
+                ">Q", self._contents[offset + 4 : offset + 12]
+            )
+
+            if chunk_id == b"\x00\x00\x00\x00":
+                # Terminator entry: record the end of the last chunk
+                self._chunks_end = chunk_offset
+                break
+
+            self._chunks[chunk_id] = chunk_offset
+            offset += 12
+
+        # Parse required chunks
+        self._parse_pnam_chunk()
+        self._parse_oidf_chunk()
+        self._parse_oidl_chunk()
+        self._parse_ooff_chunk()
+
+        # Parse optional chunks
+        if CHUNK_LOFF in self._chunks:
+            self._parse_loff_chunk()
+
+    def _parse_pnam_chunk(self) -> None:
+        """Parse the Packfile Names (PNAM) chunk."""
+        if CHUNK_PNAM not in self._chunks:
+            raise ValueError("Required PNAM chunk not found")
+
+        offset = self._chunks[CHUNK_PNAM]
+        self.pack_names = []
+
+        # Find the end of the PNAM chunk: the offset of the next chunk, or
+        # the terminator offset if PNAM is the last chunk (using the file
+        # length here would overshoot into the trailer checksum)
+        next_offset = min(
+            (o for o in self._chunks.values() if o > offset),
+            default=self._chunks_end,
+        )
+
+        # Parse null-terminated pack names
+        current = offset
+        while current < next_offset:
+            # Find the next null terminator
+            null_pos = self._contents.find(b"\x00", current, next_offset)
+            if null_pos == -1:
+                break
+
+            pack_name = self._contents[current:null_pos].decode("utf-8")
+            if pack_name:  # Skip empty strings (padding)
+                self.pack_names.append(pack_name)
+            current = null_pos + 1
+
+    def _parse_oidf_chunk(self) -> None:
+        """Parse the OID Fanout (OIDF) chunk."""
+        if CHUNK_OIDF not in self._chunks:
+            raise ValueError("Required OIDF chunk not found")
+
+        offset = self._chunks[CHUNK_OIDF]
+        self._fanout_table = []
+
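+        # fanout[b] counts the objects whose first OID byte is <= b, so
+        # fanout[255] is the total object count and the objects whose OID
+        # starts with byte b occupy indices [fanout[b - 1], fanout[b]).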
+        # Read 256 4-byte entries
+        for i in range(256):
+            (count,) = struct.unpack(
+                ">L", self._contents[offset + i * 4 : offset + i * 4 + 4]
+            )
+            self._fanout_table.append(count)
+
+        # Total object count is the last entry
+        self.object_count = self._fanout_table[255]
+
+    def _parse_oidl_chunk(self) -> None:
+        """Parse the OID Lookup (OIDL) chunk."""
+        if CHUNK_OIDL not in self._chunks:
+            raise ValueError("Required OIDL chunk not found")
+
+        self._oidl_offset = self._chunks[CHUNK_OIDL]
+
+    def _parse_ooff_chunk(self) -> None:
+        """Parse the Object Offsets (OOFF) chunk."""
+        if CHUNK_OOFF not in self._chunks:
+            raise ValueError("Required OOFF chunk not found")
+
+        self._ooff_offset = self._chunks[CHUNK_OOFF]
+
+    def _parse_loff_chunk(self) -> None:
+        """Parse the Large Offsets (LOFF) chunk."""
+        self._loff_offset = self._chunks[CHUNK_LOFF]
+
+    def __len__(self) -> int:
+        """Return the number of objects in this MIDX."""
+        return self.object_count
+
+    def _get_oid(self, index: int) -> bytes:
+        """Get the object ID at the given index.
+
+        Args:
+            index: Index of the object
+
+        Returns:
+            Binary object ID
+        """
+        if index < 0 or index >= self.object_count:
+            raise IndexError(f"Index {index} out of range")
+
+        offset = self._oidl_offset + index * self.hash_size
+        return self._contents[offset : offset + self.hash_size]
+
+    def _get_pack_info(self, index: int) -> tuple[int, int]:
+        """Get pack ID and offset for object at the given index.
+
+        Args:
+            index: Index of the object
+
+        Returns:
+            Tuple of (pack_id, offset)
+        """
+        if index < 0 or index >= self.object_count:
+            raise IndexError(f"Index {index} out of range")
+
+        # Each entry is 8 bytes (4-byte pack ID + 4-byte offset)
+        offset = self._ooff_offset + index * 8
+
+        (pack_id,) = struct.unpack(">L", self._contents[offset : offset + 4])
+        (pack_offset,) = struct.unpack(">L", self._contents[offset + 4 : offset + 8])
+
+        # Check if this is a large offset (MSB set)
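+        # (e.g. a raw value of 0x80000002 means the real 64-bit offset is
+        # stored at index 2 of the LOFF chunk)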
+        if pack_offset & 0x80000000:
+            # Look up in LOFF chunk
+            if CHUNK_LOFF not in self._chunks:
+                raise ValueError("Large offset found but no LOFF chunk")
+
+            large_index = pack_offset & 0x7FFFFFFF
+            large_offset_pos = self._loff_offset + large_index * 8
+            (pack_offset,) = struct.unpack(
+                ">Q", self._contents[large_offset_pos : large_offset_pos + 8]
+            )
+
+        return pack_id, pack_offset
+
+    def object_offset(self, sha: bytes) -> tuple[str, int] | None:
+        """Return the pack name and offset for the given object.
+
+        Args:
+            sha: Binary SHA-1 or SHA-256 hash
+
+        Returns:
+            Tuple of (pack_name, offset) or None if not found
+        """
+        if len(sha) != self.hash_size:
+            raise ValueError(
+                f"SHA size mismatch: expected {self.hash_size}, got {len(sha)}"
+            )
+
+        # Use fanout table to narrow search range
+        first_byte = sha[0]
+        start_idx = 0 if first_byte == 0 else self._fanout_table[first_byte - 1]
+        end_idx = self._fanout_table[first_byte]
+
+        # Binary search within the range
+        while start_idx < end_idx:
+            mid = (start_idx + end_idx) // 2
+            mid_sha = self._get_oid(mid)
+
+            if mid_sha == sha:
+                # Found it!
+                pack_id, offset = self._get_pack_info(mid)
+                return self.pack_names[pack_id], offset
+            elif mid_sha < sha:
+                start_idx = mid + 1
+            else:
+                end_idx = mid
+
+        return None
+
+    def __contains__(self, sha: bytes) -> bool:
+        """Check if the given object SHA is in this MIDX.
+
+        Args:
+            sha: Binary SHA hash
+
+        Returns:
+            True if the object is in this MIDX
+        """
+        return self.object_offset(sha) is not None
+
+    def iterentries(self) -> Iterator[tuple[bytes, str, int]]:
+        """Iterate over all entries in this MIDX.
+
+        Yields:
+            Tuples of (sha, pack_name, offset)
+        """
+        for i in range(self.object_count):
+            sha = self._get_oid(i)
+            pack_id, offset = self._get_pack_info(i)
+            pack_name = self.pack_names[pack_id]
+            yield sha, pack_name, offset
+
+    def close(self) -> None:
+        """Close the MIDX file."""
+        if self._file is not None:
+            self._file.close()
+            self._file = None
+
+
+def load_midx(path: str | os.PathLike[str]) -> MultiPackIndex:
+    """Load a multi-pack-index file by path.
+
+    Args:
+        path: Path to the MIDX file
+
+    Returns:
+        A MultiPackIndex loaded from the given path
+    """
+    with GitFile(path, "rb") as f:
+        return load_midx_file(path, f)
+
+
+def load_midx_file(
+    path: str | os.PathLike[str], f: IO[bytes] | _GitFile
+) -> MultiPackIndex:
+    """Load a multi-pack-index from a file-like object.
+
+    Args:
+        path: Path for the MIDX file
+        f: File-like object
+
+    Returns:
+        A MultiPackIndex loaded from the given file
+    """
+    return MultiPackIndex(path, file=f)
+
+
+# TODO: Implement MIDX writing functionality
+# TODO: Implement integration with object_store.py
+# TODO: Add support for incremental MIDX chains
+# TODO: Add support for BTMP and RIDX chunks for bitmap integration
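
A minimal usage sketch of the reading API added above, assuming a repository
whose multi-pack-index has already been written by git (the path and object
ID below are placeholders):

    from binascii import unhexlify

    from dulwich.midx import load_midx

    midx = load_midx(".git/objects/pack/multi-pack-index")
    print(f"{len(midx)} objects across {midx.pack_count} packs")

    # Find which pack holds a given (binary) object ID
    sha = unhexlify("1234567890abcdef1234567890abcdef12345678")
    result = midx.object_offset(sha)
    if result is not None:
        pack_name, offset = result
        print(f"{pack_name} @ {offset}")

    # Or walk every indexed object
    for sha, pack_name, offset in midx.iterentries():
        ...

    midx.close()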

dulwich/object_store.py (+128, -0)

@@ -42,6 +42,7 @@ from typing import (
 
 from .errors import NotTreeError
 from .file import GitFile, _GitFile
+from .midx import MultiPackIndex, load_midx
 from .objects import (
     S_ISGITLINK,
     ZERO_SHA,
@@ -1399,6 +1400,10 @@ class DiskObjectStore(PackBasedObjectStore):
         self._commit_graph = None
         self._use_commit_graph = True  # Default to true
 
+        # Multi-pack-index support - lazy loaded
+        self._midx: MultiPackIndex | None = None
+        self._use_midx = True  # Default to true
+
     def __repr__(self) -> str:
         """Return string representation of DiskObjectStore.
 
@@ -1485,6 +1490,9 @@ class DiskObjectStore(PackBasedObjectStore):
         # Read core.commitGraph setting
         use_commit_graph = config.get_boolean((b"core",), b"commitGraph", True)
 
+        # Read core.multiPackIndex setting
+        use_midx = config.get_boolean((b"core",), b"multiPackIndex", True)
+
         # Read core.fsyncObjectFiles setting
         fsync_object_files = config.get_boolean((b"core",), b"fsyncObjectFiles", False)
 
@@ -1521,6 +1529,7 @@ class DiskObjectStore(PackBasedObjectStore):
             dir_mode=dir_mode,
         )
         instance._use_commit_graph = use_commit_graph
+        instance._use_midx = use_midx
         return instance
 
     @property
@@ -2036,6 +2045,125 @@ class DiskObjectStore(PackBasedObjectStore):
                 self._commit_graph = read_commit_graph(graph_file)
         return self._commit_graph
 
+    def get_midx(self) -> MultiPackIndex | None:
+        """Get the multi-pack-index for this object store.
+
+        Returns:
+          MultiPackIndex object if available, None otherwise
+
+        Raises:
+          ValueError: If MIDX file is corrupt
+          OSError: If MIDX file cannot be read
+        """
+        if not self._use_midx:
+            return None
+
+        if self._midx is None:
+            # Look for MIDX in pack directory
+            midx_file = os.path.join(self.pack_dir, "multi-pack-index")
+            if os.path.exists(midx_file):
+                self._midx = load_midx(midx_file)
+        return self._midx
+
+    def _get_pack_by_name(self, pack_name: str) -> Pack:
+        """Get a pack by its base name.
+
+        Args:
+            pack_name: Base name of the pack (e.g., 'pack-abc123.pack')
+
+        Returns:
+            Pack object
+
+        Raises:
+            KeyError: If pack doesn't exist
+        """
+        # Remove .pack extension if present
+        if pack_name.endswith(".pack"):
+            base_name = pack_name[:-5]
+        else:
+            base_name = pack_name
+
+        # Check if already in cache
+        if base_name in self._pack_cache:
+            return self._pack_cache[base_name]
+
+        # Load the pack
+        pack_path = os.path.join(self.pack_dir, base_name)
+        if not os.path.exists(pack_path + ".pack"):
+            raise KeyError(f"Pack {pack_name} not found")
+
+        pack = Pack(
+            pack_path,
+            delta_window_size=self.pack_delta_window_size,
+            window_memory=self.pack_window_memory,
+            delta_cache_size=self.pack_delta_cache_size,
+            depth=self.pack_depth,
+            threads=self.pack_threads,
+            big_file_threshold=self.pack_big_file_threshold,
+        )
+        self._pack_cache[base_name] = pack
+        return pack
+
+    def contains_packed(self, sha: bytes) -> bool:
+        """Check if a particular object is present by SHA1 and is packed.
+
+        This checks the MIDX first if available, then falls back to checking
+        individual pack indexes.
+
+        Args:
+            sha: Binary SHA of the object
+
+        Returns:
+            True if the object is in a pack file
+        """
+        # Check MIDX first for faster lookup
+        midx = self.get_midx()
+        if midx is not None and sha in midx:
+            return True
+
+        # Fall back to checking individual packs
+        return super().contains_packed(sha)
+
+    def get_raw(self, name: bytes) -> tuple[int, bytes]:
+        """Obtain the raw fulltext for an object.
+
+        This uses the MIDX if available for faster lookups.
+
+        Args:
+            name: SHA for the object (20 bytes binary or 40 bytes hex)
+
+        Returns:
+            Tuple with numeric type and object contents
+
+        Raises:
+            KeyError: If object not found
+        """
+        if name == ZERO_SHA:
+            raise KeyError(name)
+
+        if len(name) == 40:
+            sha = hex_to_sha(name)
+        elif len(name) == 20:
+            sha = name
+        else:
+            raise AssertionError(f"Invalid object name {name!r}")
+
+        # Try MIDX first for faster lookup
+        midx = self.get_midx()
+        if midx is not None:
+            result = midx.object_offset(sha)
+            if result is not None:
+                pack_name, _offset = result
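+                # The MIDX offset itself is not used here; Pack.get_raw
+                # resolves the object through the pack's own index.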
+                try:
+                    pack = self._get_pack_by_name(pack_name)
+                    return pack.get_raw(sha)
+                except (KeyError, PackFileDisappeared):
+                    # Pack disappeared or object not found, fall through to standard lookup
+                    pass
+
+        # Fall back to the standard implementation
+        return super().get_raw(name)
+
     def write_commit_graph(
         self, refs: Iterable[ObjectID] | None = None, reachable: bool = True
     ) -> None:
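
A short sketch of the object-store integration, assuming an on-disk repository
(the path and object ID are placeholders). _use_midx defaults to True, and
DiskObjectStore.from_config additionally honors core.multiPackIndex, so
lookups consult the MIDX before falling back to per-pack indexes:

    from dulwich.object_store import DiskObjectStore

    store = DiskObjectStore("/path/to/repo/.git/objects")

    midx = store.get_midx()  # None if no multi-pack-index file exists
    if midx is not None:
        print(f"MIDX covers {len(midx)} objects in {len(midx.pack_names)} packs")

    # contains_packed() and get_raw() try the MIDX first, then fall back
    # to the standard per-pack lookup
    type_num, data = store.get_raw(b"1234567890abcdef1234567890abcdef12345678")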