Browse Source

Initial work on pack index v3 (#1119)

Initial work on pack index v3
Jelmer Vernooij 1 month ago
parent
commit
22d6e3799d
4 changed files with 362 additions and 12 deletions
  1. 6 0
      NEWS
  2. 186 10
      dulwich/pack.py
  3. 81 2
      tests/compat/test_pack.py
  4. 89 0
      tests/test_pack.py

+ 6 - 0
NEWS

@@ -1,5 +1,11 @@
 0.22.9	UNRELEASED
 
+ * Add support for pack index format version 3. This format supports variable
+   hash sizes to enable future SHA-256 support. The implementation includes
+   reading and writing v3 indexes with proper hash algorithm identification
+   (1 for SHA-1, 2 for SHA-256). Note that SHA-256 support itself is not yet
+   implemented and will raise NotImplementedError. (Jelmer Vernooij)
+
  * Fix ``LocalGitClient`` assertion error when fetching externally cloned repositories
    into ``MemoryRepo``. Previously, the client would fail with an AssertionError
    when trying to process pack data from repositories that were cloned externally.

+ 186 - 10
dulwich/pack.py

@@ -404,20 +404,17 @@ def load_pack_index_file(path: Union[str, os.PathLike], f):
       f: File-like object
     Returns: A PackIndex loaded from the given file
     """
-    # Ensure path is a string for PackIndex classes
-    path_str = os.fspath(path)
-    if isinstance(path_str, bytes):
-        path_str = os.fsdecode(path_str)
-
     contents, size = _load_file_contents(f)
     if contents[:4] == b"\377tOc":
         version = struct.unpack(b">L", contents[4:8])[0]
         if version == 2:
-            return PackIndex2(path_str, file=f, contents=contents, size=size)
+            return PackIndex2(path, file=f, contents=contents, size=size)
+        elif version == 3:
+            return PackIndex3(path, file=f, contents=contents, size=size)
         else:
             raise KeyError(f"Unknown pack index format {version}")
     else:
-        return PackIndex1(path_str, file=f, contents=contents, size=size)
+        return PackIndex1(path, file=f, contents=contents, size=size)
 
 
 def bisect_find_sha(start, end, sha, unpack_name):
@@ -453,6 +450,10 @@ class PackIndex:
     packfile of that object if it has it.
     """
 
+    # Default to SHA-1 for backward compatibility
+    hash_algorithm = 1
+    hash_size = 20
+
     def __eq__(self, other):
         if not isinstance(other, PackIndex):
             return False
@@ -770,7 +771,9 @@ class FilePackIndex(PackIndex):
 class PackIndex1(FilePackIndex):
     """Version 1 Pack Index file."""
 
-    def __init__(self, filename: str, file=None, contents=None, size=None) -> None:
+    def __init__(
+        self, filename: Union[str, os.PathLike], file=None, contents=None, size=None
+    ) -> None:
         super().__init__(filename, file, contents, size)
         self.version = 1
         self._fan_out_table = self._read_fan_out_table(0)
@@ -795,7 +798,9 @@ class PackIndex1(FilePackIndex):
 class PackIndex2(FilePackIndex):
     """Version 2 Pack Index file."""
 
-    def __init__(self, filename: str, file=None, contents=None, size=None) -> None:
+    def __init__(
+        self, filename: Union[str, os.PathLike], file=None, contents=None, size=None
+    ) -> None:
         super().__init__(filename, file, contents, size)
         if self._contents[:4] != b"\377tOc":
             raise AssertionError("Not a v2 pack index file")
@@ -833,6 +838,68 @@ class PackIndex2(FilePackIndex):
         return unpack_from(">L", self._contents, self._crc32_table_offset + i * 4)[0]
 
 
+class PackIndex3(FilePackIndex):
+    """Version 3 Pack Index file.
+
+    Supports variable hash sizes for SHA-1 (20 bytes) and SHA-256 (32 bytes).
+    """
+
+    def __init__(
+        self, filename: Union[str, os.PathLike], file=None, contents=None, size=None
+    ) -> None:
+        super().__init__(filename, file, contents, size)
+        if self._contents[:4] != b"\377tOc":
+            raise AssertionError("Not a v3 pack index file")
+        (self.version,) = unpack_from(b">L", self._contents, 4)
+        if self.version != 3:
+            raise AssertionError(f"Version was {self.version}")
+
+        # Read hash algorithm identifier (1 = SHA-1, 2 = SHA-256)
+        (self.hash_algorithm,) = unpack_from(b">L", self._contents, 8)
+        if self.hash_algorithm == 1:
+            self.hash_size = 20  # SHA-1
+        elif self.hash_algorithm == 2:
+            self.hash_size = 32  # SHA-256
+        else:
+            raise AssertionError(f"Unknown hash algorithm {self.hash_algorithm}")
+
+        # Read length of shortened object names
+        (self.shortened_oid_len,) = unpack_from(b">L", self._contents, 12)
+
+        # Calculate offsets based on variable hash size
+        self._fan_out_table = self._read_fan_out_table(
+            16
+        )  # After header (4 + 4 + 4 + 4)
+        self._name_table_offset = 16 + 0x100 * 4
+        self._crc32_table_offset = self._name_table_offset + self.hash_size * len(self)
+        self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
+        self._pack_offset_largetable_offset = self._pack_offset_table_offset + 4 * len(
+            self
+        )
+
+    def _unpack_entry(self, i):
+        return (
+            self._unpack_name(i),
+            self._unpack_offset(i),
+            self._unpack_crc32_checksum(i),
+        )
+
+    def _unpack_name(self, i):
+        offset = self._name_table_offset + i * self.hash_size
+        return self._contents[offset : offset + self.hash_size]
+
+    def _unpack_offset(self, i):
+        offset = self._pack_offset_table_offset + i * 4
+        offset = unpack_from(">L", self._contents, offset)[0]
+        if offset & (2**31):
+            offset = self._pack_offset_largetable_offset + (offset & (2**31 - 1)) * 8
+            offset = unpack_from(">Q", self._contents, offset)[0]
+        return offset
+
+    def _unpack_crc32_checksum(self, i):
+        return unpack_from(">L", self._contents, self._crc32_table_offset + i * 4)[0]
+
+
 def read_pack_header(read) -> tuple[int, int]:
     """Read the header of a pack file.
 
@@ -1344,12 +1411,37 @@ class PackData:
         with GitFile(filename, "wb") as f:
             return write_pack_index_v2(f, entries, self.calculate_checksum())
 
-    def create_index(self, filename, progress=None, version=2, resolve_ext_ref=None):
+    def create_index_v3(
+        self, filename, progress=None, resolve_ext_ref=None, hash_algorithm=1
+    ):
+        """Create a version 3 index file for this data file.
+
+        Args:
+          filename: Index filename.
+          progress: Progress report function
+          resolve_ext_ref: Function to resolve external references
+          hash_algorithm: Hash algorithm identifier (1 = SHA-1, 2 = SHA-256)
+        Returns: Checksum of index file
+        """
+        entries = self.sorted_entries(
+            progress=progress, resolve_ext_ref=resolve_ext_ref
+        )
+        with GitFile(filename, "wb") as f:
+            return write_pack_index_v3(
+                f, entries, self.calculate_checksum(), hash_algorithm
+            )
+
+    def create_index(
+        self, filename, progress=None, version=2, resolve_ext_ref=None, hash_algorithm=1
+    ):
         """Create an  index file for this data file.
 
         Args:
           filename: Index filename.
           progress: Progress report function
+          version: Index version (1, 2, or 3)
+          resolve_ext_ref: Function to resolve external references
+          hash_algorithm: Hash algorithm identifier for v3 (1 = SHA-1, 2 = SHA-256)
         Returns: Checksum of index file
         """
         if version == 1:
@@ -1360,6 +1452,13 @@ class PackData:
             return self.create_index_v2(
                 filename, progress, resolve_ext_ref=resolve_ext_ref
             )
+        elif version == 3:
+            return self.create_index_v3(
+                filename,
+                progress,
+                resolve_ext_ref=resolve_ext_ref,
+                hash_algorithm=hash_algorithm,
+            )
         else:
             raise ValueError(f"unknown index format {version}")
 
@@ -2496,6 +2595,83 @@ def write_pack_index_v2(
     return f.write_sha()
 
 
+def write_pack_index_v3(
+    f, entries: Iterable[PackIndexEntry], pack_checksum: bytes, hash_algorithm: int = 1
+) -> bytes:
+    """Write a new pack index file in v3 format.
+
+    Args:
+      f: File-like object to write to
+      entries: List of tuples with object name (sha), offset_in_pack, and
+        crc32_checksum.
+      pack_checksum: Checksum of the pack file.
+      hash_algorithm: Hash algorithm identifier (1 = SHA-1, 2 = SHA-256)
+    Returns: The SHA of the index file written
+    """
+    if hash_algorithm == 1:
+        hash_size = 20  # SHA-1
+        writer_cls = SHA1Writer
+    elif hash_algorithm == 2:
+        hash_size = 32  # SHA-256
+        # TODO: Add SHA256Writer when SHA-256 support is implemented
+        raise NotImplementedError("SHA-256 support not yet implemented")
+    else:
+        raise ValueError(f"Unknown hash algorithm {hash_algorithm}")
+
+    # Convert entries to list to allow multiple iterations
+    entries_list = list(entries)
+
+    # Calculate shortest unambiguous prefix length for object names
+    # For now, use full hash size (this could be optimized)
+    shortened_oid_len = hash_size
+
+    f = writer_cls(f)
+    f.write(b"\377tOc")  # Magic!
+    f.write(struct.pack(">L", 3))  # Version 3
+    f.write(struct.pack(">L", hash_algorithm))  # Hash algorithm
+    f.write(struct.pack(">L", shortened_oid_len))  # Shortened OID length
+
+    fan_out_table: dict[int, int] = defaultdict(lambda: 0)
+    for name, offset, entry_checksum in entries_list:
+        if len(name) != hash_size:
+            raise ValueError(
+                f"Object name has wrong length: expected {hash_size}, got {len(name)}"
+            )
+        fan_out_table[ord(name[:1])] += 1
+
+    # Fan-out table
+    largetable: list[int] = []
+    for i in range(0x100):
+        f.write(struct.pack(b">L", fan_out_table[i]))
+        fan_out_table[i + 1] += fan_out_table[i]
+
+    # Object names table
+    for name, offset, entry_checksum in entries_list:
+        f.write(name)
+
+    # CRC32 checksums table
+    for name, offset, entry_checksum in entries_list:
+        f.write(struct.pack(b">L", entry_checksum))
+
+    # Offset table
+    for name, offset, entry_checksum in entries_list:
+        if offset < 2**31:
+            f.write(struct.pack(b">L", offset))
+        else:
+            f.write(struct.pack(b">L", 2**31 + len(largetable)))
+            largetable.append(offset)
+
+    # Large offset table
+    for offset in largetable:
+        f.write(struct.pack(b">Q", offset))
+
+    assert len(pack_checksum) == hash_size, (
+        f"Pack checksum has wrong length: expected {hash_size}, got {len(pack_checksum)}"
+    )
+    f.write(pack_checksum)
+    return f.write_sha()
+
+
 write_pack_index = write_pack_index_v2
 
 

+ 81 - 2
tests/compat/test_pack.py

@@ -28,12 +28,19 @@ import shutil
 import tempfile
 from typing import NoReturn
 
+from dulwich.file import GitFile
 from dulwich.objects import Blob
-from dulwich.pack import write_pack
+from dulwich.pack import (
+    PackData,
+    PackIndex3,
+    load_pack_index,
+    write_pack,
+    write_pack_index_v3,
+)
 
 from .. import SkipTest
 from ..test_pack import PackTests, a_sha, pack1_sha
-from .utils import require_git_version, run_git_or_fail
+from .utils import require_git_version, rmtree_ro, run_git_or_fail
 
 _NON_DELTA_RE = re.compile(b"non delta: (?P<non_delta>\\d+) objects")
 
@@ -163,3 +170,75 @@ class TestPack(PackTests):
             got_non_delta,
             f"Expected 4 non-delta objects, got {got_non_delta}",
         )
+
+
+class TestPackIndexCompat(PackTests):
+    """Compatibility tests for pack index formats."""
+
+    def setUp(self) -> None:
+        require_git_version((1, 5, 0))
+        super().setUp()
+        self._tempdir = tempfile.mkdtemp()
+        self.addCleanup(rmtree_ro, self._tempdir)
+
+    def test_dulwich_create_index_git_readable(self) -> None:
+        """Test that git can read pack indexes created by dulwich."""
+        # Create a simple pack with objects
+        blob = Blob()
+        blob.data = b"Test blob"
+
+        pack_path = os.path.join(self._tempdir, "test_pack")
+        entries = [(blob, None)]
+        write_pack(pack_path, entries)
+
+        # Load the pack and create v2 index (most compatible)
+        pack_data = PackData(pack_path + ".pack")
+        pack_data.create_index(pack_path + ".idx", version=2)
+
+        # Verify git can read it
+        output = run_git_or_fail(["verify-pack", "-v", pack_path + ".pack"])
+        self.assertIn(blob.id.decode("ascii"), output.decode("ascii"))
+
+    def test_dulwich_read_git_index(self) -> None:
+        """Test that dulwich can read pack indexes created by git."""
+        # Create a simple pack with objects
+        blob = Blob()
+        blob.data = b"Test blob for git"
+
+        pack_path = os.path.join(self._tempdir, "git_pack")
+        entries = [(blob, None)]
+        write_pack(pack_path, entries)
+
+        # Create index with git
+        run_git_or_fail(["index-pack", pack_path + ".pack"])
+
+        # Load with dulwich
+        idx = load_pack_index(pack_path + ".idx")
+
+        # Verify it works
+        self.assertIn(blob.id, idx)
+        self.assertEqual(len(idx), 1)
+
+    def test_index_format_v3_sha256_future(self) -> None:
+        """Test that v3 index format is ready for SHA-256 support."""
+        # This test verifies the v3 implementation structure is ready
+        # for SHA-256, even though SHA-256 itself is not yet implemented
+
+        # Create a dummy v3 index to test the format
+        entries = [(b"a" * 20, 100, 1234)]  # SHA-1 for now
+
+        v3_path = os.path.join(self._tempdir, "v3_test.idx")
+        with GitFile(v3_path, "wb") as f:
+            write_pack_index_v3(f, entries, b"x" * 20, hash_algorithm=1)
+
+        # Load and verify structure
+        idx = load_pack_index(v3_path)
+        self.assertIsInstance(idx, PackIndex3)
+        self.assertEqual(idx.version, 3)
+        self.assertEqual(idx.hash_algorithm, 1)  # SHA-1
+        self.assertEqual(idx.hash_size, 20)
+
+        # Verify SHA-256 would raise NotImplementedError
+        with self.assertRaises(NotImplementedError):
+            with GitFile(v3_path + ".sha256", "wb") as f:
+                write_pack_index_v3(f, entries, b"x" * 32, hash_algorithm=2)

+ 89 - 0
tests/test_pack.py

@@ -42,6 +42,7 @@ from dulwich.pack import (
     MemoryPackIndex,
     Pack,
     PackData,
+    PackIndex3,
     PackStreamReader,
     UnpackedObject,
     UnresolvedDeltas,
@@ -58,6 +59,7 @@ from dulwich.pack import (
     write_pack_header,
     write_pack_index_v1,
     write_pack_index_v2,
+    write_pack_index_v3,
     write_pack_object,
 )
 from dulwich.tests.utils import build_pack, make_object
@@ -407,6 +409,25 @@ class TestPackData(PackTests):
             self.assertEqual(oct(os.stat(filename).st_mode), indexmode)
             self.assertEqual(idx1, idx2)
 
+    def test_create_index_v3(self) -> None:
+        with self.get_pack_data(pack1_sha) as p:
+            filename = os.path.join(self.tempdir, "v3test.idx")
+            p.create_index_v3(filename)
+            idx1 = load_pack_index(filename)
+            idx2 = self.get_pack_index(pack1_sha)
+            self.assertEqual(oct(os.stat(filename).st_mode), indexmode)
+            self.assertEqual(idx1, idx2)
+            self.assertIsInstance(idx1, PackIndex3)
+            self.assertEqual(idx1.version, 3)
+
+    def test_create_index_version3(self) -> None:
+        with self.get_pack_data(pack1_sha) as p:
+            filename = os.path.join(self.tempdir, "version3test.idx")
+            p.create_index(filename, version=3)
+            idx = load_pack_index(filename)
+            self.assertIsInstance(idx, PackIndex3)
+            self.assertEqual(idx.version, 3)
+
     def test_compute_file_sha(self) -> None:
         f = BytesIO(b"abcd1234wxyz")
         self.assertEqual(
@@ -878,6 +899,74 @@ class TestPackIndexWritingv2(TestCase, BaseTestFilePackIndexWriting):
         BaseTestFilePackIndexWriting.tearDown(self)
 
 
+class TestPackIndexWritingv3(TestCase, BaseTestFilePackIndexWriting):
+    def setUp(self) -> None:
+        TestCase.setUp(self)
+        BaseTestFilePackIndexWriting.setUp(self)
+        self._has_crc32_checksum = True
+        self._supports_large = True
+        self._expected_version = 3
+        self._write_fn = write_pack_index_v3
+
+    def tearDown(self) -> None:
+        TestCase.tearDown(self)
+        BaseTestFilePackIndexWriting.tearDown(self)
+
+    def test_load_v3_index_returns_packindex3(self) -> None:
+        """Test that loading a v3 index file returns a PackIndex3 instance."""
+        entries = [(b"abcd" * 5, 0, zlib.crc32(b""))]
+        filename = os.path.join(self.tempdir, "test.idx")
+        self.writeIndex(filename, entries, b"1234567890" * 2)
+        idx = load_pack_index(filename)
+        self.assertIsInstance(idx, PackIndex3)
+        self.assertEqual(idx.version, 3)
+        self.assertEqual(idx.hash_algorithm, 1)  # SHA-1
+        self.assertEqual(idx.hash_size, 20)
+        self.assertEqual(idx.shortened_oid_len, 20)
+
+    def test_v3_hash_algorithm(self) -> None:
+        """Test v3 index correctly handles hash algorithm field."""
+        entries = [(b"a" * 20, 42, zlib.crc32(b"data"))]
+        filename = os.path.join(self.tempdir, "test_hash.idx")
+        # Write v3 index with SHA-1 (algorithm=1)
+        with GitFile(filename, "wb") as f:
+            write_pack_index_v3(f, entries, b"1" * 20, hash_algorithm=1)
+        idx = load_pack_index(filename)
+        self.assertEqual(idx.hash_algorithm, 1)
+        self.assertEqual(idx.hash_size, 20)
+
+    def test_v3_sha256_length(self) -> None:
+        """Test v3 index with SHA-256 hash length."""
+        # For now, test that SHA-256 is not yet implemented
+        entries = [(b"a" * 32, 42, zlib.crc32(b"data"))]
+        filename = os.path.join(self.tempdir, "test_sha256.idx")
+        # SHA-256 should raise NotImplementedError
+        with self.assertRaises(NotImplementedError) as cm:
+            with GitFile(filename, "wb") as f:
+                write_pack_index_v3(f, entries, b"1" * 32, hash_algorithm=2)
+        self.assertIn("SHA-256", str(cm.exception))
+
+    def test_v3_invalid_hash_algorithm(self) -> None:
+        """Test v3 index with invalid hash algorithm."""
+        entries = [(b"a" * 20, 42, zlib.crc32(b"data"))]
+        filename = os.path.join(self.tempdir, "test_invalid.idx")
+        # Invalid hash algorithm should raise ValueError
+        with self.assertRaises(ValueError) as cm:
+            with GitFile(filename, "wb") as f:
+                write_pack_index_v3(f, entries, b"1" * 20, hash_algorithm=99)
+        self.assertIn("Unknown hash algorithm", str(cm.exception))
+
+    def test_v3_wrong_hash_length(self) -> None:
+        """Test v3 index with mismatched hash length."""
+        # Entry with wrong hash length for SHA-1
+        entries = [(b"a" * 15, 42, zlib.crc32(b"data"))]  # Too short
+        filename = os.path.join(self.tempdir, "test_wrong_len.idx")
+        with self.assertRaises(ValueError) as cm:
+            with GitFile(filename, "wb") as f:
+                write_pack_index_v3(f, entries, b"1" * 20, hash_algorithm=1)
+        self.assertIn("wrong length", str(cm.exception))
+
+
 class MockFileWithoutFileno:
     """Mock file-like object without fileno method."""