Browse Source

Initial work on pack index v3 (#1119)

Initial work on pack index v3
Jelmer Vernooij 1 month ago
parent
commit
22d6e3799d
4 changed files with 362 additions and 12 deletions
  1. 6 0
      NEWS
  2. 186 10
      dulwich/pack.py
  3. 81 2
      tests/compat/test_pack.py
  4. 89 0
      tests/test_pack.py

+ 6 - 0
NEWS

@@ -1,5 +1,11 @@
 0.22.9	UNRELEASED
 0.22.9	UNRELEASED
 
 
+ * Add support for pack index format version 3. This format supports variable
+   hash sizes to enable future SHA-256 support. The implementation includes
+   reading and writing v3 indexes with proper hash algorithm identification
+   (1 for SHA-1, 2 for SHA-256). Note that SHA-256 support itself is not yet
+   implemented and will raise NotImplementedError. (Jelmer Vernooij)
+
  * Fix ``LocalGitClient`` assertion error when fetching externally cloned repositories
  * Fix ``LocalGitClient`` assertion error when fetching externally cloned repositories
    into ``MemoryRepo``. Previously, the client would fail with an AssertionError
    into ``MemoryRepo``. Previously, the client would fail with an AssertionError
    when trying to process pack data from repositories that were cloned externally.
    when trying to process pack data from repositories that were cloned externally.

+ 186 - 10
dulwich/pack.py

@@ -404,20 +404,17 @@ def load_pack_index_file(path: Union[str, os.PathLike], f):
       f: File-like object
       f: File-like object
     Returns: A PackIndex loaded from the given file
     Returns: A PackIndex loaded from the given file
     """
     """
-    # Ensure path is a string for PackIndex classes
-    path_str = os.fspath(path)
-    if isinstance(path_str, bytes):
-        path_str = os.fsdecode(path_str)
-
     contents, size = _load_file_contents(f)
     contents, size = _load_file_contents(f)
     if contents[:4] == b"\377tOc":
     if contents[:4] == b"\377tOc":
         version = struct.unpack(b">L", contents[4:8])[0]
         version = struct.unpack(b">L", contents[4:8])[0]
         if version == 2:
         if version == 2:
-            return PackIndex2(path_str, file=f, contents=contents, size=size)
+            return PackIndex2(path, file=f, contents=contents, size=size)
+        elif version == 3:
+            return PackIndex3(path, file=f, contents=contents, size=size)
         else:
         else:
             raise KeyError(f"Unknown pack index format {version}")
             raise KeyError(f"Unknown pack index format {version}")
     else:
     else:
-        return PackIndex1(path_str, file=f, contents=contents, size=size)
+        return PackIndex1(path, file=f, contents=contents, size=size)
 
 
 
 
 def bisect_find_sha(start, end, sha, unpack_name):
 def bisect_find_sha(start, end, sha, unpack_name):
@@ -453,6 +450,10 @@ class PackIndex:
     packfile of that object if it has it.
     packfile of that object if it has it.
     """
     """
 
 
+    # Default to SHA-1 for backward compatibility
+    hash_algorithm = 1
+    hash_size = 20
+
     def __eq__(self, other):
     def __eq__(self, other):
         if not isinstance(other, PackIndex):
         if not isinstance(other, PackIndex):
             return False
             return False
@@ -770,7 +771,9 @@ class FilePackIndex(PackIndex):
 class PackIndex1(FilePackIndex):
 class PackIndex1(FilePackIndex):
     """Version 1 Pack Index file."""
     """Version 1 Pack Index file."""
 
 
-    def __init__(self, filename: str, file=None, contents=None, size=None) -> None:
+    def __init__(
+        self, filename: Union[str, os.PathLike], file=None, contents=None, size=None
+    ) -> None:
         super().__init__(filename, file, contents, size)
         super().__init__(filename, file, contents, size)
         self.version = 1
         self.version = 1
         self._fan_out_table = self._read_fan_out_table(0)
         self._fan_out_table = self._read_fan_out_table(0)
@@ -795,7 +798,9 @@ class PackIndex1(FilePackIndex):
 class PackIndex2(FilePackIndex):
 class PackIndex2(FilePackIndex):
     """Version 2 Pack Index file."""
     """Version 2 Pack Index file."""
 
 
-    def __init__(self, filename: str, file=None, contents=None, size=None) -> None:
+    def __init__(
+        self, filename: Union[str, os.PathLike], file=None, contents=None, size=None
+    ) -> None:
         super().__init__(filename, file, contents, size)
         super().__init__(filename, file, contents, size)
         if self._contents[:4] != b"\377tOc":
         if self._contents[:4] != b"\377tOc":
             raise AssertionError("Not a v2 pack index file")
             raise AssertionError("Not a v2 pack index file")
@@ -833,6 +838,68 @@ class PackIndex2(FilePackIndex):
         return unpack_from(">L", self._contents, self._crc32_table_offset + i * 4)[0]
         return unpack_from(">L", self._contents, self._crc32_table_offset + i * 4)[0]
 
 
 
 
+class PackIndex3(FilePackIndex):
+    """Version 3 Pack Index file.
+
+    Supports variable hash sizes for SHA-1 (20 bytes) and SHA-256 (32 bytes).
+    """
+
+    def __init__(
+        self, filename: Union[str, os.PathLike], file=None, contents=None, size=None
+    ) -> None:
+        super().__init__(filename, file, contents, size)
+        if self._contents[:4] != b"\377tOc":
+            raise AssertionError("Not a v3 pack index file")
+        (self.version,) = unpack_from(b">L", self._contents, 4)
+        if self.version != 3:
+            raise AssertionError(f"Version was {self.version}")
+
+        # Read hash algorithm identifier (1 = SHA-1, 2 = SHA-256)
+        (self.hash_algorithm,) = unpack_from(b">L", self._contents, 8)
+        if self.hash_algorithm == 1:
+            self.hash_size = 20  # SHA-1
+        elif self.hash_algorithm == 2:
+            self.hash_size = 32  # SHA-256
+        else:
+            raise AssertionError(f"Unknown hash algorithm {self.hash_algorithm}")
+
+        # Read length of shortened object names
+        (self.shortened_oid_len,) = unpack_from(b">L", self._contents, 12)
+
+        # Calculate offsets based on variable hash size
+        self._fan_out_table = self._read_fan_out_table(
+            16
+        )  # After header (4 + 4 + 4 + 4)
+        self._name_table_offset = 16 + 0x100 * 4
+        self._crc32_table_offset = self._name_table_offset + self.hash_size * len(self)
+        self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
+        self._pack_offset_largetable_offset = self._pack_offset_table_offset + 4 * len(
+            self
+        )
+
+    def _unpack_entry(self, i):
+        return (
+            self._unpack_name(i),
+            self._unpack_offset(i),
+            self._unpack_crc32_checksum(i),
+        )
+
+    def _unpack_name(self, i):
+        offset = self._name_table_offset + i * self.hash_size
+        return self._contents[offset : offset + self.hash_size]
+
+    def _unpack_offset(self, i):
+        offset = self._pack_offset_table_offset + i * 4
+        offset = unpack_from(">L", self._contents, offset)[0]
+        if offset & (2**31):
+            offset = self._pack_offset_largetable_offset + (offset & (2**31 - 1)) * 8
+            offset = unpack_from(">Q", self._contents, offset)[0]
+        return offset
+
+    def _unpack_crc32_checksum(self, i):
+        return unpack_from(">L", self._contents, self._crc32_table_offset + i * 4)[0]
+
+
 def read_pack_header(read) -> tuple[int, int]:
 def read_pack_header(read) -> tuple[int, int]:
     """Read the header of a pack file.
     """Read the header of a pack file.
 
 
@@ -1344,12 +1411,37 @@ class PackData:
         with GitFile(filename, "wb") as f:
         with GitFile(filename, "wb") as f:
             return write_pack_index_v2(f, entries, self.calculate_checksum())
             return write_pack_index_v2(f, entries, self.calculate_checksum())
 
 
-    def create_index(self, filename, progress=None, version=2, resolve_ext_ref=None):
+    def create_index_v3(
+        self, filename, progress=None, resolve_ext_ref=None, hash_algorithm=1
+    ):
+        """Create a version 3 index file for this data file.
+
+        Args:
+          filename: Index filename.
+          progress: Progress report function
+          resolve_ext_ref: Function to resolve external references
+          hash_algorithm: Hash algorithm identifier (1 = SHA-1, 2 = SHA-256)
+        Returns: Checksum of index file
+        """
+        entries = self.sorted_entries(
+            progress=progress, resolve_ext_ref=resolve_ext_ref
+        )
+        with GitFile(filename, "wb") as f:
+            return write_pack_index_v3(
+                f, entries, self.calculate_checksum(), hash_algorithm
+            )
+
+    def create_index(
+        self, filename, progress=None, version=2, resolve_ext_ref=None, hash_algorithm=1
+    ):
         """Create an  index file for this data file.
         """Create an  index file for this data file.
 
 
         Args:
         Args:
           filename: Index filename.
           filename: Index filename.
           progress: Progress report function
           progress: Progress report function
+          version: Index version (1, 2, or 3)
+          resolve_ext_ref: Function to resolve external references
+          hash_algorithm: Hash algorithm identifier for v3 (1 = SHA-1, 2 = SHA-256)
         Returns: Checksum of index file
         Returns: Checksum of index file
         """
         """
         if version == 1:
         if version == 1:
@@ -1360,6 +1452,13 @@ class PackData:
             return self.create_index_v2(
             return self.create_index_v2(
                 filename, progress, resolve_ext_ref=resolve_ext_ref
                 filename, progress, resolve_ext_ref=resolve_ext_ref
             )
             )
+        elif version == 3:
+            return self.create_index_v3(
+                filename,
+                progress,
+                resolve_ext_ref=resolve_ext_ref,
+                hash_algorithm=hash_algorithm,
+            )
         else:
         else:
             raise ValueError(f"unknown index format {version}")
             raise ValueError(f"unknown index format {version}")
 
 
@@ -2496,6 +2595,83 @@ def write_pack_index_v2(
     return f.write_sha()
     return f.write_sha()
 
 
 
 
+def write_pack_index_v3(
+    f, entries: Iterable[PackIndexEntry], pack_checksum: bytes, hash_algorithm: int = 1
+) -> bytes:
+    """Write a new pack index file in v3 format.
+
+    Args:
+      f: File-like object to write to
+      entries: List of tuples with object name (sha), offset_in_pack, and
+        crc32_checksum.
+      pack_checksum: Checksum of the pack file.
+      hash_algorithm: Hash algorithm identifier (1 = SHA-1, 2 = SHA-256)
+    Returns: The SHA of the index file written
+    """
+    if hash_algorithm == 1:
+        hash_size = 20  # SHA-1
+        writer_cls = SHA1Writer
+    elif hash_algorithm == 2:
+        hash_size = 32  # SHA-256
+        # TODO: Add SHA256Writer when SHA-256 support is implemented
+        raise NotImplementedError("SHA-256 support not yet implemented")
+    else:
+        raise ValueError(f"Unknown hash algorithm {hash_algorithm}")
+
+    # Convert entries to list to allow multiple iterations
+    entries_list = list(entries)
+
+    # Calculate shortest unambiguous prefix length for object names
+    # For now, use full hash size (this could be optimized)
+    shortened_oid_len = hash_size
+
+    f = writer_cls(f)
+    f.write(b"\377tOc")  # Magic!
+    f.write(struct.pack(">L", 3))  # Version 3
+    f.write(struct.pack(">L", hash_algorithm))  # Hash algorithm
+    f.write(struct.pack(">L", shortened_oid_len))  # Shortened OID length
+
+    fan_out_table: dict[int, int] = defaultdict(lambda: 0)
+    for name, offset, entry_checksum in entries_list:
+        if len(name) != hash_size:
+            raise ValueError(
+                f"Object name has wrong length: expected {hash_size}, got {len(name)}"
+            )
+        fan_out_table[ord(name[:1])] += 1
+
+    # Fan-out table
+    largetable: list[int] = []
+    for i in range(0x100):
+        f.write(struct.pack(b">L", fan_out_table[i]))
+        fan_out_table[i + 1] += fan_out_table[i]
+
+    # Object names table
+    for name, offset, entry_checksum in entries_list:
+        f.write(name)
+
+    # CRC32 checksums table
+    for name, offset, entry_checksum in entries_list:
+        f.write(struct.pack(b">L", entry_checksum))
+
+    # Offset table
+    for name, offset, entry_checksum in entries_list:
+        if offset < 2**31:
+            f.write(struct.pack(b">L", offset))
+        else:
+            f.write(struct.pack(b">L", 2**31 + len(largetable)))
+            largetable.append(offset)
+
+    # Large offset table
+    for offset in largetable:
+        f.write(struct.pack(b">Q", offset))
+
+    assert len(pack_checksum) == hash_size, (
+        f"Pack checksum has wrong length: expected {hash_size}, got {len(pack_checksum)}"
+    )
+    f.write(pack_checksum)
+    return f.write_sha()
+
+
 write_pack_index = write_pack_index_v2
 write_pack_index = write_pack_index_v2
 
 
 
 

+ 81 - 2
tests/compat/test_pack.py

@@ -28,12 +28,19 @@ import shutil
 import tempfile
 import tempfile
 from typing import NoReturn
 from typing import NoReturn
 
 
+from dulwich.file import GitFile
 from dulwich.objects import Blob
 from dulwich.objects import Blob
-from dulwich.pack import write_pack
+from dulwich.pack import (
+    PackData,
+    PackIndex3,
+    load_pack_index,
+    write_pack,
+    write_pack_index_v3,
+)
 
 
 from .. import SkipTest
 from .. import SkipTest
 from ..test_pack import PackTests, a_sha, pack1_sha
 from ..test_pack import PackTests, a_sha, pack1_sha
-from .utils import require_git_version, run_git_or_fail
+from .utils import require_git_version, rmtree_ro, run_git_or_fail
 
 
 _NON_DELTA_RE = re.compile(b"non delta: (?P<non_delta>\\d+) objects")
 _NON_DELTA_RE = re.compile(b"non delta: (?P<non_delta>\\d+) objects")
 
 
@@ -163,3 +170,75 @@ class TestPack(PackTests):
             got_non_delta,
             got_non_delta,
             f"Expected 4 non-delta objects, got {got_non_delta}",
             f"Expected 4 non-delta objects, got {got_non_delta}",
         )
         )
+
+
+class TestPackIndexCompat(PackTests):
+    """Compatibility tests for pack index formats."""
+
+    def setUp(self) -> None:
+        require_git_version((1, 5, 0))
+        super().setUp()
+        self._tempdir = tempfile.mkdtemp()
+        self.addCleanup(rmtree_ro, self._tempdir)
+
+    def test_dulwich_create_index_git_readable(self) -> None:
+        """Test that git can read pack indexes created by dulwich."""
+        # Create a simple pack with objects
+        blob = Blob()
+        blob.data = b"Test blob"
+
+        pack_path = os.path.join(self._tempdir, "test_pack")
+        entries = [(blob, None)]
+        write_pack(pack_path, entries)
+
+        # Load the pack and create v2 index (most compatible)
+        pack_data = PackData(pack_path + ".pack")
+        pack_data.create_index(pack_path + ".idx", version=2)
+
+        # Verify git can read it
+        output = run_git_or_fail(["verify-pack", "-v", pack_path + ".pack"])
+        self.assertIn(blob.id.decode("ascii"), output.decode("ascii"))
+
+    def test_dulwich_read_git_index(self) -> None:
+        """Test that dulwich can read pack indexes created by git."""
+        # Create a simple pack with objects
+        blob = Blob()
+        blob.data = b"Test blob for git"
+
+        pack_path = os.path.join(self._tempdir, "git_pack")
+        entries = [(blob, None)]
+        write_pack(pack_path, entries)
+
+        # Create index with git
+        run_git_or_fail(["index-pack", pack_path + ".pack"])
+
+        # Load with dulwich
+        idx = load_pack_index(pack_path + ".idx")
+
+        # Verify it works
+        self.assertIn(blob.id, idx)
+        self.assertEqual(len(idx), 1)
+
+    def test_index_format_v3_sha256_future(self) -> None:
+        """Test that v3 index format is ready for SHA-256 support."""
+        # This test verifies the v3 implementation structure is ready
+        # for SHA-256, even though SHA-256 itself is not yet implemented
+
+        # Create a dummy v3 index to test the format
+        entries = [(b"a" * 20, 100, 1234)]  # SHA-1 for now
+
+        v3_path = os.path.join(self._tempdir, "v3_test.idx")
+        with GitFile(v3_path, "wb") as f:
+            write_pack_index_v3(f, entries, b"x" * 20, hash_algorithm=1)
+
+        # Load and verify structure
+        idx = load_pack_index(v3_path)
+        self.assertIsInstance(idx, PackIndex3)
+        self.assertEqual(idx.version, 3)
+        self.assertEqual(idx.hash_algorithm, 1)  # SHA-1
+        self.assertEqual(idx.hash_size, 20)
+
+        # Verify SHA-256 would raise NotImplementedError
+        with self.assertRaises(NotImplementedError):
+            with GitFile(v3_path + ".sha256", "wb") as f:
+                write_pack_index_v3(f, entries, b"x" * 32, hash_algorithm=2)

+ 89 - 0
tests/test_pack.py

@@ -42,6 +42,7 @@ from dulwich.pack import (
     MemoryPackIndex,
     MemoryPackIndex,
     Pack,
     Pack,
     PackData,
     PackData,
+    PackIndex3,
     PackStreamReader,
     PackStreamReader,
     UnpackedObject,
     UnpackedObject,
     UnresolvedDeltas,
     UnresolvedDeltas,
@@ -58,6 +59,7 @@ from dulwich.pack import (
     write_pack_header,
     write_pack_header,
     write_pack_index_v1,
     write_pack_index_v1,
     write_pack_index_v2,
     write_pack_index_v2,
+    write_pack_index_v3,
     write_pack_object,
     write_pack_object,
 )
 )
 from dulwich.tests.utils import build_pack, make_object
 from dulwich.tests.utils import build_pack, make_object
@@ -407,6 +409,25 @@ class TestPackData(PackTests):
             self.assertEqual(oct(os.stat(filename).st_mode), indexmode)
             self.assertEqual(oct(os.stat(filename).st_mode), indexmode)
             self.assertEqual(idx1, idx2)
             self.assertEqual(idx1, idx2)
 
 
+    def test_create_index_v3(self) -> None:
+        with self.get_pack_data(pack1_sha) as p:
+            filename = os.path.join(self.tempdir, "v3test.idx")
+            p.create_index_v3(filename)
+            idx1 = load_pack_index(filename)
+            idx2 = self.get_pack_index(pack1_sha)
+            self.assertEqual(oct(os.stat(filename).st_mode), indexmode)
+            self.assertEqual(idx1, idx2)
+            self.assertIsInstance(idx1, PackIndex3)
+            self.assertEqual(idx1.version, 3)
+
+    def test_create_index_version3(self) -> None:
+        with self.get_pack_data(pack1_sha) as p:
+            filename = os.path.join(self.tempdir, "version3test.idx")
+            p.create_index(filename, version=3)
+            idx = load_pack_index(filename)
+            self.assertIsInstance(idx, PackIndex3)
+            self.assertEqual(idx.version, 3)
+
     def test_compute_file_sha(self) -> None:
     def test_compute_file_sha(self) -> None:
         f = BytesIO(b"abcd1234wxyz")
         f = BytesIO(b"abcd1234wxyz")
         self.assertEqual(
         self.assertEqual(
@@ -878,6 +899,74 @@ class TestPackIndexWritingv2(TestCase, BaseTestFilePackIndexWriting):
         BaseTestFilePackIndexWriting.tearDown(self)
         BaseTestFilePackIndexWriting.tearDown(self)
 
 
 
 
+class TestPackIndexWritingv3(TestCase, BaseTestFilePackIndexWriting):
+    def setUp(self) -> None:
+        TestCase.setUp(self)
+        BaseTestFilePackIndexWriting.setUp(self)
+        self._has_crc32_checksum = True
+        self._supports_large = True
+        self._expected_version = 3
+        self._write_fn = write_pack_index_v3
+
+    def tearDown(self) -> None:
+        TestCase.tearDown(self)
+        BaseTestFilePackIndexWriting.tearDown(self)
+
+    def test_load_v3_index_returns_packindex3(self) -> None:
+        """Test that loading a v3 index file returns a PackIndex3 instance."""
+        entries = [(b"abcd" * 5, 0, zlib.crc32(b""))]
+        filename = os.path.join(self.tempdir, "test.idx")
+        self.writeIndex(filename, entries, b"1234567890" * 2)
+        idx = load_pack_index(filename)
+        self.assertIsInstance(idx, PackIndex3)
+        self.assertEqual(idx.version, 3)
+        self.assertEqual(idx.hash_algorithm, 1)  # SHA-1
+        self.assertEqual(idx.hash_size, 20)
+        self.assertEqual(idx.shortened_oid_len, 20)
+
+    def test_v3_hash_algorithm(self) -> None:
+        """Test v3 index correctly handles hash algorithm field."""
+        entries = [(b"a" * 20, 42, zlib.crc32(b"data"))]
+        filename = os.path.join(self.tempdir, "test_hash.idx")
+        # Write v3 index with SHA-1 (algorithm=1)
+        with GitFile(filename, "wb") as f:
+            write_pack_index_v3(f, entries, b"1" * 20, hash_algorithm=1)
+        idx = load_pack_index(filename)
+        self.assertEqual(idx.hash_algorithm, 1)
+        self.assertEqual(idx.hash_size, 20)
+
+    def test_v3_sha256_length(self) -> None:
+        """Test v3 index with SHA-256 hash length."""
+        # For now, test that SHA-256 is not yet implemented
+        entries = [(b"a" * 32, 42, zlib.crc32(b"data"))]
+        filename = os.path.join(self.tempdir, "test_sha256.idx")
+        # SHA-256 should raise NotImplementedError
+        with self.assertRaises(NotImplementedError) as cm:
+            with GitFile(filename, "wb") as f:
+                write_pack_index_v3(f, entries, b"1" * 32, hash_algorithm=2)
+        self.assertIn("SHA-256", str(cm.exception))
+
+    def test_v3_invalid_hash_algorithm(self) -> None:
+        """Test v3 index with invalid hash algorithm."""
+        entries = [(b"a" * 20, 42, zlib.crc32(b"data"))]
+        filename = os.path.join(self.tempdir, "test_invalid.idx")
+        # Invalid hash algorithm should raise ValueError
+        with self.assertRaises(ValueError) as cm:
+            with GitFile(filename, "wb") as f:
+                write_pack_index_v3(f, entries, b"1" * 20, hash_algorithm=99)
+        self.assertIn("Unknown hash algorithm", str(cm.exception))
+
+    def test_v3_wrong_hash_length(self) -> None:
+        """Test v3 index with mismatched hash length."""
+        # Entry with wrong hash length for SHA-1
+        entries = [(b"a" * 15, 42, zlib.crc32(b"data"))]  # Too short
+        filename = os.path.join(self.tempdir, "test_wrong_len.idx")
+        with self.assertRaises(ValueError) as cm:
+            with GitFile(filename, "wb") as f:
+                write_pack_index_v3(f, entries, b"1" * 20, hash_algorithm=1)
+        self.assertIn("wrong length", str(cm.exception))
+
+
 class MockFileWithoutFileno:
 class MockFileWithoutFileno:
     """Mock file-like object without fileno method."""
     """Mock file-like object without fileno method."""