Sfoglia il codice sorgente

Fix MIDX format compatibility with Git and add compat tests

- Use .idx extension for pack names in MIDX (not .pack)
- Fix chunk table terminator offset
- Sort pack names lexicographically
Jelmer Vernooij 2 mesi fa
parent
commit
565be41d3b
4 ha cambiato i file con 288 aggiunte e 27 eliminazioni
  1. 16 5
      dulwich/midx.py
  2. 6 3
      dulwich/object_store.py
  3. 247 0
      tests/compat/test_midx.py
  4. 19 19
      tests/test_midx.py

+ 16 - 5
dulwich/midx.py

@@ -450,11 +450,14 @@ def write_midx(
     # Wrap file in SHA1Writer to compute checksum
     writer = SHA1Writer(f)
 
+    # Sort pack entries by pack name (required by Git)
+    pack_index_entries_sorted = sorted(pack_index_entries, key=lambda x: x[0])
+
     # Collect all objects from all packs
     all_objects: list[tuple[bytes, int, int]] = []  # (sha, pack_id, offset)
     pack_names: list[str] = []
 
-    for pack_id, (pack_name, entries) in enumerate(pack_index_entries):
+    for pack_id, (pack_name, entries) in enumerate(pack_index_entries_sorted):
         pack_names.append(pack_name)
         for sha, offset, _crc32 in entries:
             all_objects.append((sha, pack_id, offset))
@@ -508,11 +511,19 @@ def write_midx(
     current_offset += ooff_size
 
     # LOFF chunk (if needed): variable size
+    # We'll calculate the exact size when we know how many large offsets we have
     loff_offset = current_offset if need_loff else 0
     large_offsets: list[int] = []
+
+    # Calculate trailer offset (where checksum starts)
+    # We need to pre-calculate large offset count for accurate trailer offset
     if need_loff:
-        # We'll populate this as we write OOFF
-        pass
+        # Count large offsets
+        large_offset_count = sum(1 for _, _, offset in all_objects if offset >= 2**31)
+        loff_size = large_offset_count * 8
+        trailer_offset = current_offset + loff_size
+    else:
+        trailer_offset = current_offset
 
     # Write header
     writer.write(MIDX_SIGNATURE)  # 4 bytes: signature
@@ -536,9 +547,9 @@ def write_midx(
         writer.write(chunk_id)  # 4 bytes
         writer.write(struct.pack(">Q", chunk_offset))  # 8 bytes
 
-    # Write terminator
+    # Write terminator (points to where trailer/checksum starts)
     writer.write(b"\x00\x00\x00\x00")  # 4 bytes
-    writer.write(struct.pack(">Q", 0))  # 8 bytes
+    writer.write(struct.pack(">Q", trailer_offset))  # 8 bytes
 
     # Write PNAM chunk
     writer.write(pnam_data)

+ 6 - 3
dulwich/object_store.py

@@ -2069,7 +2069,7 @@ class DiskObjectStore(PackBasedObjectStore):
         """Get a pack by its base name.
 
         Args:
-            pack_name: Base name of the pack (e.g., 'pack-abc123.pack')
+            pack_name: Base name of the pack (e.g., 'pack-abc123.pack' or 'pack-abc123.idx')
 
         Returns:
             Pack object
@@ -2077,9 +2077,11 @@ class DiskObjectStore(PackBasedObjectStore):
         Raises:
             KeyError: If pack doesn't exist
         """
-        # Remove .pack extension if present
+        # Remove .pack or .idx extension if present
         if pack_name.endswith(".pack"):
             base_name = pack_name[:-5]
+        elif pack_name.endswith(".idx"):
+            base_name = pack_name[:-4]
         else:
             base_name = pack_name
 
@@ -2187,7 +2189,8 @@ class DiskObjectStore(PackBasedObjectStore):
         pack_entries: list[tuple[str, list[tuple[bytes, int, int | None]]]] = []
 
         for pack in packs:
-            pack_name = os.path.basename(pack._basename) + ".pack"
+            # Git stores .idx extension in MIDX, not .pack
+            pack_name = os.path.basename(pack._basename) + ".idx"
             entries = list(pack.index.iterentries())
             pack_entries.append((pack_name, entries))
 

+ 247 - 0
tests/compat/test_midx.py

@@ -0,0 +1,247 @@
+# test_midx.py -- Compatibility tests for multi-pack-index functionality
+# Copyright (C) 2025 Jelmer Vernooij <jelmer@jelmer.uk>
+#
+# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
+# General Public License as published by the Free Software Foundation; version 2.0
+# or (at your option) any later version. You can redistribute it and/or
+# modify it under the terms of either of these two licenses.
+
+"""Compatibility tests for Git multi-pack-index functionality.
+
+These tests verify that dulwich's MIDX implementation can read and interact
+with MIDX files created by C Git, and that Git can read MIDX files created
+by Dulwich.
+"""
+
+import os
+import tempfile
+
+from dulwich.midx import load_midx
+from dulwich.object_store import DiskObjectStore
+from dulwich.repo import Repo
+
+from .utils import CompatTestCase, run_git_or_fail
+
+
+class MIDXCompatTests(CompatTestCase):
+    """Compatibility tests for multi-pack-index functionality."""
+
+    # Multi-pack-index was introduced in Git 2.21.0
+    min_git_version = (2, 21, 0)
+
+    def setUp(self):
+        super().setUp()
+        self.test_dir = tempfile.mkdtemp()
+        self.repo_path = os.path.join(self.test_dir, "test-repo")
+
+        # Set up git identity to avoid committer identity errors
+        self.overrideEnv("GIT_COMMITTER_NAME", "Test Author")
+        self.overrideEnv("GIT_COMMITTER_EMAIL", "test@example.com")
+        self.overrideEnv("GIT_AUTHOR_NAME", "Test Author")
+        self.overrideEnv("GIT_AUTHOR_EMAIL", "test@example.com")
+
+    def tearDown(self):
+        from .utils import rmtree_ro
+
+        rmtree_ro(self.test_dir)
+
+    def create_test_repo_with_packs(self):
+        """Create a test repository with multiple pack files."""
+        # Initialize repository
+        run_git_or_fail(["init"], cwd=self.test_dir)
+        os.rename(os.path.join(self.test_dir, ".git"), self.repo_path)
+
+        work_dir = os.path.join(self.test_dir, "work")
+        os.makedirs(work_dir)
+
+        # Create .git file pointing to our repo
+        with open(os.path.join(work_dir, ".git"), "w") as f:
+            f.write(f"gitdir: {self.repo_path}\n")
+
+        # Create some commits and pack them
+        for i in range(5):
+            filename = f"file{i}.txt"
+            with open(os.path.join(work_dir, filename), "w") as f:
+                f.write(f"Content {i}\n" * 100)  # Make files bigger to ensure packing
+
+            run_git_or_fail(["add", filename], cwd=work_dir)
+            run_git_or_fail(
+                [
+                    "commit",
+                    "-m",
+                    f"Commit {i}",
+                    "--author",
+                    "Test Author <test@example.com>",
+                ],
+                cwd=work_dir,
+            )
+
+            # Create a pack file after each commit to get multiple packs
+            if i > 0:  # Skip first commit to avoid empty pack
+                run_git_or_fail(["repack", "-d"], cwd=work_dir)
+
+        return work_dir
+
+    def test_read_git_midx(self):
+        """Test that Dulwich can read a MIDX file created by Git."""
+        work_dir = self.create_test_repo_with_packs()
+
+        # Have Git create a MIDX file
+        run_git_or_fail(["multi-pack-index", "write"], cwd=work_dir)
+
+        # Verify Git created the MIDX file
+        midx_path = os.path.join(self.repo_path, "objects", "pack", "multi-pack-index")
+        self.assertTrue(
+            os.path.exists(midx_path), "Git did not create multi-pack-index file"
+        )
+
+        # Load the MIDX file with Dulwich
+        midx = load_midx(midx_path)
+
+        # Verify we can read it
+        self.assertGreater(len(midx), 0, "MIDX should contain objects")
+        self.assertGreater(midx.pack_count, 0, "MIDX should reference packs")
+
+        # Verify the pack names look reasonable
+        # Git stores .idx extensions in MIDX files
+        for pack_name in midx.pack_names:
+            self.assertTrue(pack_name.startswith("pack-"))
+            self.assertTrue(pack_name.endswith(".idx"))
+
+    def test_git_uses_dulwich_midx(self):
+        """Test that Git can use a MIDX file created by Dulwich."""
+        work_dir = self.create_test_repo_with_packs()
+
+        # Use Dulwich to create a MIDX file
+        repo = Repo(self.repo_path)
+        store = repo.object_store
+        self.assertIsInstance(store, DiskObjectStore)
+
+        # Write MIDX with Dulwich
+        checksum = store.write_midx()
+        self.assertEqual(20, len(checksum))
+
+        # Verify the file was created
+        midx_path = os.path.join(self.repo_path, "objects", "pack", "multi-pack-index")
+        self.assertTrue(os.path.exists(midx_path))
+
+        # Have Git verify the MIDX file (should succeed with return code 0)
+        run_git_or_fail(["multi-pack-index", "verify"], cwd=work_dir)
+
+        # Try to use the MIDX with Git commands
+        # This should work if the MIDX is valid
+        run_git_or_fail(["fsck"], cwd=work_dir)
+
+    def test_midx_object_lookup_matches_git(self):
+        """Test that object lookups through MIDX match Git's results."""
+        work_dir = self.create_test_repo_with_packs()
+
+        # Have Git create a MIDX file
+        run_git_or_fail(["multi-pack-index", "write"], cwd=work_dir)
+
+        # Load with Dulwich
+        repo = Repo(self.repo_path)
+        store = repo.object_store
+
+        # Get MIDX
+        midx = store.get_midx()
+        self.assertIsNotNone(midx, "MIDX should be loaded")
+
+        # Get all objects from Git
+        result = run_git_or_fail(["rev-list", "--all", "--objects"], cwd=work_dir)
+        object_shas = [
+            line.split()[0].encode("ascii")
+            for line in result.decode("utf-8").strip().split("\n")
+            if line
+        ]
+
+        # Verify we can find these objects through the MIDX
+        found_count = 0
+        for sha_hex in object_shas:
+            # Convert hex to binary
+            sha_bin = bytes.fromhex(sha_hex.decode("ascii"))
+
+            # Check if it's in the MIDX
+            if sha_bin in midx:
+                found_count += 1
+
+                # Verify we can get the object location
+                result = midx.object_offset(sha_bin)
+                self.assertIsNotNone(result)
+                pack_name, offset = result
+                self.assertIsInstance(pack_name, str)
+                self.assertIsInstance(offset, int)
+                self.assertGreater(offset, 0)
+
+        # We should find at least some objects in the MIDX
+        self.assertGreater(found_count, 0, "Should find at least some objects in MIDX")
+
+    def test_midx_with_multiple_packs(self):
+        """Test MIDX functionality with multiple pack files."""
+        work_dir = self.create_test_repo_with_packs()
+
+        # Create multiple pack files explicitly
+        run_git_or_fail(["repack"], cwd=work_dir)
+        run_git_or_fail(["repack"], cwd=work_dir)
+
+        # Create MIDX with Git
+        run_git_or_fail(["multi-pack-index", "write"], cwd=work_dir)
+
+        # Load with Dulwich
+        midx_path = os.path.join(self.repo_path, "objects", "pack", "multi-pack-index")
+        midx = load_midx(midx_path)
+
+        # Should have multiple packs
+        # (Exact count may vary depending on Git version and repacking)
+        self.assertGreaterEqual(midx.pack_count, 1)
+
+        # Verify we can iterate over all entries
+        entries = list(midx.iterentries())
+        self.assertGreater(len(entries), 0)
+
+        # All entries should have valid structure
+        for sha, pack_name, offset in entries:
+            self.assertEqual(20, len(sha))  # SHA-1 is 20 bytes
+            self.assertIsInstance(pack_name, str)
+            # Git stores .idx extensions in MIDX files
+            self.assertTrue(pack_name.endswith(".idx"))
+            self.assertIsInstance(offset, int)
+            self.assertGreaterEqual(offset, 0)
+
+    def test_dulwich_object_store_with_git_midx(self):
+        """Test that DiskObjectStore can use Git-created MIDX for lookups."""
+        work_dir = self.create_test_repo_with_packs()
+
+        # Have Git create a MIDX file
+        run_git_or_fail(["multi-pack-index", "write"], cwd=work_dir)
+
+        # Load repo with Dulwich
+        repo = Repo(self.repo_path)
+
+        # Get a commit from the repo
+        result = run_git_or_fail(["rev-parse", "HEAD"], cwd=work_dir)
+        head_sha = result.decode("utf-8").strip().encode("ascii")
+
+        # Verify we can access it through Dulwich
+        # This should use the MIDX for lookup
+        obj = repo.object_store[head_sha]
+        self.assertIsNotNone(obj)
+        self.assertEqual(b"commit", obj.type_name)
+
+    def test_repack_with_midx(self):
+        """Test that repacking works correctly with MIDX present."""
+        work_dir = self.create_test_repo_with_packs()
+
+        # Create MIDX with Dulwich
+        repo = Repo(self.repo_path)
+        repo.object_store.write_midx()
+
+        # Verify Git can still repack
+        run_git_or_fail(["repack", "-d"], cwd=work_dir)
+
+        # The MIDX should still be readable
+        midx_path = os.path.join(self.repo_path, "objects", "pack", "multi-pack-index")
+        if os.path.exists(midx_path):  # Git may remove it during repack
+            midx = load_midx(midx_path)
+            self.assertGreaterEqual(len(midx), 0)

+ 19 - 19
tests/test_midx.py

@@ -60,7 +60,7 @@ class MIDXWriteTests(TestCase):
         # Create some fake pack entries
         pack_entries = [
             (
-                "pack-abc123.pack",
+                "pack-abc123.idx",
                 [
                     (b"\x01" * 20, 100, 0x12345678),  # sha, offset, crc32
                     (b"\x02" * 20, 200, 0x87654321),
@@ -78,25 +78,25 @@ class MIDXWriteTests(TestCase):
 
         self.assertEqual(3, len(midx))
         self.assertEqual(1, midx.pack_count)
-        self.assertEqual(["pack-abc123.pack"], midx.pack_names)
+        self.assertEqual(["pack-abc123.idx"], midx.pack_names)
 
         # Check object lookups
         result = midx.object_offset(b"\x01" * 20)
         self.assertIsNotNone(result)
         pack_name, offset = result
-        self.assertEqual("pack-abc123.pack", pack_name)
+        self.assertEqual("pack-abc123.idx", pack_name)
         self.assertEqual(100, offset)
 
         result = midx.object_offset(b"\x02" * 20)
         self.assertIsNotNone(result)
         pack_name, offset = result
-        self.assertEqual("pack-abc123.pack", pack_name)
+        self.assertEqual("pack-abc123.idx", pack_name)
         self.assertEqual(200, offset)
 
         result = midx.object_offset(b"\x03" * 20)
         self.assertIsNotNone(result)
         pack_name, offset = result
-        self.assertEqual("pack-abc123.pack", pack_name)
+        self.assertEqual("pack-abc123.idx", pack_name)
         self.assertEqual(300, offset)
 
         # Check non-existent object
@@ -109,14 +109,14 @@ class MIDXWriteTests(TestCase):
 
         pack_entries = [
             (
-                "pack-111.pack",
+                "pack-111.idx",
                 [
                     (b"\x01" * 20, 100, 0),
                     (b"\x03" * 20, 300, 0),
                 ],
             ),
             (
-                "pack-222.pack",
+                "pack-222.idx",
                 [
                     (b"\x02" * 20, 50, 0),
                     (b"\x04" * 20, 150, 0),
@@ -133,16 +133,16 @@ class MIDXWriteTests(TestCase):
 
         self.assertEqual(4, len(midx))
         self.assertEqual(2, midx.pack_count)
-        self.assertEqual(["pack-111.pack", "pack-222.pack"], midx.pack_names)
+        self.assertEqual(["pack-111.idx", "pack-222.idx"], midx.pack_names)
 
         # Objects should be findable across packs
         result = midx.object_offset(b"\x01" * 20)
         self.assertIsNotNone(result)
-        self.assertEqual("pack-111.pack", result[0])
+        self.assertEqual("pack-111.idx", result[0])
 
         result = midx.object_offset(b"\x02" * 20)
         self.assertIsNotNone(result)
-        self.assertEqual("pack-222.pack", result[0])
+        self.assertEqual("pack-222.idx", result[0])
 
     def test_write_large_offsets(self):
         """Test writing a MIDX file with large offsets (>= 2^31)."""
@@ -151,7 +151,7 @@ class MIDXWriteTests(TestCase):
         large_offset = 2**32  # Offset that requires LOFF chunk
         pack_entries = [
             (
-                "pack-large.pack",
+                "pack-large.idx",
                 [
                     (b"\x01" * 20, 100, 0),
                     (b"\x02" * 20, large_offset, 0),  # Large offset
@@ -185,7 +185,7 @@ class MIDXWriteTests(TestCase):
 
             pack_entries = [
                 (
-                    "pack-test.pack",
+                    "pack-test.idx",
                     [
                         (b"\xaa" * 20, 1000, 0),
                     ],
@@ -205,7 +205,7 @@ class MIDXWriteTests(TestCase):
             self.assertEqual(1, len(midx))
             result = midx.object_offset(b"\xaa" * 20)
             self.assertIsNotNone(result)
-            self.assertEqual("pack-test.pack", result[0])
+            self.assertEqual("pack-test.idx", result[0])
             self.assertEqual(1000, result[1])
 
 
@@ -217,7 +217,7 @@ class MIDXContainsTests(TestCase):
         f = BytesIO()
         pack_entries = [
             (
-                "pack-test.pack",
+                "pack-test.idx",
                 [
                     (b"\x01" * 20, 100, 0),
                     (b"\x02" * 20, 200, 0),
@@ -242,14 +242,14 @@ class MIDXIterEntriesTests(TestCase):
         f = BytesIO()
         pack_entries = [
             (
-                "pack-111.pack",
+                "pack-111.idx",
                 [
                     (b"\x01" * 20, 100, 0),
                     (b"\x03" * 20, 300, 0),
                 ],
             ),
             (
-                "pack-222.pack",
+                "pack-222.idx",
                 [
                     (b"\x02" * 20, 50, 0),
                 ],
@@ -265,13 +265,13 @@ class MIDXIterEntriesTests(TestCase):
 
         # Entries should be sorted by SHA
         self.assertEqual(b"\x01" * 20, entries[0][0])
-        self.assertEqual("pack-111.pack", entries[0][1])
+        self.assertEqual("pack-111.idx", entries[0][1])
         self.assertEqual(100, entries[0][2])
 
         self.assertEqual(b"\x02" * 20, entries[1][0])
-        self.assertEqual("pack-222.pack", entries[1][1])
+        self.assertEqual("pack-222.idx", entries[1][1])
         self.assertEqual(50, entries[1][2])
 
         self.assertEqual(b"\x03" * 20, entries[2][0])
-        self.assertEqual("pack-111.pack", entries[2][1])
+        self.assertEqual("pack-111.idx", entries[2][1])
         self.assertEqual(300, entries[2][2])