2 meses atrás · 2eaf3d4675
--- a/NEWS
+++ b/NEWS
@@ -1,5 +1,11 @@
 
				 0.22.9	UNRELEASED
			
 
				 
			
 
				+ * Add support for Git's ``feature.manyFiles`` configuration and index version 4.
			
 
				+   This enables compatibility with repositories using the manyFiles feature and
			
 
				+   provides faster index writes through optional hash skipping. Supports
			
 
				+   ``feature.manyFiles``, ``index.version``, and ``index.skipHash`` configuration
			
 
				+   options. Path prefix compression is not yet implemented. (Jelmer Vernooĳ, #1462)
			
 
				+
			
 
				  * In dulwich.porcelain docstring, list functions by their Python identifiers.
			
 
				    (Marnanel Thurman)
			
 
				 
			
--- a/dulwich/index.py
+++ b/dulwich/index.py
@@ -330,18 +330,57 @@ class UnsupportedIndexFormat(Exception):
 
				         self.index_format_version = version
			
 
				 
			
 
				 
			
 
				-def read_index(f: BinaryIO) -> Iterator[SerializedIndexEntry]:
			
 
				-    """Read an index file, yielding the individual entries."""
			
 
				+def read_index_header(f: BinaryIO) -> tuple[int, int]:
			
 
				+    """Read and validate the 12-byte header of an index file.
			
 
				+
			
 
				+    Args:
			
 
				+      f: Binary file object positioned at the start of the index header
			
 
				+    Returns:
			
 
				+      tuple of (version, num_entries)
			
 
				+    Raises:
			
 
				+      AssertionError: if the first four bytes are not ``b"DIRC"``
			
 
				+      UnsupportedIndexFormat: if the version is not 1, 2, 3 or 4
			
 
				+    """
			
 
				     header = f.read(4)
			
 
				     if header != b"DIRC":
			
 
				         raise AssertionError(f"Invalid index file header: {header!r}")
			
 
				+    # Two big-endian unsigned 32-bit integers: format version, entry count.
			
 
				     (version, num_entries) = struct.unpack(b">LL", f.read(4 * 2))
			
 
				-    if version not in (1, 2, 3):
			
 
				+    if version not in (1, 2, 3, 4):
			
 
				         raise UnsupportedIndexFormat(version)
			
 
				+    return version, num_entries
			
 
				+
			
 
				+
			
 
				+def read_index(f: BinaryIO) -> Iterator[SerializedIndexEntry]:
			
 
				+    """Read an index file, yielding the individual entries.
			
 
				+
			
 
				+    This is a generator: nothing is read from ``f``, and no header error is
			
 
				+    raised, until iteration starts.
			
 
				+
			
 
				+    Args:
			
 
				+      f: Binary file object positioned at the start of the index header
			
 
				+    Returns:
			
 
				+      Iterator over the entries, in the order they appear in the file
			
 
				+    """
			
 
				+    version, num_entries = read_index_header(f)
			
 
				     for i in range(num_entries):
			
 
				         yield read_cache_entry(f, version)
			
 
				 
			
 
				 
			
 
				+def read_index_dict_with_version(
			
 
				+    f: BinaryIO,
			
 
				+) -> tuple[dict[bytes, Union[IndexEntry, ConflictedIndexEntry]], int]:
			
 
				+    """Read an index file into a path-keyed dictionary, along with its version.
			
 
				+
			
 
				+    Entries in the normal stage are stored as ``IndexEntry``; entries in a
			
 
				+    merge-conflict stage are gathered into one ``ConflictedIndexEntry`` per path.
			
 
				+
			
 
				+    Args:
			
 
				+      f: Binary file object positioned at the start of the index header
			
 
				+    Returns:
			
 
				+      tuple of (entries_dict, version)
			
 
				+    Raises:
			
 
				+      AssertionError: if a conflict-stage entry is read for a path that
			
 
				+        already holds a non-conflicted entry
			
 
				+    """
			
 
				+    version, num_entries = read_index_header(f)
			
 
				+
			
 
				+    entries: dict[bytes, Union[IndexEntry, ConflictedIndexEntry]] = {}
			
 
				+    for _ in range(num_entries):
			
 
				+        entry = read_cache_entry(f, version)
			
 
				+        stage = entry.stage()
			
 
				+        if stage == Stage.NORMAL:
			
 
				+            entries[entry.name] = IndexEntry.from_serialized(entry)
			
 
				+            continue
			
 
				+
			
 
				+        # Conflict stages share one ConflictedIndexEntry per path.
			
 
				+        conflict = entries.setdefault(entry.name, ConflictedIndexEntry())
			
 
				+        if isinstance(conflict, IndexEntry):
			
 
				+            raise AssertionError(f"Non-conflicted entry for {entry.name!r} exists")
			
 
				+        if stage == Stage.MERGE_CONFLICT_ANCESTOR:
			
 
				+            conflict.ancestor = IndexEntry.from_serialized(entry)
			
 
				+        elif stage == Stage.MERGE_CONFLICT_THIS:
			
 
				+            conflict.this = IndexEntry.from_serialized(entry)
			
 
				+        elif stage == Stage.MERGE_CONFLICT_OTHER:
			
 
				+            conflict.other = IndexEntry.from_serialized(entry)
			
 
				+    return entries, version
			
 
				+
			
 
				+
			
 
				 def read_index_dict(f) -> dict[bytes, Union[IndexEntry, ConflictedIndexEntry]]:
			
 
				     """Read an index file and return it as a dictionary.
			
 
				        Dict Key is tuple of path and stage number, as
			
@@ -454,16 +493,25 @@ class Index:
 
				 
			
 
				     _byname: dict[bytes, Union[IndexEntry, ConflictedIndexEntry]]
			
 
				 
			
 
				-    def __init__(self, filename: Union[bytes, str], read=True) -> None:
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        filename: Union[bytes, str],
			
 
				+        read=True,
			
 
				+        skip_hash: bool = False,
			
 
				+        version: Optional[int] = None,
			
 
				+    ) -> None:
			
 
				         """Create an index object associated with the given filename.
			
 
				 
			
 
				         Args:
			
 
				           filename: Path to the index file
			
 
				           read: Whether to initialize the index from the given file, should it exist.
			
 
				+          skip_hash: Whether write() should store 20 zero bytes in place of the
			
 
				+            trailing SHA1 checksum (for the manyFiles feature)
			
 
				+          version: Index format version to use (None = auto-detect from file or
			
 
				+            use default). NOTE: read() assigns the version parsed from the file
			
 
				+            header, replacing this value.
			
 
				         """
			
 
				         self._filename = filename
			
 
				-        # TODO(jelmer): Store the version returned by read_index
			
 
				-        self._version = None
			
 
				+        # Requested format version; read() overwrites this with the version
			
 
				+        # parsed from the index file header.
			
 
				+        self._version = version
			
 
				+        self._skip_hash = skip_hash
			
 
				         self.clear()
			
 
				         if read:
			
 
				             self.read()
			
@@ -479,10 +527,19 @@ class Index:
 
				         """Write current contents of index to disk."""
			
 
				         f = GitFile(self._filename, "wb")
			
 
				         try:
			
 
				-            f = SHA1Writer(f)
			
 
				-            write_index_dict(f, self._byname, version=self._version)
			
 
				-        finally:
			
 
				+            if self._skip_hash:
			
 
				+                # When skipHash is enabled, write the index without computing SHA1
			
 
				+                write_index_dict(f, self._byname, version=self._version)
			
 
				+                # Write 20 zero bytes instead of SHA1
			
 
				+                f.write(b"\x00" * 20)
			
 
				+                f.close()
			
 
				+            else:
			
 
				+                f = SHA1Writer(f)
			
 
				+                write_index_dict(f, self._byname, version=self._version)
			
 
				+                f.close()
			
 
				+        except:
			
 
				             f.close()
			
 
				+            raise
			
 
				 
			
 
				     def read(self) -> None:
			
 
				         """Read current contents of index from disk."""
			
@@ -491,7 +548,9 @@ class Index:
 
				         f = GitFile(self._filename, "rb")
			
 
				         try:
			
 
				             f = SHA1Reader(f)
			
 
				-            self.update(read_index_dict(f))
			
 
				+            entries, version = read_index_dict_with_version(f)
			
 
				+            self._version = version
			
 
				+            self.update(entries)
			
 
				             # FIXME: Additional data?
			
 
				             f.read(os.path.getsize(self._filename) - f.tell() - 20)
			
 
				             f.check_sha(allow_empty=True)
			
--- a/dulwich/repo.py
+++ b/dulwich/repo.py
@@ -1369,7 +1369,31 @@ class Repo(BaseRepo):
 
				 
			
 
				         if not self.has_index():
			
 
				             raise NoIndexPresent
			
 
				-        return Index(self.index_path())
			
 
				+
			
 
				+        # Check for manyFiles feature configuration
			
 
				+        config = self.get_config_stack()
			
 
				+        many_files = config.get_boolean(b"feature", b"manyFiles", False)
			
 
				+        skip_hash = False
			
 
				+        index_version = None
			
 
				+
			
 
				+        if many_files:
			
 
				+            # When feature.manyFiles is enabled, set index.version=4 and index.skipHash=true
			
 
				+            try:
			
 
				+                index_version_str = config.get(b"index", b"version")
			
 
				+                index_version = int(index_version_str)
			
 
				+            except KeyError:
			
 
				+                index_version = 4  # Default to version 4 for manyFiles
			
 
				+            skip_hash = config.get_boolean(b"index", b"skipHash", True)
			
 
				+        else:
			
 
				+            # Check for explicit index settings
			
 
				+            try:
			
 
				+                index_version_str = config.get(b"index", b"version")
			
 
				+                index_version = int(index_version_str)
			
 
				+            except KeyError:
			
 
				+                index_version = None
			
 
				+            skip_hash = config.get_boolean(b"index", b"skipHash", False)
			
 
				+
			
 
				+        return Index(self.index_path(), skip_hash=skip_hash, version=index_version)
			
 
				 
			
 
				     def has_index(self) -> bool:
			
 
				         """Check if an index is present."""
			
--- a/tests/test_index.py
+++ b/tests/test_index.py
@@ -1211,3 +1211,163 @@ class TestIndexEntryFromPath(TestCase):
 
				         self.assertEqual(
			
 
				             sorted(changes), [b"conflict", b"file1", b"file2", b"file3", b"file4"]
			
 
				         )
			
 
				+
			
 
				+
			
 
				+class TestManyFilesFeature(TestCase):
			
 
				+    """Tests for the manyFiles feature (index version 4 and skipHash)."""
			
 
				+
			
 
				+    def setUp(self):
			
 
				+        # Chain to the base class so any fixture setup it performs still runs.
			
 
				+        super().setUp()
			
 
				+        self.tempdir = tempfile.mkdtemp()
			
 
				+        self.addCleanup(shutil.rmtree, self.tempdir)
			
 
				+        self.index_path = os.path.join(self.tempdir, "index")
			
 
				+
			
 
				+    def _make_entry(self):
			
 
				+        """Return a regular-file IndexEntry with fixed, arbitrary metadata."""
			
 
				+        return IndexEntry(
			
 
				+            ctime=(1234567890, 0),
			
 
				+            mtime=(1234567890, 0),
			
 
				+            dev=1,
			
 
				+            ino=1,
			
 
				+            mode=0o100644,
			
 
				+            uid=1000,
			
 
				+            gid=1000,
			
 
				+            size=5,
			
 
				+            sha=b"0" * 40,
			
 
				+        )
			
 
				+
			
 
				+    def test_index_version_4_parsing(self):
			
 
				+        """Test that index version 4 files can be parsed."""
			
 
				+        # Create an index with version 4 holding a single entry
			
 
				+        index = Index(self.index_path, read=False, version=4)
			
 
				+        index[b"test.txt"] = self._make_entry()
			
 
				+        index.write()
			
 
				+
			
 
				+        # Read the index back; no version is passed, so 4 must come from the file
			
 
				+        index2 = Index(self.index_path)
			
 
				+        self.assertEqual(index2._version, 4)
			
 
				+        self.assertIn(b"test.txt", index2)
			
 
				+
			
 
				+    def test_skip_hash_feature(self):
			
 
				+        """Test that skipHash feature works correctly."""
			
 
				+        # Create an index with skipHash enabled holding a single entry
			
 
				+        index = Index(self.index_path, read=False, skip_hash=True)
			
 
				+        index[b"test.txt"] = self._make_entry()
			
 
				+        index.write()
			
 
				+
			
 
				+        # Verify the file was written with zero hash
			
 
				+        with open(self.index_path, "rb") as f:
			
 
				+            f.seek(-20, os.SEEK_END)  # The checksum is the final 20 bytes
			
 
				+            trailing_hash = f.read(20)
			
 
				+        self.assertEqual(trailing_hash, b"\x00" * 20)
			
 
				+
			
 
				+        # Verify we can still read it back
			
 
				+        index2 = Index(self.index_path)
			
 
				+        self.assertIn(b"test.txt", index2)
			
 
				+
			
 
				+    def test_version_4_no_padding(self):
			
 
				+        """Test that version 4 entries have no padding."""
			
 
				+        from dulwich.index import read_cache_entry, write_cache_entry
			
 
				+
			
 
				+        # One entry, serialized below as both version 2 and version 4
			
 
				+        entry = SerializedIndexEntry(
			
 
				+            name=b"test.txt",
			
 
				+            ctime=(1234567890, 0),
			
 
				+            mtime=(1234567890, 0),
			
 
				+            dev=1,
			
 
				+            ino=1,
			
 
				+            mode=0o100644,
			
 
				+            uid=1000,
			
 
				+            gid=1000,
			
 
				+            size=5,
			
 
				+            sha=b"0" * 40,
			
 
				+            flags=len(b"test.txt"),
			
 
				+            extended_flags=0,
			
 
				+        )
			
 
				+
			
 
				+        # Version 2 (with padding)
			
 
				+        buf_v2 = BytesIO()
			
 
				+        write_cache_entry(buf_v2, entry, version=2)
			
 
				+
			
 
				+        # Version 4 (without padding)
			
 
				+        buf_v4 = BytesIO()
			
 
				+        write_cache_entry(buf_v4, entry, version=4)
			
 
				+
			
 
				+        # Version 4 should be shorter due to no padding
			
 
				+        self.assertLess(len(buf_v4.getvalue()), len(buf_v2.getvalue()))
			
 
				+
			
 
				+        # Both should parse correctly
			
 
				+        buf_v2.seek(0)
			
 
				+        parsed_v2 = read_cache_entry(buf_v2, version=2)
			
 
				+        buf_v4.seek(0)
			
 
				+        parsed_v4 = read_cache_entry(buf_v4, version=4)
			
 
				+
			
 
				+        # Both should have the same content
			
 
				+        self.assertEqual(parsed_v2.name, parsed_v4.name)
			
 
				+        self.assertEqual(parsed_v2.sha, parsed_v4.sha)
			
 
				+
			
 
				+
			
 
				+class TestManyFilesRepoIntegration(TestCase):
			
 
				+    """Tests for manyFiles feature integration with Repo."""
			
 
				+
			
 
				+    def setUp(self):
			
 
				+        # Chain to the base class so any fixture setup it performs still runs.
			
 
				+        super().setUp()
			
 
				+        self.tempdir = tempfile.mkdtemp()
			
 
				+        self.addCleanup(shutil.rmtree, self.tempdir)
			
 
				+
			
 
				+    def _init_repo(self, settings):
			
 
				+        """Create a repository whose config contains the given settings.
			
 
				+
			
 
				+        Args:
			
 
				+          settings: iterable of (section, name, value) byte-string triples
			
 
				+        Returns:
			
 
				+          the new Repo; it is closed automatically during test cleanup
			
 
				+        """
			
 
				+        from dulwich.repo import Repo
			
 
				+
			
 
				+        repo = Repo.init(self.tempdir)
			
 
				+        # Cleanups run in reverse order of registration, so the repo is
			
 
				+        # closed before the temporary directory is removed.
			
 
				+        self.addCleanup(repo.close)
			
 
				+
			
 
				+        config = repo.get_config()
			
 
				+        for section, name, value in settings:
			
 
				+            config.set(section, name, value)
			
 
				+        config.write_to_path()
			
 
				+        return repo
			
 
				+
			
 
				+    def test_repo_with_manyfiles_config(self):
			
 
				+        """Test that a repository with feature.manyFiles=true uses the right settings."""
			
 
				+        repo = self._init_repo([(b"feature", b"manyFiles", b"true")])
			
 
				+
			
 
				+        # Open the index - should have skipHash enabled and version 4
			
 
				+        index = repo.open_index()
			
 
				+        self.assertTrue(index._skip_hash)
			
 
				+        self.assertEqual(index._version, 4)
			
 
				+
			
 
				+    def test_repo_with_explicit_index_settings(self):
			
 
				+        """Test that explicit index.version and index.skipHash work."""
			
 
				+        repo = self._init_repo(
			
 
				+            [
			
 
				+                (b"index", b"version", b"3"),
			
 
				+                (b"index", b"skipHash", b"false"),
			
 
				+            ]
			
 
				+        )
			
 
				+
			
 
				+        # Open the index - should respect explicit settings
			
 
				+        index = repo.open_index()
			
 
				+        self.assertFalse(index._skip_hash)
			
 
				+        self.assertEqual(index._version, 3)