Ver código fonte

Add basic support for manyfiles feature. Fixes #1462

Jelmer Vernooij 2 meses atrás
pai
commit
2eaf3d4675
4 arquivos alterados com 259 adições e 10 exclusões
  1. 6 0
      NEWS
  2. 68 9
      dulwich/index.py
  3. 25 1
      dulwich/repo.py
  4. 160 0
      tests/test_index.py

+ 6 - 0
NEWS

@@ -1,5 +1,11 @@
 0.22.9	UNRELEASED
 
+ * Add support for Git's ``feature.manyFiles`` configuration and index version 4.
+   This enables compatibility with repositories using the manyFiles feature and
+   provides faster index writes through optional hash skipping. Supports the
+   ``feature.manyFiles``, ``index.version``, and ``index.skipHash`` configuration
+   options. Path prefix compression is not yet implemented. (Jelmer Vernooij, #1462)
+
  * In dulwich.porcelain docstring, list functions by their Python identifiers.
    (Marnanel Thurman)
 

+ 68 - 9
dulwich/index.py

@@ -330,18 +330,57 @@ class UnsupportedIndexFormat(Exception):
         self.index_format_version = version
 
 
-def read_index(f: BinaryIO) -> Iterator[SerializedIndexEntry]:
-    """Read an index file, yielding the individual entries."""
+def read_index_header(f: BinaryIO) -> tuple[int, int]:
+    """Read an index header from a file.
+
+    Returns:
+      tuple of (version, num_entries)
+    """
     header = f.read(4)
     if header != b"DIRC":
         raise AssertionError(f"Invalid index file header: {header!r}")
     (version, num_entries) = struct.unpack(b">LL", f.read(4 * 2))
-    if version not in (1, 2, 3):
+    if version not in (1, 2, 3, 4):
         raise UnsupportedIndexFormat(version)
+    return version, num_entries
+
+
+def read_index(f: BinaryIO) -> Iterator[SerializedIndexEntry]:
+    """Read an index file, yielding the individual entries."""
+    version, num_entries = read_index_header(f)
     for i in range(num_entries):
         yield read_cache_entry(f, version)
 
 
+def read_index_dict_with_version(
+    f: BinaryIO,
+) -> tuple[dict[bytes, Union[IndexEntry, ConflictedIndexEntry]], int]:
+    """Read an index file and return it as a dictionary along with the version.
+
+    Returns:
+      tuple of (entries_dict, version)
+    """
+    version, num_entries = read_index_header(f)
+
+    ret: dict[bytes, Union[IndexEntry, ConflictedIndexEntry]] = {}
+    for i in range(num_entries):
+        entry = read_cache_entry(f, version)
+        stage = entry.stage()
+        if stage == Stage.NORMAL:
+            ret[entry.name] = IndexEntry.from_serialized(entry)
+        else:
+            existing = ret.setdefault(entry.name, ConflictedIndexEntry())
+            if isinstance(existing, IndexEntry):
+                raise AssertionError(f"Non-conflicted entry for {entry.name!r} exists")
+            if stage == Stage.MERGE_CONFLICT_ANCESTOR:
+                existing.ancestor = IndexEntry.from_serialized(entry)
+            elif stage == Stage.MERGE_CONFLICT_THIS:
+                existing.this = IndexEntry.from_serialized(entry)
+            elif stage == Stage.MERGE_CONFLICT_OTHER:
+                existing.other = IndexEntry.from_serialized(entry)
+    return ret, version
+
+
 def read_index_dict(f) -> dict[bytes, Union[IndexEntry, ConflictedIndexEntry]]:
     """Read an index file and return it as a dictionary.
        Dict Key is tuple of path and stage number, as
@@ -454,16 +493,25 @@ class Index:
 
     _byname: dict[bytes, Union[IndexEntry, ConflictedIndexEntry]]
 
-    def __init__(self, filename: Union[bytes, str], read=True) -> None:
+    def __init__(
+        self,
+        filename: Union[bytes, str],
+        read=True,
+        skip_hash: bool = False,
+        version: Optional[int] = None,
+    ) -> None:
         """Create an index object associated with the given filename.
 
         Args:
           filename: Path to the index file
           read: Whether to initialize the index from the given file, should it exist.
+          skip_hash: Whether to skip SHA1 hash when writing (for manyfiles feature)
+          version: Index format version to use (None = auto-detect from file or use default)
         """
         self._filename = filename
         # NOTE: when an existing index file is read, read() overwrites this with the on-disk version
-        self._version = None
+        self._version = version
+        self._skip_hash = skip_hash
         self.clear()
         if read:
             self.read()
@@ -479,10 +527,19 @@ class Index:
         """Write current contents of index to disk."""
         f = GitFile(self._filename, "wb")
         try:
-            f = SHA1Writer(f)
-            write_index_dict(f, self._byname, version=self._version)
-        finally:
+            if self._skip_hash:
+                # When skipHash is enabled, write the index without computing SHA1
+                write_index_dict(f, self._byname, version=self._version)
+                # Write 20 zero bytes instead of SHA1
+                f.write(b"\x00" * 20)
+                f.close()
+            else:
+                f = SHA1Writer(f)
+                write_index_dict(f, self._byname, version=self._version)
+                f.close()
+        except:
             f.close()
+            raise
 
     def read(self) -> None:
         """Read current contents of index from disk."""
@@ -491,7 +548,9 @@ class Index:
         f = GitFile(self._filename, "rb")
         try:
             f = SHA1Reader(f)
-            self.update(read_index_dict(f))
+            entries, version = read_index_dict_with_version(f)
+            self._version = version
+            self.update(entries)
             # FIXME: Additional data?
             f.read(os.path.getsize(self._filename) - f.tell() - 20)
             f.check_sha(allow_empty=True)

+ 25 - 1
dulwich/repo.py

@@ -1369,7 +1369,31 @@ class Repo(BaseRepo):
 
         if not self.has_index():
             raise NoIndexPresent
-        return Index(self.index_path())
+
+        # Check for manyFiles feature configuration
+        config = self.get_config_stack()
+        many_files = config.get_boolean(b"feature", b"manyFiles", False)
+        skip_hash = False
+        index_version = None
+
+        if many_files:
+            # feature.manyFiles only changes the defaults (version 4, skipHash on); explicit index.* settings win
+            try:
+                index_version_str = config.get(b"index", b"version")
+                index_version = int(index_version_str)
+            except KeyError:
+                index_version = 4  # Default to version 4 for manyFiles
+            skip_hash = config.get_boolean(b"index", b"skipHash", True)
+        else:
+            # Check for explicit index settings
+            try:
+                index_version_str = config.get(b"index", b"version")
+                index_version = int(index_version_str)
+            except KeyError:
+                index_version = None
+            skip_hash = config.get_boolean(b"index", b"skipHash", False)
+
+        return Index(self.index_path(), skip_hash=skip_hash, version=index_version)
 
     def has_index(self) -> bool:
         """Check if an index is present."""

+ 160 - 0
tests/test_index.py

@@ -1211,3 +1211,163 @@ class TestIndexEntryFromPath(TestCase):
         self.assertEqual(
             sorted(changes), [b"conflict", b"file1", b"file2", b"file3", b"file4"]
         )
+
+
+class TestManyFilesFeature(TestCase):
+    """Tests for the manyFiles feature (index version 4 and skipHash)."""
+
+    def setUp(self):
+        self.tempdir = tempfile.mkdtemp()
+        self.addCleanup(shutil.rmtree, self.tempdir)
+
+    def test_index_version_4_parsing(self):
+        """Test that index version 4 files can be parsed."""
+        index_path = os.path.join(self.tempdir, "index")
+
+        # Create an index with version 4
+        index = Index(index_path, read=False, version=4)
+
+        # Add some entries
+        entry = IndexEntry(
+            ctime=(1234567890, 0),
+            mtime=(1234567890, 0),
+            dev=1,
+            ino=1,
+            mode=0o100644,
+            uid=1000,
+            gid=1000,
+            size=5,
+            sha=b"0" * 40,
+        )
+        index[b"test.txt"] = entry
+
+        # Write the index to disk
+        index.write()
+
+        # Read the index back
+        index2 = Index(index_path)
+        self.assertEqual(index2._version, 4)
+        self.assertIn(b"test.txt", index2)
+
+    def test_skip_hash_feature(self):
+        """Test that skipHash feature works correctly."""
+        index_path = os.path.join(self.tempdir, "index")
+
+        # Create an index with skipHash enabled
+        index = Index(index_path, read=False, skip_hash=True)
+
+        # Add some entries
+        entry = IndexEntry(
+            ctime=(1234567890, 0),
+            mtime=(1234567890, 0),
+            dev=1,
+            ino=1,
+            mode=0o100644,
+            uid=1000,
+            gid=1000,
+            size=5,
+            sha=b"0" * 40,
+        )
+        index[b"test.txt"] = entry
+
+        # Write the index
+        index.write()
+
+        # Verify the file was written with zero hash
+        with open(index_path, "rb") as f:
+            f.seek(-20, 2)  # Seek to last 20 bytes
+            trailing_hash = f.read(20)
+            self.assertEqual(trailing_hash, b"\x00" * 20)
+
+        # Verify we can still read it back
+        index2 = Index(index_path)
+        self.assertIn(b"test.txt", index2)
+
+    def test_version_4_no_padding(self):
+        """Test that version 4 entries have no padding."""
+        # Create a single entry to serialize in both version 2 and version 4 formats
+        entry = SerializedIndexEntry(
+            name=b"test.txt",
+            ctime=(1234567890, 0),
+            mtime=(1234567890, 0),
+            dev=1,
+            ino=1,
+            mode=0o100644,
+            uid=1000,
+            gid=1000,
+            size=5,
+            sha=b"0" * 40,
+            flags=len(b"test.txt"),
+            extended_flags=0,
+        )
+
+        # Test version 2 (with padding)
+        buf_v2 = BytesIO()
+        from dulwich.index import write_cache_entry
+
+        write_cache_entry(buf_v2, entry, version=2)
+        v2_data = buf_v2.getvalue()
+
+        # Test version 4 (without padding)
+        buf_v4 = BytesIO()
+        write_cache_entry(buf_v4, entry, version=4)
+        v4_data = buf_v4.getvalue()
+
+        # Version 4 should be shorter due to no padding
+        self.assertLess(len(v4_data), len(v2_data))
+
+        # Both should parse correctly
+        buf_v2.seek(0)
+        from dulwich.index import read_cache_entry
+
+        parsed_v2 = read_cache_entry(buf_v2, version=2)
+
+        buf_v4.seek(0)
+        parsed_v4 = read_cache_entry(buf_v4, version=4)
+
+        # Both should have the same content
+        self.assertEqual(parsed_v2.name, parsed_v4.name)
+        self.assertEqual(parsed_v2.sha, parsed_v4.sha)
+
+
+class TestManyFilesRepoIntegration(TestCase):
+    """Tests for manyFiles feature integration with Repo."""
+
+    def setUp(self):
+        self.tempdir = tempfile.mkdtemp()
+        self.addCleanup(shutil.rmtree, self.tempdir)
+
+    def test_repo_with_manyfiles_config(self):
+        """Test that a repository with feature.manyFiles=true uses the right settings."""
+        from dulwich.repo import Repo
+
+        # Create a new repository
+        repo = Repo.init(self.tempdir)
+
+        # Set feature.manyFiles=true in config
+        config = repo.get_config()
+        config.set(b"feature", b"manyFiles", b"true")
+        config.write_to_path()
+
+        # Open the index - should have skipHash enabled and version 4
+        index = repo.open_index()
+        self.assertTrue(index._skip_hash)
+        self.assertEqual(index._version, 4)
+
+    def test_repo_with_explicit_index_settings(self):
+        """Test that explicit index.version and index.skipHash work."""
+        from dulwich.repo import Repo
+
+        # Create a new repository
+        repo = Repo.init(self.tempdir)
+
+        # Set explicit index settings
+        config = repo.get_config()
+        config.set(b"index", b"version", b"3")
+        config.set(b"index", b"skipHash", b"false")
+        config.write_to_path()
+
+        # Open the index - should respect explicit settings
+        index = repo.open_index()
+        self.assertFalse(index._skip_hash)
+        self.assertEqual(index._version, 3)