
Add sparse index support for improved performance with large repositories (#1946)

- Read and write sparse directory entries with SKIP_WORKTREE flag
- Index expansion (sparse -> full) and collapse (full -> sparse) operations
- Sparse directory extension (sdir) for compatibility signaling
- Detection methods for identifying sparse directory entries

Fixes #1797
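
Below is a minimal sketch of how the API added in this commit might be used to read a sparse index, detect sparse directory entries, and expand it back into a full index. The repository path is a placeholder; Index, is_sparse(), is_sparse_dir() and ensure_full_index() are the names introduced by this change.

import os

from dulwich.index import Index
from dulwich.repo import Repo

repo_path = "/path/to/repo"  # placeholder path, for illustration only
repo = Repo(repo_path)
idx = Index(os.path.join(repo_path, ".git", "index"))

if idx.is_sparse():
    # Sparse directory entries are collapsed tree references: directory
    # mode, the SKIP_WORKTREE extended flag, and a trailing '/' in the path.
    for path, entry in idx.items():
        if entry.is_sparse_dir(path):
            print("sparse dir:", path)

    # Expand every sparse directory entry into its constituent file entries
    # and drop the 'sdir' extension.
    idx.ensure_full_index(repo.object_store)
    idx.write()
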
Jelmer Vernooij, 3 months ago
commit 1702e77e70
7 changed files with 795 additions and 14 deletions
  1. Cargo.lock (+10 -10)
  2. Cargo.toml (+2 -2)
  3. NEWS (+7 -0)
  4. dulwich/__init__.py (+1 -1)
  5. dulwich/index.py (+282 -1)
  6. tests/compat/test_index.py (+220 -0)
  7. tests/test_index.py (+273 -0)

+ 10 - 10
Cargo.lock

@@ -87,9 +87,9 @@ dependencies = [
 
 [[package]]
 name = "pyo3"
-version = "0.26.0"
+version = "0.27.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7ba0117f4212101ee6544044dae45abe1083d30ce7b29c4b5cbdfa2354e07383"
+checksum = "fa8e48c12afdeb26aa4be4e5c49fb5e11c3efa0878db783a960eea2b9ac6dd19"
 dependencies = [
  "indoc",
  "libc",
@@ -104,18 +104,18 @@ dependencies = [
 
 [[package]]
 name = "pyo3-build-config"
-version = "0.26.0"
+version = "0.27.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4fc6ddaf24947d12a9aa31ac65431fb1b851b8f4365426e182901eabfb87df5f"
+checksum = "bc1989dbf2b60852e0782c7487ebf0b4c7f43161ffe820849b56cf05f945cee1"
 dependencies = [
  "target-lexicon",
 ]
 
 [[package]]
 name = "pyo3-ffi"
-version = "0.26.0"
+version = "0.27.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "025474d3928738efb38ac36d4744a74a400c901c7596199e20e45d98eb194105"
+checksum = "c808286da7500385148930152e54fb6883452033085bf1f857d85d4e82ca905c"
 dependencies = [
  "libc",
  "pyo3-build-config",
@@ -123,9 +123,9 @@ dependencies = [
 
 [[package]]
 name = "pyo3-macros"
-version = "0.26.0"
+version = "0.27.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2e64eb489f22fe1c95911b77c44cc41e7c19f3082fc81cce90f657cdc42ffded"
+checksum = "83a0543c16be0d86cf0dbf2e2b636ece9fd38f20406bb43c255e0bc368095f92"
 dependencies = [
  "proc-macro2",
  "pyo3-macros-backend",
@@ -135,9 +135,9 @@ dependencies = [
 
 [[package]]
 name = "pyo3-macros-backend"
-version = "0.26.0"
+version = "0.27.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "100246c0ecf400b475341b8455a9213344569af29a3c841d29270e53102e0fcf"
+checksum = "2a00da2ce064dcd582448ea24a5a26fa9527e0483103019b741ebcbe632dcd29"
 dependencies = [
  "heck",
  "proc-macro2",

+ 2 - 2
Cargo.toml

@@ -3,7 +3,7 @@ members = ["crates/*"]
 resolver = "2"
 
 [workspace.dependencies]
-pyo3 = ">=0.25,<0.27"
+pyo3 = ">=0.25,<0.28"
 
 [workspace.package]
-version = "0.24.6"
+version = "0.24.7"

+ 7 - 0
NEWS

@@ -1,3 +1,10 @@
+0.24.7	UNRELEASED
+
+ * Add sparse index support for improved performance with large repositories.
+   Implements reading and writing of sparse directory entries, index expansion/
+   collapse operations, and the 'sdir' extension.
+   (Jelmer Vernooij, #1797)
+
 0.24.6	2025-10-19
 
  * Fix import failure when ``sys.stdin`` is ``None``. The ``dulwich.server``

+ 1 - 1
dulwich/__init__.py

@@ -31,7 +31,7 @@ if sys.version_info >= (3, 10):
 else:
     from typing_extensions import ParamSpec
 
-__version__ = (0, 24, 6)
+__version__ = (0, 24, 7)
 
 __all__ = ["__version__", "replace_me"]
 

+ 282 - 1
dulwich/index.py

@@ -28,7 +28,7 @@ import stat
 import struct
 import sys
 import types
-from collections.abc import Generator, Iterable, Iterator, Mapping, Sequence
+from collections.abc import Generator, Iterable, Iterator, Mapping, Sequence, Set
 from dataclasses import dataclass
 from enum import Enum
 from typing import (
@@ -94,6 +94,7 @@ REUC_EXTENSION = b"REUC"
 UNTR_EXTENSION = b"UNTR"
 EOIE_EXTENSION = b"EOIE"
 IEOT_EXTENSION = b"IEOT"
+SDIR_EXTENSION = b"sdir"  # Sparse directory extension
 
 
 def _encode_varint(value: int) -> bytes:
@@ -303,6 +304,25 @@ class SerializedIndexEntry:
         """
         return Stage((self.flags & FLAG_STAGEMASK) >> FLAG_STAGESHIFT)
 
+    def is_sparse_dir(self) -> bool:
+        """Check if this entry represents a sparse directory.
+
+        A sparse directory entry is a collapsed representation of an entire
+        directory tree in a sparse index. It has:
+        - Directory mode (0o040000)
+        - SKIP_WORKTREE flag set
+        - Path ending with '/'
+        - SHA pointing to a tree object
+
+        Returns:
+          True if entry is a sparse directory entry
+        """
+        return (
+            stat.S_ISDIR(self.mode)
+            and bool(self.extended_flags & EXTENDED_FLAG_SKIP_WORKTREE)
+            and self.name.endswith(b"/")
+        )
+
 
 @dataclass
 class IndexExtension:
@@ -327,6 +347,8 @@ class IndexExtension:
             return ResolveUndoExtension.from_bytes(data)
         elif signature == UNTR_EXTENSION:
             return UntrackedExtension.from_bytes(data)
+        elif signature == SDIR_EXTENSION:
+            return SparseDirExtension.from_bytes(data)
         else:
             # Unknown extension - just store raw data
             return cls(signature, data)
@@ -430,6 +452,41 @@ class UntrackedExtension(IndexExtension):
         return cls(data)
 
 
+class SparseDirExtension(IndexExtension):
+    """Sparse directory extension.
+
+    This extension indicates that the index contains sparse directory entries.
+    Tools that don't understand sparse index should avoid interacting with
+    the index when this extension is present.
+
+    The extension data is empty - its presence is the signal.
+    """
+
+    def __init__(self) -> None:
+        """Initialize SparseDirExtension."""
+        super().__init__(SDIR_EXTENSION, b"")
+
+    @classmethod
+    def from_bytes(cls, data: bytes) -> "SparseDirExtension":
+        """Parse SparseDirExtension from bytes.
+
+        Args:
+          data: Raw bytes to parse (should be empty)
+
+        Returns:
+          SparseDirExtension instance
+        """
+        return cls()
+
+    def to_bytes(self) -> bytes:
+        """Serialize SparseDirExtension to bytes.
+
+        Returns:
+          Empty bytes (extension presence is the signal)
+        """
+        return b""
+
+
 @dataclass
 class IndexEntry:
     """Represents an entry in the Git index.
@@ -532,6 +589,28 @@ class IndexEntry:
             if self.extended_flags == 0:
                 self.flags &= ~FLAG_EXTENDED
 
+    def is_sparse_dir(self, name: bytes) -> bool:
+        """Check if this entry represents a sparse directory.
+
+        A sparse directory entry is a collapsed representation of an entire
+        directory tree in a sparse index. It has:
+        - Directory mode (0o040000)
+        - SKIP_WORKTREE flag set
+        - Path ending with '/'
+        - SHA pointing to a tree object
+
+        Args:
+          name: The path name for this entry (IndexEntry doesn't store name)
+
+        Returns:
+          True if entry is a sparse directory entry
+        """
+        return (
+            stat.S_ISDIR(self.mode)
+            and bool(self.extended_flags & EXTENDED_FLAG_SKIP_WORKTREE)
+            and name.endswith(b"/")
+        )
+
 
 class ConflictedIndexEntry:
     """Index entry that represents a conflict."""
@@ -1219,6 +1298,208 @@ class Index:
         """
         return commit_tree(object_store, self.iterobjects())
 
+    def is_sparse(self) -> bool:
+        """Check if this index contains sparse directory entries.
+
+        Returns:
+          True if any sparse directory extension is present
+        """
+        return any(isinstance(ext, SparseDirExtension) for ext in self._extensions)
+
+    def ensure_full_index(self, object_store: "BaseObjectStore") -> None:
+        """Expand all sparse directory entries into full file entries.
+
+        This converts a sparse index into a full index by recursively
+        expanding any sparse directory entries into their constituent files.
+
+        Args:
+          object_store: Object store to read tree objects from
+
+        Raises:
+          KeyError: If a tree object referenced by a sparse dir entry doesn't exist
+        """
+        if not self.is_sparse():
+            return
+
+        # Find all sparse directory entries
+        sparse_dirs = []
+        for path, entry in list(self._byname.items()):
+            if isinstance(entry, IndexEntry) and entry.is_sparse_dir(path):
+                sparse_dirs.append((path, entry))
+
+        # Expand each sparse directory
+        for path, entry in sparse_dirs:
+            # Remove the sparse directory entry
+            del self._byname[path]
+
+            # Get the tree object
+            tree = object_store[entry.sha]
+            if not isinstance(tree, Tree):
+                raise ValueError(f"Sparse directory {path!r} points to non-tree object")
+
+            # Recursively add all entries from the tree
+            self._expand_tree(path.rstrip(b"/"), tree, object_store, entry)
+
+        # Remove the sparse directory extension
+        self._extensions = [
+            ext for ext in self._extensions if not isinstance(ext, SparseDirExtension)
+        ]
+
+    def _expand_tree(
+        self,
+        prefix: bytes,
+        tree: Tree,
+        object_store: "BaseObjectStore",
+        template_entry: IndexEntry,
+    ) -> None:
+        """Recursively expand a tree into index entries.
+
+        Args:
+          prefix: Path prefix for entries (without trailing slash)
+          tree: Tree object to expand
+          object_store: Object store to read nested trees from
+          template_entry: Template entry to copy metadata from
+        """
+        for name, mode, sha in tree.items():
+            if prefix:
+                full_path = prefix + b"/" + name
+            else:
+                full_path = name
+
+            if stat.S_ISDIR(mode):
+                # Recursively expand subdirectories
+                subtree = object_store[sha]
+                if not isinstance(subtree, Tree):
+                    raise ValueError(
+                        f"Directory entry {full_path!r} points to non-tree object"
+                    )
+                self._expand_tree(full_path, subtree, object_store, template_entry)
+            else:
+                # Create an index entry for this file
+                # Use the template entry for metadata but with the file's sha and mode
+                new_entry = IndexEntry(
+                    ctime=template_entry.ctime,
+                    mtime=template_entry.mtime,
+                    dev=template_entry.dev,
+                    ino=template_entry.ino,
+                    mode=mode,
+                    uid=template_entry.uid,
+                    gid=template_entry.gid,
+                    size=0,  # Size is unknown from tree
+                    sha=sha,
+                    flags=0,
+                    extended_flags=0,  # Don't copy skip-worktree flag
+                )
+                self._byname[full_path] = new_entry
+
+    def convert_to_sparse(
+        self,
+        object_store: "BaseObjectStore",
+        tree_sha: bytes,
+        sparse_dirs: Set[bytes],
+    ) -> None:
+        """Convert full index entries to sparse directory entries.
+
+        This collapses directories that are entirely outside the sparse
+        checkout cone into single sparse directory entries.
+
+        Args:
+          object_store: Object store to read tree objects
+          tree_sha: SHA of the tree (usually HEAD) to base sparse dirs on
+          sparse_dirs: Set of directory paths (with trailing /) to collapse
+
+        Raises:
+          KeyError: If tree_sha or a subdirectory doesn't exist
+        """
+        if not sparse_dirs:
+            return
+
+        # Get the base tree
+        tree = object_store[tree_sha]
+        if not isinstance(tree, Tree):
+            raise ValueError(f"tree_sha {tree_sha!r} is not a tree object")
+
+        # For each sparse directory, find its tree SHA and create sparse entry
+        for dir_path in sparse_dirs:
+            dir_path_stripped = dir_path.rstrip(b"/")
+
+            # Find the tree SHA for this directory
+            subtree_sha = self._find_subtree_sha(tree, dir_path_stripped, object_store)
+            if subtree_sha is None:
+                # Directory doesn't exist in tree, skip it
+                continue
+
+            # Remove all entries under this directory
+            entries_to_remove = [
+                path
+                for path in self._byname
+                if path.startswith(dir_path) or path == dir_path_stripped
+            ]
+            for path in entries_to_remove:
+                del self._byname[path]
+
+            # Create a sparse directory entry
+            # Use minimal metadata since it's not a real file
+            sparse_entry = IndexEntry(
+                ctime=0,
+                mtime=0,
+                dev=0,
+                ino=0,
+                mode=stat.S_IFDIR,
+                uid=0,
+                gid=0,
+                size=0,
+                sha=subtree_sha,
+                flags=0,
+                extended_flags=EXTENDED_FLAG_SKIP_WORKTREE,
+            )
+            self._byname[dir_path] = sparse_entry
+
+        # Add sparse directory extension if not present
+        if not self.is_sparse():
+            self._extensions.append(SparseDirExtension())
+
+    def _find_subtree_sha(
+        self,
+        tree: Tree,
+        path: bytes,
+        object_store: "BaseObjectStore",
+    ) -> Optional[bytes]:
+        """Find the SHA of a subtree at a given path.
+
+        Args:
+          tree: Root tree object to search in
+          path: Path to the subdirectory (no trailing slash)
+          object_store: Object store to read nested trees from
+
+        Returns:
+          SHA of the subtree, or None if path doesn't exist
+        """
+        if not path:
+            return tree.id
+
+        parts = path.split(b"/")
+        current_tree = tree
+
+        for part in parts:
+            # Look for this part in the current tree
+            try:
+                mode, sha = current_tree[part]
+            except KeyError:
+                return None
+
+            if not stat.S_ISDIR(mode):
+                # Path component is a file, not a directory
+                return None
+
+            # Load the next tree
+            obj = object_store[sha]
+            if not isinstance(obj, Tree):
+                return None
+            current_tree = obj
+
+        return current_tree.id
+
 
 def commit_tree(
     object_store: ObjectContainer, blobs: Iterable[tuple[bytes, bytes, int]]
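
The reverse direction, collapsing a full index into a sparse one, goes through convert_to_sparse(). A minimal sketch against an in-memory object store, mirroring the unit test added further below and assuming the directory subdir/ lies entirely outside the sparse checkout cone:

import os
import stat
import tempfile

from dulwich.index import Index, IndexEntry
from dulwich.object_store import MemoryObjectStore
from dulwich.objects import Blob, Tree

store = MemoryObjectStore()

blob = Blob()
blob.data = b"content"
store.add_object(blob)

# Tree layout: subdir/file.txt
subtree = Tree()
subtree[b"file.txt"] = (0o100644, blob.id)
store.add_object(subtree)

tree = Tree()
tree[b"subdir"] = (stat.S_IFDIR, subtree.id)
store.add_object(tree)

with tempfile.TemporaryDirectory() as tmpdir:
    idx = Index(os.path.join(tmpdir, "index"), read=False)
    idx[b"subdir/file.txt"] = IndexEntry(
        ctime=0, mtime=0, dev=0, ino=0, mode=0o100644,
        uid=0, gid=0, size=len(blob.data), sha=blob.id,
        extended_flags=0,
    )

    # Collapse everything under subdir/ into a single sparse directory
    # entry pointing at the subtree; this also appends the 'sdir' extension.
    idx.convert_to_sparse(store, tree.id, {b"subdir/"})

    assert idx.is_sparse()
    assert idx[b"subdir/"].sha == subtree.id
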

+ 220 - 0
tests/compat/test_index.py

@@ -962,3 +962,223 @@ class IndexV4CompatTestCase(CompatTestCase):
         self.assertIn(b"unchanged.txt", entries)
         self.assertNotIn(b"old1.txt", entries)
         self.assertNotIn(b"old2.txt", entries)
+
+
+class SparseIndexCompatTestCase(CompatTestCase):
+    """Tests for Git sparse index compatibility with C Git."""
+
+    def setUp(self) -> None:
+        super().setUp()
+        self.tempdir = tempfile.mkdtemp()
+        self.addCleanup(self._cleanup)
+
+    def _cleanup(self) -> None:
+        import shutil
+
+        shutil.rmtree(self.tempdir, ignore_errors=True)
+
+    def test_read_sparse_index_created_by_git(self) -> None:
+        """Test that Dulwich can read a sparse index created by Git."""
+        # Sparse index requires Git 2.37+
+        require_git_version((2, 37, 0))
+
+        repo_path = os.path.join(self.tempdir, "test_repo")
+        os.mkdir(repo_path)
+
+        # Initialize repo
+        run_git_or_fail(["init"], cwd=repo_path)
+        run_git_or_fail(["config", "core.sparseCheckout", "true"], cwd=repo_path)
+        run_git_or_fail(
+            ["config", "index.sparse", "true"], cwd=repo_path
+        )  # Enable sparse index
+
+        # Create directory structure
+        os.makedirs(os.path.join(repo_path, "included"), exist_ok=True)
+        os.makedirs(os.path.join(repo_path, "excluded", "subdir"), exist_ok=True)
+
+        # Create files
+        with open(os.path.join(repo_path, "included", "file1.txt"), "w") as f:
+            f.write("included file 1\n")
+        with open(os.path.join(repo_path, "included", "file2.txt"), "w") as f:
+            f.write("included file 2\n")
+        with open(os.path.join(repo_path, "excluded", "file3.txt"), "w") as f:
+            f.write("excluded file 3\n")
+        with open(os.path.join(repo_path, "excluded", "subdir", "file4.txt"), "w") as f:
+            f.write("excluded file 4\n")
+        with open(os.path.join(repo_path, "root.txt"), "w") as f:
+            f.write("root file\n")
+
+        # Add and commit all files
+        run_git_or_fail(["add", "."], cwd=repo_path)
+        run_git_or_fail(["commit", "-m", "initial"], cwd=repo_path)
+
+        # Set up sparse-checkout to include only "included/" and root files
+        sparse_checkout_path = os.path.join(
+            repo_path, ".git", "info", "sparse-checkout"
+        )
+        os.makedirs(os.path.dirname(sparse_checkout_path), exist_ok=True)
+        with open(sparse_checkout_path, "w") as f:
+            f.write("/*\n")  # Include top-level files
+            f.write("!/*/\n")  # Exclude all directories
+            f.write("/included/\n")  # Re-include "included" directory
+
+        # Run sparse-checkout reapply to create sparse index
+        run_git_or_fail(["sparse-checkout", "reapply"], cwd=repo_path)
+
+        # Read the index with Dulwich
+        from dulwich.index import Index
+
+        index_path = os.path.join(repo_path, ".git", "index")
+        idx = Index(index_path)
+
+        # Git may or may not create a sparse index depending on the repo state
+        # The key test is that Dulwich can read it without errors
+        self.assertIsNotNone(idx)
+
+        # If it is sparse, verify we can handle sparse directory entries
+        if idx.is_sparse():
+            for path, entry in idx.items():
+                if entry.is_sparse_dir(path):
+                    # Verify sparse dirs are for excluded paths
+                    self.assertTrue(path.startswith(b"excluded"))
+        else:
+            # If not sparse, we should still be able to read all entries
+            self.assertGreater(len(idx), 0)
+
+    def test_write_sparse_index_readable_by_git(self) -> None:
+        """Test that Git can read a sparse index created by Dulwich."""
+        # Sparse index requires Git 2.37+
+        require_git_version((2, 37, 0))
+
+        from dulwich.index import Index, IndexEntry, SparseDirExtension
+        from dulwich.objects import Blob, Tree
+        from dulwich.repo import Repo
+
+        repo_path = os.path.join(self.tempdir, "test_repo")
+        os.mkdir(repo_path)
+
+        # Initialize repo with Git
+        run_git_or_fail(["init"], cwd=repo_path)
+        run_git_or_fail(["config", "index.sparse", "true"], cwd=repo_path)
+
+        # Create a tree structure using Dulwich
+        repo = Repo(repo_path)
+
+        # Create blobs
+        blob1 = Blob()
+        blob1.data = b"file1 content"
+        repo.object_store.add_object(blob1)
+
+        blob2 = Blob()
+        blob2.data = b"file2 content"
+        repo.object_store.add_object(blob2)
+
+        # Create subtree for sparse directory
+        subtree = Tree()
+        subtree[b"file1.txt"] = (0o100644, blob1.id)
+        subtree[b"file2.txt"] = (0o100644, blob2.id)
+        repo.object_store.add_object(subtree)
+
+        # Create root tree
+        tree = Tree()
+        tree[b"sparse_dir"] = (0o040000, subtree.id)
+        repo.object_store.add_object(tree)
+
+        # Create a commit
+        from dulwich.objects import Commit
+
+        commit = Commit()
+        commit.tree = tree.id
+        commit.author = commit.committer = b"Test <test@example.com>"
+        commit.author_time = commit.commit_time = 1234567890
+        commit.author_timezone = commit.commit_timezone = 0
+        commit.encoding = b"UTF-8"
+        commit.message = b"Test commit"
+        repo.object_store.add_object(commit)
+        repo.refs[b"refs/heads/master"] = commit.id
+
+        # Create sparse index with Dulwich
+        index = Index(os.path.join(repo_path, ".git", "index"), read=False)
+
+        # Add sparse directory entry
+        sparse_entry = IndexEntry(
+            ctime=0,
+            mtime=0,
+            dev=0,
+            ino=0,
+            mode=0o040000,
+            uid=0,
+            gid=0,
+            size=0,
+            sha=subtree.id,
+            extended_flags=0x4000,  # SKIP_WORKTREE
+        )
+        index[b"sparse_dir/"] = sparse_entry
+        index._extensions.append(SparseDirExtension())
+        index.write()
+
+        # Verify Git can read the index
+        output = run_git_or_fail(["ls-files", "--debug"], cwd=repo_path)
+        self.assertIn(b"sparse_dir/", output)
+
+        # Verify Git recognizes it as sparse
+        output = run_git_or_fail(["status"], cwd=repo_path)
+        # Should not crash
+        self.assertIsNotNone(output)
+
+    def test_expand_sparse_index_matches_git(self) -> None:
+        """Test that expanding a sparse index matches Git's behavior."""
+        # Sparse index requires Git 2.37+
+        require_git_version((2, 37, 0))
+
+        repo_path = os.path.join(self.tempdir, "test_repo")
+        os.mkdir(repo_path)
+
+        # Initialize repo
+        run_git_or_fail(["init"], cwd=repo_path)
+        run_git_or_fail(["config", "core.sparseCheckout", "true"], cwd=repo_path)
+        run_git_or_fail(["config", "index.sparse", "true"], cwd=repo_path)
+
+        # Create directory structure
+        os.makedirs(os.path.join(repo_path, "dir1", "subdir"), exist_ok=True)
+        with open(os.path.join(repo_path, "dir1", "file1.txt"), "w") as f:
+            f.write("file1\n")
+        with open(os.path.join(repo_path, "dir1", "subdir", "file2.txt"), "w") as f:
+            f.write("file2\n")
+
+        # Commit files
+        run_git_or_fail(["add", "."], cwd=repo_path)
+        run_git_or_fail(["commit", "-m", "initial"], cwd=repo_path)
+
+        # Set up sparse-checkout to exclude dir1
+        sparse_checkout_path = os.path.join(
+            repo_path, ".git", "info", "sparse-checkout"
+        )
+        os.makedirs(os.path.dirname(sparse_checkout_path), exist_ok=True)
+        with open(sparse_checkout_path, "w") as f:
+            f.write("/*\n")
+            f.write("!/dir1/\n")
+
+        run_git_or_fail(["sparse-checkout", "reapply"], cwd=repo_path)
+
+        # Read sparse index with Dulwich
+        from dulwich.index import Index
+        from dulwich.repo import Repo
+
+        index_path = os.path.join(repo_path, ".git", "index")
+        idx = Index(index_path)
+
+        if idx.is_sparse():
+            # Expand the index
+            repo = Repo(repo_path)
+            idx.ensure_full_index(repo.object_store)
+
+            # Should no longer be sparse
+            self.assertFalse(idx.is_sparse())
+
+            # Write it back
+            idx.write()
+
+            # Git should still be able to read it
+            output = run_git_or_fail(["status"], cwd=repo_path)
+            self.assertIsNotNone(output)
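
As the SparseDirExtension docstring above notes, the empty 'sdir' extension is the on-disk compatibility signal: a tool that does not understand sparse directory entries should back off when it sees it. A minimal sketch of that guard, with a placeholder index path:

from dulwich.index import Index

idx = Index("/path/to/repo/.git/index")  # placeholder path

# The extension carries no payload; its presence alone marks the index as
# sparse, so bail out before mutating entries we cannot interpret.
if idx.is_sparse():
    raise NotImplementedError("sparse index not supported by this tool")
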

+ 273 - 0
tests/test_index.py

@@ -3008,3 +3008,276 @@ class TestUpdateWorkingTree(TestCase):
                     parent = os.path.dirname(path)
                     # We're not creating these directories, just testing the logic doesn't fail
                     self.assertIsInstance(parent, bytes)
+
+
+class TestSparseIndex(TestCase):
+    """Tests for sparse index support."""
+
+    def test_serialized_index_entry_is_sparse_dir(self):
+        """Test SerializedIndexEntry.is_sparse_dir() method."""
+        from dulwich.index import EXTENDED_FLAG_SKIP_WORKTREE
+
+        # Regular file entry - not sparse
+        regular_entry = SerializedIndexEntry(
+            name=b"file.txt",
+            ctime=0,
+            mtime=0,
+            dev=0,
+            ino=0,
+            mode=0o100644,
+            uid=0,
+            gid=0,
+            size=0,
+            sha=b"\x00" * 20,
+            flags=0,
+            extended_flags=0,
+        )
+        self.assertFalse(regular_entry.is_sparse_dir())
+
+        # Directory mode but no skip-worktree flag - not sparse
+        dir_entry = SerializedIndexEntry(
+            name=b"dir/",
+            ctime=0,
+            mtime=0,
+            dev=0,
+            ino=0,
+            mode=stat.S_IFDIR,
+            uid=0,
+            gid=0,
+            size=0,
+            sha=b"\x00" * 20,
+            flags=0,
+            extended_flags=0,
+        )
+        self.assertFalse(dir_entry.is_sparse_dir())
+
+        # Skip-worktree flag but not directory - not sparse
+        skip_file = SerializedIndexEntry(
+            name=b"file.txt",
+            ctime=0,
+            mtime=0,
+            dev=0,
+            ino=0,
+            mode=0o100644,
+            uid=0,
+            gid=0,
+            size=0,
+            sha=b"\x00" * 20,
+            flags=0,
+            extended_flags=EXTENDED_FLAG_SKIP_WORKTREE,
+        )
+        self.assertFalse(skip_file.is_sparse_dir())
+
+        # Directory mode + skip-worktree + trailing slash - sparse!
+        sparse_dir = SerializedIndexEntry(
+            name=b"sparse_dir/",
+            ctime=0,
+            mtime=0,
+            dev=0,
+            ino=0,
+            mode=stat.S_IFDIR,
+            uid=0,
+            gid=0,
+            size=0,
+            sha=b"\x00" * 20,
+            flags=0,
+            extended_flags=EXTENDED_FLAG_SKIP_WORKTREE,
+        )
+        self.assertTrue(sparse_dir.is_sparse_dir())
+
+    def test_index_entry_is_sparse_dir(self):
+        """Test IndexEntry.is_sparse_dir() method."""
+        from dulwich.index import EXTENDED_FLAG_SKIP_WORKTREE
+
+        # Regular file - not sparse
+        regular = IndexEntry(
+            ctime=0,
+            mtime=0,
+            dev=0,
+            ino=0,
+            mode=0o100644,
+            uid=0,
+            gid=0,
+            size=0,
+            sha=b"\x00" * 20,
+            extended_flags=0,
+        )
+        self.assertFalse(regular.is_sparse_dir(b"file.txt"))
+
+        # Sparse directory entry
+        sparse = IndexEntry(
+            ctime=0,
+            mtime=0,
+            dev=0,
+            ino=0,
+            mode=stat.S_IFDIR,
+            uid=0,
+            gid=0,
+            size=0,
+            sha=b"\x00" * 20,
+            extended_flags=EXTENDED_FLAG_SKIP_WORKTREE,
+        )
+        self.assertTrue(sparse.is_sparse_dir(b"dir/"))
+        self.assertFalse(sparse.is_sparse_dir(b"dir"))  # No trailing slash
+
+    def test_sparse_dir_extension(self):
+        """Test SparseDirExtension serialization."""
+        from dulwich.index import SDIR_EXTENSION, SparseDirExtension
+
+        ext = SparseDirExtension()
+        self.assertEqual(ext.signature, SDIR_EXTENSION)
+        self.assertEqual(ext.to_bytes(), b"")
+
+        # Test round-trip
+        ext2 = SparseDirExtension.from_bytes(b"")
+        self.assertEqual(ext2.signature, SDIR_EXTENSION)
+        self.assertEqual(ext2.to_bytes(), b"")
+
+    def test_index_is_sparse(self):
+        """Test Index.is_sparse() method."""
+        from dulwich.index import SparseDirExtension
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            index_path = os.path.join(tmpdir, "index")
+            idx = Index(index_path, read=False)
+
+            # Initially not sparse
+            self.assertFalse(idx.is_sparse())
+
+            # Add sparse directory extension
+            idx._extensions.append(SparseDirExtension())
+            self.assertTrue(idx.is_sparse())
+
+    def test_index_expansion(self):
+        """Test Index.ensure_full_index() expands sparse directories."""
+        from dulwich.index import EXTENDED_FLAG_SKIP_WORKTREE, SparseDirExtension
+        from dulwich.object_store import MemoryObjectStore
+        from dulwich.objects import Blob, Tree
+
+        # Create a tree structure
+        store = MemoryObjectStore()
+
+        blob1 = Blob()
+        blob1.data = b"file1"
+        store.add_object(blob1)
+
+        blob2 = Blob()
+        blob2.data = b"file2"
+        store.add_object(blob2)
+
+        subtree = Tree()
+        subtree[b"file1.txt"] = (0o100644, blob1.id)
+        subtree[b"file2.txt"] = (0o100644, blob2.id)
+        store.add_object(subtree)
+
+        # Create an index with a sparse directory entry
+        with tempfile.TemporaryDirectory() as tmpdir:
+            index_path = os.path.join(tmpdir, "index")
+            idx = Index(index_path, read=False)
+
+            # Add sparse directory entry
+            sparse_entry = IndexEntry(
+                ctime=0,
+                mtime=0,
+                dev=0,
+                ino=0,
+                mode=stat.S_IFDIR,
+                uid=0,
+                gid=0,
+                size=0,
+                sha=subtree.id,
+                extended_flags=EXTENDED_FLAG_SKIP_WORKTREE,
+            )
+            idx[b"subdir/"] = sparse_entry
+            idx._extensions.append(SparseDirExtension())
+
+            self.assertTrue(idx.is_sparse())
+            self.assertEqual(len(idx), 1)
+
+            # Expand the index
+            idx.ensure_full_index(store)
+
+            # Should no longer be sparse
+            self.assertFalse(idx.is_sparse())
+
+            # Should have 2 entries now (the files)
+            self.assertEqual(len(idx), 2)
+            self.assertIn(b"subdir/file1.txt", idx)
+            self.assertIn(b"subdir/file2.txt", idx)
+
+            # Entries should point to the correct blobs
+            self.assertEqual(idx[b"subdir/file1.txt"].sha, blob1.id)
+            self.assertEqual(idx[b"subdir/file2.txt"].sha, blob2.id)
+
+    def test_index_collapse(self):
+        """Test Index.convert_to_sparse() collapses directories."""
+        from dulwich.object_store import MemoryObjectStore
+        from dulwich.objects import Blob, Tree
+
+        # Create a tree structure
+        store = MemoryObjectStore()
+
+        blob1 = Blob()
+        blob1.data = b"file1"
+        store.add_object(blob1)
+
+        blob2 = Blob()
+        blob2.data = b"file2"
+        store.add_object(blob2)
+
+        subtree = Tree()
+        subtree[b"file1.txt"] = (0o100644, blob1.id)
+        subtree[b"file2.txt"] = (0o100644, blob2.id)
+        store.add_object(subtree)
+
+        tree = Tree()
+        tree[b"subdir"] = (stat.S_IFDIR, subtree.id)
+        store.add_object(tree)
+
+        # Create an index with full entries
+        with tempfile.TemporaryDirectory() as tmpdir:
+            index_path = os.path.join(tmpdir, "index")
+            idx = Index(index_path, read=False)
+
+            idx[b"subdir/file1.txt"] = IndexEntry(
+                ctime=0,
+                mtime=0,
+                dev=0,
+                ino=0,
+                mode=0o100644,
+                uid=0,
+                gid=0,
+                size=5,
+                sha=blob1.id,
+                extended_flags=0,
+            )
+            idx[b"subdir/file2.txt"] = IndexEntry(
+                ctime=0,
+                mtime=0,
+                dev=0,
+                ino=0,
+                mode=0o100644,
+                uid=0,
+                gid=0,
+                size=5,
+                sha=blob2.id,
+                extended_flags=0,
+            )
+
+            self.assertEqual(len(idx), 2)
+            self.assertFalse(idx.is_sparse())
+
+            # Collapse subdir to sparse
+            idx.convert_to_sparse(store, tree.id, {b"subdir/"})
+
+            # Should now be sparse
+            self.assertTrue(idx.is_sparse())
+
+            # Should have 1 entry (the sparse dir)
+            self.assertEqual(len(idx), 1)
+            self.assertIn(b"subdir/", idx)
+
+            # Entry should be a sparse directory
+            entry = idx[b"subdir/"]
+            self.assertTrue(entry.is_sparse_dir(b"subdir/"))
+            self.assertEqual(entry.sha, subtree.id)