Explorar o código

Skip-worktree support, sparse checkout impl (#1495)

This PR adds and refines functionality around sparse checkout,
including:

- **Skip-worktree bit support** in `IndexEntry`, preserving
`extended_flags` so that excluded files actually stay out of the working
tree.
- **Updates to `write_index`** so that index entries with extended flags
are written as version 3 or higher.
- **New TDD tests** in `test_porcelain.py` for cone-mode sparse checkout
(currently skipped, as cone mode is not fully implemented yet, but the
tests are in place for future development). I did my best, but decided
to submit now rather than risk spoiling what already works. (I can take
these tests out, but I am confident I'll be able to implement cone mode
with a bit more thought.)
- **Amended tests** in `test_index.py`
- **Local modifications checks** in the porcelain `sparse_checkout`
function, so we can raise or force-remove on excluded paths.
Louis Maddox hai 1 mes
pai
achega
840632ac62
Modificáronse 5 ficheiros con 399 adicións e 13 borrados
  1. 57 12
      dulwich/index.py
  2. 111 0
      dulwich/porcelain.py
  3. 35 0
      dulwich/repo.py
  4. 8 0
      tests/test_index.py
  5. 188 1
      tests/test_porcelain.py

+ 57 - 12
dulwich/index.py

@@ -106,6 +106,8 @@ class IndexEntry:
     gid: int
     size: int
     sha: bytes
+    flags: int
+    extended_flags: int
 
     @classmethod
     def from_serialized(cls, serialized: SerializedIndexEntry) -> "IndexEntry":
@@ -119,9 +121,14 @@ class IndexEntry:
             gid=serialized.gid,
             size=serialized.size,
             sha=serialized.sha,
+            flags=serialized.flags,
+            extended_flags=serialized.extended_flags,
         )
 
     def serialize(self, name: bytes, stage: Stage) -> SerializedIndexEntry:
+        # Clear out any existing stage bits, then set them from the Stage.
+        new_flags = self.flags & ~FLAG_STAGEMASK
+        new_flags |= stage.value << FLAG_STAGESHIFT
         return SerializedIndexEntry(
             name=name,
             ctime=self.ctime,
@@ -133,10 +140,34 @@ class IndexEntry:
             gid=self.gid,
             size=self.size,
             sha=self.sha,
-            flags=stage.value << FLAG_STAGESHIFT,
-            extended_flags=0,
+            flags=new_flags,
+            extended_flags=self.extended_flags,
         )
 
+    def stage(self) -> Stage:
+        """Return the Stage encoded in the stage bits of ``flags``."""
+        stage_bits = self.flags & FLAG_STAGEMASK
+        return Stage(stage_bits >> FLAG_STAGESHIFT)
+
+    @property
+    def skip_worktree(self) -> bool:
+        """Whether EXTENDED_FLAG_SKIP_WORKTREE is set in ``extended_flags``."""
+        return (self.extended_flags & EXTENDED_FLAG_SKIP_WORKTREE) != 0
+
+    def set_skip_worktree(self, skip: bool = True) -> None:
+        """Set or clear the skip-worktree bit in ``extended_flags``.
+
+        FLAG_EXTENDED in ``flags`` is kept in step: it is switched on whenever
+        the skip-worktree bit is set, and switched off once clearing the bit
+        leaves ``extended_flags`` equal to zero.
+
+        Args:
+          skip: True to set the bit, False to clear it.
+        """
+        if not skip:
+            # Clearing: drop the bit, then drop the marker in ``flags`` only
+            # when no other extended flag is still in use.
+            self.extended_flags &= ~EXTENDED_FLAG_SKIP_WORKTREE
+            if self.extended_flags == 0:
+                self.flags &= ~FLAG_EXTENDED
+            return
+
+        # Setting: the bit itself plus the marker announcing extended flags.
+        self.extended_flags |= EXTENDED_FLAG_SKIP_WORKTREE
+        self.flags |= FLAG_EXTENDED
+
 
 class ConflictedIndexEntry:
     """Index entry that represents a conflict."""
@@ -348,10 +379,22 @@ def write_index(
     """
     if version is None:
         version = DEFAULT_VERSION
+    # STEP 1: check if any extended_flags are set
+    uses_extended_flags = any(e.extended_flags != 0 for e in entries)
+    if uses_extended_flags and version < 3:
+        # Force or bump the version to 3
+        version = 3
+    # The rest is unchanged, but you might insert a final check:
+    if version < 3:
+        # Double-check no extended flags appear
+        for e in entries:
+            if e.extended_flags != 0:
+                raise AssertionError("Attempt to use extended flags in index < v3")
+    # Proceed with the existing code to write the header and entries.
     f.write(b"DIRC")
     f.write(struct.pack(b">LL", version, len(entries)))
     for entry in entries:
-        write_cache_entry(f, entry, version)
+        write_cache_entry(f, entry, version=version)
 
 
 def write_index_dict(
@@ -689,15 +732,17 @@ def index_entry_from_stat(
         mode = cleanup_mode(stat_val.st_mode)
 
     return IndexEntry(
-        stat_val.st_ctime,
-        stat_val.st_mtime,
-        stat_val.st_dev,
-        stat_val.st_ino,
-        mode,
-        stat_val.st_uid,
-        stat_val.st_gid,
-        stat_val.st_size,
-        hex_sha,
+        ctime=stat_val.st_ctime,
+        mtime=stat_val.st_mtime,
+        dev=stat_val.st_dev,
+        ino=stat_val.st_ino,
+        mode=mode,
+        uid=stat_val.st_uid,
+        gid=stat_val.st_gid,
+        size=stat_val.st_size,
+        sha=hex_sha,
+        flags=0,
+        extended_flags=0,
     )
 
 

+ 111 - 0
dulwich/porcelain.py

@@ -45,6 +45,7 @@ Currently implemented:
  * remote{_add}
  * receive-pack
  * reset
+ * sparse_checkout
  * submodule_add
  * submodule_init
  * submodule_list
@@ -94,6 +95,7 @@ from .file import ensure_dir_exists
 from .graph import can_fast_forward
 from .ignore import IgnoreFilterManager
 from .index import (
+    EXTENDED_FLAG_SKIP_WORKTREE,
     _fs_to_tree_path,
     blob_from_path_and_stat,
     build_file_from_blob,
@@ -2098,6 +2100,115 @@ def checkout_branch(repo, target: Union[bytes, str], force: bool = False) -> Non
                 dir_path = os.path.dirname(dir_path)
 
 
+def sparse_checkout(repo, patterns=None, force=False):
+    """Perform a sparse checkout by excluding certain paths via skip-worktree bits.
+
+    Mark any paths not matching the given patterns with skip-worktree in the index and
+    remove them from the working tree.  If `force=False` and a file has local
+    modifications, a `CheckoutError` is raised to prevent accidental data loss.
+    Paths that do match have the bit cleared and, when missing from the working
+    tree, are written back from the object store.
+
+    When `patterns` is given it is first stored via
+    `repo.set_sparse_checkout_patterns`; otherwise the stored patterns are read
+    back with `repo.get_sparse_checkout_patterns`.  Matching is done with
+    `fnmatch.fnmatch` against the index path, and a pattern ending in "/" is
+    treated as that prefix followed by "*".
+
+    Note: the index is written before the working tree is touched, so a
+    `CheckoutError` is raised after the skip-worktree bits have been recorded.
+
+    Args:
+      repo: A path to a repository or a Repo instance.
+      patterns: A list of patterns to include, or None to use the stored ones.
+      force: Whether to allow destructive removals of uncommitted changes
+             in newly excluded paths.
+
+    Raises:
+      CheckoutError: If local modifications would be discarded without force=True.
+      Error: If no patterns are given and none are stored, if an excluded file
+             cannot be removed, or if the blob of an included path is missing.
+    """
+    repo = Repo(repo) if not isinstance(repo, Repo) else repo
+
+    # 1) Read or write the sparse-checkout file
+    if patterns is not None:
+        repo.set_sparse_checkout_patterns(patterns)
+    else:
+        patterns = repo.get_sparse_checkout_patterns()
+        # get_sparse_checkout_patterns() yields an empty list, not None, when
+        # there is nothing stored; continuing would exclude every tracked path.
+        if not patterns:
+            raise Error("No sparse checkout patterns provided and no file found.")
+
+    # 2) Preprocess patterns: "docs/" -> "docs/*"
+    patterns = [pat + "*" if pat.endswith("/") else pat for pat in patterns]
+
+    def matches_any_pattern(index_path):
+        # Index paths are compared with forward slashes only.
+        forward_path = index_path.replace("\\", "/")
+        return any(fnmatch.fnmatch(forward_path, pat) for pat in patterns)
+
+    # 3) Helper to detect local modifications
+    normalizer = repo.get_blob_normalizer()
+
+    def local_modifications_exist(full_path, index_entry):
+        if not os.path.exists(full_path):
+            return False
+        try:
+            with open(full_path, "rb") as f:
+                disk_data = f.read()
+        except OSError:
+            # Unreadable content cannot be proven clean; treat it as modified.
+            return True
+        try:
+            blob = repo.object_store[index_entry.sha]
+        except KeyError:
+            # Nothing to compare against, so err on the side of "modified".
+            return True
+        norm_data = normalizer.checkin_normalize(disk_data, full_path)
+        return norm_data != blob.data
+
+    # 4) Update skip-worktree bits in the index
+    index = repo.open_index()
+    for path, entry in list(index.items()):
+        path_str = path.decode("utf-8")
+        # Matching a pattern => included => skip-worktree cleared; else set.
+        entry.set_skip_worktree(not matches_any_pattern(path_str))
+        index[path] = entry
+    index.write()
+
+    # 5) Update the working tree to reflect skip-worktree bits
+    for path, entry in list(index.items()):
+        path_str = path.decode("utf-8")
+        full_path = os.path.join(repo.path, path_str)
+        skip_bit_set = bool(entry.extended_flags & EXTENDED_FLAG_SKIP_WORKTREE)
+
+        if skip_bit_set:
+            # The file is excluded
+            if os.path.exists(full_path):
+                # If force is False and local modifications exist, fail
+                if not force and local_modifications_exist(full_path, entry):
+                    raise CheckoutError(
+                        f"Local modifications in {path_str} would be overwritten "
+                        f"by sparse checkout. Use force=True to override."
+                    )
+                try:
+                    os.remove(full_path)
+                except OSError as e:
+                    raise Error(
+                        f"Failed to remove excluded file {path_str}: {e}"
+                    ) from e
+        else:
+            # The file is included
+            if not os.path.exists(full_path):
+                try:
+                    blob = repo.object_store[entry.sha]
+                except KeyError as e:
+                    raise Error(
+                        f"Blob {entry.sha} not found in object store for {path_str}."
+                    ) from e
+                ensure_dir_exists(os.path.dirname(full_path))
+                with open(full_path, "wb") as f:
+                    f.write(blob.data)
+
+
 def check_mailmap(repo, contact):
     """Check canonical name and email of contact.
 

+ 35 - 0
dulwich/repo.py

@@ -1493,6 +1493,8 @@ class Repo(BaseRepo):
                 gid=st.st_gid if st else 0,
                 size=len(self[tree_entry[1]].data),
                 sha=tree_entry[1],
+                flags=0,
+                extended_flags=0,
             )
 
             index[tree_path] = index_entry
@@ -1836,6 +1838,39 @@ class Repo(BaseRepo):
         except KeyError:
             return BlobNormalizer(config_stack, git_attributes)
 
+    def _sparse_checkout_file_path(self) -> str:
+        """Build the location of ``info/sparse-checkout`` under the control dir."""
+        control_dir = self.controldir()
+        return os.path.join(control_dir, "info", "sparse-checkout")
+
+    def get_sparse_checkout_patterns(self) -> list[str]:
+        """Read the sparse-checkout patterns stored in info/sparse-checkout.
+
+        Every line has its surrounding whitespace stripped, and lines that are
+        empty after stripping are dropped.
+
+        Returns:
+            The patterns in file order, or an empty list when the file does
+            not exist.
+        """
+        sparse_file = self._sparse_checkout_file_path()
+        try:
+            with open(sparse_file, encoding="utf-8") as fh:
+                stripped = (raw.strip() for raw in fh)
+                return [pattern for pattern in stripped if pattern]
+        except FileNotFoundError:
+            return []
+
+    def set_sparse_checkout_patterns(self, patterns: list[str]) -> None:
+        """Store sparse-checkout patterns in info/sparse-checkout, one per line.
+
+        The info/ directory is created first when absent, and the file is
+        opened for writing, so any earlier contents are replaced.
+
+        Args:
+            patterns: A list of gitignore-style patterns to store.
+        """
+        os.makedirs(os.path.join(self.controldir(), "info"), exist_ok=True)
+
+        with open(self._sparse_checkout_file_path(), "w", encoding="utf-8") as out:
+            out.writelines(pattern + "\n" for pattern in patterns)
+
 
 class MemoryRepo(BaseRepo):
     """Repo that stores refs, objects, and named files in memory.

+ 8 - 0
tests/test_index.py

@@ -105,6 +105,8 @@ class SimpleIndexTestCase(IndexTestCase):
                 1000,
                 0,
                 b"e69de29bb2d1d6434b8b29ae775ad8c2e48c5391",
+                0,
+                0,
             ),
             self.get_simple_index("index")[b"bla"],
         )
@@ -180,6 +182,8 @@ class ReadIndexDictTests(IndexTestCase):
                 1000,
                 0,
                 b"e69de29bb2d1d6434b8b29ae775ad8c2e48c5391",
+                0,
+                0,
             )
         }
         filename = os.path.join(self.tempdir, "test-simple-write-index")
@@ -290,6 +294,8 @@ class IndexEntryFromStatTests(TestCase):
                 1000,
                 12288,
                 b"2222222222222222222222222222222222222222",
+                0,
+                0,
             ),
         )
 
@@ -321,6 +327,8 @@ class IndexEntryFromStatTests(TestCase):
                 1000,
                 12288,
                 b"2222222222222222222222222222222222222222",
+                0,
+                0,
             ),
         )
 

+ 188 - 1
tests/test_porcelain.py

@@ -40,7 +40,11 @@ from dulwich import porcelain
 from dulwich.diff_tree import tree_changes
 from dulwich.errors import CommitError
 from dulwich.objects import ZERO_SHA, Blob, Tag, Tree
-from dulwich.porcelain import CheckoutError
+from dulwich.porcelain import (
+    CheckoutError,  # Hypothetical or real error class
+    add,
+    commit,
+)
 from dulwich.repo import NoIndexPresent, Repo
 from dulwich.server import DictBackend
 from dulwich.tests.utils import build_commit_graph, make_commit, make_object
@@ -3679,3 +3683,186 @@ class ForEachTests(PorcelainTestCase):
                 (b"tag", b"refs/tags/v1.1"),
             ],
         )
+
+
+class SparseCheckoutTests(PorcelainTestCase):
+    """Integration tests for Dulwich's sparse checkout feature."""
+
+    # NOTE: We do NOT override `setUp()` here because the parent class
+    #       (PorcelainTestCase) already:
+    #         1) Creates self.test_dir = a unique temp dir
+    #         2) Creates a subdir named "repo"
+    #         3) Calls Repo.init() on that path
+    #       Re-initializing again caused FileExistsError.
+
+    #
+    # Utility helpers
+    #
+    def sparse_checkout(self, repo, patterns, force=False):
+        """Forward to porcelain.sparse_checkout and return its result.
+
+        Gives the tests a single place to hook in test-specific setup later.
+        """
+        return porcelain.sparse_checkout(repo, patterns, force=force)
+
+    def _write_file(self, rel_path, content):
+        """Write `content` to `rel_path` inside the repository working tree.
+
+        Parent directories are created as needed; the absolute path of the
+        written file is returned.
+        """
+        abs_path = os.path.join(self.repo_path, rel_path)
+        os.makedirs(os.path.dirname(abs_path), exist_ok=True)
+        with open(abs_path, "w") as f:
+            f.write(content)
+        return abs_path
+
+    def _commit_file(self, rel_path, content):
+        """Write a file, stage it, and commit it with an "Added <path>" message."""
+        abs_path = self._write_file(rel_path, content)
+        add(self.repo_path, paths=[abs_path])
+        commit(self.repo_path, message=b"Added " + rel_path.encode("utf-8"))
+
+    def _list_wtree_files(self):
+        """Return the set of files (not dirs) present in the working tree.
+
+        Paths are relative to the repository root and use the OS path
+        separator; anything under .git/ is ignored.
+        """
+        found_files = set()
+        for root, dirs, files in os.walk(self.repo_path):
+            # Removing ".git" from dirs in place stops os.walk descending into it
+            if ".git" in dirs:
+                dirs.remove(".git")
+
+            for filename in files:
+                file_rel = os.path.relpath(os.path.join(root, filename), self.repo_path)
+                found_files.add(file_rel)
+        return found_files
+
+    def test_only_included_paths_appear_in_wtree(self):
+        """Only included paths remain in the working tree, excluded paths are removed.
+
+        Commits two files, "keep_me.txt" and "exclude_me.txt". Then applies a
+        sparse-checkout pattern containing only "keep_me.txt". Ensures that
+        "keep_me.txt" remains in the working tree, while "exclude_me.txt" is
+        removed. This verifies correct application of sparse-checkout patterns
+        to remove files not listed.
+        """
+        self._commit_file("keep_me.txt", "I'll stay\n")
+        self._commit_file("exclude_me.txt", "I'll be excluded\n")
+
+        patterns = ["keep_me.txt"]
+        self.sparse_checkout(self.repo, patterns)
+
+        actual_files = self._list_wtree_files()
+        expected_files = {"keep_me.txt"}
+        self.assertEqual(
+            expected_files,
+            actual_files,
+            f"Expected only {expected_files}, but found {actual_files}",
+        )
+
+    def test_previously_included_paths_become_excluded(self):
+        """Previously included files become excluded after pattern changes.
+
+        Verifies that files initially brought into the working tree (e.g.,
+        by including `data/`) can later be excluded by narrowing the
+        sparse-checkout pattern to just `data/included_1.txt`. Confirms that
+        the file `data/included_2.txt` remains in the index with
+        skip-worktree set (rather than being removed entirely), ensuring
+        data is not lost and Dulwich correctly updates the index flags.
+        """
+        self._commit_file("data/included_1.txt", "some content\n")
+        self._commit_file("data/included_2.txt", "other content\n")
+
+        initial_patterns = ["data/"]
+        self.sparse_checkout(self.repo, initial_patterns)
+
+        updated_patterns = ["data/included_1.txt"]
+        self.sparse_checkout(self.repo, updated_patterns)
+
+        actual_files = self._list_wtree_files()
+        expected_files = {os.path.join("data", "included_1.txt")}
+        self.assertEqual(expected_files, actual_files)
+
+        # The excluded file must still be tracked, only flagged skip-worktree
+        idx = self.repo.open_index()
+        self.assertIn(b"data/included_2.txt", idx)
+        entry = idx[b"data/included_2.txt"]
+        self.assertTrue(entry.skip_worktree)
+
+    def test_force_removes_local_changes_for_excluded_paths(self):
+        """Forced sparse checkout removes local modifications for newly excluded paths.
+
+        Verifies that specifying force=True allows destructive operations
+        which discard uncommitted changes. First, we commit "file1.txt" and
+        then modify it. Next, we apply a pattern that excludes the file,
+        using force=True. The local modifications (and the file) should
+        be removed, leaving the working tree empty.
+        """
+        self._commit_file("file1.txt", "original content\n")
+
+        file1_path = os.path.join(self.repo_path, "file1.txt")
+        with open(file1_path, "a") as f:
+            f.write("local changes!\n")
+
+        # A pattern that matches nothing committed, so file1.txt is excluded
+        new_patterns = ["some_other_file.txt"]
+        self.sparse_checkout(self.repo, new_patterns, force=True)
+
+        actual_files = self._list_wtree_files()
+        self.assertEqual(
+            set(),
+            actual_files,
+            "Force-sparse-checkout did not remove file with local changes.",
+        )
+
+    def test_destructive_refuse_uncommitted_changes_without_force(self):
+        """Fail on uncommitted changes for newly excluded paths without force.
+
+        Ensures that a sparse checkout is blocked if it would remove local
+        modifications from the working tree. We commit 'config.yaml', then
+        modify it, and finally attempt to exclude it via new patterns without
+        using force=True. This should raise a CheckoutError rather than
+        discarding the local changes.
+        """
+        self._commit_file("config.yaml", "initial\n")
+        cfg_path = os.path.join(self.repo_path, "config.yaml")
+        with open(cfg_path, "a") as f:
+            f.write("local modifications\n")
+
+        # "docs/" does not match config.yaml, so the modified file is excluded
+        exclude_patterns = ["docs/"]
+        with self.assertRaises(CheckoutError):
+            self.sparse_checkout(self.repo, exclude_patterns, force=False)
+
+    def test_fnmatch_gitignore_pattern_expansion(self):
+        """Reading/writing patterns align with gitignore/fnmatch expansions.
+
+        Ensures that `sparse_checkout` interprets wildcard patterns (like `*.py`)
+        in the same way Git's sparse-checkout would. Multiple files are committed
+        to `src/` (e.g. `foo.py`, `foo_test.py`, `foo_helper.py`) and to `docs/`.
+        Then the pattern `src/foo*.py` is applied, confirming that only the
+        matching Python files remain in the working tree while the Markdown file
+        under `docs/` is excluded.
+
+        Finally, verifies that the `.git/info/sparse-checkout` file contains the
+        specified wildcard pattern (`src/foo*.py`), ensuring correct round-trip
+        of user-supplied patterns.
+        """
+        self._commit_file("src/foo.py", "print('hello')\n")
+        self._commit_file("src/foo_test.py", "print('test')\n")
+        self._commit_file("docs/readme.md", "# docs\n")
+        self._commit_file("src/foo_helper.py", "print('helper')\n")
+
+        patterns = ["src/foo*.py"]
+        self.sparse_checkout(self.repo, patterns)
+
+        actual_files = self._list_wtree_files()
+        expected_files = {
+            os.path.join("src", "foo.py"),
+            os.path.join("src", "foo_test.py"),
+            os.path.join("src", "foo_helper.py"),
+        }
+        self.assertEqual(
+            expected_files,
+            actual_files,
+            "Wildcard pattern not matched as expected. Either too strict or too broad.",
+        )
+
+        # Round-trip check: the pattern must be stored verbatim on disk
+        sc_file = os.path.join(self.repo_path, ".git", "info", "sparse-checkout")
+        self.assertTrue(os.path.isfile(sc_file))
+        with open(sc_file) as f:
+            lines = f.read().strip().split()
+            self.assertIn("src/foo*.py", lines)