Переглянути джерело

Support recursive cone mode (#2043)

Jelmer Vernooij 3 тижнів тому
батько
коміт
44f64531f0
2 змінених файлів з 155 додано та 132 видалено
  1. 56 47
      dulwich/sparse_patterns.py
  2. 99 85
      tests/test_sparse_patterns.py

+ 56 - 47
dulwich/sparse_patterns.py

@@ -28,7 +28,7 @@ __all__ = [
     "compute_included_paths_cone",
     "compute_included_paths_full",
     "determine_included_paths",
-    "match_gitignore_patterns",
+    "match_sparse_patterns",
     "parse_sparse_patterns",
 ]
 
@@ -86,8 +86,8 @@ def compute_included_paths_full(index: Index, lines: Sequence[str]) -> set[str]:
     included = set()
     for path_bytes, entry in index.items():
         path_str = path_bytes.decode("utf-8")
-        # For .gitignore logic, match_gitignore_patterns returns True if 'included'
-        if match_gitignore_patterns(path_str, parsed, path_is_dir=False):
+        # For .gitignore logic, match_sparse_patterns returns True if 'included'
+        if match_sparse_patterns(path_str, parsed, path_is_dir=False):
             included.add(path_str)
     return included
 
@@ -252,9 +252,10 @@ def parse_sparse_patterns(lines: Sequence[str]) -> list[tuple[str, bool, bool, b
     anchoring, and directory-only markers, and returns data suitable for matching.
 
     Example:
-      ``line = "/*.txt" -> ("/.txt", False, False, True)``
-      ``line = "!/docs/" -> ("/docs/", True, True, True)``
-      ``line = "mydir/" -> ("mydir/", False, True, False)`` not anchored, no leading "/"
+      ``line = "*.txt" -> ("*.txt", False, False, False)`` not negated/dir/anchored
+      ``line = "/*.txt" -> ("*.txt", False, False, True)`` anchored, not negated/dir
+      ``line = "!/*.txt" -> ("*.txt", True, False, True)`` anchored/negated, not dir
+      ``line = "!/mydir/" -> ("mydir", True, True, True)`` anchored/negated/dir
 
     Args:
       lines: A list of raw lines (strings) from the sparse-checkout file.
@@ -277,19 +278,16 @@ def parse_sparse_patterns(lines: Sequence[str]) -> list[tuple[str, bool, bool, b
         if anchored:
             line = line[1:]  # remove leading '/'
 
-        # If pattern ends with '/', we consider it directory-only
-        # (like "docs/"). Real Git might treat it slightly differently,
-        # but we'll simplify and mark it as "dir_only" if it ends in "/".
-        dir_only = False
-        if line.endswith("/"):
-            dir_only = True
+        # If pattern ends with '/', we consider it directory-only (like "docs/").
+        dir_only = line.endswith("/")
+        if dir_only:
             line = line[:-1]
 
         results.append((line, negation, dir_only, anchored))
     return results
 
 
-def match_gitignore_patterns(
+def match_sparse_patterns(
     path_str: str,
     parsed_patterns: Sequence[tuple[str, bool, bool, bool]],
     path_is_dir: bool = False,
@@ -301,16 +299,22 @@ def match_gitignore_patterns(
       2. If a pattern matches, we set the "include" state depending on negation.
       3. Later matches override earlier ones.
 
-    In a .gitignore sense, lines that do not start with '!' are "ignore" patterns,
-    lines that start with '!' are "unignore" (re-include). But in sparse checkout,
-    it's effectively reversed: a non-negation line is "include," negation is "exclude."
-    However, many flows still rely on the same final logic: the last matching pattern
+    In a sparse checkout, lines that do not start with '!' are positive patterns,
+    indicating files/directories to check out (include in the index), and those that
+    start with '!' are negative ('negated'), meaning they indicate files not to check
+    out (not included in the index). This is fairly straightforward.
+
+    In a .gitignore, it's the same syntax but with a reverse effect: positive means
+    "ignore" (exclude from the index) and negative means "unignore" (re-include in the
+    index).
+
+    Many routines still rely on the same final logic: the last matching pattern
     decides "excluded" vs. "included."
 
     We'll interpret "include" as returning True, "exclude" as returning False.
 
-    Each pattern can include negation (!), directory-only markers, or be anchored
-    to the start of the path. The last matching pattern determines whether the
+    Each pattern can include negation ('!'), directory-only markers ('/' as suffix), or
+    be anchored ('/' as prefix). The last matching pattern determines whether the
     path is ultimately included or excluded.
 
     Args:
@@ -322,9 +326,8 @@ def match_gitignore_patterns(
     Returns:
       True if the path is included by the last matching pattern, False otherwise.
     """
-    # Start by assuming "excluded" (like a .gitignore starts by including everything
-    # until matched, but for sparse-checkout we often treat unmatched as "excluded").
-    # We will flip if we match an "include" pattern.
+    # Start by assuming "excluded". Like how .gitignore initially includes everything
+    # until matched, but reversed: sparse-checkout initially excludes everything.
     is_included = False
 
     for pattern, negation, dir_only, anchored in parsed_patterns:
@@ -336,37 +339,43 @@ def match_gitignore_patterns(
                 matched = True
         else:
             matched = False
-        # If dir_only is True and path_is_dir is False, we skip matching
+        # If dir_only is True, handle directory-only matching separately
         if dir_only and not matched:
             if path_str == pattern + "/":
                 matched = not forbidden_path
             elif fnmatch(path_str, f"{pattern}/*"):
                 matched = True  # root subpath (anchored or unanchored)
             elif not anchored:
-                matched = fnmatch(path_str, f"*/{pattern}/*")  # unanchored subpath
-
-        # If anchored is True, pattern should match from the start of path_str.
-        # If not anchored, we can match anywhere.
-        if anchored and not matched:
-            # We match from the beginning. For example, pattern = "docs"
-            # path_str = "docs/readme.md" -> start is "docs"
-            # We'll just do a prefix check or prefix + slash check
-            # Or you can do a partial fnmatch. We'll do a manual approach:
-            if pattern == "":
-                # Means it was just "/", which can happen if line was "/"
-                # That might represent top-level only?
-                # We'll skip for simplicity or treat it as a special case.
-                continue
-            elif path_str == pattern:
-                matched = True
-            elif path_str.startswith(pattern + "/"):
-                matched = True
-            else:
-                matched = False
-        elif not matched:
-            # Not anchored: we can do a simple wildcard match or a substring match.
-            # For simplicity, let's use Python's fnmatch:
-            matched = fnmatch(path_str, pattern) or fnmatch(path_str, f"*/{pattern}")
+                # For unanchored dir-only patterns, match the directory at any level
+                # e.g., "docs/" should match "A/docs" as a directory and "A/docs/*" as files within
+                if fnmatch(path_str, f"*/{pattern}") and path_is_dir:
+                    matched = True
+                elif fnmatch(path_str, f"*/{pattern}/*"):
+                    matched = True  # unanchored subpath
+        # Non-dir-only patterns use the anchored/unanchored matching below
+        elif not dir_only:
+            # If anchored is True, pattern should match from the start of path_str.
+            # If not anchored, we can match anywhere.
+            if anchored and not matched:
+                # We match from the beginning. For example, pattern = "docs"
+                # path_str = "docs/readme.md" -> start is "docs"
+                # We'll just do a prefix check or prefix + slash check
+                # Or you can do a partial fnmatch. We'll do a manual approach:
+                if pattern == "":
+                    # Means it was just "/", which should match everything recursively
+                    matched = True
+                elif path_str == pattern:
+                    matched = True
+                elif path_str.startswith(pattern + "/"):
+                    matched = True
+                else:
+                    matched = False
+            elif not matched:
+                # Not anchored: we can do a simple wildcard match or a substring match.
+                # For simplicity, let's use Python's fnmatch:
+                matched = fnmatch(path_str, pattern) or fnmatch(
+                    path_str, f"*/{pattern}"
+                )
 
         if matched:
             # If negation is True, that means 'exclude'. If negation is False, 'include'.

+ 99 - 85
tests/test_sparse_patterns.py

@@ -37,7 +37,7 @@ from dulwich.sparse_patterns import (
     compute_included_paths_cone,
     compute_included_paths_full,
     determine_included_paths,
-    match_gitignore_patterns,
+    match_sparse_patterns,
     parse_sparse_patterns,
 )
 
@@ -57,44 +57,41 @@ class ParseSparsePatternsTests(TestCase):
         parsed = parse_sparse_patterns(lines)
         self.assertEqual(parsed, [])
 
-    def test_simple_patterns(self):
+    def test_sparse_pattern_combos(self):
         lines = [
-            "*.py",
-            "!*.md",
-            "/docs/",
-            "!/docs/images/",
-        ]
-        parsed = parse_sparse_patterns(lines)
-        self.assertEqual(len(parsed), 4)
-
-        self.assertEqual(parsed[0], ("*.py", False, False, False))  # include *.py
-        self.assertEqual(parsed[1], ("*.md", True, False, False))  # exclude *.md
-        self.assertEqual(parsed[2], ("docs", False, True, True))  # anchored, dir_only
-        self.assertEqual(parsed[3], ("docs/images", True, True, True))
-
-    def test_trailing_slash_dir(self):
-        lines = [
-            "src/",
+            "*.py",  # Python files anywhere
+            "!*.md",  # markdown files anywhere
+            "/docs/",  # root docs dir
+            "!/docs/images/",  # no root docs/images subdir
+            "src/",  # src dir anywhere
+            "/*.toml",  # root TOML files
+            "!/*.bak",  # no root backup files
+            "!data/",  # no data dirs anywhere
         ]
         parsed = parse_sparse_patterns(lines)
-        # "src/" => (pattern="src", negation=False, dir_only=True, anchored=False)
-        self.assertEqual(parsed, [("src", False, True, False)])
+        self.assertEqual(len(parsed), 8)
 
-    def test_negation_anchor(self):
-        lines = [
-            "!/foo.txt",
-        ]
-        parsed = parse_sparse_patterns(lines)
-        # => (pattern="foo.txt", negation=True, dir_only=False, anchored=True)
-        self.assertEqual(parsed, [("foo.txt", True, False, True)])
+        # Returns a 4-tuple of: (pattern, negation, dir_only, anchored)
+        self.assertEqual(parsed[0], ("*.py", False, False, False))  # _,_,_
+        self.assertEqual(parsed[1], ("*.md", True, False, False))  # N,_,_
+        self.assertEqual(parsed[2], ("docs", False, True, True))  # _,D,A
+        self.assertEqual(parsed[3], ("docs/images", True, True, True))  # N,D,A
+        self.assertEqual(parsed[4], ("src", False, True, False))  # _,D,_
+        self.assertEqual(parsed[5], ("*.toml", False, False, True))  # _,_,A
+        self.assertEqual(parsed[6], ("*.bak", True, False, True))  # N,_,A
+        self.assertEqual(parsed[7], ("data", True, True, False))  # N,D,_
 
 
-class MatchGitignorePatternsTests(TestCase):
-    """Test the match_gitignore_patterns function."""
+class MatchSparsePatternsTests(TestCase):
+    """Test the match_sparse_patterns function."""
 
+    # def match_sparse_patterns(path_str, parsed_patterns, path_is_dir=False):
     def test_no_patterns_returns_excluded(self):
         """If no patterns are provided, by default we treat the path as excluded."""
-        self.assertFalse(match_gitignore_patterns("anyfile.py", []))
+        self.assertFalse(match_sparse_patterns("foo.py", [], path_is_dir=False))
+        self.assertFalse(match_sparse_patterns("A/", [], path_is_dir=True))
+        self.assertFalse(match_sparse_patterns("A/B/", [], path_is_dir=True))
+        self.assertFalse(match_sparse_patterns("A/B/bar.md", [], path_is_dir=False))
 
     def test_last_match_wins(self):
         """Checks that the last pattern to match determines included vs excluded."""
@@ -106,78 +103,95 @@ class MatchGitignorePatternsTests(TestCase):
         )
         # "foo.py" matches first pattern => included
         # then matches second pattern => excluded
-        self.assertFalse(match_gitignore_patterns("foo.py", parsed))
+        self.assertFalse(match_sparse_patterns("foo.py", parsed))
+        self.assertFalse(match_sparse_patterns("A/foo.py", parsed))
+        self.assertFalse(match_sparse_patterns("A/B/foo.py", parsed))
+        self.assertTrue(match_sparse_patterns("bar.py", parsed))
+        self.assertTrue(match_sparse_patterns("A/bar.py", parsed))
+        self.assertTrue(match_sparse_patterns("A/B/bar.py", parsed))
+        self.assertFalse(match_sparse_patterns("bar.md", parsed))
+        self.assertFalse(match_sparse_patterns("A/bar.md", parsed))
+        self.assertFalse(match_sparse_patterns("A/B", parsed, path_is_dir=True))
+        self.assertFalse(match_sparse_patterns("A/B", parsed, path_is_dir=True))
+        self.assertFalse(match_sparse_patterns(".cache", parsed, path_is_dir=True))
 
     def test_dir_only(self):
         """A pattern with a trailing slash should only match directories and subdirectories."""
         parsed = parse_sparse_patterns(["docs/"])
-        # Because we set path_is_dir=False, it won't match
-        self.assertTrue(
-            match_gitignore_patterns("docs/readme.md", parsed, path_is_dir=False)
-        )
-        self.assertTrue(match_gitignore_patterns("docs", parsed, path_is_dir=True))
+        # The directory pattern is not rooted, so can be at any level
+        self.assertTrue(match_sparse_patterns("docs", parsed, path_is_dir=True))
+        self.assertTrue(match_sparse_patterns("A/docs", parsed, path_is_dir=True))
+        self.assertTrue(match_sparse_patterns("A/B/docs", parsed, path_is_dir=True))
         # Even if the path name is "docs", if it's a file, won't match:
-        self.assertFalse(match_gitignore_patterns("docs", parsed, path_is_dir=False))
+        self.assertFalse(match_sparse_patterns("docs", parsed, path_is_dir=False))
+        self.assertFalse(match_sparse_patterns("A/docs", parsed, path_is_dir=False))
+        self.assertFalse(match_sparse_patterns("A/B/docs", parsed, path_is_dir=False))
+        # Subfiles and subdirs of the included dir should match
+        self.assertTrue(match_sparse_patterns("docs/x.md", parsed))
+        self.assertTrue(match_sparse_patterns("docs/A/x.md", parsed))
+        self.assertTrue(match_sparse_patterns("docs/A/B/x.md", parsed))
+        self.assertTrue(match_sparse_patterns("docs/A", parsed, path_is_dir=True))
+        self.assertTrue(match_sparse_patterns("docs/A/B", parsed, path_is_dir=True))
+        self.assertTrue(match_sparse_patterns("docs", parsed, path_is_dir=True))
 
     def test_anchored(self):
         """Anchored patterns match from the start of the path only."""
-        parsed = parse_sparse_patterns(["/foo"])
-        self.assertTrue(match_gitignore_patterns("foo", parsed))
+        parsed = parse_sparse_patterns(["/foo"])  # Can be file or dir, must be at root
+        self.assertTrue(match_sparse_patterns("foo", parsed))
+        self.assertTrue(match_sparse_patterns("foo", parsed, path_is_dir=True))
         # But "some/foo" doesn't match because anchored requires start
-        self.assertFalse(match_gitignore_patterns("some/foo", parsed))
+        self.assertFalse(match_sparse_patterns("A/foo", parsed))
+        self.assertFalse(match_sparse_patterns("A/foo", parsed, path_is_dir=True))
 
-    def test_unanchored_uses_fnmatch(self):
+    def test_unanchored(self):
         parsed = parse_sparse_patterns(["foo"])
-        self.assertTrue(match_gitignore_patterns("some/foo", parsed))
-        self.assertFalse(match_gitignore_patterns("some/bar", parsed))
+        self.assertTrue(match_sparse_patterns("foo", parsed))
+        self.assertTrue(match_sparse_patterns("foo", parsed, path_is_dir=True))
+        # Unanchored, so the pattern also matches below the root, e.g. "A/foo"
+        self.assertTrue(match_sparse_patterns("A/foo", parsed))
+        self.assertTrue(match_sparse_patterns("A/foo", parsed, path_is_dir=True))
+        self.assertFalse(match_sparse_patterns("bar", parsed))
+        self.assertFalse(match_sparse_patterns("A/bar", parsed))
 
     def test_anchored_empty_pattern(self):
         """Test handling of empty pattern with anchoring (e.g., '/')."""
+        # `/` should be recursive match of all files
         parsed = parse_sparse_patterns(["/"])
-        # Check the structure of the parsed empty pattern first
-        self.assertEqual(parsed, [("", False, False, True)])
-        # When the pattern is empty with anchoring, it's continued (skipped) in match_gitignore_patterns
-        # for non-empty paths but for empty string it might match due to empty string comparisons
-        self.assertFalse(match_gitignore_patterns("foo", parsed))
-        # An empty string with empty pattern will match (implementation detail)
-        self.assertTrue(match_gitignore_patterns("", parsed))
-
-    def test_anchored_dir_only_exact_match(self):
-        """Test anchored directory-only patterns with exact matching."""
+        self.assertEqual(parsed, [("", False, False, True)])  # anchored
+        self.assertTrue(match_sparse_patterns("", parsed, path_is_dir=True))
+        self.assertTrue(match_sparse_patterns("A", parsed, path_is_dir=True))
+        self.assertTrue(match_sparse_patterns("A/B", parsed, path_is_dir=True))
+        self.assertTrue(match_sparse_patterns("foo", parsed))
+        self.assertTrue(match_sparse_patterns("A/foo", parsed))
+        self.assertTrue(match_sparse_patterns("A/B/foo", parsed))
+
+    def test_anchored_dir_only(self):
+        """Test anchored directory-only patterns."""
         parsed = parse_sparse_patterns(["/docs/"])
-        # Test with exact match "docs" and path_is_dir=True
-        self.assertTrue(match_gitignore_patterns("docs", parsed, path_is_dir=True))
-        # Test with "docs/" (exact match + trailing slash)
-        self.assertTrue(match_gitignore_patterns("docs/", parsed, path_is_dir=True))
-
-    def test_complex_anchored_patterns(self):
-        """Test more complex anchored pattern matching."""
-        parsed = parse_sparse_patterns(["/dir/subdir"])
-        # Test exact match
-        self.assertTrue(match_gitignore_patterns("dir/subdir", parsed))
-        # Test subdirectory path
-        self.assertTrue(match_gitignore_patterns("dir/subdir/file.txt", parsed))
+        self.assertTrue(match_sparse_patterns("docs", parsed, path_is_dir=True))
+        self.assertFalse(match_sparse_patterns("docs", parsed))  # file named docs
+        self.assertFalse(match_sparse_patterns("A", parsed, path_is_dir=True))
+        self.assertFalse(match_sparse_patterns("A/B", parsed, path_is_dir=True))
+        self.assertFalse(match_sparse_patterns("A/docs", parsed, path_is_dir=True))
+        self.assertFalse(match_sparse_patterns("A/docs", parsed))
+        self.assertFalse(match_sparse_patterns("A/B/docs", parsed))
+        self.assertFalse(match_sparse_patterns("A/B/docs", parsed, path_is_dir=True))
+
+    def test_anchored_subpath(self):
+        """Test anchored subpath pattern matching."""
+        parsed = parse_sparse_patterns(["/A/B"])
+        # TODO: should this also match the dir "A" (positively?)
+        # self.assertTrue(match_sparse_patterns("A", parsed, path_is_dir=True))
+        # self.assertFalse(match_sparse_patterns("A", parsed, path_is_dir=False))
+        # Test exact match (both as file and dir, not dir-only pattern)
+        self.assertTrue(match_sparse_patterns("A/B", parsed))
+        self.assertTrue(match_sparse_patterns("A/B", parsed, path_is_dir=True))
+        # Test subdirectory path (file and dir)
+        self.assertTrue(match_sparse_patterns("A/B/file.txt", parsed))
+        self.assertTrue(match_sparse_patterns("A/B/C", parsed, path_is_dir=True))
         # Test non-matching path
-        self.assertFalse(match_gitignore_patterns("otherdir/subdir", parsed))
-
-    def test_pattern_matching_edge_cases(self):
-        """Test various edge cases in pattern matching."""
-        # Test exact equality with an anchored pattern
-        parsed = parse_sparse_patterns(["/foo"])
-        self.assertTrue(match_gitignore_patterns("foo", parsed))
-
-        # Test with path_is_dir=True
-        self.assertTrue(match_gitignore_patterns("foo", parsed, path_is_dir=True))
-
-        # Test exact match with pattern with dir_only=True
-        parsed = parse_sparse_patterns(["/bar/"])
-        self.assertTrue(match_gitignore_patterns("bar", parsed, path_is_dir=True))
-
-        # Test startswith match for anchored pattern
-        parsed = parse_sparse_patterns(["/prefix"])
-        self.assertTrue(
-            match_gitignore_patterns("prefix/subdirectory/file.txt", parsed)
-        )
+        self.assertFalse(match_sparse_patterns("X", parsed, path_is_dir=True))
+        self.assertFalse(match_sparse_patterns("X/Y", parsed, path_is_dir=True))
 
 
 class ComputeIncludedPathsFullTests(TestCase):