Procházet zdrojové kódy

Support recursive cone mode (#2043)

Jelmer Vernooij před 3 týdny
rodič
revize
44f64531f0
2 změnil soubory, kde provedl 155 přidání a 132 odebrání
  1. 56 47
      dulwich/sparse_patterns.py
  2. 99 85
      tests/test_sparse_patterns.py

+ 56 - 47
dulwich/sparse_patterns.py

@@ -28,7 +28,7 @@ __all__ = [
     "compute_included_paths_cone",
     "compute_included_paths_cone",
     "compute_included_paths_full",
     "compute_included_paths_full",
     "determine_included_paths",
     "determine_included_paths",
-    "match_gitignore_patterns",
+    "match_sparse_patterns",
     "parse_sparse_patterns",
     "parse_sparse_patterns",
 ]
 ]
 
 
@@ -86,8 +86,8 @@ def compute_included_paths_full(index: Index, lines: Sequence[str]) -> set[str]:
     included = set()
     included = set()
     for path_bytes, entry in index.items():
     for path_bytes, entry in index.items():
         path_str = path_bytes.decode("utf-8")
         path_str = path_bytes.decode("utf-8")
-        # For .gitignore logic, match_gitignore_patterns returns True if 'included'
-        if match_gitignore_patterns(path_str, parsed, path_is_dir=False):
+        # For .gitignore logic, match_sparse_patterns returns True if 'included'
+        if match_sparse_patterns(path_str, parsed, path_is_dir=False):
             included.add(path_str)
             included.add(path_str)
     return included
     return included
 
 
@@ -252,9 +252,10 @@ def parse_sparse_patterns(lines: Sequence[str]) -> list[tuple[str, bool, bool, b
     anchoring, and directory-only markers, and returns data suitable for matching.
     anchoring, and directory-only markers, and returns data suitable for matching.
 
 
     Example:
     Example:
-      ``line = "/*.txt" -> ("/.txt", False, False, True)``
-      ``line = "!/docs/" -> ("/docs/", True, True, True)``
-      ``line = "mydir/" -> ("mydir/", False, True, False)`` not anchored, no leading "/"
+      ``line = "*.txt" -> ("*.txt", False, False, False)`` not negated/dir/anchored
+      ``line = "/*.txt" -> ("*.txt", False, False, True)`` anchored, not negated/dir
+      ``line = "!/*.txt" -> ("*.txt", True, False, True)`` anchored/negated, not dir
+      ``line = "!/mydir/" -> ("mydir", True, True, True)`` anchored/negated/dir
 
 
     Args:
     Args:
       lines: A list of raw lines (strings) from the sparse-checkout file.
       lines: A list of raw lines (strings) from the sparse-checkout file.
@@ -277,19 +278,16 @@ def parse_sparse_patterns(lines: Sequence[str]) -> list[tuple[str, bool, bool, b
         if anchored:
         if anchored:
             line = line[1:]  # remove leading '/'
             line = line[1:]  # remove leading '/'
 
 
-        # If pattern ends with '/', we consider it directory-only
-        # (like "docs/"). Real Git might treat it slightly differently,
-        # but we'll simplify and mark it as "dir_only" if it ends in "/".
-        dir_only = False
-        if line.endswith("/"):
-            dir_only = True
+        # If pattern ends with '/', we consider it directory-only (like "docs/").
+        dir_only = line.endswith("/")
+        if dir_only:
             line = line[:-1]
             line = line[:-1]
 
 
         results.append((line, negation, dir_only, anchored))
         results.append((line, negation, dir_only, anchored))
     return results
     return results
 
 
 
 
-def match_gitignore_patterns(
+def match_sparse_patterns(
     path_str: str,
     path_str: str,
     parsed_patterns: Sequence[tuple[str, bool, bool, bool]],
     parsed_patterns: Sequence[tuple[str, bool, bool, bool]],
     path_is_dir: bool = False,
     path_is_dir: bool = False,
@@ -301,16 +299,22 @@ def match_gitignore_patterns(
       2. If a pattern matches, we set the "include" state depending on negation.
       2. If a pattern matches, we set the "include" state depending on negation.
       3. Later matches override earlier ones.
       3. Later matches override earlier ones.
 
 
-    In a .gitignore sense, lines that do not start with '!' are "ignore" patterns,
-    lines that start with '!' are "unignore" (re-include). But in sparse checkout,
-    it's effectively reversed: a non-negation line is "include," negation is "exclude."
-    However, many flows still rely on the same final logic: the last matching pattern
+    In a sparse checkout, lines that do not start with '!' are positive patterns,
+    indicating files/directories to check out (include in the index), and those that
+    start with '!' are negative ('negated'), meaning they indicate files not to check
+    out (not included in the index). This is fairly straightforward.
+
+    In a .gitignore, it's the same syntax but with a reverse effect: positive means
+    "ignore" (exclude from the index) and negative means "unignore" (re-include in the
+    index).
+
+    Many routines still rely on the same final logic: the last matching pattern
     decides "excluded" vs. "included."
     decides "excluded" vs. "included."
 
 
     We'll interpret "include" as returning True, "exclude" as returning False.
     We'll interpret "include" as returning True, "exclude" as returning False.
 
 
-    Each pattern can include negation (!), directory-only markers, or be anchored
-    to the start of the path. The last matching pattern determines whether the
+    Each pattern can include negation ('!'), directory-only markers ('/' as suffix), or
+    be anchored ('/' as prefix). The last matching pattern determines whether the
     path is ultimately included or excluded.
     path is ultimately included or excluded.
 
 
     Args:
     Args:
@@ -322,9 +326,8 @@ def match_gitignore_patterns(
     Returns:
     Returns:
       True if the path is included by the last matching pattern, False otherwise.
       True if the path is included by the last matching pattern, False otherwise.
     """
     """
-    # Start by assuming "excluded" (like a .gitignore starts by including everything
-    # until matched, but for sparse-checkout we often treat unmatched as "excluded").
-    # We will flip if we match an "include" pattern.
+    # Start by assuming "excluded". Like how .gitignore initially includes everything
+    # until matched, but reversed: sparse-checkout initially excludes everything.
     is_included = False
     is_included = False
 
 
     for pattern, negation, dir_only, anchored in parsed_patterns:
     for pattern, negation, dir_only, anchored in parsed_patterns:
@@ -336,37 +339,43 @@ def match_gitignore_patterns(
                 matched = True
                 matched = True
         else:
         else:
             matched = False
             matched = False
-        # If dir_only is True and path_is_dir is False, we skip matching
+        # If dir_only is True, handle directory-only matching separately
         if dir_only and not matched:
         if dir_only and not matched:
             if path_str == pattern + "/":
             if path_str == pattern + "/":
                 matched = not forbidden_path
                 matched = not forbidden_path
             elif fnmatch(path_str, f"{pattern}/*"):
             elif fnmatch(path_str, f"{pattern}/*"):
                 matched = True  # root subpath (anchored or unanchored)
                 matched = True  # root subpath (anchored or unanchored)
             elif not anchored:
             elif not anchored:
-                matched = fnmatch(path_str, f"*/{pattern}/*")  # unanchored subpath
-
-        # If anchored is True, pattern should match from the start of path_str.
-        # If not anchored, we can match anywhere.
-        if anchored and not matched:
-            # We match from the beginning. For example, pattern = "docs"
-            # path_str = "docs/readme.md" -> start is "docs"
-            # We'll just do a prefix check or prefix + slash check
-            # Or you can do a partial fnmatch. We'll do a manual approach:
-            if pattern == "":
-                # Means it was just "/", which can happen if line was "/"
-                # That might represent top-level only?
-                # We'll skip for simplicity or treat it as a special case.
-                continue
-            elif path_str == pattern:
-                matched = True
-            elif path_str.startswith(pattern + "/"):
-                matched = True
-            else:
-                matched = False
-        elif not matched:
-            # Not anchored: we can do a simple wildcard match or a substring match.
-            # For simplicity, let's use Python's fnmatch:
-            matched = fnmatch(path_str, pattern) or fnmatch(path_str, f"*/{pattern}")
+                # For unanchored dir-only patterns, match the directory at any level
+                # e.g., "docs/" should match "A/docs" as a directory and "A/docs/*" as files within
+                if fnmatch(path_str, f"*/{pattern}") and path_is_dir:
+                    matched = True
+                elif fnmatch(path_str, f"*/{pattern}/*"):
+                    matched = True  # unanchored subpath
+        # Only patterns that are not dir-only use the anchored/unanchored matching below; a dir-only pattern never enters this branch
+        elif not dir_only:
+            # If anchored is True, pattern should match from the start of path_str.
+            # If not anchored, we can match anywhere.
+            if anchored and not matched:
+                # We match from the beginning. For example, pattern = "docs"
+                # path_str = "docs/readme.md" -> start is "docs"
+                # We'll just do a prefix check or prefix + slash check
+                # Or you can do a partial fnmatch. We'll do a manual approach:
+                if pattern == "":
+                    # Means it was just "/", which should match everything recursively
+                    matched = True
+                elif path_str == pattern:
+                    matched = True
+                elif path_str.startswith(pattern + "/"):
+                    matched = True
+                else:
+                    matched = False
+            elif not matched:
+                # Not anchored: we can do a simple wildcard match or a substring match.
+                # For simplicity, let's use Python's fnmatch:
+                matched = fnmatch(path_str, pattern) or fnmatch(
+                    path_str, f"*/{pattern}"
+                )
 
 
         if matched:
         if matched:
             # If negation is True, that means 'exclude'. If negation is False, 'include'.
             # If negation is True, that means 'exclude'. If negation is False, 'include'.

+ 99 - 85
tests/test_sparse_patterns.py

@@ -37,7 +37,7 @@ from dulwich.sparse_patterns import (
     compute_included_paths_cone,
     compute_included_paths_cone,
     compute_included_paths_full,
     compute_included_paths_full,
     determine_included_paths,
     determine_included_paths,
-    match_gitignore_patterns,
+    match_sparse_patterns,
     parse_sparse_patterns,
     parse_sparse_patterns,
 )
 )
 
 
@@ -57,44 +57,41 @@ class ParseSparsePatternsTests(TestCase):
         parsed = parse_sparse_patterns(lines)
         parsed = parse_sparse_patterns(lines)
         self.assertEqual(parsed, [])
         self.assertEqual(parsed, [])
 
 
-    def test_simple_patterns(self):
+    def test_sparse_pattern_combos(self):
         lines = [
         lines = [
-            "*.py",
-            "!*.md",
-            "/docs/",
-            "!/docs/images/",
-        ]
-        parsed = parse_sparse_patterns(lines)
-        self.assertEqual(len(parsed), 4)
-
-        self.assertEqual(parsed[0], ("*.py", False, False, False))  # include *.py
-        self.assertEqual(parsed[1], ("*.md", True, False, False))  # exclude *.md
-        self.assertEqual(parsed[2], ("docs", False, True, True))  # anchored, dir_only
-        self.assertEqual(parsed[3], ("docs/images", True, True, True))
-
-    def test_trailing_slash_dir(self):
-        lines = [
-            "src/",
+            "*.py",  # Python files anywhere
+            "!*.md",  # markdown files anywhere
+            "/docs/",  # root docs dir
+            "!/docs/images/",  # no root docs/images subdir
+            "src/",  # src dir anywhere
+            "/*.toml",  # root TOML files
+            "!/*.bak",  # no root backup files
+            "!data/",  # no data dirs anywhere
         ]
         ]
         parsed = parse_sparse_patterns(lines)
         parsed = parse_sparse_patterns(lines)
-        # "src/" => (pattern="src", negation=False, dir_only=True, anchored=False)
-        self.assertEqual(parsed, [("src", False, True, False)])
+        self.assertEqual(len(parsed), 8)
 
 
-    def test_negation_anchor(self):
-        lines = [
-            "!/foo.txt",
-        ]
-        parsed = parse_sparse_patterns(lines)
-        # => (pattern="foo.txt", negation=True, dir_only=False, anchored=True)
-        self.assertEqual(parsed, [("foo.txt", True, False, True)])
+        # Returns a 4-tuple of: (pattern, negation, dir_only, anchored)
+        self.assertEqual(parsed[0], ("*.py", False, False, False))  # _,_,_
+        self.assertEqual(parsed[1], ("*.md", True, False, False))  # N,_,_
+        self.assertEqual(parsed[2], ("docs", False, True, True))  # _,D,A
+        self.assertEqual(parsed[3], ("docs/images", True, True, True))  # N,D,A
+        self.assertEqual(parsed[4], ("src", False, True, False))  # _,D,_
+        self.assertEqual(parsed[5], ("*.toml", False, False, True))  # _,_,A
+        self.assertEqual(parsed[6], ("*.bak", True, False, True))  # N,_,A
+        self.assertEqual(parsed[7], ("data", True, True, False))  # N,D,_
 
 
 
 
-class MatchGitignorePatternsTests(TestCase):
-    """Test the match_gitignore_patterns function."""
+class MatchSparsePatternsTests(TestCase):
+    """Test the match_sparse_patterns function."""
 
 
+    # def match_sparse_patterns(path_str, parsed_patterns, path_is_dir=False):
     def test_no_patterns_returns_excluded(self):
     def test_no_patterns_returns_excluded(self):
         """If no patterns are provided, by default we treat the path as excluded."""
         """If no patterns are provided, by default we treat the path as excluded."""
-        self.assertFalse(match_gitignore_patterns("anyfile.py", []))
+        self.assertFalse(match_sparse_patterns("foo.py", [], path_is_dir=False))
+        self.assertFalse(match_sparse_patterns("A/", [], path_is_dir=True))
+        self.assertFalse(match_sparse_patterns("A/B/", [], path_is_dir=True))
+        self.assertFalse(match_sparse_patterns("A/B/bar.md", [], path_is_dir=False))
 
 
     def test_last_match_wins(self):
     def test_last_match_wins(self):
         """Checks that the last pattern to match determines included vs excluded."""
         """Checks that the last pattern to match determines included vs excluded."""
@@ -106,78 +103,95 @@ class MatchGitignorePatternsTests(TestCase):
         )
         )
         # "foo.py" matches first pattern => included
         # "foo.py" matches first pattern => included
         # then matches second pattern => excluded
         # then matches second pattern => excluded
-        self.assertFalse(match_gitignore_patterns("foo.py", parsed))
+        self.assertFalse(match_sparse_patterns("foo.py", parsed))
+        self.assertFalse(match_sparse_patterns("A/foo.py", parsed))
+        self.assertFalse(match_sparse_patterns("A/B/foo.py", parsed))
+        self.assertTrue(match_sparse_patterns("bar.py", parsed))
+        self.assertTrue(match_sparse_patterns("A/bar.py", parsed))
+        self.assertTrue(match_sparse_patterns("A/B/bar.py", parsed))
+        self.assertFalse(match_sparse_patterns("bar.md", parsed))
+        self.assertFalse(match_sparse_patterns("A/bar.md", parsed))
+        self.assertFalse(match_sparse_patterns("A/B", parsed, path_is_dir=True))
+        self.assertFalse(match_sparse_patterns("A/B", parsed, path_is_dir=True))
+        self.assertFalse(match_sparse_patterns(".cache", parsed, path_is_dir=True))
 
 
     def test_dir_only(self):
     def test_dir_only(self):
         """A pattern with a trailing slash should only match directories and subdirectories."""
         """A pattern with a trailing slash should only match directories and subdirectories."""
         parsed = parse_sparse_patterns(["docs/"])
         parsed = parse_sparse_patterns(["docs/"])
-        # Because we set path_is_dir=False, it won't match
-        self.assertTrue(
-            match_gitignore_patterns("docs/readme.md", parsed, path_is_dir=False)
-        )
-        self.assertTrue(match_gitignore_patterns("docs", parsed, path_is_dir=True))
+        # The directory pattern is not rooted, so can be at any level
+        self.assertTrue(match_sparse_patterns("docs", parsed, path_is_dir=True))
+        self.assertTrue(match_sparse_patterns("A/docs", parsed, path_is_dir=True))
+        self.assertTrue(match_sparse_patterns("A/B/docs", parsed, path_is_dir=True))
         # Even if the path name is "docs", if it's a file, won't match:
         # Even if the path name is "docs", if it's a file, won't match:
-        self.assertFalse(match_gitignore_patterns("docs", parsed, path_is_dir=False))
+        self.assertFalse(match_sparse_patterns("docs", parsed, path_is_dir=False))
+        self.assertFalse(match_sparse_patterns("A/docs", parsed, path_is_dir=False))
+        self.assertFalse(match_sparse_patterns("A/B/docs", parsed, path_is_dir=False))
+        # Subfiles and subdirs of the included dir should match
+        self.assertTrue(match_sparse_patterns("docs/x.md", parsed))
+        self.assertTrue(match_sparse_patterns("docs/A/x.md", parsed))
+        self.assertTrue(match_sparse_patterns("docs/A/B/x.md", parsed))
+        self.assertTrue(match_sparse_patterns("docs/A", parsed, path_is_dir=True))
+        self.assertTrue(match_sparse_patterns("docs/A/B", parsed, path_is_dir=True))
+        self.assertTrue(match_sparse_patterns("docs", parsed, path_is_dir=True))
 
 
     def test_anchored(self):
     def test_anchored(self):
         """Anchored patterns match from the start of the path only."""
         """Anchored patterns match from the start of the path only."""
-        parsed = parse_sparse_patterns(["/foo"])
-        self.assertTrue(match_gitignore_patterns("foo", parsed))
+        parsed = parse_sparse_patterns(["/foo"])  # Can be file or dir, must be at root
+        self.assertTrue(match_sparse_patterns("foo", parsed))
+        self.assertTrue(match_sparse_patterns("foo", parsed, path_is_dir=True))
         # But "A/foo" doesn't match because an anchored pattern must match from the root
         # But "A/foo" doesn't match because an anchored pattern must match from the root
-        self.assertFalse(match_gitignore_patterns("some/foo", parsed))
+        self.assertFalse(match_sparse_patterns("A/foo", parsed))
+        self.assertFalse(match_sparse_patterns("A/foo", parsed, path_is_dir=True))
 
 
-    def test_unanchored_uses_fnmatch(self):
+    def test_unanchored(self):
         parsed = parse_sparse_patterns(["foo"])
         parsed = parse_sparse_patterns(["foo"])
-        self.assertTrue(match_gitignore_patterns("some/foo", parsed))
-        self.assertFalse(match_gitignore_patterns("some/bar", parsed))
+        self.assertTrue(match_sparse_patterns("foo", parsed))
+        self.assertTrue(match_sparse_patterns("foo", parsed, path_is_dir=True))
+        # Unanchored, so the pattern also matches below the root (e.g. "A/foo")
+        self.assertTrue(match_sparse_patterns("A/foo", parsed))
+        self.assertTrue(match_sparse_patterns("A/foo", parsed, path_is_dir=True))
+        self.assertFalse(match_sparse_patterns("bar", parsed))
+        self.assertFalse(match_sparse_patterns("A/bar", parsed))
 
 
     def test_anchored_empty_pattern(self):
     def test_anchored_empty_pattern(self):
         """Test handling of empty pattern with anchoring (e.g., '/')."""
         """Test handling of empty pattern with anchoring (e.g., '/')."""
+        # `/` should be recursive match of all files
         parsed = parse_sparse_patterns(["/"])
         parsed = parse_sparse_patterns(["/"])
-        # Check the structure of the parsed empty pattern first
-        self.assertEqual(parsed, [("", False, False, True)])
-        # When the pattern is empty with anchoring, it's continued (skipped) in match_gitignore_patterns
-        # for non-empty paths but for empty string it might match due to empty string comparisons
-        self.assertFalse(match_gitignore_patterns("foo", parsed))
-        # An empty string with empty pattern will match (implementation detail)
-        self.assertTrue(match_gitignore_patterns("", parsed))
-
-    def test_anchored_dir_only_exact_match(self):
-        """Test anchored directory-only patterns with exact matching."""
+        self.assertEqual(parsed, [("", False, False, True)])  # anchored
+        self.assertTrue(match_sparse_patterns("", parsed, path_is_dir=True))
+        self.assertTrue(match_sparse_patterns("A", parsed, path_is_dir=True))
+        self.assertTrue(match_sparse_patterns("A/B", parsed, path_is_dir=True))
+        self.assertTrue(match_sparse_patterns("foo", parsed))
+        self.assertTrue(match_sparse_patterns("A/foo", parsed))
+        self.assertTrue(match_sparse_patterns("A/B/foo", parsed))
+
+    def test_anchored_dir_only(self):
+        """Test anchored directory-only patterns."""
         parsed = parse_sparse_patterns(["/docs/"])
         parsed = parse_sparse_patterns(["/docs/"])
-        # Test with exact match "docs" and path_is_dir=True
-        self.assertTrue(match_gitignore_patterns("docs", parsed, path_is_dir=True))
-        # Test with "docs/" (exact match + trailing slash)
-        self.assertTrue(match_gitignore_patterns("docs/", parsed, path_is_dir=True))
-
-    def test_complex_anchored_patterns(self):
-        """Test more complex anchored pattern matching."""
-        parsed = parse_sparse_patterns(["/dir/subdir"])
-        # Test exact match
-        self.assertTrue(match_gitignore_patterns("dir/subdir", parsed))
-        # Test subdirectory path
-        self.assertTrue(match_gitignore_patterns("dir/subdir/file.txt", parsed))
+        self.assertTrue(match_sparse_patterns("docs", parsed, path_is_dir=True))
+        self.assertFalse(match_sparse_patterns("docs", parsed))  # file named docs
+        self.assertFalse(match_sparse_patterns("A", parsed, path_is_dir=True))
+        self.assertFalse(match_sparse_patterns("A/B", parsed, path_is_dir=True))
+        self.assertFalse(match_sparse_patterns("A/docs", parsed, path_is_dir=True))
+        self.assertFalse(match_sparse_patterns("A/docs", parsed))
+        self.assertFalse(match_sparse_patterns("A/B/docs", parsed))
+        self.assertFalse(match_sparse_patterns("A/B/docs", parsed, path_is_dir=True))
+
+    def test_anchored_subpath(self):
+        """Test anchored subpath pattern matching."""
+        parsed = parse_sparse_patterns(["/A/B"])
+        # TODO: should this also match the dir "A" (positively?)
+        # self.assertTrue(match_sparse_patterns("A", parsed, path_is_dir=True))
+        # self.assertFalse(match_sparse_patterns("A", parsed, path_is_dir=False))
+        # Test exact match (both as file and dir, not dir-only pattern)
+        self.assertTrue(match_sparse_patterns("A/B", parsed))
+        self.assertTrue(match_sparse_patterns("A/B", parsed, path_is_dir=True))
+        # Test subdirectory path (file and dir)
+        self.assertTrue(match_sparse_patterns("A/B/file.txt", parsed))
+        self.assertTrue(match_sparse_patterns("A/B/C", parsed, path_is_dir=True))
         # Test non-matching path
         # Test non-matching path
-        self.assertFalse(match_gitignore_patterns("otherdir/subdir", parsed))
-
-    def test_pattern_matching_edge_cases(self):
-        """Test various edge cases in pattern matching."""
-        # Test exact equality with an anchored pattern
-        parsed = parse_sparse_patterns(["/foo"])
-        self.assertTrue(match_gitignore_patterns("foo", parsed))
-
-        # Test with path_is_dir=True
-        self.assertTrue(match_gitignore_patterns("foo", parsed, path_is_dir=True))
-
-        # Test exact match with pattern with dir_only=True
-        parsed = parse_sparse_patterns(["/bar/"])
-        self.assertTrue(match_gitignore_patterns("bar", parsed, path_is_dir=True))
-
-        # Test startswith match for anchored pattern
-        parsed = parse_sparse_patterns(["/prefix"])
-        self.assertTrue(
-            match_gitignore_patterns("prefix/subdirectory/file.txt", parsed)
-        )
+        self.assertFalse(match_sparse_patterns("X", parsed, path_is_dir=True))
+        self.assertFalse(match_sparse_patterns("X/Y", parsed, path_is_dir=True))
 
 
 
 
 class ComputeIncludedPathsFullTests(TestCase):
 class ComputeIncludedPathsFullTests(TestCase):