Parcourir la source

Update filter system for LFS support

- Enhance filter infrastructure to support LFS
- Update attribute handling for filters
Jelmer Vernooij il y a 2 mois
Parent
commit
d1ee4f1a18
3 fichiers modifiés avec 304 ajouts et 262 suppressions
  1. 14 11
      dulwich/attrs.py
  2. 120 23
      dulwich/filters.py
  3. 170 228
      tests/test_filters.py

+ 14 - 11
dulwich/attrs.py

@@ -345,28 +345,31 @@ class GitAttributes:
         """
         # Find existing pattern
         pattern_obj = None
-        pattern_index = None
+        attrs_dict: Optional[dict[bytes, AttributeValue]] = None
+        pattern_index = -1
+
         for i, (p, attrs) in enumerate(self._patterns):
             if p.pattern == pattern:
                 pattern_obj = p
+                # Convert to mutable dict
+                attrs_dict = dict(attrs)
                 pattern_index = i
                 break
 
         if pattern_obj is None:
             # Create new pattern
             pattern_obj = Pattern(pattern)
-            attrs_dict: dict[bytes, AttributeValue] = {name: value}
+            attrs_dict = {name: value}
             self._patterns.append((pattern_obj, attrs_dict))
         else:
-            # Update existing pattern
-            # Create a new dict with updated attributes
-            assert (
-                pattern_index is not None
-            )  # pattern_index is set when pattern_obj is found
-            old_attrs = self._patterns[pattern_index][1]
-            new_attrs = dict(old_attrs)
-            new_attrs[name] = value
-            self._patterns[pattern_index] = (pattern_obj, new_attrs)
+            # Update the existing pattern in the list
+            assert pattern_index >= 0
+            self._patterns[pattern_index] = (pattern_obj, attrs_dict)
+
+        # Update the attribute
+        if attrs_dict is None:
+            raise AssertionError("attrs_dict should not be None at this point")
+        attrs_dict[name] = value
 
     def remove_pattern(self, pattern: bytes) -> None:
         """Remove all attributes for a pattern.

+ 120 - 23
dulwich/filters.py

@@ -22,10 +22,9 @@
 """Implementation of Git filter drivers (clean/smudge filters)."""
 
 import subprocess
-from collections.abc import Mapping
 from typing import TYPE_CHECKING, Callable, Optional, Protocol
 
-from .attrs import AttributeValue, Pattern, match_path
+from .attrs import GitAttributes
 from .objects import Blob
 
 if TYPE_CHECKING:
@@ -93,6 +92,10 @@ class FilterRegistry:
 
         # Register built-in filter factories
         self.register_factory("lfs", self._create_lfs_filter)
+        self.register_factory("text", self._create_text_filter)
+
+        # Auto-register line ending filter if autocrlf is enabled
+        self._setup_line_ending_filter()
 
     def register_factory(
         self, name: str, factory: Callable[["FilterRegistry"], FilterDriver]
@@ -112,9 +115,9 @@ class FilterRegistry:
 
         # Try to create from factory
         if name in self._factories:
-            driver = self._factories[name](self)
-            self._drivers[name] = driver
-            return driver
+            factory_driver = self._factories[name](self)
+            self._drivers[name] = factory_driver
+            return factory_driver
 
         # Try to create from config
         if self.config is not None:
@@ -135,21 +138,21 @@ class FilterRegistry:
 
         # Get clean command
         try:
-            clean_value = self.config.get(("filter", name), "clean")
-            if isinstance(clean_value, bytes):
-                clean_cmd = clean_value.decode("utf-8")
+            clean_cmd_raw = self.config.get(("filter", name), "clean")
+            if isinstance(clean_cmd_raw, bytes):
+                clean_cmd = clean_cmd_raw.decode("utf-8")
             else:
-                clean_cmd = clean_value
+                clean_cmd = clean_cmd_raw
         except KeyError:
             pass
 
         # Get smudge command
         try:
-            smudge_value = self.config.get(("filter", name), "smudge")
-            if isinstance(smudge_value, bytes):
-                smudge_cmd = smudge_value.decode("utf-8")
+            smudge_cmd_raw = self.config.get(("filter", name), "smudge")
+            if isinstance(smudge_cmd_raw, bytes):
+                smudge_cmd = smudge_cmd_raw.decode("utf-8")
             else:
-                smudge_cmd = smudge_value
+                smudge_cmd = smudge_cmd_raw
         except KeyError:
             pass
 
@@ -174,30 +177,99 @@ class FilterRegistry:
 
         return LFSFilterDriver(lfs_store)
 
+    def _create_text_filter(self, registry: "FilterRegistry") -> FilterDriver:
+        """Create text filter driver for line ending conversion.
+
+        This filter is used when files have the 'text' attribute set explicitly.
+        It always normalizes line endings on checkin (CRLF -> LF).
+        """
+        from .line_ending import (
+            LineEndingFilter,
+            convert_crlf_to_lf,
+            get_smudge_filter,
+        )
+
+        if self.config is None:
+            # Default text filter: always normalize on checkin
+            return LineEndingFilter(
+                clean_conversion=convert_crlf_to_lf,
+                smudge_conversion=None,
+                binary_detection=True,
+            )
+
+        # Get core.eol and core.autocrlf settings for smudge behavior
+        try:
+            core_eol_raw = self.config.get("core", "eol")
+            core_eol: str = (
+                core_eol_raw.decode("ascii")
+                if isinstance(core_eol_raw, bytes)
+                else core_eol_raw
+            )
+        except KeyError:
+            core_eol = "native"
+
+        # Parse autocrlf as bytes (can be b"true", b"input", or b"false")
+        try:
+            autocrlf_raw = self.config.get("core", "autocrlf")
+            autocrlf: bytes = (
+                autocrlf_raw.lower()
+                if isinstance(autocrlf_raw, bytes)
+                else str(autocrlf_raw).lower().encode("ascii")
+            )
+        except KeyError:
+            autocrlf = b"false"
+
+        # For explicit text attribute:
+        # - Always normalize to LF on checkin (clean)
+        # - Smudge behavior depends on core.eol and core.autocrlf
+        smudge_filter = get_smudge_filter(core_eol, autocrlf)
+        clean_filter = convert_crlf_to_lf
+
+        return LineEndingFilter(
+            clean_conversion=clean_filter,
+            smudge_conversion=smudge_filter,
+            binary_detection=True,
+        )
+
+    def _setup_line_ending_filter(self) -> None:
+        """Automatically register line ending filter if configured."""
+        if self.config is None:
+            return
+
+        # Parse autocrlf as bytes
+        try:
+            autocrlf_raw = self.config.get("core", "autocrlf")
+            autocrlf: bytes = (
+                autocrlf_raw.lower()
+                if isinstance(autocrlf_raw, bytes)
+                else str(autocrlf_raw).lower().encode("ascii")
+            )
+        except KeyError:
+            return
+
+        # If autocrlf is enabled, register the text filter
+        if autocrlf in (b"true", b"input"):
+            # Pre-create the text filter so it's available
+            self.get_driver("text")
+
 
 def get_filter_for_path(
     path: bytes,
-    gitattributes: dict[bytes, dict[bytes, AttributeValue]],
+    gitattributes: "GitAttributes",
     filter_registry: FilterRegistry,
 ) -> Optional[FilterDriver]:
     """Get the appropriate filter driver for a given path.
 
     Args:
         path: Path to check
-        gitattributes: Parsed gitattributes (pattern -> attributes mapping)
+        gitattributes: GitAttributes object with parsed patterns
         filter_registry: Registry of filter drivers
 
     Returns:
         FilterDriver instance or None
     """
-    # Convert gitattributes dict to list of (Pattern, attrs) tuples
-    patterns: list[tuple[Pattern, Mapping[bytes, AttributeValue]]] = []
-    for pattern_bytes, attrs in gitattributes.items():
-        pattern = Pattern(pattern_bytes)
-        patterns.append((pattern, attrs))
-
     # Get all attributes for this path
-    attributes = match_path(patterns, path)
+    attributes = gitattributes.match_path(path)
 
     # Check if there's a filter attribute
     filter_name = attributes.get(b"filter")
@@ -209,6 +281,31 @@ def get_filter_for_path(
             return filter_registry.get_driver(filter_name_str)
         return None
 
+    # Check for text attribute
+    text_attr = attributes.get(b"text")
+    if text_attr is True:
+        # Use the text filter for line ending conversion
+        return filter_registry.get_driver("text")
+    elif text_attr is False:
+        # -text means binary, no conversion
+        return None
+
+    # If no explicit text attribute, check if autocrlf is enabled
+    # When autocrlf is true/input, files are treated as text by default
+    if filter_registry.config is not None:
+        try:
+            autocrlf_raw = filter_registry.config.get("core", "autocrlf")
+            autocrlf: bytes = (
+                autocrlf_raw.lower()
+                if isinstance(autocrlf_raw, bytes)
+                else str(autocrlf_raw).lower().encode("ascii")
+            )
+            if autocrlf in (b"true", b"input"):
+                # Use text filter for files without explicit attributes
+                return filter_registry.get_driver("text")
+        except KeyError:
+            pass
+
     return None
 
 
@@ -221,7 +318,7 @@ class FilterBlobNormalizer:
     def __init__(
         self,
         config_stack: Optional["StackedConfig"],
-        gitattributes: dict[bytes, dict[bytes, AttributeValue]],
+        gitattributes: GitAttributes,
         filter_registry: Optional[FilterRegistry] = None,
         repo=None,
     ) -> None:

+ 170 - 228
tests/test_filters.py

@@ -1,4 +1,4 @@
-# test_filters.py -- tests for filter drivers
+# test_filters.py -- Tests for filters
 # Copyright (C) 2024 Jelmer Vernooij <jelmer@jelmer.uk>
 #
 # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
@@ -19,239 +19,181 @@
 # License, Version 2.0.
 #
 
-"""Tests for filter drivers support."""
+"""Tests for filters."""
 
-import sys
-from unittest import skipIf
+import os
+import tempfile
+import unittest
 
-from dulwich.config import ConfigDict
-from dulwich.filters import (
-    FilterBlobNormalizer,
-    FilterRegistry,
-    ProcessFilterDriver,
-    get_filter_for_path,
-)
-from dulwich.objects import Blob
+from dulwich import porcelain
+from dulwich.repo import Repo
 
 from . import TestCase
 
 
-class ProcessFilterDriverTests(TestCase):
-    @skipIf(sys.platform == "win32", "Unix shell commands")
-    def test_clean_filter(self) -> None:
-        """Test clean filter with external command."""
-        # Use a simple command that converts to uppercase
-        driver = ProcessFilterDriver(clean_cmd="tr '[:lower:]' '[:upper:]'")
-        result = driver.clean(b"hello world")
-        self.assertEqual(result, b"HELLO WORLD")
-
-    @skipIf(sys.platform == "win32", "Unix shell commands")
-    def test_smudge_filter(self) -> None:
-        """Test smudge filter with external command."""
-        # Use a simple command that converts to lowercase
-        driver = ProcessFilterDriver(smudge_cmd="tr '[:upper:]' '[:lower:]'")
-        result = driver.smudge(b"HELLO WORLD")
-        self.assertEqual(result, b"hello world")
-
-    def test_no_filters(self) -> None:
-        """Test driver with no filters configured."""
-        driver = ProcessFilterDriver()
-        data = b"test data"
-        self.assertEqual(driver.clean(data), data)
-        self.assertEqual(driver.smudge(data), data)
-
-    @skipIf(sys.platform == "win32", "Unix shell commands")
-    def test_failing_filter(self) -> None:
-        """Test that failing filter propagates the error."""
-        import subprocess
-
-        # Use a command that will fail
-        driver = ProcessFilterDriver(clean_cmd="false")
-        data = b"test data"
-        # Should raise CalledProcessError
-        with self.assertRaises(subprocess.CalledProcessError):
-            driver.clean(data)
-
-        # Test smudge filter too
-        driver = ProcessFilterDriver(smudge_cmd="false")
-        with self.assertRaises(subprocess.CalledProcessError):
-            driver.smudge(data)
-
-
-class FilterRegistryTests(TestCase):
-    def setUp(self) -> None:
-        super().setUp()
-        self.config = ConfigDict()
-        self.registry = FilterRegistry(self.config)
-
-    def test_register_and_get_driver(self) -> None:
-        """Test registering and retrieving a driver."""
-        driver = ProcessFilterDriver(clean_cmd="cat")
-        self.registry.register_driver("test", driver)
-
-        retrieved = self.registry.get_driver("test")
-        self.assertIs(retrieved, driver)
-
-    def test_get_nonexistent_driver(self) -> None:
-        """Test getting a non-existent driver."""
-        result = self.registry.get_driver("nonexistent")
-        self.assertIsNone(result)
-
-    def test_register_factory(self) -> None:
-        """Test registering a driver factory."""
-        created_driver = ProcessFilterDriver(clean_cmd="cat")
-
-        def factory(registry):
-            return created_driver
-
-        self.registry.register_factory("test", factory)
-
-        # Getting driver should invoke factory
-        retrieved = self.registry.get_driver("test")
-        self.assertIs(retrieved, created_driver)
-
-        # Second get should return cached instance
-        retrieved2 = self.registry.get_driver("test")
-        self.assertIs(retrieved2, created_driver)
-
-    def test_create_from_config(self) -> None:
-        """Test creating driver from config."""
-        # Set up config using the proper Config interface
-        self.config.set(("filter", "test"), "clean", b"cat")
-        self.config.set(("filter", "test"), "smudge", b"tac")
-
-        # Get driver (should be created from config)
-        driver = self.registry.get_driver("test")
-        self.assertIsNotNone(driver)
-        self.assertIsInstance(driver, ProcessFilterDriver)
-        self.assertEqual(driver.clean_cmd, "cat")
-        self.assertEqual(driver.smudge_cmd, "tac")
-
-    def test_builtin_lfs_factory(self) -> None:
-        """Test that LFS filter is available as a built-in."""
-        from dulwich.lfs import LFSFilterDriver
-
-        # Should be able to get LFS filter without explicit registration
-        driver = self.registry.get_driver("lfs")
-        self.assertIsNotNone(driver)
-        self.assertIsInstance(driver, LFSFilterDriver)
-
-
-class GetFilterForPathTests(TestCase):
-    def setUp(self) -> None:
-        super().setUp()
-        self.registry = FilterRegistry()
-        self.driver = ProcessFilterDriver(clean_cmd="cat")
-        self.registry.register_driver("test", self.driver)
-
-    def test_get_filter_for_path(self) -> None:
-        """Test getting filter for a path with filter attribute."""
-        gitattributes = {
-            b"*.txt": {b"filter": b"test"},
-        }
-
-        result = get_filter_for_path(b"file.txt", gitattributes, self.registry)
-        self.assertIs(result, self.driver)
-
-    def test_no_filter_attribute(self) -> None:
-        """Test path with no filter attribute."""
-        gitattributes = {
-            b"*.txt": {b"text": b"auto"},
-        }
-
-        result = get_filter_for_path(b"file.txt", gitattributes, self.registry)
-        self.assertIsNone(result)
+class GitAttributesFilterIntegrationTests(TestCase):
+    """Test gitattributes integration with filter drivers."""
 
-    def test_no_matching_pattern(self) -> None:
-        """Test path with no matching pattern."""
-        gitattributes = {
-            b"*.jpg": {b"filter": b"test"},
-        }
-
-        result = get_filter_for_path(b"file.txt", gitattributes, self.registry)
-        self.assertIsNone(result)
-
-    def test_filter_not_registered(self) -> None:
-        """Test path with filter that's not registered."""
-        gitattributes = {
-            b"*.txt": {b"filter": b"nonexistent"},
-        }
-
-        result = get_filter_for_path(b"file.txt", gitattributes, self.registry)
-        self.assertIsNone(result)
-
-
-class FilterBlobNormalizerTests(TestCase):
     def setUp(self) -> None:
         super().setUp()
-        self.config = ConfigDict()
-        self.registry = FilterRegistry(self.config)
-        self.gitattributes = {}
-        self.normalizer = FilterBlobNormalizer(
-            self.config, self.gitattributes, self.registry
-        )
-
-    def test_no_filter(self) -> None:
-        """Test normalizer with no filter defined."""
-        blob = Blob()
-        blob.data = b"test content"
-
-        # Both checkin and checkout should return blob unchanged
-        result = self.normalizer.checkin_normalize(blob, b"file.txt")
-        self.assertIs(result, blob)
-
-        result = self.normalizer.checkout_normalize(blob, b"file.txt")
-        self.assertIs(result, blob)
-
-    def test_with_filter(self) -> None:
-        """Test normalizer with a filter defined."""
-
-        # Create a simple filter that converts to uppercase on clean
-        # and lowercase on smudge
-        class TestFilter:
-            def clean(self, data):
-                return data.upper()
-
-            def smudge(self, data):
-                return data.lower()
-
-        # Register the filter and set it in gitattributes
-        self.registry.register_driver("test", TestFilter())
-        self.gitattributes[b"*.txt"] = {b"filter": b"test"}
-
-        blob = Blob()
-        blob.data = b"Test Content"
-
-        # Checkin should uppercase
-        result = self.normalizer.checkin_normalize(blob, b"file.txt")
-        self.assertEqual(result.data, b"TEST CONTENT")
-        self.assertIsNot(result, blob)  # Should be a new blob
-
-        # Checkout should lowercase
-        result = self.normalizer.checkout_normalize(blob, b"file.txt")
-        self.assertEqual(result.data, b"test content")
-        self.assertIsNot(result, blob)  # Should be a new blob
-
-    def test_filter_returns_same_data(self) -> None:
-        """Test that normalizer returns same blob if filter doesn't change data."""
-
-        # Create a filter that returns data unchanged
-        class NoOpFilter:
-            def clean(self, data):
-                return data
-
-            def smudge(self, data):
-                return data
-
-        self.registry.register_driver("noop", NoOpFilter())
-        self.gitattributes[b"*.txt"] = {b"filter": b"noop"}
-
-        blob = Blob()
-        blob.data = b"unchanged content"
-
-        # Both operations should return the same blob instance
-        result = self.normalizer.checkin_normalize(blob, b"file.txt")
-        self.assertIs(result, blob)
-
-        result = self.normalizer.checkout_normalize(blob, b"file.txt")
-        self.assertIs(result, blob)
+        self.test_dir = tempfile.mkdtemp()
+        self.addCleanup(self._cleanup_test_dir)
+        self.repo = Repo.init(self.test_dir)
+
+    def _cleanup_test_dir(self) -> None:
+        """Clean up test directory."""
+        import shutil
+
+        shutil.rmtree(self.test_dir)
+
+    def test_gitattributes_text_filter(self) -> None:
+        """Test that text attribute triggers line ending conversion."""
+        # Configure autocrlf first
+        config = self.repo.get_config()
+        config.set((b"core",), b"autocrlf", b"true")
+        config.write_to_path()
+
+        # Create .gitattributes with text attribute
+        gitattributes_path = os.path.join(self.test_dir, ".gitattributes")
+        with open(gitattributes_path, "wb") as f:
+            f.write(b"*.txt text\n")
+            f.write(b"*.bin -text\n")
+
+        # Add .gitattributes
+        porcelain.add(self.repo, paths=[".gitattributes"])
+        porcelain.commit(self.repo, message=b"Add gitattributes")
+
+        # Create text file with CRLF
+        text_file = os.path.join(self.test_dir, "test.txt")
+        with open(text_file, "wb") as f:
+            f.write(b"line1\r\nline2\r\n")
+
+        # Create binary file with CRLF
+        bin_file = os.path.join(self.test_dir, "test.bin")
+        with open(bin_file, "wb") as f:
+            f.write(b"binary\r\ndata\r\n")
+
+        # Add files
+        porcelain.add(self.repo, paths=["test.txt", "test.bin"])
+
+        # Check that text file was normalized
+        index = self.repo.open_index()
+        text_entry = index[b"test.txt"]
+        text_blob = self.repo.object_store[text_entry.sha]
+        self.assertEqual(text_blob.data, b"line1\nline2\n")
+
+        # Check that binary file was not normalized
+        bin_entry = index[b"test.bin"]
+        bin_blob = self.repo.object_store[bin_entry.sha]
+        self.assertEqual(bin_blob.data, b"binary\r\ndata\r\n")
+
+    @unittest.skip("Custom process filters require external commands")
+    def test_gitattributes_custom_filter(self) -> None:
+        """Test custom filter specified in gitattributes."""
+        # Create .gitattributes with custom filter
+        gitattributes_path = os.path.join(self.test_dir, ".gitattributes")
+        with open(gitattributes_path, "wb") as f:
+            f.write(b"*.secret filter=redact\n")
+
+        # Configure custom filter (use tr command for testing)
+        config = self.repo.get_config()
+        # This filter replaces all digits with X
+        config.set((b"filter", b"redact"), b"clean", b"tr '0-9' 'X'")
+        config.write_to_path()
+
+        # Add .gitattributes
+        porcelain.add(self.repo, paths=[".gitattributes"])
+
+        # Create file with sensitive content
+        secret_file = os.path.join(self.test_dir, "password.secret")
+        with open(secret_file, "wb") as f:
+            f.write(b"password123\ntoken456\n")
+
+        # Add file
+        porcelain.add(self.repo, paths=["password.secret"])
+
+        # Check that content was filtered
+        index = self.repo.open_index()
+        entry = index[b"password.secret"]
+        blob = self.repo.object_store[entry.sha]
+        self.assertEqual(blob.data, b"passwordXXX\ntokenXXX\n")
+
+    def test_gitattributes_from_tree(self) -> None:
+        """Test that gitattributes from tree are used when no working tree exists."""
+        # Create .gitattributes with text attribute
+        gitattributes_path = os.path.join(self.test_dir, ".gitattributes")
+        with open(gitattributes_path, "wb") as f:
+            f.write(b"*.txt text\n")
+
+        # Add and commit .gitattributes
+        porcelain.add(self.repo, paths=[".gitattributes"])
+        porcelain.commit(self.repo, message=b"Add gitattributes")
+
+        # Remove .gitattributes from working tree
+        os.remove(gitattributes_path)
+
+        # Get gitattributes - should still work from tree
+        gitattributes = self.repo.get_gitattributes()
+        attrs = gitattributes.match_path(b"test.txt")
+        self.assertEqual(attrs.get(b"text"), True)
+
+    def test_gitattributes_info_attributes(self) -> None:
+        """Test that .git/info/attributes is read."""
+        # Create info/attributes
+        info_dir = os.path.join(self.repo.controldir(), "info")
+        if not os.path.exists(info_dir):
+            os.makedirs(info_dir)
+        info_attrs_path = os.path.join(info_dir, "attributes")
+        with open(info_attrs_path, "wb") as f:
+            f.write(b"*.log text\n")
+
+        # Get gitattributes
+        gitattributes = self.repo.get_gitattributes()
+        attrs = gitattributes.match_path(b"debug.log")
+        self.assertEqual(attrs.get(b"text"), True)
+
+    @unittest.skip("Custom process filters require external commands")
+    def test_filter_precedence(self) -> None:
+        """Test that filter attribute takes precedence over text attribute."""
+        # Create .gitattributes with both text and filter
+        gitattributes_path = os.path.join(self.test_dir, ".gitattributes")
+        with open(gitattributes_path, "wb") as f:
+            f.write(b"*.txt text filter=custom\n")
+
+        # Configure autocrlf and custom filter
+        config = self.repo.get_config()
+        config.set((b"core",), b"autocrlf", b"true")
+        # This filter converts to uppercase
+        config.set((b"filter", b"custom"), b"clean", b"tr '[:lower:]' '[:upper:]'")
+        config.write_to_path()
+
+        # Add .gitattributes
+        porcelain.add(self.repo, paths=[".gitattributes"])
+
+        # Create text file with lowercase and CRLF
+        text_file = os.path.join(self.test_dir, "test.txt")
+        with open(text_file, "wb") as f:
+            f.write(b"hello\r\nworld\r\n")
+
+        # Add file
+        porcelain.add(self.repo, paths=["test.txt"])
+
+        # Check that custom filter was applied (not just line ending conversion)
+        index = self.repo.open_index()
+        entry = index[b"test.txt"]
+        blob = self.repo.object_store[entry.sha]
+        # Should be uppercase with LF endings
+        self.assertEqual(blob.data, b"HELLO\nWORLD\n")
+
+    def test_blob_normalizer_integration(self) -> None:
+        """Test that get_blob_normalizer returns a FilterBlobNormalizer."""
+        normalizer = self.repo.get_blob_normalizer()
+
+        # Check it's the right type
+        from dulwich.filters import FilterBlobNormalizer
+
+        self.assertIsInstance(normalizer, FilterBlobNormalizer)
+
+        # Check it has access to gitattributes
+        self.assertIsNotNone(normalizer.gitattributes)
+        self.assertIsNotNone(normalizer.filter_registry)