Просмотр исходного кода

Add support for core.whitespace and core.safecrlf

* Add core.whitespace configuration support for whitespace error detection
* Add core.safecrlf configuration support for safe CRLF conversion checks
* Refactor LineEndingFilter with from_config classmethod for cleaner code

Fixes #1806
Jelmer Vernooij 4 месяца назад
Родитель
Коммит
37a7203201
6 изменённых файлов: 871 добавлено и 53 удалено
  1. 10 1
      NEWS
  2. 2 46
      dulwich/filters.py
  3. 172 6
      dulwich/line_ending.py
  4. 282 0
      dulwich/whitespace.py
  5. 131 0
      tests/test_line_ending.py
  6. 274 0
      tests/test_whitespace.py

+ 10 - 1
NEWS

@@ -31,7 +31,16 @@
 
  * Fix handling of CRLF line endings with ``core.autocrlf = input`` to prevent
    unchanged files from appearing as unstaged in status.
-   (Jelmer Vernooij, #1770)
+   (Jelmer Vernooij, #1773)
+
+ * Add support for ``core.whitespace`` configuration for whitespace error
+   detection and fixing. Supports blank-at-eol, space-before-tab, indent-with-non-tab,
+   tab-in-indent, blank-at-eof, cr-at-eol, and tabwidth settings.
+   (Jelmer Vernooij, #1806)
+
+ * Add support for ``core.safecrlf`` configuration to check if CRLF/LF conversions
+   would be reversible and optionally abort or warn on potentially lossy conversions.
+   (Jelmer Vernooij, #1806)
 
  * Add support for ``http.extraHeader`` configuration to pass additional HTTP
    headers to the server when communicating over HTTP(S).

+ 2 - 46
dulwich/filters.py

@@ -652,53 +652,9 @@ class FilterRegistry:
         This filter is used when files have the 'text' attribute set explicitly.
         It always normalizes line endings on checkin (CRLF -> LF).
         """
-        from .line_ending import (
-            LineEndingFilter,
-            convert_crlf_to_lf,
-            get_smudge_filter,
-        )
+        from .line_ending import LineEndingFilter
 
-        if self.config is None:
-            # Default text filter: always normalize on checkin
-            return LineEndingFilter(
-                clean_conversion=convert_crlf_to_lf,
-                smudge_conversion=None,
-                binary_detection=True,
-            )
-
-        # Get core.eol and core.autocrlf settings for smudge behavior
-        try:
-            core_eol_raw = self.config.get("core", "eol")
-            core_eol: str = (
-                core_eol_raw.decode("ascii")
-                if isinstance(core_eol_raw, bytes)
-                else core_eol_raw
-            )
-        except KeyError:
-            core_eol = "native"
-
-        # Parse autocrlf as bytes (can be b"true", b"input", or b"false")
-        try:
-            autocrlf_raw = self.config.get("core", "autocrlf")
-            autocrlf: bytes = (
-                autocrlf_raw.lower()
-                if isinstance(autocrlf_raw, bytes)
-                else str(autocrlf_raw).lower().encode("ascii")
-            )
-        except KeyError:
-            autocrlf = b"false"
-
-        # For explicit text attribute:
-        # - Always normalize to LF on checkin (clean)
-        # - Smudge behavior depends on core.eol and core.autocrlf
-        smudge_filter = get_smudge_filter(core_eol, autocrlf)
-        clean_filter = convert_crlf_to_lf
-
-        return LineEndingFilter(
-            clean_conversion=clean_filter,
-            smudge_conversion=smudge_filter,
-            binary_detection=True,
-        )
+        return LineEndingFilter.from_config(self.config, for_text_attr=True)
 
     def _setup_line_ending_filter(self) -> None:
         """Automatically register line ending filter if configured."""

+ 172 - 6
dulwich/line_ending.py

@@ -137,6 +137,7 @@ Sources:
 - https://adaptivepatchwork.com/2012/03/01/mind-the-end-of-your-line/
 """
 
+import logging
 from typing import TYPE_CHECKING, Any, Callable, Optional, Union
 
 if TYPE_CHECKING:
@@ -153,6 +154,8 @@ from .patch import is_binary
 CRLF = b"\r\n"
 LF = b"\n"
 
+logger = logging.getLogger(__name__)
+
 
 class LineEndingFilter(FilterDriver):
     """Filter driver for line ending conversion."""
@@ -162,13 +165,91 @@ class LineEndingFilter(FilterDriver):
         clean_conversion: Optional[Callable[[bytes], bytes]] = None,
         smudge_conversion: Optional[Callable[[bytes], bytes]] = None,
         binary_detection: bool = True,
+        safecrlf: bytes = b"false",
     ):
         """Initialize LineEndingFilter."""
         self.clean_conversion = clean_conversion
         self.smudge_conversion = smudge_conversion
         self.binary_detection = binary_detection
+        self.safecrlf = safecrlf
+
+    @classmethod
+    def from_config(
+        cls, config: Optional["StackedConfig"], for_text_attr: bool = False
+    ) -> "LineEndingFilter":
+        """Create a LineEndingFilter from git configuration.
+
+        Reads ``core.eol``, ``core.autocrlf`` and ``core.safecrlf``. Missing
+        keys fall back to ``"native"``, ``b"false"`` and ``b"false"``.
+        ``core.autocrlf`` and ``core.safecrlf`` are lower-cased so that values
+        such as ``True`` or ``WARN`` are recognised.
+
+        Args:
+            config: Git configuration stack, or None when none is available
+            for_text_attr: If True, always normalize on checkin (for text attribute)
+
+        Returns:
+            Configured LineEndingFilter instance
+        """
+        if config is None:
+            if for_text_attr:
+                # For text attribute: always normalize on checkin
+                return cls(
+                    clean_conversion=convert_crlf_to_lf,
+                    smudge_conversion=None,
+                    binary_detection=True,
+                )
+            # No config: no conversion
+            return cls()
+
+        # Get core.eol setting
+        try:
+            core_eol_raw = config.get("core", "eol")
+            core_eol: str = (
+                core_eol_raw.decode("ascii")
+                if isinstance(core_eol_raw, bytes)
+                else str(core_eol_raw)
+            )
+        except KeyError:
+            core_eol = "native"
+
+        # Get core.autocrlf setting
+        try:
+            autocrlf_raw = config.get("core", "autocrlf")
+            autocrlf: bytes = (
+                autocrlf_raw.lower()
+                if isinstance(autocrlf_raw, bytes)
+                else str(autocrlf_raw).lower().encode("ascii")
+            )
+        except KeyError:
+            autocrlf = b"false"
+
+        # Get core.safecrlf setting, normalised the same way as core.autocrlf;
+        # the safety check compares against lower-case b"true" / b"warn".
+        try:
+            safecrlf_raw = config.get("core", "safecrlf")
+            safecrlf: bytes = (
+                safecrlf_raw.lower()
+                if isinstance(safecrlf_raw, bytes)
+                else str(safecrlf_raw).lower().encode("utf-8")
+            )
+        except KeyError:
+            safecrlf = b"false"
+
+        # Smudge behavior depends on core.eol and core.autocrlf in both modes
+        smudge_filter = get_smudge_filter(core_eol, autocrlf)
+        clean_filter: Optional[Callable[[bytes], bytes]]
+        if for_text_attr:
+            # For text attribute: always normalize to LF on checkin
+            clean_filter = convert_crlf_to_lf
+        else:
+            # Normal autocrlf behavior
+            clean_filter = get_clean_filter(core_eol, autocrlf)
 
-    def clean(self, data: bytes) -> bytes:
+        return cls(
+            clean_conversion=clean_filter,
+            smudge_conversion=smudge_filter,
+            binary_detection=True,
+            safecrlf=safecrlf,
+        )
+
+    def clean(self, data: bytes, path: bytes = b"") -> bytes:
         """Apply line ending conversion for checkin (working tree -> repository)."""
         if self.clean_conversion is None:
             return data
@@ -177,7 +258,13 @@ class LineEndingFilter(FilterDriver):
         if self.binary_detection and is_binary(data):
             return data
 
-        return self.clean_conversion(data)
+        converted = self.clean_conversion(data)
+
+        # Check if conversion is safe
+        if self.safecrlf != b"false":
+            check_safecrlf(data, converted, self.safecrlf, path)
+
+        return converted
 
     def smudge(self, data: bytes, path: bytes = b"") -> bytes:
         """Apply line ending conversion for checkout (repository -> working tree)."""
@@ -188,7 +275,13 @@ class LineEndingFilter(FilterDriver):
         if self.binary_detection and is_binary(data):
             return data
 
-        return self.smudge_conversion(data)
+        converted = self.smudge_conversion(data)
+
+        # Check if conversion is safe
+        if self.safecrlf != b"false":
+            check_safecrlf(data, converted, self.safecrlf, path)
+
+        return converted
 
     def cleanup(self) -> None:
         """Clean up any resources held by this filter driver."""
@@ -231,6 +324,52 @@ def convert_lf_to_crlf(text_hunk: bytes) -> bytes:
     return CRLF.join(cleaned_parts)
 
 
+def check_safecrlf(
+    original: bytes, converted: bytes, safecrlf: bytes, path: bytes = b""
+) -> None:
+    """Verify that a line-ending conversion can be undone, per core.safecrlf.
+
+    A conversion counts as unsafe when applying the opposite conversion to
+    ``converted`` does not reproduce ``original``. Only two situations are
+    examined: CRLF present in ``original`` but absent from ``converted``, and
+    ``original`` containing LF but no CRLF while ``converted`` contains CRLF.
+
+    Args:
+        original: Content before conversion
+        converted: Content after conversion
+        safecrlf: Value of core.safecrlf config; b"true" raises on an unsafe
+            conversion, b"warn" logs a warning, any other value skips the check
+        path: Path of the file being checked (used in the message only)
+
+    Raises:
+        ValueError: If safecrlf is b"true" and the conversion is not reversible
+    """
+    if safecrlf not in (b"true", b"warn"):
+        return
+
+    original_has_crlf = CRLF in original
+    if original_has_crlf and CRLF not in converted:
+        # CRLF -> LF: undo by converting LF back to CRLF
+        crlf_removed = True
+        round_trip = convert_lf_to_crlf(converted)
+    elif not original_has_crlf and LF in original and CRLF in converted:
+        # LF -> CRLF: undo by converting CRLF back to LF
+        crlf_removed = False
+        round_trip = convert_crlf_to_lf(converted)
+    else:
+        return
+
+    if round_trip == original:
+        return
+
+    if crlf_removed:
+        msg = (
+            f"CRLF would be replaced by LF in {path.decode('utf-8', 'replace')}"
+        )
+    else:
+        msg = (
+            f"LF would be replaced by CRLF in {path.decode('utf-8', 'replace')}"
+        )
+
+    if safecrlf == b"true":
+        raise ValueError(msg)
+    logger.warning(msg)
+
+
 def get_smudge_filter(
     core_eol: str, core_autocrlf: bytes
 ) -> Optional[Callable[[bytes], bytes]]:
@@ -345,6 +484,7 @@ class BlobNormalizer(FilterBlobNormalizer):
         gitattributes: dict[str, Any],
         core_eol: str = "native",
         autocrlf: bytes = b"false",
+        safecrlf: bytes = b"false",
     ) -> None:
         """Initialize FilteringBlobNormalizer."""
         # Set up a filter registry with line ending filters
@@ -360,6 +500,7 @@ class BlobNormalizer(FilterBlobNormalizer):
             clean_conversion=clean_filter or convert_crlf_to_lf,
             smudge_conversion=smudge_filter or convert_lf_to_crlf,
             binary_detection=True,
+            safecrlf=safecrlf,
         )
         filter_registry.register_driver("text", line_ending_filter)
 
@@ -398,12 +539,24 @@ class BlobNormalizer(FilterBlobNormalizer):
         # (autocrlf is enabled), apply it to all files
         if result is blob and self.fallback_write_filter is not None:
             # Apply the clean filter with binary detection
+            # Get safecrlf from config
+            safecrlf = b"false"
+            if hasattr(self, "filter_registry") and hasattr(
+                self.filter_registry, "config_stack"
+            ):
+                safecrlf = self.filter_registry.config_stack.get(
+                    b"core", b"safecrlf", b"false"
+                )
+                if hasattr(safecrlf, "encode"):
+                    safecrlf = safecrlf.encode("utf-8")
+
             line_ending_filter = LineEndingFilter(
                 clean_conversion=self.fallback_write_filter,
                 smudge_conversion=None,
                 binary_detection=True,
+                safecrlf=safecrlf,
             )
-            filtered_data = line_ending_filter.clean(blob.data)
+            filtered_data = line_ending_filter.clean(blob.data, tree_path)
             if filtered_data != blob.data:
                 new_blob = Blob()
                 new_blob.data = filtered_data
@@ -426,12 +579,24 @@ class BlobNormalizer(FilterBlobNormalizer):
         # (autocrlf is enabled), apply it to all files
         if result is blob and self.fallback_read_filter is not None:
             # Apply the smudge filter with binary detection
+            # Get safecrlf from config
+            safecrlf = b"false"
+            if hasattr(self, "filter_registry") and hasattr(
+                self.filter_registry, "config_stack"
+            ):
+                safecrlf = self.filter_registry.config_stack.get(
+                    b"core", b"safecrlf", b"false"
+                )
+                if hasattr(safecrlf, "encode"):
+                    safecrlf = safecrlf.encode("utf-8")
+
             line_ending_filter = LineEndingFilter(
                 clean_conversion=None,
                 smudge_conversion=self.fallback_read_filter,
                 binary_detection=True,
+                safecrlf=safecrlf,
             )
-            filtered_data = line_ending_filter.smudge(blob.data)
+            filtered_data = line_ending_filter.smudge(blob.data, tree_path)
             if filtered_data != blob.data:
                 new_blob = Blob()
                 new_blob.data = filtered_data
@@ -474,9 +639,10 @@ class TreeBlobNormalizer(BlobNormalizer):
         tree: Optional[ObjectID] = None,
         core_eol: str = "native",
         autocrlf: bytes = b"false",
+        safecrlf: bytes = b"false",
     ) -> None:
         """Initialize TreeBlobNormalizer."""
-        super().__init__(config_stack, git_attributes, core_eol, autocrlf)
+        super().__init__(config_stack, git_attributes, core_eol, autocrlf, safecrlf)
         if tree:
             self.existing_paths = {
                 name for name, _, _ in iter_tree_contents(object_store, tree)

+ 282 - 0
dulwich/whitespace.py

@@ -0,0 +1,282 @@
+# whitespace.py -- Whitespace error detection and fixing
+# Copyright (C) 2025 Dulwich contributors
+#
+# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
+# General Public License as published by the Free Software Foundation; version 2.0
+# or (at your option) any later version. You can redistribute it and/or
+# modify it under the terms of either of these two licenses.
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# You should have received a copy of the licenses; if not, see
+# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
+# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
+# License, Version 2.0.
+#
+"""Whitespace error detection and fixing functionality.
+
+This module implements Git's core.whitespace configuration and related
+whitespace error detection capabilities.
+"""
+
+from typing import Optional
+
+# Default whitespace errors Git checks for.
+# parse_whitespace_config starts from this set when core.whitespace is unset,
+# names no known error type, or contains a negated ("-name") entry.
+DEFAULT_WHITESPACE_ERRORS = {
+    "blank-at-eol",
+    "space-before-tab",
+    "blank-at-eof",
+}
+
+# All names accepted in core.whitespace
+WHITESPACE_ERROR_TYPES = {
+    "blank-at-eol",  # Trailing space or tab at end of line
+    "space-before-tab",  # Space directly before a tab in the indentation
+    "indent-with-non-tab",  # Indentation with a run of tab-width or more spaces
+    "tab-in-indent",  # Tab anywhere in the indentation
+    "blank-at-eof",  # Blank lines at end of file
+    "trailing-space",  # Alias; parse_whitespace_config maps it to blank-at-eol
+    "cr-at-eol",  # Carriage return at end of line
+    "tabwidth",  # Special: sets tab width via tabwidth=<n> (not an error type)
+}
+
+
+def parse_whitespace_config(value: Optional[str]) -> tuple[set[str], int]:
+    """Parse core.whitespace configuration value.
+
+    The enabled set starts from the defaults when the value contains a
+    negation (``-name``) or names no known error type; otherwise it starts
+    empty and holds only the listed types. Entries are applied left to right.
+    ``trailing-space`` is handled as an alias of ``blank-at-eol`` both when
+    enabling and when negating, so ``-trailing-space`` removes
+    ``blank-at-eol``.
+
+    Args:
+        value: The core.whitespace config value (e.g., "blank-at-eol,space-before-tab"),
+            or None when the option is not set
+
+    Returns:
+        Tuple of (enabled error types, tab width). The tab width is 8 unless
+        a ``tabwidth=<n>`` entry with an integer n >= 1 is present.
+    """
+    if value is None:
+        return DEFAULT_WHITESPACE_ERRORS.copy(), 8
+
+    if value == "":
+        return set(), 8
+
+    # Alias names are resolved at the point of use, so a later entry can
+    # still override an earlier one.
+    aliases = {"trailing-space": "blank-at-eol"}
+
+    parts = [p.strip() for p in value.split(",")]
+    has_negation = any(p.startswith("-") for p in parts)
+    has_explicit_errors = any(p in WHITESPACE_ERROR_TYPES for p in parts)
+
+    # Start with defaults if no explicit errors are specified or if negation is used
+    if has_negation or not has_explicit_errors:
+        enabled = DEFAULT_WHITESPACE_ERRORS.copy()
+    else:
+        enabled = set()
+
+    tab_width = 8
+
+    for part in parts:
+        if not part:
+            continue
+
+        if part.startswith("-"):
+            # Negation: unknown names are ignored
+            error_type = part[1:]
+            if error_type in WHITESPACE_ERROR_TYPES:
+                enabled.discard(aliases.get(error_type, error_type))
+        elif part.startswith("tabwidth="):
+            try:
+                tab_width = int(part[len("tabwidth=") :])
+            except ValueError:
+                tab_width = 8
+            else:
+                # Non-positive widths fall back to the default
+                if tab_width < 1:
+                    tab_width = 8
+        elif part in WHITESPACE_ERROR_TYPES:
+            enabled.add(aliases.get(part, part))
+
+    return enabled, tab_width
+
+
+class WhitespaceChecker:
+    """Detects whitespace errors in single lines and in whole file contents."""
+
+    def __init__(self, enabled_errors: set[str], tab_width: int = 8):
+        """Store the rule set and tab width used by the checks.
+
+        Args:
+            enabled_errors: Names of the error types to report
+            tab_width: Number of consecutive indentation spaces at which
+                indent-with-non-tab is reported
+        """
+        self.enabled_errors = enabled_errors
+        self.tab_width = tab_width
+
+    def check_line(self, line: bytes, line_num: int) -> list[tuple[str, int]]:
+        """Report the whitespace errors found on one line.
+
+        Args:
+            line: Line content (without newline)
+            line_num: Line number attached to every reported error
+
+        Returns:
+            List of (error_type, line_number) tuples, at most one per error type
+        """
+        rules = self.enabled_errors
+        # Leading run of spaces and tabs; the indentation rules look only here.
+        indent = line[: len(line) - len(line.lstrip(b" \t"))]
+        found = []
+
+        if "blank-at-eol" in rules and line.endswith((b" ", b"\t")):
+            found.append(("blank-at-eol", line_num))
+
+        if "space-before-tab" in rules and b" \t" in indent:
+            found.append(("space-before-tab", line_num))
+
+        if "indent-with-non-tab" in rules:
+            # Tabs split the indentation into runs of spaces; a non-empty run
+            # is reported once its length reaches tab_width.
+            space_runs = indent.split(b"\t")
+            if any(run and len(run) >= self.tab_width for run in space_runs):
+                found.append(("indent-with-non-tab", line_num))
+
+        if "tab-in-indent" in rules and b"\t" in indent:
+            found.append(("tab-in-indent", line_num))
+
+        if "cr-at-eol" in rules and line.endswith(b"\r"):
+            found.append(("cr-at-eol", line_num))
+
+        return found
+
+    def check_content(self, content: bytes) -> list[tuple[str, int]]:
+        """Report the whitespace errors found in a whole file.
+
+        The content is split on LF and one trailing CR is removed from each
+        line before the per-line checks run.
+
+        Args:
+            content: File content to check
+
+        Returns:
+            List of (error_type, line_number) tuples: per-line errors in line
+            order (numbered from 1), followed by at most one blank-at-eof entry
+        """
+        lines = [
+            raw[:-1] if raw.endswith(b"\r") else raw for raw in content.split(b"\n")
+        ]
+
+        found = []
+        for number, text in enumerate(lines, start=1):
+            found.extend(self.check_line(text, number))
+
+        if "blank-at-eof" in self.enabled_errors:
+            # A final empty element only means the content ended with a newline
+            body = lines[:-1] if lines[-1] == b"" else lines
+            if body and body[-1] == b"":
+                # Reported at one past the last remaining line
+                found.append(("blank-at-eof", len(body) + 1))
+
+        return found
+
+
+def fix_whitespace_errors(
+    content: bytes, errors: list[tuple[str, int]], fix_types: Optional[set[str]] = None
+) -> bytes:
+    """Fix whitespace errors in content.
+
+    Three error types are handled: blank-at-eol (trailing spaces and tabs are
+    removed from the reported line), cr-at-eol (the carriage return is
+    dropped from the reported line) and blank-at-eof (blank lines at the end
+    of the content are removed, whether they end in LF or CRLF). The
+    blank-at-eof fix is applied whenever ``fix_types`` allows it, even if no
+    such error is listed. Other error types are left untouched, and CRLF
+    endings are kept on every line not reported as cr-at-eol.
+
+    Args:
+        content: Original content
+        errors: List of (error_type, line_number) tuples from WhitespaceChecker;
+            entries whose 1-based line number lies outside the content are ignored
+        fix_types: Set of error types to fix (None means fix all)
+
+    Returns:
+        Fixed content; the input is returned unchanged when errors is empty
+    """
+    if not errors:
+        return content
+
+    lines = content.split(b"\n")
+
+    # Strip one trailing CR per line and remember which lines had one, so
+    # CRLF endings can be restored once the fixes are done.
+    had_cr = [line.endswith(b"\r") for line in lines]
+    lines = [line[:-1] if cr else line for line, cr in zip(lines, had_cr)]
+
+    # Group the error types selected for fixing by line number
+    errors_by_line: dict[int, list[str]] = {}
+    for error_type, line_num in errors:
+        if fix_types is None or error_type in fix_types:
+            errors_by_line.setdefault(line_num, []).append(error_type)
+
+    for line_num, error_types in errors_by_line.items():
+        # Line numbers are 1-based; values below 1 would otherwise wrap
+        # around to the end of the list through negative indexing.
+        if not 1 <= line_num <= len(lines):
+            continue
+
+        line_idx = line_num - 1
+
+        if "blank-at-eol" in error_types:
+            lines[line_idx] = lines[line_idx].rstrip(b" \t")
+
+        # The CR is already stripped; clearing the flag stops it being restored
+        if "cr-at-eol" in error_types:
+            had_cr[line_idx] = False
+
+    if fix_types is None or "blank-at-eof" in fix_types:
+        # Trim before restoring CRs so that CRLF-terminated blank lines are
+        # recognised as blank. The final empty segment (the terminator of the
+        # last real line) is kept; the blank lines in front of it are dropped.
+        while (
+            len(lines) > 1
+            and lines[-1] == b""
+            and not had_cr[-1]
+            and lines[-2] == b""
+        ):
+            del lines[-2]
+            del had_cr[-2]
+
+    # Restore CRLF for lines that should keep it
+    return b"\n".join(
+        line + b"\r" if cr else line for line, cr in zip(lines, had_cr)
+    )

+ 131 - 0
tests/test_line_ending.py

@@ -25,6 +25,7 @@ from dulwich.line_ending import (
     BlobNormalizer,
     LineEndingFilter,
     TreeBlobNormalizer,
+    check_safecrlf,
     convert_crlf_to_lf,
     convert_lf_to_crlf,
     get_clean_filter_autocrlf,
@@ -551,3 +552,133 @@ class LineEndingIntegrationTests(TestCase):
         bin_blob.data = b"binary content"
         result = normalizer.checkin_normalize(bin_blob, b"test.bin")
         self.assertEqual(result.data, b"LFS pointer")
+
+
+class LineEndingFilterFromConfigTests(TestCase):
+    """Tests for the LineEndingFilter.from_config alternate constructor."""
+
+    @staticmethod
+    def _core_config(settings):
+        """Return a ConfigDict with the given (name, value) pairs under core."""
+        from dulwich.config import ConfigDict
+
+        config = ConfigDict()
+        for name, value in settings:
+            config.set(b"core", name, value)
+        return config
+
+    def test_from_config_none(self) -> None:
+        """Without a config, only the text-attribute variant converts."""
+        # Plain filter: nothing is converted in either direction
+        plain = LineEndingFilter.from_config(None, for_text_attr=False)
+        self.assertIsNone(plain.clean_conversion)
+        self.assertIsNone(plain.smudge_conversion)
+        self.assertEqual(plain.safecrlf, b"false")
+
+        # Text attribute: checkin is normalized, checkout is left alone
+        for_text = LineEndingFilter.from_config(None, for_text_attr=True)
+        self.assertIsNotNone(for_text.clean_conversion)
+        self.assertIsNone(for_text.smudge_conversion)
+        self.assertEqual(for_text.safecrlf, b"false")
+
+    def test_from_config_autocrlf_true(self) -> None:
+        """autocrlf=true sets both conversions and leaves safecrlf off."""
+        config = self._core_config([(b"autocrlf", b"true")])
+
+        line_filter = LineEndingFilter.from_config(config, for_text_attr=False)
+        self.assertIsNotNone(line_filter.clean_conversion)
+        self.assertIsNotNone(line_filter.smudge_conversion)
+        self.assertEqual(line_filter.safecrlf, b"false")
+
+    def test_from_config_with_safecrlf(self) -> None:
+        """The safecrlf value from the config ends up on the filter."""
+        config = self._core_config(
+            [(b"autocrlf", b"input"), (b"safecrlf", b"warn")]
+        )
+
+        line_filter = LineEndingFilter.from_config(config, for_text_attr=False)
+        self.assertIsNotNone(line_filter.clean_conversion)
+        self.assertIsNone(line_filter.smudge_conversion)
+        self.assertEqual(line_filter.safecrlf, b"warn")
+
+    def test_from_config_text_attr_overrides(self) -> None:
+        """for_text_attr=True normalizes on checkin even with autocrlf=false."""
+        config = self._core_config([(b"autocrlf", b"false")])
+
+        line_filter = LineEndingFilter.from_config(config, for_text_attr=True)
+        self.assertIsNotNone(line_filter.clean_conversion)
+        # autocrlf=false still means no conversion on checkout
+        self.assertIsNone(line_filter.smudge_conversion)
+
+
+class SafeCRLFTests(TestCase):
+    """Tests for check_safecrlf and for LineEndingFilter's safecrlf handling."""
+
+    # Uniform CRLF content and its LF form: CRLF -> LF -> CRLF is reversible
+    PURE_CRLF = b"line1\r\nline2\r\n"
+    PURE_CRLF_AS_LF = b"line1\nline2\n"
+    # Mixed endings: converting the LF form back gives CRLF on every line
+    MIXED = b"line1\r\nline2\nline3\r\n"
+    MIXED_AS_LF = b"line1\nline2\nline3\n"
+
+    @staticmethod
+    def _checkin_filter(safecrlf: bytes) -> LineEndingFilter:
+        """Build a CRLF->LF checkin filter with binary detection disabled."""
+        return LineEndingFilter(
+            clean_conversion=convert_crlf_to_lf,
+            smudge_conversion=None,
+            binary_detection=False,
+            safecrlf=safecrlf,
+        )
+
+    def test_safecrlf_false(self) -> None:
+        """safecrlf=false accepts any conversion without raising."""
+        check_safecrlf(self.PURE_CRLF, self.PURE_CRLF_AS_LF, b"false", b"test.txt")
+
+    def test_safecrlf_true_safe_conversion(self) -> None:
+        """safecrlf=true accepts a reversible conversion without raising."""
+        check_safecrlf(self.PURE_CRLF, self.PURE_CRLF_AS_LF, b"true", b"test.txt")
+
+    def test_safecrlf_true_unsafe_conversion(self) -> None:
+        """safecrlf=true raises ValueError when mixed endings would be lost."""
+        with self.assertRaises(ValueError) as caught:
+            check_safecrlf(self.MIXED, self.MIXED_AS_LF, b"true", b"test.txt")
+        self.assertIn("CRLF would be replaced by LF", str(caught.exception))
+
+    def test_safecrlf_warn(self) -> None:
+        """safecrlf=warn logs one warning instead of raising."""
+        with self.assertLogs("dulwich.line_ending", level="WARNING") as logs:
+            check_safecrlf(self.MIXED, self.MIXED_AS_LF, b"warn", b"test.txt")
+        self.assertEqual(len(logs.output), 1)
+        self.assertIn("CRLF would be replaced by LF", logs.output[0])
+
+    def test_lineending_filter_with_safecrlf(self) -> None:
+        """LineEndingFilter.clean applies the safecrlf check."""
+        strict = self._checkin_filter(b"true")
+
+        # Reversible input is converted normally
+        self.assertEqual(
+            strict.clean(self.PURE_CRLF, b"test.txt"), self.PURE_CRLF_AS_LF
+        )
+
+        # Mixed endings are rejected in strict mode
+        with self.assertRaises(ValueError):
+            strict.clean(self.MIXED, b"test.txt")
+
+        # In warn mode the data is still converted and one warning is logged
+        lenient = self._checkin_filter(b"warn")
+        with self.assertLogs("dulwich.line_ending", level="WARNING") as logs:
+            cleaned = lenient.clean(self.MIXED, b"test.txt")
+        self.assertEqual(cleaned, self.MIXED_AS_LF)
+        self.assertEqual(len(logs.output), 1)

+ 274 - 0
tests/test_whitespace.py

@@ -0,0 +1,274 @@
+# test_whitespace.py -- Tests for whitespace error detection
+# Copyright (C) 2025 Dulwich contributors
+#
+# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
+# General Public License as published by the Free Software Foundation; version 2.0
+# or (at your option) any later version. You can redistribute it and/or
+# modify it under the terms of either of these two licenses.
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# You should have received a copy of the licenses; if not, see
+# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
+# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
+# License, Version 2.0.
+#
+
+"""Tests for whitespace error detection."""
+
+from dulwich.whitespace import (
+    DEFAULT_WHITESPACE_ERRORS,
+    WhitespaceChecker,
+    fix_whitespace_errors,
+    parse_whitespace_config,
+)
+
+from . import TestCase
+
+
+class WhitespaceConfigTests(TestCase):
+    """Tests for parsing ``core.whitespace`` values with parse_whitespace_config."""
+
+    def test_parse_default(self) -> None:
+        """``None`` yields the default error set and a tab width of 8."""
+        enabled, width = parse_whitespace_config(None)
+        self.assertEqual((enabled, width), (DEFAULT_WHITESPACE_ERRORS, 8))
+
+    def test_parse_empty(self) -> None:
+        """An empty string enables no checks and keeps the tab width at 8."""
+        enabled, width = parse_whitespace_config("")
+        self.assertEqual((enabled, width), (set(), 8))
+
+    def test_parse_single_error(self) -> None:
+        """A single error name enables only that check."""
+        enabled, width = parse_whitespace_config("blank-at-eol")
+        self.assertEqual((enabled, width), ({"blank-at-eol"}, 8))
+
+    def test_parse_multiple_errors(self) -> None:
+        """Comma-separated error names are all enabled."""
+        enabled, width = parse_whitespace_config(
+            "blank-at-eol,space-before-tab,tab-in-indent"
+        )
+        wanted = {"blank-at-eol", "space-before-tab", "tab-in-indent"}
+        self.assertEqual((enabled, width), (wanted, 8))
+
+    def test_parse_with_negation(self) -> None:
+        """A ``-`` prefix removes that error from the defaults."""
+        enabled, width = parse_whitespace_config("-blank-at-eol")
+        # Everything in the default set except the negated entry remains.
+        remaining = DEFAULT_WHITESPACE_ERRORS - {"blank-at-eol"}
+        self.assertEqual((enabled, width), (remaining, 8))
+
+    def test_parse_trailing_space_alias(self) -> None:
+        """``trailing-space`` is reported under the name ``blank-at-eol``."""
+        enabled, width = parse_whitespace_config("trailing-space")
+        self.assertEqual((enabled, width), ({"blank-at-eol"}, 8))
+
+    def test_parse_tabwidth(self) -> None:
+        """A ``tabwidth=N`` entry overrides the tab width."""
+        enabled, width = parse_whitespace_config("blank-at-eol,tabwidth=4")
+        self.assertEqual((enabled, width), ({"blank-at-eol"}, 4))
+
+    def test_parse_invalid_tabwidth(self) -> None:
+        """Non-numeric and zero ``tabwidth`` values leave the tab width at 8."""
+        for setting in ("tabwidth=invalid", "tabwidth=0"):
+            _, width = parse_whitespace_config(setting)
+            self.assertEqual(width, 8)
+
+
+class WhitespaceCheckerTests(TestCase):
+    """Tests for per-line and whole-content checks done by WhitespaceChecker."""
+
+    def test_blank_at_eol(self) -> None:
+        """Trailing spaces and tabs are reported as ``blank-at-eol``."""
+        checker = WhitespaceChecker({"blank-at-eol"})
+        flagged = [("blank-at-eol", 1)]
+        cases = [
+            (b"normal line", []),  # nothing after the last word
+            (b"trailing space ", flagged),
+            (b"trailing tab\t", flagged),
+            (b"multiple  \t ", flagged),  # run of spaces and a tab
+        ]
+        for line, expected in cases:
+            self.assertEqual(checker.check_line(line, 1), expected)
+
+    def test_space_before_tab(self) -> None:
+        """A space followed by a tab is reported only inside the indentation."""
+        checker = WhitespaceChecker({"space-before-tab"})
+        cases = [
+            (b"\tindented", []),  # tab only
+            (b" \tindented", [("space-before-tab", 1)]),
+            (b"code \t comment", []),  # space+tab after code is not flagged
+        ]
+        for line, expected in cases:
+            self.assertEqual(checker.check_line(line, 1), expected)
+
+    def test_indent_with_non_tab(self) -> None:
+        """Indentation of ``tab_width`` or more spaces is reported."""
+        flagged = [("indent-with-non-tab", 1)]
+
+        eight_wide = WhitespaceChecker({"indent-with-non-tab"}, tab_width=8)
+        cases = [
+            (b"    code", []),  # fewer than 8 spaces
+            (b"        code", flagged),  # exactly 8 spaces
+            (b"         code", flagged),  # more than 8 spaces
+            (b"    \t    code", []),  # a tab after spaces resets the count
+        ]
+        for line, expected in cases:
+            self.assertEqual(eight_wide.check_line(line, 1), expected)
+
+        # With a tab width of 4, four leading spaces are already too many.
+        four_wide = WhitespaceChecker({"indent-with-non-tab"}, tab_width=4)
+        self.assertEqual(four_wide.check_line(b"    code", 1), flagged)
+
+    def test_tab_in_indent(self) -> None:
+        """A tab is reported only when it is part of the indentation."""
+        checker = WhitespaceChecker({"tab-in-indent"})
+        cases = [
+            (b"    code", []),  # spaces only
+            (b"\tcode", [("tab-in-indent", 1)]),
+            (b"code\tcomment", []),  # tab after non-whitespace is not flagged
+        ]
+        for line, expected in cases:
+            self.assertEqual(checker.check_line(line, 1), expected)
+
+    def test_cr_at_eol(self) -> None:
+        """A carriage return ending the line is reported as ``cr-at-eol``."""
+        checker = WhitespaceChecker({"cr-at-eol"})
+        cases = [
+            (b"normal line", []),
+            (b"line\r", [("cr-at-eol", 1)]),
+        ]
+        for line, expected in cases:
+            self.assertEqual(checker.check_line(line, 1), expected)
+
+    def test_blank_at_eof(self) -> None:
+        """Extra blank lines ending the content are reported as ``blank-at-eof``."""
+        checker = WhitespaceChecker({"blank-at-eof"})
+        cases = [
+            (b"line1\nline2\nline3", []),  # no final newline at all
+            (b"line1\nline2\nline3\n", []),  # single final newline is normal
+            (b"line1\nline2\n\n\n", [("blank-at-eof", 5)]),
+            (b"\n\n\n", [("blank-at-eof", 4)]),  # nothing but blank lines
+        ]
+        for content, expected in cases:
+            self.assertEqual(checker.check_content(content), expected)
+
+    def test_multiple_errors(self) -> None:
+        """One line can trigger several enabled checks at once."""
+        checker = WhitespaceChecker(
+            {"blank-at-eol", "space-before-tab", "tab-in-indent"}
+        )
+        found = checker.check_line(b" \tcode  ", 1)
+        # Only the set of reported error names is compared, not their order.
+        self.assertEqual(
+            {entry[0] for entry in found},
+            {"blank-at-eol", "space-before-tab", "tab-in-indent"},
+        )
+
+    def test_check_content_crlf(self) -> None:
+        """With CRLF content only the trailing space on line 2 is reported."""
+        checker = WhitespaceChecker({"blank-at-eol", "cr-at-eol"})
+        crlf_content = b"line1\r\nline2 \r\nline3\r\n"
+        # The CR of each CRLF pair is expected not to show up as cr-at-eol.
+        self.assertEqual(
+            checker.check_content(crlf_content), [("blank-at-eol", 2)]
+        )
+
+
+class WhitespaceFixTests(TestCase):
+    """Tests for repairing reported problems with fix_whitespace_errors."""
+
+    def test_fix_blank_at_eol(self) -> None:
+        """Trailing spaces and tabs are stripped from the flagged lines."""
+        reported = [("blank-at-eol", lineno) for lineno in (1, 2)]
+        repaired = fix_whitespace_errors(b"line1  \nline2\t\nline3", reported)
+        self.assertEqual(repaired, b"line1\nline2\nline3")
+
+    def test_fix_blank_at_eof(self) -> None:
+        """Surplus blank lines at the end of the content are dropped."""
+        repaired = fix_whitespace_errors(
+            b"line1\nline2\n\n\n", [("blank-at-eof", 4)]
+        )
+        self.assertEqual(repaired, b"line1\nline2\n")
+
+    def test_fix_cr_at_eol(self) -> None:
+        """Carriage returns are removed from every flagged line."""
+        reported = [("cr-at-eol", lineno) for lineno in (1, 2, 3)]
+        repaired = fix_whitespace_errors(b"line1\r\nline2\r\nline3\r", reported)
+        # Both the CRLF pairs and the lone trailing CR lose their CR.
+        self.assertEqual(repaired, b"line1\nline2\nline3")
+
+    def test_fix_specific_types(self) -> None:
+        """``fix_types`` restricts which reported errors are repaired."""
+        source = b"line1  \nline2\n\n\n"
+        reported = [("blank-at-eol", 1), ("blank-at-eof", 4)]
+        cases = [
+            ({"blank-at-eol"}, b"line1\nline2\n\n\n"),  # only strip line 1
+            ({"blank-at-eof"}, b"line1  \nline2\n"),  # only trim the tail
+        ]
+        for selected, expected in cases:
+            repaired = fix_whitespace_errors(source, reported, fix_types=selected)
+            self.assertEqual(repaired, expected)
+
+    def test_fix_no_errors(self) -> None:
+        """An empty error list leaves the content unchanged."""
+        untouched = b"line1\nline2\nline3"
+        self.assertEqual(fix_whitespace_errors(untouched, []), untouched)