
Add support for core.whitespace and core.safecrlf

* Add core.whitespace configuration support for whitespace error detection
* Add core.safecrlf configuration support for safe CRLF conversion checks
* Refactor LineEndingFilter with from_config classmethod for cleaner code

Fixes #1806
Jelmer Vernooij 4 months ago
Parent
Commit
37a7203201
6 changed files with 871 additions and 53 deletions
  1. NEWS (+10 -1)
  2. dulwich/filters.py (+2 -46)
  3. dulwich/line_ending.py (+172 -6)
  4. dulwich/whitespace.py (+282 -0)
  5. tests/test_line_ending.py (+131 -0)
  6. tests/test_whitespace.py (+274 -0)

+ 10 - 1
NEWS

@@ -31,7 +31,16 @@
 
  * Fix handling of CRLF line endings with ``core.autocrlf = input`` to prevent
    unchanged files from appearing as unstaged in status.
-   (Jelmer Vernooij, #1770)
+   (Jelmer Vernooij, #1773)
+
+ * Add support for ``core.whitespace`` configuration for whitespace error
+   detection and fixing. Supports blank-at-eol, space-before-tab, indent-with-non-tab,
+   tab-in-indent, blank-at-eof, cr-at-eol, and tabwidth settings.
+   (Jelmer Vernooij, #1806)
+
+ * Add support for ``core.safecrlf`` configuration to check if CRLF/LF conversions
+   would be reversible and optionally abort or warn on potentially lossy conversions.
+   (Jelmer Vernooij, #1806)
 
  * Add support for ``http.extraHeader`` configuration to pass additional HTTP
    headers to the server when communicating over HTTP(S).
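
The two NEWS entries above are config-driven; as a quick illustration (not part of this commit), the new settings can be exercised in memory with dulwich's ConfigDict, mirroring the tests added at the bottom of this diff:

    # Illustrative sketch only; the in-memory ConfigDict stands in for
    # [core] autocrlf/safecrlf values read from a repository config.
    from dulwich.config import ConfigDict
    from dulwich.line_ending import LineEndingFilter

    config = ConfigDict()
    config.set(b"core", b"autocrlf", b"input")
    config.set(b"core", b"safecrlf", b"warn")

    line_filter = LineEndingFilter.from_config(config)
    # Mixed CRLF/LF content makes the CRLF->LF conversion lossy, so
    # safecrlf=warn logs a warning but still returns the normalized bytes.
    result = line_filter.clean(b"line1\r\nline2\nline3\r\n", b"mixed.txt")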

+ 2 - 46
dulwich/filters.py

@@ -652,53 +652,9 @@ class FilterRegistry:
         This filter is used when files have the 'text' attribute set explicitly.
         It always normalizes line endings on checkin (CRLF -> LF).
         """
-        from .line_ending import (
-            LineEndingFilter,
-            convert_crlf_to_lf,
-            get_smudge_filter,
-        )
+        from .line_ending import LineEndingFilter
 
-        if self.config is None:
-            # Default text filter: always normalize on checkin
-            return LineEndingFilter(
-                clean_conversion=convert_crlf_to_lf,
-                smudge_conversion=None,
-                binary_detection=True,
-            )
-
-        # Get core.eol and core.autocrlf settings for smudge behavior
-        try:
-            core_eol_raw = self.config.get("core", "eol")
-            core_eol: str = (
-                core_eol_raw.decode("ascii")
-                if isinstance(core_eol_raw, bytes)
-                else core_eol_raw
-            )
-        except KeyError:
-            core_eol = "native"
-
-        # Parse autocrlf as bytes (can be b"true", b"input", or b"false")
-        try:
-            autocrlf_raw = self.config.get("core", "autocrlf")
-            autocrlf: bytes = (
-                autocrlf_raw.lower()
-                if isinstance(autocrlf_raw, bytes)
-                else str(autocrlf_raw).lower().encode("ascii")
-            )
-        except KeyError:
-            autocrlf = b"false"
-
-        # For explicit text attribute:
-        # - Always normalize to LF on checkin (clean)
-        # - Smudge behavior depends on core.eol and core.autocrlf
-        smudge_filter = get_smudge_filter(core_eol, autocrlf)
-        clean_filter = convert_crlf_to_lf
-
-        return LineEndingFilter(
-            clean_conversion=clean_filter,
-            smudge_conversion=smudge_filter,
-            binary_detection=True,
-        )
+        return LineEndingFilter.from_config(self.config, for_text_attr=True)
 
     def _setup_line_ending_filter(self) -> None:
         """Automatically register line ending filter if configured."""

+ 172 - 6
dulwich/line_ending.py

@@ -137,6 +137,7 @@ Sources:
 - https://adaptivepatchwork.com/2012/03/01/mind-the-end-of-your-line/
 """
 
+import logging
 from typing import TYPE_CHECKING, Any, Callable, Optional, Union
 
 if TYPE_CHECKING:
@@ -153,6 +154,8 @@ from .patch import is_binary
 CRLF = b"\r\n"
 LF = b"\n"
 
+logger = logging.getLogger(__name__)
+
 
 class LineEndingFilter(FilterDriver):
     """Filter driver for line ending conversion."""
@@ -162,13 +165,91 @@ class LineEndingFilter(FilterDriver):
         clean_conversion: Optional[Callable[[bytes], bytes]] = None,
         smudge_conversion: Optional[Callable[[bytes], bytes]] = None,
         binary_detection: bool = True,
+        safecrlf: bytes = b"false",
     ):
         """Initialize LineEndingFilter."""
         self.clean_conversion = clean_conversion
         self.smudge_conversion = smudge_conversion
         self.binary_detection = binary_detection
+        self.safecrlf = safecrlf
+
+    @classmethod
+    def from_config(
+        cls, config: Optional["StackedConfig"], for_text_attr: bool = False
+    ) -> "LineEndingFilter":
+        """Create a LineEndingFilter from git configuration.
+
+        Args:
+            config: Git configuration stack
+            for_text_attr: If True, always normalize on checkin (for text attribute)
+
+        Returns:
+            Configured LineEndingFilter instance
+        """
+        if config is None:
+            # Default filter
+            if for_text_attr:
+                # For text attribute: always normalize on checkin
+                return cls(
+                    clean_conversion=convert_crlf_to_lf,
+                    smudge_conversion=None,
+                    binary_detection=True,
+                )
+            else:
+                # No config: no conversion
+                return cls()
+
+        # Get core.eol setting
+        try:
+            core_eol_raw = config.get("core", "eol")
+            core_eol: str = (
+                core_eol_raw.decode("ascii")
+                if isinstance(core_eol_raw, bytes)
+                else str(core_eol_raw)
+            )
+        except KeyError:
+            core_eol = "native"
+
+        # Get core.autocrlf setting
+        try:
+            autocrlf_raw = config.get("core", "autocrlf")
+            autocrlf: bytes = (
+                autocrlf_raw.lower()
+                if isinstance(autocrlf_raw, bytes)
+                else str(autocrlf_raw).lower().encode("ascii")
+            )
+        except KeyError:
+            autocrlf = b"false"
+
+        # Get core.safecrlf setting
+        try:
+            safecrlf_raw = config.get("core", "safecrlf")
+            safecrlf = (
+                safecrlf_raw
+                if isinstance(safecrlf_raw, bytes)
+                else safecrlf_raw.encode("utf-8")
+            )
+        except KeyError:
+            safecrlf = b"false"
+
+        if for_text_attr:
+            # For text attribute: always normalize to LF on checkin
+            # Smudge behavior depends on core.eol and core.autocrlf
+            smudge_filter = get_smudge_filter(core_eol, autocrlf)
+            clean_filter: Optional[Callable[[bytes], bytes]] = convert_crlf_to_lf
+        else:
+            # Normal autocrlf behavior
+            smudge_filter = get_smudge_filter(core_eol, autocrlf)
+            clean_filter = get_clean_filter(core_eol, autocrlf)
 
-    def clean(self, data: bytes) -> bytes:
+        return cls(
+            clean_conversion=clean_filter,
+            smudge_conversion=smudge_filter,
+            binary_detection=True,
+            safecrlf=safecrlf,
+        )
+
+    def clean(self, data: bytes, path: bytes = b"") -> bytes:
         """Apply line ending conversion for checkin (working tree -> repository)."""
         """Apply line ending conversion for checkin (working tree -> repository)."""
         if self.clean_conversion is None:
         if self.clean_conversion is None:
             return data
             return data
@@ -177,7 +258,13 @@ class LineEndingFilter(FilterDriver):
         if self.binary_detection and is_binary(data):
             return data
 
-        return self.clean_conversion(data)
+        converted = self.clean_conversion(data)
+
+        # Check if conversion is safe
+        if self.safecrlf != b"false":
+            check_safecrlf(data, converted, self.safecrlf, path)
+
+        return converted
 
     def smudge(self, data: bytes, path: bytes = b"") -> bytes:
         """Apply line ending conversion for checkout (repository -> working tree)."""
@@ -188,7 +275,13 @@ class LineEndingFilter(FilterDriver):
         if self.binary_detection and is_binary(data):
             return data
 
-        return self.smudge_conversion(data)
+        converted = self.smudge_conversion(data)
+
+        # Check if conversion is safe
+        if self.safecrlf != b"false":
+            check_safecrlf(data, converted, self.safecrlf, path)
+
+        return converted
 
     def cleanup(self) -> None:
         """Clean up any resources held by this filter driver."""
@@ -231,6 +324,52 @@ def convert_lf_to_crlf(text_hunk: bytes) -> bytes:
     return CRLF.join(cleaned_parts)
 
 
+def check_safecrlf(
+    original: bytes, converted: bytes, safecrlf: bytes, path: bytes = b""
+) -> None:
+    """Check if CRLF conversion is safe according to core.safecrlf setting.
+
+    Args:
+        original: Original content before conversion
+        converted: Content after conversion
+        safecrlf: Value of core.safecrlf config (b"true", b"warn", or b"false")
+        path: Path to the file being checked (for error messages)
+
+    Raises:
+        ValueError: If safecrlf is "true" and conversion would lose data
+    """
+    if safecrlf == b"false":
+        return
+
+    # Check if conversion is reversible
+    if safecrlf in (b"true", b"warn"):
+        # For CRLF->LF conversion, check if converting back would recover original
+        if CRLF in original and CRLF not in converted:
+            # This was a CRLF->LF conversion
+            recovered = convert_lf_to_crlf(converted)
+            if recovered != original:
+                msg = (
+                    f"CRLF would be replaced by LF in {path.decode('utf-8', 'replace')}"
+                )
+                if safecrlf == b"true":
+                    raise ValueError(msg)
+                else:  # warn
+                    logger.warning(msg)
+
+        # For LF->CRLF conversion, check if converting back would recover original
+        elif LF in original and CRLF in converted and CRLF not in original:
+            # This was a LF->CRLF conversion
+            recovered = convert_crlf_to_lf(converted)
+            if recovered != original:
+                msg = (
+                    f"LF would be replaced by CRLF in {path.decode('utf-8', 'replace')}"
+                )
+                if safecrlf == b"true":
+                    raise ValueError(msg)
+                else:  # warn
+                    logger.warning(msg)
+
+
 def get_smudge_filter(
     core_eol: str, core_autocrlf: bytes
 ) -> Optional[Callable[[bytes], bytes]]:
@@ -345,6 +484,7 @@ class BlobNormalizer(FilterBlobNormalizer):
         gitattributes: dict[str, Any],
         core_eol: str = "native",
         autocrlf: bytes = b"false",
+        safecrlf: bytes = b"false",
     ) -> None:
         """Initialize FilteringBlobNormalizer."""
         # Set up a filter registry with line ending filters
@@ -360,6 +500,7 @@ class BlobNormalizer(FilterBlobNormalizer):
             clean_conversion=clean_filter or convert_crlf_to_lf,
             smudge_conversion=smudge_filter or convert_lf_to_crlf,
             binary_detection=True,
+            safecrlf=safecrlf,
         )
         filter_registry.register_driver("text", line_ending_filter)
 
@@ -398,12 +539,24 @@ class BlobNormalizer(FilterBlobNormalizer):
         # (autocrlf is enabled), apply it to all files
         if result is blob and self.fallback_write_filter is not None:
             # Apply the clean filter with binary detection
+            # Get safecrlf from config
+            safecrlf = b"false"
+            if hasattr(self, "filter_registry") and hasattr(
+                self.filter_registry, "config_stack"
+            ):
+                safecrlf = self.filter_registry.config_stack.get(
+                    b"core", b"safecrlf", b"false"
+                )
+                if hasattr(safecrlf, "encode"):
+                    safecrlf = safecrlf.encode("utf-8")
+
             line_ending_filter = LineEndingFilter(
                 clean_conversion=self.fallback_write_filter,
                 smudge_conversion=None,
                 binary_detection=True,
+                safecrlf=safecrlf,
             )
-            filtered_data = line_ending_filter.clean(blob.data)
+            filtered_data = line_ending_filter.clean(blob.data, tree_path)
             if filtered_data != blob.data:
                 new_blob = Blob()
                 new_blob.data = filtered_data
@@ -426,12 +579,24 @@ class BlobNormalizer(FilterBlobNormalizer):
         # (autocrlf is enabled), apply it to all files
         if result is blob and self.fallback_read_filter is not None:
             # Apply the smudge filter with binary detection
+            # Get safecrlf from config
+            safecrlf = b"false"
+            if hasattr(self, "filter_registry") and hasattr(
+                self.filter_registry, "config_stack"
+            ):
+                safecrlf = self.filter_registry.config_stack.get(
+                    b"core", b"safecrlf", b"false"
+                )
+                if hasattr(safecrlf, "encode"):
+                    safecrlf = safecrlf.encode("utf-8")
+
             line_ending_filter = LineEndingFilter(
                 clean_conversion=None,
                 smudge_conversion=self.fallback_read_filter,
                 binary_detection=True,
+                safecrlf=safecrlf,
             )
-            filtered_data = line_ending_filter.smudge(blob.data)
+            filtered_data = line_ending_filter.smudge(blob.data, tree_path)
             if filtered_data != blob.data:
                 new_blob = Blob()
                 new_blob.data = filtered_data
@@ -474,9 +639,10 @@ class TreeBlobNormalizer(BlobNormalizer):
         tree: Optional[ObjectID] = None,
         core_eol: str = "native",
         autocrlf: bytes = b"false",
+        safecrlf: bytes = b"false",
     ) -> None:
         """Initialize TreeBlobNormalizer."""
-        super().__init__(config_stack, git_attributes, core_eol, autocrlf)
+        super().__init__(config_stack, git_attributes, core_eol, autocrlf, safecrlf)
         if tree:
             self.existing_paths = {
                 name for name, _, _ in iter_tree_contents(object_store, tree)
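
The check_safecrlf helper introduced above can also be called directly; a minimal sketch (the mixed-ending input is illustrative, taken from the new tests):

    from dulwich.line_ending import check_safecrlf, convert_crlf_to_lf

    original = b"line1\r\nline2\nline3\r\n"   # mixed line endings
    converted = convert_crlf_to_lf(original)  # b"line1\nline2\nline3\n"

    # core.safecrlf=warn: logs "CRLF would be replaced by LF in mixed.txt"
    check_safecrlf(original, converted, b"warn", b"mixed.txt")

    # core.safecrlf=true: the same irreversible conversion raises ValueError
    try:
        check_safecrlf(original, converted, b"true", b"mixed.txt")
    except ValueError as exc:
        print(exc)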

+ 282 - 0
dulwich/whitespace.py

@@ -0,0 +1,282 @@
+# whitespace.py -- Whitespace error detection and fixing
+# Copyright (C) 2025 Dulwich contributors
+#
+# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
+# General Public License as published by the Free Software Foundation; version 2.0
+# or (at your option) any later version. You can redistribute it and/or
+# modify it under the terms of either of these two licenses.
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# You should have received a copy of the licenses; if not, see
+# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
+# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
+# License, Version 2.0.
+#
+"""Whitespace error detection and fixing functionality.
+
+This module implements Git's core.whitespace configuration and related
+whitespace error detection capabilities.
+"""
+
+from typing import Optional
+
+# Default whitespace errors Git checks for
+DEFAULT_WHITESPACE_ERRORS = {
+    "blank-at-eol",
+    "space-before-tab",
+    "blank-at-eof",
+}
+
+# All available whitespace error types
+WHITESPACE_ERROR_TYPES = {
+    "blank-at-eol",  # Trailing whitespace at end of line
+    "space-before-tab",  # Space before tab in indentation
+    "indent-with-non-tab",  # Indent with space when tabs expected (8+ spaces)
+    "tab-in-indent",  # Tab in indentation when spaces expected
+    "blank-at-eof",  # Blank lines at end of file
+    "trailing-space",  # Trailing whitespace (same as blank-at-eol)
+    "cr-at-eol",  # Carriage return at end of line
+    "tabwidth",  # Special: sets tab width (not an error type)
+}
+
+
+def parse_whitespace_config(value: Optional[str]) -> tuple[set[str], int]:
+    """Parse core.whitespace configuration value.
+
+    Args:
+        value: The core.whitespace config value (e.g., "blank-at-eol,space-before-tab")
+
+    Returns:
+        Tuple of (enabled error types, tab width)
+    """
+    if value is None:
+        return DEFAULT_WHITESPACE_ERRORS.copy(), 8
+
+    if value == "":
+        return set(), 8
+
+    # Start with defaults if no explicit errors are specified or if negation is used
+    parts = value.split(",")
+    has_negation = any(p.strip().startswith("-") for p in parts)
+    has_explicit_errors = any(p.strip() in WHITESPACE_ERROR_TYPES for p in parts)
+
+    if has_negation or not has_explicit_errors:
+        enabled = DEFAULT_WHITESPACE_ERRORS.copy()
+    else:
+        enabled = set()
+
+    tab_width = 8
+
+    for part in parts:
+        part = part.strip()
+        if not part:
+            continue
+
+        # Handle negation
+        if part.startswith("-"):
+            error_type = part[1:]
+            if error_type in WHITESPACE_ERROR_TYPES:
+                enabled.discard(error_type)
+        elif part.startswith("tabwidth="):
+            try:
+                tab_width = int(part[9:])
+                if tab_width < 1:
+                    tab_width = 8
+            except ValueError:
+                tab_width = 8
+        elif part in WHITESPACE_ERROR_TYPES:
+            enabled.add(part)
+
+    # Handle aliases
+    if "trailing-space" in enabled:
+        enabled.add("blank-at-eol")
+        enabled.discard("trailing-space")
+
+    return enabled, tab_width
+
+
+class WhitespaceChecker:
+    """Checks for whitespace errors in text content."""
+
+    def __init__(self, enabled_errors: set[str], tab_width: int = 8):
+        """Initialize whitespace checker.
+
+        Args:
+            enabled_errors: Set of error types to check for
+            tab_width: Width of tab character for indentation checking
+        """
+        self.enabled_errors = enabled_errors
+        self.tab_width = tab_width
+
+    def check_line(self, line: bytes, line_num: int) -> list[tuple[str, int]]:
+        """Check a single line for whitespace errors.
+
+        Args:
+            line: Line content (without newline)
+            line_num: Line number (1-based)
+
+        Returns:
+            List of (error_type, line_number) tuples
+        """
+        errors = []
+
+        # Check for trailing whitespace (blank-at-eol)
+        if "blank-at-eol" in self.enabled_errors:
+            if line and (line[-1:] == b" " or line[-1:] == b"\t"):
+                # Find where trailing whitespace starts
+                i = len(line) - 1
+                while i >= 0 and line[i : i + 1] in (b" ", b"\t"):
+                    i -= 1
+                errors.append(("blank-at-eol", line_num))
+
+        # Check for space before tab
+        if "space-before-tab" in self.enabled_errors:
+            # Check in indentation
+            i = 0
+            while i < len(line) and line[i : i + 1] in (b" ", b"\t"):
+                if i > 0 and line[i - 1 : i] == b" " and line[i : i + 1] == b"\t":
+                    errors.append(("space-before-tab", line_num))
+                    break
+                i += 1
+
+        # Check for indent-with-non-tab (8+ spaces at start)
+        if "indent-with-non-tab" in self.enabled_errors:
+            space_count = 0
+            for i in range(len(line)):
+                if line[i : i + 1] == b" ":
+                    space_count += 1
+                    if space_count >= self.tab_width:
+                        errors.append(("indent-with-non-tab", line_num))
+                        break
+                elif line[i : i + 1] == b"\t":
+                    space_count = 0  # Reset on tab
+                else:
+                    break  # Non-whitespace character
+
+        # Check for tab-in-indent
+        if "tab-in-indent" in self.enabled_errors:
+            for i in range(len(line)):
+                if line[i : i + 1] == b"\t":
+                    errors.append(("tab-in-indent", line_num))
+                    break
+                elif line[i : i + 1] not in (b" ", b"\t"):
+                    break  # Non-whitespace character
+
+        # Check for carriage return
+        if "cr-at-eol" in self.enabled_errors:
+            if line and line[-1:] == b"\r":
+                errors.append(("cr-at-eol", line_num))
+
+        return errors
+
+    def check_content(self, content: bytes) -> list[tuple[str, int]]:
+        """Check content for whitespace errors.
+
+        Args:
+            content: File content to check
+
+        Returns:
+            List of (error_type, line_number) tuples
+        """
+        errors = []
+        lines = content.split(b"\n")
+
+        # Handle CRLF line endings
+        for i, line in enumerate(lines):
+            if line.endswith(b"\r"):
+                lines[i] = line[:-1]
+
+        # Check each line
+        for i, line in enumerate(lines):
+            errors.extend(self.check_line(line, i + 1))
+
+        # Check for blank lines at end of file
+        if "blank-at-eof" in self.enabled_errors:
+            # Skip the last empty line if content ends with newline
+            check_lines = lines[:-1] if lines and lines[-1] == b"" else lines
+
+            if check_lines:
+                trailing_blank_count = 0
+                for i in range(len(check_lines) - 1, -1, -1):
+                    if check_lines[i] == b"":
+                        trailing_blank_count += 1
+                    else:
+                        break
+
+                if trailing_blank_count > 0:
+                    # Report the line number of the last non-empty line + 1
+                    errors.append(("blank-at-eof", len(check_lines) + 1))
+
+        return errors
+
+
+def fix_whitespace_errors(
+    content: bytes, errors: list[tuple[str, int]], fix_types: Optional[set[str]] = None
+) -> bytes:
+    """Fix whitespace errors in content.
+
+    Args:
+        content: Original content
+        errors: List of errors from WhitespaceChecker
+        fix_types: Set of error types to fix (None means fix all)
+
+    Returns:
+        Fixed content
+    """
+    if not errors:
+        return content
+
+    lines = content.split(b"\n")
+
+    # Handle CRLF line endings - we need to track which lines had them
+    has_crlf = []
+    for i, line in enumerate(lines):
+        if line.endswith(b"\r"):
+            has_crlf.append(i)
+            lines[i] = line[:-1]
+
+    # Group errors by line
+    errors_by_line: dict[int, list[str]] = {}
+    for error_type, line_num in errors:
+        if fix_types is None or error_type in fix_types:
+            if line_num not in errors_by_line:
+                errors_by_line[line_num] = []
+            errors_by_line[line_num].append(error_type)
+
+    # Fix errors
+    for line_num, error_types in errors_by_line.items():
+        if line_num > len(lines):
+            continue
+
+        line_idx = line_num - 1
+        line = lines[line_idx]
+
+        # Fix trailing whitespace
+        if "blank-at-eol" in error_types:
+            # Remove trailing spaces and tabs
+            while line and line[-1:] in (b" ", b"\t"):
+                line = line[:-1]
+            lines[line_idx] = line
+
+        # Fix carriage return - since we already stripped CRs, we just don't restore them
+        if "cr-at-eol" in error_types and line_idx in has_crlf:
+            has_crlf.remove(line_idx)
+
+    # Restore CRLF for lines that should keep them
+    for idx in has_crlf:
+        if idx < len(lines):
+            lines[idx] = lines[idx] + b"\r"
+
+    # Fix blank lines at end of file
+    if fix_types is None or "blank-at-eof" in fix_types:
+        # Remove trailing empty lines
+        while len(lines) > 1 and lines[-1] == b"" and lines[-2] == b"":
+            lines.pop()
+
+    return b"\n".join(lines)
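
For reference, a short usage sketch of the new module (illustrative only, with a made-up core.whitespace value; behavior follows the tests added below):

    from dulwich.whitespace import (
        WhitespaceChecker,
        fix_whitespace_errors,
        parse_whitespace_config,
    )

    # Equivalent of core.whitespace = "blank-at-eol,tab-in-indent,tabwidth=4"
    enabled, tab_width = parse_whitespace_config("blank-at-eol,tab-in-indent,tabwidth=4")

    checker = WhitespaceChecker(enabled, tab_width=tab_width)
    content = b"\tindented\nno errors\ntrailing \n"
    errors = checker.check_content(content)
    # -> [("tab-in-indent", 1), ("blank-at-eol", 3)]

    # Fix only the trailing-whitespace errors, leaving the tab indentation alone.
    fixed = fix_whitespace_errors(content, errors, fix_types={"blank-at-eol"})
    # -> b"\tindented\nno errors\ntrailing\n"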

+ 131 - 0
tests/test_line_ending.py

@@ -25,6 +25,7 @@ from dulwich.line_ending import (
     BlobNormalizer,
     LineEndingFilter,
     TreeBlobNormalizer,
+    check_safecrlf,
     convert_crlf_to_lf,
     convert_lf_to_crlf,
     get_clean_filter_autocrlf,
@@ -551,3 +552,133 @@ class LineEndingIntegrationTests(TestCase):
         bin_blob.data = b"binary content"
         result = normalizer.checkin_normalize(bin_blob, b"test.bin")
         self.assertEqual(result.data, b"LFS pointer")
+
+
+class LineEndingFilterFromConfigTests(TestCase):
+    """Test LineEndingFilter.from_config classmethod."""
+
+    def test_from_config_none(self) -> None:
+        """Test from_config with no config."""
+        # No config, not for text attr - no conversion
+        filter = LineEndingFilter.from_config(None, for_text_attr=False)
+        self.assertIsNone(filter.clean_conversion)
+        self.assertIsNone(filter.smudge_conversion)
+        self.assertEqual(filter.safecrlf, b"false")
+
+        # No config, for text attr - normalize on checkin
+        filter = LineEndingFilter.from_config(None, for_text_attr=True)
+        self.assertIsNotNone(filter.clean_conversion)
+        self.assertIsNone(filter.smudge_conversion)
+        self.assertEqual(filter.safecrlf, b"false")
+
+    def test_from_config_autocrlf_true(self) -> None:
+        """Test from_config with autocrlf=true."""
+        from dulwich.config import ConfigDict
+
+        config = ConfigDict()
+        config.set(b"core", b"autocrlf", b"true")
+
+        filter = LineEndingFilter.from_config(config, for_text_attr=False)
+        self.assertIsNotNone(filter.clean_conversion)
+        self.assertIsNotNone(filter.smudge_conversion)
+        self.assertEqual(filter.safecrlf, b"false")
+
+    def test_from_config_with_safecrlf(self) -> None:
+        """Test from_config with safecrlf setting."""
+        from dulwich.config import ConfigDict
+
+        config = ConfigDict()
+        config.set(b"core", b"autocrlf", b"input")
+        config.set(b"core", b"safecrlf", b"warn")
+
+        filter = LineEndingFilter.from_config(config, for_text_attr=False)
+        self.assertIsNotNone(filter.clean_conversion)
+        self.assertIsNone(filter.smudge_conversion)
+        self.assertEqual(filter.safecrlf, b"warn")
+
+    def test_from_config_text_attr_overrides(self) -> None:
+        """Test that for_text_attr=True always normalizes on checkin."""
+        from dulwich.config import ConfigDict
+
+        config = ConfigDict()
+        config.set(b"core", b"autocrlf", b"false")
+
+        # Even with autocrlf=false, text attr should normalize
+        filter = LineEndingFilter.from_config(config, for_text_attr=True)
+        self.assertIsNotNone(filter.clean_conversion)
+        # Smudge should still be None since autocrlf=false
+        self.assertIsNone(filter.smudge_conversion)
+
+
+class SafeCRLFTests(TestCase):
+    """Test core.safecrlf functionality."""
+
+    def test_safecrlf_false(self) -> None:
+        """Test that safecrlf=false allows any conversion."""
+        original = b"line1\r\nline2\r\n"
+        converted = b"line1\nline2\n"
+        # Should not raise
+        check_safecrlf(original, converted, b"false", b"test.txt")
+
+    def test_safecrlf_true_safe_conversion(self) -> None:
+        """Test that safecrlf=true allows safe conversions."""
+        # CRLF -> LF -> CRLF is reversible
+        original = b"line1\r\nline2\r\n"
+        converted = b"line1\nline2\n"
+        # Should not raise because conversion is reversible
+        check_safecrlf(original, converted, b"true", b"test.txt")
+
+    def test_safecrlf_true_unsafe_conversion(self) -> None:
+        """Test that safecrlf=true fails on unsafe conversions."""
+        # Mixed line endings would be lost
+        original = b"line1\r\nline2\nline3\r\n"
+        converted = b"line1\nline2\nline3\n"
+        # Should raise because converting back gives all CRLF
+        with self.assertRaises(ValueError) as cm:
+            check_safecrlf(original, converted, b"true", b"test.txt")
+        self.assertIn("CRLF would be replaced by LF", str(cm.exception))
+
+    def test_safecrlf_warn(self) -> None:
+        """Test that safecrlf=warn issues warnings."""
+        # Mixed line endings would be lost
+        original = b"line1\r\nline2\nline3\r\n"
+        converted = b"line1\nline2\nline3\n"
+        # Should warn but not raise
+        with self.assertLogs("dulwich.line_ending", level="WARNING") as cm:
+            check_safecrlf(original, converted, b"warn", b"test.txt")
+            self.assertEqual(len(cm.output), 1)
+            self.assertIn("CRLF would be replaced by LF", cm.output[0])
+
+    def test_lineending_filter_with_safecrlf(self) -> None:
+        """Test LineEndingFilter with safecrlf enabled."""
+        # Test with safecrlf=true
+        filter_strict = LineEndingFilter(
+            clean_conversion=convert_crlf_to_lf,
+            smudge_conversion=None,
+            binary_detection=False,
+            safecrlf=b"true",
+        )
+
+        # Safe conversion should work
+        safe_data = b"line1\r\nline2\r\n"
+        result = filter_strict.clean(safe_data, b"test.txt")
+        self.assertEqual(result, b"line1\nline2\n")
+
+        # Unsafe conversion should fail
+        unsafe_data = b"line1\r\nline2\nline3\r\n"
+        with self.assertRaises(ValueError):
+            filter_strict.clean(unsafe_data, b"test.txt")
+
+        # Test with safecrlf=warn
+        filter_warn = LineEndingFilter(
+            clean_conversion=convert_crlf_to_lf,
+            smudge_conversion=None,
+            binary_detection=False,
+            safecrlf=b"warn",
+        )
+
+        # Should warn but still convert
+        with self.assertLogs("dulwich.line_ending", level="WARNING") as cm:
+            result = filter_warn.clean(unsafe_data, b"test.txt")
+            self.assertEqual(result, b"line1\nline2\nline3\n")
+            self.assertEqual(len(cm.output), 1)

+ 274 - 0
tests/test_whitespace.py

@@ -0,0 +1,274 @@
+# test_whitespace.py -- Tests for whitespace error detection
+# Copyright (C) 2025 Dulwich contributors
+#
+# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
+# General Public License as published by the Free Software Foundation; version 2.0
+# or (at your option) any later version. You can redistribute it and/or
+# modify it under the terms of either of these two licenses.
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# You should have received a copy of the licenses; if not, see
+# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
+# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
+# License, Version 2.0.
+#
+
+"""Tests for whitespace error detection."""
+
+from dulwich.whitespace import (
+    DEFAULT_WHITESPACE_ERRORS,
+    WhitespaceChecker,
+    fix_whitespace_errors,
+    parse_whitespace_config,
+)
+
+from . import TestCase
+
+
+class WhitespaceConfigTests(TestCase):
+    """Test core.whitespace configuration parsing."""
+
+    def test_parse_default(self) -> None:
+        """Test default whitespace configuration."""
+        errors, tab_width = parse_whitespace_config(None)
+        self.assertEqual(errors, DEFAULT_WHITESPACE_ERRORS)
+        self.assertEqual(tab_width, 8)
+
+    def test_parse_empty(self) -> None:
+        """Test empty whitespace configuration."""
+        errors, tab_width = parse_whitespace_config("")
+        self.assertEqual(errors, set())
+        self.assertEqual(tab_width, 8)
+
+    def test_parse_single_error(self) -> None:
+        """Test single error type."""
+        errors, tab_width = parse_whitespace_config("blank-at-eol")
+        self.assertEqual(errors, {"blank-at-eol"})
+        self.assertEqual(tab_width, 8)
+
+    def test_parse_multiple_errors(self) -> None:
+        """Test multiple error types."""
+        errors, tab_width = parse_whitespace_config(
+            "blank-at-eol,space-before-tab,tab-in-indent"
+        )
+        self.assertEqual(errors, {"blank-at-eol", "space-before-tab", "tab-in-indent"})
+        self.assertEqual(tab_width, 8)
+
+    def test_parse_with_negation(self) -> None:
+        """Test negation of default errors."""
+        errors, tab_width = parse_whitespace_config("-blank-at-eol")
+        # Should have defaults minus blank-at-eol
+        expected = DEFAULT_WHITESPACE_ERRORS - {"blank-at-eol"}
+        self.assertEqual(errors, expected)
+        self.assertEqual(tab_width, 8)
+
+    def test_parse_trailing_space_alias(self) -> None:
+        """Test that trailing-space is an alias for blank-at-eol."""
+        errors, tab_width = parse_whitespace_config("trailing-space")
+        self.assertEqual(errors, {"blank-at-eol"})
+        self.assertEqual(tab_width, 8)
+
+    def test_parse_tabwidth(self) -> None:
+        """Test tabwidth setting."""
+        errors, tab_width = parse_whitespace_config("blank-at-eol,tabwidth=4")
+        self.assertEqual(errors, {"blank-at-eol"})
+        self.assertEqual(tab_width, 4)
+
+    def test_parse_invalid_tabwidth(self) -> None:
+        """Test invalid tabwidth defaults to 8."""
+        errors, tab_width = parse_whitespace_config("tabwidth=invalid")
+        self.assertEqual(tab_width, 8)
+
+        errors, tab_width = parse_whitespace_config("tabwidth=0")
+        self.assertEqual(tab_width, 8)
+
+
+class WhitespaceCheckerTests(TestCase):
+    """Test WhitespaceChecker functionality."""
+
+    def test_blank_at_eol(self) -> None:
+        """Test detection of trailing whitespace."""
+        checker = WhitespaceChecker({"blank-at-eol"})
+
+        # No trailing whitespace
+        errors = checker.check_line(b"normal line", 1)
+        self.assertEqual(errors, [])
+
+        # Trailing space
+        errors = checker.check_line(b"trailing space ", 1)
+        self.assertEqual(errors, [("blank-at-eol", 1)])
+
+        # Trailing tab
+        errors = checker.check_line(b"trailing tab\t", 1)
+        self.assertEqual(errors, [("blank-at-eol", 1)])
+
+        # Multiple trailing whitespace
+        errors = checker.check_line(b"multiple  \t ", 1)
+        self.assertEqual(errors, [("blank-at-eol", 1)])
+
+    def test_space_before_tab(self) -> None:
+        """Test detection of space before tab in indentation."""
+        checker = WhitespaceChecker({"space-before-tab"})
+
+        # No space before tab
+        errors = checker.check_line(b"\tindented", 1)
+        self.assertEqual(errors, [])
+
+        # Space before tab in indentation
+        errors = checker.check_line(b" \tindented", 1)
+        self.assertEqual(errors, [("space-before-tab", 1)])
+
+        # Space before tab not in indentation (should not trigger)
+        errors = checker.check_line(b"code \t comment", 1)
+        self.assertEqual(errors, [])
+
+    def test_indent_with_non_tab(self) -> None:
+        """Test detection of 8+ spaces at start of line."""
+        checker = WhitespaceChecker({"indent-with-non-tab"}, tab_width=8)
+
+        # Less than 8 spaces
+        errors = checker.check_line(b"    code", 1)
+        self.assertEqual(errors, [])
+
+        # Exactly 8 spaces
+        errors = checker.check_line(b"        code", 1)
+        self.assertEqual(errors, [("indent-with-non-tab", 1)])
+
+        # More than 8 spaces
+        errors = checker.check_line(b"         code", 1)
+        self.assertEqual(errors, [("indent-with-non-tab", 1)])
+
+        # Tab after spaces resets count
+        errors = checker.check_line(b"    \t    code", 1)
+        self.assertEqual(errors, [])
+
+        # Custom tab width
+        checker = WhitespaceChecker({"indent-with-non-tab"}, tab_width=4)
+        errors = checker.check_line(b"    code", 1)
+        self.assertEqual(errors, [("indent-with-non-tab", 1)])
+
+    def test_tab_in_indent(self) -> None:
+        """Test detection of tabs in indentation."""
+        checker = WhitespaceChecker({"tab-in-indent"})
+
+        # No tabs
+        errors = checker.check_line(b"    code", 1)
+        self.assertEqual(errors, [])
+
+        # Tab in indentation
+        errors = checker.check_line(b"\tcode", 1)
+        self.assertEqual(errors, [("tab-in-indent", 1)])
+
+        # Tab after non-whitespace (should not trigger)
+        errors = checker.check_line(b"code\tcomment", 1)
+        self.assertEqual(errors, [])
+
+    def test_cr_at_eol(self) -> None:
+        """Test detection of carriage return at end of line."""
+        checker = WhitespaceChecker({"cr-at-eol"})
+
+        # No CR
+        errors = checker.check_line(b"normal line", 1)
+        self.assertEqual(errors, [])
+
+        # CR at end
+        errors = checker.check_line(b"line\r", 1)
+        self.assertEqual(errors, [("cr-at-eol", 1)])
+
+    def test_blank_at_eof(self) -> None:
+        """Test detection of blank lines at end of file."""
+        checker = WhitespaceChecker({"blank-at-eof"})
+
+        # No trailing blank lines
+        content = b"line1\nline2\nline3"
+        errors = checker.check_content(content)
+        self.assertEqual(errors, [])
+
+        # One trailing blank line (normal for files ending with newline)
+        content = b"line1\nline2\nline3\n"
+        errors = checker.check_content(content)
+        self.assertEqual(errors, [])
+
+        # Multiple trailing blank lines
+        content = b"line1\nline2\n\n\n"
+        errors = checker.check_content(content)
+        self.assertEqual(errors, [("blank-at-eof", 5)])
+
+        # Only blank lines
+        content = b"\n\n\n"
+        errors = checker.check_content(content)
+        self.assertEqual(errors, [("blank-at-eof", 4)])
+
+    def test_multiple_errors(self) -> None:
+        """Test detection of multiple error types."""
+        checker = WhitespaceChecker(
+            {"blank-at-eol", "space-before-tab", "tab-in-indent"}
+        )
+
+        # Line with multiple errors
+        errors = checker.check_line(b" \tcode  ", 1)
+        error_types = {e[0] for e in errors}
+        self.assertEqual(
+            error_types, {"blank-at-eol", "space-before-tab", "tab-in-indent"}
+        )
+
+    def test_check_content_crlf(self) -> None:
+        """Test content checking with CRLF line endings."""
+        checker = WhitespaceChecker({"blank-at-eol", "cr-at-eol"})
+
+        # CRLF line endings
+        content = b"line1\r\nline2 \r\nline3\r\n"
+        errors = checker.check_content(content)
+        # Should detect trailing space on line 2 but not CR (since CRLF is handled)
+        self.assertEqual(errors, [("blank-at-eol", 2)])
+
+
+class WhitespaceFixTests(TestCase):
+    """Test whitespace error fixing."""
+
+    def test_fix_blank_at_eol(self) -> None:
+        """Test fixing trailing whitespace."""
+        content = b"line1  \nline2\t\nline3"
+        errors = [("blank-at-eol", 1), ("blank-at-eol", 2)]
+        fixed = fix_whitespace_errors(content, errors)
+        self.assertEqual(fixed, b"line1\nline2\nline3")
+
+    def test_fix_blank_at_eof(self) -> None:
+        """Test fixing blank lines at end of file."""
+        content = b"line1\nline2\n\n\n"
+        errors = [("blank-at-eof", 4)]
+        fixed = fix_whitespace_errors(content, errors)
+        self.assertEqual(fixed, b"line1\nline2\n")
+
+    def test_fix_cr_at_eol(self) -> None:
+        """Test fixing carriage returns."""
+        content = b"line1\r\nline2\r\nline3\r"
+        errors = [("cr-at-eol", 1), ("cr-at-eol", 2), ("cr-at-eol", 3)]
+        fixed = fix_whitespace_errors(content, errors)
+        # Our fix function removes all CRs when cr-at-eol errors are fixed
+        self.assertEqual(fixed, b"line1\nline2\nline3")
+
+    def test_fix_specific_types(self) -> None:
+        """Test fixing only specific error types."""
+        content = b"line1  \nline2\n\n\n"
+        errors = [("blank-at-eol", 1), ("blank-at-eof", 4)]
+
+        # Fix only blank-at-eol
+        fixed = fix_whitespace_errors(content, errors, fix_types={"blank-at-eol"})
+        self.assertEqual(fixed, b"line1\nline2\n\n\n")
+
+        # Fix only blank-at-eof
+        fixed = fix_whitespace_errors(content, errors, fix_types={"blank-at-eof"})
+        self.assertEqual(fixed, b"line1  \nline2\n")
+
+    def test_fix_no_errors(self) -> None:
+        """Test fixing with no errors returns original content."""
+        content = b"line1\nline2\nline3"
+        fixed = fix_whitespace_errors(content, [])
+        self.assertEqual(fixed, content)