whitespace.py 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285
  1. # whitespace.py -- Whitespace error detection and fixing
  2. # Copyright (C) 2025 Dulwich contributors
  3. #
  4. # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
  5. # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
  6. # General Public License as published by the Free Software Foundation; version 2.0
  7. # or (at your option) any later version. You can redistribute it and/or
  8. # modify it under the terms of either of these two licenses.
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. # You should have received a copy of the licenses; if not, see
  17. # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  18. # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  19. # License, Version 2.0.
  20. #
  21. """Whitespace error detection and fixing functionality.
  22. This module implements Git's core.whitespace configuration and related
  23. whitespace error detection capabilities.
  24. """
  25. from collections.abc import Sequence, Set
  26. from typing import Optional
  27. # Default whitespace errors Git checks for
  28. DEFAULT_WHITESPACE_ERRORS = {
  29. "blank-at-eol",
  30. "space-before-tab",
  31. "blank-at-eof",
  32. }
  33. # All available whitespace error types
  34. WHITESPACE_ERROR_TYPES = {
  35. "blank-at-eol", # Trailing whitespace at end of line
  36. "space-before-tab", # Space before tab in indentation
  37. "indent-with-non-tab", # Indent with space when tabs expected (8+ spaces)
  38. "tab-in-indent", # Tab in indentation when spaces expected
  39. "blank-at-eof", # Blank lines at end of file
  40. "trailing-space", # Trailing whitespace (same as blank-at-eol)
  41. "cr-at-eol", # Carriage return at end of line
  42. "tabwidth", # Special: sets tab width (not an error type)
  43. }
  44. def parse_whitespace_config(value: Optional[str]) -> tuple[set[str], int]:
  45. """Parse core.whitespace configuration value.
  46. Args:
  47. value: The core.whitespace config value (e.g., "blank-at-eol,space-before-tab")
  48. Returns:
  49. Tuple of (enabled error types, tab width)
  50. """
  51. if value is None:
  52. return DEFAULT_WHITESPACE_ERRORS.copy(), 8
  53. if value == "":
  54. return set(), 8
  55. # Start with defaults if no explicit errors are specified or if negation is used
  56. parts = value.split(",")
  57. has_negation = any(p.strip().startswith("-") for p in parts)
  58. has_explicit_errors = any(p.strip() in WHITESPACE_ERROR_TYPES for p in parts)
  59. if has_negation or not has_explicit_errors:
  60. enabled = DEFAULT_WHITESPACE_ERRORS.copy()
  61. else:
  62. enabled = set()
  63. tab_width = 8
  64. for part in parts:
  65. part = part.strip()
  66. if not part:
  67. continue
  68. # Handle negation
  69. if part.startswith("-"):
  70. error_type = part[1:]
  71. if error_type in WHITESPACE_ERROR_TYPES:
  72. enabled.discard(error_type)
  73. elif part.startswith("tabwidth="):
  74. try:
  75. tab_width = int(part[9:])
  76. if tab_width < 1:
  77. tab_width = 8
  78. except ValueError:
  79. tab_width = 8
  80. elif part in WHITESPACE_ERROR_TYPES:
  81. enabled.add(part)
  82. # Handle aliases
  83. if "trailing-space" in enabled:
  84. enabled.add("blank-at-eol")
  85. enabled.discard("trailing-space")
  86. return enabled, tab_width
  87. class WhitespaceChecker:
  88. """Checks for whitespace errors in text content."""
  89. def __init__(self, enabled_errors: set[str], tab_width: int = 8):
  90. """Initialize whitespace checker.
  91. Args:
  92. enabled_errors: Set of error types to check for
  93. tab_width: Width of tab character for indentation checking
  94. """
  95. self.enabled_errors = enabled_errors
  96. self.tab_width = tab_width
  97. def check_line(self, line: bytes, line_num: int) -> list[tuple[str, int]]:
  98. """Check a single line for whitespace errors.
  99. Args:
  100. line: Line content (without newline)
  101. line_num: Line number (1-based)
  102. Returns:
  103. List of (error_type, line_number) tuples
  104. """
  105. errors = []
  106. # Check for trailing whitespace (blank-at-eol)
  107. if "blank-at-eol" in self.enabled_errors:
  108. if line and (line[-1:] == b" " or line[-1:] == b"\t"):
  109. # Find where trailing whitespace starts
  110. i = len(line) - 1
  111. while i >= 0 and line[i : i + 1] in (b" ", b"\t"):
  112. i -= 1
  113. errors.append(("blank-at-eol", line_num))
  114. # Check for space before tab
  115. if "space-before-tab" in self.enabled_errors:
  116. # Check in indentation
  117. i = 0
  118. while i < len(line) and line[i : i + 1] in (b" ", b"\t"):
  119. if i > 0 and line[i - 1 : i] == b" " and line[i : i + 1] == b"\t":
  120. errors.append(("space-before-tab", line_num))
  121. break
  122. i += 1
  123. # Check for indent-with-non-tab (8+ spaces at start)
  124. if "indent-with-non-tab" in self.enabled_errors:
  125. space_count = 0
  126. for i in range(len(line)):
  127. if line[i : i + 1] == b" ":
  128. space_count += 1
  129. if space_count >= self.tab_width:
  130. errors.append(("indent-with-non-tab", line_num))
  131. break
  132. elif line[i : i + 1] == b"\t":
  133. space_count = 0 # Reset on tab
  134. else:
  135. break # Non-whitespace character
  136. # Check for tab-in-indent
  137. if "tab-in-indent" in self.enabled_errors:
  138. for i in range(len(line)):
  139. if line[i : i + 1] == b"\t":
  140. errors.append(("tab-in-indent", line_num))
  141. break
  142. elif line[i : i + 1] not in (b" ", b"\t"):
  143. break # Non-whitespace character
  144. # Check for carriage return
  145. if "cr-at-eol" in self.enabled_errors:
  146. if line and line[-1:] == b"\r":
  147. errors.append(("cr-at-eol", line_num))
  148. return errors
  149. def check_content(self, content: bytes) -> list[tuple[str, int]]:
  150. """Check content for whitespace errors.
  151. Args:
  152. content: File content to check
  153. Returns:
  154. List of (error_type, line_number) tuples
  155. """
  156. errors = []
  157. lines = content.split(b"\n")
  158. # Handle CRLF line endings
  159. for i, line in enumerate(lines):
  160. if line.endswith(b"\r"):
  161. lines[i] = line[:-1]
  162. # Check each line
  163. for i, line in enumerate(lines):
  164. errors.extend(self.check_line(line, i + 1))
  165. # Check for blank lines at end of file
  166. if "blank-at-eof" in self.enabled_errors:
  167. # Skip the last empty line if content ends with newline
  168. check_lines = lines[:-1] if lines and lines[-1] == b"" else lines
  169. if check_lines:
  170. trailing_blank_count = 0
  171. for i in range(len(check_lines) - 1, -1, -1):
  172. if check_lines[i] == b"":
  173. trailing_blank_count += 1
  174. else:
  175. break
  176. if trailing_blank_count > 0:
  177. # Report the line number of the last non-empty line + 1
  178. errors.append(("blank-at-eof", len(check_lines) + 1))
  179. return errors
  180. def fix_whitespace_errors(
  181. content: bytes,
  182. errors: Sequence[tuple[str, int]],
  183. fix_types: Optional[Set[str]] = None,
  184. ) -> bytes:
  185. """Fix whitespace errors in content.
  186. Args:
  187. content: Original content
  188. errors: List of errors from WhitespaceChecker
  189. fix_types: Set of error types to fix (None means fix all)
  190. Returns:
  191. Fixed content
  192. """
  193. if not errors:
  194. return content
  195. lines = content.split(b"\n")
  196. # Handle CRLF line endings - we need to track which lines had them
  197. has_crlf = []
  198. for i, line in enumerate(lines):
  199. if line.endswith(b"\r"):
  200. has_crlf.append(i)
  201. lines[i] = line[:-1]
  202. # Group errors by line
  203. errors_by_line: dict[int, list[str]] = {}
  204. for error_type, line_num in errors:
  205. if fix_types is None or error_type in fix_types:
  206. if line_num not in errors_by_line:
  207. errors_by_line[line_num] = []
  208. errors_by_line[line_num].append(error_type)
  209. # Fix errors
  210. for line_num, error_types in errors_by_line.items():
  211. if line_num > len(lines):
  212. continue
  213. line_idx = line_num - 1
  214. line = lines[line_idx]
  215. # Fix trailing whitespace
  216. if "blank-at-eol" in error_types:
  217. # Remove trailing spaces and tabs
  218. while line and line[-1:] in (b" ", b"\t"):
  219. line = line[:-1]
  220. lines[line_idx] = line
  221. # Fix carriage return - since we already stripped CRs, we just don't restore them
  222. if "cr-at-eol" in error_types and line_idx in has_crlf:
  223. has_crlf.remove(line_idx)
  224. # Restore CRLF for lines that should keep them
  225. for idx in has_crlf:
  226. if idx < len(lines):
  227. lines[idx] = lines[idx] + b"\r"
  228. # Fix blank lines at end of file
  229. if fix_types is None or "blank-at-eof" in fix_types:
  230. # Remove trailing empty lines
  231. while len(lines) > 1 and lines[-1] == b"" and lines[-2] == b"":
  232. lines.pop()
  233. return b"\n".join(lines)