whitespace.py 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292
  1. # whitespace.py -- Whitespace error detection and fixing
  2. # Copyright (C) 2025 Dulwich contributors
  3. #
  4. # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
  5. # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
  6. # General Public License as published by the Free Software Foundation; version 2.0
  7. # or (at your option) any later version. You can redistribute it and/or
  8. # modify it under the terms of either of these two licenses.
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. # You should have received a copy of the licenses; if not, see
  17. # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  18. # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  19. # License, Version 2.0.
  20. #
  21. """Whitespace error detection and fixing functionality.
  22. This module implements Git's core.whitespace configuration and related
  23. whitespace error detection capabilities.
  24. """
# Public API of this module: the names exported by ``from ... import *``.
__all__ = [
    "DEFAULT_WHITESPACE_ERRORS",
    "WHITESPACE_ERROR_TYPES",
    "WhitespaceChecker",
    "fix_whitespace_errors",
    "parse_whitespace_config",
]
  32. from collections.abc import Sequence, Set
# Default whitespace errors Git checks for.
# parse_whitespace_config() hands out copies of this set, so the module-level
# default is never mutated by callers.
DEFAULT_WHITESPACE_ERRORS = {
    "blank-at-eol",
    "space-before-tab",
    "blank-at-eof",
}

# All names recognised in a core.whitespace value
WHITESPACE_ERROR_TYPES = {
    "blank-at-eol",  # Trailing whitespace at end of line
    "space-before-tab",  # Space before tab in indentation
    "indent-with-non-tab",  # Indent with a run of spaces at least one tab width long
    "tab-in-indent",  # Tab in indentation when spaces expected
    "blank-at-eof",  # Blank lines at end of file
    "trailing-space",  # Alias: parse_whitespace_config() rewrites it to blank-at-eol
    "cr-at-eol",  # Carriage return at end of line
    "tabwidth",  # Special: given as "tabwidth=<n>" to set the tab width (not an error type)
}
  50. def parse_whitespace_config(value: str | None) -> tuple[set[str], int]:
  51. """Parse core.whitespace configuration value.
  52. Args:
  53. value: The core.whitespace config value (e.g., "blank-at-eol,space-before-tab")
  54. Returns:
  55. Tuple of (enabled error types, tab width)
  56. """
  57. if value is None:
  58. return DEFAULT_WHITESPACE_ERRORS.copy(), 8
  59. if value == "":
  60. return set(), 8
  61. # Start with defaults if no explicit errors are specified or if negation is used
  62. parts = value.split(",")
  63. has_negation = any(p.strip().startswith("-") for p in parts)
  64. has_explicit_errors = any(p.strip() in WHITESPACE_ERROR_TYPES for p in parts)
  65. if has_negation or not has_explicit_errors:
  66. enabled = DEFAULT_WHITESPACE_ERRORS.copy()
  67. else:
  68. enabled = set()
  69. tab_width = 8
  70. for part in parts:
  71. part = part.strip()
  72. if not part:
  73. continue
  74. # Handle negation
  75. if part.startswith("-"):
  76. error_type = part[1:]
  77. if error_type in WHITESPACE_ERROR_TYPES:
  78. enabled.discard(error_type)
  79. elif part.startswith("tabwidth="):
  80. try:
  81. tab_width = int(part[9:])
  82. if tab_width < 1:
  83. tab_width = 8
  84. except ValueError:
  85. tab_width = 8
  86. elif part in WHITESPACE_ERROR_TYPES:
  87. enabled.add(part)
  88. # Handle aliases
  89. if "trailing-space" in enabled:
  90. enabled.add("blank-at-eol")
  91. enabled.discard("trailing-space")
  92. return enabled, tab_width
  93. class WhitespaceChecker:
  94. """Checks for whitespace errors in text content."""
  95. def __init__(self, enabled_errors: set[str], tab_width: int = 8):
  96. """Initialize whitespace checker.
  97. Args:
  98. enabled_errors: Set of error types to check for
  99. tab_width: Width of tab character for indentation checking
  100. """
  101. self.enabled_errors = enabled_errors
  102. self.tab_width = tab_width
  103. def check_line(self, line: bytes, line_num: int) -> list[tuple[str, int]]:
  104. """Check a single line for whitespace errors.
  105. Args:
  106. line: Line content (without newline)
  107. line_num: Line number (1-based)
  108. Returns:
  109. List of (error_type, line_number) tuples
  110. """
  111. errors = []
  112. # Check for trailing whitespace (blank-at-eol)
  113. if "blank-at-eol" in self.enabled_errors:
  114. if line and (line[-1:] == b" " or line[-1:] == b"\t"):
  115. # Find where trailing whitespace starts
  116. i = len(line) - 1
  117. while i >= 0 and line[i : i + 1] in (b" ", b"\t"):
  118. i -= 1
  119. errors.append(("blank-at-eol", line_num))
  120. # Check for space before tab
  121. if "space-before-tab" in self.enabled_errors:
  122. # Check in indentation
  123. i = 0
  124. while i < len(line) and line[i : i + 1] in (b" ", b"\t"):
  125. if i > 0 and line[i - 1 : i] == b" " and line[i : i + 1] == b"\t":
  126. errors.append(("space-before-tab", line_num))
  127. break
  128. i += 1
  129. # Check for indent-with-non-tab (8+ spaces at start)
  130. if "indent-with-non-tab" in self.enabled_errors:
  131. space_count = 0
  132. for i in range(len(line)):
  133. if line[i : i + 1] == b" ":
  134. space_count += 1
  135. if space_count >= self.tab_width:
  136. errors.append(("indent-with-non-tab", line_num))
  137. break
  138. elif line[i : i + 1] == b"\t":
  139. space_count = 0 # Reset on tab
  140. else:
  141. break # Non-whitespace character
  142. # Check for tab-in-indent
  143. if "tab-in-indent" in self.enabled_errors:
  144. for i in range(len(line)):
  145. if line[i : i + 1] == b"\t":
  146. errors.append(("tab-in-indent", line_num))
  147. break
  148. elif line[i : i + 1] not in (b" ", b"\t"):
  149. break # Non-whitespace character
  150. # Check for carriage return
  151. if "cr-at-eol" in self.enabled_errors:
  152. if line and line[-1:] == b"\r":
  153. errors.append(("cr-at-eol", line_num))
  154. return errors
  155. def check_content(self, content: bytes) -> list[tuple[str, int]]:
  156. """Check content for whitespace errors.
  157. Args:
  158. content: File content to check
  159. Returns:
  160. List of (error_type, line_number) tuples
  161. """
  162. errors = []
  163. lines = content.split(b"\n")
  164. # Handle CRLF line endings
  165. for i, line in enumerate(lines):
  166. if line.endswith(b"\r"):
  167. lines[i] = line[:-1]
  168. # Check each line
  169. for i, line in enumerate(lines):
  170. errors.extend(self.check_line(line, i + 1))
  171. # Check for blank lines at end of file
  172. if "blank-at-eof" in self.enabled_errors:
  173. # Skip the last empty line if content ends with newline
  174. check_lines = lines[:-1] if lines and lines[-1] == b"" else lines
  175. if check_lines:
  176. trailing_blank_count = 0
  177. for i in range(len(check_lines) - 1, -1, -1):
  178. if check_lines[i] == b"":
  179. trailing_blank_count += 1
  180. else:
  181. break
  182. if trailing_blank_count > 0:
  183. # Report the line number of the last non-empty line + 1
  184. errors.append(("blank-at-eof", len(check_lines) + 1))
  185. return errors
  186. def fix_whitespace_errors(
  187. content: bytes,
  188. errors: Sequence[tuple[str, int]],
  189. fix_types: Set[str] | None = None,
  190. ) -> bytes:
  191. """Fix whitespace errors in content.
  192. Args:
  193. content: Original content
  194. errors: List of errors from WhitespaceChecker
  195. fix_types: Set of error types to fix (None means fix all)
  196. Returns:
  197. Fixed content
  198. """
  199. if not errors:
  200. return content
  201. lines = content.split(b"\n")
  202. # Handle CRLF line endings - we need to track which lines had them
  203. has_crlf = []
  204. for i, line in enumerate(lines):
  205. if line.endswith(b"\r"):
  206. has_crlf.append(i)
  207. lines[i] = line[:-1]
  208. # Group errors by line
  209. errors_by_line: dict[int, list[str]] = {}
  210. for error_type, line_num in errors:
  211. if fix_types is None or error_type in fix_types:
  212. if line_num not in errors_by_line:
  213. errors_by_line[line_num] = []
  214. errors_by_line[line_num].append(error_type)
  215. # Fix errors
  216. for line_num, error_types in errors_by_line.items():
  217. if line_num > len(lines):
  218. continue
  219. line_idx = line_num - 1
  220. line = lines[line_idx]
  221. # Fix trailing whitespace
  222. if "blank-at-eol" in error_types:
  223. # Remove trailing spaces and tabs
  224. while line and line[-1:] in (b" ", b"\t"):
  225. line = line[:-1]
  226. lines[line_idx] = line
  227. # Fix carriage return - since we already stripped CRs, we just don't restore them
  228. if "cr-at-eol" in error_types and line_idx in has_crlf:
  229. has_crlf.remove(line_idx)
  230. # Restore CRLF for lines that should keep them
  231. for idx in has_crlf:
  232. if idx < len(lines):
  233. lines[idx] = lines[idx] + b"\r"
  234. # Fix blank lines at end of file
  235. if fix_types is None or "blank-at-eof" in fix_types:
  236. # Remove trailing empty lines
  237. while len(lines) > 1 and lines[-1] == b"" and lines[-2] == b"":
  238. lines.pop()
  239. return b"\n".join(lines)