whitespace.py 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284
  1. # whitespace.py -- Whitespace error detection and fixing
  2. # Copyright (C) 2025 Dulwich contributors
  3. #
  4. # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
  5. # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
  6. # General Public License as published by the Free Software Foundation; version 2.0
  7. # or (at your option) any later version. You can redistribute it and/or
  8. # modify it under the terms of either of these two licenses.
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. # You should have received a copy of the licenses; if not, see
  17. # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  18. # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  19. # License, Version 2.0.
  20. #
  21. """Whitespace error detection and fixing functionality.
  22. This module implements Git's core.whitespace configuration and related
  23. whitespace error detection capabilities.
  24. """
  25. from collections.abc import Sequence, Set
  26. # Default whitespace errors Git checks for
  27. DEFAULT_WHITESPACE_ERRORS = {
  28. "blank-at-eol",
  29. "space-before-tab",
  30. "blank-at-eof",
  31. }
  32. # All available whitespace error types
  33. WHITESPACE_ERROR_TYPES = {
  34. "blank-at-eol", # Trailing whitespace at end of line
  35. "space-before-tab", # Space before tab in indentation
  36. "indent-with-non-tab", # Indent with space when tabs expected (8+ spaces)
  37. "tab-in-indent", # Tab in indentation when spaces expected
  38. "blank-at-eof", # Blank lines at end of file
  39. "trailing-space", # Trailing whitespace (same as blank-at-eol)
  40. "cr-at-eol", # Carriage return at end of line
  41. "tabwidth", # Special: sets tab width (not an error type)
  42. }
  43. def parse_whitespace_config(value: str | None) -> tuple[set[str], int]:
  44. """Parse core.whitespace configuration value.
  45. Args:
  46. value: The core.whitespace config value (e.g., "blank-at-eol,space-before-tab")
  47. Returns:
  48. Tuple of (enabled error types, tab width)
  49. """
  50. if value is None:
  51. return DEFAULT_WHITESPACE_ERRORS.copy(), 8
  52. if value == "":
  53. return set(), 8
  54. # Start with defaults if no explicit errors are specified or if negation is used
  55. parts = value.split(",")
  56. has_negation = any(p.strip().startswith("-") for p in parts)
  57. has_explicit_errors = any(p.strip() in WHITESPACE_ERROR_TYPES for p in parts)
  58. if has_negation or not has_explicit_errors:
  59. enabled = DEFAULT_WHITESPACE_ERRORS.copy()
  60. else:
  61. enabled = set()
  62. tab_width = 8
  63. for part in parts:
  64. part = part.strip()
  65. if not part:
  66. continue
  67. # Handle negation
  68. if part.startswith("-"):
  69. error_type = part[1:]
  70. if error_type in WHITESPACE_ERROR_TYPES:
  71. enabled.discard(error_type)
  72. elif part.startswith("tabwidth="):
  73. try:
  74. tab_width = int(part[9:])
  75. if tab_width < 1:
  76. tab_width = 8
  77. except ValueError:
  78. tab_width = 8
  79. elif part in WHITESPACE_ERROR_TYPES:
  80. enabled.add(part)
  81. # Handle aliases
  82. if "trailing-space" in enabled:
  83. enabled.add("blank-at-eol")
  84. enabled.discard("trailing-space")
  85. return enabled, tab_width
  86. class WhitespaceChecker:
  87. """Checks for whitespace errors in text content."""
  88. def __init__(self, enabled_errors: set[str], tab_width: int = 8):
  89. """Initialize whitespace checker.
  90. Args:
  91. enabled_errors: Set of error types to check for
  92. tab_width: Width of tab character for indentation checking
  93. """
  94. self.enabled_errors = enabled_errors
  95. self.tab_width = tab_width
  96. def check_line(self, line: bytes, line_num: int) -> list[tuple[str, int]]:
  97. """Check a single line for whitespace errors.
  98. Args:
  99. line: Line content (without newline)
  100. line_num: Line number (1-based)
  101. Returns:
  102. List of (error_type, line_number) tuples
  103. """
  104. errors = []
  105. # Check for trailing whitespace (blank-at-eol)
  106. if "blank-at-eol" in self.enabled_errors:
  107. if line and (line[-1:] == b" " or line[-1:] == b"\t"):
  108. # Find where trailing whitespace starts
  109. i = len(line) - 1
  110. while i >= 0 and line[i : i + 1] in (b" ", b"\t"):
  111. i -= 1
  112. errors.append(("blank-at-eol", line_num))
  113. # Check for space before tab
  114. if "space-before-tab" in self.enabled_errors:
  115. # Check in indentation
  116. i = 0
  117. while i < len(line) and line[i : i + 1] in (b" ", b"\t"):
  118. if i > 0 and line[i - 1 : i] == b" " and line[i : i + 1] == b"\t":
  119. errors.append(("space-before-tab", line_num))
  120. break
  121. i += 1
  122. # Check for indent-with-non-tab (8+ spaces at start)
  123. if "indent-with-non-tab" in self.enabled_errors:
  124. space_count = 0
  125. for i in range(len(line)):
  126. if line[i : i + 1] == b" ":
  127. space_count += 1
  128. if space_count >= self.tab_width:
  129. errors.append(("indent-with-non-tab", line_num))
  130. break
  131. elif line[i : i + 1] == b"\t":
  132. space_count = 0 # Reset on tab
  133. else:
  134. break # Non-whitespace character
  135. # Check for tab-in-indent
  136. if "tab-in-indent" in self.enabled_errors:
  137. for i in range(len(line)):
  138. if line[i : i + 1] == b"\t":
  139. errors.append(("tab-in-indent", line_num))
  140. break
  141. elif line[i : i + 1] not in (b" ", b"\t"):
  142. break # Non-whitespace character
  143. # Check for carriage return
  144. if "cr-at-eol" in self.enabled_errors:
  145. if line and line[-1:] == b"\r":
  146. errors.append(("cr-at-eol", line_num))
  147. return errors
  148. def check_content(self, content: bytes) -> list[tuple[str, int]]:
  149. """Check content for whitespace errors.
  150. Args:
  151. content: File content to check
  152. Returns:
  153. List of (error_type, line_number) tuples
  154. """
  155. errors = []
  156. lines = content.split(b"\n")
  157. # Handle CRLF line endings
  158. for i, line in enumerate(lines):
  159. if line.endswith(b"\r"):
  160. lines[i] = line[:-1]
  161. # Check each line
  162. for i, line in enumerate(lines):
  163. errors.extend(self.check_line(line, i + 1))
  164. # Check for blank lines at end of file
  165. if "blank-at-eof" in self.enabled_errors:
  166. # Skip the last empty line if content ends with newline
  167. check_lines = lines[:-1] if lines and lines[-1] == b"" else lines
  168. if check_lines:
  169. trailing_blank_count = 0
  170. for i in range(len(check_lines) - 1, -1, -1):
  171. if check_lines[i] == b"":
  172. trailing_blank_count += 1
  173. else:
  174. break
  175. if trailing_blank_count > 0:
  176. # Report the line number of the last non-empty line + 1
  177. errors.append(("blank-at-eof", len(check_lines) + 1))
  178. return errors
  179. def fix_whitespace_errors(
  180. content: bytes,
  181. errors: Sequence[tuple[str, int]],
  182. fix_types: Set[str] | None = None,
  183. ) -> bytes:
  184. """Fix whitespace errors in content.
  185. Args:
  186. content: Original content
  187. errors: List of errors from WhitespaceChecker
  188. fix_types: Set of error types to fix (None means fix all)
  189. Returns:
  190. Fixed content
  191. """
  192. if not errors:
  193. return content
  194. lines = content.split(b"\n")
  195. # Handle CRLF line endings - we need to track which lines had them
  196. has_crlf = []
  197. for i, line in enumerate(lines):
  198. if line.endswith(b"\r"):
  199. has_crlf.append(i)
  200. lines[i] = line[:-1]
  201. # Group errors by line
  202. errors_by_line: dict[int, list[str]] = {}
  203. for error_type, line_num in errors:
  204. if fix_types is None or error_type in fix_types:
  205. if line_num not in errors_by_line:
  206. errors_by_line[line_num] = []
  207. errors_by_line[line_num].append(error_type)
  208. # Fix errors
  209. for line_num, error_types in errors_by_line.items():
  210. if line_num > len(lines):
  211. continue
  212. line_idx = line_num - 1
  213. line = lines[line_idx]
  214. # Fix trailing whitespace
  215. if "blank-at-eol" in error_types:
  216. # Remove trailing spaces and tabs
  217. while line and line[-1:] in (b" ", b"\t"):
  218. line = line[:-1]
  219. lines[line_idx] = line
  220. # Fix carriage return - since we already stripped CRs, we just don't restore them
  221. if "cr-at-eol" in error_types and line_idx in has_crlf:
  222. has_crlf.remove(line_idx)
  223. # Restore CRLF for lines that should keep them
  224. for idx in has_crlf:
  225. if idx < len(lines):
  226. lines[idx] = lines[idx] + b"\r"
  227. # Fix blank lines at end of file
  228. if fix_types is None or "blank-at-eof" in fix_types:
  229. # Remove trailing empty lines
  230. while len(lines) > 1 and lines[-1] == b"" and lines[-2] == b"":
  231. lines.pop()
  232. return b"\n".join(lines)