line_ending.py 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689
  1. # line_ending.py -- Line ending conversion functions
  2. # Copyright (C) 2018-2018 Boris Feld <boris.feld@comet.ml>
  3. #
  4. # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
  5. # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
  6. # General Public License as published by the Free Software Foundation; version 2.0
  7. # or (at your option) any later version. You can redistribute it and/or
  8. # modify it under the terms of either of these two licenses.
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. # You should have received a copy of the licenses; if not, see
  17. # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  18. # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  19. # License, Version 2.0.
  20. #
  21. r"""All line-ending related functions, from conversions to config processing.
  22. Line-ending normalization is a complex beast. Here are some notes and details
  23. about how it seems to work.
  24. The normalization is a two-fold process that happens at two moments:
  25. - When reading a file from the index and to the working directory. For example
  26. when doing a ``git clone`` or ``git checkout`` call. This is called the
  27. smudge filter (repository -> working tree).
  28. - When writing a file to the index from the working directory. For example
  29. when doing a ``git add`` call. This is called the clean filter (working tree
  30. -> repository).
  31. Note that when checking status (getting unstaged changes), whether or not
  32. normalization is done on write depends on whether or not the file in the
  33. working dir has also been normalized on read:
  34. - For autocrlf=true all files are always normalized on both read and write.
  35. - For autocrlf=input files are only normalized on write if they are newly
  36. "added". Since files which are already committed are not normalized on
  37. checkout into the working tree, they are also left alone when staging
  38. modifications into the index.
  39. One thing to know is that Git does line-ending normalization only on text
  40. files. How does Git know that a file is text? We can either mark a file as a
  41. text file, a binary file or ask Git to decide automatically. Git has a
  42. heuristic to detect if a file is a text file or a binary file. It seems based
  43. on the percentage of non-printable characters in files.
  44. The code for this heuristic is here:
  45. https://git.kernel.org/pub/scm/git/git.git/tree/convert.c#n46
  46. Dulwich has an implementation with a slightly different heuristic, the
  47. `dulwich.patch.is_binary` function.
  48. The binary detection heuristic implementation is close to the one in JGit:
  49. https://github.com/eclipse/jgit/blob/f6873ffe522bbc3536969a3a3546bf9a819b92bf/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawText.java#L300
  50. There are multiple variables that impact the normalization.
  51. First, a repository can contain a ``.gitattributes`` file (or more than one...)
  52. that can further customize the operation on some file patterns, for example:
  53. \*.txt text
  54. Force all ``.txt`` files to be treated as text files and to have their lines
  55. endings normalized.
  56. \*.jpg -text
  57. Force all ``.jpg`` files to be treated as binary files and to not have their
  58. lines endings converted.
  59. \*.vcproj text eol=crlf
  60. Force all ``.vcproj`` files to be treated as text files and to have their lines
  61. endings converted into ``CRLF`` in working directory no matter the native EOL of
  62. the platform.
  63. \*.sh text eol=lf
  64. Force all ``.sh`` files to be treated as text files and to have their lines
  65. endings converted into ``LF`` in working directory no matter the native EOL of
  66. the platform.
  67. If the ``eol`` attribute is not defined, Git uses the ``core.eol`` configuration
  68. value described later.
  69. \* text=auto
  70. Force all files to be scanned by the text file heuristic detection and to have
  71. their line endings normalized in case they are detected as text files.
  72. Git also has an obsolete attribute named ``crlf`` that can be translated to the
  73. corresponding text attribute value.
  74. Then there are some configuration options (that can be defined at the
  75. repository or user level):
  76. - core.autocrlf
  77. - core.eol
  78. ``core.autocrlf`` is taken into account for all files that don't have a ``text``
  79. attribute defined in ``.gitattributes``; it takes three possible values:
  80. - ``true``: This forces all files on the working directory to have CRLF
  81. line-endings in the working directory and convert line-endings to LF
  82. when writing to the index. When autocrlf is set to true, eol value is
  83. ignored.
  84. - ``input``: Quite similar to the ``true`` value but only applies the clean
  85. filter, ie line-ending of new files added to the index will get their
  86. line-endings converted to LF.
  87. - ``false`` (default): No normalization is done.
  88. ``core.eol`` is the top-level configuration to define the line-ending to use
  89. when applying the smudge filter. It takes three possible values:
  90. - ``lf``: When normalization is done, force line-endings to be ``LF`` in the
  91. working directory.
  92. - ``crlf``: When normalization is done, force line-endings to be ``CRLF`` in
  93. the working directory.
  94. - ``native`` (default): When normalization is done, force line-endings to be
  95. the platform's native line ending.
  96. One thing to remember is when line-ending normalization is done on a file, Git
  97. always normalizes line endings to ``LF`` when writing to the index.
  98. There are sources that seem to indicate that Git won't do line-ending
  99. normalization when a file contains mixed line-endings. I think this logic
  100. might be in text / binary detection heuristic but couldn't find it yet.
  101. Sources:
  102. - https://git-scm.com/docs/git-config#git-config-coreeol
  103. - https://git-scm.com/docs/git-config#git-config-coreautocrlf
  104. - https://git-scm.com/docs/gitattributes#_checking_out_and_checking_in
  105. - https://adaptivepatchwork.com/2012/03/01/mind-the-end-of-your-line/
  106. """
  # Public API of this module. The get_checkin_* / get_checkout_* names are the
  # deprecated aliases of the get_clean_* / get_smudge_* functions.
  __all__ = [
      "CRLF",
      "LF",
      "BlobNormalizer",
      "LineEndingFilter",
      "TreeBlobNormalizer",
      "check_safecrlf",
      "convert_crlf_to_lf",
      "convert_lf_to_crlf",
      "get_checkin_filter",
      "get_checkin_filter_autocrlf",
      "get_checkout_filter",
      "get_checkout_filter_autocrlf",
      "get_clean_filter",
      "get_clean_filter_autocrlf",
      "get_smudge_filter",
      "get_smudge_filter_autocrlf",
      "normalize_blob",
  ]
  126. import logging
  127. from collections.abc import Callable, Mapping
  128. from typing import TYPE_CHECKING, Any
  129. if TYPE_CHECKING:
  130. from .config import StackedConfig
  131. from .object_store import BaseObjectStore
  132. from . import replace_me
  133. from .attrs import GitAttributes, Pattern
  134. from .filters import FilterBlobNormalizer, FilterContext, FilterDriver, FilterRegistry
  135. from .object_store import iter_tree_contents
  136. from .objects import Blob, ObjectID
  137. from .patch import is_binary
  # Byte sequences for the two line terminators handled by this module.
  CRLF = b"\r\n"
  LF = b"\n"
  # Module-level logger; used to report safecrlf findings in "warn" mode.
  logger = logging.getLogger(__name__)
  class LineEndingFilter(FilterDriver):
      """Filter driver for line ending conversion.

      ``clean`` (checkin) and ``smudge`` (checkout) each run an optional
      conversion callable over the data. When ``binary_detection`` is enabled,
      data that ``is_binary`` reports as binary is returned untouched. Unless
      ``safecrlf`` is ``b"false"``, every conversion result is handed to
      ``check_safecrlf``.
      """

      def __init__(
          self,
          clean_conversion: Callable[[bytes], bytes] | None = None,
          smudge_conversion: Callable[[bytes], bytes] | None = None,
          binary_detection: bool = True,
          safecrlf: bytes = b"false",
      ):
          """Initialize LineEndingFilter.

          Args:
            clean_conversion: Conversion applied by ``clean``; ``None`` means
              data is returned unchanged.
            smudge_conversion: Conversion applied by ``smudge``; ``None`` means
              data is returned unchanged.
            binary_detection: If True, skip data that ``is_binary`` flags.
            safecrlf: Value of core.safecrlf (b"true", b"warn" or b"false").
          """
          self.clean_conversion = clean_conversion
          self.smudge_conversion = smudge_conversion
          self.binary_detection = binary_detection
          self.safecrlf = safecrlf

      @classmethod
      def from_config(
          cls, config: "StackedConfig | None", for_text_attr: bool = False
      ) -> "LineEndingFilter":
          """Create a LineEndingFilter from git configuration.

          Args:
            config: Git configuration stack
            for_text_attr: If True, always normalize on checkin (for text attribute)

          Returns:
            Configured LineEndingFilter instance
          """
          if config is None:
              if for_text_attr:
                  # For text attribute: always normalize on checkin
                  return cls(
                      clean_conversion=convert_crlf_to_lf,
                      smudge_conversion=None,
                      binary_detection=True,
                  )
              # No config: no conversion
              return cls()

          # core.eol (defaults to "native" when unset)
          try:
              core_eol_raw = config.get("core", "eol")
          except KeyError:
              core_eol: str = "native"
          else:
              core_eol = (
                  core_eol_raw.decode("ascii")
                  if isinstance(core_eol_raw, bytes)
                  else str(core_eol_raw)
              )

          # core.autocrlf (defaults to b"false" when unset)
          try:
              autocrlf_raw = config.get("core", "autocrlf")
          except KeyError:
              autocrlf: bytes = b"false"
          else:
              autocrlf = (
                  autocrlf_raw.lower()
                  if isinstance(autocrlf_raw, bytes)
                  else str(autocrlf_raw).lower().encode("ascii")
              )

          # core.safecrlf (defaults to b"false" when unset). The value is
          # lower-cased, like core.autocrlf above, because check_safecrlf
          # compares against the lowercase literals b"true" / b"warn"; without
          # this a value such as "True" would silently disable the check.
          try:
              safecrlf_raw = config.get("core", "safecrlf")
          except KeyError:
              safecrlf = b"false"
          else:
              safecrlf = (
                  safecrlf_raw
                  if isinstance(safecrlf_raw, bytes)
                  else str(safecrlf_raw).encode("utf-8")
              ).lower()

          # The smudge side is the same in both modes.
          smudge_filter = get_smudge_filter(core_eol, autocrlf)
          clean_filter: Callable[[bytes], bytes] | None
          if for_text_attr:
              # For text attribute: always normalize to LF on checkin
              clean_filter = convert_crlf_to_lf
          else:
              # Normal autocrlf behavior
              clean_filter = get_clean_filter(core_eol, autocrlf)

          return cls(
              clean_conversion=clean_filter,
              smudge_conversion=smudge_filter,
              binary_detection=True,
              safecrlf=safecrlf,
          )

      def _convert(
          self,
          conversion: Callable[[bytes], bytes] | None,
          data: bytes,
          path: bytes,
      ) -> bytes:
          """Run ``conversion`` on ``data``; shared by ``clean`` and ``smudge``."""
          if conversion is None:
              return data
          # Skip binary files if detection is enabled
          if self.binary_detection and is_binary(data):
              return data
          converted = conversion(data)
          # Check if conversion is safe
          if self.safecrlf != b"false":
              check_safecrlf(data, converted, self.safecrlf, path)
          return converted

      def clean(self, data: bytes, path: bytes = b"") -> bytes:
          """Apply line ending conversion for checkin (working tree -> repository)."""
          return self._convert(self.clean_conversion, data, path)

      def smudge(self, data: bytes, path: bytes = b"") -> bytes:
          """Apply line ending conversion for checkout (repository -> working tree)."""
          return self._convert(self.smudge_conversion, data, path)

      def cleanup(self) -> None:
          """Clean up any resources held by this filter driver."""
          # LineEndingFilter doesn't hold any resources that need cleanup

      def reuse(self, config: "StackedConfig", filter_name: str) -> bool:
          """Check if this filter driver should be reused with the given configuration."""
          # LineEndingFilter is lightweight and should always be recreated
          # to ensure it uses the latest configuration
          return False
  def convert_crlf_to_lf(text_hunk: bytes) -> bytes:
      """Replace every CRLF sequence in a text hunk with a single LF.

      Args:
        text_hunk: Bytes of the text hunk to convert
      Returns: A bytes object in which each CRLF has become LF
      """
      # Cutting at each CRLF and stitching the pieces back with LF is the same
      # as a direct replacement.
      pieces = text_hunk.split(CRLF)
      return LF.join(pieces)
  def convert_lf_to_crlf(text_hunk: bytes) -> bytes:
      """Rewrite every LF line terminator in a text hunk as CRLF.

      A line that already ends in CRLF keeps exactly one CR, so the conversion
      does not turn it into CRCRLF.

      Args:
        text_hunk: Bytes of the text hunk to convert
      Returns: A bytes object in which each LF terminator has become CRLF
      """
      # split() always yields at least one element; the last one is whatever
      # follows the final LF and is therefore never LF-terminated.
      *terminated, remainder = text_hunk.split(LF)
      # Drop one trailing CR from each LF-terminated line before re-joining.
      rebuilt = [
          line[:-1] if line.endswith(b"\r") else line for line in terminated
      ]
      rebuilt.append(remainder)
      return CRLF.join(rebuilt)
  def check_safecrlf(
      original: bytes, converted: bytes, safecrlf: bytes, path: bytes = b""
  ) -> None:
      """Verify that a line-ending conversion can be undone, per core.safecrlf.

      Args:
        original: Content before conversion
        converted: Content after conversion
        safecrlf: Value of core.safecrlf config (b"true", b"warn", or b"false")
        path: Path of the file being checked (only used in messages)

      Raises:
        ValueError: If safecrlf is b"true" and converting back does not
          reproduce ``original``
      """
      # Anything other than "true"/"warn" (including "false") disables the check.
      if safecrlf not in (b"true", b"warn"):
          return

      if CRLF in original and CRLF not in converted:
          # CRLF -> LF happened; it is safe if going back restores the input.
          if convert_lf_to_crlf(converted) == original:
              return
          msg = f"CRLF would be replaced by LF in {path.decode('utf-8', 'replace')}"
      elif LF in original and CRLF in converted and CRLF not in original:
          # LF -> CRLF happened; it is safe if going back restores the input.
          if convert_crlf_to_lf(converted) == original:
              return
          msg = f"LF would be replaced by CRLF in {path.decode('utf-8', 'replace')}"
      else:
          # Neither conversion direction was detected; nothing to report.
          return

      if safecrlf == b"true":
          raise ValueError(msg)
      # safecrlf == b"warn"
      logger.warning(msg)
  def get_smudge_filter(
      core_eol: str, core_autocrlf: bytes
  ) -> Callable[[bytes], bytes] | None:
      """Pick the smudge (checkout) conversion for the given configuration.

      ``core_eol`` is accepted but not consulted here; the choice is delegated
      to ``get_smudge_filter_autocrlf``.
      """
      # Git attributes handling is done by the filter infrastructure
      smudge = get_smudge_filter_autocrlf(core_autocrlf)
      return smudge
  def get_clean_filter(
      core_eol: str, core_autocrlf: bytes
  ) -> Callable[[bytes], bytes] | None:
      """Pick the clean (checkin) conversion for the given configuration.

      ``core_eol`` is accepted but not consulted here; the choice is delegated
      to ``get_clean_filter_autocrlf``.
      """
      # Git attributes handling is done by the filter infrastructure
      clean = get_clean_filter_autocrlf(core_autocrlf)
      return clean
  331. def get_smudge_filter_autocrlf(
  332. core_autocrlf: bytes,
  333. ) -> Callable[[bytes], bytes] | None:
  334. """Returns the correct smudge filter base on autocrlf value.
  335. Args:
  336. core_autocrlf: The bytes configuration value of core.autocrlf.
  337. Valid values are: b'true', b'false' or b'input'.
  338. Returns: Either None if no filter has to be applied or a function
  339. accepting a single argument, a binary text hunk
  340. """
  341. if core_autocrlf == b"true":
  342. return convert_lf_to_crlf
  343. return None
  344. def get_clean_filter_autocrlf(
  345. core_autocrlf: bytes,
  346. ) -> Callable[[bytes], bytes] | None:
  347. """Returns the correct clean filter base on autocrlf value.
  348. Args:
  349. core_autocrlf: The bytes configuration value of core.autocrlf.
  350. Valid values are: b'true', b'false' or b'input'.
  351. Returns: Either None if no filter has to be applied or a function
  352. accepting a single argument, a binary text hunk
  353. """
  354. if core_autocrlf == b"true" or core_autocrlf == b"input":
  355. return convert_crlf_to_lf
  356. # Checking filter should never be `convert_lf_to_crlf`
  357. return None
  358. # Backwards compatibility wrappers
  @replace_me(since="0.23.1", remove_in="0.25.0")
  def get_checkout_filter(
      core_eol: str, core_autocrlf: bool | str, git_attributes: Mapping[str, Any]
  ) -> Callable[[bytes], bytes] | None:
      """Deprecated: Use get_smudge_filter instead."""
      # Coerce the legacy bool/str forms of core_autocrlf into bytes;
      # ``git_attributes`` is accepted but not used.
      if isinstance(core_autocrlf, bool):
          autocrlf_bytes = b"true" if core_autocrlf else b"false"
      elif isinstance(core_autocrlf, str):
          autocrlf_bytes = core_autocrlf.encode("utf-8")
      else:
          # Any other value is passed through as-is.
          autocrlf_bytes = core_autocrlf
      return get_smudge_filter(core_eol, autocrlf_bytes)
  @replace_me(since="0.23.1", remove_in="0.25.0")
  def get_checkin_filter(
      core_eol: str, core_autocrlf: bool | str, git_attributes: Mapping[str, Any]
  ) -> Callable[[bytes], bytes] | None:
      """Deprecated: Use get_clean_filter instead."""
      # Coerce the legacy bool/str forms of core_autocrlf into bytes;
      # ``git_attributes`` is accepted but not used.
      if isinstance(core_autocrlf, bool):
          autocrlf_bytes = b"true" if core_autocrlf else b"false"
      elif isinstance(core_autocrlf, str):
          autocrlf_bytes = core_autocrlf.encode("utf-8")
      else:
          # Any other value is passed through as-is.
          autocrlf_bytes = core_autocrlf
      return get_clean_filter(core_eol, autocrlf_bytes)
  @replace_me(since="0.23.1", remove_in="0.25.0")
  def get_checkout_filter_autocrlf(
      core_autocrlf: bytes,
  ) -> Callable[[bytes], bytes] | None:
      """Deprecated: Use get_smudge_filter_autocrlf instead."""
      # Pure alias kept for backwards compatibility.
      smudge = get_smudge_filter_autocrlf(core_autocrlf)
      return smudge
  @replace_me(since="0.23.1", remove_in="0.25.0")
  def get_checkin_filter_autocrlf(
      core_autocrlf: bytes,
  ) -> Callable[[bytes], bytes] | None:
      """Deprecated: Use get_clean_filter_autocrlf instead."""
      # Pure alias kept for backwards compatibility.
      clean = get_clean_filter_autocrlf(core_autocrlf)
      return clean
  class BlobNormalizer(FilterBlobNormalizer):
      """An object to store computation result of which filter to apply based on configuration, gitattributes, path and operation (checkin or checkout).

      This class maintains backward compatibility while using the filter infrastructure.
      """

      def __init__(
          self,
          config_stack: "StackedConfig",
          gitattributes: Mapping[str, Any],
          core_eol: str = "native",
          autocrlf: bytes = b"false",
          safecrlf: bytes = b"false",
      ) -> None:
          """Initialize BlobNormalizer.

          Args:
            config_stack: Git configuration stack, passed to the filter registry
              and to the parent class
            gitattributes: Mapping of pattern (str or bytes) to attributes
            core_eol: Value of core.eol
            autocrlf: Value of core.autocrlf
            safecrlf: Value of core.safecrlf
          """
          # Set up a filter registry with line ending filters
          filter_registry = FilterRegistry(config_stack)

          # Conversions implied by autocrlf alone; either may be None.
          smudge_filter = get_smudge_filter(core_eol, autocrlf)
          clean_filter = get_clean_filter(core_eol, autocrlf)

          # Always register a text filter that can be used by gitattributes
          # Even if autocrlf is false, gitattributes text=true should work
          line_ending_filter = LineEndingFilter(
              clean_conversion=clean_filter or convert_crlf_to_lf,
              smudge_conversion=smudge_filter or convert_lf_to_crlf,
              binary_detection=True,
              safecrlf=safecrlf,
          )
          filter_registry.register_driver("text", line_ending_filter)

          # Convert dict gitattributes to GitAttributes object for parent class
          git_attrs_patterns = [
              (
                  Pattern(
                      pattern.encode("utf-8") if isinstance(pattern, str) else pattern
                  ),
                  attrs,
              )
              for pattern, attrs in gitattributes.items()
          ]
          git_attributes = GitAttributes(git_attrs_patterns)

          # Create FilterContext for parent class
          filter_context = FilterContext(filter_registry)

          # Initialize parent class with gitattributes
          # The filter infrastructure will handle gitattributes processing
          super().__init__(config_stack, git_attributes, filter_context=filter_context)

          # Store original filters for backward compatibility
          self.fallback_read_filter = smudge_filter
          self.fallback_write_filter = clean_filter
          # Remember the caller-supplied safecrlf so the autocrlf fallback path
          # honours it, just like the registered "text" driver above does.
          self._default_safecrlf = safecrlf

      def _text_conversion_disabled(self, tree_path: bytes) -> bool:
          """Return True if gitattributes set ``text`` to False for ``tree_path``."""
          attrs = self.gitattributes.match_path(tree_path)
          return b"text" in attrs and attrs[b"text"] is False

      def _resolve_safecrlf(self) -> bytes:
          """Return the safecrlf value to use for the autocrlf fallback filter."""
          # Start from the constructor argument instead of a hard-coded b"false".
          safecrlf = getattr(self, "_default_safecrlf", b"false")
          # If a registry exposing a config stack is present, its value takes
          # precedence; the constructor value is only the default.
          if hasattr(self, "filter_registry") and hasattr(
              self.filter_registry, "config_stack"
          ):
              safecrlf = self.filter_registry.config_stack.get(
                  b"core", b"safecrlf", safecrlf
              )
          # A str value is converted to bytes.
          if hasattr(safecrlf, "encode"):
              safecrlf = safecrlf.encode("utf-8")
          return safecrlf

      def _apply_fallback(
          self,
          blob: Blob,
          tree_path: bytes,
          conversion: Callable[[bytes], bytes],
          checkin: bool,
      ) -> Blob:
          """Apply an autocrlf fallback conversion with binary detection.

          Returns ``blob`` itself when the data is unchanged, otherwise a new
          Blob holding the converted data.
          """
          line_ending_filter = LineEndingFilter(
              clean_conversion=conversion if checkin else None,
              smudge_conversion=None if checkin else conversion,
              binary_detection=True,
              safecrlf=self._resolve_safecrlf(),
          )
          if checkin:
              filtered_data = line_ending_filter.clean(blob.data, tree_path)
          else:
              filtered_data = line_ending_filter.smudge(blob.data, tree_path)
          if filtered_data == blob.data:
              return blob
          new_blob = Blob()
          new_blob.data = filtered_data
          return new_blob

      def checkin_normalize(self, blob: Blob, tree_path: bytes) -> Blob:
          """Normalize a blob during a checkin operation."""
          # First try to get filter from gitattributes (handled by parent)
          result = super().checkin_normalize(blob, tree_path)

          # Check if gitattributes explicitly disabled text conversion
          if self._text_conversion_disabled(tree_path):
              # Explicitly marked as binary, no conversion
              # NOTE(review): this returns the original blob, so anything the
              # parent call produced for this path is discarded - confirm that
              # is intended.
              return blob

          # If no filter was applied via gitattributes and we have a fallback filter
          # (autocrlf is enabled), apply it to all files
          if result is blob and self.fallback_write_filter is not None:
              return self._apply_fallback(
                  blob, tree_path, self.fallback_write_filter, checkin=True
              )
          return result

      def checkout_normalize(self, blob: Blob, tree_path: bytes) -> Blob:
          """Normalize a blob during a checkout operation."""
          # First try to get filter from gitattributes (handled by parent)
          result = super().checkout_normalize(blob, tree_path)

          # Check if gitattributes explicitly disabled text conversion
          if self._text_conversion_disabled(tree_path):
              # Explicitly marked as binary, no conversion
              # NOTE(review): this returns the original blob, so anything the
              # parent call produced for this path is discarded - confirm that
              # is intended.
              return blob

          # If no filter was applied via gitattributes and we have a fallback filter
          # (autocrlf is enabled), apply it to all files
          if result is blob and self.fallback_read_filter is not None:
              return self._apply_fallback(
                  blob, tree_path, self.fallback_read_filter, checkin=False
              )
          return result
  def normalize_blob(
      blob: Blob, conversion: Callable[[bytes], bytes], binary_detection: bool
  ) -> Blob:
      """Return a new blob holding ``conversion`` applied to ``blob``'s data.

      When ``binary_detection`` is exactly True and ``is_binary`` flags the
      data, the original ``blob`` is returned and no conversion happens.
      """
      raw = blob.data

      # Binary content is left alone when detection is requested.
      if binary_detection is True and is_binary(raw):
          return blob

      normalized = Blob()
      normalized.data = conversion(raw)
      return normalized
  class TreeBlobNormalizer(BlobNormalizer):
      """Blob normalizer that remembers which paths already exist in a tree."""

      def __init__(
          self,
          config_stack: "StackedConfig",
          git_attributes: Mapping[str, Any],
          object_store: "BaseObjectStore",
          tree: ObjectID | None = None,
          core_eol: str = "native",
          autocrlf: bytes = b"false",
          safecrlf: bytes = b"false",
      ) -> None:
          """Initialize TreeBlobNormalizer.

          When ``tree`` is given, the paths yielded by ``iter_tree_contents``
          for it are collected into ``existing_paths``; otherwise that set is
          empty.
          """
          super().__init__(config_stack, git_attributes, core_eol, autocrlf, safecrlf)
          self.existing_paths = (
              {path for path, _, _ in iter_tree_contents(object_store, tree)}
              if tree
              else set()
          )

      def checkin_normalize(self, blob: Blob, tree_path: bytes) -> Blob:
          """Normalize blob for checkin, taking the existing tree into account."""
          # A path is left untouched only when it already exists in the tree
          # and neither a read nor a write fallback filter is configured. New
          # files, and any file while a fallback filter is set, are normalized.
          no_fallback_filters = (
              self.fallback_read_filter is None
              and self.fallback_write_filter is None
          )
          if no_fallback_filters and tree_path in self.existing_paths:
              return blob
          return super().checkin_normalize(blob, tree_path)