# filters.py -- Git filter drivers (clean/smudge) implementation
# Copyright (C) 2024 Jelmer Vernooij
#
# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
# General Public License as published by the Free Software Foundation; version 2.0
# or (at your option) any later version. You can redistribute it and/or
# modify it under the terms of either of these two licenses.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# You should have received a copy of the licenses; if not, see
# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
# License, Version 2.0.
#
"""Implementation of Git filter drivers (clean/smudge filters)."""

import logging
import subprocess
import threading
from collections.abc import Callable
from typing import TYPE_CHECKING, Optional
from typing import Protocol as TypingProtocol

from .attrs import GitAttributes
from .objects import Blob

if TYPE_CHECKING:
    from .config import StackedConfig
    from .protocol import Protocol
    from .repo import BaseRepo


class FilterError(Exception):
    """Exception raised when filter operations fail."""


class FilterDriver(TypingProtocol):
    """Protocol for filter drivers."""

    def clean(self, data: bytes) -> bytes:
        """Apply clean filter (working tree → repository)."""
        ...

    def smudge(self, data: bytes, path: bytes = b"") -> bytes:
        """Apply smudge filter (repository → working tree)."""
        ...

    def cleanup(self) -> None:
        """Clean up any resources held by this filter driver."""
        ...

    def reuse(self, config: "StackedConfig", filter_name: str) -> bool:
        """Check if this filter driver should be reused with the given configuration.

        This method determines whether a cached filter driver instance should continue
        to be used or if it should be recreated. Only filters that are expensive to
        create (like long-running process filters) and whose configuration hasn't
        changed should return True. Lightweight filters should return False to ensure
        they always use the latest configuration.

        Args:
            config: The current configuration stack
            filter_name: The name of the filter in config

        Returns:
            True if the filter should be reused, False if it should be recreated
        """
        ...
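

# Illustrative sketch, not part of dulwich's public API: a minimal in-process
# implementation of the FilterDriver protocol above. The class name and its
# behaviour are invented for demonstration only; real drivers usually shell out
# to external commands (see ProcessFilterDriver) or chain several drivers
# (see CompositeFilterDriver).
class _ExampleNormalizeFilter:
    """Toy driver: normalizes CRLF to LF on checkin, passes data through on checkout."""

    def clean(self, data: bytes) -> bytes:
        # working tree -> repository
        return data.replace(b"\r\n", b"\n")

    def smudge(self, data: bytes, path: bytes = b"") -> bytes:
        # repository -> working tree; a real filter would usually invert clean()
        return data

    def cleanup(self) -> None:
        # Nothing to release for a pure in-process filter.
        pass

    def reuse(self, config: "StackedConfig", filter_name: str) -> bool:
        # Cheap to construct, so always rebuild with the current configuration.
        return False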


class CompositeFilterDriver:
    """Filter driver that chains multiple filters together."""

    def __init__(self, filters: list[FilterDriver]) -> None:
        """Initialize CompositeFilterDriver.

        Args:
            filters: List of filters to apply in order
        """
        self.filters = filters

    def clean(self, data: bytes) -> bytes:
        """Apply all clean filters in order."""
        for filter_driver in self.filters:
            data = filter_driver.clean(data)
        return data

    def smudge(self, data: bytes, path: bytes = b"") -> bytes:
        """Apply all smudge filters in reverse order."""
        # For smudge, apply filters in reverse order
        for filter_driver in reversed(self.filters):
            data = filter_driver.smudge(data, path)
        return data

    def cleanup(self) -> None:
        """Clean up all filter drivers."""
        for filter_driver in self.filters:
            filter_driver.cleanup()

    def reuse(self, config: "StackedConfig", filter_name: str) -> bool:
        """Check if all filters can be reused."""
        # A composite filter can only be reused if all its components can
        return all(f.reuse(config, filter_name) for f in self.filters)
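

# Hedged usage sketch for CompositeFilterDriver: on clean the filters run
# first-to-last, on smudge they run last-to-first, so the last transformation
# applied on checkin is the first one undone on checkout. The function name,
# arguments and path below are placeholders for illustration only.
def _example_composite_round_trip(first: FilterDriver, second: FilterDriver) -> bytes:
    chain = CompositeFilterDriver([first, second])
    stored = chain.clean(b"working tree bytes")  # first.clean, then second.clean
    # second.smudge runs before first.smudge, undoing the chain in reverse
    return chain.smudge(stored, b"path/to/file")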


class ProcessFilterDriver:
    """Filter driver that executes external processes."""

    def __init__(
        self,
        clean_cmd: str | None = None,
        smudge_cmd: str | None = None,
        required: bool = False,
        cwd: str | None = None,
        process_cmd: str | None = None,
    ) -> None:
        """Initialize ProcessFilterDriver.

        Args:
            clean_cmd: Command to run for clean filter
            smudge_cmd: Command to run for smudge filter
            required: Whether the filter is required
            cwd: Working directory for filter execution
            process_cmd: Command to run for process filter (preferred for performance)
        """
        self.clean_cmd = clean_cmd
        self.smudge_cmd = smudge_cmd
        self.required = required
        self.cwd = cwd
        self.process_cmd = process_cmd
        self._process: subprocess.Popen[bytes] | None = None
        self._protocol: Protocol | None = None
        self._capabilities: set[bytes] = set()
        self._process_lock = threading.Lock()

    def _get_or_start_process(self) -> Optional["Protocol"]:
        """Get or start the long-running process filter."""
        if self._process is None and self.process_cmd:
            from .errors import GitProtocolError, HangupException
            from .protocol import Protocol

            try:
                self._process = subprocess.Popen(
                    self.process_cmd,
                    shell=True,
                    stdin=subprocess.PIPE,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                    cwd=self.cwd,
                    text=False,  # Use bytes
                )
                # Check if process started successfully
                if self._process.poll() is not None:
                    # Process already terminated
                    raise OSError(
                        f"Process terminated immediately with code {self._process.returncode}"
                    )

                # Create protocol wrapper
                def write_func(data: bytes) -> int:
                    assert self._process is not None
                    assert self._process.stdin is not None
                    n = self._process.stdin.write(data)
                    self._process.stdin.flush()
                    return n

                def read_func(size: int) -> bytes:
                    assert self._process is not None
                    assert self._process.stdout is not None
                    return self._process.stdout.read(size)

                self._protocol = Protocol(read_func, write_func)
                # Send handshake using pkt-line format
                self._protocol.write_pkt_line(b"git-filter-client")
                self._protocol.write_pkt_line(b"version=2")
                self._protocol.write_pkt_line(None)  # flush packet
                # Read handshake response
                welcome = self._protocol.read_pkt_line()
                version = self._protocol.read_pkt_line()
                flush = self._protocol.read_pkt_line()
                # Verify handshake (be liberal - accept with or without newlines)
                if welcome and welcome.rstrip(b"\n\r") != b"git-filter-server":
                    raise FilterError(f"Invalid welcome message: {welcome!r}")
                if version and version.rstrip(b"\n\r") != b"version=2":
                    raise FilterError(f"Invalid version: {version!r}")
                if flush is not None:
                    raise FilterError("Expected flush packet after handshake")
                # Send capabilities
                self._protocol.write_pkt_line(b"capability=clean")
                self._protocol.write_pkt_line(b"capability=smudge")
                self._protocol.write_pkt_line(None)  # flush packet
                # Read capability response
                capabilities = []
                while True:
                    pkt = self._protocol.read_pkt_line()
                    if pkt is None:  # flush packet
                        break
                    capabilities.append(pkt)
                # Store supported capabilities
                self._capabilities = set()
                for cap in capabilities:
                    cap = cap.rstrip(b"\n\r")  # Be liberal - strip any line endings
                    if cap.startswith(b"capability="):
                        self._capabilities.add(cap[11:])  # Remove "capability=" prefix
            except (
                OSError,
                subprocess.SubprocessError,
                HangupException,
                GitProtocolError,
            ) as e:
                self.cleanup()
                raise FilterError(f"Failed to start process filter: {e}")
        return self._protocol
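
    # For reference, the handshake performed above looks like this on the wire
    # (pkt-line framing, exactly as implemented in _get_or_start_process; the
    # server side is whatever command the process filter configuration points at):
    #
    #   client: "git-filter-client"        server: "git-filter-server"
    #   client: "version=2"                server: "version=2"
    #   client: <flush>                    server: <flush>
    #   client: "capability=clean"         server: "capability=clean" (subset it supports)
    #   client: "capability=smudge"        server: "capability=smudge"
    #   client: <flush>                    server: <flush>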

    def _use_process_filter(self, data: bytes, operation: str, path: str = "") -> bytes:
        """Use the long-running process filter for the operation."""
        with self._process_lock:
            try:
                proc = self._get_or_start_process()
                if proc is None:
                    return data
                operation_bytes = operation.encode()
                if operation_bytes not in self._capabilities:
                    raise FilterError(f"Operation {operation} not supported by filter")
                if not self._protocol:
                    raise FilterError("Protocol not initialized")
                # Send request using pkt-line format
                self._protocol.write_pkt_line(f"command={operation}".encode())
                self._protocol.write_pkt_line(f"pathname={path}".encode())
                self._protocol.write_pkt_line(None)  # flush packet
                # Send data
                # Split data into chunks if needed (max pkt-line payload is 65516 bytes)
                chunk_size = 65516
                for i in range(0, len(data), chunk_size):
                    chunk = data[i : i + chunk_size]
                    self._protocol.write_pkt_line(chunk)
                self._protocol.write_pkt_line(None)  # flush packet to end data
                # Read response (initial headers)
                response_headers = {}
                while True:
                    pkt = self._protocol.read_pkt_line()
                    if pkt is None:  # flush packet ends headers
                        break
                    key, _, value = pkt.decode().rstrip("\n\r").partition("=")
                    response_headers[key] = value
                # Check status
                status = response_headers.get("status", "error")
                if status != "success":
                    raise FilterError(f"Process filter {operation} failed: {status}")
                # Read result data
                result_chunks = []
                while True:
                    pkt = self._protocol.read_pkt_line()
                    if pkt is None:  # flush packet ends data
                        break
                    result_chunks.append(pkt)
                # Read final headers per Git filter protocol
                # Filters send: headers + flush + content + flush + final_headers + flush
                final_headers = {}
                while True:
                    pkt = self._protocol.read_pkt_line()
                    if pkt is None:  # flush packet ends final headers
                        break
                    key, _, value = pkt.decode().rstrip("\n\r").partition("=")
                    final_headers[key] = value
                # Check final status (if provided, it overrides the initial status)
                final_status = final_headers.get("status", status)
                if final_status != "success":
                    raise FilterError(
                        f"Process filter {operation} failed with final status: {final_status}"
                    )
                return b"".join(result_chunks)
            except (OSError, subprocess.SubprocessError, ValueError) as e:
                # Clean up broken process
                self.cleanup()
                raise FilterError(f"Process filter failed: {e}")
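
    # For reference, a single request/response as framed above (again pkt-line;
    # payloads are chunked at 65516 bytes per packet):
    #
    #   client: "command=clean" (or "command=smudge")
    #   client: "pathname=<path>"
    #   client: <flush>
    #   client: <content chunks ...>
    #   client: <flush>
    #   server: "status=success" (header list, ended by flush)
    #   server: <flush>
    #   server: <filtered content chunks ...>
    #   server: <flush>
    #   server: <optional final headers, e.g. a revised status>
    #   server: <flush>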

    def clean(self, data: bytes) -> bytes:
        """Apply clean filter using external process."""
        # Try process filter first (much faster)
        if self.process_cmd:
            try:
                return self._use_process_filter(data, "clean")
            except FilterError as e:
                if self.required:
                    raise
                logging.warning("Process filter failed, falling back: %s", e)
        # Fall back to clean command
        if not self.clean_cmd:
            if self.required:
                raise FilterError("Clean command is required but not configured")
            return data
        try:
            result = subprocess.run(
                self.clean_cmd,
                shell=True,
                input=data,
                capture_output=True,
                check=True,
                cwd=self.cwd,
            )
            return result.stdout
        except subprocess.CalledProcessError as e:
            if self.required:
                raise FilterError(f"Required clean filter failed: {e}")
            # If not required, log warning and return original data on failure
            logging.warning("Optional clean filter failed: %s", e)
            return data

    def smudge(self, data: bytes, path: bytes = b"") -> bytes:
        """Apply smudge filter using external process."""
        path_str = path.decode("utf-8", errors="replace")
        # Try process filter first (much faster)
        if self.process_cmd:
            try:
                return self._use_process_filter(data, "smudge", path_str)
            except FilterError as e:
                if self.required:
                    raise
                logging.warning("Process filter failed, falling back: %s", e)
        # Fall back to smudge command
        if not self.smudge_cmd:
            if self.required:
                raise FilterError("Smudge command is required but not configured")
            return data
        # Substitute %f placeholder with file path
        cmd = self.smudge_cmd.replace("%f", path_str)
        try:
            result = subprocess.run(
                cmd,
                shell=True,
                input=data,
                capture_output=True,
                check=True,
                cwd=self.cwd,
            )
            return result.stdout
        except subprocess.CalledProcessError as e:
            if self.required:
                raise FilterError(f"Required smudge filter failed: {e}")
            # If not required, log warning and return original data on failure
            logging.warning("Optional smudge filter failed: %s", e)
            return data

    def cleanup(self) -> None:
        """Clean up the process filter."""
        if self._process:
            # Close stdin first to signal the process to quit cleanly
            if self._process.stdin and not self._process.stdin.closed:
                try:
                    self._process.stdin.close()
                except BrokenPipeError:
                    pass
            # Try to terminate gracefully first
            if self._process.poll() is None:  # Still running
                try:
                    self._process.terminate()
                    self._process.wait(timeout=2)
                except subprocess.TimeoutExpired:
                    # Force kill if terminate didn't work
                    try:
                        self._process.kill()
                        self._process.wait(timeout=3)
                    except subprocess.TimeoutExpired:
                        # On Windows, sometimes we need to be more aggressive
                        import os

                        if os.name == "nt":
                            try:
                                subprocess.run(
                                    [
                                        "taskkill",
                                        "/F",
                                        "/T",
                                        "/PID",
                                        str(self._process.pid),
                                    ],
                                    capture_output=True,
                                    timeout=5,
                                )
                                self._process.wait(timeout=1)
                            except (
                                subprocess.CalledProcessError,
                                subprocess.TimeoutExpired,
                            ):
                                pass
                        else:
                            try:
                                import signal

                                os.kill(self._process.pid, signal.SIGKILL)  # type: ignore[attr-defined,unused-ignore]
                                self._process.wait(timeout=1)
                            except (ProcessLookupError, subprocess.TimeoutExpired):
                                pass
                except ProcessLookupError:
                    # Process already dead
                    pass
            # Close stdout and stderr to prevent resource leaks
            if self._process.stdout and not self._process.stdout.closed:
                try:
                    self._process.stdout.close()
                except (OSError, ValueError):
                    # OSError: I/O operation on closed file
                    # ValueError: I/O operation on closed file (some platforms)
                    pass
            if self._process.stderr and not self._process.stderr.closed:
                try:
                    self._process.stderr.close()
                except (OSError, ValueError):
                    pass
            self._process = None
            self._protocol = None

    def reuse(self, config: "StackedConfig", filter_name: str) -> bool:
        """Check if this filter driver should be reused with the given configuration."""
        # Only reuse if it's a long-running process filter AND config hasn't changed
        if self.process_cmd is None:
            # Not a long-running filter, don't cache
            return False
        # Check if the filter commands in config match our current commands
        try:
            clean_cmd_raw = config.get(("filter", filter_name), "clean")
        except KeyError:
            clean_cmd = None
        else:
            clean_cmd = (
                clean_cmd_raw.decode("utf-8")
                if isinstance(clean_cmd_raw, bytes)
                else clean_cmd_raw
            )
        if clean_cmd != self.clean_cmd:
            return False
        try:
            smudge_cmd_raw = config.get(("filter", filter_name), "smudge")
        except KeyError:
            smudge_cmd = None
        else:
            smudge_cmd = (
                smudge_cmd_raw.decode("utf-8")
                if isinstance(smudge_cmd_raw, bytes)
                else smudge_cmd_raw
            )
        if smudge_cmd != self.smudge_cmd:
            return False
        try:
            process_cmd_raw = config.get(("filter", filter_name), "process")
        except KeyError:
            process_cmd = None
        else:
            process_cmd = (
                process_cmd_raw.decode("utf-8")
                if isinstance(process_cmd_raw, bytes)
                else process_cmd_raw
            )
        if process_cmd != self.process_cmd:
            return False
        required = config.get_boolean(("filter", filter_name), "required", False)
        if required != self.required:
            return False
        return True

    def __del__(self) -> None:
        """Clean up the process filter on destruction."""
        self.cleanup()
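

# Hedged sketch of driving ProcessFilterDriver directly. The shell commands
# below are hypothetical stand-ins; in normal use the commands come from the
# repository's "filter.<name>.clean" / "filter.<name>.smudge" /
# "filter.<name>.process" configuration via FilterRegistry rather than being
# constructed by hand.
def _example_process_filter() -> bytes:
    driver = ProcessFilterDriver(
        clean_cmd="tr a-z A-Z",  # run on checkin
        smudge_cmd="tr A-Z a-z",  # "%f" in this command would be replaced with the path
        required=False,  # failures fall back to returning the original data
    )
    try:
        stored = driver.clean(b"hello\n")
        return driver.smudge(stored, b"docs/readme.txt")
    finally:
        driver.cleanup()  # terminate any long-running filter process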


class FilterContext:
    """Context for managing stateful filter resources.

    This class manages the runtime state for filters, including:
    - Cached filter driver instances that maintain long-running state
    - Resource lifecycle management

    It works in conjunction with FilterRegistry to provide complete
    filter functionality while maintaining proper separation of concerns.
    """

    def __init__(self, filter_registry: "FilterRegistry") -> None:
        """Initialize FilterContext.

        Args:
            filter_registry: The filter registry to use for driver lookups
        """
        self.filter_registry = filter_registry
        self._active_drivers: dict[str, FilterDriver] = {}

    def get_driver(self, name: str) -> FilterDriver | None:
        """Get a filter driver by name, managing stateful instances.

        This method handles driver instantiation and caching. Only drivers
        that should be reused are cached.

        Args:
            name: The filter name

        Returns:
            FilterDriver instance or None
        """
        driver: FilterDriver | None = None
        # Check if we have a cached instance that should be reused
        if name in self._active_drivers:
            driver = self._active_drivers[name]
            # Check if the cached driver should still be reused
            if self.filter_registry.config and driver.reuse(
                self.filter_registry.config, name
            ):
                return driver
            else:
                # Driver shouldn't be reused, clean it up and remove from cache
                driver.cleanup()
                del self._active_drivers[name]
        # Get driver from registry
        driver = self.filter_registry.get_driver(name)
        if driver is not None and self.filter_registry.config:
            # Only cache drivers that should be reused
            if driver.reuse(self.filter_registry.config, name):
                self._active_drivers[name] = driver
        return driver

    def close(self) -> None:
        """Close all active filter resources."""
        # Clean up active drivers
        for driver in self._active_drivers.values():
            driver.cleanup()
        self._active_drivers.clear()
        # Also close the registry
        self.filter_registry.close()

    def refresh_config(self, config: "StackedConfig") -> None:
        """Refresh the configuration used by the filter registry.

        This should be called when the configuration has changed to ensure
        filters use the latest settings.

        Args:
            config: The new configuration stack
        """
        # Update the registry's config
        self.filter_registry.config = config
        # Re-setup line ending filter with new config
        # This will update the text filter factory to use new autocrlf settings
        self.filter_registry._setup_line_ending_filter()
        # The get_driver method will now handle checking reuse() for cached drivers

    def __del__(self) -> None:
        """Clean up on destruction."""
        try:
            self.close()
        except Exception:
            # Don't raise exceptions in __del__
            pass
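

# Hedged usage sketch for FilterContext: drivers are looked up through the
# context so that expensive long-running process filters stay cached between
# calls, while cheap drivers are rebuilt from the current configuration.
# The function name is hypothetical; "lfs" is just one of the built-in factory
# names registered by FilterRegistry below.
def _example_filter_context(registry: "FilterRegistry") -> None:
    context = FilterContext(registry)
    try:
        driver = context.get_driver("lfs")
        if driver is not None:
            driver.clean(b"example data")
    finally:
        context.close()  # cleans up cached drivers and closes the registry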


class FilterRegistry:
    """Registry for filter drivers."""

    def __init__(
        self,
        config: Optional["StackedConfig"] = None,
        repo: Optional["BaseRepo"] = None,
    ) -> None:
        """Initialize FilterRegistry.

        Args:
            config: Git configuration stack
            repo: Repository instance
        """
        self.config = config
        self.repo = repo
        self._drivers: dict[str, FilterDriver] = {}
        self._factories: dict[str, Callable[[FilterRegistry], FilterDriver]] = {}
        # Register built-in filter factories
        self.register_factory("lfs", self._create_lfs_filter)
        self.register_factory("text", self._create_text_filter)
        # Auto-register line ending filter if autocrlf is enabled
        self._setup_line_ending_filter()

    def register_factory(
        self, name: str, factory: Callable[["FilterRegistry"], FilterDriver]
    ) -> None:
        """Register a filter driver factory."""
        self._factories[name] = factory

    def register_driver(self, name: str, driver: FilterDriver) -> None:
        """Register a filter driver instance."""
        self._drivers[name] = driver

    def get_driver(self, name: str) -> FilterDriver | None:
        """Get a filter driver by name."""
        # Check if we already have an instance
        if name in self._drivers:
            return self._drivers[name]
        # Try to create from config first (respect user configuration)
        if self.config is not None:
            config_driver = self._create_from_config(name)
            if config_driver is not None:
                self._drivers[name] = config_driver
                return config_driver
        # Try to create from factory as fallback
        if name in self._factories:
            factory_driver = self._factories[name](self)
            self._drivers[name] = factory_driver
            return factory_driver
        return None

    def close(self) -> None:
        """Close all filter drivers, ensuring process cleanup."""
        for driver in self._drivers.values():
            driver.cleanup()
        self._drivers.clear()

    def __del__(self) -> None:
        """Clean up filter drivers on destruction."""
        try:
            self.close()
        except Exception:
            # Don't raise exceptions in __del__
            pass

    def _create_from_config(self, name: str) -> FilterDriver | None:
        """Create a filter driver from config."""
        if self.config is None:
            return None
        clean_cmd: str | None = None
        smudge_cmd: str | None = None
        process_cmd: str | None = None
        # Get process command (preferred over clean/smudge for performance)
        try:
            process_cmd_raw = self.config.get(("filter", name), "process")
        except KeyError:
            pass
        else:
            if isinstance(process_cmd_raw, bytes):
                process_cmd = process_cmd_raw.decode("utf-8")
            else:
                process_cmd = process_cmd_raw
        # Get clean command
        try:
            clean_cmd_raw = self.config.get(("filter", name), "clean")
        except KeyError:
            pass
        else:
            if isinstance(clean_cmd_raw, bytes):
                clean_cmd = clean_cmd_raw.decode("utf-8")
            else:
                clean_cmd = clean_cmd_raw
        # Get smudge command
        try:
            smudge_cmd_raw = self.config.get(("filter", name), "smudge")
        except KeyError:
            pass
        else:
            if isinstance(smudge_cmd_raw, bytes):
                smudge_cmd = smudge_cmd_raw.decode("utf-8")
            else:
                smudge_cmd = smudge_cmd_raw
        # Get required flag (defaults to False)
        required = self.config.get_boolean(("filter", name), "required", False)
        if process_cmd or clean_cmd or smudge_cmd:
            # Get repository working directory (only for Repo, not BaseRepo)
            from .repo import Repo

            repo_path = (
                self.repo.path if self.repo and isinstance(self.repo, Repo) else None
            )
            return ProcessFilterDriver(
                clean_cmd, smudge_cmd, required, repo_path, process_cmd
            )
        return None

    def _create_lfs_filter(self, registry: "FilterRegistry") -> FilterDriver:
        """Create LFS filter driver."""
        from .lfs import LFSFilterDriver, LFSStore

        # If we have a Repo (not just BaseRepo), use its LFS store
        from .repo import Repo

        if registry.repo is not None and isinstance(registry.repo, Repo):
            lfs_store = LFSStore.from_repo(registry.repo, create=True)
        else:
            # Fall back to creating a temporary LFS store
            import tempfile

            lfs_dir = tempfile.mkdtemp(prefix="dulwich-lfs-")
            lfs_store = LFSStore.create(lfs_dir)
        config = registry.repo.get_config_stack() if registry.repo else None
        return LFSFilterDriver(lfs_store, config=config)

    def _create_text_filter(self, registry: "FilterRegistry") -> FilterDriver:
        """Create text filter driver for line ending conversion.

        This filter is used when files have the 'text' attribute set explicitly.
        It always normalizes line endings on checkin (CRLF -> LF).
        """
        from .line_ending import LineEndingFilter

        return LineEndingFilter.from_config(self.config, for_text_attr=True)

    def _setup_line_ending_filter(self) -> None:
        """Automatically register line ending filter if configured."""
        if self.config is None:
            return
        # Parse autocrlf as bytes
        try:
            autocrlf_raw = self.config.get("core", "autocrlf")
        except KeyError:
            return
        else:
            autocrlf: bytes = (
                autocrlf_raw.lower()
                if isinstance(autocrlf_raw, bytes)
                else str(autocrlf_raw).lower().encode("ascii")
            )
        # If autocrlf is enabled, register the text filter
        if autocrlf in (b"true", b"input"):
            # Pre-create the text filter so it's available
            self.get_driver("text")
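

# Hedged sketch of how FilterRegistry is typically fed. Given repository
# configuration along the lines of
#
#   [filter "myfilter"]
#       clean = my-clean-command
#       smudge = my-smudge-command
#       required = true
#
# get_driver("myfilter") builds a ProcessFilterDriver from those values
# (_create_from_config above). Programmatic registration is also possible;
# the function, driver and factory names below are invented for illustration.
def _example_registry_setup(config: "StackedConfig") -> None:
    registry = FilterRegistry(config=config)
    # Register a ready-made driver instance under a custom name
    registry.register_driver("normalize", _ExampleNormalizeFilter())
    # Or register a factory that is invoked lazily on first lookup
    registry.register_factory("lazy-normalize", lambda reg: _ExampleNormalizeFilter())
    driver = registry.get_driver("normalize")
    if driver is not None:
        driver.clean(b"data\r\n")
    registry.close()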


def get_filter_for_path(
    path: bytes,
    gitattributes: "GitAttributes",
    filter_registry: FilterRegistry | None = None,
    filter_context: FilterContext | None = None,
) -> FilterDriver | None:
    """Get the appropriate filter driver for a given path.

    Args:
        path: Path to check
        gitattributes: GitAttributes object with parsed patterns
        filter_registry: Registry of filter drivers (deprecated, use filter_context)
        filter_context: Context for managing filter state

    Returns:
        FilterDriver instance or None
    """
    # Use filter_context if provided, otherwise fall back to registry
    if filter_context is not None:
        registry = filter_context.filter_registry
        get_driver = filter_context.get_driver
    elif filter_registry is not None:
        registry = filter_registry
        get_driver = filter_registry.get_driver
    else:
        raise ValueError("Either filter_registry or filter_context must be provided")
    # Get all attributes for this path
    attributes = gitattributes.match_path(path)
    # Collect filters to apply
    filters: list[FilterDriver] = []
    # Check for text attribute first (it should be applied before custom filters)
    text_attr = attributes.get(b"text")
    if text_attr is True:
        # Add text filter for line ending conversion
        text_filter = get_driver("text")
        if text_filter is not None:
            filters.append(text_filter)
    elif text_attr is False:
        # -text means binary, no conversion - but still check for custom filters
        pass
    else:
        # If no explicit text attribute, check if autocrlf is enabled
        # When autocrlf is true/input, files are treated as text by default
        if registry.config is not None:
            try:
                autocrlf_raw = registry.config.get("core", "autocrlf")
            except KeyError:
                pass
            else:
                autocrlf: bytes = (
                    autocrlf_raw.lower()
                    if isinstance(autocrlf_raw, bytes)
                    else str(autocrlf_raw).lower().encode("ascii")
                )
                if autocrlf in (b"true", b"input"):
                    # Add text filter for files without explicit attributes
                    text_filter = get_driver("text")
                    if text_filter is not None:
                        filters.append(text_filter)
    # Check if there's a filter attribute
    filter_name = attributes.get(b"filter")
    if filter_name is not None and not isinstance(filter_name, bool):
        if isinstance(filter_name, bytes):
            filter_name_str = filter_name.decode("utf-8")
            driver = get_driver(filter_name_str)
            # Check if filter is required but missing
            if driver is None and registry.config is not None:
                required = registry.config.get_boolean(
                    ("filter", filter_name_str), "required", False
                )
                if required:
                    raise FilterError(
                        f"Required filter '{filter_name_str}' is not available"
                    )
            if driver is not None:
                filters.append(driver)
    # Return appropriate filter(s)
    if len(filters) == 0:
        return None
    elif len(filters) == 1:
        return filters[0]
    else:
        # Multiple filters - create a composite
        return CompositeFilterDriver(filters)
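

# Hedged usage sketch for get_filter_for_path: the GitAttributes object is
# assumed to come from the repository (parsed .gitattributes patterns), the
# FilterContext wraps a configured FilterRegistry, and the path is a
# placeholder chosen for illustration.
def _example_lookup(
    gitattributes: GitAttributes, context: FilterContext
) -> FilterDriver | None:
    # Returns None, a single driver, or a CompositeFilterDriver when both a
    # text attribute and a custom filter apply to the path.
    return get_filter_for_path(
        b"docs/notes.txt", gitattributes, filter_context=context
    )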


class FilterBlobNormalizer:
    """Blob normalizer that applies clean/smudge filters based on gitattributes.

    This can be used in addition to or instead of line ending normalization.
    """

    def __init__(
        self,
        config_stack: Optional["StackedConfig"],
        gitattributes: GitAttributes,
        filter_registry: FilterRegistry | None = None,
        repo: Optional["BaseRepo"] = None,
        filter_context: FilterContext | None = None,
    ) -> None:
        """Initialize FilterBlobNormalizer.

        Args:
            config_stack: Git configuration stack
            gitattributes: GitAttributes instance
            filter_registry: Optional filter registry to use (deprecated, use filter_context)
            repo: Optional repository instance
            filter_context: Optional filter context to use for managing filter state
        """
        self.config_stack = config_stack
        self.gitattributes = gitattributes
        self._owns_context = False  # Track if we created our own context
        # Support both old and new API
        if filter_context is not None:
            self.filter_context = filter_context
            self.filter_registry = filter_context.filter_registry
            self._owns_context = False  # We're using an external context
        else:
            if filter_registry is not None:
                import warnings

                warnings.warn(
                    "Passing filter_registry to FilterBlobNormalizer is deprecated. "
                    "Pass a FilterContext instead.",
                    DeprecationWarning,
                    stacklevel=2,
                )
                self.filter_registry = filter_registry
            else:
                self.filter_registry = FilterRegistry(config_stack, repo)
            self.filter_context = FilterContext(self.filter_registry)
            self._owns_context = True  # We created our own context

    def checkin_normalize(self, blob: Blob, path: bytes) -> Blob:
        """Apply clean filter during checkin (working tree -> repository)."""
        # Get filter for this path
        filter_driver = get_filter_for_path(
            path, self.gitattributes, filter_context=self.filter_context
        )
        if filter_driver is None:
            return blob
        # Apply clean filter
        filtered_data = filter_driver.clean(blob.data)
        if filtered_data == blob.data:
            return blob
        # Create new blob with filtered data
        new_blob = Blob()
        new_blob.data = filtered_data
        return new_blob

    def checkout_normalize(self, blob: Blob, path: bytes) -> Blob:
        """Apply smudge filter during checkout (repository -> working tree)."""
        # Get filter for this path
        filter_driver = get_filter_for_path(
            path, self.gitattributes, filter_context=self.filter_context
        )
        if filter_driver is None:
            return blob
        # Apply smudge filter
        filtered_data = filter_driver.smudge(blob.data, path)
        if filtered_data == blob.data:
            return blob
        # Create new blob with filtered data
        new_blob = Blob()
        new_blob.data = filtered_data
        return new_blob

    def close(self) -> None:
        """Close all filter drivers, ensuring process cleanup."""
        # Only close the filter context if we created it ourselves
        if self._owns_context:
            self.filter_context.close()

    def __del__(self) -> None:
        """Clean up filter drivers on destruction."""
        try:
            self.close()
        except Exception:
            # Don't raise exceptions in __del__
            pass
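

# Hedged end-to-end sketch: FilterBlobNormalizer ties the pieces together for
# checkin/checkout normalization. The config stack and GitAttributes instance
# are assumed to come from an existing repository; the function name and path
# are placeholders for illustration.
def _example_normalize(
    config_stack: "StackedConfig", gitattributes: GitAttributes
) -> None:
    normalizer = FilterBlobNormalizer(config_stack, gitattributes)
    blob = Blob()
    blob.data = b"line one\r\nline two\r\n"
    # Clean on checkin (working tree -> repository), smudge on checkout
    clean_blob = normalizer.checkin_normalize(blob, b"docs/notes.txt")
    normalizer.checkout_normalize(clean_blob, b"docs/notes.txt")
    normalizer.close()  # releases any filter processes the normalizer created itself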