# filters.py -- Git filter drivers (clean/smudge) implementation
# Copyright (C) 2024 Jelmer Vernooij
#
# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
# General Public License as published by the Free Software Foundation; version 2.0
# or (at your option) any later version. You can redistribute it and/or
# modify it under the terms of either of these two licenses.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# You should have received a copy of the licenses; if not, see
# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
# License, Version 2.0.
#

"""Implementation of Git filter drivers (clean/smudge filters)."""

import logging
import subprocess
import threading
from typing import TYPE_CHECKING, Callable, Optional
from typing import Protocol as TypingProtocol

from .attrs import GitAttributes
from .objects import Blob

if TYPE_CHECKING:
    from .config import StackedConfig
    from .protocol import Protocol
    from .repo import BaseRepo


class FilterError(Exception):
    """Exception raised when filter operations fail."""


class FilterDriver(TypingProtocol):
    """Protocol for filter drivers."""

    def clean(self, data: bytes) -> bytes:
        """Apply clean filter (working tree → repository)."""
        ...

    def smudge(self, data: bytes, path: bytes = b"") -> bytes:
        """Apply smudge filter (repository → working tree)."""
        ...

    def cleanup(self) -> None:
        """Clean up any resources held by this filter driver."""
        ...

    def reuse(self, config: "StackedConfig", filter_name: str) -> bool:
        """Check if this filter driver should be reused with the given configuration.

        This method determines whether a cached filter driver instance should continue
        to be used or if it should be recreated. Only filters that are expensive to
        create (like long-running process filters) and whose configuration hasn't
        changed should return True. Lightweight filters should return False to ensure
        they always use the latest configuration.

        Args:
            config: The current configuration stack
            filter_name: The name of the filter in config

        Returns:
            True if the filter should be reused, False if it should be recreated
        """
        ...
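

# For reference, a driver satisfying the FilterDriver protocol can be very small.
# The sketch below is illustrative only and is not part of this module: a
# stateless filter that upper-cases content on checkin and, because it is cheap
# to build, declines caching in reuse().
#
#     class UpperCaseFilter:
#         def clean(self, data: bytes) -> bytes:
#             return data.upper()
#
#         def smudge(self, data: bytes, path: bytes = b"") -> bytes:
#             return data.lower()
#
#         def cleanup(self) -> None:
#             pass
#
#         def reuse(self, config: "StackedConfig", filter_name: str) -> bool:
#             return False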


class CompositeFilterDriver:
    """Filter driver that chains multiple filters together."""

    def __init__(self, filters: list[FilterDriver]) -> None:
        """Initialize CompositeFilterDriver.

        Args:
            filters: List of filters to apply in order
        """
        self.filters = filters

    def clean(self, data: bytes) -> bytes:
        """Apply all clean filters in order."""
        for filter_driver in self.filters:
            data = filter_driver.clean(data)
        return data

    def smudge(self, data: bytes, path: bytes = b"") -> bytes:
        """Apply all smudge filters in reverse order."""
        # For smudge, apply filters in reverse order
        for filter_driver in reversed(self.filters):
            data = filter_driver.smudge(data, path)
        return data

    def cleanup(self) -> None:
        """Clean up all filter drivers."""
        for filter_driver in self.filters:
            filter_driver.cleanup()

    def reuse(self, config: "StackedConfig", filter_name: str) -> bool:
        """Check if all filters can be reused."""
        # A composite filter can only be reused if all its components can
        return all(f.reuse(config, filter_name) for f in self.filters)
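

# Ordering note: given CompositeFilterDriver([text_filter, custom_filter]) (the
# names are illustrative), clean() applies text_filter and then custom_filter,
# while smudge() applies custom_filter and then text_filter, so checkout undoes
# the checkin conversions in reverse order.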


class ProcessFilterDriver:
    """Filter driver that executes external processes."""

    def __init__(
        self,
        clean_cmd: Optional[str] = None,
        smudge_cmd: Optional[str] = None,
        required: bool = False,
        cwd: Optional[str] = None,
        process_cmd: Optional[str] = None,
    ) -> None:
        """Initialize ProcessFilterDriver.

        Args:
            clean_cmd: Command to run for clean filter
            smudge_cmd: Command to run for smudge filter
            required: Whether the filter is required
            cwd: Working directory for filter execution
            process_cmd: Command to run for process filter (preferred for performance)
        """
        self.clean_cmd = clean_cmd
        self.smudge_cmd = smudge_cmd
        self.required = required
        self.cwd = cwd
        self.process_cmd = process_cmd
        self._process: Optional[subprocess.Popen[bytes]] = None
        self._protocol: Optional[Protocol] = None
        self._capabilities: set[bytes] = set()
        self._process_lock = threading.Lock()

    def _get_or_start_process(self) -> Optional["Protocol"]:
        """Get or start the long-running process filter."""
        if self._process is None and self.process_cmd:
            from .errors import GitProtocolError, HangupException
            from .protocol import Protocol

            try:
                self._process = subprocess.Popen(
                    self.process_cmd,
                    shell=True,
                    stdin=subprocess.PIPE,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                    cwd=self.cwd,
                    text=False,  # Use bytes
                )

                # Check if process started successfully
                if self._process.poll() is not None:
                    # Process already terminated
                    raise OSError(
                        f"Process terminated immediately with code {self._process.returncode}"
                    )

                # Create protocol wrapper
                def write_func(data: bytes) -> int:
                    assert self._process is not None
                    assert self._process.stdin is not None
                    n = self._process.stdin.write(data)
                    self._process.stdin.flush()
                    return n

                def read_func(size: int) -> bytes:
                    assert self._process is not None
                    assert self._process.stdout is not None
                    return self._process.stdout.read(size)

                self._protocol = Protocol(read_func, write_func)

                # Send handshake using pkt-line format
                self._protocol.write_pkt_line(b"git-filter-client")
                self._protocol.write_pkt_line(b"version=2")
                self._protocol.write_pkt_line(None)  # flush packet

                # Read handshake response
                welcome = self._protocol.read_pkt_line()
                version = self._protocol.read_pkt_line()
                flush = self._protocol.read_pkt_line()

                # Verify handshake (be liberal - accept with or without newlines)
                if welcome and welcome.rstrip(b"\n\r") != b"git-filter-server":
                    raise FilterError(f"Invalid welcome message: {welcome!r}")
                if version and version.rstrip(b"\n\r") != b"version=2":
                    raise FilterError(f"Invalid version: {version!r}")
                if flush is not None:
                    raise FilterError("Expected flush packet after handshake")

                # Send capabilities
                self._protocol.write_pkt_line(b"capability=clean")
                self._protocol.write_pkt_line(b"capability=smudge")
                self._protocol.write_pkt_line(None)  # flush packet

                # Read capability response
                capabilities = []
                while True:
                    pkt = self._protocol.read_pkt_line()
                    if pkt is None:  # flush packet
                        break
                    capabilities.append(pkt)

                # Store supported capabilities
                self._capabilities = set()
                for cap in capabilities:
                    cap = cap.rstrip(b"\n\r")  # Be liberal - strip any line endings
                    if cap.startswith(b"capability="):
                        self._capabilities.add(cap[11:])  # Remove "capability=" prefix
            except (
                OSError,
                subprocess.SubprocessError,
                HangupException,
                GitProtocolError,
            ) as e:
                self.cleanup()
                raise FilterError(f"Failed to start process filter: {e}")
        return self._protocol
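
    # For orientation, the handshake above exchanges pkt-lines roughly as follows;
    # the server lines show what a filter supporting both operations would send
    # (illustrative, not a literal transcript):
    #
    #     client: git-filter-client, version=2, <flush>
    #     server: git-filter-server, version=2, <flush>
    #     client: capability=clean, capability=smudge, <flush>
    #     server: capability=clean, capability=smudge, <flush>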

    def _use_process_filter(self, data: bytes, operation: str, path: str = "") -> bytes:
        """Use the long-running process filter for the operation."""
        with self._process_lock:
            try:
                proc = self._get_or_start_process()
                if proc is None:
                    return data

                operation_bytes = operation.encode()
                if operation_bytes not in self._capabilities:
                    raise FilterError(f"Operation {operation} not supported by filter")

                if not self._protocol:
                    raise FilterError("Protocol not initialized")

                # Send request using pkt-line format
                self._protocol.write_pkt_line(f"command={operation}".encode())
                self._protocol.write_pkt_line(f"pathname={path}".encode())
                self._protocol.write_pkt_line(None)  # flush packet

                # Send data
                # Split data into chunks if needed (max pkt-line payload is 65516 bytes)
                chunk_size = 65516
                for i in range(0, len(data), chunk_size):
                    chunk = data[i : i + chunk_size]
                    self._protocol.write_pkt_line(chunk)
                self._protocol.write_pkt_line(None)  # flush packet to end data

                # Read response (initial headers)
                response_headers = {}
                while True:
                    pkt = self._protocol.read_pkt_line()
                    if pkt is None:  # flush packet ends headers
                        break
                    key, _, value = pkt.decode().rstrip("\n\r").partition("=")
                    response_headers[key] = value

                # Check status
                status = response_headers.get("status", "error")
                if status != "success":
                    raise FilterError(f"Process filter {operation} failed: {status}")

                # Read result data
                result_chunks = []
                while True:
                    pkt = self._protocol.read_pkt_line()
                    if pkt is None:  # flush packet ends data
                        break
                    result_chunks.append(pkt)

                # Read final headers per Git filter protocol
                # Filters send: headers + flush + content + flush + final_headers + flush
                final_headers = {}
                while True:
                    pkt = self._protocol.read_pkt_line()
                    if pkt is None:  # flush packet ends final headers
                        break
                    key, _, value = pkt.decode().rstrip("\n\r").partition("=")
                    final_headers[key] = value

                # Check final status (if provided, it overrides the initial status)
                final_status = final_headers.get("status", status)
                if final_status != "success":
                    raise FilterError(
                        f"Process filter {operation} failed with final status: {final_status}"
                    )

                return b"".join(result_chunks)
            except (OSError, subprocess.SubprocessError, ValueError) as e:
                # Clean up broken process
                self.cleanup()
                raise FilterError(f"Process filter failed: {e}")
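
    # A single request over this protocol is framed roughly like this
    # (illustrative; content pkt-lines elided):
    #
    #     client: command=smudge, pathname=docs/readme.txt, <flush>
    #     client: <content pkt-lines>, <flush>
    #     server: status=success, <flush>
    #     server: <content pkt-lines>, <flush>
    #     server: <final headers, possibly empty>, <flush>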

    def clean(self, data: bytes) -> bytes:
        """Apply clean filter using external process."""
        # Try process filter first (much faster)
        if self.process_cmd:
            try:
                return self._use_process_filter(data, "clean")
            except FilterError as e:
                if self.required:
                    raise
                logging.warning(f"Process filter failed, falling back: {e}")

        # Fall back to clean command
        if not self.clean_cmd:
            if self.required:
                raise FilterError("Clean command is required but not configured")
            return data

        try:
            result = subprocess.run(
                self.clean_cmd,
                shell=True,
                input=data,
                capture_output=True,
                check=True,
                cwd=self.cwd,
            )
            return result.stdout
        except subprocess.CalledProcessError as e:
            if self.required:
                raise FilterError(f"Required clean filter failed: {e}")
            # If not required, log warning and return original data on failure
            logging.warning(f"Optional clean filter failed: {e}")
            return data

    def smudge(self, data: bytes, path: bytes = b"") -> bytes:
        """Apply smudge filter using external process."""
        path_str = path.decode("utf-8", errors="replace")

        # Try process filter first (much faster)
        if self.process_cmd:
            try:
                return self._use_process_filter(data, "smudge", path_str)
            except FilterError as e:
                if self.required:
                    raise
                logging.warning(f"Process filter failed, falling back: {e}")

        # Fall back to smudge command
        if not self.smudge_cmd:
            if self.required:
                raise FilterError("Smudge command is required but not configured")
            return data

        # Substitute %f placeholder with file path
        cmd = self.smudge_cmd.replace("%f", path_str)

        try:
            result = subprocess.run(
                cmd,
                shell=True,
                input=data,
                capture_output=True,
                check=True,
                cwd=self.cwd,
            )
            return result.stdout
        except subprocess.CalledProcessError as e:
            if self.required:
                raise FilterError(f"Required smudge filter failed: {e}")
            # If not required, log warning and return original data on failure
            logging.warning(f"Optional smudge filter failed: {e}")
            return data
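
    # Example of the %f substitution above (illustrative command and path): with
    # smudge_cmd = "decrypt --file %f" and path b"secrets/key.txt", the command
    # run through the shell becomes "decrypt --file secrets/key.txt".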

    def cleanup(self) -> None:
        """Clean up the process filter."""
        if self._process:
            # Close stdin first to signal the process to quit cleanly
            if self._process.stdin and not self._process.stdin.closed:
                try:
                    self._process.stdin.close()
                except BrokenPipeError:
                    pass

            # Try to terminate gracefully first
            if self._process.poll() is None:  # Still running
                try:
                    self._process.terminate()
                    self._process.wait(timeout=2)
                except subprocess.TimeoutExpired:
                    # Force kill if terminate didn't work
                    try:
                        self._process.kill()
                        self._process.wait(timeout=3)
                    except subprocess.TimeoutExpired:
                        # On Windows, sometimes we need to be more aggressive
                        import os

                        if os.name == "nt":
                            try:
                                subprocess.run(
                                    [
                                        "taskkill",
                                        "/F",
                                        "/T",
                                        "/PID",
                                        str(self._process.pid),
                                    ],
                                    capture_output=True,
                                    timeout=5,
                                )
                                self._process.wait(timeout=1)
                            except (
                                subprocess.CalledProcessError,
                                subprocess.TimeoutExpired,
                            ):
                                pass
                        else:
                            try:
                                import signal

                                os.kill(self._process.pid, signal.SIGKILL)  # type: ignore[attr-defined,unused-ignore]
                                self._process.wait(timeout=1)
                            except (ProcessLookupError, subprocess.TimeoutExpired):
                                pass
                except ProcessLookupError:
                    # Process already dead
                    pass

            self._process = None
            self._protocol = None

    def reuse(self, config: "StackedConfig", filter_name: str) -> bool:
        """Check if this filter driver should be reused with the given configuration."""
        # Only reuse if it's a long-running process filter AND config hasn't changed
        if self.process_cmd is None:
            # Not a long-running filter, don't cache
            return False

        # Check if the filter commands in config match our current commands
        try:
            clean_cmd_raw = config.get(("filter", filter_name), "clean")
        except KeyError:
            clean_cmd = None
        else:
            clean_cmd = (
                clean_cmd_raw.decode("utf-8")
                if isinstance(clean_cmd_raw, bytes)
                else clean_cmd_raw
            )
        if clean_cmd != self.clean_cmd:
            return False

        try:
            smudge_cmd_raw = config.get(("filter", filter_name), "smudge")
        except KeyError:
            smudge_cmd = None
        else:
            smudge_cmd = (
                smudge_cmd_raw.decode("utf-8")
                if isinstance(smudge_cmd_raw, bytes)
                else smudge_cmd_raw
            )
        if smudge_cmd != self.smudge_cmd:
            return False

        try:
            process_cmd_raw = config.get(("filter", filter_name), "process")
        except KeyError:
            process_cmd = None
        else:
            process_cmd = (
                process_cmd_raw.decode("utf-8")
                if isinstance(process_cmd_raw, bytes)
                else process_cmd_raw
            )
        if process_cmd != self.process_cmd:
            return False

        required = config.get_boolean(("filter", filter_name), "required", False)
        if required != self.required:
            return False

        return True

    def __del__(self) -> None:
        """Clean up the process filter on destruction."""
        self.cleanup()
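

# A filter defined in Git configuration maps directly onto ProcessFilterDriver.
# The git-lfs entries below are a common real-world example, shown here only for
# illustration:
#
#     [filter "lfs"]
#         clean = git-lfs clean -- %f
#         smudge = git-lfs smudge -- %f
#         process = git-lfs filter-process
#         required = true
#
# With this configuration, FilterRegistry._create_from_config("lfs") builds a
# ProcessFilterDriver that prefers the long-running `process` command and falls
# back to the one-shot clean/smudge commands.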


class FilterContext:
    """Context for managing stateful filter resources.

    This class manages the runtime state for filters, including:
    - Cached filter driver instances that maintain long-running state
    - Resource lifecycle management

    It works in conjunction with FilterRegistry to provide complete
    filter functionality while maintaining proper separation of concerns.
    """

    def __init__(self, filter_registry: "FilterRegistry") -> None:
        """Initialize FilterContext.

        Args:
            filter_registry: The filter registry to use for driver lookups
        """
        self.filter_registry = filter_registry
        self._active_drivers: dict[str, FilterDriver] = {}

    def get_driver(self, name: str) -> Optional[FilterDriver]:
        """Get a filter driver by name, managing stateful instances.

        This method handles driver instantiation and caching. Only drivers
        that should be reused are cached.

        Args:
            name: The filter name

        Returns:
            FilterDriver instance or None
        """
        driver: Optional[FilterDriver] = None

        # Check if we have a cached instance that should be reused
        if name in self._active_drivers:
            driver = self._active_drivers[name]
            # Check if the cached driver should still be reused
            if self.filter_registry.config and driver.reuse(
                self.filter_registry.config, name
            ):
                return driver
            else:
                # Driver shouldn't be reused, clean it up and remove from cache
                driver.cleanup()
                del self._active_drivers[name]

        # Get driver from registry
        driver = self.filter_registry.get_driver(name)
        if driver is not None and self.filter_registry.config:
            # Only cache drivers that should be reused
            if driver.reuse(self.filter_registry.config, name):
                self._active_drivers[name] = driver

        return driver

    def close(self) -> None:
        """Close all active filter resources."""
        # Clean up active drivers
        for driver in self._active_drivers.values():
            driver.cleanup()
        self._active_drivers.clear()
        # Also close the registry
        self.filter_registry.close()

    def refresh_config(self, config: "StackedConfig") -> None:
        """Refresh the configuration used by the filter registry.

        This should be called when the configuration has changed to ensure
        filters use the latest settings.

        Args:
            config: The new configuration stack
        """
        # Update the registry's config
        self.filter_registry.config = config
        # Re-setup line ending filter with new config
        # This will update the text filter factory to use new autocrlf settings
        self.filter_registry._setup_line_ending_filter()
        # The get_driver method will now handle checking reuse() for cached drivers

    def __del__(self) -> None:
        """Clean up on destruction."""
        try:
            self.close()
        except Exception:
            # Don't raise exceptions in __del__
            pass
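

# Typical wiring of FilterContext over a FilterRegistry (illustrative sketch;
# `config` and `repo` come from the caller):
#
#     registry = FilterRegistry(config, repo)
#     context = FilterContext(registry)
#     driver = context.get_driver("lfs")  # cached only if driver.reuse(...) is True
#     ...
#     context.close()  # tears down any long-running filter processes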


class FilterRegistry:
    """Registry for filter drivers."""

    def __init__(
        self,
        config: Optional["StackedConfig"] = None,
        repo: Optional["BaseRepo"] = None,
    ) -> None:
        """Initialize FilterRegistry.

        Args:
            config: Git configuration stack
            repo: Repository instance
        """
        self.config = config
        self.repo = repo
        self._drivers: dict[str, FilterDriver] = {}
        self._factories: dict[str, Callable[[FilterRegistry], FilterDriver]] = {}

        # Register built-in filter factories
        self.register_factory("lfs", self._create_lfs_filter)
        self.register_factory("text", self._create_text_filter)

        # Auto-register line ending filter if autocrlf is enabled
        self._setup_line_ending_filter()

    def register_factory(
        self, name: str, factory: Callable[["FilterRegistry"], FilterDriver]
    ) -> None:
        """Register a filter driver factory."""
        self._factories[name] = factory

    def register_driver(self, name: str, driver: FilterDriver) -> None:
        """Register a filter driver instance."""
        self._drivers[name] = driver

    def get_driver(self, name: str) -> Optional[FilterDriver]:
        """Get a filter driver by name."""
        # Check if we already have an instance
        if name in self._drivers:
            return self._drivers[name]

        # Try to create from config first (respect user configuration)
        if self.config is not None:
            config_driver = self._create_from_config(name)
            if config_driver is not None:
                self._drivers[name] = config_driver
                return config_driver

        # Try to create from factory as fallback
        if name in self._factories:
            factory_driver = self._factories[name](self)
            self._drivers[name] = factory_driver
            return factory_driver

        return None

    def close(self) -> None:
        """Close all filter drivers, ensuring process cleanup."""
        for driver in self._drivers.values():
            driver.cleanup()
        self._drivers.clear()

    def __del__(self) -> None:
        """Clean up filter drivers on destruction."""
        try:
            self.close()
        except Exception:
            # Don't raise exceptions in __del__
            pass

    def _create_from_config(self, name: str) -> Optional[FilterDriver]:
        """Create a filter driver from config."""
        if self.config is None:
            return None

        clean_cmd: Optional[str] = None
        smudge_cmd: Optional[str] = None
        process_cmd: Optional[str] = None

        # Get process command (preferred over clean/smudge for performance)
        try:
            process_cmd_raw = self.config.get(("filter", name), "process")
        except KeyError:
            pass
        else:
            if isinstance(process_cmd_raw, bytes):
                process_cmd = process_cmd_raw.decode("utf-8")
            else:
                process_cmd = process_cmd_raw

        # Get clean command
        try:
            clean_cmd_raw = self.config.get(("filter", name), "clean")
        except KeyError:
            pass
        else:
            if isinstance(clean_cmd_raw, bytes):
                clean_cmd = clean_cmd_raw.decode("utf-8")
            else:
                clean_cmd = clean_cmd_raw

        # Get smudge command
        try:
            smudge_cmd_raw = self.config.get(("filter", name), "smudge")
        except KeyError:
            pass
        else:
            if isinstance(smudge_cmd_raw, bytes):
                smudge_cmd = smudge_cmd_raw.decode("utf-8")
            else:
                smudge_cmd = smudge_cmd_raw

        # Get required flag (defaults to False)
        required = self.config.get_boolean(("filter", name), "required", False)

        if process_cmd or clean_cmd or smudge_cmd:
            # Get repository working directory (only for Repo, not BaseRepo)
            from .repo import Repo

            repo_path = (
                self.repo.path if self.repo and isinstance(self.repo, Repo) else None
            )
            return ProcessFilterDriver(
                clean_cmd, smudge_cmd, required, repo_path, process_cmd
            )

        return None

    def _create_lfs_filter(self, registry: "FilterRegistry") -> FilterDriver:
        """Create LFS filter driver."""
        from .lfs import LFSFilterDriver, LFSStore

        # If we have a Repo (not just BaseRepo), use its LFS store
        from .repo import Repo

        if registry.repo is not None and isinstance(registry.repo, Repo):
            lfs_store = LFSStore.from_repo(registry.repo, create=True)
        else:
            # Fall back to creating a temporary LFS store
            import tempfile

            lfs_dir = tempfile.mkdtemp(prefix="dulwich-lfs-")
            lfs_store = LFSStore.create(lfs_dir)

        config = registry.repo.get_config_stack() if registry.repo else None
        return LFSFilterDriver(lfs_store, config=config)

    def _create_text_filter(self, registry: "FilterRegistry") -> FilterDriver:
        """Create text filter driver for line ending conversion.

        This filter is used when files have the 'text' attribute set explicitly.
        It always normalizes line endings on checkin (CRLF -> LF).
        """
        from .line_ending import LineEndingFilter

        return LineEndingFilter.from_config(self.config, for_text_attr=True)

    def _setup_line_ending_filter(self) -> None:
        """Automatically register line ending filter if configured."""
        if self.config is None:
            return

        # Parse autocrlf as bytes
        try:
            autocrlf_raw = self.config.get("core", "autocrlf")
        except KeyError:
            return
        else:
            autocrlf: bytes = (
                autocrlf_raw.lower()
                if isinstance(autocrlf_raw, bytes)
                else str(autocrlf_raw).lower().encode("ascii")
            )

        # If autocrlf is enabled, register the text filter
        if autocrlf in (b"true", b"input"):
            # Pre-create the text filter so it's available
            self.get_driver("text")
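

# Registering a custom in-process driver bypasses config lookup entirely
# (illustrative sketch; UpperCaseFilter is the hypothetical driver from the note
# near the top of this module):
#
#     registry = FilterRegistry(config)
#     registry.register_driver("shout", UpperCaseFilter())
#     registry.get_driver("shout")  # returns the registered instance directly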


def get_filter_for_path(
    path: bytes,
    gitattributes: "GitAttributes",
    filter_registry: Optional[FilterRegistry] = None,
    filter_context: Optional[FilterContext] = None,
) -> Optional[FilterDriver]:
    """Get the appropriate filter driver for a given path.

    Args:
        path: Path to check
        gitattributes: GitAttributes object with parsed patterns
        filter_registry: Registry of filter drivers (deprecated, use filter_context)
        filter_context: Context for managing filter state

    Returns:
        FilterDriver instance or None
    """
    # Use filter_context if provided, otherwise fall back to registry
    if filter_context is not None:
        registry = filter_context.filter_registry
        get_driver = filter_context.get_driver
    elif filter_registry is not None:
        registry = filter_registry
        get_driver = filter_registry.get_driver
    else:
        raise ValueError("Either filter_registry or filter_context must be provided")

    # Get all attributes for this path
    attributes = gitattributes.match_path(path)

    # Collect filters to apply
    filters: list[FilterDriver] = []

    # Check for text attribute first (it should be applied before custom filters)
    text_attr = attributes.get(b"text")
    if text_attr is True:
        # Add text filter for line ending conversion
        text_filter = get_driver("text")
        if text_filter is not None:
            filters.append(text_filter)
    elif text_attr is False:
        # -text means binary, no conversion - but still check for custom filters
        pass
    else:
        # If no explicit text attribute, check if autocrlf is enabled
        # When autocrlf is true/input, files are treated as text by default
        if registry.config is not None:
            try:
                autocrlf_raw = registry.config.get("core", "autocrlf")
            except KeyError:
                pass
            else:
                autocrlf: bytes = (
                    autocrlf_raw.lower()
                    if isinstance(autocrlf_raw, bytes)
                    else str(autocrlf_raw).lower().encode("ascii")
                )
                if autocrlf in (b"true", b"input"):
                    # Add text filter for files without explicit attributes
                    text_filter = get_driver("text")
                    if text_filter is not None:
                        filters.append(text_filter)

    # Check if there's a filter attribute
    filter_name = attributes.get(b"filter")
    if filter_name is not None and not isinstance(filter_name, bool):
        if isinstance(filter_name, bytes):
            filter_name_str = filter_name.decode("utf-8")
            driver = get_driver(filter_name_str)

            # Check if filter is required but missing
            if driver is None and registry.config is not None:
                required = registry.config.get_boolean(
                    ("filter", filter_name_str), "required", False
                )
                if required:
                    raise FilterError(
                        f"Required filter '{filter_name_str}' is not available"
                    )

            if driver is not None:
                filters.append(driver)

    # Return appropriate filter(s)
    if len(filters) == 0:
        return None
    elif len(filters) == 1:
        return filters[0]
    else:
        # Multiple filters - create a composite
        return CompositeFilterDriver(filters)
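

# The attribute lookup above is driven by .gitattributes entries such as these
# (illustrative):
#
#     *.txt   text
#     *.jpg   -text
#     *.psd   filter=lfs
#
# "text" selects the built-in line-ending filter, "-text" suppresses it, and
# "filter=<name>" selects the driver configured under [filter "<name>"]; when
# both a text conversion and a custom filter apply, the result is a
# CompositeFilterDriver.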


class FilterBlobNormalizer:
    """Blob normalizer that applies clean/smudge filters based on gitattributes.

    This can be used in addition to or instead of line ending normalization.
    """

    def __init__(
        self,
        config_stack: Optional["StackedConfig"],
        gitattributes: GitAttributes,
        filter_registry: Optional[FilterRegistry] = None,
        repo: Optional["BaseRepo"] = None,
        filter_context: Optional[FilterContext] = None,
    ) -> None:
        """Initialize FilterBlobNormalizer.

        Args:
            config_stack: Git configuration stack
            gitattributes: GitAttributes instance
            filter_registry: Optional filter registry to use (deprecated, use filter_context)
            repo: Optional repository instance
            filter_context: Optional filter context to use for managing filter state
        """
        self.config_stack = config_stack
        self.gitattributes = gitattributes
        self._owns_context = False  # Track if we created our own context

        # Support both old and new API
        if filter_context is not None:
            self.filter_context = filter_context
            self.filter_registry = filter_context.filter_registry
            self._owns_context = False  # We're using an external context
        else:
            if filter_registry is not None:
                import warnings

                warnings.warn(
                    "Passing filter_registry to FilterBlobNormalizer is deprecated. "
                    "Pass a FilterContext instead.",
                    DeprecationWarning,
                    stacklevel=2,
                )
                self.filter_registry = filter_registry
            else:
                self.filter_registry = FilterRegistry(config_stack, repo)
            self.filter_context = FilterContext(self.filter_registry)
            self._owns_context = True  # We created our own context

    def checkin_normalize(self, blob: Blob, path: bytes) -> Blob:
        """Apply clean filter during checkin (working tree -> repository)."""
        # Get filter for this path
        filter_driver = get_filter_for_path(
            path, self.gitattributes, filter_context=self.filter_context
        )
        if filter_driver is None:
            return blob

        # Apply clean filter
        filtered_data = filter_driver.clean(blob.data)
        if filtered_data == blob.data:
            return blob

        # Create new blob with filtered data
        new_blob = Blob()
        new_blob.data = filtered_data
        return new_blob

    def checkout_normalize(self, blob: Blob, path: bytes) -> Blob:
        """Apply smudge filter during checkout (repository -> working tree)."""
        # Get filter for this path
        filter_driver = get_filter_for_path(
            path, self.gitattributes, filter_context=self.filter_context
        )
        if filter_driver is None:
            return blob

        # Apply smudge filter
        filtered_data = filter_driver.smudge(blob.data, path)
        if filtered_data == blob.data:
            return blob

        # Create new blob with filtered data
        new_blob = Blob()
        new_blob.data = filtered_data
        return new_blob

    def close(self) -> None:
        """Close all filter drivers, ensuring process cleanup."""
        # Only close the filter context if we created it ourselves
        if self._owns_context:
            self.filter_context.close()

    def __del__(self) -> None:
        """Clean up filter drivers on destruction."""
        try:
            self.close()
        except Exception:
            # Don't raise exceptions in __del__
            pass
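

# End-to-end sketch (illustrative; `repo`, `gitattributes`, and `worktree_blob`
# are assumed to be supplied by the caller):
#
#     normalizer = FilterBlobNormalizer(
#         repo.get_config_stack(), gitattributes, repo=repo
#     )
#     stored_blob = normalizer.checkin_normalize(worktree_blob, b"docs/notes.txt")
#     restored_blob = normalizer.checkout_normalize(stored_blob, b"docs/notes.txt")
#     normalizer.close()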