diff.py 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621
  1. # diff.py -- Diff functionality for Dulwich
  2. # Copyright (C) 2025 Dulwich contributors
  3. #
  4. # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
  5. # General Public License as published by the Free Software Foundation; version 2.0
  6. # or (at your option) any later version. You can redistribute it and/or
  7. # modify it under the terms of either of these two licenses.
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. #
  15. # You should have received a copy of the licenses; if not, see
  16. # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  17. # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  18. # License, Version 2.0.
  19. #
  20. """Diff functionality with separate codepaths.
  21. This module provides three main functions for different diff scenarios:
  22. 1. diff_index_to_tree: Shows staged changes (index vs commit)
  23. Used by: git diff --staged, git diff --cached
  24. 2. diff_working_tree_to_tree: Shows all changes from a commit to working tree
  25. Used by: git diff <commit>
  26. 3. diff_working_tree_to_index: Shows unstaged changes (working tree vs index)
  27. Used by: git diff (with no arguments)
  28. Example usage:
  29. from dulwich.repo import Repo
  30. from dulwich.diff import diff_index_to_tree
  31. import sys
  32. repo = Repo('.')
  33. # Show staged changes
  34. diff_index_to_tree(repo, sys.stdout.buffer)
  35. # Show changes in specific paths only
  36. diff_index_to_tree(repo, sys.stdout.buffer, paths=[b'src/', b'README.md'])
  37. """
  38. import logging
  39. import os
  40. import stat
  41. from typing import BinaryIO, Optional
  42. from .index import ConflictedIndexEntry, commit_index
  43. from .object_store import iter_tree_contents
  44. from .objects import S_ISGITLINK, Blob, Commit
  45. from .patch import write_blob_diff, write_object_diff
  46. from .repo import Repo
  47. logger = logging.getLogger(__name__)
  48. def should_include_path(path: bytes, paths: Optional[list[bytes]]) -> bool:
  49. """Check if a path should be included based on path filters.
  50. Args:
  51. path: The path to check
  52. paths: List of path filters, or None for no filtering
  53. Returns:
  54. True if the path should be included
  55. """
  56. if not paths:
  57. return True
  58. return any(path == p or path.startswith(p + b"/") for p in paths)
  59. def diff_index_to_tree(
  60. repo: Repo,
  61. outstream: BinaryIO,
  62. commit_sha: Optional[bytes] = None,
  63. paths: Optional[list[bytes]] = None,
  64. diff_algorithm: Optional[str] = None,
  65. ) -> None:
  66. """Show staged changes (index vs commit).
  67. Args:
  68. repo: Repository object
  69. outstream: Stream to write diff to
  70. commit_sha: SHA of commit to compare against, or None for HEAD
  71. paths: Optional list of paths to filter (as bytes)
  72. diff_algorithm: Algorithm to use for diffing ("myers" or "patience"), defaults to DEFAULT_DIFF_ALGORITHM if None
  73. """
  74. if commit_sha is None:
  75. try:
  76. commit_sha = repo.refs[b"HEAD"]
  77. old_commit = repo[commit_sha]
  78. assert isinstance(old_commit, Commit)
  79. old_tree = old_commit.tree
  80. except KeyError:
  81. # No HEAD means no commits yet
  82. old_tree = None
  83. else:
  84. old_commit = repo[commit_sha]
  85. assert isinstance(old_commit, Commit)
  86. old_tree = old_commit.tree
  87. # Get tree from index
  88. index = repo.open_index()
  89. new_tree = commit_index(repo.object_store, index)
  90. changes = repo.object_store.tree_changes(old_tree, new_tree, paths=paths)
  91. for (oldpath, newpath), (oldmode, newmode), (oldsha, newsha) in changes:
  92. write_object_diff(
  93. outstream,
  94. repo.object_store,
  95. (oldpath, oldmode, oldsha),
  96. (newpath, newmode, newsha),
  97. diff_algorithm=diff_algorithm,
  98. )
  99. def diff_working_tree_to_tree(
  100. repo: Repo,
  101. outstream: BinaryIO,
  102. commit_sha: bytes,
  103. paths: Optional[list[bytes]] = None,
  104. diff_algorithm: Optional[str] = None,
  105. ) -> None:
  106. """Compare working tree to a specific commit.
  107. Args:
  108. repo: Repository object
  109. outstream: Stream to write diff to
  110. commit_sha: SHA of commit to compare against
  111. paths: Optional list of paths to filter (as bytes)
  112. diff_algorithm: Algorithm to use for diffing ("myers" or "patience"), defaults to DEFAULT_DIFF_ALGORITHM if None
  113. """
  114. commit = repo[commit_sha]
  115. assert isinstance(commit, Commit)
  116. tree = commit.tree
  117. normalizer = repo.get_blob_normalizer()
  118. filter_callback = normalizer.checkin_normalize
  119. # Get index for tracking new files
  120. index = repo.open_index()
  121. index_paths = set(index.paths())
  122. processed_paths = set()
  123. # Process files from the committed tree lazily
  124. for entry in iter_tree_contents(repo.object_store, tree):
  125. path = entry.path
  126. if not should_include_path(path, paths):
  127. continue
  128. processed_paths.add(path)
  129. full_path = os.path.join(repo.path, path.decode("utf-8"))
  130. # Get the old file from tree
  131. old_mode = entry.mode
  132. old_sha = entry.sha
  133. old_blob = repo.object_store[old_sha]
  134. assert isinstance(old_blob, Blob)
  135. try:
  136. # Use lstat to handle symlinks properly
  137. st = os.lstat(full_path)
  138. except FileNotFoundError:
  139. # File was deleted
  140. if old_blob is not None:
  141. write_blob_diff(
  142. outstream, (path, old_mode, old_blob), (None, None, None)
  143. )
  144. except PermissionError:
  145. logger.warning("%s: Permission denied", path.decode())
  146. # Show as deletion if it was in tree
  147. if old_blob is not None:
  148. write_blob_diff(
  149. outstream, (path, old_mode, old_blob), (None, None, None)
  150. )
  151. except OSError as e:
  152. logger.warning("%s: %s", path.decode(), e)
  153. # Show as deletion if it was in tree
  154. if old_blob is not None:
  155. write_blob_diff(
  156. outstream, (path, old_mode, old_blob), (None, None, None)
  157. )
  158. else:
  159. # Handle different file types
  160. if stat.S_ISDIR(st.st_mode):
  161. if old_blob is not None:
  162. # Directory in working tree where file was expected
  163. if stat.S_ISLNK(old_mode):
  164. logger.warning("%s: symlink became a directory", path.decode())
  165. else:
  166. logger.warning("%s: file became a directory", path.decode())
  167. # Show as deletion
  168. write_blob_diff(
  169. outstream, (path, old_mode, old_blob), (None, None, None)
  170. )
  171. # If old_blob is None, it's a new directory - skip it
  172. continue
  173. elif stat.S_ISLNK(st.st_mode):
  174. # Symlink in working tree
  175. target = os.readlink(full_path).encode("utf-8")
  176. new_blob = Blob()
  177. new_blob.data = target
  178. if old_blob is None:
  179. # New symlink
  180. write_blob_diff(
  181. outstream,
  182. (None, None, None),
  183. (path, stat.S_IFLNK | 0o777, new_blob),
  184. )
  185. elif not stat.S_ISLNK(old_mode):
  186. # Type change: file/submodule -> symlink
  187. write_blob_diff(
  188. outstream,
  189. (path, old_mode, old_blob),
  190. (path, stat.S_IFLNK | 0o777, new_blob),
  191. )
  192. elif old_blob is not None and old_blob.data != target:
  193. # Symlink target changed
  194. write_blob_diff(
  195. outstream,
  196. (path, old_mode, old_blob),
  197. (path, old_mode, new_blob),
  198. )
  199. elif stat.S_ISREG(st.st_mode):
  200. # Regular file
  201. with open(full_path, "rb") as f:
  202. new_content = f.read()
  203. # Create a temporary blob for filtering and comparison
  204. new_blob = Blob()
  205. new_blob.data = new_content
  206. # Apply filters if needed (only for regular files, not gitlinks)
  207. if filter_callback is not None and (
  208. old_blob is None or not S_ISGITLINK(old_mode)
  209. ):
  210. new_blob = filter_callback(new_blob, path)
  211. # Determine the git mode for the new file
  212. if st.st_mode & stat.S_IXUSR:
  213. new_git_mode = stat.S_IFREG | 0o755
  214. else:
  215. new_git_mode = stat.S_IFREG | 0o644
  216. if old_blob is None:
  217. # New file
  218. write_blob_diff(
  219. outstream, (None, None, None), (path, new_git_mode, new_blob)
  220. )
  221. elif stat.S_ISLNK(old_mode):
  222. # Symlink -> file
  223. write_blob_diff(
  224. outstream,
  225. (path, old_mode, old_blob),
  226. (path, new_git_mode, new_blob),
  227. )
  228. elif S_ISGITLINK(old_mode):
  229. # Submodule -> file
  230. write_blob_diff(
  231. outstream,
  232. (path, old_mode, old_blob),
  233. (path, new_git_mode, new_blob),
  234. )
  235. else:
  236. # Regular file, check for content or mode changes
  237. old_git_mode = old_mode & (stat.S_IFREG | 0o777)
  238. if (
  239. old_blob is not None and old_blob.data != new_blob.data
  240. ) or old_git_mode != new_git_mode:
  241. write_blob_diff(
  242. outstream,
  243. (path, old_mode, old_blob),
  244. (path, new_git_mode, new_blob),
  245. )
  246. elif stat.S_ISFIFO(st.st_mode):
  247. logger.warning("%s: unsupported file type (fifo)", path.decode())
  248. if old_blob is not None:
  249. write_blob_diff(
  250. outstream, (path, old_mode, old_blob), (None, None, None)
  251. )
  252. elif stat.S_ISSOCK(st.st_mode):
  253. logger.warning("%s: unsupported file type (socket)", path.decode())
  254. if old_blob is not None:
  255. write_blob_diff(
  256. outstream, (path, old_mode, old_blob), (None, None, None)
  257. )
  258. else:
  259. logger.warning("%s: unsupported file type", path.decode())
  260. if old_blob is not None:
  261. write_blob_diff(
  262. outstream, (path, old_mode, old_blob), (None, None, None)
  263. )
  264. # Now process any new files from index that weren't in the tree
  265. for path in sorted(index_paths - processed_paths):
  266. if not should_include_path(path, paths):
  267. continue
  268. full_path = os.path.join(repo.path, path.decode("utf-8"))
  269. try:
  270. # Use lstat to handle symlinks properly
  271. st = os.lstat(full_path)
  272. except FileNotFoundError:
  273. # New file already deleted, skip
  274. continue
  275. except PermissionError:
  276. logger.warning("%s: Permission denied", path.decode())
  277. continue
  278. except OSError as e:
  279. logger.warning("%s: %s", path.decode(), e)
  280. continue
  281. # Handle different file types for new files
  282. if stat.S_ISDIR(st.st_mode):
  283. # New directory - skip it
  284. continue
  285. elif stat.S_ISLNK(st.st_mode):
  286. # New symlink
  287. target = os.readlink(full_path).encode("utf-8")
  288. new_blob = Blob()
  289. new_blob.data = target
  290. write_blob_diff(
  291. outstream,
  292. (None, None, None),
  293. (path, stat.S_IFLNK | 0o777, new_blob),
  294. )
  295. elif stat.S_ISREG(st.st_mode):
  296. # New regular file
  297. with open(full_path, "rb") as f:
  298. new_content = f.read()
  299. new_blob = Blob()
  300. new_blob.data = new_content
  301. # Apply filters if needed
  302. if filter_callback is not None:
  303. new_blob = filter_callback(new_blob, path)
  304. # Determine the git mode for the new file
  305. if st.st_mode & stat.S_IXUSR:
  306. new_git_mode = 0o100755
  307. else:
  308. new_git_mode = 0o100644
  309. write_blob_diff(
  310. outstream, (None, None, None), (path, new_git_mode, new_blob)
  311. )
  312. elif stat.S_ISFIFO(st.st_mode):
  313. logger.warning("%s: unsupported file type (fifo)", path.decode())
  314. elif stat.S_ISSOCK(st.st_mode):
  315. logger.warning("%s: unsupported file type (socket)", path.decode())
  316. else:
  317. logger.warning("%s: unsupported file type", path.decode())
  318. def diff_working_tree_to_index(
  319. repo: Repo,
  320. outstream: BinaryIO,
  321. paths: Optional[list[bytes]] = None,
  322. diff_algorithm: Optional[str] = None,
  323. ) -> None:
  324. """Compare working tree to index.
  325. Args:
  326. repo: Repository object
  327. outstream: Stream to write diff to
  328. paths: Optional list of paths to filter (as bytes)
  329. diff_algorithm: Algorithm to use for diffing ("myers" or "patience"), defaults to DEFAULT_DIFF_ALGORITHM if None
  330. """
  331. index = repo.open_index()
  332. normalizer = repo.get_blob_normalizer()
  333. filter_callback = normalizer.checkin_normalize
  334. # Process each file in the index
  335. for tree_path, entry in index.iteritems():
  336. if not should_include_path(tree_path, paths):
  337. continue
  338. # Handle conflicted entries by using stage 2 ("ours")
  339. if isinstance(entry, ConflictedIndexEntry):
  340. if entry.this is None:
  341. continue # No stage 2 entry, skip
  342. old_mode = entry.this.mode
  343. old_sha = entry.this.sha
  344. else:
  345. # Get file from regular index entry
  346. old_mode = entry.mode
  347. old_sha = entry.sha
  348. old_obj = repo.object_store[old_sha]
  349. # Type check and cast to Blob
  350. if isinstance(old_obj, Blob):
  351. old_blob = old_obj
  352. else:
  353. old_blob = None
  354. full_path = os.path.join(repo.path, tree_path.decode("utf-8"))
  355. try:
  356. # Use lstat to handle symlinks properly
  357. st = os.lstat(full_path)
  358. # Handle different file types
  359. if stat.S_ISDIR(st.st_mode):
  360. # Directory in working tree where file was expected
  361. if stat.S_ISLNK(old_mode):
  362. logger.warning("%s: symlink became a directory", tree_path.decode())
  363. else:
  364. logger.warning("%s: file became a directory", tree_path.decode())
  365. # Show as deletion
  366. write_blob_diff(
  367. outstream, (tree_path, old_mode, old_blob), (None, None, None)
  368. )
  369. elif stat.S_ISLNK(st.st_mode):
  370. # Symlink in working tree
  371. target = os.readlink(full_path).encode("utf-8")
  372. new_blob = Blob()
  373. new_blob.data = target
  374. # Check if type changed or content changed
  375. if not stat.S_ISLNK(old_mode):
  376. # Type change: file/submodule -> symlink
  377. write_blob_diff(
  378. outstream,
  379. (tree_path, old_mode, old_blob),
  380. (tree_path, stat.S_IFLNK | 0o777, new_blob),
  381. )
  382. elif old_blob is not None and old_blob.data != target:
  383. # Symlink target changed
  384. write_blob_diff(
  385. outstream,
  386. (tree_path, old_mode, old_blob),
  387. (tree_path, old_mode, new_blob),
  388. )
  389. elif stat.S_ISREG(st.st_mode):
  390. # Regular file
  391. with open(full_path, "rb") as f:
  392. new_content = f.read()
  393. # Create a temporary blob for filtering and comparison
  394. new_blob = Blob()
  395. new_blob.data = new_content
  396. # Apply filters if needed (only for regular files)
  397. if filter_callback is not None and not S_ISGITLINK(old_mode):
  398. new_blob = filter_callback(new_blob, tree_path)
  399. # Determine the git mode for the new file
  400. if st.st_mode & stat.S_IXUSR:
  401. new_git_mode = stat.S_IFREG | 0o755
  402. else:
  403. new_git_mode = stat.S_IFREG | 0o644
  404. # Check if this was a type change
  405. if stat.S_ISLNK(old_mode):
  406. # Symlink -> file
  407. write_blob_diff(
  408. outstream,
  409. (tree_path, old_mode, old_blob),
  410. (tree_path, new_git_mode, new_blob),
  411. )
  412. elif S_ISGITLINK(old_mode):
  413. # Submodule -> file
  414. write_blob_diff(
  415. outstream,
  416. (tree_path, old_mode, old_blob),
  417. (tree_path, new_git_mode, new_blob),
  418. )
  419. else:
  420. # Regular file, check for content or mode changes
  421. old_git_mode = old_mode & (stat.S_IFREG | 0o777)
  422. if (
  423. old_blob is not None and old_blob.data != new_blob.data
  424. ) or old_git_mode != new_git_mode:
  425. write_blob_diff(
  426. outstream,
  427. (tree_path, old_mode, old_blob),
  428. (tree_path, new_git_mode, new_blob),
  429. )
  430. elif stat.S_ISFIFO(st.st_mode):
  431. logger.warning("%s: unsupported file type (fifo)", tree_path.decode())
  432. write_blob_diff(
  433. outstream, (tree_path, old_mode, old_blob), (None, None, None)
  434. )
  435. elif stat.S_ISSOCK(st.st_mode):
  436. logger.warning("%s: unsupported file type (socket)", tree_path.decode())
  437. write_blob_diff(
  438. outstream, (tree_path, old_mode, old_blob), (None, None, None)
  439. )
  440. else:
  441. logger.warning("%s: unsupported file type", tree_path.decode())
  442. write_blob_diff(
  443. outstream, (tree_path, old_mode, old_blob), (None, None, None)
  444. )
  445. except FileNotFoundError:
  446. # File was deleted - this is normal, not a warning
  447. write_blob_diff(
  448. outstream, (tree_path, old_mode, old_blob), (None, None, None)
  449. )
  450. except PermissionError:
  451. logger.warning("%s: Permission denied", tree_path.decode())
  452. # Show as deletion since we can't read it
  453. write_blob_diff(
  454. outstream, (tree_path, old_mode, old_blob), (None, None, None)
  455. )
  456. except OSError as e:
  457. logger.warning("%s: %s", tree_path.decode(), e)
  458. # Show as deletion since we can't read it
  459. write_blob_diff(
  460. outstream, (tree_path, old_mode, old_blob), (None, None, None)
  461. )
  462. class ColorizedDiffStream:
  463. """Stream wrapper that colorizes diff output line by line using Rich.
  464. This class wraps a binary output stream and applies color formatting
  465. to diff output as it's written. It processes data line by line to
  466. enable streaming colorization without buffering the entire diff.
  467. """
  468. @staticmethod
  469. def is_available() -> bool:
  470. """Check if Rich is available for colorization.
  471. Returns:
  472. bool: True if Rich can be imported, False otherwise
  473. """
  474. try:
  475. import importlib.util
  476. return importlib.util.find_spec("rich.console") is not None
  477. except ImportError:
  478. return False
  479. def __init__(self, output_stream: BinaryIO) -> None:
  480. """Initialize the colorized stream wrapper.
  481. Args:
  482. output_stream: The underlying binary stream to write to
  483. """
  484. self.output_stream = output_stream
  485. import io
  486. from rich.console import Console
  487. # Rich expects a text stream, so we need to wrap our binary stream
  488. self.text_wrapper = io.TextIOWrapper(
  489. output_stream, encoding="utf-8", newline=""
  490. )
  491. self.console = Console(file=self.text_wrapper, force_terminal=True)
  492. self.buffer = b""
  493. def write(self, data: bytes) -> None:
  494. """Write data to the stream, applying colorization.
  495. Args:
  496. data: Bytes to write
  497. """
  498. # Add new data to buffer
  499. self.buffer += data
  500. # Process complete lines
  501. while b"\n" in self.buffer:
  502. line, self.buffer = self.buffer.split(b"\n", 1)
  503. self._colorize_and_write_line(line + b"\n")
  504. def writelines(self, lines: list[bytes]) -> None:
  505. """Write a list of lines to the stream.
  506. Args:
  507. lines: Iterable of bytes to write
  508. """
  509. for line in lines:
  510. self.write(line)
  511. def _colorize_and_write_line(self, line_bytes: bytes) -> None:
  512. """Apply color formatting to a single line and write it.
  513. Args:
  514. line_bytes: The line to colorize and write (as bytes)
  515. """
  516. try:
  517. line = line_bytes.decode("utf-8", errors="replace")
  518. # Colorize based on diff line type
  519. if line.startswith("+") and not line.startswith("+++"):
  520. self.console.print(line, style="green", end="")
  521. elif line.startswith("-") and not line.startswith("---"):
  522. self.console.print(line, style="red", end="")
  523. elif line.startswith("@@"):
  524. self.console.print(line, style="cyan", end="")
  525. elif line.startswith(("+++", "---")):
  526. self.console.print(line, style="bold", end="")
  527. else:
  528. self.console.print(line, end="")
  529. except (UnicodeDecodeError, UnicodeEncodeError):
  530. # Fallback to raw output if we can't decode/encode the text
  531. self.output_stream.write(line_bytes)
  532. def flush(self) -> None:
  533. """Flush any remaining buffered content and the underlying stream."""
  534. # Write any remaining buffer content
  535. if self.buffer:
  536. self._colorize_and_write_line(self.buffer)
  537. self.buffer = b""
  538. # Flush the text wrapper and underlying stream
  539. if hasattr(self.text_wrapper, "flush"):
  540. self.text_wrapper.flush()
  541. if hasattr(self.output_stream, "flush"):
  542. self.output_stream.flush()