diff.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606
  1. # diff.py -- Diff functionality for Dulwich
  2. # Copyright (C) 2025 Dulwich contributors
  3. #
  4. # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
  5. # General Public License as published by the Free Software Foundation; version 2.0
  6. # or (at your option) any later version. You can redistribute it and/or
  7. # modify it under the terms of either of these two licenses.
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. #
  15. # You should have received a copy of the licenses; if not, see
  16. # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  17. # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  18. # License, Version 2.0.
  19. #
  20. """Diff functionality with separate codepaths.
  21. This module provides three main functions for different diff scenarios:
  22. 1. diff_index_to_tree: Shows staged changes (index vs commit)
  23. Used by: git diff --staged, git diff --cached
  24. 2. diff_working_tree_to_tree: Shows all changes from a commit to working tree
  25. Used by: git diff <commit>
  26. 3. diff_working_tree_to_index: Shows unstaged changes (working tree vs index)
  27. Used by: git diff (with no arguments)
  28. Example usage:
  29. from dulwich.repo import Repo
  30. from dulwich.diff import diff_index_to_tree
  31. import sys
  32. repo = Repo('.')
  33. # Show staged changes
  34. diff_index_to_tree(repo, sys.stdout.buffer)
  35. # Show changes in specific paths only
  36. diff_index_to_tree(repo, sys.stdout.buffer, paths=[b'src/', b'README.md'])
  37. """
  38. import logging
  39. import os
  40. import stat
  41. from typing import BinaryIO, Optional, cast
  42. from .index import ConflictedIndexEntry, commit_index
  43. from .object_store import iter_tree_contents
  44. from .objects import S_ISGITLINK, Blob
  45. from .patch import write_blob_diff, write_object_diff
  46. from .repo import Repo
  47. logger = logging.getLogger(__name__)
  48. def should_include_path(path: bytes, paths: Optional[list[bytes]]) -> bool:
  49. """Check if a path should be included based on path filters.
  50. Args:
  51. path: The path to check
  52. paths: List of path filters, or None for no filtering
  53. Returns:
  54. True if the path should be included
  55. """
  56. if not paths:
  57. return True
  58. return any(path == p or path.startswith(p + b"/") for p in paths)
  59. def diff_index_to_tree(
  60. repo: Repo,
  61. outstream: BinaryIO,
  62. commit_sha: Optional[bytes] = None,
  63. paths: Optional[list[bytes]] = None,
  64. ) -> None:
  65. """Show staged changes (index vs commit).
  66. Args:
  67. repo: Repository object
  68. outstream: Stream to write diff to
  69. commit_sha: SHA of commit to compare against, or None for HEAD
  70. paths: Optional list of paths to filter (as bytes)
  71. """
  72. if commit_sha is None:
  73. try:
  74. commit_sha = repo.refs[b"HEAD"]
  75. old_tree = repo[commit_sha].tree
  76. except KeyError:
  77. # No HEAD means no commits yet
  78. old_tree = None
  79. else:
  80. old_tree = repo[commit_sha].tree
  81. # Get tree from index
  82. index = repo.open_index()
  83. new_tree = commit_index(repo.object_store, index)
  84. changes = repo.object_store.tree_changes(old_tree, new_tree, paths=paths)
  85. for (oldpath, newpath), (oldmode, newmode), (oldsha, newsha) in changes:
  86. write_object_diff(
  87. outstream,
  88. repo.object_store,
  89. (oldpath, oldmode, oldsha),
  90. (newpath, newmode, newsha),
  91. )
  92. def diff_working_tree_to_tree(
  93. repo: Repo,
  94. outstream: BinaryIO,
  95. commit_sha: bytes,
  96. paths: Optional[list[bytes]] = None,
  97. ) -> None:
  98. """Compare working tree to a specific commit.
  99. Args:
  100. repo: Repository object
  101. outstream: Stream to write diff to
  102. commit_sha: SHA of commit to compare against
  103. paths: Optional list of paths to filter (as bytes)
  104. """
  105. tree = repo[commit_sha].tree
  106. normalizer = repo.get_blob_normalizer()
  107. filter_callback = normalizer.checkin_normalize
  108. # Get index for tracking new files
  109. index = repo.open_index()
  110. index_paths = set(index.paths())
  111. processed_paths = set()
  112. # Process files from the committed tree lazily
  113. for entry in iter_tree_contents(repo.object_store, tree):
  114. path = entry.path
  115. if not should_include_path(path, paths):
  116. continue
  117. processed_paths.add(path)
  118. full_path = os.path.join(repo.path, path.decode("utf-8"))
  119. # Get the old file from tree
  120. old_mode = entry.mode
  121. old_sha = entry.sha
  122. old_blob = repo.object_store[old_sha]
  123. assert isinstance(old_blob, Blob)
  124. try:
  125. # Use lstat to handle symlinks properly
  126. st = os.lstat(full_path)
  127. except FileNotFoundError:
  128. # File was deleted
  129. if old_blob is not None:
  130. write_blob_diff(
  131. outstream, (path, old_mode, old_blob), (None, None, None)
  132. )
  133. except PermissionError:
  134. logger.warning("%s: Permission denied", path.decode())
  135. # Show as deletion if it was in tree
  136. if old_blob is not None:
  137. write_blob_diff(
  138. outstream, (path, old_mode, old_blob), (None, None, None)
  139. )
  140. except OSError as e:
  141. logger.warning("%s: %s", path.decode(), e)
  142. # Show as deletion if it was in tree
  143. if old_blob is not None:
  144. write_blob_diff(
  145. outstream, (path, old_mode, old_blob), (None, None, None)
  146. )
  147. else:
  148. # Handle different file types
  149. if stat.S_ISDIR(st.st_mode):
  150. if old_blob is not None:
  151. # Directory in working tree where file was expected
  152. if stat.S_ISLNK(old_mode):
  153. logger.warning("%s: symlink became a directory", path.decode())
  154. else:
  155. logger.warning("%s: file became a directory", path.decode())
  156. # Show as deletion
  157. write_blob_diff(
  158. outstream, (path, old_mode, old_blob), (None, None, None)
  159. )
  160. # If old_blob is None, it's a new directory - skip it
  161. continue
  162. elif stat.S_ISLNK(st.st_mode):
  163. # Symlink in working tree
  164. target = os.readlink(full_path).encode("utf-8")
  165. new_blob = Blob()
  166. new_blob.data = target
  167. if old_blob is None:
  168. # New symlink
  169. write_blob_diff(
  170. outstream,
  171. (None, None, None),
  172. (path, stat.S_IFLNK | 0o777, new_blob),
  173. )
  174. elif not stat.S_ISLNK(old_mode):
  175. # Type change: file/submodule -> symlink
  176. write_blob_diff(
  177. outstream,
  178. (path, old_mode, old_blob),
  179. (path, stat.S_IFLNK | 0o777, new_blob),
  180. )
  181. elif old_blob is not None and old_blob.data != target:
  182. # Symlink target changed
  183. write_blob_diff(
  184. outstream,
  185. (path, old_mode, old_blob),
  186. (path, old_mode, new_blob),
  187. )
  188. elif stat.S_ISREG(st.st_mode):
  189. # Regular file
  190. with open(full_path, "rb") as f:
  191. new_content = f.read()
  192. # Create a temporary blob for filtering and comparison
  193. new_blob = Blob()
  194. new_blob.data = new_content
  195. # Apply filters if needed (only for regular files, not gitlinks)
  196. if filter_callback is not None and (
  197. old_blob is None or not S_ISGITLINK(old_mode)
  198. ):
  199. new_blob = filter_callback(new_blob, path)
  200. # Determine the git mode for the new file
  201. if st.st_mode & stat.S_IXUSR:
  202. new_git_mode = stat.S_IFREG | 0o755
  203. else:
  204. new_git_mode = stat.S_IFREG | 0o644
  205. if old_blob is None:
  206. # New file
  207. write_blob_diff(
  208. outstream, (None, None, None), (path, new_git_mode, new_blob)
  209. )
  210. elif stat.S_ISLNK(old_mode):
  211. # Symlink -> file
  212. write_blob_diff(
  213. outstream,
  214. (path, old_mode, old_blob),
  215. (path, new_git_mode, new_blob),
  216. )
  217. elif S_ISGITLINK(old_mode):
  218. # Submodule -> file
  219. write_blob_diff(
  220. outstream,
  221. (path, old_mode, old_blob),
  222. (path, new_git_mode, new_blob),
  223. )
  224. else:
  225. # Regular file, check for content or mode changes
  226. old_git_mode = old_mode & (stat.S_IFREG | 0o777)
  227. if (
  228. old_blob is not None and old_blob.data != new_blob.data
  229. ) or old_git_mode != new_git_mode:
  230. write_blob_diff(
  231. outstream,
  232. (path, old_mode, old_blob),
  233. (path, new_git_mode, new_blob),
  234. )
  235. elif stat.S_ISFIFO(st.st_mode):
  236. logger.warning("%s: unsupported file type (fifo)", path.decode())
  237. if old_blob is not None:
  238. write_blob_diff(
  239. outstream, (path, old_mode, old_blob), (None, None, None)
  240. )
  241. elif stat.S_ISSOCK(st.st_mode):
  242. logger.warning("%s: unsupported file type (socket)", path.decode())
  243. if old_blob is not None:
  244. write_blob_diff(
  245. outstream, (path, old_mode, old_blob), (None, None, None)
  246. )
  247. else:
  248. logger.warning("%s: unsupported file type", path.decode())
  249. if old_blob is not None:
  250. write_blob_diff(
  251. outstream, (path, old_mode, old_blob), (None, None, None)
  252. )
  253. # Now process any new files from index that weren't in the tree
  254. for path in sorted(index_paths - processed_paths):
  255. if not should_include_path(path, paths):
  256. continue
  257. full_path = os.path.join(repo.path, path.decode("utf-8"))
  258. try:
  259. # Use lstat to handle symlinks properly
  260. st = os.lstat(full_path)
  261. except FileNotFoundError:
  262. # New file already deleted, skip
  263. continue
  264. except PermissionError:
  265. logger.warning("%s: Permission denied", path.decode())
  266. continue
  267. except OSError as e:
  268. logger.warning("%s: %s", path.decode(), e)
  269. continue
  270. # Handle different file types for new files
  271. if stat.S_ISDIR(st.st_mode):
  272. # New directory - skip it
  273. continue
  274. elif stat.S_ISLNK(st.st_mode):
  275. # New symlink
  276. target = os.readlink(full_path).encode("utf-8")
  277. new_blob = Blob()
  278. new_blob.data = target
  279. write_blob_diff(
  280. outstream,
  281. (None, None, None),
  282. (path, stat.S_IFLNK | 0o777, new_blob),
  283. )
  284. elif stat.S_ISREG(st.st_mode):
  285. # New regular file
  286. with open(full_path, "rb") as f:
  287. new_content = f.read()
  288. new_blob = Blob()
  289. new_blob.data = new_content
  290. # Apply filters if needed
  291. if filter_callback is not None:
  292. new_blob = filter_callback(new_blob, path)
  293. # Determine the git mode for the new file
  294. if st.st_mode & stat.S_IXUSR:
  295. new_git_mode = 0o100755
  296. else:
  297. new_git_mode = 0o100644
  298. write_blob_diff(
  299. outstream, (None, None, None), (path, new_git_mode, new_blob)
  300. )
  301. elif stat.S_ISFIFO(st.st_mode):
  302. logger.warning("%s: unsupported file type (fifo)", path.decode())
  303. elif stat.S_ISSOCK(st.st_mode):
  304. logger.warning("%s: unsupported file type (socket)", path.decode())
  305. else:
  306. logger.warning("%s: unsupported file type", path.decode())
  307. def diff_working_tree_to_index(
  308. repo: Repo, outstream: BinaryIO, paths: Optional[list[bytes]] = None
  309. ) -> None:
  310. """Compare working tree to index.
  311. Args:
  312. repo: Repository object
  313. outstream: Stream to write diff to
  314. paths: Optional list of paths to filter (as bytes)
  315. """
  316. index = repo.open_index()
  317. normalizer = repo.get_blob_normalizer()
  318. filter_callback = normalizer.checkin_normalize
  319. # Process each file in the index
  320. for tree_path, entry in index.iteritems():
  321. if not should_include_path(tree_path, paths):
  322. continue
  323. # Handle conflicted entries by using stage 2 ("ours")
  324. if isinstance(entry, ConflictedIndexEntry):
  325. if entry.this is None:
  326. continue # No stage 2 entry, skip
  327. old_mode = entry.this.mode
  328. old_sha = entry.this.sha
  329. else:
  330. # Get file from regular index entry
  331. old_mode = entry.mode
  332. old_sha = entry.sha
  333. old_obj = repo.object_store[old_sha]
  334. # Type check and cast to Blob
  335. if isinstance(old_obj, Blob):
  336. old_blob = cast(Blob, old_obj)
  337. else:
  338. old_blob = None
  339. full_path = os.path.join(repo.path, tree_path.decode("utf-8"))
  340. try:
  341. # Use lstat to handle symlinks properly
  342. st = os.lstat(full_path)
  343. # Handle different file types
  344. if stat.S_ISDIR(st.st_mode):
  345. # Directory in working tree where file was expected
  346. if stat.S_ISLNK(old_mode):
  347. logger.warning("%s: symlink became a directory", tree_path.decode())
  348. else:
  349. logger.warning("%s: file became a directory", tree_path.decode())
  350. # Show as deletion
  351. write_blob_diff(
  352. outstream, (tree_path, old_mode, old_blob), (None, None, None)
  353. )
  354. elif stat.S_ISLNK(st.st_mode):
  355. # Symlink in working tree
  356. target = os.readlink(full_path).encode("utf-8")
  357. new_blob = Blob()
  358. new_blob.data = target
  359. # Check if type changed or content changed
  360. if not stat.S_ISLNK(old_mode):
  361. # Type change: file/submodule -> symlink
  362. write_blob_diff(
  363. outstream,
  364. (tree_path, old_mode, old_blob),
  365. (tree_path, stat.S_IFLNK | 0o777, new_blob),
  366. )
  367. elif old_blob is not None and old_blob.data != target:
  368. # Symlink target changed
  369. write_blob_diff(
  370. outstream,
  371. (tree_path, old_mode, old_blob),
  372. (tree_path, old_mode, new_blob),
  373. )
  374. elif stat.S_ISREG(st.st_mode):
  375. # Regular file
  376. with open(full_path, "rb") as f:
  377. new_content = f.read()
  378. # Create a temporary blob for filtering and comparison
  379. new_blob = Blob()
  380. new_blob.data = new_content
  381. # Apply filters if needed (only for regular files)
  382. if filter_callback is not None and not S_ISGITLINK(old_mode):
  383. new_blob = filter_callback(new_blob, tree_path)
  384. # Determine the git mode for the new file
  385. if st.st_mode & stat.S_IXUSR:
  386. new_git_mode = stat.S_IFREG | 0o755
  387. else:
  388. new_git_mode = stat.S_IFREG | 0o644
  389. # Check if this was a type change
  390. if stat.S_ISLNK(old_mode):
  391. # Symlink -> file
  392. write_blob_diff(
  393. outstream,
  394. (tree_path, old_mode, old_blob),
  395. (tree_path, new_git_mode, new_blob),
  396. )
  397. elif S_ISGITLINK(old_mode):
  398. # Submodule -> file
  399. write_blob_diff(
  400. outstream,
  401. (tree_path, old_mode, old_blob),
  402. (tree_path, new_git_mode, new_blob),
  403. )
  404. else:
  405. # Regular file, check for content or mode changes
  406. old_git_mode = old_mode & (stat.S_IFREG | 0o777)
  407. if (
  408. old_blob is not None and old_blob.data != new_blob.data
  409. ) or old_git_mode != new_git_mode:
  410. write_blob_diff(
  411. outstream,
  412. (tree_path, old_mode, old_blob),
  413. (tree_path, new_git_mode, new_blob),
  414. )
  415. elif stat.S_ISFIFO(st.st_mode):
  416. logger.warning("%s: unsupported file type (fifo)", tree_path.decode())
  417. write_blob_diff(
  418. outstream, (tree_path, old_mode, old_blob), (None, None, None)
  419. )
  420. elif stat.S_ISSOCK(st.st_mode):
  421. logger.warning("%s: unsupported file type (socket)", tree_path.decode())
  422. write_blob_diff(
  423. outstream, (tree_path, old_mode, old_blob), (None, None, None)
  424. )
  425. else:
  426. logger.warning("%s: unsupported file type", tree_path.decode())
  427. write_blob_diff(
  428. outstream, (tree_path, old_mode, old_blob), (None, None, None)
  429. )
  430. except FileNotFoundError:
  431. # File was deleted - this is normal, not a warning
  432. write_blob_diff(
  433. outstream, (tree_path, old_mode, old_blob), (None, None, None)
  434. )
  435. except PermissionError:
  436. logger.warning("%s: Permission denied", tree_path.decode())
  437. # Show as deletion since we can't read it
  438. write_blob_diff(
  439. outstream, (tree_path, old_mode, old_blob), (None, None, None)
  440. )
  441. except OSError as e:
  442. logger.warning("%s: %s", tree_path.decode(), e)
  443. # Show as deletion since we can't read it
  444. write_blob_diff(
  445. outstream, (tree_path, old_mode, old_blob), (None, None, None)
  446. )
  447. class ColorizedDiffStream:
  448. """Stream wrapper that colorizes diff output line by line using Rich.
  449. This class wraps a binary output stream and applies color formatting
  450. to diff output as it's written. It processes data line by line to
  451. enable streaming colorization without buffering the entire diff.
  452. """
  453. @staticmethod
  454. def is_available() -> bool:
  455. """Check if Rich is available for colorization.
  456. Returns:
  457. bool: True if Rich can be imported, False otherwise
  458. """
  459. try:
  460. import importlib.util
  461. return importlib.util.find_spec("rich.console") is not None
  462. except ImportError:
  463. return False
  464. def __init__(self, output_stream: BinaryIO) -> None:
  465. """Initialize the colorized stream wrapper.
  466. Args:
  467. output_stream: The underlying binary stream to write to
  468. """
  469. self.output_stream = output_stream
  470. import io
  471. from rich.console import Console
  472. # Rich expects a text stream, so we need to wrap our binary stream
  473. self.text_wrapper = io.TextIOWrapper(
  474. output_stream, encoding="utf-8", newline=""
  475. )
  476. self.console = Console(file=self.text_wrapper, force_terminal=True)
  477. self.buffer = b""
  478. def write(self, data: bytes) -> None:
  479. """Write data to the stream, applying colorization.
  480. Args:
  481. data: Bytes to write
  482. """
  483. # Add new data to buffer
  484. self.buffer += data
  485. # Process complete lines
  486. while b"\n" in self.buffer:
  487. line, self.buffer = self.buffer.split(b"\n", 1)
  488. self._colorize_and_write_line(line + b"\n")
  489. def writelines(self, lines: list[bytes]) -> None:
  490. """Write a list of lines to the stream.
  491. Args:
  492. lines: Iterable of bytes to write
  493. """
  494. for line in lines:
  495. self.write(line)
  496. def _colorize_and_write_line(self, line_bytes: bytes) -> None:
  497. """Apply color formatting to a single line and write it.
  498. Args:
  499. line_bytes: The line to colorize and write (as bytes)
  500. """
  501. try:
  502. line = line_bytes.decode("utf-8", errors="replace")
  503. # Colorize based on diff line type
  504. if line.startswith("+") and not line.startswith("+++"):
  505. self.console.print(line, style="green", end="")
  506. elif line.startswith("-") and not line.startswith("---"):
  507. self.console.print(line, style="red", end="")
  508. elif line.startswith("@@"):
  509. self.console.print(line, style="cyan", end="")
  510. elif line.startswith(("+++", "---")):
  511. self.console.print(line, style="bold", end="")
  512. else:
  513. self.console.print(line, end="")
  514. except (UnicodeDecodeError, UnicodeEncodeError):
  515. # Fallback to raw output if we can't decode/encode the text
  516. self.output_stream.write(line_bytes)
  517. def flush(self) -> None:
  518. """Flush any remaining buffered content and the underlying stream."""
  519. # Write any remaining buffer content
  520. if self.buffer:
  521. self._colorize_and_write_line(self.buffer)
  522. self.buffer = b""
  523. # Flush the text wrapper and underlying stream
  524. if hasattr(self.text_wrapper, "flush"):
  525. self.text_wrapper.flush()
  526. if hasattr(self.output_stream, "flush"):
  527. self.output_stream.flush()