diff.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506
  1. # diff.py -- Diff functionality for Dulwich
  2. # Copyright (C) 2025 Dulwich contributors
  3. #
  4. # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
  5. # General Public License as published by the Free Software Foundation; version 2.0
  6. # or (at your option) any later version. You can redistribute it and/or
  7. # modify it under the terms of either of these two licenses.
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. #
  15. # You should have received a copy of the licenses; if not, see
  16. # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  17. # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  18. # License, Version 2.0.
  19. #
  20. """Diff functionality with separate codepaths.
  21. This module provides three main functions for different diff scenarios:
  22. 1. diff_index_to_tree: Shows staged changes (index vs commit)
  23. Used by: git diff --staged, git diff --cached
  24. 2. diff_working_tree_to_tree: Shows all changes from a commit to working tree
  25. Used by: git diff <commit>
  26. 3. diff_working_tree_to_index: Shows unstaged changes (working tree vs index)
  27. Used by: git diff (with no arguments)
  28. Example usage:
  29. from dulwich.repo import Repo
  30. from dulwich.diff import diff_index_to_tree
  31. import sys
  32. repo = Repo('.')
  33. # Show staged changes
  34. diff_index_to_tree(repo, sys.stdout.buffer)
  35. # Show changes in specific paths only
  36. diff_index_to_tree(repo, sys.stdout.buffer, paths=[b'src/', b'README.md'])
  37. """
  38. import logging
  39. import os
  40. import stat
  41. from typing import BinaryIO, Optional, cast
  42. from .index import ConflictedIndexEntry, commit_index
  43. from .object_store import iter_tree_contents
  44. from .objects import S_ISGITLINK, Blob
  45. from .patch import write_blob_diff, write_object_diff
  46. from .repo import Repo
  47. logger = logging.getLogger(__name__)
  48. def should_include_path(path: bytes, paths: Optional[list[bytes]]) -> bool:
  49. """Check if a path should be included based on path filters.
  50. Args:
  51. path: The path to check
  52. paths: List of path filters, or None for no filtering
  53. Returns:
  54. True if the path should be included
  55. """
  56. if not paths:
  57. return True
  58. return any(path == p or path.startswith(p + b"/") for p in paths)
  59. def diff_index_to_tree(
  60. repo: Repo,
  61. outstream: BinaryIO,
  62. commit_sha: Optional[bytes] = None,
  63. paths: Optional[list[bytes]] = None,
  64. ) -> None:
  65. """Show staged changes (index vs commit).
  66. Args:
  67. repo: Repository object
  68. outstream: Stream to write diff to
  69. commit_sha: SHA of commit to compare against, or None for HEAD
  70. paths: Optional list of paths to filter (as bytes)
  71. """
  72. if commit_sha is None:
  73. try:
  74. commit_sha = repo.refs[b"HEAD"]
  75. old_tree = repo[commit_sha].tree
  76. except KeyError:
  77. # No HEAD means no commits yet
  78. old_tree = None
  79. else:
  80. old_tree = repo[commit_sha].tree
  81. # Get tree from index
  82. index = repo.open_index()
  83. new_tree = commit_index(repo.object_store, index)
  84. changes = repo.object_store.tree_changes(old_tree, new_tree, paths=paths)
  85. for (oldpath, newpath), (oldmode, newmode), (oldsha, newsha) in changes:
  86. write_object_diff(
  87. outstream,
  88. repo.object_store,
  89. (oldpath, oldmode, oldsha),
  90. (newpath, newmode, newsha),
  91. )
  92. def diff_working_tree_to_tree(
  93. repo: Repo,
  94. outstream: BinaryIO,
  95. commit_sha: bytes,
  96. paths: Optional[list[bytes]] = None,
  97. ) -> None:
  98. """Compare working tree to a specific commit.
  99. Args:
  100. repo: Repository object
  101. outstream: Stream to write diff to
  102. commit_sha: SHA of commit to compare against
  103. paths: Optional list of paths to filter (as bytes)
  104. """
  105. tree = repo[commit_sha].tree
  106. normalizer = repo.get_blob_normalizer()
  107. filter_callback = normalizer.checkin_normalize
  108. # Get index for tracking new files
  109. index = repo.open_index()
  110. index_paths = set(index.paths())
  111. processed_paths = set()
  112. # Process files from the committed tree lazily
  113. for entry in iter_tree_contents(repo.object_store, tree):
  114. path = entry.path
  115. if not should_include_path(path, paths):
  116. continue
  117. processed_paths.add(path)
  118. full_path = os.path.join(repo.path, path.decode("utf-8"))
  119. # Get the old file from tree
  120. old_mode = entry.mode
  121. old_sha = entry.sha
  122. old_blob = repo.object_store[old_sha]
  123. assert isinstance(old_blob, Blob)
  124. try:
  125. # Use lstat to handle symlinks properly
  126. st = os.lstat(full_path)
  127. except FileNotFoundError:
  128. # File was deleted
  129. if old_blob is not None:
  130. write_blob_diff(
  131. outstream, (path, old_mode, old_blob), (None, None, None)
  132. )
  133. except PermissionError:
  134. logger.warning("%s: Permission denied", path.decode())
  135. # Show as deletion if it was in tree
  136. if old_blob is not None:
  137. write_blob_diff(
  138. outstream, (path, old_mode, old_blob), (None, None, None)
  139. )
  140. except OSError as e:
  141. logger.warning("%s: %s", path.decode(), e)
  142. # Show as deletion if it was in tree
  143. if old_blob is not None:
  144. write_blob_diff(
  145. outstream, (path, old_mode, old_blob), (None, None, None)
  146. )
  147. else:
  148. # Handle different file types
  149. if stat.S_ISDIR(st.st_mode):
  150. if old_blob is not None:
  151. # Directory in working tree where file was expected
  152. if stat.S_ISLNK(old_mode):
  153. logger.warning("%s: symlink became a directory", path.decode())
  154. else:
  155. logger.warning("%s: file became a directory", path.decode())
  156. # Show as deletion
  157. write_blob_diff(
  158. outstream, (path, old_mode, old_blob), (None, None, None)
  159. )
  160. # If old_blob is None, it's a new directory - skip it
  161. continue
  162. elif stat.S_ISLNK(st.st_mode):
  163. # Symlink in working tree
  164. target = os.readlink(full_path).encode("utf-8")
  165. new_blob = Blob()
  166. new_blob.data = target
  167. if old_blob is None:
  168. # New symlink
  169. write_blob_diff(
  170. outstream,
  171. (None, None, None),
  172. (path, stat.S_IFLNK | 0o777, new_blob),
  173. )
  174. elif not stat.S_ISLNK(old_mode):
  175. # Type change: file/submodule -> symlink
  176. write_blob_diff(
  177. outstream,
  178. (path, old_mode, old_blob),
  179. (path, stat.S_IFLNK | 0o777, new_blob),
  180. )
  181. elif old_blob is not None and old_blob.data != target:
  182. # Symlink target changed
  183. write_blob_diff(
  184. outstream,
  185. (path, old_mode, old_blob),
  186. (path, old_mode, new_blob),
  187. )
  188. elif stat.S_ISREG(st.st_mode):
  189. # Regular file
  190. with open(full_path, "rb") as f:
  191. new_content = f.read()
  192. # Create a temporary blob for filtering and comparison
  193. new_blob = Blob()
  194. new_blob.data = new_content
  195. # Apply filters if needed (only for regular files, not gitlinks)
  196. if filter_callback is not None and (
  197. old_blob is None or not S_ISGITLINK(old_mode)
  198. ):
  199. new_blob = filter_callback(new_blob, path)
  200. # Determine the git mode for the new file
  201. if st.st_mode & stat.S_IXUSR:
  202. new_git_mode = stat.S_IFREG | 0o755
  203. else:
  204. new_git_mode = stat.S_IFREG | 0o644
  205. if old_blob is None:
  206. # New file
  207. write_blob_diff(
  208. outstream, (None, None, None), (path, new_git_mode, new_blob)
  209. )
  210. elif stat.S_ISLNK(old_mode):
  211. # Symlink -> file
  212. write_blob_diff(
  213. outstream,
  214. (path, old_mode, old_blob),
  215. (path, new_git_mode, new_blob),
  216. )
  217. elif S_ISGITLINK(old_mode):
  218. # Submodule -> file
  219. write_blob_diff(
  220. outstream,
  221. (path, old_mode, old_blob),
  222. (path, new_git_mode, new_blob),
  223. )
  224. else:
  225. # Regular file, check for content or mode changes
  226. old_git_mode = old_mode & (stat.S_IFREG | 0o777)
  227. if (
  228. old_blob is not None and old_blob.data != new_blob.data
  229. ) or old_git_mode != new_git_mode:
  230. write_blob_diff(
  231. outstream,
  232. (path, old_mode, old_blob),
  233. (path, new_git_mode, new_blob),
  234. )
  235. elif stat.S_ISFIFO(st.st_mode):
  236. logger.warning("%s: unsupported file type (fifo)", path.decode())
  237. if old_blob is not None:
  238. write_blob_diff(
  239. outstream, (path, old_mode, old_blob), (None, None, None)
  240. )
  241. elif stat.S_ISSOCK(st.st_mode):
  242. logger.warning("%s: unsupported file type (socket)", path.decode())
  243. if old_blob is not None:
  244. write_blob_diff(
  245. outstream, (path, old_mode, old_blob), (None, None, None)
  246. )
  247. else:
  248. logger.warning("%s: unsupported file type", path.decode())
  249. if old_blob is not None:
  250. write_blob_diff(
  251. outstream, (path, old_mode, old_blob), (None, None, None)
  252. )
  253. # Now process any new files from index that weren't in the tree
  254. for path in sorted(index_paths - processed_paths):
  255. if not should_include_path(path, paths):
  256. continue
  257. full_path = os.path.join(repo.path, path.decode("utf-8"))
  258. try:
  259. # Use lstat to handle symlinks properly
  260. st = os.lstat(full_path)
  261. except FileNotFoundError:
  262. # New file already deleted, skip
  263. continue
  264. except PermissionError:
  265. logger.warning("%s: Permission denied", path.decode())
  266. continue
  267. except OSError as e:
  268. logger.warning("%s: %s", path.decode(), e)
  269. continue
  270. # Handle different file types for new files
  271. if stat.S_ISDIR(st.st_mode):
  272. # New directory - skip it
  273. continue
  274. elif stat.S_ISLNK(st.st_mode):
  275. # New symlink
  276. target = os.readlink(full_path).encode("utf-8")
  277. new_blob = Blob()
  278. new_blob.data = target
  279. write_blob_diff(
  280. outstream,
  281. (None, None, None),
  282. (path, stat.S_IFLNK | 0o777, new_blob),
  283. )
  284. elif stat.S_ISREG(st.st_mode):
  285. # New regular file
  286. with open(full_path, "rb") as f:
  287. new_content = f.read()
  288. new_blob = Blob()
  289. new_blob.data = new_content
  290. # Apply filters if needed
  291. if filter_callback is not None:
  292. new_blob = filter_callback(new_blob, path)
  293. # Determine the git mode for the new file
  294. if st.st_mode & stat.S_IXUSR:
  295. new_git_mode = 0o100755
  296. else:
  297. new_git_mode = 0o100644
  298. write_blob_diff(
  299. outstream, (None, None, None), (path, new_git_mode, new_blob)
  300. )
  301. elif stat.S_ISFIFO(st.st_mode):
  302. logger.warning("%s: unsupported file type (fifo)", path.decode())
  303. elif stat.S_ISSOCK(st.st_mode):
  304. logger.warning("%s: unsupported file type (socket)", path.decode())
  305. else:
  306. logger.warning("%s: unsupported file type", path.decode())
  307. def diff_working_tree_to_index(
  308. repo: Repo, outstream: BinaryIO, paths: Optional[list[bytes]] = None
  309. ) -> None:
  310. """Compare working tree to index.
  311. Args:
  312. repo: Repository object
  313. outstream: Stream to write diff to
  314. paths: Optional list of paths to filter (as bytes)
  315. """
  316. index = repo.open_index()
  317. normalizer = repo.get_blob_normalizer()
  318. filter_callback = normalizer.checkin_normalize
  319. # Process each file in the index
  320. for tree_path, entry in index.iteritems():
  321. if not should_include_path(tree_path, paths):
  322. continue
  323. # Handle conflicted entries by using stage 2 ("ours")
  324. if isinstance(entry, ConflictedIndexEntry):
  325. if entry.this is None:
  326. continue # No stage 2 entry, skip
  327. old_mode = entry.this.mode
  328. old_sha = entry.this.sha
  329. else:
  330. # Get file from regular index entry
  331. old_mode = entry.mode
  332. old_sha = entry.sha
  333. old_obj = repo.object_store[old_sha]
  334. # Type check and cast to Blob
  335. if isinstance(old_obj, Blob):
  336. old_blob = cast(Blob, old_obj)
  337. else:
  338. old_blob = None
  339. full_path = os.path.join(repo.path, tree_path.decode("utf-8"))
  340. try:
  341. # Use lstat to handle symlinks properly
  342. st = os.lstat(full_path)
  343. # Handle different file types
  344. if stat.S_ISDIR(st.st_mode):
  345. # Directory in working tree where file was expected
  346. if stat.S_ISLNK(old_mode):
  347. logger.warning("%s: symlink became a directory", tree_path.decode())
  348. else:
  349. logger.warning("%s: file became a directory", tree_path.decode())
  350. # Show as deletion
  351. write_blob_diff(
  352. outstream, (tree_path, old_mode, old_blob), (None, None, None)
  353. )
  354. elif stat.S_ISLNK(st.st_mode):
  355. # Symlink in working tree
  356. target = os.readlink(full_path).encode("utf-8")
  357. new_blob = Blob()
  358. new_blob.data = target
  359. # Check if type changed or content changed
  360. if not stat.S_ISLNK(old_mode):
  361. # Type change: file/submodule -> symlink
  362. write_blob_diff(
  363. outstream,
  364. (tree_path, old_mode, old_blob),
  365. (tree_path, stat.S_IFLNK | 0o777, new_blob),
  366. )
  367. elif old_blob is not None and old_blob.data != target:
  368. # Symlink target changed
  369. write_blob_diff(
  370. outstream,
  371. (tree_path, old_mode, old_blob),
  372. (tree_path, old_mode, new_blob),
  373. )
  374. elif stat.S_ISREG(st.st_mode):
  375. # Regular file
  376. with open(full_path, "rb") as f:
  377. new_content = f.read()
  378. # Create a temporary blob for filtering and comparison
  379. new_blob = Blob()
  380. new_blob.data = new_content
  381. # Apply filters if needed (only for regular files)
  382. if filter_callback is not None and not S_ISGITLINK(old_mode):
  383. new_blob = filter_callback(new_blob, tree_path)
  384. # Determine the git mode for the new file
  385. if st.st_mode & stat.S_IXUSR:
  386. new_git_mode = stat.S_IFREG | 0o755
  387. else:
  388. new_git_mode = stat.S_IFREG | 0o644
  389. # Check if this was a type change
  390. if stat.S_ISLNK(old_mode):
  391. # Symlink -> file
  392. write_blob_diff(
  393. outstream,
  394. (tree_path, old_mode, old_blob),
  395. (tree_path, new_git_mode, new_blob),
  396. )
  397. elif S_ISGITLINK(old_mode):
  398. # Submodule -> file
  399. write_blob_diff(
  400. outstream,
  401. (tree_path, old_mode, old_blob),
  402. (tree_path, new_git_mode, new_blob),
  403. )
  404. else:
  405. # Regular file, check for content or mode changes
  406. old_git_mode = old_mode & (stat.S_IFREG | 0o777)
  407. if (
  408. old_blob is not None and old_blob.data != new_blob.data
  409. ) or old_git_mode != new_git_mode:
  410. write_blob_diff(
  411. outstream,
  412. (tree_path, old_mode, old_blob),
  413. (tree_path, new_git_mode, new_blob),
  414. )
  415. elif stat.S_ISFIFO(st.st_mode):
  416. logger.warning("%s: unsupported file type (fifo)", tree_path.decode())
  417. write_blob_diff(
  418. outstream, (tree_path, old_mode, old_blob), (None, None, None)
  419. )
  420. elif stat.S_ISSOCK(st.st_mode):
  421. logger.warning("%s: unsupported file type (socket)", tree_path.decode())
  422. write_blob_diff(
  423. outstream, (tree_path, old_mode, old_blob), (None, None, None)
  424. )
  425. else:
  426. logger.warning("%s: unsupported file type", tree_path.decode())
  427. write_blob_diff(
  428. outstream, (tree_path, old_mode, old_blob), (None, None, None)
  429. )
  430. except FileNotFoundError:
  431. # File was deleted - this is normal, not a warning
  432. write_blob_diff(
  433. outstream, (tree_path, old_mode, old_blob), (None, None, None)
  434. )
  435. except PermissionError:
  436. logger.warning("%s: Permission denied", tree_path.decode())
  437. # Show as deletion since we can't read it
  438. write_blob_diff(
  439. outstream, (tree_path, old_mode, old_blob), (None, None, None)
  440. )
  441. except OSError as e:
  442. logger.warning("%s: %s", tree_path.decode(), e)
  443. # Show as deletion since we can't read it
  444. write_blob_diff(
  445. outstream, (tree_path, old_mode, old_blob), (None, None, None)
  446. )