# diff_tree.py -- Utilities for diffing files and trees.
# Copyright (C) 2010 Google, Inc.
#
# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
# General Public License as published by the Free Software Foundation; version 2.0
# or (at your option) any later version. You can redistribute it and/or
# modify it under the terms of either of these two licenses.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# You should have received a copy of the licenses; if not, see
# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
# License, Version 2.0.
#
  21. """Utilities for diffing files and trees."""
  22. import stat
  23. from collections import defaultdict
  24. from collections.abc import Callable, Iterator, Mapping, Sequence
  25. from collections.abc import Set as AbstractSet
  26. from io import BytesIO
  27. from itertools import chain
  28. from typing import TYPE_CHECKING, Any, NamedTuple, TypeVar
  29. from .object_store import BaseObjectStore
  30. from .objects import S_ISGITLINK, ObjectID, ShaFile, Tree, TreeEntry
  31. # TreeChange type constants.
  32. CHANGE_ADD = "add"
  33. CHANGE_MODIFY = "modify"
  34. CHANGE_DELETE = "delete"
  35. CHANGE_RENAME = "rename"
  36. CHANGE_COPY = "copy"
  37. CHANGE_UNCHANGED = "unchanged"
  38. RENAME_CHANGE_TYPES = (CHANGE_RENAME, CHANGE_COPY)
  39. # _NULL_ENTRY removed - using None instead
  40. _MAX_SCORE = 100
  41. RENAME_THRESHOLD = 60
  42. MAX_FILES = 200
  43. REWRITE_THRESHOLD: int | None = None
  class TreeChange(NamedTuple):
      """A single change between two trees, as a (type, old, new) tuple.

      ``old`` and ``new`` are the entries on each side of the change; a side
      that does not exist is None.
      """

      type: str
      old: TreeEntry | None
      new: TreeEntry | None

      @classmethod
      def add(cls, new: TreeEntry) -> "TreeChange":
          """Build the change describing an entry that only exists in the new tree.

          Args:
            new: The entry that was added.

          Returns:
            A TreeChange of type CHANGE_ADD whose old side is None.
          """
          return cls(type=CHANGE_ADD, old=None, new=new)

      @classmethod
      def delete(cls, old: TreeEntry) -> "TreeChange":
          """Build the change describing an entry that only exists in the old tree.

          Args:
            old: The entry that was removed.

          Returns:
            A TreeChange of type CHANGE_DELETE whose new side is None.
          """
          return cls(type=CHANGE_DELETE, old=old, new=None)
  def _tree_entries(path: bytes, tree: Tree) -> list[TreeEntry]:
      """List the entries of a tree in name order, re-rooted under ``path``.

      Args:
        path: Path handed to each entry's ``in_path``.
        tree: The tree to list; a falsy value (None or an empty tree) yields
          an empty list.

      Returns:
        A new list with one entry per item of ``tree``.
      """
      if not tree:
          return []
      return [entry.in_path(path) for entry in tree.iteritems(name_order=True)]
  def _merge_entries(
      path: bytes, tree1: Tree, tree2: Tree
  ) -> list[tuple[TreeEntry | None, TreeEntry | None]]:
      """Pair up the entries of two trees by path.

      Both entry lists come back from _tree_entries in name order, so a single
      forward pass over the two lists is enough to align them.

      Args:
        path: A path to prepend to all tree entry names.
        tree1: The first Tree object to iterate, or None.
        tree2: The second Tree object to iterate, or None.

      Returns:
        A list of (entry1, entry2) pairs. When a path is present in only one
        tree the other element of the pair is None; when both elements are
        present they have the same path.
      """
      left = _tree_entries(path, tree1)
      right = _tree_entries(path, tree2)
      left_count, right_count = len(left), len(right)

      pairs: list[tuple[TreeEntry | None, TreeEntry | None]] = []
      li = ri = 0
      while li < left_count and ri < right_count:
          a, b = left[li], right[ri]
          if a.path == b.path:
              pairs.append((a, b))
              li += 1
              ri += 1
          elif a.path < b.path:
              # Only tree1 has this path.
              pairs.append((a, None))
              li += 1
          else:
              # Only tree2 has this path.
              pairs.append((None, b))
              ri += 1

      # At most one of the two lists still has unconsumed entries.
      pairs.extend((a, None) for a in left[li:])
      pairs.extend((None, b) for b in right[ri:])
      return pairs
  def _is_tree(entry: TreeEntry | None) -> bool:
      """Return True only for an entry whose mode marks it as a directory.

      A missing entry, or an entry without a mode, is not a tree.
      """
      return (
          entry is not None
          and entry.mode is not None
          and stat.S_ISDIR(entry.mode)
      )
  def walk_trees(
      store: BaseObjectStore,
      tree1_id: ObjectID | None,
      tree2_id: ObjectID | None,
      prune_identical: bool = False,
      paths: Sequence[bytes] | None = None,
  ) -> Iterator[tuple[TreeEntry | None, TreeEntry | None]]:
      """Recursively walk all the entries of two trees.

      Iteration is depth-first pre-order, as in e.g. os.walk.

      Args:
        store: An ObjectStore for looking up objects.
        tree1_id: The SHA of the first Tree object to iterate, or None.
        tree2_id: The SHA of the second Tree object to iterate, or None.
        prune_identical: If True, identical subtrees will not be walked.
        paths: Optional list of paths to filter to (as bytes). With a filter,
          a subtree is only descended into when it equals, contains, or lies
          under one of the paths, and a pair is only yielded when its path
          equals a filter path, lies under one, or is a directory on the way
          to one.

      Returns:
        Iterator over pairs of TreeEntry objects for each pair of entries
        in the trees and their subtrees recursively. If an entry exists in one
        tree but not the other, the other entry will be None. If both entries
        exist, they are guaranteed to match.
      """
      # This could be fairly easily generalized to >2 trees if we find a use
      # case.
      root1 = TreeEntry(b"", stat.S_IFDIR, tree1_id) if tree1_id else None
      root2 = TreeEntry(b"", stat.S_IFDIR, tree2_id) if tree2_id else None

      # Explicit stack; children are pushed reversed so they pop in name order.
      pending: list[tuple[TreeEntry | None, TreeEntry | None]] = [(root1, root2)]
      while pending:
          entry1, entry2 = pending.pop()
          is_tree1 = _is_tree(entry1)
          is_tree2 = _is_tree(entry2)
          if prune_identical and is_tree1 and is_tree2 and entry1 == entry2:
              continue

          # Load the objects for directory entries; a falsy object counts as
          # "no tree" here.
          tree1 = (store[entry1.sha] or None) if (is_tree1 and entry1) else None
          tree2 = (store[entry2.sha] or None) if (is_tree2 and entry2) else None

          # First non-empty path of the pair; the root pair has path b"".
          path = b""
          for candidate in (entry1, entry2):
              if candidate and candidate.path:
                  path = candidate.path
                  break

          # With path filters, skip directories unrelated to every filter path.
          # The root (b"") is always descended into.
          if paths is not None and (is_tree1 or is_tree2) and path != b"":
              related = any(
                  wanted == path
                  or wanted.startswith(path + b"/")
                  or path.startswith(wanted + b"/")
                  for wanted in paths
              )
              if not related:
                  continue

          # Only real Tree objects can be merged.
          if not isinstance(tree1, Tree):
              tree1 = None
          if not isinstance(tree2, Tree):
              tree2 = None

          if tree1 is not None or tree2 is not None:
              # A missing side is walked as an empty tree.
              children = _merge_entries(
                  path,
                  tree1 if tree1 is not None else Tree(),
                  tree2 if tree2 is not None else Tree(),
              )
              pending.extend(reversed(children))

          if paths is None:
              yield entry1, entry2
          elif any(
              path == wanted
              or path.startswith(wanted + b"/")
              or ((is_tree1 or is_tree2) and wanted.startswith(path + b"/"))
              for wanted in paths
          ):
              # Exact match, entry below a filter directory, or a parent
              # directory of a filter path.
              yield entry1, entry2
  def _skip_tree(entry: TreeEntry | None, include_trees: bool) -> TreeEntry | None:
      """Return ``entry``, or None when it should be treated as missing.

      An entry is treated as missing when it is None, has no mode, or is a
      directory while ``include_trees`` is False.
      """
      if entry is None or entry.mode is None:
          return None
      hidden_tree = stat.S_ISDIR(entry.mode) and not include_trees
      return None if hidden_tree else entry
  def tree_changes(
      store: BaseObjectStore,
      tree1_id: ObjectID | None,
      tree2_id: ObjectID | None,
      want_unchanged: bool = False,
      rename_detector: "RenameDetector | None" = None,
      include_trees: bool = False,
      change_type_same: bool = False,
      paths: Sequence[bytes] | None = None,
  ) -> Iterator[TreeChange]:
      """Find the differences between the contents of two trees.

      Args:
        store: An ObjectStore for looking up objects.
        tree1_id: The SHA of the source tree.
        tree2_id: The SHA of the target tree.
        want_unchanged: If True, include TreeChanges for unmodified entries
          as well.
        rename_detector: RenameDetector object for detecting renames. It is
          only used when both tree ids are given; in that case
          ``change_type_same`` and ``paths`` are not forwarded to it and
          have no effect.
        include_trees: Whether to include trees
        change_type_same: Whether to report change types in the same
          entry or as delete+add.
        paths: Optional list of paths to filter to (as bytes).

      Returns:
        Iterator over TreeChange instances for each change between the
        source and target tree.
      """
      # The whole forward reference has to be inside the string in the
      # annotation above: ``"RenameDetector" | None`` would apply ``|`` to a
      # str when the annotation is evaluated.
      if rename_detector is not None and tree1_id is not None and tree2_id is not None:
          yield from rename_detector.changes_with_renames(
              tree1_id,
              tree2_id,
              want_unchanged=want_unchanged,
              include_trees=include_trees,
          )
          return

      entries = walk_trees(
          store, tree1_id, tree2_id, prune_identical=(not want_unchanged), paths=paths
      )
      for entry1, entry2 in entries:
          if entry1 == entry2 and not want_unchanged:
              continue

          # Treat entries for trees as missing.
          entry1 = _skip_tree(entry1, include_trees)
          entry2 = _skip_tree(entry2, include_trees)

          if entry1 is not None and entry2 is not None:
              if (
                  entry1.mode is not None
                  and entry2.mode is not None
                  and stat.S_IFMT(entry1.mode) != stat.S_IFMT(entry2.mode)
                  and not change_type_same
              ):
                  # File type changed: report as delete/add. The delete is
                  # yielded here, the add by the common yield below.
                  yield TreeChange.delete(entry1)
                  entry1 = None
                  change_type = CHANGE_ADD
              elif entry1 == entry2:
                  change_type = CHANGE_UNCHANGED
              else:
                  change_type = CHANGE_MODIFY
          elif entry1 is not None:
              change_type = CHANGE_DELETE
          elif entry2 is not None:
              change_type = CHANGE_ADD
          else:
              # Both were None because at least one was a tree.
              continue
          yield TreeChange(change_type, entry1, entry2)
  T = TypeVar("T")
  U = TypeVar("U")


  def _all_eq(seq: Sequence[T], key: Callable[[T], U], value: U) -> bool:
      """Return True if ``key(e)`` equals ``value`` for every element of ``seq``.

      An empty sequence gives True. Evaluation stops at the first mismatch.
      """
      return not any(key(element) != value for element in seq)
  def _all_same(seq: Sequence[Any], key: Callable[[Any], Any]) -> bool:
      """Return True if ``key`` gives the same value for every element of ``seq``.

      Args:
        seq: The elements to compare.
        key: Function computing the value to compare for one element.

      Returns:
        True when all keys are equal. An empty or single-element sequence is
        trivially "all the same" and gives True.
      """
      # Guard the empty case explicitly; indexing seq[0] would raise
      # IndexError otherwise.
      if not seq:
          return True
      first = key(seq[0])
      return not any(key(element) != first for element in seq[1:])
  def tree_changes_for_merge(
      store: BaseObjectStore,
      parent_tree_ids: Sequence[ObjectID],
      tree_id: ObjectID,
      rename_detector: "RenameDetector | None" = None,
  ) -> Iterator[list[TreeChange | None]]:
      """Get the tree changes for a merge tree relative to all its parents.

      Args:
        store: An ObjectStore for looking up objects.
        parent_tree_ids: A sequence of the SHAs of the parent trees.
        tree_id: The SHA of the merge tree.
        rename_detector: RenameDetector object for detecting renames.

      Returns:
        Iterator over lists of TreeChange objects, one per conflicted path
        in the merge, in path order.

        Each list contains one element per parent, with the TreeChange for that
        path relative to that parent. An element may be None if it never
        existed in one parent and was deleted in two others.

        A path is only included in the output if it is a conflict, i.e. its SHA
        in the merge tree is not found in any of the parents, or in the case of
        deletes, if not all of the old SHAs match.
      """
      # The whole forward reference has to be inside the string in the
      # annotation above: ``"RenameDetector" | None`` would apply ``|`` to a
      # str when the annotation is evaluated.
      all_parent_changes = [
          tree_changes(store, t, tree_id, rename_detector=rename_detector)
          for t in parent_tree_ids
      ]
      num_parents = len(parent_tree_ids)
      # path -> one slot per parent, None where that parent shows no change.
      changes_by_path: dict[bytes, list[TreeChange | None]] = defaultdict(
          lambda: [None] * num_parents
      )

      # Organize by path.
      for i, parent_changes in enumerate(all_parent_changes):
          for change in parent_changes:
              if change.type == CHANGE_DELETE:
                  # A delete has no new side, so it is filed under its old path.
                  assert change.old is not None
                  path = change.old.path
              else:
                  assert change.new is not None
                  path = change.new.path
              assert path is not None
              changes_by_path[path][i] = change

      def old_sha(c: TreeChange) -> ObjectID | None:
          return c.old.sha if c.old is not None else None

      def change_type(c: TreeChange) -> str:
          return c.type

      # Yield only conflicting changes.
      for _, changes in sorted(changes_by_path.items()):
          assert len(changes) == num_parents
          have = [c for c in changes if c is not None]
          if _all_eq(have, change_type, CHANGE_DELETE):
              # Deleted everywhere: a conflict only if the deleted contents
              # differed between parents.
              if not _all_same(have, old_sha):
                  yield changes
          elif not _all_same(have, change_type):
              yield changes
          elif None not in changes:
              # If no change was found relative to one parent, that means the SHA
              # must have matched the SHA in that parent, so it is not a
              # conflict.
              yield changes
  _BLOCK_SIZE = 64


  def _count_blocks(obj: ShaFile) -> dict[int, int]:
      """Count the blocks in an object.

      Splits the data into blocks either on lines or <=64-byte chunks of lines:
      a block ends right after a newline byte, or once it holds _BLOCK_SIZE
      bytes, whichever comes first.

      Args:
        obj: The object to count blocks for; only ``as_raw_chunks()`` is used.

      Returns:
        A dict of block hashcode -> total bytes occurring.
      """
      block_counts: dict[int, int] = defaultdict(int)
      # Work on one contiguous buffer so block boundaries can be located with
      # bytes.find and slicing instead of a Python-level loop over every byte.
      data = b"".join(obj.as_raw_chunks())
      size = len(data)
      start = 0
      while start < size:
          limit = start + _BLOCK_SIZE
          # Search only the next _BLOCK_SIZE bytes for a line end.
          newline = data.find(b"\n", start, limit)
          end = newline + 1 if newline != -1 else min(limit, size)
          block = data[start:end]
          block_counts[hash(block)] += len(block)
          start = end
      return block_counts
  def _common_bytes(blocks1: Mapping[int, int], blocks2: Mapping[int, int]) -> int:
      """Count the number of common bytes in two block count dicts.

      Args:
        blocks1: The first dict of block hashcode -> total bytes.
        blocks2: The second dict of block hashcode -> total bytes.

      Returns:
        The number of bytes in common between blocks1 and blocks2. This is
        only approximate due to possible hash collisions.
      """
      # The measure is symmetrical, so walk whichever mapping is smaller and
      # probe the other one.
      if len(blocks1) > len(blocks2):
          smaller, larger = blocks2, blocks1
      else:
          smaller, larger = blocks1, blocks2
      return sum(
          min(size, other_size)
          for block_hash, size in smaller.items()
          if (other_size := larger.get(block_hash))
      )
  def _similarity_score(
      obj1: ShaFile,
      obj2: ShaFile,
      block_cache: dict[ObjectID, dict[int, int]] | None = None,
  ) -> int:
      """Compute a similarity score for two objects.

      Args:
        obj1: The first object to score.
        obj2: The second object to score.
        block_cache: An optional dict of SHA to block counts to cache
          results between calls. Missing entries for the two objects are
          added to it.

      Returns:
        The similarity score between the two objects, defined as the
        number of bytes in common between the two objects divided by the
        maximum size, scaled to the range 0-100. Two empty objects score
        the maximum.
      """
      cache = {} if block_cache is None else block_cache
      for obj in (obj1, obj2):
          if obj.id not in cache:
              cache[obj.id] = _count_blocks(obj)

      shared = _common_bytes(cache[obj1.id], cache[obj2.id])
      largest = max(obj1.raw_length(), obj2.raw_length())
      if not largest:
          return _MAX_SCORE
      return int(float(shared) * _MAX_SCORE / largest)
  def _tree_change_key(entry: TreeChange) -> tuple[bytes, bytes]:
      """Sort key for a TreeChange: (old path, new path).

      When only one side of the change has a path, that path is used for both
      elements of the key.
      """
      old_path = None if entry.old is None else entry.old.path
      new_path = None if entry.new is None else entry.new.path
      first = new_path if old_path is None else old_path
      second = old_path if new_path is None else new_path
      assert first is not None
      assert second is not None
      return (first, second)
  class RenameDetector:
      """Object for handling rename detection between two trees.

      A detector can be reused: every call to changes_with_renames() starts by
      clearing the adds, deletes and changes collected by the previous call.
      """

      # Changes still classified as plain adds / deletes, the changes already
      # settled, and the scored rename candidates of the current run.
      _adds: list[TreeChange]
      _deletes: list[TreeChange]
      _changes: list[TreeChange]
      _candidates: list[tuple[int, TreeChange]]
      _include_trees: bool

      def __init__(
          self,
          store: BaseObjectStore,
          rename_threshold: int = RENAME_THRESHOLD,
          max_files: int | None = MAX_FILES,
          rewrite_threshold: int | None = REWRITE_THRESHOLD,
          find_copies_harder: bool = False,
      ) -> None:
          """Initialize the rename detector.

          Args:
            store: An ObjectStore for looking up objects.
            rename_threshold: The threshold similarity score for considering
              an add/delete pair to be a rename/copy; see _similarity_score.
            max_files: The maximum number of adds and deletes to consider,
              or None for no limit. The detector is guaranteed to compare no more
              than max_files ** 2 add/delete pairs. This limit is provided
              because rename detection can be quadratic in the project size. If
              the limit is exceeded, no content rename detection is attempted.
            rewrite_threshold: The threshold similarity score below which a
              modify should be considered a delete/add, or None to not break
              modifies; see _similarity_score.
            find_copies_harder: If True, consider unmodified files when
              detecting copies.
          """
          self._store = store
          self._rename_threshold = rename_threshold
          self._rewrite_threshold = rewrite_threshold
          self._max_files = max_files
          self._find_copies_harder = find_copies_harder
          self._want_unchanged = False
          # Per-run state. It is given defaults here so the helper methods do
          # not fail with AttributeError when used before the first
          # changes_with_renames() call.
          self._include_trees = False
          self._candidates = []
          self._reset()

      def _reset(self) -> None:
          """Forget the adds, deletes and changes of any previous run."""
          self._adds = []
          self._deletes = []
          self._changes = []

      def _should_split(self, change: TreeChange) -> bool:
          """Decide whether a modify is dissimilar enough to become delete+add.

          Only modifies are ever split, and only when a rewrite threshold is
          configured and the two sides have different SHAs.
          """
          if self._rewrite_threshold is None or change.type != CHANGE_MODIFY:
              return False
          assert change.old is not None and change.new is not None
          if change.old.sha == change.new.sha:
              return False
          assert change.old.sha is not None
          assert change.new.sha is not None
          old_obj = self._store[change.old.sha]
          new_obj = self._store[change.new.sha]
          return _similarity_score(old_obj, new_obj) < self._rewrite_threshold

      def _add_change(self, change: TreeChange) -> None:
          """File one raw tree change into adds, deletes or settled changes."""
          if change.type == CHANGE_ADD:
              self._adds.append(change)
          elif change.type == CHANGE_DELETE:
              self._deletes.append(change)
          elif self._should_split(change):
              assert change.old is not None and change.new is not None
              self._deletes.append(TreeChange.delete(change.old))
              self._adds.append(TreeChange.add(change.new))
          elif (
              self._find_copies_harder and change.type == CHANGE_UNCHANGED
          ) or change.type == CHANGE_MODIFY:
              # Treat all modifies as potential deletes for rename detection,
              # but don't split them (to avoid spurious renames). Setting
              # find_copies_harder means we treat unchanged the same as
              # modified.
              self._deletes.append(change)
          else:
              self._changes.append(change)

      def _collect_changes(
          self, tree1_id: ObjectID | None, tree2_id: ObjectID | None
      ) -> None:
          """Diff the two trees without rename detection and classify each change."""
          # Unchanged entries are needed as copy sources by find_copies_harder
          # even when the caller did not ask to see them.
          want_unchanged = self._find_copies_harder or self._want_unchanged
          for change in tree_changes(
              self._store,
              tree1_id,
              tree2_id,
              want_unchanged=want_unchanged,
              include_trees=self._include_trees,
          ):
              self._add_change(change)

      def _prune(
          self, add_paths: AbstractSet[bytes], delete_paths: AbstractSet[bytes]
      ) -> None:
          """Drop the adds and deletes whose paths have been consumed.

          Args:
            add_paths: New paths to remove from the pending adds.
            delete_paths: Old paths to remove from the pending deletes.
          """

          def check_add(a: TreeChange) -> bool:
              assert a.new is not None
              return a.new.path not in add_paths

          def check_delete(d: TreeChange) -> bool:
              assert d.old is not None
              return d.old.path not in delete_paths

          self._adds = [a for a in self._adds if check_add(a)]
          self._deletes = [d for d in self._deletes if check_delete(d)]

      def _find_exact_renames(self) -> None:
          """Pair adds and deletes that have the same SHA as renames or copies."""
          add_map = defaultdict(list)
          for add in self._adds:
              assert add.new is not None
              add_map[add.new.sha].append(add.new)
          delete_map = defaultdict(list)
          for delete in self._deletes:
              # Keep track of whether the delete was actually marked as a delete.
              # If not, it needs to be marked as a copy.
              is_delete = delete.type == CHANGE_DELETE
              assert delete.old is not None
              delete_map[delete.old.sha].append((delete.old, is_delete))

          add_paths = set()
          delete_paths = set()
          for sha, sha_deletes in delete_map.items():
              sha_adds = add_map[sha]
              for (old, is_delete), new in zip(sha_deletes, sha_adds):
                  assert old.mode is not None
                  assert new.mode is not None
                  if stat.S_IFMT(old.mode) != stat.S_IFMT(new.mode):
                      continue
                  if is_delete:
                      assert old.path is not None
                      delete_paths.add(old.path)
                  assert new.path is not None
                  add_paths.add(new.path)
                  new_type = CHANGE_RENAME if is_delete else CHANGE_COPY
                  self._changes.append(TreeChange(new_type, old, new))

              num_extra_adds = len(sha_adds) - len(sha_deletes)
              # TODO(dborowitz): Less arbitrary way of dealing with extra copies.
              old = sha_deletes[0][0]
              if num_extra_adds > 0:
                  # More adds than deletes for this SHA: the surplus adds are
                  # reported as copies of the first deleted entry.
                  for new in sha_adds[-num_extra_adds:]:
                      assert new.path is not None
                      add_paths.add(new.path)
                      self._changes.append(TreeChange(CHANGE_COPY, old, new))
          self._prune(add_paths, delete_paths)

      def _should_find_content_renames(self) -> bool:
          """Return False when the add x delete matrix exceeds max_files ** 2."""
          if self._max_files is None:
              return True
          return len(self._adds) * len(self._deletes) <= self._max_files**2

      def _rename_type(
          self, check_paths: bool, delete: TreeChange, add: TreeChange
      ) -> str:
          """Choose the change type for a content-matched delete/add pair."""
          assert delete.old is not None and add.new is not None
          if check_paths and delete.old.path == add.new.path:
              # If the paths match, this must be a split modify, so make sure it
              # comes out as a modify.
              return CHANGE_MODIFY
          elif delete.type != CHANGE_DELETE:
              # If it's in deletes but not marked as a delete, it must have been
              # added due to find_copies_harder, and needs to be marked as a
              # copy.
              return CHANGE_COPY
          return CHANGE_RENAME

      def _find_content_rename_candidates(self) -> None:
          """Score every compatible delete/add pair and keep those above threshold.

          Candidates are stored as (-score, change) so that a plain sort puts
          the best score first.
          """
          candidates = self._candidates = []
          # TODO: Optimizations:
          # - Compare object sizes before counting blocks.
          # - Skip if delete's S_IFMT differs from all adds.
          # - Skip if adds or deletes is empty.
          # Match C git's behavior of not attempting to find content renames if
          # the matrix size exceeds the threshold.
          if not self._should_find_content_renames():
              return

          block_cache = {}
          check_paths = self._rename_threshold is not None
          for delete in self._deletes:
              assert delete.old is not None
              assert delete.old.mode is not None
              if S_ISGITLINK(delete.old.mode):
                  continue  # Git links don't exist in this repo.
              assert delete.old.sha is not None
              old_sha = delete.old.sha
              old_obj = self._store[old_sha]
              block_cache[old_sha] = _count_blocks(old_obj)
              for add in self._adds:
                  assert add.new is not None
                  assert add.new.mode is not None
                  if stat.S_IFMT(delete.old.mode) != stat.S_IFMT(add.new.mode):
                      continue
                  assert add.new.sha is not None
                  new_obj = self._store[add.new.sha]
                  score = _similarity_score(old_obj, new_obj, block_cache=block_cache)
                  if score > self._rename_threshold:
                      new_type = self._rename_type(check_paths, delete, add)
                      rename = TreeChange(new_type, delete.old, add.new)
                      candidates.append((-score, rename))

      def _choose_content_renames(self) -> None:
          """Accept candidates best-first, using each new path at most once."""
          # Sort scores from highest to lowest, but keep names in ascending
          # order.
          self._candidates.sort()

          delete_paths = set()
          add_paths = set()
          for _, change in self._candidates:
              assert change.old is not None and change.new is not None
              new_path = change.new.path
              assert new_path is not None
              if new_path in add_paths:
                  continue
              old_path = change.old.path
              assert old_path is not None
              orig_type = change.type
              if old_path in delete_paths:
                  # The old path was already consumed by a better match, so
                  # this one is reported as a copy.
                  change = TreeChange(CHANGE_COPY, change.old, change.new)

              # If the candidate was originally a copy, that means it came from a
              # modified or unchanged path, so we don't want to prune it.
              if orig_type != CHANGE_COPY:
                  delete_paths.add(old_path)
              add_paths.add(new_path)
              self._changes.append(change)
          self._prune(add_paths, delete_paths)

      def _join_modifies(self) -> None:
          """Re-join a leftover delete and add on the same path into one modify.

          Does nothing unless a rewrite threshold is configured.
          """
          if self._rewrite_threshold is None:
              return

          modifies = {}
          delete_map = {}
          for d in self._deletes:
              assert d.old is not None
              delete_map[d.old.path] = d
          for add in self._adds:
              assert add.new is not None
              path = add.new.path
              delete = delete_map.get(path)
              if (
                  delete is not None
                  and delete.old is not None
                  and delete.old.mode is not None
                  and add.new.mode is not None
                  and stat.S_IFMT(delete.old.mode) == stat.S_IFMT(add.new.mode)
              ):
                  modifies[path] = TreeChange(CHANGE_MODIFY, delete.old, add.new)

          def check_add_mod(a: TreeChange) -> bool:
              assert a.new is not None
              return a.new.path not in modifies

          def check_delete_mod(d: TreeChange) -> bool:
              assert d.old is not None
              return d.old.path not in modifies

          self._adds = [a for a in self._adds if check_add_mod(a)]
          self._deletes = [d for d in self._deletes if check_delete_mod(d)]
          self._changes += modifies.values()

      def _sorted_changes(self) -> list[TreeChange]:
          """Return all remaining adds, deletes and changes, ordered by path."""
          result = [*self._adds, *self._deletes, *self._changes]
          result.sort(key=_tree_change_key)
          return result

      def _prune_unchanged(self) -> None:
          """Drop unchanged entries from the deletes unless the caller wants them."""
          if self._want_unchanged:
              return
          self._deletes = [d for d in self._deletes if d.type != CHANGE_UNCHANGED]

      def changes_with_renames(
          self,
          tree1_id: ObjectID | None,
          tree2_id: ObjectID | None,
          want_unchanged: bool = False,
          include_trees: bool = False,
      ) -> list[TreeChange]:
          """Compute the TreeChanges between two tree SHAs, with rename detection.

          Args:
            tree1_id: The SHA of the source tree.
            tree2_id: The SHA of the target tree.
            want_unchanged: If True, include unchanged entries in the result.
            include_trees: Whether to include trees.

          Returns:
            A list of TreeChange instances sorted by old path, then new path.
          """
          self._reset()
          self._want_unchanged = want_unchanged
          self._include_trees = include_trees
          self._collect_changes(tree1_id, tree2_id)
          self._find_exact_renames()
          self._find_content_rename_candidates()
          self._choose_content_renames()
          self._join_modifies()
          self._prune_unchanged()
          return self._sorted_changes()
  # Keep the pure-Python implementations reachable under *_py names for
  # testing, since the plain names may be rebound below.
  _is_tree_py = _is_tree
  _merge_entries_py = _merge_entries
  _count_blocks_py = _count_blocks

  if not TYPE_CHECKING:
      # Type checkers keep seeing the Python implementations above. At runtime
      # the Rust extension is used instead when it can be imported.
      try:
          from dulwich._diff_tree import (
              _count_blocks as _rust_count_blocks,
              _is_tree as _rust_is_tree,
              _merge_entries as _rust_merge_entries,
          )
      except ImportError:
          # Extension not available: keep the Python implementations.
          pass
      else:
          _count_blocks = _rust_count_blocks
          _is_tree = _rust_is_tree
          _merge_entries = _rust_merge_entries