diff_tree.py 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254
  1. # diff_tree.py -- Utilities for diffing files and trees.
  2. # Copyright (C) 2010 Google, Inc.
  3. #
  4. # This program is free software; you can redistribute it and/or
  5. # modify it under the terms of the GNU General Public License
  6. # as published by the Free Software Foundation; either version 2
  7. # or (at your option) a later version of the License.
  8. #
  9. # This program is distributed in the hope that it will be useful,
  10. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. # GNU General Public License for more details.
  13. #
  14. # You should have received a copy of the GNU General Public License
  15. # along with this program; if not, write to the Free Software
  16. # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  17. # MA 02110-1301, USA.
  18. """Utilities for diffing files and trees."""
  19. from cStringIO import StringIO
  20. import itertools
  21. import stat
  22. from dulwich.misc import (
  23. defaultdict,
  24. TreeChangeTuple,
  25. )
  26. from dulwich.objects import (
  27. TreeEntry,
  28. )
  29. # TreeChange type constants.
  30. CHANGE_ADD = 'add'
  31. CHANGE_MODIFY = 'modify'
  32. CHANGE_DELETE = 'delete'
  33. CHANGE_RENAME = 'rename'
  34. CHANGE_COPY = 'copy'
  35. CHANGE_UNCHANGED = 'unchanged'
  36. _NULL_ENTRY = TreeEntry(None, None, None)
  37. _MAX_SCORE = 100
  38. class TreeChange(TreeChangeTuple):
  39. """Class encapsulating a single change between two trees."""
  40. @classmethod
  41. def add(cls, new):
  42. return cls(CHANGE_ADD, _NULL_ENTRY, new)
  43. @classmethod
  44. def delete(cls, old):
  45. return cls(CHANGE_DELETE, old, _NULL_ENTRY)
  46. def _tree_entries(path, tree):
  47. result = []
  48. if not tree:
  49. return result
  50. for entry in tree.iteritems(name_order=True):
  51. result.append(entry.in_path(path))
  52. return result
  53. def _merge_entries(path, tree1, tree2):
  54. """Merge the entries of two trees.
  55. :param path: A path to prepend to all tree entry names.
  56. :param tree1: The first Tree object to iterate, or None.
  57. :param tree2: The second Tree object to iterate, or None.
  58. :return: A list of pairs of TreeEntry objects for each pair of entries in
  59. the trees. If an entry exists in one tree but not the other, the other
  60. entry will have all attributes set to None. If neither entry's path is
  61. None, they are guaranteed to match.
  62. """
  63. entries1 = _tree_entries(path, tree1)
  64. entries2 = _tree_entries(path, tree2)
  65. i1 = i2 = 0
  66. len1 = len(entries1)
  67. len2 = len(entries2)
  68. result = []
  69. while i1 < len1 and i2 < len2:
  70. entry1 = entries1[i1]
  71. entry2 = entries2[i2]
  72. if entry1.path < entry2.path:
  73. result.append((entry1, _NULL_ENTRY))
  74. i1 += 1
  75. elif entry1.path > entry2.path:
  76. result.append((_NULL_ENTRY, entry2))
  77. i2 += 1
  78. else:
  79. result.append((entry1, entry2))
  80. i1 += 1
  81. i2 += 1
  82. for i in xrange(i1, len1):
  83. result.append((entries1[i], _NULL_ENTRY))
  84. for i in xrange(i2, len2):
  85. result.append((_NULL_ENTRY, entries2[i]))
  86. return result
  87. def walk_trees(store, tree1_id, tree2_id, prune_identical=False):
  88. """Recursively walk all the entries of two trees.
  89. Iteration is depth-first pre-order, as in e.g. os.walk.
  90. :param store: An ObjectStore for looking up objects.
  91. :param tree1_id: The SHA of the first Tree object to iterate, or None.
  92. :param tree2_id: The SHA of the second Tree object to iterate, or None.
  93. :param prune_identical: If True, identical subtrees will not be walked.
  94. :yield: Pairs of TreeEntry objects for each pair of entries in the trees and
  95. their subtrees recursively. If an entry exists in one tree but not the
  96. other, the other entry will have all attributes set to None. If neither
  97. entry's path is None, they are guaranteed to match.
  98. """
  99. # This could be fairly easily generalized to >2 trees if we find a use case.
  100. mode1 = tree1_id and stat.S_IFDIR or None
  101. mode2 = tree2_id and stat.S_IFDIR or None
  102. todo = [(TreeEntry('', mode1, tree1_id), TreeEntry('', mode2, tree2_id))]
  103. while todo:
  104. entry1, entry2 = todo.pop()
  105. is_tree1 = entry1.mode and stat.S_ISDIR(entry1.mode)
  106. is_tree2 = entry2.mode and stat.S_ISDIR(entry2.mode)
  107. if prune_identical and is_tree1 and is_tree2 and entry1 == entry2:
  108. continue
  109. tree1 = is_tree1 and store[entry1.sha] or None
  110. tree2 = is_tree2 and store[entry2.sha] or None
  111. path = entry1.path or entry2.path
  112. todo.extend(reversed(_merge_entries(path, tree1, tree2)))
  113. yield entry1, entry2
  114. def _skip_tree(entry):
  115. if entry.mode is None or stat.S_ISDIR(entry.mode):
  116. return _NULL_ENTRY
  117. return entry
  118. def tree_changes(store, tree1_id, tree2_id, want_unchanged=False):
  119. """Find the differences between the contents of two trees.
  120. :param store: An ObjectStore for looking up objects.
  121. :param tree1_id: The SHA of the source tree.
  122. :param tree2_id: The SHA of the target tree.
  123. :param want_unchanged: If True, include TreeChanges for unmodified entries
  124. as well.
  125. :yield: TreeChange instances for each change between the source and target
  126. tree.
  127. """
  128. entries = walk_trees(store, tree1_id, tree2_id,
  129. prune_identical=(not want_unchanged))
  130. for entry1, entry2 in entries:
  131. if entry1 == entry2 and not want_unchanged:
  132. continue
  133. # Treat entries for trees as missing.
  134. entry1 = _skip_tree(entry1)
  135. entry2 = _skip_tree(entry2)
  136. if entry1 != _NULL_ENTRY and entry2 != _NULL_ENTRY:
  137. if stat.S_IFMT(entry1.mode) != stat.S_IFMT(entry2.mode):
  138. # File type changed: report as delete/add.
  139. yield TreeChange.delete(entry1)
  140. entry1 = _NULL_ENTRY
  141. change_type = CHANGE_ADD
  142. elif entry1 == entry2:
  143. change_type = CHANGE_UNCHANGED
  144. else:
  145. change_type = CHANGE_MODIFY
  146. elif entry1 != _NULL_ENTRY:
  147. change_type = CHANGE_DELETE
  148. elif entry2 != _NULL_ENTRY:
  149. change_type = CHANGE_ADD
  150. else:
  151. # Both were None because at least one was a tree.
  152. continue
  153. yield TreeChange(change_type, entry1, entry2)
  154. _BLOCK_SIZE = 64
  155. def _count_blocks(obj):
  156. """Count the blocks in an object.
  157. Splits the data into blocks either on lines or <=64-byte chunks of lines.
  158. :param obj: The object to count blocks for.
  159. :return: A dict of block -> number of occurrences.
  160. """
  161. block_counts = defaultdict(int)
  162. block = StringIO()
  163. for c in itertools.chain(*obj.as_raw_chunks()):
  164. block.write(c)
  165. if c == '\n' or block.tell() == _BLOCK_SIZE:
  166. block_counts[block.getvalue()] += 1
  167. block.seek(0)
  168. block.truncate()
  169. last_block = block.getvalue()
  170. if last_block:
  171. block_counts[last_block] += 1
  172. return block_counts
  173. def _common_bytes(blocks1, blocks2):
  174. """Count the number of common bytes in two block count dicts."""
  175. # Iterate over the smaller of the two dicts, since this is symmetrical.
  176. if len(blocks1) > len(blocks2):
  177. blocks1, blocks2 = blocks2, blocks1
  178. score = 0
  179. for block, count1 in blocks1.iteritems():
  180. count2 = blocks2.get(block)
  181. if count2:
  182. score += min(count1, count2) * len(block)
  183. return score
  184. def _similarity_score(obj1, obj2, block_cache=None):
  185. """Compute a similarity score for two objects.
  186. :param obj1: The first object to score.
  187. :param obj2: The second object to score.
  188. :param block_cache: An optional dict of SHA to block counts to cache results
  189. between calls.
  190. :return: The similarity score between the two objects, defined as the number
  191. of bytes in common between the two objects divided by the maximum size,
  192. scaled to the range 0-100.
  193. """
  194. if block_cache is None:
  195. block_cache = {}
  196. if obj1.id not in block_cache:
  197. block_cache[obj1.id] = _count_blocks(obj1)
  198. if obj2.id not in block_cache:
  199. block_cache[obj2.id] = _count_blocks(obj2)
  200. common_bytes = _common_bytes(block_cache[obj1.id], block_cache[obj2.id])
  201. max_size = max(obj1.raw_length(), obj2.raw_length())
  202. if not max_size:
  203. return _MAX_SCORE
  204. return int(float(common_bytes) * _MAX_SCORE / max_size)
  205. def _tree_change_key(entry):
  206. # Sort by old path then new path. If only one exists, use it for both keys.
  207. path1 = entry.old.path
  208. path2 = entry.new.path
  209. if path1 is None:
  210. path1 = path2
  211. if path2 is None:
  212. path2 = path1
  213. return (path1, path2)