walk.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374
  1. # walk.py -- General implementation of walking commits and their contents.
  2. # Copyright (C) 2010 Google, Inc.
  3. #
  4. # This program is free software; you can redistribute it and/or
  5. # modify it under the terms of the GNU General Public License
  6. # as published by the Free Software Foundation; version 2
  7. # or (at your option) any later version of the License.
  8. #
  9. # This program is distributed in the hope that it will be useful,
  10. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. # GNU General Public License for more details.
  13. #
  14. # You should have received a copy of the GNU General Public License
  15. # along with this program; if not, write to the Free Software
  16. # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  17. # MA 02110-1301, USA.
  18. """General implementation of walking commits and their contents."""
  19. try:
  20. from collections import defaultdict
  21. except ImportError:
  22. from _compat import defaultdict
  23. import collections
  24. import heapq
  25. import itertools
  26. from dulwich._compat import (
  27. all,
  28. )
  29. from dulwich.diff_tree import (
  30. RENAME_CHANGE_TYPES,
  31. tree_changes,
  32. tree_changes_for_merge,
  33. RenameDetector,
  34. )
  35. from dulwich.errors import (
  36. MissingCommitError,
  37. )
  38. ORDER_DATE = 'date'
  39. ORDER_TOPO = 'topo'
  40. ALL_ORDERS = (ORDER_DATE, ORDER_TOPO)
  41. # Maximum number of commits to walk past a commit time boundary.
  42. _MAX_EXTRA_COMMITS = 5
  43. class WalkEntry(object):
  44. """Object encapsulating a single result from a walk."""
  45. def __init__(self, walker, commit):
  46. self.commit = commit
  47. self._store = walker.store
  48. self._get_parents = walker.get_parents
  49. self._changes = None
  50. self._rename_detector = walker.rename_detector
  51. def changes(self):
  52. """Get the tree changes for this entry.
  53. :return: For commits with up to one parent, a list of TreeChange
  54. objects; if the commit has no parents, these will be relative to the
  55. empty tree. For merge commits, a list of lists of TreeChange
  56. objects; see dulwich.diff.tree_changes_for_merge.
  57. """
  58. if self._changes is None:
  59. commit = self.commit
  60. if not self._get_parents(commit):
  61. changes_func = tree_changes
  62. parent = None
  63. elif len(self._get_parents(commit)) == 1:
  64. changes_func = tree_changes
  65. parent = self._store[self._get_parents(commit)[0]].tree
  66. else:
  67. changes_func = tree_changes_for_merge
  68. parent = [self._store[p].tree for p in self._get_parents(commit)]
  69. self._changes = list(changes_func(
  70. self._store, parent, commit.tree,
  71. rename_detector=self._rename_detector))
  72. return self._changes
  73. def __repr__(self):
  74. return '<WalkEntry commit=%s, changes=%r>' % (
  75. self.commit.id, self.changes())
  76. class _CommitTimeQueue(object):
  77. """Priority queue of WalkEntry objects by commit time."""
  78. def __init__(self, walker):
  79. self._walker = walker
  80. self._store = walker.store
  81. self._get_parents = walker.get_parents
  82. self._excluded = walker.excluded
  83. self._pq = []
  84. self._pq_set = set()
  85. self._seen = set()
  86. self._done = set()
  87. self._min_time = walker.since
  88. self._last = None
  89. self._extra_commits_left = _MAX_EXTRA_COMMITS
  90. self._is_finished = False
  91. for commit_id in itertools.chain(walker.include, walker.excluded):
  92. self._push(commit_id)
  93. def _push(self, commit_id):
  94. try:
  95. commit = self._store[commit_id]
  96. except KeyError:
  97. raise MissingCommitError(commit_id)
  98. if commit_id not in self._pq_set and commit_id not in self._done:
  99. heapq.heappush(self._pq, (-commit.commit_time, commit))
  100. self._pq_set.add(commit_id)
  101. self._seen.add(commit_id)
  102. def _exclude_parents(self, commit):
  103. excluded = self._excluded
  104. seen = self._seen
  105. todo = [commit]
  106. while todo:
  107. commit = todo.pop()
  108. for parent in self._get_parents(commit):
  109. if parent not in excluded and parent in seen:
  110. # TODO: This is inefficient unless the object store does
  111. # some caching (which DiskObjectStore currently does not).
  112. # We could either add caching in this class or pass around
  113. # parsed queue entry objects instead of commits.
  114. todo.append(self._store[parent])
  115. excluded.add(parent)
  116. def next(self):
  117. if self._is_finished:
  118. return None
  119. while self._pq:
  120. _, commit = heapq.heappop(self._pq)
  121. sha = commit.id
  122. self._pq_set.remove(sha)
  123. if sha in self._done:
  124. continue
  125. self._done.add(commit.id)
  126. for parent_id in self._get_parents(commit):
  127. self._push(parent_id)
  128. reset_extra_commits = True
  129. is_excluded = sha in self._excluded
  130. if is_excluded:
  131. self._exclude_parents(commit)
  132. if self._pq and all(c.id in self._excluded
  133. for _, c in self._pq):
  134. _, n = self._pq[0]
  135. if self._last and n.commit_time >= self._last.commit_time:
  136. # If the next commit is newer than the last one, we need
  137. # to keep walking in case its parents (which we may not
  138. # have seen yet) are excluded. This gives the excluded
  139. # set a chance to "catch up" while the commit is still
  140. # in the Walker's output queue.
  141. reset_extra_commits = True
  142. else:
  143. reset_extra_commits = False
  144. if (self._min_time is not None and
  145. commit.commit_time < self._min_time):
  146. # We want to stop walking at min_time, but commits at the
  147. # boundary may be out of order with respect to their parents. So
  148. # we walk _MAX_EXTRA_COMMITS more commits once we hit this
  149. # boundary.
  150. reset_extra_commits = False
  151. if reset_extra_commits:
  152. # We're not at a boundary, so reset the counter.
  153. self._extra_commits_left = _MAX_EXTRA_COMMITS
  154. else:
  155. self._extra_commits_left -= 1
  156. if not self._extra_commits_left:
  157. break
  158. if not is_excluded:
  159. self._last = commit
  160. return WalkEntry(self._walker, commit)
  161. self._is_finished = True
  162. return None
  163. class Walker(object):
  164. """Object for performing a walk of commits in a store.
  165. Walker objects are initialized with a store and other options and can then
  166. be treated as iterators of Commit objects.
  167. """
  168. def __init__(self, store, include, exclude=None, order=ORDER_DATE,
  169. reverse=False, max_entries=None, paths=None,
  170. rename_detector=None, follow=False, since=None, until=None,
  171. get_parents=lambda commit: commit.parents,
  172. queue_cls=_CommitTimeQueue):
  173. """Constructor.
  174. :param store: ObjectStore instance for looking up objects.
  175. :param include: Iterable of SHAs of commits to include along with their
  176. ancestors.
  177. :param exclude: Iterable of SHAs of commits to exclude along with their
  178. ancestors, overriding includes.
  179. :param order: ORDER_* constant specifying the order of results. Anything
  180. other than ORDER_DATE may result in O(n) memory usage.
  181. :param reverse: If True, reverse the order of output, requiring O(n)
  182. memory.
  183. :param max_entries: The maximum number of entries to yield, or None for
  184. no limit.
  185. :param paths: Iterable of file or subtree paths to show entries for.
  186. :param rename_detector: diff.RenameDetector object for detecting
  187. renames.
  188. :param follow: If True, follow path across renames/copies. Forces a
  189. default rename_detector.
  190. :param since: Timestamp to list commits after.
  191. :param until: Timestamp to list commits before.
  192. :param get_parents: Method to retrieve the parents of a commit
  193. :param queue_cls: A class to use for a queue of commits, supporting the
  194. iterator protocol. The constructor takes a single argument, the
  195. Walker.
  196. """
  197. # Note: when adding arguments to this method, please also update
  198. # dulwich.repo.BaseRepo.get_walker
  199. if order not in ALL_ORDERS:
  200. raise ValueError('Unknown walk order %s' % order)
  201. self.store = store
  202. self.include = include
  203. self.excluded = set(exclude or [])
  204. self.order = order
  205. self.reverse = reverse
  206. self.max_entries = max_entries
  207. self.paths = paths and set(paths) or None
  208. if follow and not rename_detector:
  209. rename_detector = RenameDetector(store)
  210. self.rename_detector = rename_detector
  211. self.get_parents = get_parents
  212. self.follow = follow
  213. self.since = since
  214. self.until = until
  215. self._num_entries = 0
  216. self._queue = queue_cls(self)
  217. self._out_queue = collections.deque()
  218. def _path_matches(self, changed_path):
  219. if changed_path is None:
  220. return False
  221. for followed_path in self.paths:
  222. if changed_path == followed_path:
  223. return True
  224. if (changed_path.startswith(followed_path) and
  225. changed_path[len(followed_path)] == '/'):
  226. return True
  227. return False
  228. def _change_matches(self, change):
  229. if not change:
  230. return False
  231. old_path = change.old.path
  232. new_path = change.new.path
  233. if self._path_matches(new_path):
  234. if self.follow and change.type in RENAME_CHANGE_TYPES:
  235. self.paths.add(old_path)
  236. self.paths.remove(new_path)
  237. return True
  238. elif self._path_matches(old_path):
  239. return True
  240. return False
  241. def _should_return(self, entry):
  242. """Determine if a walk entry should be returned..
  243. :param entry: The WalkEntry to consider.
  244. :return: True if the WalkEntry should be returned by this walk, or False
  245. otherwise (e.g. if it doesn't match any requested paths).
  246. """
  247. commit = entry.commit
  248. if self.since is not None and commit.commit_time < self.since:
  249. return False
  250. if self.until is not None and commit.commit_time > self.until:
  251. return False
  252. if commit.id in self.excluded:
  253. return False
  254. if self.paths is None:
  255. return True
  256. if len(self.get_parents(commit)) > 1:
  257. for path_changes in entry.changes():
  258. # For merge commits, only include changes with conflicts for
  259. # this path. Since a rename conflict may include different
  260. # old.paths, we have to check all of them.
  261. for change in path_changes:
  262. if self._change_matches(change):
  263. return True
  264. else:
  265. for change in entry.changes():
  266. if self._change_matches(change):
  267. return True
  268. return None
  269. def _next(self):
  270. max_entries = self.max_entries
  271. while max_entries is None or self._num_entries < max_entries:
  272. entry = self._queue.next()
  273. if entry is not None:
  274. self._out_queue.append(entry)
  275. if entry is None or len(self._out_queue) > _MAX_EXTRA_COMMITS:
  276. if not self._out_queue:
  277. return None
  278. entry = self._out_queue.popleft()
  279. if self._should_return(entry):
  280. self._num_entries += 1
  281. return entry
  282. return None
  283. def _reorder(self, results):
  284. """Possibly reorder a results iterator.
  285. :param results: An iterator of WalkEntry objects, in the order returned
  286. from the queue_cls.
  287. :return: An iterator or list of WalkEntry objects, in the order required
  288. by the Walker.
  289. """
  290. if self.order == ORDER_TOPO:
  291. results = _topo_reorder(results, self.get_parents)
  292. if self.reverse:
  293. results = reversed(list(results))
  294. return results
  295. def __iter__(self):
  296. return iter(self._reorder(iter(self._next, None)))
  297. def _topo_reorder(entries, get_parents=lambda commit: commit.parents):
  298. """Reorder an iterable of entries topologically.
  299. This works best assuming the entries are already in almost-topological
  300. order, e.g. in commit time order.
  301. :param entries: An iterable of WalkEntry objects.
  302. :param get_parents: Optional function for getting the parents of a commit.
  303. :return: iterator over WalkEntry objects from entries in FIFO order, except
  304. where a parent would be yielded before any of its children.
  305. """
  306. todo = collections.deque()
  307. pending = {}
  308. num_children = defaultdict(int)
  309. for entry in entries:
  310. todo.append(entry)
  311. for p in get_parents(entry.commit):
  312. num_children[p] += 1
  313. while todo:
  314. entry = todo.popleft()
  315. commit = entry.commit
  316. commit_id = commit.id
  317. if num_children[commit_id]:
  318. pending[commit_id] = entry
  319. continue
  320. for parent_id in get_parents(commit):
  321. num_children[parent_id] -= 1
  322. if not num_children[parent_id]:
  323. parent_entry = pending.pop(parent_id, None)
  324. if parent_entry:
  325. todo.appendleft(parent_entry)
  326. yield entry