walk.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436
  1. # walk.py -- General implementation of walking commits and their contents.
  2. # Copyright (C) 2010 Google, Inc.
  3. #
  4. # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
  5. # General Public License as public by the Free Software Foundation; version 2.0
  6. # or (at your option) any later version. You can redistribute it and/or
  7. # modify it under the terms of either of these two licenses.
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. #
  15. # You should have received a copy of the licenses; if not, see
  16. # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  17. # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  18. # License, Version 2.0.
  19. #
  20. """General implementation of walking commits and their contents."""
  21. import collections
  22. import heapq
  23. from itertools import chain
  24. from dulwich.diff_tree import (
  25. RENAME_CHANGE_TYPES,
  26. tree_changes,
  27. tree_changes_for_merge,
  28. RenameDetector,
  29. )
  30. from dulwich.errors import (
  31. MissingCommitError,
  32. )
  33. from dulwich.objects import (
  34. Tag,
  35. )
  36. ORDER_DATE = "date"
  37. ORDER_TOPO = "topo"
  38. ALL_ORDERS = (ORDER_DATE, ORDER_TOPO)
  39. # Maximum number of commits to walk past a commit time boundary.
  40. _MAX_EXTRA_COMMITS = 5
  41. class WalkEntry(object):
  42. """Object encapsulating a single result from a walk."""
  43. def __init__(self, walker, commit):
  44. self.commit = commit
  45. self._store = walker.store
  46. self._get_parents = walker.get_parents
  47. self._changes = {}
  48. self._rename_detector = walker.rename_detector
  49. def changes(self, path_prefix=None):
  50. """Get the tree changes for this entry.
  51. Args:
  52. path_prefix: Portion of the path in the repository to
  53. use to filter changes. Must be a directory name. Must be
  54. a full, valid, path reference (no partial names or wildcards).
  55. Returns: For commits with up to one parent, a list of TreeChange
  56. objects; if the commit has no parents, these will be relative to
  57. the empty tree. For merge commits, a list of lists of TreeChange
  58. objects; see dulwich.diff.tree_changes_for_merge.
  59. """
  60. cached = self._changes.get(path_prefix)
  61. if cached is None:
  62. commit = self.commit
  63. if not self._get_parents(commit):
  64. changes_func = tree_changes
  65. parent = None
  66. elif len(self._get_parents(commit)) == 1:
  67. changes_func = tree_changes
  68. parent = self._store[self._get_parents(commit)[0]].tree
  69. if path_prefix:
  70. mode, subtree_sha = parent.lookup_path(
  71. self._store.__getitem__,
  72. path_prefix,
  73. )
  74. parent = self._store[subtree_sha]
  75. else:
  76. changes_func = tree_changes_for_merge
  77. parent = [self._store[p].tree for p in self._get_parents(commit)]
  78. if path_prefix:
  79. parent_trees = [self._store[p] for p in parent]
  80. parent = []
  81. for p in parent_trees:
  82. try:
  83. mode, st = p.lookup_path(
  84. self._store.__getitem__,
  85. path_prefix,
  86. )
  87. except KeyError:
  88. pass
  89. else:
  90. parent.append(st)
  91. commit_tree_sha = commit.tree
  92. if path_prefix:
  93. commit_tree = self._store[commit_tree_sha]
  94. mode, commit_tree_sha = commit_tree.lookup_path(
  95. self._store.__getitem__,
  96. path_prefix,
  97. )
  98. cached = list(
  99. changes_func(
  100. self._store,
  101. parent,
  102. commit_tree_sha,
  103. rename_detector=self._rename_detector,
  104. )
  105. )
  106. self._changes[path_prefix] = cached
  107. return self._changes[path_prefix]
  108. def __repr__(self):
  109. return "<WalkEntry commit=%s, changes=%r>" % (
  110. self.commit.id,
  111. self.changes(),
  112. )
  113. class _CommitTimeQueue(object):
  114. """Priority queue of WalkEntry objects by commit time."""
  115. def __init__(self, walker):
  116. self._walker = walker
  117. self._store = walker.store
  118. self._get_parents = walker.get_parents
  119. self._excluded = walker.excluded
  120. self._pq = []
  121. self._pq_set = set()
  122. self._seen = set()
  123. self._done = set()
  124. self._min_time = walker.since
  125. self._last = None
  126. self._extra_commits_left = _MAX_EXTRA_COMMITS
  127. self._is_finished = False
  128. for commit_id in chain(walker.include, walker.excluded):
  129. self._push(commit_id)
  130. def _push(self, object_id):
  131. try:
  132. obj = self._store[object_id]
  133. except KeyError:
  134. raise MissingCommitError(object_id)
  135. if isinstance(obj, Tag):
  136. self._push(obj.object[1])
  137. return
  138. # TODO(jelmer): What to do about non-Commit and non-Tag objects?
  139. commit = obj
  140. if commit.id not in self._pq_set and commit.id not in self._done:
  141. heapq.heappush(self._pq, (-commit.commit_time, commit))
  142. self._pq_set.add(commit.id)
  143. self._seen.add(commit.id)
  144. def _exclude_parents(self, commit):
  145. excluded = self._excluded
  146. seen = self._seen
  147. todo = [commit]
  148. while todo:
  149. commit = todo.pop()
  150. for parent in self._get_parents(commit):
  151. if parent not in excluded and parent in seen:
  152. # TODO: This is inefficient unless the object store does
  153. # some caching (which DiskObjectStore currently does not).
  154. # We could either add caching in this class or pass around
  155. # parsed queue entry objects instead of commits.
  156. todo.append(self._store[parent])
  157. excluded.add(parent)
  158. def next(self):
  159. if self._is_finished:
  160. return None
  161. while self._pq:
  162. _, commit = heapq.heappop(self._pq)
  163. sha = commit.id
  164. self._pq_set.remove(sha)
  165. if sha in self._done:
  166. continue
  167. self._done.add(sha)
  168. for parent_id in self._get_parents(commit):
  169. self._push(parent_id)
  170. reset_extra_commits = True
  171. is_excluded = sha in self._excluded
  172. if is_excluded:
  173. self._exclude_parents(commit)
  174. if self._pq and all(c.id in self._excluded for _, c in self._pq):
  175. _, n = self._pq[0]
  176. if self._last and n.commit_time >= self._last.commit_time:
  177. # If the next commit is newer than the last one, we
  178. # need to keep walking in case its parents (which we
  179. # may not have seen yet) are excluded. This gives the
  180. # excluded set a chance to "catch up" while the commit
  181. # is still in the Walker's output queue.
  182. reset_extra_commits = True
  183. else:
  184. reset_extra_commits = False
  185. if self._min_time is not None and commit.commit_time < self._min_time:
  186. # We want to stop walking at min_time, but commits at the
  187. # boundary may be out of order with respect to their parents.
  188. # So we walk _MAX_EXTRA_COMMITS more commits once we hit this
  189. # boundary.
  190. reset_extra_commits = False
  191. if reset_extra_commits:
  192. # We're not at a boundary, so reset the counter.
  193. self._extra_commits_left = _MAX_EXTRA_COMMITS
  194. else:
  195. self._extra_commits_left -= 1
  196. if not self._extra_commits_left:
  197. break
  198. if not is_excluded:
  199. self._last = commit
  200. return WalkEntry(self._walker, commit)
  201. self._is_finished = True
  202. return None
  203. __next__ = next
  204. class Walker(object):
  205. """Object for performing a walk of commits in a store.
  206. Walker objects are initialized with a store and other options and can then
  207. be treated as iterators of Commit objects.
  208. """
  209. def __init__(
  210. self,
  211. store,
  212. include,
  213. exclude=None,
  214. order=ORDER_DATE,
  215. reverse=False,
  216. max_entries=None,
  217. paths=None,
  218. rename_detector=None,
  219. follow=False,
  220. since=None,
  221. until=None,
  222. get_parents=lambda commit: commit.parents,
  223. queue_cls=_CommitTimeQueue,
  224. ):
  225. """Constructor.
  226. Args:
  227. store: ObjectStore instance for looking up objects.
  228. include: Iterable of SHAs of commits to include along with their
  229. ancestors.
  230. exclude: Iterable of SHAs of commits to exclude along with their
  231. ancestors, overriding includes.
  232. order: ORDER_* constant specifying the order of results.
  233. Anything other than ORDER_DATE may result in O(n) memory usage.
  234. reverse: If True, reverse the order of output, requiring O(n)
  235. memory.
  236. max_entries: The maximum number of entries to yield, or None for
  237. no limit.
  238. paths: Iterable of file or subtree paths to show entries for.
  239. rename_detector: diff.RenameDetector object for detecting
  240. renames.
  241. follow: If True, follow path across renames/copies. Forces a
  242. default rename_detector.
  243. since: Timestamp to list commits after.
  244. until: Timestamp to list commits before.
  245. get_parents: Method to retrieve the parents of a commit
  246. queue_cls: A class to use for a queue of commits, supporting the
  247. iterator protocol. The constructor takes a single argument, the
  248. Walker.
  249. """
  250. # Note: when adding arguments to this method, please also update
  251. # dulwich.repo.BaseRepo.get_walker
  252. if order not in ALL_ORDERS:
  253. raise ValueError("Unknown walk order %s" % order)
  254. self.store = store
  255. if isinstance(include, bytes):
  256. # TODO(jelmer): Really, this should require a single type.
  257. # Print deprecation warning here?
  258. include = [include]
  259. self.include = include
  260. self.excluded = set(exclude or [])
  261. self.order = order
  262. self.reverse = reverse
  263. self.max_entries = max_entries
  264. self.paths = paths and set(paths) or None
  265. if follow and not rename_detector:
  266. rename_detector = RenameDetector(store)
  267. self.rename_detector = rename_detector
  268. self.get_parents = get_parents
  269. self.follow = follow
  270. self.since = since
  271. self.until = until
  272. self._num_entries = 0
  273. self._queue = queue_cls(self)
  274. self._out_queue = collections.deque()
  275. def _path_matches(self, changed_path):
  276. if changed_path is None:
  277. return False
  278. for followed_path in self.paths:
  279. if changed_path == followed_path:
  280. return True
  281. if (
  282. changed_path.startswith(followed_path)
  283. and changed_path[len(followed_path)] == b"/"[0]
  284. ):
  285. return True
  286. return False
  287. def _change_matches(self, change):
  288. if not change:
  289. return False
  290. old_path = change.old.path
  291. new_path = change.new.path
  292. if self._path_matches(new_path):
  293. if self.follow and change.type in RENAME_CHANGE_TYPES:
  294. self.paths.add(old_path)
  295. self.paths.remove(new_path)
  296. return True
  297. elif self._path_matches(old_path):
  298. return True
  299. return False
  300. def _should_return(self, entry):
  301. """Determine if a walk entry should be returned..
  302. Args:
  303. entry: The WalkEntry to consider.
  304. Returns: True if the WalkEntry should be returned by this walk, or
  305. False otherwise (e.g. if it doesn't match any requested paths).
  306. """
  307. commit = entry.commit
  308. if self.since is not None and commit.commit_time < self.since:
  309. return False
  310. if self.until is not None and commit.commit_time > self.until:
  311. return False
  312. if commit.id in self.excluded:
  313. return False
  314. if self.paths is None:
  315. return True
  316. if len(self.get_parents(commit)) > 1:
  317. for path_changes in entry.changes():
  318. # For merge commits, only include changes with conflicts for
  319. # this path. Since a rename conflict may include different
  320. # old.paths, we have to check all of them.
  321. for change in path_changes:
  322. if self._change_matches(change):
  323. return True
  324. else:
  325. for change in entry.changes():
  326. if self._change_matches(change):
  327. return True
  328. return None
  329. def _next(self):
  330. max_entries = self.max_entries
  331. while max_entries is None or self._num_entries < max_entries:
  332. entry = next(self._queue)
  333. if entry is not None:
  334. self._out_queue.append(entry)
  335. if entry is None or len(self._out_queue) > _MAX_EXTRA_COMMITS:
  336. if not self._out_queue:
  337. return None
  338. entry = self._out_queue.popleft()
  339. if self._should_return(entry):
  340. self._num_entries += 1
  341. return entry
  342. return None
  343. def _reorder(self, results):
  344. """Possibly reorder a results iterator.
  345. Args:
  346. results: An iterator of WalkEntry objects, in the order returned
  347. from the queue_cls.
  348. Returns: An iterator or list of WalkEntry objects, in the order
  349. required by the Walker.
  350. """
  351. if self.order == ORDER_TOPO:
  352. results = _topo_reorder(results, self.get_parents)
  353. if self.reverse:
  354. results = reversed(list(results))
  355. return results
  356. def __iter__(self):
  357. return iter(self._reorder(iter(self._next, None)))
  358. def _topo_reorder(entries, get_parents=lambda commit: commit.parents):
  359. """Reorder an iterable of entries topologically.
  360. This works best assuming the entries are already in almost-topological
  361. order, e.g. in commit time order.
  362. Args:
  363. entries: An iterable of WalkEntry objects.
  364. get_parents: Optional function for getting the parents of a commit.
  365. Returns: iterator over WalkEntry objects from entries in FIFO order, except
  366. where a parent would be yielded before any of its children.
  367. """
  368. todo = collections.deque()
  369. pending = {}
  370. num_children = collections.defaultdict(int)
  371. for entry in entries:
  372. todo.append(entry)
  373. for p in get_parents(entry.commit):
  374. num_children[p] += 1
  375. while todo:
  376. entry = todo.popleft()
  377. commit = entry.commit
  378. commit_id = commit.id
  379. if num_children[commit_id]:
  380. pending[commit_id] = entry
  381. continue
  382. for parent_id in get_parents(commit):
  383. num_children[parent_id] -= 1
  384. if not num_children[parent_id]:
  385. parent_entry = pending.pop(parent_id, None)
  386. if parent_entry:
  387. todo.appendleft(parent_entry)
  388. yield entry