walk.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477
  1. # walk.py -- General implementation of walking commits and their contents.
  2. # Copyright (C) 2010 Google, Inc.
  3. #
  4. # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
  5. # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
  6. # General Public License as published by the Free Software Foundation; version 2.0
  7. # or (at your option) any later version. You can redistribute it and/or
  8. # modify it under the terms of either of these two licenses.
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. # You should have received a copy of the licenses; if not, see
  17. # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  18. # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  19. # License, Version 2.0.
  20. #
  21. """General implementation of walking commits and their contents."""
  22. import collections
  23. import heapq
  24. from collections.abc import Iterator
  25. from itertools import chain
  26. from typing import TYPE_CHECKING, Any, Callable, Optional, Union, cast
  27. if TYPE_CHECKING:
  28. from .object_store import BaseObjectStore
  29. from .diff_tree import (
  30. RENAME_CHANGE_TYPES,
  31. RenameDetector,
  32. TreeChange,
  33. tree_changes,
  34. tree_changes_for_merge,
  35. )
  36. from .errors import MissingCommitError
  37. from .objects import Commit, ObjectID, Tag
# Walk orderings accepted by the ``order`` argument of Walker.
ORDER_DATE = "date"
ORDER_TOPO = "topo"
# Every valid ordering; Walker.__init__ raises ValueError for anything else.
ALL_ORDERS = (ORDER_DATE, ORDER_TOPO)

# Maximum number of commits to walk past a commit time boundary.
# Also used by Walker._next as the number of entries it buffers in its
# output queue before it starts releasing them.
_MAX_EXTRA_COMMITS = 5
  43. class WalkEntry:
  44. """Object encapsulating a single result from a walk."""
  45. def __init__(self, walker: "Walker", commit: Commit) -> None:
  46. self.commit = commit
  47. self._store = walker.store
  48. self._get_parents = walker.get_parents
  49. self._changes: dict[Optional[bytes], list[TreeChange]] = {}
  50. self._rename_detector = walker.rename_detector
  51. def changes(
  52. self, path_prefix: Optional[bytes] = None
  53. ) -> Union[list[TreeChange], list[list[TreeChange]]]:
  54. """Get the tree changes for this entry.
  55. Args:
  56. path_prefix: Portion of the path in the repository to
  57. use to filter changes. Must be a directory name. Must be
  58. a full, valid, path reference (no partial names or wildcards).
  59. Returns: For commits with up to one parent, a list of TreeChange
  60. objects; if the commit has no parents, these will be relative to
  61. the empty tree. For merge commits, a list of lists of TreeChange
  62. objects; see dulwich.diff_tree.tree_changes_for_merge.
  63. """
  64. cached = self._changes.get(path_prefix)
  65. if cached is None:
  66. commit = self.commit
  67. if not self._get_parents(commit):
  68. changes_func = tree_changes
  69. parent = None
  70. elif len(self._get_parents(commit)) == 1:
  71. changes_func = tree_changes
  72. parent = cast(Commit, self._store[self._get_parents(commit)[0]]).tree
  73. if path_prefix:
  74. mode, subtree_sha = parent.lookup_path(
  75. self._store.__getitem__,
  76. path_prefix,
  77. )
  78. parent = self._store[subtree_sha]
  79. else:
  80. # For merge commits, we need to handle multiple parents differently
  81. parent = [
  82. cast(Commit, self._store[p]).tree for p in self._get_parents(commit)
  83. ]
  84. # Use a lambda to adapt the signature
  85. changes_func = cast(
  86. Any,
  87. lambda store,
  88. parent_trees,
  89. tree_id,
  90. rename_detector=None: tree_changes_for_merge(
  91. store, parent_trees, tree_id, rename_detector
  92. ),
  93. )
  94. if path_prefix:
  95. parent_trees = [self._store[p] for p in parent]
  96. parent = []
  97. for p in parent_trees:
  98. try:
  99. from .objects import Tree
  100. assert isinstance(p, Tree)
  101. mode, st = p.lookup_path(
  102. self._store.__getitem__,
  103. path_prefix,
  104. )
  105. except KeyError:
  106. pass
  107. else:
  108. parent.append(st)
  109. commit_tree_sha = commit.tree
  110. if path_prefix:
  111. commit_tree = self._store[commit_tree_sha]
  112. from .objects import Tree
  113. assert isinstance(commit_tree, Tree)
  114. mode, commit_tree_sha = commit_tree.lookup_path(
  115. self._store.__getitem__,
  116. path_prefix,
  117. )
  118. cached = list(
  119. changes_func(
  120. self._store,
  121. parent,
  122. commit_tree_sha,
  123. rename_detector=self._rename_detector,
  124. )
  125. )
  126. self._changes[path_prefix] = cached
  127. return self._changes[path_prefix]
  128. def __repr__(self) -> str:
  129. return f"<WalkEntry commit={self.commit.id.decode('ascii')}, changes={self.changes()!r}>"
class _CommitTimeQueue:
    """Priority queue of WalkEntry objects by commit time."""

    def __init__(self, walker: "Walker") -> None:
        """Seed the queue from the walker's include and exclude sets.

        Args:
          walker: The Walker this queue feeds; its ``store``, ``get_parents``,
            ``excluded``, ``since`` and ``include`` attributes are read.
        Raises:
          MissingCommitError: if an included or excluded id is not in the
            store (raised by ``_push``).
        """
        self._walker = walker
        self._store = walker.store
        self._get_parents = walker.get_parents
        # Shared with the walker: ids added here by _exclude_parents are also
        # visible through walker.excluded.
        self._excluded = walker.excluded
        # Heap of (-commit_time, commit); negating the time makes heapq's
        # min-heap pop the commit with the largest commit_time first.
        self._pq: list[tuple[int, Commit]] = []
        # Ids of the commits currently sitting in the heap.
        self._pq_set: set[ObjectID] = set()
        # Ids of every commit that has ever been pushed onto the heap.
        self._seen: set[ObjectID] = set()
        # Ids of commits that have already been popped and processed.
        self._done: set[ObjectID] = set()
        self._min_time = walker.since
        # Most recent non-excluded commit handed out by next().
        self._last: Optional[Commit] = None
        # How many more "boundary" commits next() may process before it stops.
        self._extra_commits_left = _MAX_EXTRA_COMMITS
        self._is_finished = False

        for commit_id in chain(walker.include, walker.excluded):
            self._push(commit_id)

    def _push(self, object_id: ObjectID) -> None:
        """Queue the commit identified by ``object_id`` unless already handled.

        Tags are followed to the object they point at; objects that are
        neither tags nor commits are ignored.

        Raises:
          MissingCommitError: if ``object_id`` is not present in the store.
        """
        try:
            obj = self._store[object_id]
        except KeyError as exc:
            raise MissingCommitError(object_id) from exc
        if isinstance(obj, Tag):
            # obj.object[1] is used as the id of the tagged object.
            self._push(obj.object[1])
            return
        # TODO(jelmer): What to do about non-Commit and non-Tag objects?
        if not isinstance(obj, Commit):
            return
        commit = obj
        if commit.id not in self._pq_set and commit.id not in self._done:
            heapq.heappush(self._pq, (-commit.commit_time, commit))
            self._pq_set.add(commit.id)
            self._seen.add(commit.id)

    def _exclude_parents(self, commit: Commit) -> None:
        """Mark the parents of ``commit`` as excluded.

        Every direct parent is added to the excluded set. Parents that were
        not excluded before and have already been seen by the queue are
        additionally expanded, so their own parents get the same treatment.
        """
        excluded = self._excluded
        seen = self._seen
        todo = [commit]
        while todo:
            commit = todo.pop()
            for parent in self._get_parents(commit):
                if parent not in excluded and parent in seen:
                    # TODO: This is inefficient unless the object store does
                    # some caching (which DiskObjectStore currently does not).
                    # We could either add caching in this class or pass around
                    # parsed queue entry objects instead of commits.
                    todo.append(cast(Commit, self._store[parent]))
                excluded.add(parent)

    def next(self) -> Optional[WalkEntry]:
        """Return the next non-excluded entry, or None when the walk is over.

        Commits are popped in descending commit_time order and their parents
        are pushed as they are popped. Once None has been returned, every
        later call returns None as well.

        Raises:
          MissingCommitError: if a parent of a popped commit is not in the
            store (raised by ``_push``).
        """
        if self._is_finished:
            return None
        while self._pq:
            _, commit = heapq.heappop(self._pq)
            sha = commit.id
            self._pq_set.remove(sha)
            if sha in self._done:
                continue
            self._done.add(sha)

            for parent_id in self._get_parents(commit):
                self._push(parent_id)

            reset_extra_commits = True
            is_excluded = sha in self._excluded
            if is_excluded:
                self._exclude_parents(commit)
                # Only consider stopping when everything left in the heap is
                # excluded too.
                if self._pq and all(c.id in self._excluded for _, c in self._pq):
                    _, n = self._pq[0]
                    if self._last and n.commit_time >= self._last.commit_time:
                        # If the next commit is newer than the last one, we
                        # need to keep walking in case its parents (which we
                        # may not have seen yet) are excluded. This gives the
                        # excluded set a chance to "catch up" while the commit
                        # is still in the Walker's output queue.
                        reset_extra_commits = True
                    else:
                        reset_extra_commits = False

            if self._min_time is not None and commit.commit_time < self._min_time:
                # We want to stop walking at min_time, but commits at the
                # boundary may be out of order with respect to their parents.
                # So we walk _MAX_EXTRA_COMMITS more commits once we hit this
                # boundary.
                reset_extra_commits = False

            if reset_extra_commits:
                # We're not at a boundary, so reset the counter.
                self._extra_commits_left = _MAX_EXTRA_COMMITS
            else:
                self._extra_commits_left -= 1
                if not self._extra_commits_left:
                    # The allowance of boundary commits is used up: stop.
                    break

            if not is_excluded:
                self._last = commit
                return WalkEntry(self._walker, commit)

        self._is_finished = True
        return None

    # Lets the builtin next() be used on the queue (Walker._next does this).
    __next__ = next
  223. class Walker:
  224. """Object for performing a walk of commits in a store.
  225. Walker objects are initialized with a store and other options and can then
  226. be treated as iterators of Commit objects.
  227. """
  228. def __init__(
  229. self,
  230. store: "BaseObjectStore",
  231. include: list[bytes],
  232. exclude: Optional[list[bytes]] = None,
  233. order: str = "date",
  234. reverse: bool = False,
  235. max_entries: Optional[int] = None,
  236. paths: Optional[list[bytes]] = None,
  237. rename_detector: Optional[RenameDetector] = None,
  238. follow: bool = False,
  239. since: Optional[int] = None,
  240. until: Optional[int] = None,
  241. get_parents: Callable[[Commit], list[bytes]] = lambda commit: commit.parents,
  242. queue_cls: type = _CommitTimeQueue,
  243. ) -> None:
  244. """Constructor.
  245. Args:
  246. store: ObjectStore instance for looking up objects.
  247. include: Iterable of SHAs of commits to include along with their
  248. ancestors.
  249. exclude: Iterable of SHAs of commits to exclude along with their
  250. ancestors, overriding includes.
  251. order: ORDER_* constant specifying the order of results.
  252. Anything other than ORDER_DATE may result in O(n) memory usage.
  253. reverse: If True, reverse the order of output, requiring O(n)
  254. memory.
  255. max_entries: The maximum number of entries to yield, or None for
  256. no limit.
  257. paths: Iterable of file or subtree paths to show entries for.
  258. rename_detector: diff.RenameDetector object for detecting
  259. renames.
  260. follow: If True, follow path across renames/copies. Forces a
  261. default rename_detector.
  262. since: Timestamp to list commits after.
  263. until: Timestamp to list commits before.
  264. get_parents: Method to retrieve the parents of a commit
  265. queue_cls: A class to use for a queue of commits, supporting the
  266. iterator protocol. The constructor takes a single argument, the
  267. Walker.
  268. """
  269. # Note: when adding arguments to this method, please also update
  270. # dulwich.repo.BaseRepo.get_walker
  271. if order not in ALL_ORDERS:
  272. raise ValueError(f"Unknown walk order {order}")
  273. self.store = store
  274. if isinstance(include, bytes):
  275. # TODO(jelmer): Really, this should require a single type.
  276. # Print deprecation warning here?
  277. include = [include]
  278. self.include = include
  279. self.excluded = set(exclude or [])
  280. self.order = order
  281. self.reverse = reverse
  282. self.max_entries = max_entries
  283. self.paths = (paths and set(paths)) or None
  284. if follow and not rename_detector:
  285. rename_detector = RenameDetector(store)
  286. self.rename_detector = rename_detector
  287. self.get_parents = get_parents
  288. self.follow = follow
  289. self.since = since
  290. self.until = until
  291. self._num_entries = 0
  292. self._queue = queue_cls(self)
  293. self._out_queue: collections.deque[WalkEntry] = collections.deque()
  294. def _path_matches(self, changed_path: Optional[bytes]) -> bool:
  295. if changed_path is None:
  296. return False
  297. if self.paths is None:
  298. return True
  299. for followed_path in self.paths:
  300. if changed_path == followed_path:
  301. return True
  302. if (
  303. changed_path.startswith(followed_path)
  304. and changed_path[len(followed_path)] == b"/"[0]
  305. ):
  306. return True
  307. return False
  308. def _change_matches(self, change: TreeChange) -> bool:
  309. assert self.paths
  310. if not change:
  311. return False
  312. old_path = change.old.path
  313. new_path = change.new.path
  314. if self._path_matches(new_path):
  315. if self.follow and change.type in RENAME_CHANGE_TYPES:
  316. self.paths.add(old_path)
  317. self.paths.remove(new_path)
  318. return True
  319. elif self._path_matches(old_path):
  320. return True
  321. return False
  322. def _should_return(self, entry: WalkEntry) -> Optional[bool]:
  323. """Determine if a walk entry should be returned..
  324. Args:
  325. entry: The WalkEntry to consider.
  326. Returns: True if the WalkEntry should be returned by this walk, or
  327. False otherwise (e.g. if it doesn't match any requested paths).
  328. """
  329. commit = entry.commit
  330. if self.since is not None and commit.commit_time < self.since:
  331. return False
  332. if self.until is not None and commit.commit_time > self.until:
  333. return False
  334. if commit.id in self.excluded:
  335. return False
  336. if self.paths is None:
  337. return True
  338. if len(self.get_parents(commit)) > 1:
  339. for path_changes in entry.changes():
  340. # For merge commits, only include changes with conflicts for
  341. # this path. Since a rename conflict may include different
  342. # old.paths, we have to check all of them.
  343. for change in path_changes:
  344. if self._change_matches(change):
  345. return True
  346. else:
  347. changes = entry.changes()
  348. # Handle both list[TreeChange] and list[list[TreeChange]]
  349. if changes and isinstance(changes[0], list):
  350. # It's list[list[TreeChange]], flatten it
  351. for change_list in changes:
  352. for change in change_list:
  353. if self._change_matches(change):
  354. return True
  355. else:
  356. # It's list[TreeChange]
  357. from .diff_tree import TreeChange
  358. for change in changes:
  359. if isinstance(change, TreeChange) and self._change_matches(change):
  360. return True
  361. return None
  362. def _next(self) -> Optional[WalkEntry]:
  363. max_entries = self.max_entries
  364. while max_entries is None or self._num_entries < max_entries:
  365. entry = next(self._queue)
  366. if entry is not None:
  367. self._out_queue.append(entry)
  368. if entry is None or len(self._out_queue) > _MAX_EXTRA_COMMITS:
  369. if not self._out_queue:
  370. return None
  371. entry = self._out_queue.popleft()
  372. if self._should_return(entry):
  373. self._num_entries += 1
  374. return entry
  375. return None
  376. def _reorder(
  377. self, results: Iterator[WalkEntry]
  378. ) -> Union[Iterator[WalkEntry], list[WalkEntry]]:
  379. """Possibly reorder a results iterator.
  380. Args:
  381. results: An iterator of WalkEntry objects, in the order returned
  382. from the queue_cls.
  383. Returns: An iterator or list of WalkEntry objects, in the order
  384. required by the Walker.
  385. """
  386. if self.order == ORDER_TOPO:
  387. results = _topo_reorder(results, self.get_parents)
  388. if self.reverse:
  389. results = reversed(list(results))
  390. return results
  391. def __iter__(self) -> Iterator[WalkEntry]:
  392. return iter(self._reorder(iter(self._next, None)))
  393. def _topo_reorder(
  394. entries: Iterator[WalkEntry],
  395. get_parents: Callable[[Commit], list[bytes]] = lambda commit: commit.parents,
  396. ) -> Iterator[WalkEntry]:
  397. """Reorder an iterable of entries topologically.
  398. This works best assuming the entries are already in almost-topological
  399. order, e.g. in commit time order.
  400. Args:
  401. entries: An iterable of WalkEntry objects.
  402. get_parents: Optional function for getting the parents of a commit.
  403. Returns: iterator over WalkEntry objects from entries in FIFO order, except
  404. where a parent would be yielded before any of its children.
  405. """
  406. todo: collections.deque[WalkEntry] = collections.deque()
  407. pending: dict[bytes, WalkEntry] = {}
  408. num_children: dict[bytes, int] = collections.defaultdict(int)
  409. for entry in entries:
  410. todo.append(entry)
  411. for p in get_parents(entry.commit):
  412. num_children[p] += 1
  413. while todo:
  414. entry = todo.popleft()
  415. commit = entry.commit
  416. commit_id = commit.id
  417. if num_children[commit_id]:
  418. pending[commit_id] = entry
  419. continue
  420. for parent_id in get_parents(commit):
  421. num_children[parent_id] -= 1
  422. if not num_children[parent_id]:
  423. parent_entry = pending.pop(parent_id, None)
  424. if parent_entry:
  425. todo.appendleft(parent_entry)
  426. yield entry