walk.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505
  1. # walk.py -- General implementation of walking commits and their contents.
  2. # Copyright (C) 2010 Google, Inc.
  3. #
  4. # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
  5. # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
  6. # General Public License as published by the Free Software Foundation; version 2.0
  7. # or (at your option) any later version. You can redistribute it and/or
  8. # modify it under the terms of either of these two licenses.
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. # You should have received a copy of the licenses; if not, see
  17. # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  18. # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  19. # License, Version 2.0.
  20. #
  21. """General implementation of walking commits and their contents."""
  22. import heapq
  23. from collections import defaultdict, deque
  24. from collections.abc import Callable, Iterator, Sequence
  25. from itertools import chain
  26. from typing import TYPE_CHECKING, Any, cast
  27. if TYPE_CHECKING:
  28. from .object_store import BaseObjectStore
  29. from .diff_tree import (
  30. RENAME_CHANGE_TYPES,
  31. RenameDetector,
  32. TreeChange,
  33. tree_changes,
  34. tree_changes_for_merge,
  35. )
  36. from .errors import MissingCommitError
  37. from .objects import Commit, ObjectID, Tag
  38. ORDER_DATE = "date"
  39. ORDER_TOPO = "topo"
  40. ALL_ORDERS = (ORDER_DATE, ORDER_TOPO)
  41. # Maximum number of commits to walk past a commit time boundary.
  42. _MAX_EXTRA_COMMITS = 5
  43. class WalkEntry:
  44. """Object encapsulating a single result from a walk."""
  45. def __init__(self, walker: "Walker", commit: Commit) -> None:
  46. """Initialize WalkEntry.
  47. Args:
  48. walker: Walker instance that created this entry
  49. commit: Commit object for this entry
  50. """
  51. self.commit = commit
  52. self._store = walker.store
  53. self._get_parents = walker.get_parents
  54. self._changes: dict[bytes | None, list[TreeChange]] = {}
  55. self._rename_detector = walker.rename_detector
  56. def changes(
  57. self, path_prefix: bytes | None = None
  58. ) -> list[TreeChange] | list[list[TreeChange]]:
  59. """Get the tree changes for this entry.
  60. Args:
  61. path_prefix: Portion of the path in the repository to
  62. use to filter changes. Must be a directory name. Must be
  63. a full, valid, path reference (no partial names or wildcards).
  64. Returns: For commits with up to one parent, a list of TreeChange
  65. objects; if the commit has no parents, these will be relative to
  66. the empty tree. For merge commits, a list of lists of TreeChange
  67. objects; see dulwich.diff_tree.tree_changes_for_merge.
  68. """
  69. cached = self._changes.get(path_prefix)
  70. if cached is None:
  71. commit = self.commit
  72. if not self._get_parents(commit):
  73. changes_func = tree_changes
  74. parent = None
  75. elif len(self._get_parents(commit)) == 1:
  76. changes_func = tree_changes
  77. parent_commit = self._store[self._get_parents(commit)[0]]
  78. assert isinstance(parent_commit, Commit)
  79. parent = parent_commit.tree
  80. if path_prefix:
  81. _mode, subtree_sha = parent.lookup_path(
  82. self._store.__getitem__,
  83. path_prefix,
  84. )
  85. parent = self._store[subtree_sha]
  86. else:
  87. # For merge commits, we need to handle multiple parents differently
  88. parent_trees = []
  89. for p in self._get_parents(commit):
  90. parent_commit = self._store[p]
  91. assert isinstance(parent_commit, Commit)
  92. parent_trees.append(parent_commit.tree)
  93. parent = parent_trees
  94. # Use a lambda to adapt the signature
  95. changes_func = cast(
  96. Any,
  97. lambda store,
  98. parent_trees,
  99. tree_id,
  100. rename_detector=None: tree_changes_for_merge(
  101. store, parent_trees, tree_id, rename_detector
  102. ),
  103. )
  104. if path_prefix:
  105. parent_trees = [self._store[p] for p in parent]
  106. parent = []
  107. for p in parent_trees:
  108. try:
  109. from .objects import Tree
  110. assert isinstance(p, Tree)
  111. _mode, st = p.lookup_path(
  112. self._store.__getitem__,
  113. path_prefix,
  114. )
  115. except KeyError:
  116. pass
  117. else:
  118. parent.append(st)
  119. commit_tree_sha = commit.tree
  120. if path_prefix:
  121. commit_tree = self._store[commit_tree_sha]
  122. from .objects import Tree
  123. assert isinstance(commit_tree, Tree)
  124. _mode, commit_tree_sha = commit_tree.lookup_path(
  125. self._store.__getitem__,
  126. path_prefix,
  127. )
  128. cached = list(
  129. changes_func(
  130. self._store,
  131. parent,
  132. commit_tree_sha,
  133. rename_detector=self._rename_detector,
  134. )
  135. )
  136. self._changes[path_prefix] = cached
  137. return self._changes[path_prefix]
  138. def __repr__(self) -> str:
  139. """Return string representation of WalkEntry."""
  140. return f"<WalkEntry commit={self.commit.id.decode('ascii')}, changes={self.changes()!r}>"
  141. class _CommitTimeQueue:
  142. """Priority queue of WalkEntry objects by commit time."""
  143. def __init__(self, walker: "Walker") -> None:
  144. self._walker = walker
  145. self._store = walker.store
  146. self._get_parents = walker.get_parents
  147. self._excluded = walker.excluded
  148. self._pq: list[tuple[int, Commit]] = []
  149. self._pq_set: set[ObjectID] = set()
  150. self._seen: set[ObjectID] = set()
  151. self._done: set[ObjectID] = set()
  152. self._min_time = walker.since
  153. self._last: Commit | None = None
  154. self._extra_commits_left = _MAX_EXTRA_COMMITS
  155. self._is_finished = False
  156. for commit_id in chain(walker.include, walker.excluded):
  157. self._push(commit_id)
  158. def _push(self, object_id: ObjectID) -> None:
  159. try:
  160. obj = self._store[object_id]
  161. except KeyError as exc:
  162. raise MissingCommitError(object_id) from exc
  163. if isinstance(obj, Tag):
  164. self._push(obj.object[1])
  165. return
  166. # TODO(jelmer): What to do about non-Commit and non-Tag objects?
  167. if not isinstance(obj, Commit):
  168. return
  169. commit = obj
  170. if commit.id not in self._pq_set and commit.id not in self._done:
  171. heapq.heappush(self._pq, (-commit.commit_time, commit))
  172. self._pq_set.add(commit.id)
  173. self._seen.add(commit.id)
  174. def _exclude_parents(self, commit: Commit) -> None:
  175. excluded = self._excluded
  176. seen = self._seen
  177. todo = [commit]
  178. while todo:
  179. commit = todo.pop()
  180. for parent in self._get_parents(commit):
  181. if parent not in excluded and parent in seen:
  182. # TODO: This is inefficient unless the object store does
  183. # some caching (which DiskObjectStore currently does not).
  184. # We could either add caching in this class or pass around
  185. # parsed queue entry objects instead of commits.
  186. parent_commit = self._store[parent]
  187. assert isinstance(parent_commit, Commit)
  188. todo.append(parent_commit)
  189. excluded.add(parent)
  190. def next(self) -> WalkEntry | None:
  191. if self._is_finished:
  192. return None
  193. while self._pq:
  194. _, commit = heapq.heappop(self._pq)
  195. sha = commit.id
  196. self._pq_set.remove(sha)
  197. if sha in self._done:
  198. continue
  199. self._done.add(sha)
  200. for parent_id in self._get_parents(commit):
  201. self._push(parent_id)
  202. reset_extra_commits = True
  203. is_excluded = sha in self._excluded
  204. if is_excluded:
  205. self._exclude_parents(commit)
  206. if self._pq and all(c.id in self._excluded for _, c in self._pq):
  207. _, n = self._pq[0]
  208. if self._last and n.commit_time >= self._last.commit_time:
  209. # If the next commit is newer than the last one, we
  210. # need to keep walking in case its parents (which we
  211. # may not have seen yet) are excluded. This gives the
  212. # excluded set a chance to "catch up" while the commit
  213. # is still in the Walker's output queue.
  214. reset_extra_commits = True
  215. else:
  216. reset_extra_commits = False
  217. if self._min_time is not None and commit.commit_time < self._min_time:
  218. # We want to stop walking at min_time, but commits at the
  219. # boundary may be out of order with respect to their parents.
  220. # So we walk _MAX_EXTRA_COMMITS more commits once we hit this
  221. # boundary.
  222. reset_extra_commits = False
  223. if reset_extra_commits:
  224. # We're not at a boundary, so reset the counter.
  225. self._extra_commits_left = _MAX_EXTRA_COMMITS
  226. else:
  227. self._extra_commits_left -= 1
  228. if not self._extra_commits_left:
  229. break
  230. if not is_excluded:
  231. self._last = commit
  232. return WalkEntry(self._walker, commit)
  233. self._is_finished = True
  234. return None
  235. __next__ = next
  236. class Walker:
  237. """Object for performing a walk of commits in a store.
  238. Walker objects are initialized with a store and other options and can then
  239. be treated as iterators of Commit objects.
  240. """
  241. def __init__(
  242. self,
  243. store: "BaseObjectStore",
  244. include: ObjectID | Sequence[ObjectID],
  245. exclude: Sequence[ObjectID] | None = None,
  246. order: str = "date",
  247. reverse: bool = False,
  248. max_entries: int | None = None,
  249. paths: Sequence[bytes] | None = None,
  250. rename_detector: RenameDetector | None = None,
  251. follow: bool = False,
  252. since: int | None = None,
  253. until: int | None = None,
  254. get_parents: Callable[[Commit], list[ObjectID]] = lambda commit: commit.parents,
  255. queue_cls: type = _CommitTimeQueue,
  256. ) -> None:
  257. """Constructor.
  258. Args:
  259. store: ObjectStore instance for looking up objects.
  260. include: Iterable of SHAs of commits to include along with their
  261. ancestors.
  262. exclude: Iterable of SHAs of commits to exclude along with their
  263. ancestors, overriding includes.
  264. order: ORDER_* constant specifying the order of results.
  265. Anything other than ORDER_DATE may result in O(n) memory usage.
  266. reverse: If True, reverse the order of output, requiring O(n)
  267. memory.
  268. max_entries: The maximum number of entries to yield, or None for
  269. no limit.
  270. paths: Iterable of file or subtree paths to show entries for.
  271. rename_detector: diff.RenameDetector object for detecting
  272. renames.
  273. follow: If True, follow path across renames/copies. Forces a
  274. default rename_detector.
  275. since: Timestamp to list commits after.
  276. until: Timestamp to list commits before.
  277. get_parents: Method to retrieve the parents of a commit
  278. queue_cls: A class to use for a queue of commits, supporting the
  279. iterator protocol. The constructor takes a single argument, the
  280. Walker.
  281. """
  282. # Note: when adding arguments to this method, please also update
  283. # dulwich.repo.BaseRepo.get_walker
  284. if order not in ALL_ORDERS:
  285. raise ValueError(f"Unknown walk order {order}")
  286. self.store = store
  287. if isinstance(include, bytes):
  288. # TODO(jelmer): Really, this should require a single type.
  289. # Print deprecation warning here?
  290. include = [include]
  291. self.include = include
  292. self.excluded = set(exclude or [])
  293. self.order = order
  294. self.reverse = reverse
  295. self.max_entries = max_entries
  296. self.paths = (paths and set(paths)) or None
  297. if follow and not rename_detector:
  298. rename_detector = RenameDetector(store)
  299. self.rename_detector = rename_detector
  300. self.get_parents = get_parents
  301. self.follow = follow
  302. self.since = since
  303. self.until = until
  304. self._num_entries = 0
  305. self._queue = queue_cls(self)
  306. self._out_queue: deque[WalkEntry] = deque()
  307. def _path_matches(self, changed_path: bytes | None) -> bool:
  308. if changed_path is None:
  309. return False
  310. if self.paths is None:
  311. return True
  312. for followed_path in self.paths:
  313. if changed_path == followed_path:
  314. return True
  315. if (
  316. changed_path.startswith(followed_path)
  317. and changed_path[len(followed_path)] == b"/"[0]
  318. ):
  319. return True
  320. return False
  321. def _change_matches(self, change: TreeChange) -> bool:
  322. assert self.paths
  323. if not change:
  324. return False
  325. old_path = change.old.path if change.old is not None else None
  326. new_path = change.new.path if change.new is not None else None
  327. if self._path_matches(new_path):
  328. if self.follow and change.type in RENAME_CHANGE_TYPES:
  329. assert old_path is not None
  330. assert new_path is not None
  331. self.paths.add(old_path)
  332. self.paths.remove(new_path)
  333. return True
  334. elif self._path_matches(old_path):
  335. return True
  336. return False
  337. def _should_return(self, entry: WalkEntry) -> bool | None:
  338. """Determine if a walk entry should be returned..
  339. Args:
  340. entry: The WalkEntry to consider.
  341. Returns: True if the WalkEntry should be returned by this walk, or
  342. False otherwise (e.g. if it doesn't match any requested paths).
  343. """
  344. commit = entry.commit
  345. if self.since is not None and commit.commit_time < self.since:
  346. return False
  347. if self.until is not None and commit.commit_time > self.until:
  348. return False
  349. if commit.id in self.excluded:
  350. return False
  351. if self.paths is None:
  352. return True
  353. if len(self.get_parents(commit)) > 1:
  354. changes_result = entry.changes()
  355. # For merge commits, changes() returns list[list[TreeChange]]
  356. assert isinstance(changes_result, list)
  357. for path_changes in changes_result:
  358. # For merge commits, only include changes with conflicts for
  359. # this path. Since a rename conflict may include different
  360. # old.paths, we have to check all of them.
  361. assert isinstance(path_changes, list)
  362. for change in path_changes:
  363. from .diff_tree import TreeChange
  364. assert isinstance(change, TreeChange)
  365. if self._change_matches(change):
  366. return True
  367. else:
  368. changes = entry.changes()
  369. from .diff_tree import TreeChange
  370. # Handle both list[TreeChange] and list[list[TreeChange]]
  371. if changes and isinstance(changes[0], list):
  372. # It's list[list[TreeChange]], flatten it
  373. for change_list in changes:
  374. assert isinstance(change_list, list)
  375. for change in change_list:
  376. assert isinstance(change, TreeChange)
  377. if self._change_matches(change):
  378. return True
  379. else:
  380. # It's list[TreeChange]
  381. assert isinstance(changes, list)
  382. for item in changes:
  383. assert isinstance(item, TreeChange)
  384. if self._change_matches(item):
  385. return True
  386. return None
  387. def _next(self) -> WalkEntry | None:
  388. max_entries = self.max_entries
  389. while max_entries is None or self._num_entries < max_entries:
  390. entry = next(self._queue)
  391. if entry is not None:
  392. self._out_queue.append(entry)
  393. if entry is None or len(self._out_queue) > _MAX_EXTRA_COMMITS:
  394. if not self._out_queue:
  395. return None
  396. entry = self._out_queue.popleft()
  397. if self._should_return(entry):
  398. self._num_entries += 1
  399. return entry
  400. return None
  401. def _reorder(
  402. self, results: Iterator[WalkEntry]
  403. ) -> Iterator[WalkEntry] | list[WalkEntry]:
  404. """Possibly reorder a results iterator.
  405. Args:
  406. results: An iterator of WalkEntry objects, in the order returned
  407. from the queue_cls.
  408. Returns: An iterator or list of WalkEntry objects, in the order
  409. required by the Walker.
  410. """
  411. if self.order == ORDER_TOPO:
  412. results = _topo_reorder(results, self.get_parents)
  413. if self.reverse:
  414. results = reversed(list(results))
  415. return results
  416. def __iter__(self) -> Iterator[WalkEntry]:
  417. """Iterate over walk entries."""
  418. return iter(self._reorder(iter(self._next, None)))
  419. def _topo_reorder(
  420. entries: Iterator[WalkEntry],
  421. get_parents: Callable[[Commit], list[ObjectID]] = lambda commit: commit.parents,
  422. ) -> Iterator[WalkEntry]:
  423. """Reorder an iterable of entries topologically.
  424. This works best assuming the entries are already in almost-topological
  425. order, e.g. in commit time order.
  426. Args:
  427. entries: An iterable of WalkEntry objects.
  428. get_parents: Optional function for getting the parents of a commit.
  429. Returns: iterator over WalkEntry objects from entries in FIFO order, except
  430. where a parent would be yielded before any of its children.
  431. """
  432. todo: deque[WalkEntry] = deque()
  433. pending: dict[bytes, WalkEntry] = {}
  434. num_children: dict[bytes, int] = defaultdict(int)
  435. for entry in entries:
  436. todo.append(entry)
  437. for p in get_parents(entry.commit):
  438. num_children[p] += 1
  439. while todo:
  440. entry = todo.popleft()
  441. commit = entry.commit
  442. commit_id = commit.id
  443. if num_children[commit_id]:
  444. pending[commit_id] = entry
  445. continue
  446. for parent_id in get_parents(commit):
  447. num_children[parent_id] -= 1
  448. if not num_children[parent_id]:
  449. parent_entry = pending.pop(parent_id, None)
  450. if parent_entry:
  451. todo.appendleft(parent_entry)
  452. yield entry