walk.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432
  1. # walk.py -- General implementation of walking commits and their contents.
  2. # Copyright (C) 2010 Google, Inc.
  3. #
  4. # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
  5. # General Public License as public by the Free Software Foundation; version 2.0
  6. # or (at your option) any later version. You can redistribute it and/or
  7. # modify it under the terms of either of these two licenses.
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. #
  15. # You should have received a copy of the licenses; if not, see
  16. # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  17. # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  18. # License, Version 2.0.
  19. #
  20. """General implementation of walking commits and their contents."""
  21. import collections
  22. import heapq
  23. from itertools import chain
  24. from typing import Deque, Dict, List, Optional, Set, Tuple
  25. from .diff_tree import (
  26. RENAME_CHANGE_TYPES,
  27. RenameDetector,
  28. TreeChange,
  29. tree_changes,
  30. tree_changes_for_merge,
  31. )
  32. from .errors import MissingCommitError
  33. from .objects import Commit, ObjectID, Tag
  34. ORDER_DATE = "date"
  35. ORDER_TOPO = "topo"
  36. ALL_ORDERS = (ORDER_DATE, ORDER_TOPO)
  37. # Maximum number of commits to walk past a commit time boundary.
  38. _MAX_EXTRA_COMMITS = 5
  39. class WalkEntry:
  40. """Object encapsulating a single result from a walk."""
  41. def __init__(self, walker, commit) -> None:
  42. self.commit = commit
  43. self._store = walker.store
  44. self._get_parents = walker.get_parents
  45. self._changes: Dict[str, List[TreeChange]] = {}
  46. self._rename_detector = walker.rename_detector
  47. def changes(self, path_prefix=None):
  48. """Get the tree changes for this entry.
  49. Args:
  50. path_prefix: Portion of the path in the repository to
  51. use to filter changes. Must be a directory name. Must be
  52. a full, valid, path reference (no partial names or wildcards).
  53. Returns: For commits with up to one parent, a list of TreeChange
  54. objects; if the commit has no parents, these will be relative to
  55. the empty tree. For merge commits, a list of lists of TreeChange
  56. objects; see dulwich.diff.tree_changes_for_merge.
  57. """
  58. cached = self._changes.get(path_prefix)
  59. if cached is None:
  60. commit = self.commit
  61. if not self._get_parents(commit):
  62. changes_func = tree_changes
  63. parent = None
  64. elif len(self._get_parents(commit)) == 1:
  65. changes_func = tree_changes
  66. parent = self._store[self._get_parents(commit)[0]].tree
  67. if path_prefix:
  68. mode, subtree_sha = parent.lookup_path(
  69. self._store.__getitem__,
  70. path_prefix,
  71. )
  72. parent = self._store[subtree_sha]
  73. else:
  74. changes_func = tree_changes_for_merge
  75. parent = [self._store[p].tree for p in self._get_parents(commit)]
  76. if path_prefix:
  77. parent_trees = [self._store[p] for p in parent]
  78. parent = []
  79. for p in parent_trees:
  80. try:
  81. mode, st = p.lookup_path(
  82. self._store.__getitem__,
  83. path_prefix,
  84. )
  85. except KeyError:
  86. pass
  87. else:
  88. parent.append(st)
  89. commit_tree_sha = commit.tree
  90. if path_prefix:
  91. commit_tree = self._store[commit_tree_sha]
  92. mode, commit_tree_sha = commit_tree.lookup_path(
  93. self._store.__getitem__,
  94. path_prefix,
  95. )
  96. cached = list(
  97. changes_func(
  98. self._store,
  99. parent,
  100. commit_tree_sha,
  101. rename_detector=self._rename_detector,
  102. )
  103. )
  104. self._changes[path_prefix] = cached
  105. return self._changes[path_prefix]
  106. def __repr__(self) -> str:
  107. return f"<WalkEntry commit={self.commit.id}, changes={self.changes()!r}>"
  108. class _CommitTimeQueue:
  109. """Priority queue of WalkEntry objects by commit time."""
  110. def __init__(self, walker: "Walker") -> None:
  111. self._walker = walker
  112. self._store = walker.store
  113. self._get_parents = walker.get_parents
  114. self._excluded = walker.excluded
  115. self._pq: List[Tuple[int, Commit]] = []
  116. self._pq_set: Set[ObjectID] = set()
  117. self._seen: Set[ObjectID] = set()
  118. self._done: Set[ObjectID] = set()
  119. self._min_time = walker.since
  120. self._last = None
  121. self._extra_commits_left = _MAX_EXTRA_COMMITS
  122. self._is_finished = False
  123. for commit_id in chain(walker.include, walker.excluded):
  124. self._push(commit_id)
  125. def _push(self, object_id: bytes):
  126. try:
  127. obj = self._store[object_id]
  128. except KeyError as exc:
  129. raise MissingCommitError(object_id) from exc
  130. if isinstance(obj, Tag):
  131. self._push(obj.object[1])
  132. return
  133. # TODO(jelmer): What to do about non-Commit and non-Tag objects?
  134. commit = obj
  135. if commit.id not in self._pq_set and commit.id not in self._done:
  136. heapq.heappush(self._pq, (-commit.commit_time, commit))
  137. self._pq_set.add(commit.id)
  138. self._seen.add(commit.id)
  139. def _exclude_parents(self, commit):
  140. excluded = self._excluded
  141. seen = self._seen
  142. todo = [commit]
  143. while todo:
  144. commit = todo.pop()
  145. for parent in self._get_parents(commit):
  146. if parent not in excluded and parent in seen:
  147. # TODO: This is inefficient unless the object store does
  148. # some caching (which DiskObjectStore currently does not).
  149. # We could either add caching in this class or pass around
  150. # parsed queue entry objects instead of commits.
  151. todo.append(self._store[parent])
  152. excluded.add(parent)
  153. def next(self):
  154. if self._is_finished:
  155. return None
  156. while self._pq:
  157. _, commit = heapq.heappop(self._pq)
  158. sha = commit.id
  159. self._pq_set.remove(sha)
  160. if sha in self._done:
  161. continue
  162. self._done.add(sha)
  163. for parent_id in self._get_parents(commit):
  164. self._push(parent_id)
  165. reset_extra_commits = True
  166. is_excluded = sha in self._excluded
  167. if is_excluded:
  168. self._exclude_parents(commit)
  169. if self._pq and all(c.id in self._excluded for _, c in self._pq):
  170. _, n = self._pq[0]
  171. if self._last and n.commit_time >= self._last.commit_time:
  172. # If the next commit is newer than the last one, we
  173. # need to keep walking in case its parents (which we
  174. # may not have seen yet) are excluded. This gives the
  175. # excluded set a chance to "catch up" while the commit
  176. # is still in the Walker's output queue.
  177. reset_extra_commits = True
  178. else:
  179. reset_extra_commits = False
  180. if self._min_time is not None and commit.commit_time < self._min_time:
  181. # We want to stop walking at min_time, but commits at the
  182. # boundary may be out of order with respect to their parents.
  183. # So we walk _MAX_EXTRA_COMMITS more commits once we hit this
  184. # boundary.
  185. reset_extra_commits = False
  186. if reset_extra_commits:
  187. # We're not at a boundary, so reset the counter.
  188. self._extra_commits_left = _MAX_EXTRA_COMMITS
  189. else:
  190. self._extra_commits_left -= 1
  191. if not self._extra_commits_left:
  192. break
  193. if not is_excluded:
  194. self._last = commit
  195. return WalkEntry(self._walker, commit)
  196. self._is_finished = True
  197. return None
  198. __next__ = next
  199. class Walker:
  200. """Object for performing a walk of commits in a store.
  201. Walker objects are initialized with a store and other options and can then
  202. be treated as iterators of Commit objects.
  203. """
  204. def __init__(
  205. self,
  206. store,
  207. include: List[bytes],
  208. exclude: Optional[List[bytes]] = None,
  209. order: str = "date",
  210. reverse: bool = False,
  211. max_entries: Optional[int] = None,
  212. paths: Optional[List[bytes]] = None,
  213. rename_detector: Optional[RenameDetector] = None,
  214. follow: bool = False,
  215. since: Optional[int] = None,
  216. until: Optional[int] = None,
  217. get_parents=lambda commit: commit.parents,
  218. queue_cls=_CommitTimeQueue,
  219. ) -> None:
  220. """Constructor.
  221. Args:
  222. store: ObjectStore instance for looking up objects.
  223. include: Iterable of SHAs of commits to include along with their
  224. ancestors.
  225. exclude: Iterable of SHAs of commits to exclude along with their
  226. ancestors, overriding includes.
  227. order: ORDER_* constant specifying the order of results.
  228. Anything other than ORDER_DATE may result in O(n) memory usage.
  229. reverse: If True, reverse the order of output, requiring O(n)
  230. memory.
  231. max_entries: The maximum number of entries to yield, or None for
  232. no limit.
  233. paths: Iterable of file or subtree paths to show entries for.
  234. rename_detector: diff.RenameDetector object for detecting
  235. renames.
  236. follow: If True, follow path across renames/copies. Forces a
  237. default rename_detector.
  238. since: Timestamp to list commits after.
  239. until: Timestamp to list commits before.
  240. get_parents: Method to retrieve the parents of a commit
  241. queue_cls: A class to use for a queue of commits, supporting the
  242. iterator protocol. The constructor takes a single argument, the
  243. Walker.
  244. """
  245. # Note: when adding arguments to this method, please also update
  246. # dulwich.repo.BaseRepo.get_walker
  247. if order not in ALL_ORDERS:
  248. raise ValueError(f"Unknown walk order {order}")
  249. self.store = store
  250. if isinstance(include, bytes):
  251. # TODO(jelmer): Really, this should require a single type.
  252. # Print deprecation warning here?
  253. include = [include]
  254. self.include = include
  255. self.excluded = set(exclude or [])
  256. self.order = order
  257. self.reverse = reverse
  258. self.max_entries = max_entries
  259. self.paths = paths and set(paths) or None
  260. if follow and not rename_detector:
  261. rename_detector = RenameDetector(store)
  262. self.rename_detector = rename_detector
  263. self.get_parents = get_parents
  264. self.follow = follow
  265. self.since = since
  266. self.until = until
  267. self._num_entries = 0
  268. self._queue = queue_cls(self)
  269. self._out_queue: Deque[WalkEntry] = collections.deque()
  270. def _path_matches(self, changed_path):
  271. if changed_path is None:
  272. return False
  273. if self.paths is None:
  274. return True
  275. for followed_path in self.paths:
  276. if changed_path == followed_path:
  277. return True
  278. if (
  279. changed_path.startswith(followed_path)
  280. and changed_path[len(followed_path)] == b"/"[0]
  281. ):
  282. return True
  283. return False
  284. def _change_matches(self, change):
  285. if not change:
  286. return False
  287. old_path = change.old.path
  288. new_path = change.new.path
  289. if self._path_matches(new_path):
  290. if self.follow and change.type in RENAME_CHANGE_TYPES:
  291. self.paths.add(old_path)
  292. self.paths.remove(new_path)
  293. return True
  294. elif self._path_matches(old_path):
  295. return True
  296. return False
  297. def _should_return(self, entry):
  298. """Determine if a walk entry should be returned..
  299. Args:
  300. entry: The WalkEntry to consider.
  301. Returns: True if the WalkEntry should be returned by this walk, or
  302. False otherwise (e.g. if it doesn't match any requested paths).
  303. """
  304. commit = entry.commit
  305. if self.since is not None and commit.commit_time < self.since:
  306. return False
  307. if self.until is not None and commit.commit_time > self.until:
  308. return False
  309. if commit.id in self.excluded:
  310. return False
  311. if self.paths is None:
  312. return True
  313. if len(self.get_parents(commit)) > 1:
  314. for path_changes in entry.changes():
  315. # For merge commits, only include changes with conflicts for
  316. # this path. Since a rename conflict may include different
  317. # old.paths, we have to check all of them.
  318. for change in path_changes:
  319. if self._change_matches(change):
  320. return True
  321. else:
  322. for change in entry.changes():
  323. if self._change_matches(change):
  324. return True
  325. return None
  326. def _next(self):
  327. max_entries = self.max_entries
  328. while max_entries is None or self._num_entries < max_entries:
  329. entry = next(self._queue)
  330. if entry is not None:
  331. self._out_queue.append(entry)
  332. if entry is None or len(self._out_queue) > _MAX_EXTRA_COMMITS:
  333. if not self._out_queue:
  334. return None
  335. entry = self._out_queue.popleft()
  336. if self._should_return(entry):
  337. self._num_entries += 1
  338. return entry
  339. return None
  340. def _reorder(self, results):
  341. """Possibly reorder a results iterator.
  342. Args:
  343. results: An iterator of WalkEntry objects, in the order returned
  344. from the queue_cls.
  345. Returns: An iterator or list of WalkEntry objects, in the order
  346. required by the Walker.
  347. """
  348. if self.order == ORDER_TOPO:
  349. results = _topo_reorder(results, self.get_parents)
  350. if self.reverse:
  351. results = reversed(list(results))
  352. return results
  353. def __iter__(self):
  354. return iter(self._reorder(iter(self._next, None)))
  355. def _topo_reorder(entries, get_parents=lambda commit: commit.parents):
  356. """Reorder an iterable of entries topologically.
  357. This works best assuming the entries are already in almost-topological
  358. order, e.g. in commit time order.
  359. Args:
  360. entries: An iterable of WalkEntry objects.
  361. get_parents: Optional function for getting the parents of a commit.
  362. Returns: iterator over WalkEntry objects from entries in FIFO order, except
  363. where a parent would be yielded before any of its children.
  364. """
  365. todo = collections.deque()
  366. pending = {}
  367. num_children = collections.defaultdict(int)
  368. for entry in entries:
  369. todo.append(entry)
  370. for p in get_parents(entry.commit):
  371. num_children[p] += 1
  372. while todo:
  373. entry = todo.popleft()
  374. commit = entry.commit
  375. commit_id = commit.id
  376. if num_children[commit_id]:
  377. pending[commit_id] = entry
  378. continue
  379. for parent_id in get_parents(commit):
  380. num_children[parent_id] -= 1
  381. if not num_children[parent_id]:
  382. parent_entry = pending.pop(parent_id, None)
  383. if parent_entry:
  384. todo.appendleft(parent_entry)
  385. yield entry