walk.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435
  1. # walk.py -- General implementation of walking commits and their contents.
  2. # Copyright (C) 2010 Google, Inc.
  3. #
  4. # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
  5. # General Public License as public by the Free Software Foundation; version 2.0
  6. # or (at your option) any later version. You can redistribute it and/or
  7. # modify it under the terms of either of these two licenses.
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. #
  15. # You should have received a copy of the licenses; if not, see
  16. # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  17. # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  18. # License, Version 2.0.
  19. #
  20. """General implementation of walking commits and their contents."""
  21. import collections
  22. import heapq
  23. from itertools import chain
  24. from typing import Deque, List, Optional, Set, Tuple
  25. from .diff_tree import (
  26. RENAME_CHANGE_TYPES,
  27. RenameDetector,
  28. tree_changes,
  29. tree_changes_for_merge,
  30. )
  31. from .errors import MissingCommitError
  32. from .objects import Commit, ObjectID, Tag
  33. ORDER_DATE = "date"
  34. ORDER_TOPO = "topo"
  35. ALL_ORDERS = (ORDER_DATE, ORDER_TOPO)
  36. # Maximum number of commits to walk past a commit time boundary.
  37. _MAX_EXTRA_COMMITS = 5
  38. class WalkEntry:
  39. """Object encapsulating a single result from a walk."""
  40. def __init__(self, walker, commit) -> None:
  41. self.commit = commit
  42. self._store = walker.store
  43. self._get_parents = walker.get_parents
  44. self._changes = {}
  45. self._rename_detector = walker.rename_detector
  46. def changes(self, path_prefix=None):
  47. """Get the tree changes for this entry.
  48. Args:
  49. path_prefix: Portion of the path in the repository to
  50. use to filter changes. Must be a directory name. Must be
  51. a full, valid, path reference (no partial names or wildcards).
  52. Returns: For commits with up to one parent, a list of TreeChange
  53. objects; if the commit has no parents, these will be relative to
  54. the empty tree. For merge commits, a list of lists of TreeChange
  55. objects; see dulwich.diff.tree_changes_for_merge.
  56. """
  57. cached = self._changes.get(path_prefix)
  58. if cached is None:
  59. commit = self.commit
  60. if not self._get_parents(commit):
  61. changes_func = tree_changes
  62. parent = None
  63. elif len(self._get_parents(commit)) == 1:
  64. changes_func = tree_changes
  65. parent = self._store[self._get_parents(commit)[0]].tree
  66. if path_prefix:
  67. mode, subtree_sha = parent.lookup_path(
  68. self._store.__getitem__,
  69. path_prefix,
  70. )
  71. parent = self._store[subtree_sha]
  72. else:
  73. changes_func = tree_changes_for_merge
  74. parent = [self._store[p].tree for p in self._get_parents(commit)]
  75. if path_prefix:
  76. parent_trees = [self._store[p] for p in parent]
  77. parent = []
  78. for p in parent_trees:
  79. try:
  80. mode, st = p.lookup_path(
  81. self._store.__getitem__,
  82. path_prefix,
  83. )
  84. except KeyError:
  85. pass
  86. else:
  87. parent.append(st)
  88. commit_tree_sha = commit.tree
  89. if path_prefix:
  90. commit_tree = self._store[commit_tree_sha]
  91. mode, commit_tree_sha = commit_tree.lookup_path(
  92. self._store.__getitem__,
  93. path_prefix,
  94. )
  95. cached = list(
  96. changes_func(
  97. self._store,
  98. parent,
  99. commit_tree_sha,
  100. rename_detector=self._rename_detector,
  101. )
  102. )
  103. self._changes[path_prefix] = cached
  104. return self._changes[path_prefix]
  105. def __repr__(self) -> str:
  106. return "<WalkEntry commit={}, changes={!r}>".format(
  107. self.commit.id,
  108. self.changes(),
  109. )
  110. class _CommitTimeQueue:
  111. """Priority queue of WalkEntry objects by commit time."""
  112. def __init__(self, walker: "Walker") -> None:
  113. self._walker = walker
  114. self._store = walker.store
  115. self._get_parents = walker.get_parents
  116. self._excluded = walker.excluded
  117. self._pq: List[Tuple[int, Commit]] = []
  118. self._pq_set: Set[ObjectID] = set()
  119. self._seen: Set[ObjectID] = set()
  120. self._done: Set[ObjectID] = set()
  121. self._min_time = walker.since
  122. self._last = None
  123. self._extra_commits_left = _MAX_EXTRA_COMMITS
  124. self._is_finished = False
  125. for commit_id in chain(walker.include, walker.excluded):
  126. self._push(commit_id)
  127. def _push(self, object_id: bytes):
  128. try:
  129. obj = self._store[object_id]
  130. except KeyError as exc:
  131. raise MissingCommitError(object_id) from exc
  132. if isinstance(obj, Tag):
  133. self._push(obj.object[1])
  134. return
  135. # TODO(jelmer): What to do about non-Commit and non-Tag objects?
  136. commit = obj
  137. if commit.id not in self._pq_set and commit.id not in self._done:
  138. heapq.heappush(self._pq, (-commit.commit_time, commit))
  139. self._pq_set.add(commit.id)
  140. self._seen.add(commit.id)
  141. def _exclude_parents(self, commit):
  142. excluded = self._excluded
  143. seen = self._seen
  144. todo = [commit]
  145. while todo:
  146. commit = todo.pop()
  147. for parent in self._get_parents(commit):
  148. if parent not in excluded and parent in seen:
  149. # TODO: This is inefficient unless the object store does
  150. # some caching (which DiskObjectStore currently does not).
  151. # We could either add caching in this class or pass around
  152. # parsed queue entry objects instead of commits.
  153. todo.append(self._store[parent])
  154. excluded.add(parent)
  155. def next(self):
  156. if self._is_finished:
  157. return None
  158. while self._pq:
  159. _, commit = heapq.heappop(self._pq)
  160. sha = commit.id
  161. self._pq_set.remove(sha)
  162. if sha in self._done:
  163. continue
  164. self._done.add(sha)
  165. for parent_id in self._get_parents(commit):
  166. self._push(parent_id)
  167. reset_extra_commits = True
  168. is_excluded = sha in self._excluded
  169. if is_excluded:
  170. self._exclude_parents(commit)
  171. if self._pq and all(c.id in self._excluded for _, c in self._pq):
  172. _, n = self._pq[0]
  173. if self._last and n.commit_time >= self._last.commit_time:
  174. # If the next commit is newer than the last one, we
  175. # need to keep walking in case its parents (which we
  176. # may not have seen yet) are excluded. This gives the
  177. # excluded set a chance to "catch up" while the commit
  178. # is still in the Walker's output queue.
  179. reset_extra_commits = True
  180. else:
  181. reset_extra_commits = False
  182. if self._min_time is not None and commit.commit_time < self._min_time:
  183. # We want to stop walking at min_time, but commits at the
  184. # boundary may be out of order with respect to their parents.
  185. # So we walk _MAX_EXTRA_COMMITS more commits once we hit this
  186. # boundary.
  187. reset_extra_commits = False
  188. if reset_extra_commits:
  189. # We're not at a boundary, so reset the counter.
  190. self._extra_commits_left = _MAX_EXTRA_COMMITS
  191. else:
  192. self._extra_commits_left -= 1
  193. if not self._extra_commits_left:
  194. break
  195. if not is_excluded:
  196. self._last = commit
  197. return WalkEntry(self._walker, commit)
  198. self._is_finished = True
  199. return None
  200. __next__ = next
  201. class Walker:
  202. """Object for performing a walk of commits in a store.
  203. Walker objects are initialized with a store and other options and can then
  204. be treated as iterators of Commit objects.
  205. """
  206. def __init__(
  207. self,
  208. store,
  209. include: List[bytes],
  210. exclude: Optional[List[bytes]] = None,
  211. order: str = 'date',
  212. reverse: bool = False,
  213. max_entries: Optional[int] = None,
  214. paths: Optional[List[bytes]] = None,
  215. rename_detector: Optional[RenameDetector] = None,
  216. follow: bool = False,
  217. since: Optional[int] = None,
  218. until: Optional[int] = None,
  219. get_parents=lambda commit: commit.parents,
  220. queue_cls=_CommitTimeQueue,
  221. ) -> None:
  222. """Constructor.
  223. Args:
  224. store: ObjectStore instance for looking up objects.
  225. include: Iterable of SHAs of commits to include along with their
  226. ancestors.
  227. exclude: Iterable of SHAs of commits to exclude along with their
  228. ancestors, overriding includes.
  229. order: ORDER_* constant specifying the order of results.
  230. Anything other than ORDER_DATE may result in O(n) memory usage.
  231. reverse: If True, reverse the order of output, requiring O(n)
  232. memory.
  233. max_entries: The maximum number of entries to yield, or None for
  234. no limit.
  235. paths: Iterable of file or subtree paths to show entries for.
  236. rename_detector: diff.RenameDetector object for detecting
  237. renames.
  238. follow: If True, follow path across renames/copies. Forces a
  239. default rename_detector.
  240. since: Timestamp to list commits after.
  241. until: Timestamp to list commits before.
  242. get_parents: Method to retrieve the parents of a commit
  243. queue_cls: A class to use for a queue of commits, supporting the
  244. iterator protocol. The constructor takes a single argument, the
  245. Walker.
  246. """
  247. # Note: when adding arguments to this method, please also update
  248. # dulwich.repo.BaseRepo.get_walker
  249. if order not in ALL_ORDERS:
  250. raise ValueError("Unknown walk order %s" % order)
  251. self.store = store
  252. if isinstance(include, bytes):
  253. # TODO(jelmer): Really, this should require a single type.
  254. # Print deprecation warning here?
  255. include = [include]
  256. self.include = include
  257. self.excluded = set(exclude or [])
  258. self.order = order
  259. self.reverse = reverse
  260. self.max_entries = max_entries
  261. self.paths = paths and set(paths) or None
  262. if follow and not rename_detector:
  263. rename_detector = RenameDetector(store)
  264. self.rename_detector = rename_detector
  265. self.get_parents = get_parents
  266. self.follow = follow
  267. self.since = since
  268. self.until = until
  269. self._num_entries = 0
  270. self._queue = queue_cls(self)
  271. self._out_queue: Deque[WalkEntry] = collections.deque()
  272. def _path_matches(self, changed_path):
  273. if changed_path is None:
  274. return False
  275. if self.paths is None:
  276. return True
  277. for followed_path in self.paths:
  278. if changed_path == followed_path:
  279. return True
  280. if (
  281. changed_path.startswith(followed_path)
  282. and changed_path[len(followed_path)] == b"/"[0]
  283. ):
  284. return True
  285. return False
  286. def _change_matches(self, change):
  287. if not change:
  288. return False
  289. old_path = change.old.path
  290. new_path = change.new.path
  291. if self._path_matches(new_path):
  292. if self.follow and change.type in RENAME_CHANGE_TYPES:
  293. self.paths.add(old_path)
  294. self.paths.remove(new_path)
  295. return True
  296. elif self._path_matches(old_path):
  297. return True
  298. return False
  299. def _should_return(self, entry):
  300. """Determine if a walk entry should be returned..
  301. Args:
  302. entry: The WalkEntry to consider.
  303. Returns: True if the WalkEntry should be returned by this walk, or
  304. False otherwise (e.g. if it doesn't match any requested paths).
  305. """
  306. commit = entry.commit
  307. if self.since is not None and commit.commit_time < self.since:
  308. return False
  309. if self.until is not None and commit.commit_time > self.until:
  310. return False
  311. if commit.id in self.excluded:
  312. return False
  313. if self.paths is None:
  314. return True
  315. if len(self.get_parents(commit)) > 1:
  316. for path_changes in entry.changes():
  317. # For merge commits, only include changes with conflicts for
  318. # this path. Since a rename conflict may include different
  319. # old.paths, we have to check all of them.
  320. for change in path_changes:
  321. if self._change_matches(change):
  322. return True
  323. else:
  324. for change in entry.changes():
  325. if self._change_matches(change):
  326. return True
  327. return None
  328. def _next(self):
  329. max_entries = self.max_entries
  330. while max_entries is None or self._num_entries < max_entries:
  331. entry = next(self._queue)
  332. if entry is not None:
  333. self._out_queue.append(entry)
  334. if entry is None or len(self._out_queue) > _MAX_EXTRA_COMMITS:
  335. if not self._out_queue:
  336. return None
  337. entry = self._out_queue.popleft()
  338. if self._should_return(entry):
  339. self._num_entries += 1
  340. return entry
  341. return None
  342. def _reorder(self, results):
  343. """Possibly reorder a results iterator.
  344. Args:
  345. results: An iterator of WalkEntry objects, in the order returned
  346. from the queue_cls.
  347. Returns: An iterator or list of WalkEntry objects, in the order
  348. required by the Walker.
  349. """
  350. if self.order == ORDER_TOPO:
  351. results = _topo_reorder(results, self.get_parents)
  352. if self.reverse:
  353. results = reversed(list(results))
  354. return results
  355. def __iter__(self):
  356. return iter(self._reorder(iter(self._next, None)))
  357. def _topo_reorder(entries, get_parents=lambda commit: commit.parents):
  358. """Reorder an iterable of entries topologically.
  359. This works best assuming the entries are already in almost-topological
  360. order, e.g. in commit time order.
  361. Args:
  362. entries: An iterable of WalkEntry objects.
  363. get_parents: Optional function for getting the parents of a commit.
  364. Returns: iterator over WalkEntry objects from entries in FIFO order, except
  365. where a parent would be yielded before any of its children.
  366. """
  367. todo = collections.deque()
  368. pending = {}
  369. num_children = collections.defaultdict(int)
  370. for entry in entries:
  371. todo.append(entry)
  372. for p in get_parents(entry.commit):
  373. num_children[p] += 1
  374. while todo:
  375. entry = todo.popleft()
  376. commit = entry.commit
  377. commit_id = commit.id
  378. if num_children[commit_id]:
  379. pending[commit_id] = entry
  380. continue
  381. for parent_id in get_parents(commit):
  382. num_children[parent_id] -= 1
  383. if not num_children[parent_id]:
  384. parent_entry = pending.pop(parent_id, None)
  385. if parent_entry:
  386. todo.appendleft(parent_entry)
  387. yield entry