walk.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441
  1. # walk.py -- General implementation of walking commits and their contents.
  2. # Copyright (C) 2010 Google, Inc.
  3. #
  4. # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
  5. # General Public License as public by the Free Software Foundation; version 2.0
  6. # or (at your option) any later version. You can redistribute it and/or
  7. # modify it under the terms of either of these two licenses.
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. #
  15. # You should have received a copy of the licenses; if not, see
  16. # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  17. # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  18. # License, Version 2.0.
  19. #
  20. """General implementation of walking commits and their contents."""
  21. import collections
  22. import heapq
  23. from itertools import chain
  24. from typing import List, Tuple, Set, Deque, Optional
  25. from dulwich.diff_tree import (
  26. RENAME_CHANGE_TYPES,
  27. tree_changes,
  28. tree_changes_for_merge,
  29. RenameDetector,
  30. )
  31. from dulwich.errors import (
  32. MissingCommitError,
  33. )
  34. from dulwich.objects import (
  35. Commit,
  36. Tag,
  37. ObjectID,
  38. )
  39. ORDER_DATE = "date"
  40. ORDER_TOPO = "topo"
  41. ALL_ORDERS = (ORDER_DATE, ORDER_TOPO)
  42. # Maximum number of commits to walk past a commit time boundary.
  43. _MAX_EXTRA_COMMITS = 5
  44. class WalkEntry:
  45. """Object encapsulating a single result from a walk."""
  46. def __init__(self, walker, commit):
  47. self.commit = commit
  48. self._store = walker.store
  49. self._get_parents = walker.get_parents
  50. self._changes = {}
  51. self._rename_detector = walker.rename_detector
  52. def changes(self, path_prefix=None):
  53. """Get the tree changes for this entry.
  54. Args:
  55. path_prefix: Portion of the path in the repository to
  56. use to filter changes. Must be a directory name. Must be
  57. a full, valid, path reference (no partial names or wildcards).
  58. Returns: For commits with up to one parent, a list of TreeChange
  59. objects; if the commit has no parents, these will be relative to
  60. the empty tree. For merge commits, a list of lists of TreeChange
  61. objects; see dulwich.diff.tree_changes_for_merge.
  62. """
  63. cached = self._changes.get(path_prefix)
  64. if cached is None:
  65. commit = self.commit
  66. if not self._get_parents(commit):
  67. changes_func = tree_changes
  68. parent = None
  69. elif len(self._get_parents(commit)) == 1:
  70. changes_func = tree_changes
  71. parent = self._store[self._get_parents(commit)[0]].tree
  72. if path_prefix:
  73. mode, subtree_sha = parent.lookup_path(
  74. self._store.__getitem__,
  75. path_prefix,
  76. )
  77. parent = self._store[subtree_sha]
  78. else:
  79. changes_func = tree_changes_for_merge
  80. parent = [self._store[p].tree for p in self._get_parents(commit)]
  81. if path_prefix:
  82. parent_trees = [self._store[p] for p in parent]
  83. parent = []
  84. for p in parent_trees:
  85. try:
  86. mode, st = p.lookup_path(
  87. self._store.__getitem__,
  88. path_prefix,
  89. )
  90. except KeyError:
  91. pass
  92. else:
  93. parent.append(st)
  94. commit_tree_sha = commit.tree
  95. if path_prefix:
  96. commit_tree = self._store[commit_tree_sha]
  97. mode, commit_tree_sha = commit_tree.lookup_path(
  98. self._store.__getitem__,
  99. path_prefix,
  100. )
  101. cached = list(
  102. changes_func(
  103. self._store,
  104. parent,
  105. commit_tree_sha,
  106. rename_detector=self._rename_detector,
  107. )
  108. )
  109. self._changes[path_prefix] = cached
  110. return self._changes[path_prefix]
  111. def __repr__(self):
  112. return "<WalkEntry commit={}, changes={!r}>".format(
  113. self.commit.id,
  114. self.changes(),
  115. )
  116. class _CommitTimeQueue:
  117. """Priority queue of WalkEntry objects by commit time."""
  118. def __init__(self, walker: "Walker"):
  119. self._walker = walker
  120. self._store = walker.store
  121. self._get_parents = walker.get_parents
  122. self._excluded = walker.excluded
  123. self._pq: List[Tuple[int, Commit]] = []
  124. self._pq_set: Set[ObjectID] = set()
  125. self._seen: Set[ObjectID] = set()
  126. self._done: Set[ObjectID] = set()
  127. self._min_time = walker.since
  128. self._last = None
  129. self._extra_commits_left = _MAX_EXTRA_COMMITS
  130. self._is_finished = False
  131. for commit_id in chain(walker.include, walker.excluded):
  132. self._push(commit_id)
  133. def _push(self, object_id: bytes):
  134. try:
  135. obj = self._store[object_id]
  136. except KeyError as exc:
  137. raise MissingCommitError(object_id) from exc
  138. if isinstance(obj, Tag):
  139. self._push(obj.object[1])
  140. return
  141. # TODO(jelmer): What to do about non-Commit and non-Tag objects?
  142. commit = obj
  143. if commit.id not in self._pq_set and commit.id not in self._done:
  144. heapq.heappush(self._pq, (-commit.commit_time, commit))
  145. self._pq_set.add(commit.id)
  146. self._seen.add(commit.id)
  147. def _exclude_parents(self, commit):
  148. excluded = self._excluded
  149. seen = self._seen
  150. todo = [commit]
  151. while todo:
  152. commit = todo.pop()
  153. for parent in self._get_parents(commit):
  154. if parent not in excluded and parent in seen:
  155. # TODO: This is inefficient unless the object store does
  156. # some caching (which DiskObjectStore currently does not).
  157. # We could either add caching in this class or pass around
  158. # parsed queue entry objects instead of commits.
  159. todo.append(self._store[parent])
  160. excluded.add(parent)
  161. def next(self):
  162. if self._is_finished:
  163. return None
  164. while self._pq:
  165. _, commit = heapq.heappop(self._pq)
  166. sha = commit.id
  167. self._pq_set.remove(sha)
  168. if sha in self._done:
  169. continue
  170. self._done.add(sha)
  171. for parent_id in self._get_parents(commit):
  172. self._push(parent_id)
  173. reset_extra_commits = True
  174. is_excluded = sha in self._excluded
  175. if is_excluded:
  176. self._exclude_parents(commit)
  177. if self._pq and all(c.id in self._excluded for _, c in self._pq):
  178. _, n = self._pq[0]
  179. if self._last and n.commit_time >= self._last.commit_time:
  180. # If the next commit is newer than the last one, we
  181. # need to keep walking in case its parents (which we
  182. # may not have seen yet) are excluded. This gives the
  183. # excluded set a chance to "catch up" while the commit
  184. # is still in the Walker's output queue.
  185. reset_extra_commits = True
  186. else:
  187. reset_extra_commits = False
  188. if self._min_time is not None and commit.commit_time < self._min_time:
  189. # We want to stop walking at min_time, but commits at the
  190. # boundary may be out of order with respect to their parents.
  191. # So we walk _MAX_EXTRA_COMMITS more commits once we hit this
  192. # boundary.
  193. reset_extra_commits = False
  194. if reset_extra_commits:
  195. # We're not at a boundary, so reset the counter.
  196. self._extra_commits_left = _MAX_EXTRA_COMMITS
  197. else:
  198. self._extra_commits_left -= 1
  199. if not self._extra_commits_left:
  200. break
  201. if not is_excluded:
  202. self._last = commit
  203. return WalkEntry(self._walker, commit)
  204. self._is_finished = True
  205. return None
  206. __next__ = next
  207. class Walker:
  208. """Object for performing a walk of commits in a store.
  209. Walker objects are initialized with a store and other options and can then
  210. be treated as iterators of Commit objects.
  211. """
  212. def __init__(
  213. self,
  214. store,
  215. include: List[bytes],
  216. exclude: Optional[List[bytes]] = None,
  217. order: str = 'date',
  218. reverse: bool = False,
  219. max_entries: Optional[int] = None,
  220. paths: Optional[List[bytes]] = None,
  221. rename_detector: Optional[RenameDetector] = None,
  222. follow: bool = False,
  223. since: Optional[int] = None,
  224. until: Optional[int] = None,
  225. get_parents=lambda commit: commit.parents,
  226. queue_cls=_CommitTimeQueue,
  227. ):
  228. """Constructor.
  229. Args:
  230. store: ObjectStore instance for looking up objects.
  231. include: Iterable of SHAs of commits to include along with their
  232. ancestors.
  233. exclude: Iterable of SHAs of commits to exclude along with their
  234. ancestors, overriding includes.
  235. order: ORDER_* constant specifying the order of results.
  236. Anything other than ORDER_DATE may result in O(n) memory usage.
  237. reverse: If True, reverse the order of output, requiring O(n)
  238. memory.
  239. max_entries: The maximum number of entries to yield, or None for
  240. no limit.
  241. paths: Iterable of file or subtree paths to show entries for.
  242. rename_detector: diff.RenameDetector object for detecting
  243. renames.
  244. follow: If True, follow path across renames/copies. Forces a
  245. default rename_detector.
  246. since: Timestamp to list commits after.
  247. until: Timestamp to list commits before.
  248. get_parents: Method to retrieve the parents of a commit
  249. queue_cls: A class to use for a queue of commits, supporting the
  250. iterator protocol. The constructor takes a single argument, the
  251. Walker.
  252. """
  253. # Note: when adding arguments to this method, please also update
  254. # dulwich.repo.BaseRepo.get_walker
  255. if order not in ALL_ORDERS:
  256. raise ValueError("Unknown walk order %s" % order)
  257. self.store = store
  258. if isinstance(include, bytes):
  259. # TODO(jelmer): Really, this should require a single type.
  260. # Print deprecation warning here?
  261. include = [include]
  262. self.include = include
  263. self.excluded = set(exclude or [])
  264. self.order = order
  265. self.reverse = reverse
  266. self.max_entries = max_entries
  267. self.paths = paths and set(paths) or None
  268. if follow and not rename_detector:
  269. rename_detector = RenameDetector(store)
  270. self.rename_detector = rename_detector
  271. self.get_parents = get_parents
  272. self.follow = follow
  273. self.since = since
  274. self.until = until
  275. self._num_entries = 0
  276. self._queue = queue_cls(self)
  277. self._out_queue: Deque[WalkEntry] = collections.deque()
  278. def _path_matches(self, changed_path):
  279. if changed_path is None:
  280. return False
  281. if self.paths is None:
  282. return True
  283. for followed_path in self.paths:
  284. if changed_path == followed_path:
  285. return True
  286. if (
  287. changed_path.startswith(followed_path)
  288. and changed_path[len(followed_path)] == b"/"[0]
  289. ):
  290. return True
  291. return False
  292. def _change_matches(self, change):
  293. if not change:
  294. return False
  295. old_path = change.old.path
  296. new_path = change.new.path
  297. if self._path_matches(new_path):
  298. if self.follow and change.type in RENAME_CHANGE_TYPES:
  299. self.paths.add(old_path)
  300. self.paths.remove(new_path)
  301. return True
  302. elif self._path_matches(old_path):
  303. return True
  304. return False
  305. def _should_return(self, entry):
  306. """Determine if a walk entry should be returned..
  307. Args:
  308. entry: The WalkEntry to consider.
  309. Returns: True if the WalkEntry should be returned by this walk, or
  310. False otherwise (e.g. if it doesn't match any requested paths).
  311. """
  312. commit = entry.commit
  313. if self.since is not None and commit.commit_time < self.since:
  314. return False
  315. if self.until is not None and commit.commit_time > self.until:
  316. return False
  317. if commit.id in self.excluded:
  318. return False
  319. if self.paths is None:
  320. return True
  321. if len(self.get_parents(commit)) > 1:
  322. for path_changes in entry.changes():
  323. # For merge commits, only include changes with conflicts for
  324. # this path. Since a rename conflict may include different
  325. # old.paths, we have to check all of them.
  326. for change in path_changes:
  327. if self._change_matches(change):
  328. return True
  329. else:
  330. for change in entry.changes():
  331. if self._change_matches(change):
  332. return True
  333. return None
  334. def _next(self):
  335. max_entries = self.max_entries
  336. while max_entries is None or self._num_entries < max_entries:
  337. entry = next(self._queue)
  338. if entry is not None:
  339. self._out_queue.append(entry)
  340. if entry is None or len(self._out_queue) > _MAX_EXTRA_COMMITS:
  341. if not self._out_queue:
  342. return None
  343. entry = self._out_queue.popleft()
  344. if self._should_return(entry):
  345. self._num_entries += 1
  346. return entry
  347. return None
  348. def _reorder(self, results):
  349. """Possibly reorder a results iterator.
  350. Args:
  351. results: An iterator of WalkEntry objects, in the order returned
  352. from the queue_cls.
  353. Returns: An iterator or list of WalkEntry objects, in the order
  354. required by the Walker.
  355. """
  356. if self.order == ORDER_TOPO:
  357. results = _topo_reorder(results, self.get_parents)
  358. if self.reverse:
  359. results = reversed(list(results))
  360. return results
  361. def __iter__(self):
  362. return iter(self._reorder(iter(self._next, None)))
  363. def _topo_reorder(entries, get_parents=lambda commit: commit.parents):
  364. """Reorder an iterable of entries topologically.
  365. This works best assuming the entries are already in almost-topological
  366. order, e.g. in commit time order.
  367. Args:
  368. entries: An iterable of WalkEntry objects.
  369. get_parents: Optional function for getting the parents of a commit.
  370. Returns: iterator over WalkEntry objects from entries in FIFO order, except
  371. where a parent would be yielded before any of its children.
  372. """
  373. todo = collections.deque()
  374. pending = {}
  375. num_children = collections.defaultdict(int)
  376. for entry in entries:
  377. todo.append(entry)
  378. for p in get_parents(entry.commit):
  379. num_children[p] += 1
  380. while todo:
  381. entry = todo.popleft()
  382. commit = entry.commit
  383. commit_id = commit.id
  384. if num_children[commit_id]:
  385. pending[commit_id] = entry
  386. continue
  387. for parent_id in get_parents(commit):
  388. num_children[parent_id] -= 1
  389. if not num_children[parent_id]:
  390. parent_entry = pending.pop(parent_id, None)
  391. if parent_entry:
  392. todo.appendleft(parent_entry)
  393. yield entry