walk.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436
  1. # walk.py -- General implementation of walking commits and their contents.
  2. # Copyright (C) 2010 Google, Inc.
  3. #
  4. # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
  5. # General Public License as public by the Free Software Foundation; version 2.0
  6. # or (at your option) any later version. You can redistribute it and/or
  7. # modify it under the terms of either of these two licenses.
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. #
  15. # You should have received a copy of the licenses; if not, see
  16. # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  17. # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  18. # License, Version 2.0.
  19. #
  20. """General implementation of walking commits and their contents."""
  21. import collections
  22. import heapq
  23. from itertools import chain
  24. from typing import Deque, List, Optional, Set, Tuple, Dict
  25. from .diff_tree import (
  26. RENAME_CHANGE_TYPES,
  27. RenameDetector,
  28. TreeChange,
  29. tree_changes,
  30. tree_changes_for_merge,
  31. )
  32. from .errors import MissingCommitError
  33. from .objects import Commit, ObjectID, Tag
  34. ORDER_DATE = "date"
  35. ORDER_TOPO = "topo"
  36. ALL_ORDERS = (ORDER_DATE, ORDER_TOPO)
  37. # Maximum number of commits to walk past a commit time boundary.
  38. _MAX_EXTRA_COMMITS = 5
  39. class WalkEntry:
  40. """Object encapsulating a single result from a walk."""
  41. def __init__(self, walker, commit) -> None:
  42. self.commit = commit
  43. self._store = walker.store
  44. self._get_parents = walker.get_parents
  45. self._changes: Dict[str, List[TreeChange]] = {}
  46. self._rename_detector = walker.rename_detector
  47. def changes(self, path_prefix=None):
  48. """Get the tree changes for this entry.
  49. Args:
  50. path_prefix: Portion of the path in the repository to
  51. use to filter changes. Must be a directory name. Must be
  52. a full, valid, path reference (no partial names or wildcards).
  53. Returns: For commits with up to one parent, a list of TreeChange
  54. objects; if the commit has no parents, these will be relative to
  55. the empty tree. For merge commits, a list of lists of TreeChange
  56. objects; see dulwich.diff.tree_changes_for_merge.
  57. """
  58. cached = self._changes.get(path_prefix)
  59. if cached is None:
  60. commit = self.commit
  61. if not self._get_parents(commit):
  62. changes_func = tree_changes
  63. parent = None
  64. elif len(self._get_parents(commit)) == 1:
  65. changes_func = tree_changes
  66. parent = self._store[self._get_parents(commit)[0]].tree
  67. if path_prefix:
  68. mode, subtree_sha = parent.lookup_path(
  69. self._store.__getitem__,
  70. path_prefix,
  71. )
  72. parent = self._store[subtree_sha]
  73. else:
  74. changes_func = tree_changes_for_merge
  75. parent = [self._store[p].tree for p in self._get_parents(commit)]
  76. if path_prefix:
  77. parent_trees = [self._store[p] for p in parent]
  78. parent = []
  79. for p in parent_trees:
  80. try:
  81. mode, st = p.lookup_path(
  82. self._store.__getitem__,
  83. path_prefix,
  84. )
  85. except KeyError:
  86. pass
  87. else:
  88. parent.append(st)
  89. commit_tree_sha = commit.tree
  90. if path_prefix:
  91. commit_tree = self._store[commit_tree_sha]
  92. mode, commit_tree_sha = commit_tree.lookup_path(
  93. self._store.__getitem__,
  94. path_prefix,
  95. )
  96. cached = list(
  97. changes_func(
  98. self._store,
  99. parent,
  100. commit_tree_sha,
  101. rename_detector=self._rename_detector,
  102. )
  103. )
  104. self._changes[path_prefix] = cached
  105. return self._changes[path_prefix]
  106. def __repr__(self) -> str:
  107. return "<WalkEntry commit={}, changes={!r}>".format(
  108. self.commit.id,
  109. self.changes(),
  110. )
  111. class _CommitTimeQueue:
  112. """Priority queue of WalkEntry objects by commit time."""
  113. def __init__(self, walker: "Walker") -> None:
  114. self._walker = walker
  115. self._store = walker.store
  116. self._get_parents = walker.get_parents
  117. self._excluded = walker.excluded
  118. self._pq: List[Tuple[int, Commit]] = []
  119. self._pq_set: Set[ObjectID] = set()
  120. self._seen: Set[ObjectID] = set()
  121. self._done: Set[ObjectID] = set()
  122. self._min_time = walker.since
  123. self._last = None
  124. self._extra_commits_left = _MAX_EXTRA_COMMITS
  125. self._is_finished = False
  126. for commit_id in chain(walker.include, walker.excluded):
  127. self._push(commit_id)
  128. def _push(self, object_id: bytes):
  129. try:
  130. obj = self._store[object_id]
  131. except KeyError as exc:
  132. raise MissingCommitError(object_id) from exc
  133. if isinstance(obj, Tag):
  134. self._push(obj.object[1])
  135. return
  136. # TODO(jelmer): What to do about non-Commit and non-Tag objects?
  137. commit = obj
  138. if commit.id not in self._pq_set and commit.id not in self._done:
  139. heapq.heappush(self._pq, (-commit.commit_time, commit))
  140. self._pq_set.add(commit.id)
  141. self._seen.add(commit.id)
  142. def _exclude_parents(self, commit):
  143. excluded = self._excluded
  144. seen = self._seen
  145. todo = [commit]
  146. while todo:
  147. commit = todo.pop()
  148. for parent in self._get_parents(commit):
  149. if parent not in excluded and parent in seen:
  150. # TODO: This is inefficient unless the object store does
  151. # some caching (which DiskObjectStore currently does not).
  152. # We could either add caching in this class or pass around
  153. # parsed queue entry objects instead of commits.
  154. todo.append(self._store[parent])
  155. excluded.add(parent)
  156. def next(self):
  157. if self._is_finished:
  158. return None
  159. while self._pq:
  160. _, commit = heapq.heappop(self._pq)
  161. sha = commit.id
  162. self._pq_set.remove(sha)
  163. if sha in self._done:
  164. continue
  165. self._done.add(sha)
  166. for parent_id in self._get_parents(commit):
  167. self._push(parent_id)
  168. reset_extra_commits = True
  169. is_excluded = sha in self._excluded
  170. if is_excluded:
  171. self._exclude_parents(commit)
  172. if self._pq and all(c.id in self._excluded for _, c in self._pq):
  173. _, n = self._pq[0]
  174. if self._last and n.commit_time >= self._last.commit_time:
  175. # If the next commit is newer than the last one, we
  176. # need to keep walking in case its parents (which we
  177. # may not have seen yet) are excluded. This gives the
  178. # excluded set a chance to "catch up" while the commit
  179. # is still in the Walker's output queue.
  180. reset_extra_commits = True
  181. else:
  182. reset_extra_commits = False
  183. if self._min_time is not None and commit.commit_time < self._min_time:
  184. # We want to stop walking at min_time, but commits at the
  185. # boundary may be out of order with respect to their parents.
  186. # So we walk _MAX_EXTRA_COMMITS more commits once we hit this
  187. # boundary.
  188. reset_extra_commits = False
  189. if reset_extra_commits:
  190. # We're not at a boundary, so reset the counter.
  191. self._extra_commits_left = _MAX_EXTRA_COMMITS
  192. else:
  193. self._extra_commits_left -= 1
  194. if not self._extra_commits_left:
  195. break
  196. if not is_excluded:
  197. self._last = commit
  198. return WalkEntry(self._walker, commit)
  199. self._is_finished = True
  200. return None
  201. __next__ = next
  202. class Walker:
  203. """Object for performing a walk of commits in a store.
  204. Walker objects are initialized with a store and other options and can then
  205. be treated as iterators of Commit objects.
  206. """
  207. def __init__(
  208. self,
  209. store,
  210. include: List[bytes],
  211. exclude: Optional[List[bytes]] = None,
  212. order: str = 'date',
  213. reverse: bool = False,
  214. max_entries: Optional[int] = None,
  215. paths: Optional[List[bytes]] = None,
  216. rename_detector: Optional[RenameDetector] = None,
  217. follow: bool = False,
  218. since: Optional[int] = None,
  219. until: Optional[int] = None,
  220. get_parents=lambda commit: commit.parents,
  221. queue_cls=_CommitTimeQueue,
  222. ) -> None:
  223. """Constructor.
  224. Args:
  225. store: ObjectStore instance for looking up objects.
  226. include: Iterable of SHAs of commits to include along with their
  227. ancestors.
  228. exclude: Iterable of SHAs of commits to exclude along with their
  229. ancestors, overriding includes.
  230. order: ORDER_* constant specifying the order of results.
  231. Anything other than ORDER_DATE may result in O(n) memory usage.
  232. reverse: If True, reverse the order of output, requiring O(n)
  233. memory.
  234. max_entries: The maximum number of entries to yield, or None for
  235. no limit.
  236. paths: Iterable of file or subtree paths to show entries for.
  237. rename_detector: diff.RenameDetector object for detecting
  238. renames.
  239. follow: If True, follow path across renames/copies. Forces a
  240. default rename_detector.
  241. since: Timestamp to list commits after.
  242. until: Timestamp to list commits before.
  243. get_parents: Method to retrieve the parents of a commit
  244. queue_cls: A class to use for a queue of commits, supporting the
  245. iterator protocol. The constructor takes a single argument, the
  246. Walker.
  247. """
  248. # Note: when adding arguments to this method, please also update
  249. # dulwich.repo.BaseRepo.get_walker
  250. if order not in ALL_ORDERS:
  251. raise ValueError("Unknown walk order %s" % order)
  252. self.store = store
  253. if isinstance(include, bytes):
  254. # TODO(jelmer): Really, this should require a single type.
  255. # Print deprecation warning here?
  256. include = [include]
  257. self.include = include
  258. self.excluded = set(exclude or [])
  259. self.order = order
  260. self.reverse = reverse
  261. self.max_entries = max_entries
  262. self.paths = paths and set(paths) or None
  263. if follow and not rename_detector:
  264. rename_detector = RenameDetector(store)
  265. self.rename_detector = rename_detector
  266. self.get_parents = get_parents
  267. self.follow = follow
  268. self.since = since
  269. self.until = until
  270. self._num_entries = 0
  271. self._queue = queue_cls(self)
  272. self._out_queue: Deque[WalkEntry] = collections.deque()
  273. def _path_matches(self, changed_path):
  274. if changed_path is None:
  275. return False
  276. if self.paths is None:
  277. return True
  278. for followed_path in self.paths:
  279. if changed_path == followed_path:
  280. return True
  281. if (
  282. changed_path.startswith(followed_path)
  283. and changed_path[len(followed_path)] == b"/"[0]
  284. ):
  285. return True
  286. return False
  287. def _change_matches(self, change):
  288. if not change:
  289. return False
  290. old_path = change.old.path
  291. new_path = change.new.path
  292. if self._path_matches(new_path):
  293. if self.follow and change.type in RENAME_CHANGE_TYPES:
  294. self.paths.add(old_path)
  295. self.paths.remove(new_path)
  296. return True
  297. elif self._path_matches(old_path):
  298. return True
  299. return False
  300. def _should_return(self, entry):
  301. """Determine if a walk entry should be returned..
  302. Args:
  303. entry: The WalkEntry to consider.
  304. Returns: True if the WalkEntry should be returned by this walk, or
  305. False otherwise (e.g. if it doesn't match any requested paths).
  306. """
  307. commit = entry.commit
  308. if self.since is not None and commit.commit_time < self.since:
  309. return False
  310. if self.until is not None and commit.commit_time > self.until:
  311. return False
  312. if commit.id in self.excluded:
  313. return False
  314. if self.paths is None:
  315. return True
  316. if len(self.get_parents(commit)) > 1:
  317. for path_changes in entry.changes():
  318. # For merge commits, only include changes with conflicts for
  319. # this path. Since a rename conflict may include different
  320. # old.paths, we have to check all of them.
  321. for change in path_changes:
  322. if self._change_matches(change):
  323. return True
  324. else:
  325. for change in entry.changes():
  326. if self._change_matches(change):
  327. return True
  328. return None
  329. def _next(self):
  330. max_entries = self.max_entries
  331. while max_entries is None or self._num_entries < max_entries:
  332. entry = next(self._queue)
  333. if entry is not None:
  334. self._out_queue.append(entry)
  335. if entry is None or len(self._out_queue) > _MAX_EXTRA_COMMITS:
  336. if not self._out_queue:
  337. return None
  338. entry = self._out_queue.popleft()
  339. if self._should_return(entry):
  340. self._num_entries += 1
  341. return entry
  342. return None
  343. def _reorder(self, results):
  344. """Possibly reorder a results iterator.
  345. Args:
  346. results: An iterator of WalkEntry objects, in the order returned
  347. from the queue_cls.
  348. Returns: An iterator or list of WalkEntry objects, in the order
  349. required by the Walker.
  350. """
  351. if self.order == ORDER_TOPO:
  352. results = _topo_reorder(results, self.get_parents)
  353. if self.reverse:
  354. results = reversed(list(results))
  355. return results
  356. def __iter__(self):
  357. return iter(self._reorder(iter(self._next, None)))
  358. def _topo_reorder(entries, get_parents=lambda commit: commit.parents):
  359. """Reorder an iterable of entries topologically.
  360. This works best assuming the entries are already in almost-topological
  361. order, e.g. in commit time order.
  362. Args:
  363. entries: An iterable of WalkEntry objects.
  364. get_parents: Optional function for getting the parents of a commit.
  365. Returns: iterator over WalkEntry objects from entries in FIFO order, except
  366. where a parent would be yielded before any of its children.
  367. """
  368. todo = collections.deque()
  369. pending = {}
  370. num_children = collections.defaultdict(int)
  371. for entry in entries:
  372. todo.append(entry)
  373. for p in get_parents(entry.commit):
  374. num_children[p] += 1
  375. while todo:
  376. entry = todo.popleft()
  377. commit = entry.commit
  378. commit_id = commit.id
  379. if num_children[commit_id]:
  380. pending[commit_id] = entry
  381. continue
  382. for parent_id in get_parents(commit):
  383. num_children[parent_id] -= 1
  384. if not num_children[parent_id]:
  385. parent_entry = pending.pop(parent_id, None)
  386. if parent_entry:
  387. todo.appendleft(parent_entry)
  388. yield entry