# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
# Copyright (c) 2020 Kevin B. Hendricks, Stratford Ontario Canada
#
# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
# General Public License as published by the Free Software Foundation; version 2.0
# or (at your option) any later version. You can redistribute it and/or
# modify it under the terms of either of these two licenses.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# You should have received a copy of the licenses; if not, see
# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
# License, Version 2.0.

"""Implementation of merge-base following the approach of git."""

from collections.abc import Callable, Iterator, Mapping, Sequence
from heapq import heappop, heappush
from typing import TYPE_CHECKING, Generic, TypeVar

if TYPE_CHECKING:
    from .repo import BaseRepo

from .lru_cache import LRUCache
from .objects import Commit, ObjectID

T = TypeVar("T")


# priority queue using builtin python minheap tools
# why they do not have a builtin maxheap is simply ridiculous but
# liveable with integer time stamps using negation
class WorkList(Generic[T]):
    """Priority queue for commit processing using a min-heap."""

    def __init__(self) -> None:
        """Initialize an empty work list."""
        self.pq: list[tuple[int, T]] = []

    def add(self, item: tuple[int, T]) -> None:
        """Add an item to the work list.

        Args:
          item: Tuple of (timestamp, commit)
        """
        dt, cmt = item
        # negate the timestamp so the stdlib min-heap behaves as a max-heap
        heappush(self.pq, (-dt, cmt))

    def get(self) -> tuple[int, T] | None:
        """Get the highest priority item from the work list.

        Returns:
          Tuple of (timestamp, commit) or None if empty
        """
        # Return None on an empty queue instead of letting heappop raise
        # IndexError, matching the behaviour promised by the docstring.
        if not self.pq:
            return None
        pr, cmt = heappop(self.pq)
        return -pr, cmt

    def iter(self) -> Iterator[tuple[int, T]]:
        """Iterate over items in the work list.

        Yields:
          Tuples of (timestamp, commit)
        """
        for pr, cmt in self.pq:
            yield (-pr, cmt)
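

# Illustrative sketch, not part of the module's API: the negation in add()/get()
# turns the stdlib min-heap into a max-heap keyed on commit time, so the most
# recent commit is always processed first. With hypothetical timestamps:
#
#     wl: WorkList[bytes] = WorkList()
#     wl.add((1700000000, b"newer"))
#     wl.add((1600000000, b"older"))
#     assert wl.get() == (1700000000, b"newer")  # newest commit comes out first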


def _find_lcas(
    lookup_parents: Callable[[ObjectID], list[ObjectID]],
    c1: ObjectID,
    c2s: Sequence[ObjectID],
    lookup_stamp: Callable[[ObjectID], int],
    min_stamp: int = 0,
    shallows: set[ObjectID] | None = None,
) -> list[ObjectID]:
    """Find lowest common ancestors between commits.

    Args:
      lookup_parents: Function to get parent commits
      c1: First commit
      c2s: List of second commits
      lookup_stamp: Function to get commit timestamp
      min_stamp: Minimum timestamp to consider
      shallows: Set of shallow commits

    Returns:
      List of lowest common ancestor commit IDs
    """
    cands = []
    cstates = {}

    # Flags to Record State
    _ANC_OF_1 = 1  # ancestor of commit 1
    _ANC_OF_2 = 2  # ancestor of commit 2
    _DNC = 4  # Do Not Consider
    _LCA = 8  # potential LCA (Lowest Common Ancestor)

    def _has_candidates(
        wlst: WorkList[ObjectID], cstates: Mapping[ObjectID, int]
    ) -> bool:
        """Check if there are any candidate commits in the work list.

        Args:
          wlst: Work list of commits
          cstates: Dictionary of commit states

        Returns:
          True if there are candidates to process
        """
        for dt, cmt in wlst.iter():
            if cmt in cstates:
                if not ((cstates[cmt] & _DNC) == _DNC):
                    return True
        return False

    # initialize the working list states with ancestry info
    # note possibility of c1 being one of c2s should be handled
    wlst: WorkList[bytes] = WorkList()
    cstates[c1] = _ANC_OF_1
    try:
        wlst.add((lookup_stamp(c1), c1))
    except KeyError:
        # If c1 doesn't exist and we have shallow commits, it might be a missing parent
        if shallows is None or not shallows:
            raise
        # For missing commits in shallow repos, use a minimal timestamp
        wlst.add((0, c1))

    for c2 in c2s:
        cflags = cstates.get(c2, 0)
        cstates[c2] = cflags | _ANC_OF_2
        try:
            wlst.add((lookup_stamp(c2), c2))
        except KeyError:
            # If c2 doesn't exist and we have shallow commits, it might be a missing parent
            if shallows is None or not shallows:
                raise
            # For missing commits in shallow repos, use a minimal timestamp
            wlst.add((0, c2))

    # loop while at least one working list commit is still viable (not marked as _DNC)
    # adding any parents to the list in a breadth first manner
    while _has_candidates(wlst, cstates):
        result = wlst.get()
        if result is None:
            break
        dt, cmt = result
        # Look only at ANCESTRY and _DNC flags so that already
        # found _LCAs can still be marked _DNC by lower _LCAS
        cflags = cstates[cmt] & (_ANC_OF_1 | _ANC_OF_2 | _DNC)
        if cflags == (_ANC_OF_1 | _ANC_OF_2):
            # potential common ancestor if not already in candidates add it
            if not (cstates[cmt] & _LCA) == _LCA:
                cstates[cmt] = cstates[cmt] | _LCA
                cands.append((dt, cmt))
            # mark any parents of this node _DNC as all parents
            # would be one generation further removed common ancestors
            cflags = cflags | _DNC
        try:
            parents = lookup_parents(cmt)
        except KeyError:
            # If we can't get parents in a shallow repo, skip this node
            # This is safer than pretending it has no parents
            if shallows is not None and shallows:
                continue
            raise
        if parents:
            for pcmt in parents:
                pflags = cstates.get(pcmt, 0)
                # if this parent was already visited with no new ancestry/flag information
                # do not add it to the working list again
                if (pflags & cflags) == cflags:
                    continue
                try:
                    pdt = lookup_stamp(pcmt)
                except KeyError:
                    # Parent doesn't exist - if we're in a shallow repo, skip it
                    if shallows is not None and shallows:
                        continue
                    raise
                if pdt < min_stamp:
                    continue
                cstates[pcmt] = pflags | cflags
                wlst.add((pdt, pcmt))

    # walk final candidates removing any superseded by _DNC by later lower _LCAs
    # remove any duplicates and sort it so that earliest is first
    results = []
    for dt, cmt in cands:
        if not ((cstates[cmt] & _DNC) == _DNC) and (dt, cmt) not in results:
            results.append((dt, cmt))
    results.sort(key=lambda x: x[0])
    lcas = [cmt for dt, cmt in results]
    return lcas
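

# Worked sketch of the flag arithmetic above, using the literal values of the
# constants local to _find_lcas (_ANC_OF_1 = 1, _ANC_OF_2 = 2, _DNC = 4, _LCA = 8):
#
#     state = 1 | 2        # reached from both sides -> potential common ancestor
#     state |= 8           # _LCA: recorded once as a candidate
#     parent = state | 4   # parents inherit _DNC and are pruned from the results
#     assert parent & 4 == 4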


# actual git sorts these based on commit times
def find_merge_base(repo: "BaseRepo", commit_ids: Sequence[ObjectID]) -> list[ObjectID]:
    """Find lowest common ancestors of commit_ids[0] and *any* of commit_ids[1:].

    Args:
      repo: Repository object
      commit_ids: list of commit ids

    Returns:
      list of lowest common ancestor commit_ids
    """
    cmtcache: LRUCache[ObjectID, Commit] = LRUCache(max_cache=128)
    parents_provider = repo.parents_provider()

    def lookup_stamp(cmtid: ObjectID) -> int:
        if cmtid not in cmtcache:
            obj = repo.object_store[cmtid]
            assert isinstance(obj, Commit)
            cmtcache[cmtid] = obj
        commit_time = cmtcache[cmtid].commit_time
        assert isinstance(commit_time, int)
        return commit_time

    def lookup_parents(cmtid: ObjectID) -> list[ObjectID]:
        commit = None
        if cmtid in cmtcache:
            commit = cmtcache[cmtid]
        # must use parents provider to handle grafts and shallow
        return parents_provider.get_parents(cmtid, commit=commit)

    if not commit_ids:
        return []
    c1 = commit_ids[0]
    if not len(commit_ids) > 1:
        return [c1]
    c2s = list(commit_ids[1:])
    if c1 in c2s:
        return [c1]
    lcas = _find_lcas(
        lookup_parents, c1, c2s, lookup_stamp, shallows=parents_provider.shallows
    )
    return lcas
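

# Usage sketch (hypothetical branch names; assumes a repository opened with
# dulwich.repo.Repo), equivalent to `git merge-base main topic`:
#
#     from dulwich.repo import Repo
#     r = Repo(".")
#     main = r.refs[b"refs/heads/main"]
#     topic = r.refs[b"refs/heads/topic"]
#     bases = find_merge_base(r, [main, topic])  # list of commit SHAs, usually one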


def find_octopus_base(
    repo: "BaseRepo", commit_ids: Sequence[ObjectID]
) -> list[ObjectID]:
    """Find lowest common ancestors of *all* provided commit_ids.

    Args:
      repo: Repository
      commit_ids: list of commit ids

    Returns:
      list of lowest common ancestor commit_ids
    """
    cmtcache: LRUCache[ObjectID, Commit] = LRUCache(max_cache=128)
    parents_provider = repo.parents_provider()

    def lookup_stamp(cmtid: ObjectID) -> int:
        if cmtid not in cmtcache:
            obj = repo.object_store[cmtid]
            assert isinstance(obj, Commit)
            cmtcache[cmtid] = obj
        commit_time = cmtcache[cmtid].commit_time
        assert isinstance(commit_time, int)
        return commit_time

    def lookup_parents(cmtid: ObjectID) -> list[ObjectID]:
        commit = None
        if cmtid in cmtcache:
            commit = cmtcache[cmtid]
        # must use parents provider to handle grafts and shallow
        return parents_provider.get_parents(cmtid, commit=commit)

    if not commit_ids:
        return []
    if len(commit_ids) <= 2:
        return find_merge_base(repo, commit_ids)
    lcas = [commit_ids[0]]
    others = commit_ids[1:]
    for cmt in others:
        next_lcas = []
        for ca in lcas:
            res = _find_lcas(
                lookup_parents,
                cmt,
                [ca],
                lookup_stamp,
                shallows=parents_provider.shallows,
            )
            next_lcas.extend(res)
        lcas = next_lcas[:]
    return lcas
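

# Usage sketch (hypothetical refs, reusing the `r` from the sketch above): with
# three or more heads the base is reduced pairwise, mirroring what an octopus
# merge needs as its common ancestor, as in `git merge-base --octopus`:
#
#     heads = [r.refs[b"refs/heads/" + name] for name in (b"main", b"topic-a", b"topic-b")]
#     octopus_bases = find_octopus_base(r, heads)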


def can_fast_forward(repo: "BaseRepo", c1: bytes, c2: bytes) -> bool:
    """Is it possible to fast-forward from c1 to c2?

    Args:
      repo: Repository to retrieve objects from
      c1: Commit id for first commit
      c2: Commit id for second commit
    """
    cmtcache: LRUCache[ObjectID, Commit] = LRUCache(max_cache=128)
    parents_provider = repo.parents_provider()

    def lookup_stamp(cmtid: ObjectID) -> int:
        if cmtid not in cmtcache:
            obj = repo.object_store[cmtid]
            assert isinstance(obj, Commit)
            cmtcache[cmtid] = obj
        commit_time = cmtcache[cmtid].commit_time
        assert isinstance(commit_time, int)
        return commit_time

    def lookup_parents(cmtid: ObjectID) -> list[ObjectID]:
        commit = None
        if cmtid in cmtcache:
            commit = cmtcache[cmtid]
        # must use parents provider to handle grafts and shallow
        return parents_provider.get_parents(cmtid, commit=commit)

    if c1 == c2:
        return True

    # Algorithm: Find the common ancestor
    try:
        min_stamp = lookup_stamp(c1)
    except KeyError:
        # If c1 doesn't exist in the object store, we can't determine fast-forward
        # This can happen in shallow clones where c1 is a missing parent
        # Check if any shallow commits have c1 as a parent
        if parents_provider.shallows:
            # We're in a shallow repository and c1 doesn't exist
            # We can't determine if fast-forward is possible
            return False
        raise
    lcas = _find_lcas(
        lookup_parents,
        c1,
        [c2],
        lookup_stamp,
        min_stamp=min_stamp,
        shallows=parents_provider.shallows,
    )
    return lcas == [c1]
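

# Usage sketch (reusing the hypothetical `r`, `main` and `topic` above): c1 can
# be fast-forwarded to c2 exactly when c1 is the sole merge base of the pair,
# i.e. c1 is an ancestor of c2.
#
#     if can_fast_forward(r, main, topic):
#         ...  # main can simply be moved to topic; no merge commit is needed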


def independent(repo: "BaseRepo", commit_ids: Sequence[ObjectID]) -> list[ObjectID]:
    """Filter commits to only those that are not reachable from others.

    Args:
      repo: Repository object
      commit_ids: list of commit ids to filter

    Returns:
      list of commit ids that are not ancestors of any other commits in the list
    """
    if not commit_ids:
        return []
    if len(commit_ids) == 1:
        return list(commit_ids)

    # Filter out commits that are ancestors of other commits
    independent_commits = []
    for i, commit_id in enumerate(commit_ids):
        is_independent = True
        # Check if this commit is an ancestor of any other commit
        for j, other_id in enumerate(commit_ids):
            if i == j:
                continue
            # If merge base of (commit_id, other_id) is commit_id,
            # then commit_id is an ancestor of other_id
            merge_bases = find_merge_base(repo, [commit_id, other_id])
            if merge_bases == [commit_id]:
                is_independent = False
                break
        if is_independent:
            independent_commits.append(commit_id)
    return independent_commits
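

# Usage sketch (hypothetical SHAs): mirrors `git merge-base --independent`,
# dropping every commit that is already reachable from another entry in the list.
#
#     heads = [sha_a, sha_b, sha_c]   # suppose sha_b is an ancestor of sha_a
#     independent(r, heads)           # -> [sha_a, sha_c]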