bundle.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384
  1. # bundle.py -- Bundle format support
  2. # Copyright (C) 2020 Jelmer Vernooij <jelmer@jelmer.uk>
  3. #
  4. # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
  5. # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
  6. # General Public License as published by the Free Software Foundation; version 2.0
  7. # or (at your option) any later version. You can redistribute it and/or
  8. # modify it under the terms of either of these two licenses.
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. # You should have received a copy of the licenses; if not, see
  17. # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  18. # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  19. # License, Version 2.0.
  20. #
  21. """Bundle format support."""
# Names exported by ``from <this module> import *``; everything else defined
# here (for example the ``_read_bundle`` helper) is private.
__all__ = [
    "Bundle",
    "PackDataLike",
    "create_bundle_from_repo",
    "read_bundle",
    "write_bundle",
]
  29. import types
  30. from collections.abc import Callable, Iterator, Sequence
  31. from typing import (
  32. TYPE_CHECKING,
  33. BinaryIO,
  34. Protocol,
  35. cast,
  36. runtime_checkable,
  37. )
  38. if TYPE_CHECKING:
  39. from .object_format import ObjectFormat
  40. from .objects import ObjectID
  41. from .pack import PackData, UnpackedObject, write_pack_data
  42. from .refs import Ref
  43. @runtime_checkable
  44. class PackDataLike(Protocol):
  45. """Protocol for objects that behave like PackData."""
  46. object_format: "ObjectFormat"
  47. def __len__(self) -> int:
  48. """Return the number of objects in the pack."""
  49. ...
  50. def iter_unpacked(self) -> Iterator[UnpackedObject]:
  51. """Iterate over unpacked objects in the pack."""
  52. ...
  53. def close(self) -> None:
  54. """Close any open resources."""
  55. ...
  56. if TYPE_CHECKING:
  57. from .object_store import BaseObjectStore
  58. from .repo import BaseRepo
  59. class Bundle:
  60. """Git bundle object representation."""
  61. version: int | None
  62. capabilities: dict[str, str | None]
  63. prerequisites: list[tuple[ObjectID, bytes]]
  64. references: dict[Ref, ObjectID]
  65. pack_data: PackDataLike | None
  66. def __repr__(self) -> str:
  67. """Return string representation of Bundle."""
  68. return (
  69. f"<{type(self).__name__}(version={self.version}, "
  70. f"capabilities={self.capabilities}, "
  71. f"prerequisites={self.prerequisites}, "
  72. f"references={self.references})>"
  73. )
  74. def __eq__(self, other: object) -> bool:
  75. """Check equality with another Bundle."""
  76. if not isinstance(other, type(self)):
  77. return False
  78. if self.version != other.version:
  79. return False
  80. if self.capabilities != other.capabilities:
  81. return False
  82. if self.prerequisites != other.prerequisites:
  83. return False
  84. if self.references != other.references:
  85. return False
  86. if self.pack_data != other.pack_data:
  87. return False
  88. return True
  89. def close(self) -> None:
  90. """Close any open resources in this bundle."""
  91. if self.pack_data is not None:
  92. self.pack_data.close()
  93. self.pack_data = None
  94. def __enter__(self) -> "Bundle":
  95. """Enter context manager."""
  96. return self
  97. def __exit__(
  98. self,
  99. exc_type: type[BaseException] | None,
  100. exc_val: BaseException | None,
  101. exc_tb: types.TracebackType | None,
  102. ) -> None:
  103. """Exit context manager and close bundle."""
  104. self.close()
  105. def __del__(self) -> None:
  106. """Warn if bundle was not explicitly closed."""
  107. if self.pack_data is not None:
  108. import warnings
  109. warnings.warn(
  110. f"Bundle {self!r} was not explicitly closed. "
  111. "Please use bundle.close() or a context manager.",
  112. ResourceWarning,
  113. stacklevel=2,
  114. )
  115. def store_objects(
  116. self,
  117. object_store: "BaseObjectStore",
  118. progress: Callable[[str], None] | None = None,
  119. ) -> None:
  120. """Store all objects from this bundle into an object store.
  121. Args:
  122. object_store: The object store to add objects to
  123. progress: Optional progress callback function
  124. """
  125. from .objects import ShaFile
  126. if self.pack_data is None:
  127. raise ValueError("pack_data is not loaded")
  128. count = 0
  129. for unpacked in self.pack_data.iter_unpacked():
  130. # Convert the unpacked object to a proper git object
  131. if unpacked.decomp_chunks and unpacked.obj_type_num is not None:
  132. git_obj = ShaFile.from_raw_chunks(
  133. unpacked.obj_type_num, unpacked.decomp_chunks
  134. )
  135. object_store.add_object(git_obj)
  136. count += 1
  137. if progress and count % 100 == 0:
  138. progress(f"Stored {count} objects")
  139. if progress:
  140. progress(f"Stored {count} objects total")
  141. def _read_bundle(f: BinaryIO, version: int) -> Bundle:
  142. capabilities = {}
  143. prerequisites = []
  144. references: dict[Ref, ObjectID] = {}
  145. line = f.readline()
  146. if version >= 3:
  147. while line.startswith(b"@"):
  148. line = line[1:].rstrip(b"\n")
  149. try:
  150. key, value_bytes = line.split(b"=", 1)
  151. value = value_bytes.decode("utf-8")
  152. except ValueError:
  153. key = line
  154. value = None
  155. capabilities[key.decode("utf-8")] = value
  156. line = f.readline()
  157. while line.startswith(b"-"):
  158. (obj_id, comment) = line[1:].rstrip(b"\n").split(b" ", 1)
  159. prerequisites.append((ObjectID(obj_id), comment))
  160. line = f.readline()
  161. while line != b"\n":
  162. (obj_id, ref) = line.rstrip(b"\n").split(b" ", 1)
  163. references[Ref(ref)] = ObjectID(obj_id)
  164. line = f.readline()
  165. # Extract pack data to separate stream since PackData expects
  166. # the file to start with PACK header at position 0
  167. pack_bytes = f.read()
  168. if not pack_bytes:
  169. raise ValueError("Bundle file contains no pack data")
  170. from io import BytesIO
  171. from .object_format import DEFAULT_OBJECT_FORMAT
  172. pack_file = BytesIO(pack_bytes)
  173. # TODO: Support specifying object format based on bundle metadata
  174. pack_data = PackData.from_file(pack_file, object_format=DEFAULT_OBJECT_FORMAT)
  175. ret = Bundle()
  176. ret.references = references
  177. ret.capabilities = capabilities
  178. ret.prerequisites = prerequisites
  179. ret.pack_data = pack_data
  180. ret.version = version
  181. return ret
  182. def read_bundle(f: BinaryIO) -> Bundle:
  183. """Read a bundle file.
  184. Args:
  185. f: A seekable binary file-like object. The file must remain open
  186. for the lifetime of the returned Bundle object.
  187. """
  188. if not hasattr(f, "seek"):
  189. raise ValueError("Bundle file must be seekable")
  190. firstline = f.readline()
  191. if firstline == b"# v2 git bundle\n":
  192. return _read_bundle(f, 2)
  193. if firstline == b"# v3 git bundle\n":
  194. return _read_bundle(f, 3)
  195. raise AssertionError(f"unsupported bundle format header: {firstline!r}")
  196. def write_bundle(f: BinaryIO, bundle: Bundle) -> None:
  197. """Write a bundle to a file.
  198. Args:
  199. f: File-like object to write to
  200. bundle: Bundle object to write
  201. """
  202. version = bundle.version
  203. if version is None:
  204. if bundle.capabilities:
  205. version = 3
  206. else:
  207. version = 2
  208. if version == 2:
  209. f.write(b"# v2 git bundle\n")
  210. elif version == 3:
  211. f.write(b"# v3 git bundle\n")
  212. else:
  213. raise AssertionError(f"unknown version {version}")
  214. if version == 3:
  215. for key, value in bundle.capabilities.items():
  216. f.write(b"@" + key.encode("utf-8"))
  217. if value is not None:
  218. f.write(b"=" + value.encode("utf-8"))
  219. f.write(b"\n")
  220. for obj_id, comment in bundle.prerequisites:
  221. f.write(b"-" + obj_id + b" " + comment + b"\n")
  222. for ref, obj_id in bundle.references.items():
  223. f.write(obj_id + b" " + ref + b"\n")
  224. f.write(b"\n")
  225. if bundle.pack_data is None:
  226. raise ValueError("bundle.pack_data is not loaded")
  227. write_pack_data(
  228. cast(Callable[[bytes], None], f.write),
  229. num_records=len(bundle.pack_data),
  230. records=bundle.pack_data.iter_unpacked(),
  231. object_format=bundle.pack_data.object_format,
  232. )
  233. def create_bundle_from_repo(
  234. repo: "BaseRepo",
  235. refs: Sequence[Ref] | None = None,
  236. prerequisites: Sequence[bytes] | None = None,
  237. version: int | None = None,
  238. capabilities: dict[str, str | None] | None = None,
  239. progress: Callable[[str], None] | None = None,
  240. ) -> Bundle:
  241. """Create a bundle from a repository.
  242. Args:
  243. repo: Repository object to create bundle from
  244. refs: List of refs to include (defaults to all refs)
  245. prerequisites: List of commit SHAs that are prerequisites
  246. version: Bundle version (2 or 3, auto-detected if None)
  247. capabilities: Bundle capabilities (for v3 bundles)
  248. progress: Optional progress reporting function
  249. Returns:
  250. Bundle object ready for writing
  251. """
  252. if refs is None:
  253. refs = list(repo.refs.keys())
  254. if prerequisites is None:
  255. prerequisites = []
  256. if capabilities is None:
  257. capabilities = {}
  258. # Build the references dictionary for the bundle
  259. bundle_refs: dict[Ref, ObjectID] = {}
  260. want_objects: set[ObjectID] = set()
  261. for ref in refs:
  262. if ref in repo.refs:
  263. ref_value = repo.refs[ref]
  264. # Handle peeled refs
  265. try:
  266. peeled_value = repo.refs.get_peeled(ref)
  267. if peeled_value is not None and peeled_value != ref_value:
  268. bundle_refs[ref] = peeled_value
  269. else:
  270. bundle_refs[ref] = ref_value
  271. except KeyError:
  272. bundle_refs[ref] = ref_value
  273. want_objects.add(bundle_refs[ref])
  274. # Convert prerequisites to proper format
  275. bundle_prerequisites = []
  276. have_objects: set[ObjectID] = set()
  277. for prereq in prerequisites:
  278. if not isinstance(prereq, bytes):
  279. raise TypeError(
  280. f"Invalid prerequisite type: {type(prereq)}, expected bytes"
  281. )
  282. if len(prereq) != 40:
  283. raise ValueError(
  284. f"Invalid prerequisite SHA length: {len(prereq)}, expected 40 hex characters"
  285. )
  286. try:
  287. # Validate it's actually hex
  288. bytes.fromhex(prereq.decode("utf-8"))
  289. except ValueError:
  290. raise ValueError(f"Invalid prerequisite format: {prereq!r}")
  291. # Store hex in bundle and for pack generation
  292. bundle_prerequisites.append((ObjectID(prereq), b""))
  293. have_objects.add(ObjectID(prereq))
  294. # Generate pack data containing all objects needed for the refs
  295. pack_count, pack_objects = repo.generate_pack_data(
  296. have=have_objects,
  297. want=want_objects,
  298. progress=progress,
  299. )
  300. # Store the pack objects directly, we'll write them when saving the bundle
  301. # For now, create a simple wrapper to hold the data
  302. class _BundlePackData:
  303. def __init__(
  304. self,
  305. count: int,
  306. objects: Iterator[UnpackedObject],
  307. object_format: "ObjectFormat",
  308. ) -> None:
  309. self._count = count
  310. self._objects = list(objects) # Materialize the iterator
  311. self.object_format = object_format
  312. def __len__(self) -> int:
  313. return self._count
  314. def iter_unpacked(self) -> Iterator[UnpackedObject]:
  315. return iter(self._objects)
  316. def close(self) -> None:
  317. """Close pack data (no-op for in-memory pack data)."""
  318. pack_data = _BundlePackData(pack_count, pack_objects, repo.object_format)
  319. # Create bundle object
  320. bundle = Bundle()
  321. bundle.version = version
  322. bundle.capabilities = capabilities
  323. bundle.prerequisites = bundle_prerequisites
  324. bundle.references = bundle_refs
  325. bundle.pack_data = pack_data
  326. return bundle