# fastexport.py -- Fast export/import functionality
# Copyright (C) 2010-2013 Jelmer Vernooij <jelmer@jelmer.uk>
#
# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
# General Public License as published by the Free Software Foundation; version 2.0
# or (at your option) any later version. You can redistribute it and/or
# modify it under the terms of either of these two licenses.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# You should have received a copy of the licenses; if not, see
# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
# License, Version 2.0.
#

"""Fast export/import functionality."""

__all__ = [
    "GitFastExporter",
    "GitImportProcessor",
    "split_email",
]

import stat
from collections.abc import Generator
from typing import TYPE_CHECKING, Any, BinaryIO

from fastimport import commands, parser, processor
from fastimport import errors as fastimport_errors

from .index import commit_tree
from .object_store import iter_tree_contents
from .objects import ZERO_SHA, Blob, Commit, ObjectID, Tag
from .refs import Ref

if TYPE_CHECKING:
    from .object_store import BaseObjectStore
    from .repo import BaseRepo


def split_email(text: bytes) -> tuple[bytes, bytes]:
    """Split email address from name.

    Args:
      text: Full name and email (e.g. b"John Doe <john@example.com>")

    Returns:
      Tuple of (name, email)
    """
    # TODO(jelmer): Dedupe this and the same functionality in
    # format_annotate_line.
    (name, email) = text.rsplit(b" <", 1)
    return (name, email.rstrip(b">"))


class GitFastExporter:
    """Generate a fast-export output stream for Git objects."""

    def __init__(self, outf: BinaryIO, store: "BaseObjectStore") -> None:
        """Initialize the fast exporter.

        Args:
          outf: Output file to write to
          store: Object store to export from
        """
        self.outf = outf
        self.store = store
        self.markers: dict[bytes, ObjectID] = {}
        self._marker_idx = 0

    def print_cmd(self, cmd: object) -> None:
        """Print a command to the output stream.

        Args:
          cmd: Command object to print
        """
        if hasattr(cmd, "__bytes__"):
            output = cmd.__bytes__()
        else:
            output = cmd.__repr__().encode("utf-8")
        self.outf.write(output + b"\n")

    def _allocate_marker(self) -> bytes:
        """Allocate a new marker.

        Returns:
          New marker as bytes
        """
        self._marker_idx += 1
        return str(self._marker_idx).encode("ascii")

    def _export_blob(self, blob: Blob) -> tuple[Any, bytes]:
        """Export a blob object.

        Args:
          blob: Blob object to export

        Returns:
          Tuple of (BlobCommand, marker)
        """
        marker = self._allocate_marker()
        self.markers[marker] = blob.id
        return (commands.BlobCommand(marker, blob.data), marker)  # type: ignore[no-untyped-call,unused-ignore]

    def emit_blob(self, blob: Blob) -> bytes:
        """Emit a blob to the output stream.

        Args:
          blob: Blob object to emit

        Returns:
          Marker for the blob
        """
        (cmd, marker) = self._export_blob(blob)
        self.print_cmd(cmd)
        return marker

    def _iter_files(
        self, base_tree: ObjectID | None, new_tree: ObjectID | None
    ) -> Generator[Any, None, None]:
        """Yield file commands for the changes between two trees."""
        for (
            (old_path, new_path),
            (old_mode, new_mode),
            (old_hexsha, new_hexsha),
        ) in self.store.tree_changes(base_tree, new_tree):
            if new_path is None:
                if old_path is not None:
                    yield commands.FileDeleteCommand(old_path)  # type: ignore[no-untyped-call,unused-ignore]
                continue
            marker = b""
            if new_mode is not None and not stat.S_ISDIR(new_mode):
                if new_hexsha is not None:
                    blob = self.store[new_hexsha]
                    from .objects import Blob

                    if isinstance(blob, Blob):
                        marker = self.emit_blob(blob)
            if old_path != new_path and old_path is not None:
                yield commands.FileRenameCommand(old_path, new_path)  # type: ignore[no-untyped-call,unused-ignore]
            if old_mode != new_mode or old_hexsha != new_hexsha:
                prefixed_marker = b":" + marker
                assert new_mode is not None
                yield commands.FileModifyCommand(  # type: ignore[no-untyped-call,unused-ignore]
                    new_path, new_mode, prefixed_marker, None
                )

    def _export_commit(
        self, commit: Commit, ref: Ref, base_tree: ObjectID | None = None
    ) -> tuple[Any, bytes]:
        """Build a CommitCommand and its file commands for a commit.

        Args:
          commit: Commit object to export
          ref: Reference name for the commit
          base_tree: Base tree for incremental export

        Returns:
          Tuple of (CommitCommand, marker)
        """
        file_cmds = list(self._iter_files(base_tree, commit.tree))
        marker = self._allocate_marker()
        if commit.parents:
            from_ = commit.parents[0]
            merges = commit.parents[1:]
        else:
            from_ = None
            merges = []
        author, author_email = split_email(commit.author)
        committer, committer_email = split_email(commit.committer)
        cmd = commands.CommitCommand(  # type: ignore[no-untyped-call,unused-ignore]
            ref,
            marker,
            (author, author_email, commit.author_time, commit.author_timezone),
            (
                committer,
                committer_email,
                commit.commit_time,
                commit.commit_timezone,
            ),
            commit.message,
            from_,
            merges,
            file_cmds,
        )
        return (cmd, marker)

    def emit_commit(
        self, commit: Commit, ref: Ref, base_tree: ObjectID | None = None
    ) -> bytes:
        """Emit a commit in fast-export format.

        Args:
          commit: Commit object to export
          ref: Reference name for the commit
          base_tree: Base tree for incremental export

        Returns:
          Marker for the commit
        """
        cmd, marker = self._export_commit(commit, ref, base_tree)
        self.print_cmd(cmd)
        return marker


class GitImportProcessor(processor.ImportProcessor):  # type: ignore[misc,unused-ignore]
    """An import processor that imports into a Git repository using Dulwich."""

    # FIXME: Batch creation of objects?
    def __init__(
        self,
        repo: "BaseRepo",
        params: Any | None = None,  # noqa: ANN401
        verbose: bool = False,
        outf: BinaryIO | None = None,
    ) -> None:
        """Initialize GitImportProcessor.

        Args:
          repo: Repository to import into
          params: Import parameters
          verbose: Whether to enable verbose output
          outf: Output file for verbose messages
        """
        processor.ImportProcessor.__init__(self, params, verbose)  # type: ignore[no-untyped-call,unused-ignore]
        self.repo = repo
        self.last_commit = ZERO_SHA
        self.markers: dict[bytes, ObjectID] = {}
        self._contents: dict[bytes, tuple[int, bytes]] = {}

    def lookup_object(self, objectish: bytes) -> ObjectID:
        """Look up an object by reference or marker.

        Args:
          objectish: Object reference or marker

        Returns:
          Object ID
        """
        if objectish.startswith(b":"):
            return self.markers[objectish[1:]]
        return ObjectID(objectish)

    def import_stream(self, stream: BinaryIO) -> dict[bytes, ObjectID]:
        """Import from a fast-import stream.

        Args:
          stream: Stream to import from

        Returns:
          Dictionary of markers to object IDs
        """
        p = parser.ImportParser(stream)  # type: ignore[no-untyped-call,unused-ignore]
        self.process(p.iter_commands)  # type: ignore[no-untyped-call,unused-ignore]
        return self.markers

    def blob_handler(self, cmd: commands.BlobCommand) -> None:
        """Process a BlobCommand."""
        blob = Blob.from_string(cmd.data)
        self.repo.object_store.add_object(blob)
        if cmd.mark:
            self.markers[cmd.mark] = blob.id

    def checkpoint_handler(self, cmd: commands.CheckpointCommand) -> None:
        """Process a CheckpointCommand."""

    def commit_handler(self, cmd: commands.CommitCommand) -> None:
        """Process a CommitCommand."""
        commit = Commit()
        if cmd.author is not None:
            (author_name, author_email, author_timestamp, author_timezone) = cmd.author
        else:
            (author_name, author_email, author_timestamp, author_timezone) = (
                cmd.committer
            )
        (
            committer_name,
            committer_email,
            commit_timestamp,
            commit_timezone,
        ) = cmd.committer
        if isinstance(author_name, str):
            author_name = author_name.encode("utf-8")
        if isinstance(author_email, str):
            author_email = author_email.encode("utf-8")
        commit.author = author_name + b" <" + author_email + b">"
        commit.author_timezone = author_timezone
        commit.author_time = int(author_timestamp)
        if isinstance(committer_name, str):
            committer_name = committer_name.encode("utf-8")
        if isinstance(committer_email, str):
            committer_email = committer_email.encode("utf-8")
        commit.committer = committer_name + b" <" + committer_email + b">"
        commit.commit_timezone = commit_timezone
        commit.commit_time = int(commit_timestamp)
        commit.message = cmd.message
        commit.parents = []
        if cmd.from_:
            cmd.from_ = self.lookup_object(cmd.from_)
            self._reset_base(cmd.from_)
        for filecmd in cmd.iter_files():  # type: ignore[no-untyped-call,unused-ignore]
            if filecmd.name == b"filemodify":
                assert isinstance(filecmd, commands.FileModifyCommand)
                if filecmd.data is not None:
                    blob = Blob.from_string(filecmd.data)
                    self.repo.object_store.add_object(blob)
                    blob_id = blob.id
                else:
                    assert filecmd.dataref is not None
                    blob_id = self.lookup_object(filecmd.dataref)
                self._contents[filecmd.path] = (filecmd.mode, blob_id)
            elif filecmd.name == b"filedelete":
                assert isinstance(filecmd, commands.FileDeleteCommand)
                del self._contents[filecmd.path]
            elif filecmd.name == b"filecopy":
                assert isinstance(filecmd, commands.FileCopyCommand)
                self._contents[filecmd.dest_path] = self._contents[filecmd.src_path]
            elif filecmd.name == b"filerename":
                assert isinstance(filecmd, commands.FileRenameCommand)
                self._contents[filecmd.new_path] = self._contents[filecmd.old_path]
                del self._contents[filecmd.old_path]
            elif filecmd.name == b"filedeleteall":
                self._contents = {}
            else:
                raise Exception(f"Command {filecmd.name!r} not supported")
        from dulwich.objects import ObjectID

        commit.tree = commit_tree(
            self.repo.object_store,
            (
                (path, ObjectID(hexsha), mode)
                for (path, (mode, hexsha)) in self._contents.items()
            ),
        )
        if self.last_commit != ZERO_SHA:
            commit.parents.append(self.last_commit)
        for merge in cmd.merges:
            commit.parents.append(self.lookup_object(merge))
        self.repo.object_store.add_object(commit)
        self.repo[cmd.ref] = commit.id
        self.last_commit = commit.id
        if cmd.mark:
            mark_bytes = (
                cmd.mark
                if isinstance(cmd.mark, bytes)
                else str(cmd.mark).encode("ascii")
            )
            self.markers[mark_bytes] = commit.id

    def progress_handler(self, cmd: commands.ProgressCommand) -> None:
        """Process a ProgressCommand."""

    def _reset_base(self, commit_id: ObjectID) -> None:
        """Reset the tracked tree contents to those of the given commit."""
        if self.last_commit == commit_id:
            return
        self._contents = {}
        self.last_commit = commit_id
        if commit_id != ZERO_SHA:
            from .objects import Commit

            commit = self.repo[commit_id]
            tree_id = commit.tree if isinstance(commit, Commit) else None
            if tree_id is None:
                return
            for (
                path,
                mode,
                hexsha,
            ) in iter_tree_contents(self.repo.object_store, tree_id):
                assert path is not None and mode is not None and hexsha is not None
                self._contents[path] = (mode, hexsha)

    def reset_handler(self, cmd: commands.ResetCommand) -> None:
        """Process a ResetCommand."""
        if cmd.from_ is None:
            from_ = ZERO_SHA
        else:
            from_ = self.lookup_object(cmd.from_)
        self._reset_base(from_)
        self.repo.refs[Ref(cmd.ref)] = from_

    def tag_handler(self, cmd: commands.TagCommand) -> None:
        """Process a TagCommand."""
        tag = Tag()
        tag.tagger = cmd.tagger
        tag.message = cmd.message
        tag.name = cmd.from_
        self.repo.object_store.add_object(tag)
        self.repo.refs[Ref(b"refs/tags/" + tag.name)] = tag.id

    def feature_handler(self, cmd: commands.FeatureCommand) -> None:
        """Process a FeatureCommand."""
        feature_name = (
            cmd.feature_name.decode("utf-8")
            if isinstance(cmd.feature_name, bytes)
            else cmd.feature_name
        )
        raise fastimport_errors.UnknownFeature(feature_name)  # type: ignore[no-untyped-call,unused-ignore]