2
0

archive.py 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163
  1. # archive.py -- Creating an archive from a tarball
  2. # Copyright (C) 2015 Jonas Haag <jonas@lophus.org>
  3. # Copyright (C) 2015 Jelmer Vernooij <jelmer@jelmer.uk>
  4. #
  5. # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
  6. # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
  7. # General Public License as published by the Free Software Foundation; version 2.0
  8. # or (at your option) any later version. You can redistribute it and/or
  9. # modify it under the terms of either of these two licenses.
  10. #
  11. # Unless required by applicable law or agreed to in writing, software
  12. # distributed under the License is distributed on an "AS IS" BASIS,
  13. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. # See the License for the specific language governing permissions and
  15. # limitations under the License.
  16. #
  17. # You should have received a copy of the licenses; if not, see
  18. # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  19. # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  20. # License, Version 2.0.
  21. #
  22. """Generates tarballs for Git trees."""
  23. import posixpath
  24. import stat
  25. import struct
  26. import tarfile
  27. from collections.abc import Generator
  28. from contextlib import closing
  29. from io import BytesIO
  30. from os import SEEK_END
  31. from typing import TYPE_CHECKING, Optional
  32. if TYPE_CHECKING:
  33. from .object_store import BaseObjectStore
  34. from .objects import TreeEntry
  35. from .objects import Tree
  36. class ChunkedBytesIO:
  37. """Turn a list of bytestrings into a file-like object.
  38. This is similar to creating a `BytesIO` from a concatenation of the
  39. bytestring list, but saves memory by NOT creating one giant bytestring
  40. first::
  41. BytesIO(b''.join(list_of_bytestrings)) =~= ChunkedBytesIO(
  42. list_of_bytestrings)
  43. """
  44. def __init__(self, contents: list[bytes]) -> None:
  45. self.contents = contents
  46. self.pos = (0, 0)
  47. def read(self, maxbytes: Optional[int] = None) -> bytes:
  48. if maxbytes is None or maxbytes < 0:
  49. remaining = None
  50. else:
  51. remaining = maxbytes
  52. buf = []
  53. chunk, cursor = self.pos
  54. while chunk < len(self.contents):
  55. chunk_remainder = len(self.contents[chunk]) - cursor
  56. if remaining is not None and remaining < chunk_remainder:
  57. buf.append(self.contents[chunk][cursor : cursor + remaining])
  58. cursor += remaining
  59. self.pos = (chunk, cursor)
  60. break
  61. else:
  62. buf.append(self.contents[chunk][cursor:])
  63. if remaining is not None:
  64. remaining -= chunk_remainder
  65. chunk += 1
  66. cursor = 0
  67. self.pos = (chunk, cursor)
  68. return b"".join(buf)
  69. def tar_stream(
  70. store: "BaseObjectStore",
  71. tree: "Tree",
  72. mtime: int,
  73. prefix: bytes = b"",
  74. format: str = "",
  75. ) -> Generator[bytes, None, None]:
  76. """Generate a tar stream for the contents of a Git tree.
  77. Returns a generator that lazily assembles a .tar.gz archive, yielding it in
  78. pieces (bytestrings). To obtain the complete .tar.gz binary file, simply
  79. concatenate these chunks.
  80. Args:
  81. store: Object store to retrieve objects from
  82. tree: Tree object for the tree root
  83. mtime: UNIX timestamp that is assigned as the modification time for
  84. all files, and the gzip header modification time if format='gz'
  85. format: Optional compression format for tarball
  86. Returns:
  87. Bytestrings
  88. """
  89. buf = BytesIO()
  90. mode = "w:" + format if format else "w"
  91. from typing import Any, cast
  92. # The tarfile.open overloads are complex; cast to Any to avoid issues
  93. with closing(cast(Any, tarfile.open)(name=None, mode=mode, fileobj=buf)) as tar:
  94. if format == "gz":
  95. # Manually correct the gzip header file modification time so that
  96. # archives created from the same Git tree are always identical.
  97. # The gzip header file modification time is not currently
  98. # accessible from the tarfile API, see:
  99. # https://bugs.python.org/issue31526
  100. buf.seek(0)
  101. assert buf.read(2) == b"\x1f\x8b", "Invalid gzip header"
  102. buf.seek(4)
  103. buf.write(struct.pack("<L", mtime))
  104. buf.seek(0, SEEK_END)
  105. for entry_abspath, entry in _walk_tree(store, tree, prefix):
  106. try:
  107. blob = store[entry.sha]
  108. except KeyError:
  109. # Entry probably refers to a submodule, which we don't yet
  110. # support.
  111. continue
  112. if hasattr(blob, "chunked"):
  113. data = ChunkedBytesIO(blob.chunked)
  114. else:
  115. # Fallback for objects without chunked attribute
  116. data = ChunkedBytesIO([blob.as_raw_string()])
  117. info = tarfile.TarInfo()
  118. # tarfile only works with ascii.
  119. info.name = entry_abspath.decode("utf-8", "surrogateescape")
  120. info.size = blob.raw_length()
  121. info.mode = entry.mode
  122. info.mtime = mtime
  123. tar.addfile(info, data)
  124. yield buf.getvalue()
  125. buf.truncate(0)
  126. buf.seek(0)
  127. yield buf.getvalue()
  128. def _walk_tree(
  129. store: "BaseObjectStore", tree: "Tree", root: bytes = b""
  130. ) -> Generator[tuple[bytes, "TreeEntry"], None, None]:
  131. """Recursively walk a dulwich Tree, yielding tuples of
  132. (absolute path, TreeEntry) along the way.
  133. """
  134. for entry in tree.iteritems():
  135. entry_abspath = posixpath.join(root, entry.path)
  136. if stat.S_ISDIR(entry.mode):
  137. subtree = store[entry.sha]
  138. if isinstance(subtree, Tree):
  139. yield from _walk_tree(store, subtree, entry_abspath)
  140. else:
  141. yield (entry_abspath, entry)