# archive.py -- Creating an archive from a tarball
# Copyright (C) 2015 Jonas Haag <jonas@lophus.org>
# Copyright (C) 2015 Jelmer Vernooij <jelmer@jelmer.uk>
#
# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
# General Public License as published by the Free Software Foundation; version 2.0
# or (at your option) any later version. You can redistribute it and/or
# modify it under the terms of either of these two licenses.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# You should have received a copy of the licenses; if not, see
# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
# License, Version 2.0.
#
  21. """Generates tarballs for Git trees."""
  22. import posixpath
  23. import stat
  24. import struct
  25. import tarfile
  26. from contextlib import closing
  27. from io import BytesIO
  28. from os import SEEK_END
class ChunkedBytesIO:
    """Turn a list of bytestrings into a file-like object.

    This is similar to creating a `BytesIO` from a concatenation of the
    bytestring list, but saves memory by NOT creating one giant bytestring
    first::

        BytesIO(b''.join(list_of_bytestrings)) =~= ChunkedBytesIO(
            list_of_bytestrings)
    """

    def __init__(self, contents) -> None:
        self.contents = contents
        self.pos = (0, 0)

    def read(self, maxbytes=None):
        # Treat a missing or negative limit as "read to the end".
        if maxbytes is None or maxbytes < 0:
            maxbytes = float("inf")

        buf = []
        chunk, cursor = self.pos

        while chunk < len(self.contents):
            if maxbytes < len(self.contents[chunk]) - cursor:
                # The current chunk satisfies the request; remember where to
                # resume on the next read.
                buf.append(self.contents[chunk][cursor : cursor + maxbytes])
                cursor += maxbytes
                self.pos = (chunk, cursor)
                break
            else:
                # Consume the rest of the current chunk and move on.
                buf.append(self.contents[chunk][cursor:])
                maxbytes -= len(self.contents[chunk]) - cursor
                chunk += 1
                cursor = 0
                self.pos = (chunk, cursor)
        return b"".join(buf)

def tar_stream(store, tree, mtime, prefix=b"", format=""):
    """Generate a tar stream for the contents of a Git tree.

    Returns a generator that lazily assembles a .tar.gz archive, yielding it in
    pieces (bytestrings). To obtain the complete .tar.gz binary file, simply
    concatenate these chunks.

    Args:
      store: Object store to retrieve objects from
      tree: Tree object for the tree root
      mtime: UNIX timestamp that is assigned as the modification time for
        all files, and the gzip header modification time if format='gz'
      prefix: Optional path prefix prepended to every entry in the archive
      format: Optional compression format for tarball
    Returns:
      Bytestrings
    """
    buf = BytesIO()
    with closing(tarfile.open(None, "w:%s" % format, buf)) as tar:
        if format == "gz":
            # Manually correct the gzip header file modification time so that
            # archives created from the same Git tree are always identical.
            # The gzip header file modification time is not currently
            # accessible from the tarfile API, see:
            # https://bugs.python.org/issue31526
            # (Bytes 4-7 of a gzip stream hold the little-endian MTIME field.)
            buf.seek(0)
            assert buf.read(2) == b"\x1f\x8b", "Invalid gzip header"
            buf.seek(4)
            buf.write(struct.pack("<L", mtime))
            buf.seek(0, SEEK_END)

        for entry_abspath, entry in _walk_tree(store, tree, prefix):
            try:
                blob = store[entry.sha]
            except KeyError:
                # Entry probably refers to a submodule, which we don't yet
                # support.
                continue
            data = ChunkedBytesIO(blob.chunked)

            info = tarfile.TarInfo()
            # tarfile only works with ascii.
            info.name = entry_abspath.decode("utf-8", "surrogateescape")
            info.size = blob.raw_length()
            info.mode = entry.mode
            info.mtime = mtime

            tar.addfile(info, data)
            yield buf.getvalue()
            buf.truncate(0)
            buf.seek(0)
    yield buf.getvalue()

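# Usage sketch (illustrative, not part of the upstream module): stream a
# gzipped archive of a repository's HEAD tree to disk. The Repo/commit access
# pattern below is an assumption about typical dulwich usage, not code from
# this file:
#
#   from dulwich.repo import Repo
#
#   repo = Repo(".")
#   commit = repo[repo.head()]
#   tree = repo[commit.tree]
#   with open("head.tar.gz", "wb") as f:
#       for chunk in tar_stream(repo.object_store, tree,
#                               mtime=commit.commit_time, format="gz"):
#           f.write(chunk)

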
def _walk_tree(store, tree, root=b""):
    """Recursively walk a dulwich Tree, yielding tuples of
    (absolute path, TreeEntry) along the way.
    """
    for entry in tree.iteritems():
        entry_abspath = posixpath.join(root, entry.path)

        if stat.S_ISDIR(entry.mode):
            yield from _walk_tree(store, store[entry.sha], entry_abspath)
        else:
            yield (entry_abspath, entry)