archive.py 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135
  1. # archive.py -- Creating a tarball archive from a Git tree
  2. # Copyright (C) 2015 Jonas Haag <jonas@lophus.org>
  3. # Copyright (C) 2015 Jelmer Vernooij <jelmer@jelmer.uk>
  4. #
  5. # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
  6. # General Public License as published by the Free Software Foundation; version 2.0
  7. # or (at your option) any later version. You can redistribute it and/or
  8. # modify it under the terms of either of these two licenses.
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. # You should have received a copy of the licenses; if not, see
  17. # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  18. # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  19. # License, Version 2.0.
  20. #
  21. """Generates tarballs for Git trees.
  22. """
  23. import posixpath
  24. import stat
  25. import tarfile
  26. import struct
  27. from os import SEEK_END
  28. from io import BytesIO
  29. from contextlib import closing
  30. class ChunkedBytesIO(object):
  31. """Turn a list of bytestrings into a file-like object.
  32. This is similar to creating a `BytesIO` from a concatenation of the
  33. bytestring list, but saves memory by NOT creating one giant bytestring
  34. first::
  35. BytesIO(b''.join(list_of_bytestrings)) =~= ChunkedBytesIO(
  36. list_of_bytestrings)
  37. """
  38. def __init__(self, contents):
  39. self.contents = contents
  40. self.pos = (0, 0)
  41. def read(self, maxbytes=None):
  42. if maxbytes < 0:
  43. maxbytes = float("inf")
  44. buf = []
  45. chunk, cursor = self.pos
  46. while chunk < len(self.contents):
  47. if maxbytes < len(self.contents[chunk]) - cursor:
  48. buf.append(self.contents[chunk][cursor : cursor + maxbytes])
  49. cursor += maxbytes
  50. self.pos = (chunk, cursor)
  51. break
  52. else:
  53. buf.append(self.contents[chunk][cursor:])
  54. maxbytes -= len(self.contents[chunk]) - cursor
  55. chunk += 1
  56. cursor = 0
  57. self.pos = (chunk, cursor)
  58. return b"".join(buf)
  59. def tar_stream(store, tree, mtime, prefix=b"", format=""):
  60. """Generate a tar stream for the contents of a Git tree.
  61. Returns a generator that lazily assembles a .tar.gz archive, yielding it in
  62. pieces (bytestrings). To obtain the complete .tar.gz binary file, simply
  63. concatenate these chunks.
  64. Args:
  65. store: Object store to retrieve objects from
  66. tree: Tree object for the tree root
  67. mtime: UNIX timestamp that is assigned as the modification time for
  68. all files, and the gzip header modification time if format='gz'
  69. format: Optional compression format for tarball
  70. Returns:
  71. Bytestrings
  72. """
  73. buf = BytesIO()
  74. with closing(tarfile.open(None, "w:%s" % format, buf)) as tar:
  75. if format == "gz":
  76. # Manually correct the gzip header file modification time so that
  77. # archives created from the same Git tree are always identical.
  78. # The gzip header file modification time is not currently
  79. # accessible from the tarfile API, see:
  80. # https://bugs.python.org/issue31526
  81. buf.seek(0)
  82. assert buf.read(2) == b"\x1f\x8b", "Invalid gzip header"
  83. buf.seek(4)
  84. buf.write(struct.pack("<L", mtime))
  85. buf.seek(0, SEEK_END)
  86. for entry_abspath, entry in _walk_tree(store, tree, prefix):
  87. try:
  88. blob = store[entry.sha]
  89. except KeyError:
  90. # Entry probably refers to a submodule, which we don't yet
  91. # support.
  92. continue
  93. data = ChunkedBytesIO(blob.chunked)
  94. info = tarfile.TarInfo()
  95. # tarfile only works with ascii.
  96. info.name = entry_abspath.decode("ascii")
  97. info.size = blob.raw_length()
  98. info.mode = entry.mode
  99. info.mtime = mtime
  100. tar.addfile(info, data)
  101. yield buf.getvalue()
  102. buf.truncate(0)
  103. buf.seek(0)
  104. yield buf.getvalue()
  105. def _walk_tree(store, tree, root=b""):
  106. """Recursively walk a dulwich Tree, yielding tuples of
  107. (absolute path, TreeEntry) along the way.
  108. """
  109. for entry in tree.iteritems():
  110. entry_abspath = posixpath.join(root, entry.path)
  111. if stat.S_ISDIR(entry.mode):
  112. for _ in _walk_tree(store, store[entry.sha], entry_abspath):
  113. yield _
  114. else:
  115. yield (entry_abspath, entry)