# archive.py -- Creating an archive from a tarball
# Copyright (C) 2015 Jonas Haag <jonas@lophus.org>
# Copyright (C) 2015 Jelmer Vernooij <jelmer@jelmer.uk>
#
# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
# General Public License as published by the Free Software Foundation; version
# 2.0 or (at your option) any later version. You can redistribute it and/or
# modify it under the terms of either of these two licenses.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# You should have received a copy of the licenses; if not, see
# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
# License, Version 2.0.
#

"""Generates tarballs for Git trees."""
import posixpath
import stat
import tarfile
import struct
from os import SEEK_END
from io import BytesIO
from contextlib import closing


class ChunkedBytesIO(object):
    """Turn a list of bytestrings into a file-like object.

    This is similar to creating a `BytesIO` from a concatenation of the
    bytestring list, but saves memory by NOT creating one giant bytestring
    first::

        BytesIO(b''.join(list_of_bytestrings)) =~= ChunkedBytesIO(
            list_of_bytestrings)
    """

    def __init__(self, contents):
        self.contents = contents
        self.pos = (0, 0)

    def read(self, maxbytes=None):
        # None (or a negative size) means "read everything that is left".
        if maxbytes is None or maxbytes < 0:
            maxbytes = float('inf')

        buf = []
        chunk, cursor = self.pos

        while chunk < len(self.contents):
            if maxbytes < len(self.contents[chunk]) - cursor:
                # The current chunk satisfies the request on its own.
                buf.append(self.contents[chunk][cursor:cursor + maxbytes])
                cursor += maxbytes
                self.pos = (chunk, cursor)
                break
            else:
                # Consume the rest of the current chunk and move on.
                buf.append(self.contents[chunk][cursor:])
                maxbytes -= len(self.contents[chunk]) - cursor
                chunk += 1
                cursor = 0
        self.pos = (chunk, cursor)
        return b''.join(buf)
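

# For illustration, reads may span chunk boundaries without the chunks ever
# being joined up front; for example::
#
#     >>> f = ChunkedBytesIO([b'abcd', b'efgh'])
#     >>> f.read(6)
#     b'abcdef'
#     >>> f.read(6)
#     b'gh'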


def tar_stream(store, tree, mtime, format=''):
    """Generate a tar stream for the contents of a Git tree.

    Returns a generator that lazily assembles a tarball (optionally
    compressed, according to ``format``), yielding it in pieces (bytestrings).
    To obtain the complete archive, simply concatenate these chunks.
    (A usage sketch is included at the end of this module.)

    :param store: Object store to retrieve objects from
    :param tree: Tree object for the tree root
    :param mtime: UNIX timestamp that is assigned as the modification time for
        all files, and the gzip header modification time if format='gz'
    :param format: Optional compression format for tarball
    :return: Bytestrings
    """
    buf = BytesIO()
    with closing(tarfile.open(None, "w:%s" % format, buf)) as tar:
        if format == 'gz':
            # Manually correct the gzip header file modification time so that
            # archives created from the same Git tree are always identical.
            # The gzip header file modification time is not currently
            # accessible from the tarfile API, see:
            # https://bugs.python.org/issue31526
            buf.seek(0)
            assert buf.read(2) == b'\x1f\x8b', 'Invalid gzip header'
            # Per RFC 1952, bytes 4-7 of the gzip header hold the MTIME field
            # as a little-endian 32-bit integer.
            buf.seek(4)
            buf.write(struct.pack('<L', mtime))
            buf.seek(0, SEEK_END)

        for entry_abspath, entry in _walk_tree(store, tree):
            try:
                blob = store[entry.sha]
            except KeyError:
                # Entry probably refers to a submodule, which we don't yet
                # support.
                continue

            data = ChunkedBytesIO(blob.chunked)

            info = tarfile.TarInfo()
            # tarfile only works with ascii.
            info.name = entry_abspath.decode('ascii')
            info.size = blob.raw_length()
            info.mode = entry.mode
            info.mtime = mtime

            tar.addfile(info, data)
            # Hand out whatever tarfile wrote for this entry, then reset the
            # buffer so memory use stays bounded by the largest entry.
            yield buf.getvalue()
            buf.truncate(0)
            buf.seek(0)
    # Closing the tarfile (on leaving the with-block) writes the end-of-archive
    # marker, and for format='gz' the gzip trailer; yield that final piece too.
    yield buf.getvalue()


def _walk_tree(store, tree, root=b''):
    """Recursively walk a dulwich Tree, yielding tuples of
    (absolute path, TreeEntry) along the way.
    """
    for entry in tree.iteritems():
        entry_abspath = posixpath.join(root, entry.path)
        if stat.S_ISDIR(entry.mode):
            for _ in _walk_tree(store, store[entry.sha], entry_abspath):
                yield _
        else:
            yield (entry_abspath, entry)
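

# A minimal usage sketch: stream a gzipped tarball of the tree behind HEAD
# into a file by concatenating the yielded chunks. The repository and output
# paths are command-line placeholders; Repo, repo.head(), commit.tree and
# repo.object_store are the standard dulwich APIs this module is written
# against.
if __name__ == '__main__':
    import sys
    import time

    from dulwich.repo import Repo

    repo_path, out_path = sys.argv[1], sys.argv[2]
    repo = Repo(repo_path)
    commit = repo[repo.head()]
    tree = repo[commit.tree]
    with open(out_path, 'wb') as f:
        for chunk in tar_stream(repo.object_store, tree, int(time.time()),
                                format='gz'):
            f.write(chunk)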