- # pack.py -- For dealing with packed git objects.
- # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
- # Copyright (C) 2008-2013 Jelmer Vernooij <jelmer@samba.org>
- #
- # This program is free software; you can redistribute it and/or
- # modify it under the terms of the GNU General Public License
- # as published by the Free Software Foundation; version 2
- # of the License or (at your option) a later version.
- #
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
- #
- # You should have received a copy of the GNU General Public License
- # along with this program; if not, write to the Free Software
- # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
- # MA 02110-1301, USA.
- """Classes for dealing with packed git objects.
- A pack is a compact representation of a bunch of objects, stored
- using deltas where possible.
- They have two parts, the pack file, which stores the data, and an index
- that tells you where the data is.
- To find an object you look in all of the index files until you find a
- match for the object name. The index then gives you an offset that you
- use to read the object out of the corresponding packfile.
- """
- from collections import defaultdict, deque
- import binascii
- from io import BytesIO, UnsupportedOperation
- import difflib
- import struct
- from itertools import chain
- try:
- from itertools import imap, izip
- except ImportError:
- # Python3
- imap = map
- izip = zip
- long = int  # Python 3 has no separate long type; keeps isinstance checks working
- import os
- import sys
- try:
- import mmap
- except ImportError:
- has_mmap = False
- else:
- has_mmap = True
- # For some reason the above try/except fails to set has_mmap = False on Plan 9
- if sys.platform == 'Plan9':
- has_mmap = False
- from hashlib import sha1
- from os import (
- SEEK_CUR,
- SEEK_END,
- )
- from struct import unpack_from
- import zlib
- from dulwich.errors import (
- ApplyDeltaError,
- ChecksumMismatch,
- )
- from dulwich.file import GitFile
- from dulwich.lru_cache import (
- LRUSizeCache,
- )
- from dulwich.objects import (
- ShaFile,
- hex_to_sha,
- sha_to_hex,
- object_header,
- )
- OFS_DELTA = 6
- REF_DELTA = 7
- DELTA_TYPES = (OFS_DELTA, REF_DELTA)
- DEFAULT_PACK_DELTA_WINDOW_SIZE = 10
- def take_msb_bytes(read, crc32=None):
- """Read bytes marked with most significant bit.
- :param read: Read function
- :param crc32: Optional starting CRC32 checksum to update
- :return: Tuple of (list of byte values read, updated CRC32 or None)
- """
- ret = []
- while len(ret) == 0 or ret[-1] & 0x80:
- b = read(1)
- if crc32 is not None:
- crc32 = binascii.crc32(b, crc32)
- ret.append(ord(b[:1]))
- return ret, crc32
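- # Sketch of the encoding take_msb_bytes consumes: the high bit of each byte
- # is a continuation flag and the low 7 bits are payload. In an object header
- # the first byte also carries the type (3 bits) plus the low 4 bits of the
- # size; e.g. the bytes 0x95 0x01 decode to type 1 (commit) and size
- # (0x95 & 0x0f) | (0x01 << 4) == 21, exactly as unpack_object does below.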
- class UnpackedObject(object):
- """Class encapsulating an object unpacked from a pack file.
- These objects should only be created from within unpack_object. Most
- members start out as empty and are filled in at various points by
- read_zlib_chunks, unpack_object, DeltaChainIterator, etc.
- Users of this object should make sure the function they obtained it from
- is guaranteed to set the members they need.
- """
- __slots__ = [
- 'offset', # Offset in its pack.
- '_sha', # Cached binary SHA.
- 'obj_type_num', # Type of this object.
- 'obj_chunks', # Decompressed and delta-resolved chunks.
- 'pack_type_num', # Type of this object in the pack (may be a delta).
- 'delta_base', # Delta base offset or SHA.
- 'comp_chunks', # Compressed object chunks.
- 'decomp_chunks', # Decompressed object chunks.
- 'decomp_len', # Decompressed length of this object.
- 'crc32', # CRC32.
- ]
- # TODO(dborowitz): read_zlib_chunks and unpack_object could very well be
- # methods of this object.
- def __init__(self, pack_type_num, delta_base, decomp_len, crc32):
- self.offset = None
- self._sha = None
- self.pack_type_num = pack_type_num
- self.delta_base = delta_base
- self.comp_chunks = None
- self.decomp_chunks = []
- self.decomp_len = decomp_len
- self.crc32 = crc32
- if pack_type_num in DELTA_TYPES:
- self.obj_type_num = None
- self.obj_chunks = None
- else:
- self.obj_type_num = pack_type_num
- self.obj_chunks = self.decomp_chunks
- def sha(self):
- """Return the binary SHA of this object."""
- if self._sha is None:
- self._sha = obj_sha(self.obj_type_num, self.obj_chunks)
- return self._sha
- def sha_file(self):
- """Return a ShaFile from this object."""
- return ShaFile.from_raw_chunks(self.obj_type_num, self.obj_chunks)
- # Only provided for backwards compatibility with code that expects either
- # chunks or a delta tuple.
- def _obj(self):
- """Return the decompressed chunks, or (delta base, delta chunks)."""
- if self.pack_type_num in DELTA_TYPES:
- return (self.delta_base, self.decomp_chunks)
- else:
- return self.decomp_chunks
- def __eq__(self, other):
- if not isinstance(other, UnpackedObject):
- return False
- for slot in self.__slots__:
- if getattr(self, slot) != getattr(other, slot):
- return False
- return True
- def __ne__(self, other):
- return not (self == other)
- def __repr__(self):
- data = ['%s=%r' % (s, getattr(self, s)) for s in self.__slots__]
- return '%s(%s)' % (self.__class__.__name__, ', '.join(data))
- _ZLIB_BUFSIZE = 4096
- def read_zlib_chunks(read_some, unpacked, include_comp=False,
- buffer_size=_ZLIB_BUFSIZE):
- """Read zlib data from a buffer.
- This function requires that the buffer have additional data following the
- compressed data, which is guaranteed to be the case for git pack files.
- :param read_some: Read function that returns at least one byte, but may
- return less than the requested size.
- :param unpacked: An UnpackedObject to write result data to. If its crc32
- attr is not None, the CRC32 of the compressed bytes will be computed
- using this starting CRC32.
- After this function, will have the following attrs set:
- * comp_chunks (if include_comp is True)
- * decomp_chunks
- * decomp_len
- * crc32
- :param include_comp: If True, include compressed data in the result.
- :param buffer_size: Size of the read buffer.
- :return: Leftover unused data from the decompression.
- :raise zlib.error: if a decompression error occurred.
- """
- if unpacked.decomp_len <= -1:
- raise ValueError('non-negative zlib data stream size expected')
- decomp_obj = zlib.decompressobj()
- comp_chunks = []
- decomp_chunks = unpacked.decomp_chunks
- decomp_len = 0
- crc32 = unpacked.crc32
- while True:
- add = read_some(buffer_size)
- if not add:
- raise zlib.error('EOF before end of zlib stream')
- comp_chunks.append(add)
- decomp = decomp_obj.decompress(add)
- decomp_len += len(decomp)
- decomp_chunks.append(decomp)
- unused = decomp_obj.unused_data
- if unused:
- left = len(unused)
- if crc32 is not None:
- crc32 = binascii.crc32(add[:-left], crc32)
- if include_comp:
- comp_chunks[-1] = add[:-left]
- break
- elif crc32 is not None:
- crc32 = binascii.crc32(add, crc32)
- if crc32 is not None:
- crc32 &= 0xffffffff
- if decomp_len != unpacked.decomp_len:
- raise zlib.error('decompressed data does not match expected size')
- unpacked.crc32 = crc32
- if include_comp:
- unpacked.comp_chunks = comp_chunks
- return unused
- def iter_sha1(iter):
- """Return the hexdigest of the SHA1 over a set of names.
- :param iter: Iterator over string objects
- :return: 40-byte hex sha1 digest
- """
- sha = sha1()
- for name in iter:
- sha.update(name)
- return sha.hexdigest().encode('ascii')
- def load_pack_index(path):
- """Load an index file by path.
- :param path: Path to the index file
- :return: A PackIndex loaded from the given path
- """
- with GitFile(path, 'rb') as f:
- return load_pack_index_file(path, f)
- def _load_file_contents(f, size=None):
- try:
- fd = f.fileno()
- except (UnsupportedOperation, AttributeError):
- fd = None
- # Attempt to use mmap if possible
- if fd is not None:
- if size is None:
- size = os.fstat(fd).st_size
- if has_mmap:
- try:
- contents = mmap.mmap(fd, size, access=mmap.ACCESS_READ)
- except mmap.error:
- # Perhaps a socket?
- pass
- else:
- return contents, size
- contents = f.read()
- size = len(contents)
- return contents, size
- def load_pack_index_file(path, f):
- """Load an index file from a file-like object.
- :param path: Path for the index file
- :param f: File-like object
- :return: A PackIndex loaded from the given file
- """
- contents, size = _load_file_contents(f)
- if contents[:4] == b'\377tOc':
- version = struct.unpack(b'>L', contents[4:8])[0]
- if version == 2:
- return PackIndex2(path, file=f, contents=contents,
- size=size)
- else:
- raise KeyError('Unknown pack index format %d' % version)
- else:
- return PackIndex1(path, file=f, contents=contents, size=size)
- def bisect_find_sha(start, end, sha, unpack_name):
- """Find a SHA in a data blob with sorted SHAs.
- :param start: Start index of range to search
- :param end: End index of range to search
- :param sha: Sha to find
- :param unpack_name: Callback to retrieve SHA by index
- :return: Index of the SHA, or None if it wasn't found
- """
- assert start <= end
- while start <= end:
- i = (start + end) // 2
- file_sha = unpack_name(i)
- if file_sha < sha:
- start = i + 1
- elif file_sha > sha:
- end = i - 1
- else:
- return i
- return None
- class PackIndex(object):
- """An index in to a packfile.
- Given a sha id of an object a pack index can tell you the location in the
- packfile of that object if it has it.
- """
- def __eq__(self, other):
- if not isinstance(other, PackIndex):
- return False
- for (name1, _, _), (name2, _, _) in izip(self.iterentries(),
- other.iterentries()):
- if name1 != name2:
- return False
- return True
- def __ne__(self, other):
- return not self.__eq__(other)
- def __len__(self):
- """Return the number of entries in this pack index."""
- raise NotImplementedError(self.__len__)
- def __iter__(self):
- """Iterate over the SHAs in this pack."""
- return imap(sha_to_hex, self._itersha())
- def iterentries(self):
- """Iterate over the entries in this pack index.
- :return: iterator over tuples with object name, offset in packfile and
- crc32 checksum.
- """
- raise NotImplementedError(self.iterentries)
- def get_pack_checksum(self):
- """Return the SHA1 checksum stored for the corresponding packfile.
- :return: 20-byte binary digest
- """
- raise NotImplementedError(self.get_pack_checksum)
- def object_index(self, sha):
- """Return the index in to the corresponding packfile for the object.
- Given the name of an object it will return the offset that object
- lives at within the corresponding pack file. If the pack file doesn't
- have the object then None will be returned.
- """
- if len(sha) == 40:
- sha = hex_to_sha(sha)
- return self._object_index(sha)
- def _object_index(self, sha):
- """See object_index.
- :param sha: A *binary* SHA string (20 bytes long).
- """
- raise NotImplementedError(self._object_index)
- def objects_sha1(self):
- """Return the hex SHA1 over all the shas of all objects in this pack.
- :note: This is used for the filename of the pack.
- """
- return iter_sha1(self._itersha())
- def _itersha(self):
- """Yield all the SHA1's of the objects in the index, sorted."""
- raise NotImplementedError(self._itersha)
- class MemoryPackIndex(PackIndex):
- """Pack index that is stored entirely in memory."""
- def __init__(self, entries, pack_checksum=None):
- """Create a new MemoryPackIndex.
- :param entries: Sequence of name, idx, crc32 (sorted)
- :param pack_checksum: Optional pack checksum
- """
- self._by_sha = {}
- for name, idx, crc32 in entries:
- self._by_sha[name] = idx
- self._entries = entries
- self._pack_checksum = pack_checksum
- def get_pack_checksum(self):
- return self._pack_checksum
- def __len__(self):
- return len(self._entries)
- def _object_index(self, sha):
- # _by_sha maps name -> offset directly, so no subscripting is needed.
- return self._by_sha[sha]
- def _itersha(self):
- return iter(self._by_sha)
- def iterentries(self):
- return iter(self._entries)
- class FilePackIndex(PackIndex):
- """Pack index that is based on a file.
- The index file starts with a fan-out table: 256 4-byte entries, one per
- possible first byte of a SHA. The entry for byte N holds the number of
- objects whose SHA starts with a byte <= N, i.e. the end of the group of
- SHAs that share first byte N; the entry for N-1 gives the start of that
- group. Within a group the SHAs are sorted, so a lookup computes the start
- and end offsets from the fan-out table and then bisects within that range
- to see whether the value is present.
- """
- def __init__(self, filename, file=None, contents=None, size=None):
- """Create a pack index object.
- Provide it with the name of the index file to consider, and it will map
- it whenever required.
- """
- self._filename = filename
- # Take the size now, so it can be checked each time we map the file to
- # ensure that it hasn't changed.
- if file is None:
- self._file = GitFile(filename, 'rb')
- else:
- self._file = file
- if contents is None:
- self._contents, self._size = _load_file_contents(self._file, size)
- else:
- self._contents, self._size = (contents, size)
- def __eq__(self, other):
- # Quick optimization:
- if (isinstance(other, FilePackIndex) and
- self._fan_out_table != other._fan_out_table):
- return False
- return super(FilePackIndex, self).__eq__(other)
- def close(self):
- self._file.close()
- if getattr(self._contents, "close", None) is not None:
- self._contents.close()
- def __len__(self):
- """Return the number of entries in this pack index."""
- return self._fan_out_table[-1]
- def _unpack_entry(self, i):
- """Unpack the i-th entry in the index file.
- :return: Tuple with object name (SHA), offset in pack file and CRC32
- checksum (if known).
- """
- raise NotImplementedError(self._unpack_entry)
- def _unpack_name(self, i):
- """Unpack the i-th name from the index file."""
- raise NotImplementedError(self._unpack_name)
- def _unpack_offset(self, i):
- """Unpack the i-th object offset from the index file."""
- raise NotImplementedError(self._unpack_offset)
- def _unpack_crc32_checksum(self, i):
- """Unpack the crc32 checksum for the i-th object from the index file."""
- raise NotImplementedError(self._unpack_crc32_checksum)
- def _itersha(self):
- for i in range(len(self)):
- yield self._unpack_name(i)
- def iterentries(self):
- """Iterate over the entries in this pack index.
- :return: iterator over tuples with object name, offset in packfile and
- crc32 checksum.
- """
- for i in range(len(self)):
- yield self._unpack_entry(i)
- def _read_fan_out_table(self, start_offset):
- ret = []
- for i in range(0x100):
- fanout_entry = self._contents[start_offset+i*4:start_offset+(i+1)*4]
- ret.append(struct.unpack('>L', fanout_entry)[0])
- return ret
- def check(self):
- """Check that the stored checksum matches the actual checksum."""
- actual = self.calculate_checksum()
- stored = self.get_stored_checksum()
- if actual != stored:
- raise ChecksumMismatch(stored, actual)
- def calculate_checksum(self):
- """Calculate the SHA1 checksum over this pack index.
- :return: This is a 20-byte binary digest
- """
- return sha1(self._contents[:-20]).digest()
- def get_pack_checksum(self):
- """Return the SHA1 checksum stored for the corresponding packfile.
- :return: 20-byte binary digest
- """
- return bytes(self._contents[-40:-20])
- def get_stored_checksum(self):
- """Return the SHA1 checksum stored for this index.
- :return: 20-byte binary digest
- """
- return bytes(self._contents[-20:])
- def _object_index(self, sha):
- """See object_index.
- :param sha: A *binary* SHA string (20 bytes long).
- """
- assert len(sha) == 20
- idx = ord(sha[:1])
- if idx == 0:
- start = 0
- else:
- start = self._fan_out_table[idx-1]
- end = self._fan_out_table[idx]
- i = bisect_find_sha(start, end, sha, self._unpack_name)
- if i is None:
- raise KeyError(sha)
- return self._unpack_offset(i)
- class PackIndex1(FilePackIndex):
- """Version 1 Pack Index file."""
- def __init__(self, filename, file=None, contents=None, size=None):
- super(PackIndex1, self).__init__(filename, file, contents, size)
- self.version = 1
- self._fan_out_table = self._read_fan_out_table(0)
- def _unpack_entry(self, i):
- (offset, name) = unpack_from('>L20s', self._contents,
- (0x100 * 4) + (i * 24))
- return (name, offset, None)
- def _unpack_name(self, i):
- offset = (0x100 * 4) + (i * 24) + 4
- return self._contents[offset:offset+20]
- def _unpack_offset(self, i):
- offset = (0x100 * 4) + (i * 24)
- return unpack_from('>L', self._contents, offset)[0]
- def _unpack_crc32_checksum(self, i):
- # Not stored in v1 index files
- return None
- class PackIndex2(FilePackIndex):
- """Version 2 Pack Index file."""
- def __init__(self, filename, file=None, contents=None, size=None):
- super(PackIndex2, self).__init__(filename, file, contents, size)
- if self._contents[:4] != b'\377tOc':
- raise AssertionError('Not a v2 pack index file')
- (self.version, ) = unpack_from(b'>L', self._contents, 4)
- if self.version != 2:
- raise AssertionError('Version was %d' % self.version)
- self._fan_out_table = self._read_fan_out_table(8)
- self._name_table_offset = 8 + 0x100 * 4
- self._crc32_table_offset = self._name_table_offset + 20 * len(self)
- self._pack_offset_table_offset = (self._crc32_table_offset +
- 4 * len(self))
- self._pack_offset_largetable_offset = (self._pack_offset_table_offset +
- 4 * len(self))
- def _unpack_entry(self, i):
- return (self._unpack_name(i), self._unpack_offset(i),
- self._unpack_crc32_checksum(i))
- def _unpack_name(self, i):
- offset = self._name_table_offset + i * 20
- return self._contents[offset:offset+20]
- def _unpack_offset(self, i):
- offset = self._pack_offset_table_offset + i * 4
- offset = unpack_from('>L', self._contents, offset)[0]
- if offset & (2**31):
- offset = self._pack_offset_largetable_offset + (offset&(2**31-1)) * 8
- offset = unpack_from('>Q', self._contents, offset)[0]
- return offset
- def _unpack_crc32_checksum(self, i):
- return unpack_from('>L', self._contents,
- self._crc32_table_offset + i * 4)[0]
- def read_pack_header(read):
- """Read the header of a pack file.
- :param read: Read function
- :return: Tuple of (pack version, number of objects). If no data is available
- to read, returns (None, None).
- """
- header = read(12)
- if not header:
- return None, None
- if header[:4] != b'PACK':
- raise AssertionError('Invalid pack header %r' % header)
- (version,) = unpack_from(b'>L', header, 4)
- if version not in (2, 3):
- raise AssertionError('Version was %d' % version)
- (num_objects,) = unpack_from(b'>L', header, 8)
- return (version, num_objects)
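- # Illustrative example: the 12-byte header of a version-2 pack holding three
- # objects parses as follows (BytesIO stands in for a real stream):
- #
- #     from io import BytesIO
- #     hdr = b'PACK\x00\x00\x00\x02\x00\x00\x00\x03'
- #     assert read_pack_header(BytesIO(hdr).read) == (2, 3)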
- def chunks_length(chunks):
- if isinstance(chunks, bytes):
- return len(chunks)
- else:
- return sum(imap(len, chunks))
- def unpack_object(read_all, read_some=None, compute_crc32=False,
- include_comp=False, zlib_bufsize=_ZLIB_BUFSIZE):
- """Unpack a Git object.
- :param read_all: Read function that blocks until the number of requested
- bytes are read.
- :param read_some: Read function that returns at least one byte, but may not
- return the number of bytes requested.
- :param compute_crc32: If True, compute the CRC32 of the compressed data. If
- False, the returned CRC32 will be None.
- :param include_comp: If True, include compressed data in the result.
- :param zlib_bufsize: An optional buffer size for zlib operations.
- :return: A tuple of (unpacked, unused), where unused is the unused data
- leftover from decompression, and unpacked is an UnpackedObject with
- the following attrs set:
- * obj_chunks (for non-delta types)
- * pack_type_num
- * delta_base (for delta types)
- * comp_chunks (if include_comp is True)
- * decomp_chunks
- * decomp_len
- * crc32 (if compute_crc32 is True)
- """
- if read_some is None:
- read_some = read_all
- if compute_crc32:
- crc32 = 0
- else:
- crc32 = None
- bytes, crc32 = take_msb_bytes(read_all, crc32=crc32)
- type_num = (bytes[0] >> 4) & 0x07
- size = bytes[0] & 0x0f
- for i, byte in enumerate(bytes[1:]):
- size += (byte & 0x7f) << ((i * 7) + 4)
- raw_base = len(bytes)
- if type_num == OFS_DELTA:
- bytes, crc32 = take_msb_bytes(read_all, crc32=crc32)
- raw_base += len(bytes)
- if bytes[-1] & 0x80:
- raise AssertionError
- delta_base_offset = bytes[0] & 0x7f
- for byte in bytes[1:]:
- delta_base_offset += 1
- delta_base_offset <<= 7
- delta_base_offset += (byte & 0x7f)
- delta_base = delta_base_offset
- elif type_num == REF_DELTA:
- delta_base = read_all(20)
- if compute_crc32:
- crc32 = binascii.crc32(delta_base, crc32)
- raw_base += 20
- else:
- delta_base = None
- unpacked = UnpackedObject(type_num, delta_base, size, crc32)
- unused = read_zlib_chunks(read_some, unpacked, buffer_size=zlib_bufsize,
- include_comp=include_comp)
- return unpacked, unused
- def _compute_object_size(value):
- """Compute the size of a unresolved object for use with LRUSizeCache."""
- (num, obj) = value
- if num in DELTA_TYPES:
- return chunks_length(obj[1])
- return chunks_length(obj)
- class PackStreamReader(object):
- """Class to read a pack stream.
- The pack is read from a ReceivableProtocol using read() or recv() as
- appropriate.
- """
- def __init__(self, read_all, read_some=None, zlib_bufsize=_ZLIB_BUFSIZE):
- self.read_all = read_all
- if read_some is None:
- self.read_some = read_all
- else:
- self.read_some = read_some
- self.sha = sha1()
- self._offset = 0
- self._rbuf = BytesIO()
- # trailer is a deque to avoid memory allocation on small reads
- self._trailer = deque()
- self._zlib_bufsize = zlib_bufsize
- def _read(self, read, size):
- """Read up to size bytes using the given callback.
- As a side effect, update the verifier's hash (excluding the last 20
- bytes read).
- :param read: The read callback to read from.
- :param size: The maximum number of bytes to read; the particular
- behavior is callback-specific.
- """
- data = read(size)
- # maintain a trailer of the last 20 bytes we've read
- n = len(data)
- self._offset += n
- tn = len(self._trailer)
- if n >= 20:
- to_pop = tn
- to_add = 20
- else:
- to_pop = max(n + tn - 20, 0)
- to_add = n
- self.sha.update(bytes(bytearray([self._trailer.popleft() for _ in range(to_pop)])))
- self._trailer.extend(data[-to_add:])
- # hash everything but the trailer
- self.sha.update(data[:-to_add])
- return data
- def _buf_len(self):
- buf = self._rbuf
- start = buf.tell()
- buf.seek(0, SEEK_END)
- end = buf.tell()
- buf.seek(start)
- return end - start
- @property
- def offset(self):
- return self._offset - self._buf_len()
- def read(self, size):
- """Read, blocking until size bytes are read."""
- buf_len = self._buf_len()
- if buf_len >= size:
- return self._rbuf.read(size)
- buf_data = self._rbuf.read()
- self._rbuf = BytesIO()
- return buf_data + self._read(self.read_all, size - buf_len)
- def recv(self, size):
- """Read up to size bytes, blocking until one byte is read."""
- buf_len = self._buf_len()
- if buf_len:
- data = self._rbuf.read(size)
- if size >= buf_len:
- self._rbuf = BytesIO()
- return data
- return self._read(self.read_some, size)
- def __len__(self):
- return self._num_objects
- def read_objects(self, compute_crc32=False):
- """Read the objects in this pack file.
- :param compute_crc32: If True, compute the CRC32 of the compressed
- data. If False, the returned CRC32 will be None.
- :return: Iterator over UnpackedObjects with the following members set:
- offset
- obj_type_num
- obj_chunks (for non-delta types)
- delta_base (for delta types)
- decomp_chunks
- decomp_len
- crc32 (if compute_crc32 is True)
- :raise ChecksumMismatch: if the checksum of the pack contents does not
- match the checksum in the pack trailer.
- :raise zlib.error: if an error occurred during zlib decompression.
- :raise IOError: if an error occurred writing to the output file.
- """
- pack_version, self._num_objects = read_pack_header(self.read)
- if pack_version is None:
- return
- for i in range(self._num_objects):
- offset = self.offset
- unpacked, unused = unpack_object(
- self.read, read_some=self.recv, compute_crc32=compute_crc32,
- zlib_bufsize=self._zlib_bufsize)
- unpacked.offset = offset
- # prepend any unused data to current read buffer
- buf = BytesIO()
- buf.write(unused)
- buf.write(self._rbuf.read())
- buf.seek(0)
- self._rbuf = buf
- yield unpacked
- if self._buf_len() < 20:
- # If the read buffer is full, then the last read() got the whole
- # trailer off the wire. If not, it means there is still some of the
- # trailer to read. We need to read() all 20 bytes; N come from the
- # read buffer and (20 - N) come from the wire.
- self.read(20)
- pack_sha = bytearray(self._trailer)
- if pack_sha != self.sha.digest():
- raise ChecksumMismatch(sha_to_hex(pack_sha), self.sha.hexdigest())
- class PackStreamCopier(PackStreamReader):
- """Class to verify a pack stream as it is being read.
- The pack is read from a ReceivableProtocol using read() or recv() as
- appropriate and written out to the given file-like object.
- """
- def __init__(self, read_all, read_some, outfile, delta_iter=None):
- """Initialize the copier.
- :param read_all: Read function that blocks until the number of requested
- bytes are read.
- :param read_some: Read function that returns at least one byte, but may
- not return the number of bytes requested.
- :param outfile: File-like object to write output through.
- :param delta_iter: Optional DeltaChainIterator to record deltas as we
- read them.
- """
- super(PackStreamCopier, self).__init__(read_all, read_some=read_some)
- self.outfile = outfile
- self._delta_iter = delta_iter
- def _read(self, read, size):
- """Read data from the read callback and write it to the file."""
- data = super(PackStreamCopier, self)._read(read, size)
- self.outfile.write(data)
- return data
- def verify(self):
- """Verify a pack stream and write it to the output file.
- See PackStreamReader.read_objects for a list of exceptions this may
- throw.
- """
- if self._delta_iter:
- for unpacked in self.read_objects():
- self._delta_iter.record(unpacked)
- else:
- for _ in self.read_objects():
- pass
- def obj_sha(type, chunks):
- """Compute the SHA for a numeric type and object chunks."""
- sha = sha1()
- sha.update(object_header(type, chunks_length(chunks)))
- if isinstance(chunks, bytes):
- sha.update(chunks)
- else:
- for chunk in chunks:
- sha.update(chunk)
- return sha.digest()
- def compute_file_sha(f, start_ofs=0, end_ofs=0, buffer_size=1<<16):
- """Hash a portion of a file into a new SHA.
- :param f: A file-like object to read from that supports seek().
- :param start_ofs: The offset in the file to start reading at.
- :param end_ofs: The offset in the file to end reading at, relative to the
- end of the file.
- :param buffer_size: A buffer size for reading.
- :return: A new SHA object updated with data read from the file.
- """
- sha = sha1()
- f.seek(0, SEEK_END)
- length = f.tell()
- if (end_ofs < 0 and length + end_ofs < start_ofs) or end_ofs > length:
- raise AssertionError(
- "Attempt to read beyond file length. "
- "start_ofs: %d, end_ofs: %d, file length: %d" % (
- start_ofs, end_ofs, length))
- todo = length + end_ofs - start_ofs
- f.seek(start_ofs)
- while todo:
- data = f.read(min(todo, buffer_size))
- sha.update(data)
- todo -= len(data)
- return sha
- class PackData(object):
- """The data contained in a packfile.
- Pack files can be accessed both sequentially for exploding a pack, and
- directly with the help of an index to retrieve a specific object.
- The objects within are either complete or a delta against another.
- The header is variable length. If the MSB of a byte is set then the
- subsequent byte is still part of the header.
- In the first byte, the three bits after the MSB give the type, which
- tells you the type of object and whether it is a delta; the low 4 bits
- are the lowest bits of the size. In each subsequent byte the low 7 bits
- are the next more-significant bits of the size, i.e. the last byte of
- the header contributes the most significant bits of the size.
- For the complete objects the data is stored as zlib deflated data.
- The size in the header is the uncompressed object size, so to uncompress
- you need to just keep feeding data to zlib until you get an object back,
- or it errors on bad data. This is done here by just giving the complete
- buffer from the start of the deflated object on. This is bad, but until I
- get mmap sorted out it will have to do.
- Currently there are no integrity checks done. Also no attempt is made to
- try and detect the delta case, or a request for an object at the wrong
- position. It will all just throw a zlib or KeyError.
- """
- def __init__(self, filename, file=None, size=None):
- """Create a PackData object representing the pack in the given filename.
- The file must exist and stay readable until the object is disposed of. It
- must also stay the same size. It will be mapped whenever needed.
- Currently there is a restriction on the size of the pack as the python
- mmap implementation is flawed.
- """
- self._filename = filename
- self._size = size
- self._header_size = 12
- if file is None:
- self._file = GitFile(self._filename, 'rb')
- else:
- self._file = file
- (version, self._num_objects) = read_pack_header(self._file.read)
- self._offset_cache = LRUSizeCache(1024*1024*20,
- compute_size=_compute_object_size)
- self.pack = None
- @property
- def filename(self):
- return os.path.basename(self._filename)
- @classmethod
- def from_file(cls, file, size):
- return cls(str(file), file=file, size=size)
- @classmethod
- def from_path(cls, path):
- return cls(filename=path)
- def close(self):
- self._file.close()
- def __enter__(self):
- return self
- def __exit__(self, exc_type, exc_val, exc_tb):
- self.close()
- def _get_size(self):
- if self._size is not None:
- return self._size
- self._size = os.path.getsize(self._filename)
- if self._size < self._header_size:
- errmsg = ('%s is too small for a packfile (%d < %d)' %
- (self._filename, self._size, self._header_size))
- raise AssertionError(errmsg)
- return self._size
- def __len__(self):
- """Returns the number of objects in this pack."""
- return self._num_objects
- def calculate_checksum(self):
- """Calculate the checksum for this pack.
- :return: 20-byte binary SHA1 digest
- """
- return compute_file_sha(self._file, end_ofs=-20).digest()
- def get_ref(self, sha):
- """Get the object for a ref SHA, only looking in this pack."""
- # TODO: cache these results
- if self.pack is None:
- raise KeyError(sha)
- try:
- offset = self.pack.index.object_index(sha)
- except KeyError:
- offset = None
- if offset:
- type, obj = self.get_object_at(offset)
- elif self.pack is not None and self.pack.resolve_ext_ref:
- type, obj = self.pack.resolve_ext_ref(sha)
- else:
- raise KeyError(sha)
- return offset, type, obj
- def resolve_object(self, offset, type, obj, get_ref=None):
- """Resolve an object, possibly resolving deltas when necessary.
- :return: Tuple with object type and contents.
- """
- if type not in DELTA_TYPES:
- return type, obj
- if get_ref is None:
- get_ref = self.get_ref
- if type == OFS_DELTA:
- (delta_offset, delta) = obj
- # TODO: clean up asserts and replace with nicer error messages
- assert isinstance(offset, int) or isinstance(offset, long)
- assert isinstance(delta_offset, int) or isinstance(delta_offset, long)
- base_offset = offset-delta_offset
- type, base_obj = self.get_object_at(base_offset)
- assert isinstance(type, int)
- elif type == REF_DELTA:
- (basename, delta) = obj
- assert isinstance(basename, bytes) and len(basename) == 20
- base_offset, type, base_obj = get_ref(basename)
- assert isinstance(type, int)
- type, base_chunks = self.resolve_object(base_offset, type, base_obj)
- chunks = apply_delta(base_chunks, delta)
- # TODO(dborowitz): This can result in poor performance if large base
- # objects are separated from deltas in the pack. We should reorganize
- # so that we apply deltas to all objects in a chain one after the other
- # to optimize cache performance.
- if offset is not None:
- self._offset_cache[offset] = type, chunks
- return type, chunks
- def iterobjects(self, progress=None, compute_crc32=True):
- self._file.seek(self._header_size)
- for i in range(1, self._num_objects + 1):
- offset = self._file.tell()
- unpacked, unused = unpack_object(
- self._file.read, compute_crc32=compute_crc32)
- if progress is not None:
- progress(i, self._num_objects)
- yield (offset, unpacked.pack_type_num, unpacked._obj(),
- unpacked.crc32)
- self._file.seek(-len(unused), SEEK_CUR) # Back up over unused data.
- def _iter_unpacked(self):
- # TODO(dborowitz): Merge this with iterobjects, if we can change its
- # return type.
- self._file.seek(self._header_size)
- if self._num_objects is None:
- return
- for _ in range(self._num_objects):
- offset = self._file.tell()
- unpacked, unused = unpack_object(
- self._file.read, compute_crc32=False)
- unpacked.offset = offset
- yield unpacked
- self._file.seek(-len(unused), SEEK_CUR) # Back up over unused data.
- def iterentries(self, progress=None):
- """Yield entries summarizing the contents of this pack.
- :param progress: Progress function, called with current and total
- object count.
- :return: iterator of tuples with (sha, offset, crc32)
- """
- num_objects = self._num_objects
- resolve_ext_ref = (
- self.pack.resolve_ext_ref if self.pack is not None else None)
- indexer = PackIndexer.for_pack_data(
- self, resolve_ext_ref=resolve_ext_ref)
- for i, result in enumerate(indexer):
- if progress is not None:
- progress(i, num_objects)
- yield result
- def sorted_entries(self, progress=None):
- """Return entries in this pack, sorted by SHA.
- :param progress: Progress function, called with current and total
- object count
- :return: List of tuples with (sha, offset, crc32)
- """
- ret = list(self.iterentries(progress=progress))
- ret.sort()
- return ret
- def create_index_v1(self, filename, progress=None):
- """Create a version 1 file for this data file.
- :param filename: Index filename.
- :param progress: Progress report function
- :return: Checksum of index file
- """
- entries = self.sorted_entries(progress=progress)
- with GitFile(filename, 'wb') as f:
- return write_pack_index_v1(f, entries, self.calculate_checksum())
- def create_index_v2(self, filename, progress=None):
- """Create a version 2 index file for this data file.
- :param filename: Index filename.
- :param progress: Progress report function
- :return: Checksum of index file
- """
- entries = self.sorted_entries(progress=progress)
- with GitFile(filename, 'wb') as f:
- return write_pack_index_v2(f, entries, self.calculate_checksum())
- def create_index(self, filename, progress=None,
- version=2):
- """Create an index file for this data file.
- :param filename: Index filename.
- :param progress: Progress report function
- :return: Checksum of index file
- """
- if version == 1:
- return self.create_index_v1(filename, progress)
- elif version == 2:
- return self.create_index_v2(filename, progress)
- else:
- raise ValueError('unknown index format %d' % version)
- def get_stored_checksum(self):
- """Return the expected checksum stored in this pack."""
- self._file.seek(-20, SEEK_END)
- return self._file.read(20)
- def check(self):
- """Check the consistency of this pack."""
- actual = self.calculate_checksum()
- stored = self.get_stored_checksum()
- if actual != stored:
- raise ChecksumMismatch(stored, actual)
- def get_object_at(self, offset):
- """Given an offset in to the packfile return the object that is there.
- Using the associated index the location of an object can be looked up,
- and then the packfile can be asked directly for that object using this
- function.
- """
- try:
- return self._offset_cache[offset]
- except KeyError:
- pass
- assert offset >= self._header_size
- self._file.seek(offset)
- unpacked, _ = unpack_object(self._file.read)
- return (unpacked.pack_type_num, unpacked._obj())
- class DeltaChainIterator(object):
- """Abstract iterator over pack data based on delta chains.
- Each object in the pack is guaranteed to be inflated exactly once,
- regardless of how many objects reference it as a delta base. As a result,
- memory usage is proportional to the length of the longest delta chain.
- Subclasses can override _result to define the result type of the iterator.
- By default, results are UnpackedObjects with the following members set:
- * offset
- * obj_type_num
- * obj_chunks
- * pack_type_num
- * delta_base (for delta types)
- * comp_chunks (if _include_comp is True)
- * decomp_chunks
- * decomp_len
- * crc32 (if _compute_crc32 is True)
- """
- _compute_crc32 = False
- _include_comp = False
- def __init__(self, file_obj, resolve_ext_ref=None):
- self._file = file_obj
- self._resolve_ext_ref = resolve_ext_ref
- self._pending_ofs = defaultdict(list)
- self._pending_ref = defaultdict(list)
- self._full_ofs = []
- self._shas = {}
- self._ext_refs = []
- @classmethod
- def for_pack_data(cls, pack_data, resolve_ext_ref=None):
- walker = cls(None, resolve_ext_ref=resolve_ext_ref)
- walker.set_pack_data(pack_data)
- for unpacked in pack_data._iter_unpacked():
- walker.record(unpacked)
- return walker
- def record(self, unpacked):
- type_num = unpacked.pack_type_num
- offset = unpacked.offset
- if type_num == OFS_DELTA:
- base_offset = offset - unpacked.delta_base
- self._pending_ofs[base_offset].append(offset)
- elif type_num == REF_DELTA:
- self._pending_ref[unpacked.delta_base].append(offset)
- else:
- self._full_ofs.append((offset, type_num))
- def set_pack_data(self, pack_data):
- self._file = pack_data._file
- def _walk_all_chains(self):
- for offset, type_num in self._full_ofs:
- for result in self._follow_chain(offset, type_num, None):
- yield result
- for result in self._walk_ref_chains():
- yield result
- assert not self._pending_ofs
- def _ensure_no_pending(self):
- if self._pending_ref:
- raise KeyError([sha_to_hex(s) for s in self._pending_ref])
- def _walk_ref_chains(self):
- if not self._resolve_ext_ref:
- self._ensure_no_pending()
- return
- for base_sha, pending in sorted(self._pending_ref.items()):
- if base_sha not in self._pending_ref:
- continue
- try:
- type_num, chunks = self._resolve_ext_ref(base_sha)
- except KeyError:
- # Not an external ref, but may depend on one. Either it will get
- # popped via a _follow_chain call, or we will raise an error
- # below.
- continue
- self._ext_refs.append(base_sha)
- self._pending_ref.pop(base_sha)
- for new_offset in pending:
- for result in self._follow_chain(new_offset, type_num, chunks):
- yield result
- self._ensure_no_pending()
- def _result(self, unpacked):
- return unpacked
- def _resolve_object(self, offset, obj_type_num, base_chunks):
- self._file.seek(offset)
- unpacked, _ = unpack_object(
- self._file.read, include_comp=self._include_comp,
- compute_crc32=self._compute_crc32)
- unpacked.offset = offset
- if base_chunks is None:
- assert unpacked.pack_type_num == obj_type_num
- else:
- assert unpacked.pack_type_num in DELTA_TYPES
- unpacked.obj_type_num = obj_type_num
- unpacked.obj_chunks = apply_delta(base_chunks,
- unpacked.decomp_chunks)
- return unpacked
- def _follow_chain(self, offset, obj_type_num, base_chunks):
- # Unlike PackData.get_object_at, there is no need to cache offsets as
- # this approach by design inflates each object exactly once.
- unpacked = self._resolve_object(offset, obj_type_num, base_chunks)
- yield self._result(unpacked)
- pending = chain(self._pending_ofs.pop(unpacked.offset, []),
- self._pending_ref.pop(unpacked.sha(), []))
- for new_offset in pending:
- for new_result in self._follow_chain(
- new_offset, unpacked.obj_type_num, unpacked.obj_chunks):
- yield new_result
- def __iter__(self):
- return self._walk_all_chains()
- def ext_refs(self):
- return self._ext_refs
- class PackIndexer(DeltaChainIterator):
- """Delta chain iterator that yields index entries."""
- _compute_crc32 = True
- def _result(self, unpacked):
- return unpacked.sha(), unpacked.offset, unpacked.crc32
- class PackInflater(DeltaChainIterator):
- """Delta chain iterator that yields ShaFile objects."""
- def _result(self, unpacked):
- return unpacked.sha_file()
- class SHA1Reader(object):
- """Wrapper around a file-like object that remembers the SHA1 of its data."""
- def __init__(self, f):
- self.f = f
- self.sha1 = sha1(b'')
- def read(self, num=None):
- data = self.f.read(num)
- self.sha1.update(data)
- return data
- def check_sha(self):
- stored = self.f.read(20)
- if stored != self.sha1.digest():
- raise ChecksumMismatch(self.sha1.hexdigest(), sha_to_hex(stored))
- def close(self):
- return self.f.close()
- def tell(self):
- return self.f.tell()
- class SHA1Writer(object):
- """Wrapper around a file-like object that remembers the SHA1 of its data."""
- def __init__(self, f):
- self.f = f
- self.length = 0
- self.sha1 = sha1(b'')
- def write(self, data):
- self.sha1.update(data)
- self.f.write(data)
- self.length += len(data)
- def write_sha(self):
- sha = self.sha1.digest()
- assert len(sha) == 20
- self.f.write(sha)
- self.length += len(sha)
- return sha
- def close(self):
- sha = self.write_sha()
- self.f.close()
- return sha
- def offset(self):
- return self.length
- def tell(self):
- return self.f.tell()
- def pack_object_header(type_num, delta_base, size):
- """Create a pack object header for the given object info.
- :param type_num: Numeric type of the object.
- :param delta_base: Delta base offset or ref, or None for whole objects.
- :param size: Uncompressed object size.
- :return: A header for a packed object.
- """
- header = []
- c = (type_num << 4) | (size & 15)
- size >>= 4
- while size:
- header.append(c | 0x80)
- c = size & 0x7f
- size >>= 7
- header.append(c)
- if type_num == OFS_DELTA:
- ret = [delta_base & 0x7f]
- delta_base >>= 7
- while delta_base:
- delta_base -= 1
- ret.insert(0, 0x80 | (delta_base & 0x7f))
- delta_base >>= 7
- header.extend(ret)
- elif type_num == REF_DELTA:
- assert len(delta_base) == 20
- header += delta_base
- return bytearray(header)
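- # Illustrative example: a commit (type 1) of uncompressed size 21 gets the
- # two-byte header b'\x95\x01' -- 0x95 packs type 1 with the size's low
- # nibble (5) and the continuation bit, and 0x01 supplies the remaining size
- # bits (1 << 4 == 16; 16 + 5 == 21):
- #
- #     assert bytes(pack_object_header(1, None, 21)) == b'\x95\x01'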
- def write_pack_object(f, type, object, sha=None):
- """Write pack object to a file.
- :param f: File to write to
- :param type: Numeric type of the object
- :param object: Object to write
- :param sha: Optional hash object to update with the data written
- :return: CRC32 checksum of the data written
- """
- if type in DELTA_TYPES:
- delta_base, object = object
- else:
- delta_base = None
- header = bytes(pack_object_header(type, delta_base, len(object)))
- comp_data = zlib.compress(object)
- crc32 = 0
- for data in (header, comp_data):
- f.write(data)
- if sha is not None:
- sha.update(data)
- crc32 = binascii.crc32(data, crc32)
- return crc32 & 0xffffffff
- def write_pack(filename, objects, deltify=None, delta_window_size=None):
- """Write a new pack data file.
- :param filename: Path to the new pack file (without .pack extension)
- :param objects: Iterable of (object, path) tuples to write.
- Should provide __len__
- :param delta_window_size: Delta window size
- :param deltify: Whether to deltify pack objects
- :return: Tuple with checksum of pack file and index file
- """
- with GitFile(filename + b'.pack', 'wb') as f:
- entries, data_sum = write_pack_objects(f, objects,
- delta_window_size=delta_window_size, deltify=deltify)
- entries = [(k, v[0], v[1]) for (k, v) in entries.items()]
- entries.sort()
- with GitFile(filename + b'.idx', 'wb') as f:
- return data_sum, write_pack_index_v2(f, entries, data_sum)
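- # Hedged usage sketch (objects is assumed to be a list of ShaFile objects):
- #
- #     data_sum, idx_sum = write_pack(
- #         b'/tmp/mypack', [(obj, None) for obj in objects])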
- def write_pack_header(f, num_objects):
- """Write a pack header for the given number of objects."""
- f.write(b'PACK') # Pack header
- f.write(struct.pack(b'>L', 2)) # Pack version
- f.write(struct.pack(b'>L', num_objects)) # Number of objects in pack
- def deltify_pack_objects(objects, window_size=None):
- """Generate deltas for pack objects.
- :param objects: An iterable of (object, path) tuples to deltify.
- :param window_size: Window size; None for default
- :return: Iterator over type_num, object id, delta_base, content
- delta_base is None for full text entries
- """
- if window_size is None:
- window_size = DEFAULT_PACK_DELTA_WINDOW_SIZE
- # Build a list of objects ordered by the magic Linus heuristic
- # This helps us find good objects to diff against.
- magic = []
- for obj, path in objects:
- magic.append((obj.type_num, path, -obj.raw_length(), obj))
- magic.sort()
- possible_bases = deque()
- for type_num, path, neg_length, o in magic:
- raw = o.as_raw_string()
- winner = raw
- winner_base = None
- for base in possible_bases:
- if base.type_num != type_num:
- continue
- delta = create_delta(base.as_raw_string(), raw)
- if len(delta) < len(winner):
- winner_base = base.sha().digest()
- winner = delta
- yield type_num, o.sha().digest(), winner_base, winner
- possible_bases.appendleft(o)
- while len(possible_bases) > window_size:
- possible_bases.pop()
- def write_pack_objects(f, objects, delta_window_size=None, deltify=False):
- """Write a new pack data file.
- :param f: File to write to
- :param objects: Iterable of (object, path) tuples to write.
- Should provide __len__
- :param delta_window_size: Sliding window size for searching for deltas;
- set to None for the default window size.
- :param deltify: Whether to deltify objects
- :return: Dict mapping id -> (offset, crc32 checksum), pack checksum
- """
- if deltify:
- pack_contents = deltify_pack_objects(objects, delta_window_size)
- else:
- pack_contents = (
- (o.type_num, o.sha().digest(), None, o.as_raw_string())
- for (o, path) in objects)
- return write_pack_data(f, len(objects), pack_contents)
- def write_pack_data(f, num_records, records):
- """Write a new pack data file.
- :param f: File to write to
- :param num_records: Number of records
- :param records: Iterator over type_num, object_id, delta_base, raw
- :return: Dict mapping id -> (offset, crc32 checksum), pack checksum
- """
- # Write the pack
- entries = {}
- f = SHA1Writer(f)
- write_pack_header(f, num_records)
- for type_num, object_id, delta_base, raw in records:
- offset = f.offset()
- if delta_base is not None:
- try:
- base_offset, base_crc32 = entries[delta_base]
- except KeyError:
- type_num = REF_DELTA
- raw = (delta_base, raw)
- else:
- type_num = OFS_DELTA
- raw = (offset - base_offset, raw)
- crc32 = write_pack_object(f, type_num, raw)
- entries[object_id] = (offset, crc32)
- return entries, f.write_sha()
- def write_pack_index_v1(f, entries, pack_checksum):
- """Write a new pack index file.
- :param f: A file-like object to write to
- :param entries: List of tuples with object name (sha), offset_in_pack,
- and crc32_checksum.
- :param pack_checksum: Checksum of the pack file.
- :return: The SHA of the written index file
- """
- f = SHA1Writer(f)
- fan_out_table = defaultdict(lambda: 0)
- for (name, offset, entry_checksum) in entries:
- fan_out_table[ord(name[:1])] += 1
- # Fan-out table
- for i in range(0x100):
- f.write(struct.pack('>L', fan_out_table[i]))
- fan_out_table[i+1] += fan_out_table[i]
- for (name, offset, entry_checksum) in entries:
- if not (offset <= 0xffffffff):
- raise TypeError("pack format 1 only supports offsets < 2Gb")
- f.write(struct.pack('>L20s', offset, name))
- assert len(pack_checksum) == 20
- f.write(pack_checksum)
- return f.write_sha()
- def _delta_encode_size(size):
- ret = bytearray()
- c = size & 0x7f
- size >>= 7
- while size:
- ret.append(c | 0x80)
- c = size & 0x7f
- size >>= 7
- ret.append(c)
- return ret
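- # Example of the size varint above: 200 == 0b11001000 encodes as
- # b'\xc8\x01' -- the low 7 bits (0x48) with the continuation bit set,
- # then the remaining high bit:
- #
- #     assert bytes(_delta_encode_size(200)) == b'\xc8\x01'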
- # The length of delta compression copy operations in version 2 packs is limited
- # to 64K. To copy more, we use several copy operations. Version 3 packs allow
- # 24-bit lengths in copy operations, but we always make version 2 packs.
- _MAX_COPY_LEN = 0xffff
- def _encode_copy_operation(start, length):
- scratch = []
- op = 0x80
- for i in range(4):
- if start & 0xff << i*8:
- scratch.append((start >> i*8) & 0xff)
- op |= 1 << i
- for i in range(2):
- if length & 0xff << i*8:
- scratch.append((length >> i*8) & 0xff)
- op |= 1 << (4+i)
- return bytearray([op] + scratch)
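- # Example: copying 0x20 bytes from offset 0x1234 emits opcode 0x93 (copy,
- # with offset bytes 0-1 and length byte 0 present) followed by those
- # little-endian payload bytes:
- #
- #     assert _encode_copy_operation(0x1234, 0x20) == \
- #         bytearray([0x93, 0x34, 0x12, 0x20])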
- def create_delta(base_buf, target_buf):
- """Use python difflib to work out how to transform base_buf to target_buf.
- :param base_buf: Base buffer
- :param target_buf: Target buffer
- """
- assert isinstance(base_buf, bytes)
- assert isinstance(target_buf, bytes)
- out_buf = bytearray()
- # write delta header
- out_buf += _delta_encode_size(len(base_buf))
- out_buf += _delta_encode_size(len(target_buf))
- # write out delta opcodes
- seq = difflib.SequenceMatcher(a=base_buf, b=target_buf)
- for opcode, i1, i2, j1, j2 in seq.get_opcodes():
- # Git patch opcodes don't care about deletes!
- #if opcode == 'replace' or opcode == 'delete':
- # pass
- if opcode == 'equal':
- # If they are equal, unpacker will use data from base_buf
- # Write out an opcode that says what range to use
- copy_start = i1
- copy_len = i2 - i1
- while copy_len > 0:
- to_copy = min(copy_len, _MAX_COPY_LEN)
- out_buf += _encode_copy_operation(copy_start, to_copy)
- copy_start += to_copy
- copy_len -= to_copy
- if opcode == 'replace' or opcode == 'insert':
- # If we are replacing a range or adding one, then we just
- # output it to the stream (prefixed by its size)
- s = j2 - j1
- o = j1
- while s > 127:
- out_buf.append(127)
- out_buf += bytearray(target_buf[o:o+127])
- s -= 127
- o += 127
- out_buf.append(s)
- out_buf += bytearray(target_buf[o:o+s])
- return bytes(out_buf)
- def apply_delta(src_buf, delta):
- """Based on the similar function in git's patch-delta.c.
- :param src_buf: Source buffer
- :param delta: Delta instructions
- """
- if not isinstance(src_buf, bytes):
- src_buf = b''.join(src_buf)
- if not isinstance(delta, bytes):
- delta = b''.join(delta)
- out = []
- index = 0
- delta_length = len(delta)
- def get_delta_header_size(delta, index):
- size = 0
- i = 0
- while delta:
- cmd = ord(delta[index:index+1])
- index += 1
- size |= (cmd & ~0x80) << i
- i += 7
- if not cmd & 0x80:
- break
- return size, index
- src_size, index = get_delta_header_size(delta, index)
- dest_size, index = get_delta_header_size(delta, index)
- assert src_size == len(src_buf), '%d vs %d' % (src_size, len(src_buf))
- while index < delta_length:
- cmd = ord(delta[index:index+1])
- index += 1
- if cmd & 0x80:
- cp_off = 0
- for i in range(4):
- if cmd & (1 << i):
- x = ord(delta[index:index+1])
- index += 1
- cp_off |= x << (i * 8)
- cp_size = 0
- # Version 3 packs can contain copy sizes larger than 64K.
- for i in range(3):
- if cmd & (1 << (4+i)):
- x = ord(delta[index:index+1])
- index += 1
- cp_size |= x << (i * 8)
- if cp_size == 0:
- cp_size = 0x10000
- if (cp_off + cp_size < cp_size or
- cp_off + cp_size > src_size or
- cp_size > dest_size):
- break
- out.append(src_buf[cp_off:cp_off+cp_size])
- elif cmd != 0:
- out.append(delta[index:index+cmd])
- index += cmd
- else:
- raise ApplyDeltaError('Invalid opcode 0')
- if index != delta_length:
- raise ApplyDeltaError('delta not empty: %r' % delta[index:])
- if dest_size != chunks_length(out):
- raise ApplyDeltaError('dest size incorrect')
- return out
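- # Quick illustrative round trip through the two functions above (arbitrary
- # example buffers; apply_delta returns a list of chunks):
- #
- #     base = b'the quick brown fox jumped over the lazy dog'
- #     target = b'the quick brown fox jumps over the lazy dog!'
- #     delta = create_delta(base, target)
- #     assert b''.join(apply_delta(base, delta)) == target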
- def write_pack_index_v2(f, entries, pack_checksum):
- """Write a new pack index file.
- :param f: File-like object to write to
- :param entries: List of tuples with object name (sha), offset_in_pack, and
- crc32_checksum.
- :param pack_checksum: Checksum of the pack file.
- :return: The SHA of the index file written
- """
- f = SHA1Writer(f)
- f.write(b'\377tOc') # Magic!
- f.write(struct.pack('>L', 2))
- fan_out_table = defaultdict(lambda: 0)
- for (name, offset, entry_checksum) in entries:
- fan_out_table[ord(name[:1])] += 1
- # Fan-out table
- largetable = []
- for i in range(0x100):
- f.write(struct.pack(b'>L', fan_out_table[i]))
- fan_out_table[i+1] += fan_out_table[i]
- for (name, offset, entry_checksum) in entries:
- f.write(name)
- for (name, offset, entry_checksum) in entries:
- f.write(struct.pack(b'>L', entry_checksum))
- for (name, offset, entry_checksum) in entries:
- if offset < 2**31:
- f.write(struct.pack(b'>L', offset))
- else:
- f.write(struct.pack(b'>L', 2**31 + len(largetable)))
- largetable.append(offset)
- for offset in largetable:
- f.write(struct.pack(b'>Q', offset))
- assert len(pack_checksum) == 20
- f.write(pack_checksum)
- return f.write_sha()
- class Pack(object):
- """A Git pack object."""
- def __init__(self, basename, resolve_ext_ref=None):
- self._basename = basename
- self._data = None
- self._idx = None
- self._idx_path = self._basename + b'.idx'
- self._data_path = self._basename + b'.pack'
- self._data_load = lambda: PackData(self._data_path)
- self._idx_load = lambda: load_pack_index(self._idx_path)
- self.resolve_ext_ref = resolve_ext_ref
- @classmethod
- def from_lazy_objects(cls, data_fn, idx_fn):
- """Create a new pack object from callables to load pack data and
- index objects."""
- ret = cls(b'')
- ret._data_load = data_fn
- ret._idx_load = idx_fn
- return ret
- @classmethod
- def from_objects(cls, data, idx):
- """Create a new pack object from pack data and index objects."""
- ret = cls(b'')
- ret._data_load = lambda: data
- ret._idx_load = lambda: idx
- return ret
- def name(self):
- """The SHA over the SHAs of the objects in this pack."""
- return self.index.objects_sha1()
- @property
- def data(self):
- """The pack data object being used."""
- if self._data is None:
- self._data = self._data_load()
- self._data.pack = self
- self.check_length_and_checksum()
- return self._data
- @property
- def index(self):
- """The index being used.
- :note: This may be an in-memory index
- """
- if self._idx is None:
- self._idx = self._idx_load()
- return self._idx
- def close(self):
- if self._data is not None:
- self._data.close()
- if self._idx is not None:
- self._idx.close()
- def __enter__(self):
- return self
- def __exit__(self, exc_type, exc_val, exc_tb):
- self.close()
- def __eq__(self, other):
- return isinstance(self, type(other)) and self.index == other.index
- def __len__(self):
- """Number of entries in this pack."""
- return len(self.index)
- def __repr__(self):
- return '%s(%r)' % (self.__class__.__name__, self._basename)
- def __iter__(self):
- """Iterate over all the sha1s of the objects in this pack."""
- return iter(self.index)
- def check_length_and_checksum(self):
- """Sanity check the length and checksum of the pack index and data."""
- assert len(self.index) == len(self.data)
- idx_stored_checksum = self.index.get_pack_checksum()
- data_stored_checksum = self.data.get_stored_checksum()
- if idx_stored_checksum != data_stored_checksum:
- raise ChecksumMismatch(sha_to_hex(idx_stored_checksum),
- sha_to_hex(data_stored_checksum))
- def check(self):
- """Check the integrity of this pack.
- :raise ChecksumMismatch: if a checksum for the index or data is wrong
- """
- self.index.check()
- self.data.check()
- for obj in self.iterobjects():
- obj.check()
- # TODO: object connectivity checks
- def get_stored_checksum(self):
- return self.data.get_stored_checksum()
- def __contains__(self, sha1):
- """Check whether this pack contains a particular SHA1."""
- try:
- self.index.object_index(sha1)
- return True
- except KeyError:
- return False
- def get_raw(self, sha1):
- offset = self.index.object_index(sha1)
- obj_type, obj = self.data.get_object_at(offset)
- type_num, chunks = self.data.resolve_object(offset, obj_type, obj)
- return type_num, b''.join(chunks)
- def __getitem__(self, sha1):
- """Retrieve the specified SHA1."""
- type, uncomp = self.get_raw(sha1)
- return ShaFile.from_raw_string(type, uncomp, sha=sha1)
- def iterobjects(self):
- """Iterate over the objects in this pack."""
- return iter(PackInflater.for_pack_data(
- self.data, resolve_ext_ref=self.resolve_ext_ref))
- def pack_tuples(self):
- """Provide an iterable for use with write_pack_objects.
- :return: Object that can iterate over (object, path) tuples
- and provides __len__
- """
- class PackTupleIterable(object):
- def __init__(self, pack):
- self.pack = pack
- def __len__(self):
- return len(self.pack)
- def __iter__(self):
- return ((o, None) for o in self.pack.iterobjects())
- return PackTupleIterable(self)
- def keep(self, msg=None):
- """Add a .keep file for the pack, preventing git from garbage collecting it.
- :param msg: A message written inside the .keep file; can be used later to
- determine whether or not a .keep file is obsolete.
- :return: The path of the .keep file, as a string.
- """
- keepfile_name = self._basename + b'.keep'
- with GitFile(keepfile_name, 'wb') as keepfile:
- if msg:
- keepfile.write(msg)
- keepfile.write(b'\n')
- return keepfile_name
- try:
- from dulwich._pack import apply_delta, bisect_find_sha
- except ImportError:
- pass