123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219 |
"""Classes for dealing with packed git objects.

A pack is a compact representation of a bunch of objects, stored
using deltas where possible.

A pack has two parts: the pack file, which stores the data, and an index
that tells you where the data is.

To find an object you look in all of the index files until you find a
match for the object name. You then use the offset obtained from the index
as a pointer in to the corresponding packfile.
"""
import binascii
import mmap
import os
import struct

from objects import (ShaFile,
                     _decompress,
                     )
# Value of every accepted hex digit, built once instead of on every call.
# Both lower- and upper-case digits are accepted.
_HEX_VALUES = dict((digit, value)
                   for value, digit in enumerate("0123456789abcdef"))
_HEX_VALUES.update((digit, value)
                   for value, digit in enumerate("ABCDEF", 10))


def hex_to_sha(hex):
    """Convert a string of hex digits to the number it represents.

    :param hex: String of hex digits, most significant digit first. Both
        lower- and upper-case digits are accepted. No prefix, sign or
        whitespace is allowed.
    :return: The integer value; 0 for an empty string.
    :raise KeyError: If a character is not a hex digit.
    """
    value = 0
    for c in hex:
        # Deliberately not int(hex, 16): that would accept "0x", signs and
        # whitespace, and raise ValueError instead of KeyError.
        value = (16 * value) + _HEX_VALUES[c]
    return value
def multi_ord(map, start, count):
    """Read ``count`` consecutive bytes of ``map`` as one big-endian integer.

    :param map: Indexable buffer. Its items may be one-character strings
        (as for a Python 2 str or mmap) or integers (as for Python 3 bytes
        or mmap, or a bytearray).
    :param start: Index of the first (most significant) byte.
    :param count: Number of bytes to read.
    :return: The unsigned integer value; 0 when ``count`` is 0.
    :raise IndexError: If the requested range runs past the end of ``map``.
    """
    value = 0
    for i in range(count):
        item = map[start + i]
        # Buffers that already yield ints must not be passed to ord().
        if not isinstance(item, int):
            item = ord(item)
        value = value * 256 + item
    return value
# Upper bound (256 MiB) on the size of a file that the classes below are
# willing to mmap in one piece; their constructors assert against it.
max_size = 256 * (1024 ** 2)
class PackIndex(object):
    """An index in to a packfile.

    Given a sha id of an object a pack index can tell you the location in the
    packfile of that object if it has it.

    The layout read by this class is a header of 256 four byte big-endian
    counters followed by sorted records, each made of a four byte big-endian
    packfile offset and then the 20 byte sha of the object.

    To do the lookup the header is indexed with the first byte of the sha id.
    The value found there is the (exclusive) end of the group of records
    sharing that first byte; the value in the previous header slot is the
    start of the group (0 for first byte 0x00). The records are sorted by sha
    id, so the group is bisected to find out whether the sha is present.
    """

    header_record_size = 4
    header_size = 256 * header_record_size
    index_size = 4
    sha_bytes = 20
    record_size = sha_bytes + index_size

    def __init__(self, filename):
        """Create a pack index object.

        Provide it with the name of the index file to consider, and it will
        map it whenever required.

        :param filename: Path of the index file.
        :raise AssertionError: If the file does not exist, is not larger
            than the header, or is max_size bytes or larger.
        """
        self._filename = filename
        assert os.path.exists(filename), "%s is not a pack index" % filename
        self._size = os.path.getsize(filename)
        assert self._size > self.header_size, \
            "%s is too small to be a packfile" % filename
        # The message is now actually formatted with the filename.
        assert self._size < max_size, (
            "%s is larger than 256 meg, and it "
            "might not be a good idea to mmap it. If you want to go ahead "
            "delete this check, or get python to support mmap offsets so that "
            "I can map the files sensibly" % filename)

    def object_index(self, sha):
        """Return the index in to the corresponding packfile for the object.

        Given the name of an object it will return the offset that object
        lives at within the corresponding pack file. If the pack file doesn't
        have the object then None will be returned.

        :param sha: Object name as 40 hex digits.
        :raise AssertionError: If the index file changed size since this
            object was created.
        :raise KeyError: If sha is 40 characters long but not valid hex.
        """
        size = os.path.getsize(self._filename)
        assert size == self._size, "Pack index %s has changed size, I don't " \
            "like that" % self._filename
        f = open(self._filename, 'rb')
        try:
            index_map = mmap.mmap(f.fileno(), size, access=mmap.ACCESS_READ)
            try:
                return self._object_index(index_map, sha)
            finally:
                # Release the mapping explicitly rather than waiting for
                # garbage collection.
                index_map.close()
        finally:
            f.close()

    @staticmethod
    def _read_uint32(map, offset):
        """Read the four byte big-endian unsigned integer at offset."""
        return struct.unpack(">L", map[offset:offset + 4])[0]

    def _object_index(self, map, hexsha):
        """See object_index.

        :param map: The mapped (or otherwise sliceable) index contents.
        :param hexsha: Object name as 40 hex digits.
        :return: The packfile offset stored for the object, or None if the
            object is not in the index or hexsha has the wrong length.
        :raise KeyError: If hexsha has the right length but is not valid hex.
        """
        if len(hexsha) != 2 * self.sha_bytes:
            # A name of any other length cannot match a stored sha.
            return None
        try:
            target = binascii.unhexlify(hexsha)
        except (TypeError, ValueError):
            # TypeError on Python 2, binascii.Error (a ValueError) on
            # Python 3. KeyError is kept as the error for a bad hex digit.
            raise KeyError(hexsha)
        first_byte = bytearray(target[:1])[0]
        if first_byte == 0:
            # There is no previous header slot; the first group starts at
            # record 0.
            start = 0
        else:
            start = self._read_uint32(
                map, (first_byte - 1) * self.header_record_size)
        end = self._read_uint32(map, first_byte * self.header_record_size)
        # Bisect over record numbers in the half-open range [start, end).
        while start < end:
            i = (start + end) // 2
            offset = self.header_size + (i * self.record_size)
            sha_start = offset + self.index_size
            # Equal-length byte strings order the same way as the big-endian
            # numbers they encode, so the raw bytes are compared directly.
            file_sha = map[sha_start:sha_start + self.sha_bytes]
            if file_sha == target:
                return self._read_uint32(map, offset)
            elif file_sha < target:
                start = i + 1
            else:
                end = i
        return None
class PackData(object):
    """The data contained in a packfile.

    Pack files can be accessed both sequentially for exploding a pack, and
    directly with the help of an index to retrieve a specific object.

    The objects within are either complete or a delta against another.

    The header is variable length. If the MSB of each byte is set then it
    indicates that the subsequent byte is still part of the header.
    For the first byte the next 3 MS bits are the type, which tells you the
    type of object, and whether it is a delta. The LS 4 bits are the lowest
    bits of the size. For each subsequent byte the LS 7 bits are the next MS
    bits of the size, i.e. the last byte of the header contains the MS bits of
    the size.

    For the complete objects the data is stored as zlib deflated data.
    The size in the header is the uncompressed object size, so to uncompress
    you need to just keep feeding data to zlib until you get an object back,
    or it errors on bad data. This is done here by just giving the complete
    buffer from the start of the deflated object on. This is bad, but until I
    get mmap sorted out it will have to do.

    Currently there are no integrity checks done. Also no attempt is made to
    try and detect the delta case, or a request for an object at the wrong
    position. It will all just throw a zlib or KeyError.
    """

    def __init__(self, filename):
        """Create a PackData object that represents the pack in the given filename.

        The file must exist and stay readable until the object is disposed
        of. It must also stay the same size. It will be mapped whenever
        needed.

        Currently there is a restriction on the size of the pack as the
        python mmap implementation is flawed.

        :param filename: Path of the pack file.
        :raise AssertionError: If the file does not exist or is max_size
            bytes or larger.
        """
        self._filename = filename
        assert os.path.exists(filename), "%s is not a packfile" % filename
        self._size = os.path.getsize(filename)
        # The message is now actually formatted with the filename.
        assert self._size < max_size, (
            "%s is larger than 256 meg, and it "
            "might not be a good idea to mmap it. If you want to go ahead "
            "delete this check, or get python to support mmap offsets so that "
            "I can map the files sensibly" % filename)

    def get_object_at(self, offset):
        """Given an offset in to the packfile return the object that is there.

        Using the associated index the location of an object can be looked
        up, and then the packfile can be asked directly for that object using
        this function.

        Currently only non-delta objects are supported.

        :param offset: Byte offset of the object's header within the pack.
        :raise AssertionError: If the pack file changed size since this
            object was created.
        """
        size = os.path.getsize(self._filename)
        assert size == self._size, "Pack data %s has changed size, I don't " \
            "like that" % self._filename
        f = open(self._filename, 'rb')
        try:
            pack_map = mmap.mmap(f.fileno(), size, access=mmap.ACCESS_READ)
            try:
                return self._get_object_at(pack_map, offset)
            finally:
                # The object is built from a decompressed copy, so the
                # mapping can be released as soon as it has been read.
                pack_map.close()
        finally:
            f.close()

    @staticmethod
    def _read_object_header(map, offset):
        """Parse the variable length object header starting at offset.

        :param map: Indexable buffer whose items are one-character strings
            or integers.
        :param offset: Index of the first header byte.
        :return: Tuple (type number, size from the header, index of the
            first byte after the header).
        """
        def byte_at(pos):
            item = map[pos]
            # Buffers that already yield ints must not be passed to ord().
            if not isinstance(item, int):
                item = ord(item)
            return item

        first_byte = byte_at(offset)
        more = first_byte & 0x80
        obj_type = (first_byte >> 4) & 0x07
        size = first_byte & 0x0f
        # The first byte supplied the low 4 bits of the size; every further
        # byte supplies the next 7 more significant bits.
        shift = 4
        pos = offset + 1
        while more:
            byte = byte_at(pos)
            more = byte & 0x80
            size += (byte & 0x7f) << shift
            shift += 7
            pos += 1
        return obj_type, size, pos

    def _get_object_at(self, map, offset):
        """See get_object_at; map is the mapped contents of the pack file."""
        # The size from the header is parsed but not checked against the
        # decompressed data.
        obj_type, size, raw_base = self._read_object_header(map, offset)
        # Hand over everything from the start of the deflated data to the
        # end of the buffer, as the class docstring describes.
        raw = map[raw_base:]
        uncomp = _decompress(raw)
        obj = ShaFile.from_raw_string(obj_type, uncomp)
        return obj
|