Browse Source

Allow extracting raw compressed data from packs.

Jelmer Vernooij 7 years ago
parent
commit
2b6271f2a7
3 changed files with 62 additions and 1 deletion
  1. 1 1
      dulwich/object_store.py
  2. 41 0
      dulwich/pack.py
  3. 20 0
      dulwich/tests/test_pack.py

+ 1 - 1
dulwich/object_store.py

@@ -392,7 +392,7 @@ class PackBasedObjectStore(BaseObjectStore):
         return self._get_loose_object(sha) is not None
 
     def get_raw(self, name):
-        """Obtain the raw text for an object.
+        """Obtain the raw fulltext for an object.
 
         :param name: sha for the object.
         :return: tuple with numeric type and object contents.

+ 41 - 0
dulwich/pack.py

@@ -393,6 +393,16 @@ class PackIndex(object):
             sha = hex_to_sha(sha)
         return self._object_index(sha)
 
+    def object_sha1(self, index):
+        """Return the SHA1 corresponding to the index in the pack file.
+
+        :param index: Value matched against the offset of each entry
+            yielded by iterentries()
+        :return: Name of the first entry whose offset equals ``index``
+        :raise KeyError: If no entry has that offset
+        """
+        # PERFORMANCE/TODO(jelmer): Avoid scanning entire index
+        for (name, offset, crc32) in self.iterentries():
+            if offset == index:
+                return name
+        # Only reached when the scan finished without a match. The loop has
+        # no ``break``, so a ``for ... else`` clause would add nothing here.
+        raise KeyError(index)
+
     def _object_index(self, sha):
         """See object_index.
 
@@ -422,8 +432,10 @@ class MemoryPackIndex(PackIndex):
         :param pack_checksum: Optional pack checksum
         """
         self._by_sha = {}
+        self._by_index = {}
         for name, idx, crc32 in entries:
             self._by_sha[name] = idx
+            self._by_index[idx] = name
         self._entries = entries
         self._pack_checksum = pack_checksum
 
@@ -436,6 +448,9 @@ class MemoryPackIndex(PackIndex):
     def _object_index(self, sha):
         # Looks up the value stored for ``sha`` in ``_by_sha`` and returns
         # its first element; an unknown ``sha`` raises KeyError.
         # NOTE(review): the loop that fills ``_by_sha`` stores the bare
         # ``idx`` of each entry, so the ``[0]`` subscript only works if
         # ``idx`` is itself subscriptable -- confirm against the entry format.
         return self._by_sha[sha][0]
 
+    def object_sha1(self, index):
+        """Return the name recorded for ``index``.
+
+        Served directly from the ``_by_index`` mapping instead of scanning
+        all entries.
+
+        :param index: Key to look up in ``_by_index``
+        :raise KeyError: If nothing is registered under ``index``
+        """
+        return self._by_index[index]
+
     def _itersha(self):
         return iter(self._by_sha)
 
@@ -1220,6 +1235,18 @@ class PackData(object):
         if actual != stored:
             raise ChecksumMismatch(stored, actual)
 
+    def get_compressed_data_at(self, offset):
+        """Given offset in the packfile return compressed data that is there.
+
+        Using the associated index the location of an object can be looked up,
+        and then the packfile can be asked directly for that object using this
+        function.
+
+        :param offset: Offset in the pack file; must not be smaller than
+            the pack header size
+        :return: Tuple with the pack type number, the delta base and the
+            compressed chunks, as reported by unpack_object
+        """
+        # Guard against offsets that fall inside the pack header.
+        assert offset >= self._header_size
+        self._file.seek(offset)
+        # Presumably include_comp=True makes unpack_object keep the
+        # compressed chunks; the second element of its result is unused here.
+        unpacked, _ = unpack_object(self._file.read, include_comp=True)
+        return (unpacked.pack_type_num, unpacked.delta_base, unpacked.comp_chunks)
+
     def get_object_at(self, offset):
         """Given an offset in to the packfile return the object that is there.
 
@@ -1919,6 +1946,20 @@ class Pack(object):
         except KeyError:
             return False
 
+    def get_raw_unresolved(self, sha1):
+        """Get raw unresolved data for a SHA.
+
+        An OFS_DELTA entry is reported as a REF_DELTA: its delta base is
+        replaced by the SHA that the index records for the base object.
+
+        :param sha1: SHA to return data for
+        :return: Tuple with pack object type, delta base (if applicable),
+            list of data chunks
+        """
+        offset = self.index.object_index(sha1)
+        (obj_type, delta_base, chunks) = self.data.get_compressed_data_at(offset)
+        if obj_type == OFS_DELTA:
+            # For OFS_DELTA the delta base is subtracted from this object's
+            # own offset to locate the base, which is then named by SHA.
+            # NOTE(review): this branch passes the SHA through sha_to_hex,
+            # while the REF_DELTA base expected by test_get_raw_unresolved
+            # is a 20-byte binary value -- confirm callers accept both forms.
+            delta_base = sha_to_hex(self.index.object_sha1(offset - delta_base))
+            obj_type = REF_DELTA
+        return (obj_type, delta_base, chunks)
+
     def get_raw(self, sha1):
         offset = self.index.object_index(sha1)
         obj_type, obj = self.data.get_object_at(offset)

+ 20 - 0
dulwich/tests/test_pack.py

@@ -130,6 +130,14 @@ class PackIndexTests(PackTests):
         self.assertEqual(p.object_index(tree_sha), 138)
         self.assertEqual(p.object_index(commit_sha), 12)
 
+    def test_object_sha1(self):
+        """Tests that the correct SHA1 is returned for each pack offset."""
+        p = self.get_pack_index(pack1_sha)
+        # An offset with no matching entry must raise KeyError.
+        self.assertRaises(KeyError, p.object_sha1, 876)
+        self.assertEqual(p.object_sha1(178), hex_to_sha(a_sha))
+        self.assertEqual(p.object_sha1(138), hex_to_sha(tree_sha))
+        self.assertEqual(p.object_sha1(12), hex_to_sha(commit_sha))
+
     def test_index_len(self):
         p = self.get_pack_index(pack1_sha)
         self.assertEqual(3, len(p))
@@ -524,6 +532,18 @@ class TestThinPack(PackTests):
                 (3, b'foo1234'),
                 p.get_raw(self.blobs[b'foo1234'].id))
 
+    def test_get_raw_unresolved(self):
+        """Check the unresolved entry returned for the blob ``foo1234``.
+
+        Both pack variants built by make_pack must yield the same tuple of
+        type number, delta base and compressed chunks.
+        """
+        # Pack contents are bytes, so the expectation uses bytes literals;
+        # a plain str literal never compares equal to bytes on Python 3.
+        expected = (
+            7,
+            b'\x19\x10(\x15f=#\xf8\xb7ZG\xe7\xa0\x19e\xdc\xdc\x96F\x8c',
+            [b'x\x9ccf\x9f\xc0\xccbhdl\x02\x00\x06f\x01l'])
+        with self.make_pack(False) as p:
+            self.assertEqual(
+                expected,
+                p.get_raw_unresolved(self.blobs[b'foo1234'].id))
+        with self.make_pack(True) as p:
+            self.assertEqual(
+                expected,
+                p.get_raw_unresolved(self.blobs[b'foo1234'].id))
+
     def test_iterobjects(self):
         with self.make_pack(False) as p:
             self.assertRaises(KeyError, list, p.iterobjects())