
Refactor pack handling so writing packs uses UnpackedObject

Jelmer Vernooij, 2 years ago
commit b51e5cdb0e

+ 2 - 2
dulwich/object_store.py

@@ -188,9 +188,9 @@ class BaseObjectStore:
         try:
             write_pack_data(
                 f.write,
-                count,
                 pack_data,
-                progress,
+                num_records=count,
+                progress=progress,
                 compression_level=self.pack_compression_level,
             )
         except BaseException:
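
This call site shows the migration pattern for the new write_pack_data signature: the records iterator is the only positional argument after write, and the count moves to a keyword. A minimal standalone sketch of the same pattern (count and pack_data here stand for whatever pack_objects_to_data returned; they are illustrative names, not part of this diff):

    from dulwich.pack import write_pack_data

    # count, pack_data = pack_objects_to_data(...)  # assumed inputs
    with open("objects.pack", "wb") as f:
        write_pack_data(
            f.write,
            pack_data,
            num_records=count,
            compression_level=-1,  # zlib's default level
        )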

+ 43 - 39
dulwich/pack.py

@@ -125,9 +125,9 @@ class ObjectContainer(Protocol):
 
 class PackedObjectContainer(ObjectContainer):
 
-    def get_raw_unresolved(self, sha1: bytes) -> Tuple[int, Optional[bytes], List[bytes]]:
+    def get_unpacked_object(self, sha1: bytes, *, include_comp: bool = False) -> "UnpackedObject":
         """Get a raw unresolved object."""
-        raise NotImplementedError(self.get_raw_unresolved)
+        raise NotImplementedError(self.get_unpacked_object)
 
 
 def take_msb_bytes(read: Callable[[int], bytes], crc32: Optional[int] = None) -> Tuple[List[int], Optional[int]]:
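
Implementations of the protocol now hand back a whole UnpackedObject instead of a (type_num, delta_base, chunks) tuple, so callers read attributes rather than unpacking. A hedged before/after sketch, where container is any PackedObjectContainer and sha is a 20-byte binary digest:

    # before
    obj_type, delta_base, chunks = container.get_raw_unresolved(sha)

    # after
    unpacked = container.get_unpacked_object(sha)
    obj_type = unpacked.pack_type_num
    delta_base = unpacked.delta_base
    chunks = unpacked.decomp_chunks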
@@ -179,14 +179,17 @@ class UnpackedObject:
 
     # TODO(dborowitz): read_zlib_chunks and unpack_object could very well be
     # methods of this object.
-    def __init__(self, pack_type_num, delta_base, decomp_len, crc32):
+    def __init__(self, pack_type_num, delta_base, decomp_len=None, crc32=None, sha=None, decomp_chunks=None):
         self.offset = None
-        self._sha = None
+        self._sha = sha
         self.pack_type_num = pack_type_num
         self.delta_base = delta_base
         self.comp_chunks = None
-        self.decomp_chunks: List[bytes] = []
-        self.decomp_len = decomp_len
+        self.decomp_chunks: List[bytes] = decomp_chunks or []
+        if decomp_chunks is not None and decomp_len is None:
+            self.decomp_len = sum(map(len, decomp_chunks))
+        else:
+            self.decomp_len = decomp_len
         self.crc32 = crc32
 
         if pack_type_num in DELTA_TYPES:
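
With the widened constructor, decomp_len may now be omitted whenever decomp_chunks is supplied; it is then derived as the sum of the chunk lengths. A small sketch (3 is the blob type number):

    from dulwich.pack import UnpackedObject

    chunks = [b"hello, ", b"world"]
    uo = UnpackedObject(3, delta_base=None, decomp_chunks=chunks)
    assert uo.decomp_len == 12  # sum(map(len, chunks))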
@@ -1740,7 +1743,7 @@ def write_pack_header(write, num_objects):
 def deltify_pack_objects(
         objects: Iterable[Tuple[ShaFile, str]],
         window_size: Optional[int] = None,
-        reuse_pack: Optional[PackedObjectContainer] = None):
+        reuse_pack: Optional[PackedObjectContainer] = None) -> Iterator[UnpackedObject]:
     """Generate deltas for pack objects.
 
     Args:
@@ -1766,11 +1769,11 @@ def deltify_pack_objects(
             sha_digest = o.sha().digest()
-            # get_raw_unresolved() translates OFS_DELTA into REF_DELTA for us
+            # get_unpacked_object() translates OFS_DELTA into REF_DELTA for us
             try:
-                (obj_type, delta_base, chunks) = reuse_pack.get_raw_unresolved(sha_digest)
+                unpacked = reuse_pack.get_unpacked_object(sha_digest)
             except KeyError:
                 continue
-            if obj_type == REF_DELTA and delta_base in objects_to_pack:
-                yield obj_type, sha_digest, hex_to_sha(delta_base), chunks
+            if unpacked.pack_type_num == REF_DELTA and unpacked.delta_base in objects_to_pack:
+                yield unpacked
                 reused_deltas.add(sha_digest)
 
     # Build a list of objects ordered by the magic Linus heuristic
@@ -1803,7 +1806,8 @@ def deltify_pack_objects(
                 winner_base = base_id
                 winner = delta
                 winner_len = sum(map(len, winner))
-        yield type_num, o.sha().digest(), winner_base, winner
+        unpacked = UnpackedObject(type_num, sha=o.sha().digest(), delta_base=winner_base, decomp_len=winner_len, decomp_chunks=winner)
+        yield unpacked
         possible_bases.appendleft((o.sha().digest(), type_num, raw))
         while len(possible_bases) > window_size:
             possible_bases.pop()
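
Since deltify_pack_objects now yields UnpackedObject for both reused and freshly computed deltas, consumers distinguish deltas by attribute rather than by tuple position. Roughly (objects is the usual iterable of (ShaFile, path) pairs):

    for unpacked in deltify_pack_objects(objects):
        if unpacked.delta_base is not None:
            # a delta: decomp_chunks holds the delta instructions
            base = unpacked.delta_base
        else:
            # a full object: decomp_chunks holds the raw content
            data = b"".join(unpacked.decomp_chunks)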
@@ -1811,9 +1815,9 @@ def deltify_pack_objects(
 
 def pack_objects_to_data(
         objects,
-        delta_window_size: Optional[int] = None,
+        *, delta_window_size: Optional[int] = None,
         deltify: Optional[bool] = None,
-        reuse_pack: Optional[PackedObjectContainer] = None):
+        reuse_pack: Optional[PackedObjectContainer] = None) -> Tuple[int, Iterator[UnpackedObject]]:
     """Create pack data from objects
 
     Args:
@@ -1832,13 +1836,17 @@ def pack_objects_to_data(
     else:
         return (
             count,
-            (
-                (o.type_num, o.sha().digest(), None, o.as_raw_chunks())
-                for (o, path) in objects
-            ),
+            (full_unpacked_object(o) for (o, path) in objects)
         )
 
 
+def full_unpacked_object(o: ShaFile) -> UnpackedObject:
+    return UnpackedObject(
+        o.type_num, delta_base=None, crc32=None,
+        decomp_chunks=o.as_raw_chunks(),
+        sha=o.sha().digest())
+
+
 def write_pack_objects(
         write, objects,
         delta_window_size: Optional[int] = None,
@@ -1872,8 +1880,8 @@ def write_pack_objects(
 
     return write_pack_data(
         write,
-        pack_contents_count,
         pack_contents,
+        num_records=pack_contents_count,
         compression_level=compression_level,
     )
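
End to end, the write path is now: pack_objects_to_data builds (count, iterator of UnpackedObject), and write_pack_data streams it out. A sketch, assuming objects is a list of (ShaFile, path) pairs:

    count, records = pack_objects_to_data(objects, deltify=False)
    with open("out.pack", "wb") as f:
        entries, checksum = write_pack_data(f.write, records, num_records=count)
    # entries maps sha -> (offset, crc32); checksum is the pack checksum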
 
@@ -1892,8 +1900,8 @@ class PackChunkGenerator:
     def __iter__(self):
         return self._it
 
-    def _pack_data_chunks(self, num_records=None, records=None, progress=None, compression_level=-1):
-        """Iterate pack data file chunks..
+    def _pack_data_chunks(self, records: Iterator[UnpackedObject], *, num_records=None, progress=None, compression_level: int = -1) -> Iterator[bytes]:
+        """Iterate pack data file chunks.
 
         Args:
           num_records: Number of records (defaults to len(records) if None)
@@ -1904,34 +1912,38 @@ class PackChunkGenerator:
         """
         # Write the pack
         if num_records is None:
-            num_records = len(records)
+            num_records = len(records)   # type: ignore
         offset = 0
         for chunk in pack_header_chunks(num_records):
             yield chunk
             self.cs.update(chunk)
             offset += len(chunk)
         actual_num_records = 0
-        for i, (type_num, object_id, delta_base, raw) in enumerate(records):
+        for i, unpacked in enumerate(records):
+            type_num = unpacked.pack_type_num
             if progress is not None:
                 progress(("writing pack data: %d/%d\r" % (i, num_records)).encode("ascii"))
-            if delta_base is not None:
+            if unpacked.delta_base is not None:
                 try:
-                    base_offset, base_crc32 = self.entries[delta_base]
+                    base_offset, base_crc32 = self.entries[unpacked.delta_base]
                 except KeyError:
                     type_num = REF_DELTA
-                    raw = (delta_base, raw)
+                    raw = (unpacked.delta_base, unpacked.decomp_chunks)
                 else:
                     type_num = OFS_DELTA
-                    raw = (offset - base_offset, raw)
+                    raw = (offset - base_offset, unpacked.decomp_chunks)
+            else:
+                raw = unpacked.decomp_chunks
             crc32 = 0
             object_size = 0
+            # TODO(jelmer,perf): if unpacked.comp_chunks is populated, use that?
             for chunk in pack_object_chunks(type_num, raw, compression_level=compression_level):
                 yield chunk
                 crc32 = binascii.crc32(chunk, crc32)
                 self.cs.update(chunk)
                 object_size += len(chunk)
             actual_num_records += 1
-            self.entries[object_id] = (offset, crc32)
+            self.entries[unpacked.sha()] = (offset, crc32)
             offset += object_size
         if actual_num_records != num_records:
             raise AssertionError(
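
PackChunkGenerator is the reusable piece here: it exposes the chunk stream through __iter__ and records index entries as a side effect, so a caller can stream pack bytes and collect the entries afterwards. A sketch, using the same constructor keywords that write_pack_data passes through:

    gen = PackChunkGenerator(num_records=count, records=records)
    with open("out.pack", "wb") as f:
        for chunk in gen:
            f.write(chunk)
    # gen.entries now maps sha -> (offset, crc32)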
@@ -1941,7 +1953,7 @@ class PackChunkGenerator:
         yield self.cs.digest()
 
 
-def write_pack_data(write, num_records=None, records=None, progress=None, compression_level=-1):
+def write_pack_data(write, records: Iterator[UnpackedObject], *, num_records=None, progress=None, compression_level=-1):
     """Write a new pack data file.
 
     Args:
@@ -1952,11 +1964,6 @@ def write_pack_data(write, num_records=None, records=None, progress=None, compre
       compression_level: the zlib compression level
     Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
     """
-    if hasattr(write, 'write'):
-        warnings.warn(
-            'write_pack_data() now takes a write rather than file argument',
-            DeprecationWarning, stacklevel=2)
-        write = write.write
     chunk_generator = PackChunkGenerator(
         num_records=num_records, records=records, progress=progress,
         compression_level=compression_level)
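
With the hasattr shim removed, the transitional file-object form is gone; only a bound write callable is accepted. The migration is one line, as the updated tests below also show (f is a binary file object):

    # before (deprecated form, now removed): write_pack_data(f, count, records)
    write_pack_data(f.write, records, num_records=count)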
@@ -2361,14 +2368,11 @@ class Pack:
             if uo.sha() in shas)
 
     def pack_tuples(self):
-        """Provide an iterable for use with write_pack_objects.
-
-        Returns: Object that can iterate over (object, path) tuples
-            and provides __len__
-        """
-
         return _PackTupleIterable(self.iterobjects, len(self))
 
+    def iter_unpacked(self):
+        return self.data.iter_unpacked()
+
     def keep(self, msg: Optional[bytes] = None) -> str:
         """Add a .keep file for the pack, preventing git from garbage collecting it.
 

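Pack.iter_unpacked is a thin delegation to the underlying PackData, which the diff assumes provides an iter_unpacked() of its own; it exposes the stored records without materializing full objects. A sketch of walking a pack with it (basename names a pack file without extension, as in the tests below):

    with Pack(basename) as pack:
        for unpacked in pack.iter_unpacked():
            print(unpacked.pack_type_num, unpacked.decomp_len)
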
+ 1 - 1
dulwich/tests/compat/test_pack.py

@@ -84,7 +84,7 @@ class TestPack(PackTests):
             orig_blob = orig_pack[a_sha]
             new_blob = Blob()
             new_blob.data = orig_blob.data + b"x"
-            all_to_pack = list(orig_pack.pack_tuples()) + [(new_blob, None)]
+            all_to_pack = [(o, None) for o in orig_pack.iterobjects()] + [(new_blob, None)]
         pack_path = os.path.join(self._tempdir, "pack_with_deltas")
         write_pack(pack_path, all_to_pack, deltify=True)
         output = run_git_or_fail(["verify-pack", "-v", pack_path])

+ 2 - 1
dulwich/tests/test_client.py

@@ -380,7 +380,8 @@ class GitClientTests(TestCase):
             )
 
         f = BytesIO()
-        write_pack_data(f.write, *generate_pack_data(None, None))
+        count, records = generate_pack_data(None, None)
+        write_pack_data(f.write, records, num_records=count)
         self.client.send_pack(b"/", update_refs, generate_pack_data)
         self.assertEqual(
             self.rout.getvalue(),

+ 5 - 5
dulwich/tests/test_pack.py

@@ -429,7 +429,7 @@ class TestPack(PackTests):
         with self.get_pack(pack1_sha) as origpack:
             self.assertSucceeds(origpack.index.check)
             basename = os.path.join(self.tempdir, "Elch")
-            write_pack(basename, origpack.pack_tuples())
+            write_pack(basename, list(origpack.pack_tuples()))
 
             with Pack(basename) as newpack:
                 self.assertEqual(origpack, newpack)
@@ -453,7 +453,7 @@ class TestPack(PackTests):
 
     def _copy_pack(self, origpack):
         basename = os.path.join(self.tempdir, "somepack")
-        write_pack(basename, origpack.pack_tuples())
+        write_pack(basename, list(origpack.pack_tuples()))
         return Pack(basename)
 
     def test_keep_no_message(self):
@@ -872,7 +872,7 @@ class DeltifyTests(TestCase):
     def test_single(self):
         b = Blob.from_string(b"foo")
         self.assertEqual(
-            [(b.type_num, b.sha().digest(), None, b.as_raw_chunks())],
+            [UnpackedObject(b.type_num, sha=b.sha().digest(), delta_base=None, decomp_chunks=b.as_raw_chunks())],
             list(deltify_pack_objects([(b, b"")])),
         )
 
@@ -882,8 +882,8 @@ class DeltifyTests(TestCase):
         delta = list(create_delta(b1.as_raw_chunks(), b2.as_raw_chunks()))
         self.assertEqual(
             [
-                (b1.type_num, b1.sha().digest(), None, b1.as_raw_chunks()),
-                (b2.type_num, b2.sha().digest(), b1.sha().digest(), delta),
+                UnpackedObject(b1.type_num, sha=b1.sha().digest(), delta_base=None, decomp_chunks=b1.as_raw_chunks()),
+                UnpackedObject(b2.type_num, sha=b2.sha().digest(), delta_base=b1.sha().digest(), decomp_chunks=delta),
             ],
             list(deltify_pack_objects([(b1, b""), (b2, b"")])),
         )
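
These assertions compare UnpackedObject instances directly with assertEqual, which presumes UnpackedObject defines structural equality over its fields. A sketch of the kind of __eq__ the tests rely on (assuming the class keeps its attributes in __slots__, as the dulwich implementation does):

    def __eq__(self, other):
        return (isinstance(other, UnpackedObject)
                and all(getattr(self, slot) == getattr(other, slot)
                        for slot in self.__slots__))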