瀏覽代碼

Avoid copying data when creating deltas.

Jelmer Vernooij 2 年之前
父節點
當前提交
aca694f62b
共有 3 個文件被更改,包括 48 次插入和 37 次刪除
  1. dulwich/object_store.py(+1 −1)
  2. dulwich/pack.py(+43 −32)
  3. dulwich/tests/test_pack.py(+4 −4)

+ 1 - 1
dulwich/object_store.py

@@ -1333,7 +1333,7 @@ class MissingObjectFinder(object):
         if not leaf:
             o = self.object_store[sha]
             if isinstance(o, Commit):
-                self.add_todo([(o.tree, "", False)])
+                self.add_todo([(o.tree, b"", False)])
             elif isinstance(o, Tree):
                 self.add_todo(
                     [

+ 43 - 32
dulwich/pack.py

@@ -39,7 +39,10 @@ from io import BytesIO, UnsupportedOperation
 from collections import (
     deque,
 )
-import difflib
+try:
+    from cdifflib import CSequenceMatcher as SequenceMatcher
+except ModuleNotFoundError:
+    from difflib import SequenceMatcher
 import struct
 
 from itertools import chain
@@ -1341,8 +1344,7 @@ class DeltaChainIterator(object):
 
     def _walk_all_chains(self):
         for offset, type_num in self._full_ofs:
-            for result in self._follow_chain(offset, type_num, None):
-                yield result
+            yield from self._follow_chain(offset, type_num, None)
         for result in self._walk_ref_chains():
             yield result
         assert not self._pending_ofs
@@ -1534,10 +1536,13 @@ def pack_object_chunks(type, object, compression_level=-1):
         delta_base, object = object
     else:
         delta_base = None
-    header = bytes(pack_object_header(type, delta_base, len(object)))
-    comp_data = zlib.compress(object, compression_level)
-    for data in (header, comp_data):
-        yield data
+    if isinstance(object, bytes):
+        object = [object]
+    yield bytes(pack_object_header(type, delta_base, sum(map(len, object))))
+    compressor = zlib.compressobj(level=compression_level)
+    for data in object:
+        yield compressor.compress(data)
+    yield compressor.flush()
 
 
 def write_pack_object(write, type, object, sha=None, compression_level=-1):
@@ -1635,18 +1640,21 @@ def deltify_pack_objects(objects, window_size=None):
     possible_bases = deque()
 
     for type_num, path, neg_length, o in magic:
-        raw = o.as_raw_string()
+        raw = o.as_raw_chunks()
         winner = raw
+        winner_len = sum(map(len, winner))
         winner_base = None
-        for base in possible_bases:
-            if base.type_num != type_num:
+        for base_id, base_type_num, base in possible_bases:
+            if base_type_num != type_num:
                 continue
-            delta = create_delta(base.as_raw_string(), raw)
-            if len(delta) < len(winner):
-                winner_base = base.sha().digest()
+            delta = create_delta(base, raw)
+            delta_len = sum(map(len, delta))
+            if delta_len < winner_len:
+                winner_base = base_id
                 winner = delta
+                winner_len = sum(map(len, winner))
         yield type_num, o.sha().digest(), winner_base, winner
-        possible_bases.appendleft(o)
+        possible_bases.appendleft((o.sha().digest(), type_num, raw))
         while len(possible_bases) > window_size:
             possible_bases.pop()
 
@@ -1662,7 +1670,7 @@ def pack_objects_to_data(objects):
     return (
         count,
         (
-            (o.type_num, o.sha().digest(), None, o.as_raw_string())
+            (o.type_num, o.sha().digest(), None, o.as_raw_chunks())
             for (o, path) in objects
         ),
     )
@@ -1821,7 +1829,7 @@ def write_pack_index_v1(f, entries, pack_checksum):
     return f.write_sha()
 
 
-def _delta_encode_size(size):
+def _delta_encode_size(size) -> bytes:
     ret = bytearray()
     c = size & 0x7F
     size >>= 7
@@ -1830,7 +1838,7 @@ def _delta_encode_size(size):
         c = size & 0x7F
         size >>= 7
     ret.append(c)
-    return ret
+    return bytes(ret)
 
 
 # The length of delta compression copy operations in version 2 packs is limited
@@ -1840,17 +1848,16 @@ _MAX_COPY_LEN = 0xFFFF
 
 
 def _encode_copy_operation(start, length):
-    scratch = []
-    op = 0x80
+    scratch = bytearray([0x80])
     for i in range(4):
         if start & 0xFF << i * 8:
             scratch.append((start >> i * 8) & 0xFF)
-            op |= 1 << i
+            scratch[0] |= 1 << i
     for i in range(2):
         if length & 0xFF << i * 8:
             scratch.append((length >> i * 8) & 0xFF)
-            op |= 1 << (4 + i)
-    return bytearray([op] + scratch)
+            scratch[0] |= 1 << (4 + i)
+    return bytes(scratch)
 
 
 def create_delta(base_buf, target_buf):
@@ -1860,14 +1867,18 @@ def create_delta(base_buf, target_buf):
       base_buf: Base buffer
       target_buf: Target buffer
     """
+    if isinstance(base_buf, list):
+        base_buf = b''.join(base_buf)
+    if isinstance(target_buf, list):
+        target_buf = b''.join(target_buf)
     assert isinstance(base_buf, bytes)
     assert isinstance(target_buf, bytes)
-    out_buf = bytearray()
+    out_buf = []
     # write delta header
-    out_buf += _delta_encode_size(len(base_buf))
-    out_buf += _delta_encode_size(len(target_buf))
+    out_buf.append(_delta_encode_size(len(base_buf)))
+    out_buf.append(_delta_encode_size(len(target_buf)))
     # write out delta opcodes
-    seq = difflib.SequenceMatcher(a=base_buf, b=target_buf)
+    seq = SequenceMatcher(isjunk=None, a=base_buf, b=target_buf)
     for opcode, i1, i2, j1, j2 in seq.get_opcodes():
         # Git patch opcodes don't care about deletes!
         # if opcode == 'replace' or opcode == 'delete':
@@ -1879,7 +1890,7 @@ def create_delta(base_buf, target_buf):
             copy_len = i2 - i1
             while copy_len > 0:
                 to_copy = min(copy_len, _MAX_COPY_LEN)
-                out_buf += _encode_copy_operation(copy_start, to_copy)
+                out_buf.append(_encode_copy_operation(copy_start, to_copy))
                 copy_start += to_copy
                 copy_len -= to_copy
         if opcode == "replace" or opcode == "insert":
@@ -1888,13 +1899,13 @@ def create_delta(base_buf, target_buf):
             s = j2 - j1
             o = j1
             while s > 127:
-                out_buf.append(127)
-                out_buf += bytearray(target_buf[o : o + 127])
+                out_buf.append(bytes([127]))
+                out_buf.append(memoryview(target_buf)[o:o + 127])
                 s -= 127
                 o += 127
-            out_buf.append(s)
-            out_buf += bytearray(target_buf[o : o + s])
-    return bytes(out_buf)
+            out_buf.append(bytes([s]))
+            out_buf.append(memoryview(target_buf)[o:o + s])
+    return out_buf
 
 
 def apply_delta(src_buf, delta):

+ 4 - 4
dulwich/tests/test_pack.py

@@ -873,17 +873,17 @@ class DeltifyTests(TestCase):
     def test_single(self):
         b = Blob.from_string(b"foo")
         self.assertEqual(
-            [(b.type_num, b.sha().digest(), None, b.as_raw_string())],
+            [(b.type_num, b.sha().digest(), None, b.as_raw_chunks())],
             list(deltify_pack_objects([(b, b"")])),
         )
 
     def test_simple_delta(self):
         b1 = Blob.from_string(b"a" * 101)
         b2 = Blob.from_string(b"a" * 100)
-        delta = create_delta(b1.as_raw_string(), b2.as_raw_string())
+        delta = create_delta(b1.as_raw_chunks(), b2.as_raw_chunks())
         self.assertEqual(
             [
-                (b1.type_num, b1.sha().digest(), None, b1.as_raw_string()),
+                (b1.type_num, b1.sha().digest(), None, b1.as_raw_chunks()),
                 (b2.type_num, b2.sha().digest(), b1.sha().digest(), delta),
             ],
             list(deltify_pack_objects([(b1, b""), (b2, b"")])),
@@ -927,7 +927,7 @@ class TestPackStreamReader(TestCase):
             unpacked_delta.delta_base,
         )
         delta = create_delta(b"blob", b"blob1")
-        self.assertEqual(delta, b"".join(unpacked_delta.decomp_chunks))
+        self.assertEqual(b''.join(delta), b"".join(unpacked_delta.decomp_chunks))
         self.assertEqual(entries[1][4], unpacked_delta.crc32)
 
     def test_read_objects_buffered(self):