浏览代码

Merge improvements to pack writing code.

Jelmer Vernooij 13 年之前
父节点
当前提交
caff1a41fd
共有 7 个文件被更改,包括 117 次插入和 57 次删除
  1. 6 3
      NEWS
  2. 2 2
      dulwich/client.py
  3. 3 3
      dulwich/object_store.py
  4. 77 44
      dulwich/pack.py
  5. 2 2
      dulwich/server.py
  6. 3 3
      dulwich/tests/test_object_store.py
  7. 24 0
      dulwich/tests/test_pack.py

+ 6 - 3
NEWS

@@ -15,9 +15,12 @@
 
  API CHANGES
 
-  * write_pack_data and write_pack no longer take a num_objects argument and
-    require an object to be passed in that is iterable (rather than an iterator)
-    and that provides __len__.  (Jelmer Vernooij)
+  * write_pack no longer takes the num_objects argument and requires an object
+    to be passed in that is iterable (rather than an iterator) and that
+    provides __len__.  (Jelmer Vernooij)
+
+  * write_pack_data has been renamed to write_pack_objects and no longer takes a
+    num_objects argument. (Jelmer Vernooij)
 
 0.7.1	2011-04-12
 

+ 2 - 2
dulwich/client.py

@@ -38,7 +38,7 @@ from dulwich.protocol import (
     extract_capabilities,
     )
 from dulwich.pack import (
-    write_pack_data,
+    write_pack_objects,
     )
 
 
@@ -184,7 +184,7 @@ class GitClient(object):
         if not want:
             return new_refs
         objects = generate_pack_contents(have, want)
-        entries, sha = write_pack_data(proto.write_file(), objects)
+        entries, sha = write_pack_objects(proto.write_file(), objects)
 
         if 'report-status' in self._send_capabilities:
             self._parse_status_report(proto)

+ 3 - 3
dulwich/object_store.py

@@ -54,8 +54,8 @@ from dulwich.pack import (
     iter_sha1,
     load_pack_index,
     write_pack,
-    write_pack_data,
     write_pack_index_v2,
+    write_pack_objects,
     )
 
 INFODIR = 'info'
@@ -271,7 +271,7 @@ class PackBasedObjectStore(BaseObjectStore):
         objects = set()
         for sha in self._iter_loose_objects():
             objects.add((self._get_loose_object(sha), None))
-        self.add_objects(objects)
+        self.add_objects(list(objects))
         for obj, path in objects:
             self._remove_loose_object(obj.id)
         return len(objects)
@@ -321,7 +321,7 @@ class PackBasedObjectStore(BaseObjectStore):
             # Don't bother writing an empty pack file
             return
         f, commit = self.add_pack()
-        write_pack_data(f, objects)
+        write_pack_objects(f, objects)
         return commit()
 
 

+ 77 - 44
dulwich/pack.py

@@ -1065,16 +1065,19 @@ class SHA1Writer(object):
 
     def __init__(self, f):
         self.f = f
+        self.length = 0
         self.sha1 = make_sha("")
 
     def write(self, data):
         self.sha1.update(data)
         self.f.write(data)
+        self.length += len(data)
 
     def write_sha(self):
         sha = self.sha1.digest()
         assert len(sha) == 20
         self.f.write(sha)
+        self.length += len(sha)
         return sha
 
     def close(self):
@@ -1082,6 +1085,9 @@ class SHA1Writer(object):
         self.f.close()
         return sha
 
+    def offset(self):
+        return self.length
+
     def tell(self):
         return self.f.tell()
 
@@ -1094,7 +1100,6 @@ def write_pack_object(f, type, object):
     :param object: Object to write
     :return: Tuple with offset at which the object was written, and crc32
     """
-    offset = f.tell()
     packed_data_hdr = ""
     if type == OFS_DELTA:
         (delta_base_offset, object) = object
@@ -1121,7 +1126,7 @@ def write_pack_object(f, type, object):
         packed_data_hdr += basename
     packed_data = packed_data_hdr + zlib.compress(object)
     f.write(packed_data)
-    return (offset, (zlib.crc32(packed_data) & 0xffffffff))
+    return (zlib.crc32(packed_data) & 0xffffffff)
 
 
 def write_pack(filename, objects, num_objects=None):
@@ -1137,9 +1142,11 @@ def write_pack(filename, objects, num_objects=None):
                       DeprecationWarning)
     f = GitFile(filename + ".pack", 'wb')
     try:
-        entries, data_sum = write_pack_data(f, objects, num_objects=num_objects)
+        entries, data_sum = write_pack_objects(f, objects,
+            num_objects=num_objects)
     finally:
         f.close()
+    entries = [(k, v[0], v[1]) for (k, v) in entries.iteritems()]
     entries.sort()
     f = GitFile(filename + ".idx", 'wb')
     try:
@@ -1155,7 +1162,41 @@ def write_pack_header(f, num_objects):
     f.write(struct.pack('>L', num_objects))  # Number of objects in pack
 
 
-def write_pack_data(f, objects, num_objects=None, window=10):
+def deltify_pack_objects(objects, window=10):
+    """Generate deltas for pack objects.
+
+    :param objects: Objects to deltify
+    :param window: Window size
+    :return: Iterator over type_num, object id, delta_base, content
+        delta_base is None for full text entries
+    """
+    # Build a list of objects ordered by the magic Linus heuristic
+    # This helps us find good objects to diff against us
+    magic = []
+    for obj, path in objects:
+        magic.append((obj.type_num, path, -obj.raw_length(), obj))
+    magic.sort()
+
+    possible_bases = deque()
+
+    for type_num, path, neg_length, o in magic:
+        raw = o.as_raw_string()
+        winner = raw
+        winner_base = None
+        for base in possible_bases:
+            if base.type_num != type_num:
+                continue
+            delta = create_delta(base.as_raw_string(), raw)
+            if len(delta) < len(winner):
+                winner_base = base.sha().digest()
+                winner = delta
+        yield type_num, o.sha().digest(), winner_base, winner
+        possible_bases.appendleft(o)
+        while len(possible_bases) > window:
+            possible_bases.pop()
+
+
+def write_pack_objects(f, objects, window=10, num_objects=None):
     """Write a new pack data file.
 
     :param f: File to write to
@@ -1163,51 +1204,43 @@ def write_pack_data(f, objects, num_objects=None, window=10):
         Should provide __len__
     :param window: Sliding window size for searching for deltas; currently
                    unimplemented
-    :return: List with (name, offset, crc32 checksum) entries, pack checksum
+    :param num_objects: Number of objects (do not use, deprecated)
+    :return: Dict mapping id -> (offset, crc32 checksum), pack checksum
     """
-    if num_objects is not None:
-        warnings.warn("num_objects argument to write_pack_data is deprecated",
-                      DeprecationWarning)
-        # Previously it was possible to pass in an iterable
-        objects = list(objects)
-    else:
+    if num_objects is None:
         num_objects = len(objects)
+    # FIXME: pack_contents = deltify_pack_objects(objects, window)
+    pack_contents = (
+        (o.type_num, o.sha().digest(), None, o.as_raw_string())
+        for (o, path) in objects)
+    return write_pack_data(f, num_objects, pack_contents)
+
 
-    # FIXME: Somehow limit delta depth
-    # FIXME: Make thin-pack optional (its not used when cloning a pack)
-    # # Build a list of objects ordered by the magic Linus heuristic
-    # # This helps us find good objects to diff against us
-    # magic = []
-    # for obj, path in objects:
-    #     magic.append( (obj.type_num, path, 1, -obj.raw_length(), obj) )
-    # magic.sort()
-    # # Build a map of objects and their index in magic - so we can find
-    # # preceeding objects to diff against
-    # offs = {}
-    # for i in range(len(magic)):
-    #     offs[magic[i][4]] = i
+def write_pack_data(f, num_records, records):
+    """Write a new pack data file.
 
+    :param f: File to write to
+    :param num_records: Number of records
+    :param records: Iterator over type_num, object_id, delta_base, raw
+    :return: Dict mapping id -> (offset, crc32 checksum), pack checksum
+    """
     # Write the pack
-    entries = []
+    entries = {}
     f = SHA1Writer(f)
-    write_pack_header(f, num_objects)
-    for o, path in objects:
-        sha1 = o.sha().digest()
-        orig_t = o.type_num
-        raw = o.as_raw_string()
-        winner = raw
-        t = orig_t
-        #for i in range(offs[o]-window, window):
-        #    if i < 0 or i >= len(offs): continue
-        #    b = magic[i][4]
-        #    if b.type_num != orig_t: continue
-        #    base = b.as_raw_string()
-        #    delta = create_delta(base, raw)
-        #    if len(delta) < len(winner):
-        #        winner = delta
-        #        t = 6 if magic[i][2] == 1 else 7
-        offset, crc32 = write_pack_object(f, t, winner)
-        entries.append((sha1, offset, crc32))
+    write_pack_header(f, num_records)
+    for type_num, object_id, delta_base, raw in records:
+        if delta_base is not None:
+            try:
+                base_offset, base_crc32 = entries[delta_base]
+            except KeyError:
+                type_num = REF_DELTA
+                raw = (delta_base, raw)
+            else:
+                type_num = OFS_DELTA
+                raw = (base_offset, raw)
+        offset = f.offset()
+        crc32 = write_pack_object(f, type_num, raw)
+        entries[object_id] = (offset, crc32)
     return entries, f.write_sha()
 
 
@@ -1511,7 +1544,7 @@ class Pack(object):
               *self.data.resolve_object(offset, type, obj))
 
     def pack_tuples(self):
-        """Provide an iterable for use with write_pack_data.
+        """Provide an iterable for use with write_pack_objects.
 
         :return: Object that can iterate over (object, path) tuples
             and provides __len__

+ 2 - 2
dulwich/server.py

@@ -45,7 +45,7 @@ from dulwich.objects import (
     )
 from dulwich.pack import (
     PackStreamReader,
-    write_pack_data,
+    write_pack_objects,
     )
 from dulwich.protocol import (
     BufferedPktLineWriter,
@@ -280,7 +280,7 @@ class UploadPackHandler(Handler):
 
         self.progress("dul-daemon says what\n")
         self.progress("counting objects: %d, done.\n" % len(objects_iter))
-        write_pack_data(ProtocolFile(None, write), objects_iter)
+        write_pack_objects(ProtocolFile(None, write), objects_iter)
         self.progress("how was that, then?\n")
         # we are done
         self.proto.write("0000")

+ 3 - 3
dulwich/tests/test_object_store.py

@@ -42,7 +42,7 @@ from dulwich.object_store import (
     tree_lookup_path,
     )
 from dulwich.pack import (
-    write_pack_data,
+    write_pack_objects,
     )
 from dulwich.tests import (
     TestCase,
@@ -226,14 +226,14 @@ class DiskObjectStoreTests(PackBasedObjectStoreTests, TestCase):
         o = DiskObjectStore(self.store_dir)
         f, commit = o.add_pack()
         b = make_object(Blob, data="more yummy data")
-        write_pack_data(f, [(b, None)])
+        write_pack_objects(f, [(b, None)])
         commit()
 
     def test_add_thin_pack(self):
         o = DiskObjectStore(self.store_dir)
         f, commit = o.add_thin_pack()
         b = make_object(Blob, data="more yummy data")
-        write_pack_data(f, [(b, None)])
+        write_pack_objects(f, [(b, None)])
         commit()
 
 

+ 24 - 0
dulwich/tests/test_pack.py

@@ -33,6 +33,7 @@ from dulwich.file import (
     GitFile,
     )
 from dulwich.objects import (
+    Blob,
     hex_to_sha,
     sha_to_hex,
     Tree,
@@ -44,6 +45,7 @@ from dulwich.pack import (
     ThinPackData,
     apply_delta,
     create_delta,
+    deltify_pack_objects,
     load_pack_index,
     read_zlib_chunks,
     write_pack_header,
@@ -536,3 +538,25 @@ class ReadZlibTests(TestCase):
 
     def test_decompress_buffer_size_4(self):
         self._do_decompress_test(4)
+
+
+class DeltifyTests(TestCase):
+
+    def test_empty(self):
+        self.assertEquals([], list(deltify_pack_objects([])))
+
+    def test_single(self):
+        b = Blob.from_string("foo")
+        self.assertEquals(
+            [(b.type_num, b.sha().digest(), None, b.as_raw_string())],
+            list(deltify_pack_objects([(b, "")])))
+
+    def test_simple_delta(self):
+        b1 = Blob.from_string("a" * 101)
+        b2 = Blob.from_string("a" * 100)
+        delta = create_delta(b1.as_raw_string(), b2.as_raw_string())
+        self.assertEquals([
+            (b1.type_num, b1.sha().digest(), None, b1.as_raw_string()),
+            (b2.type_num, b2.sha().digest(), b1.sha().digest(), delta)
+            ],
+            list(deltify_pack_objects([(b1, ""), (b2, "")])))