Browse Source

Merge improvements to memory usage of write_pack_data.

Jelmer Vernooij 13 years ago
parent
commit
727da82920

+ 10 - 1
NEWS

@@ -1,7 +1,10 @@
-0.7.2	UNRELEASED
+0.8.0	UNRELEASED
 
  BUG FIXES
 
+  * Avoid storing all objects in memory when writing pack.
+    (Jelmer Vernooij, #813268)
+
   * Support IPv6 for git:// connections. (Jelmer Vernooij, #801543)
 
   * Improve performance of Repo.revision_history(). (Timo Schmid, #535118)
@@ -10,6 +13,12 @@
 
   * Fix compilation on newer versions of Mac OS X (Lion and up). (Ryan McKern, #794543)
 
+ API CHANGES
+
+  * write_pack_data and write_pack no longer take a num_objects argument and
+    require an object to be passed in that is iterable (rather than an iterator)
+    and that provides __len__.  (Jelmer Vernooij)
+
 0.7.1	2011-04-12
 
  BUG FIXES

+ 3 - 4
dulwich/client.py

@@ -148,8 +148,8 @@ class GitClient(object):
         """Upload a pack to a remote repository.
 
         :param path: Repository path
-        :param generate_pack_contents: Function that can return the shas of the
-            objects to upload.
+        :param generate_pack_contents: Function that can return a sequence of the
+            shas of the objects to upload.
 
         :raises SendPackError: if server rejects the pack data
         :raises UpdateRefsError: if the server supports report-status
@@ -184,8 +184,7 @@ class GitClient(object):
         if not want:
             return new_refs
         objects = generate_pack_contents(have, want)
-        entries, sha = write_pack_data(proto.write_file(), objects,
-                                       len(objects))
+        entries, sha = write_pack_data(proto.write_file(), objects)
 
         if 'report-status' in self._send_capabilities:
             self._parse_status_report(proto)

+ 2 - 2
dulwich/object_store.py

@@ -321,7 +321,7 @@ class PackBasedObjectStore(BaseObjectStore):
             # Don't bother writing an empty pack file
             return
         f, commit = self.add_pack()
-        write_pack_data(f, objects, len(objects))
+        write_pack_data(f, objects)
         return commit()
 
 
@@ -407,7 +407,7 @@ class DiskObjectStore(PackBasedObjectStore):
             # Write a full pack version
             temppath = os.path.join(self.pack_dir,
                 sha_to_hex(urllib2.randombytes(20))+".temppack")
-            write_pack(temppath, ((o, None) for o in p.iterobjects()), len(p))
+            write_pack(temppath, [(o, None) for o in p.iterobjects()])
         finally:
             p.close()
 

+ 32 - 20
dulwich/pack.py

@@ -60,6 +60,7 @@ try:
 except ImportError:
     from dulwich._compat import unpack_from
 import sys
+import warnings
 import zlib
 
 from dulwich.errors import (
@@ -1123,17 +1124,20 @@ def write_pack_object(f, type, object):
     return (offset, (zlib.crc32(packed_data) & 0xffffffff))
 
 
-def write_pack(filename, objects, num_objects):
+def write_pack(filename, objects, num_objects=None):
     """Write a new pack data file.
 
     :param filename: Path to the new pack file (without .pack extension)
-    :param objects: Iterable over (object, path) tuples to write
-    :param num_objects: Number of objects to write
+    :param objects: Iterable of (object, path) tuples to write.
+        Should provide __len__
     :return: Tuple with checksum of pack file and index file
     """
+    if num_objects is not None:
+        warnings.warn("num_objects argument to write_pack is deprecated",
+                      DeprecationWarning)
     f = GitFile(filename + ".pack", 'wb')
     try:
-        entries, data_sum = write_pack_data(f, objects, num_objects)
+        entries, data_sum = write_pack_data(f, objects, num_objects=num_objects)
     finally:
         f.close()
     entries.sort()
@@ -1151,35 +1155,43 @@ def write_pack_header(f, num_objects):
     f.write(struct.pack('>L', num_objects))  # Number of objects in pack
 
 
-def write_pack_data(f, objects, num_objects, window=10):
+def write_pack_data(f, objects, num_objects=None, window=10):
     """Write a new pack data file.
 
     :param f: File to write to
-    :param objects: Iterable over (object, path) tuples to write
-    :param num_objects: Number of objects to write
+    :param objects: Iterable of (object, path) tuples to write.
+        Should provide __len__
     :param window: Sliding window size for searching for deltas; currently
                    unimplemented
     :return: List with (name, offset, crc32 checksum) entries, pack checksum
     """
-    recency = list(objects)
+    if num_objects is not None:
+        warnings.warn("num_objects argument to write_pack_data is deprecated",
+                      DeprecationWarning)
+        # Previously it was possible to pass in an iterable
+        objects = list(objects)
+    else:
+        num_objects = len(objects)
+
     # FIXME: Somehow limit delta depth
     # FIXME: Make thin-pack optional (its not used when cloning a pack)
-    # Build a list of objects ordered by the magic Linus heuristic
-    # This helps us find good objects to diff against us
-    magic = []
-    for obj, path in recency:
-        magic.append( (obj.type_num, path, 1, -obj.raw_length(), obj) )
-    magic.sort()
-    # Build a map of objects and their index in magic - so we can find
-    # preceeding objects to diff against
-    offs = {}
-    for i in range(len(magic)):
-        offs[magic[i][4]] = i
+    # # Build a list of objects ordered by the magic Linus heuristic
+    # # This helps us find good objects to diff against us
+    # magic = []
+    # for obj, path in objects:
+    #     magic.append( (obj.type_num, path, 1, -obj.raw_length(), obj) )
+    # magic.sort()
+    # # Build a map of objects and their index in magic - so we can find
+    # # preceeding objects to diff against
+    # offs = {}
+    # for i in range(len(magic)):
+    #     offs[magic[i][4]] = i
+
     # Write the pack
     entries = []
     f = SHA1Writer(f)
     write_pack_header(f, num_objects)
-    for o, path in recency:
+    for o, path in objects:
         sha1 = o.sha().digest()
         orig_t = o.type_num
         raw = o.as_raw_string()

+ 1 - 2
dulwich/server.py

@@ -280,8 +280,7 @@ class UploadPackHandler(Handler):
 
         self.progress("dul-daemon says what\n")
         self.progress("counting objects: %d, done.\n" % len(objects_iter))
-        write_pack_data(ProtocolFile(None, write), objects_iter, 
-                        len(objects_iter))
+        write_pack_data(ProtocolFile(None, write), objects_iter)
         self.progress("how was that, then?\n")
         # we are done
         self.proto.write("0000")

+ 1 - 2
dulwich/tests/compat/test_pack.py

@@ -54,8 +54,7 @@ class TestPack(PackTests):
         origpack = self.get_pack(pack1_sha)
         self.assertSucceeds(origpack.index.check)
         pack_path = os.path.join(self._tempdir, "Elch")
-        write_pack(pack_path, [(x, "") for x in origpack.iterobjects()],
-                   len(origpack))
+        write_pack(pack_path, [(x, "") for x in origpack.iterobjects()])
         output = run_git_or_fail(['verify-pack', '-v', pack_path])
 
         pack_shas = set()

+ 2 - 2
dulwich/tests/test_object_store.py

@@ -226,14 +226,14 @@ class DiskObjectStoreTests(PackBasedObjectStoreTests, TestCase):
         o = DiskObjectStore(self.store_dir)
         f, commit = o.add_pack()
         b = make_object(Blob, data="more yummy data")
-        write_pack_data(f, [(b, None)], 1)
+        write_pack_data(f, [(b, None)])
         commit()
 
     def test_add_thin_pack(self):
         o = DiskObjectStore(self.store_dir)
         f, commit = o.add_thin_pack()
         b = make_object(Blob, data="more yummy data")
-        write_pack_data(f, [(b, None)], 1)
+        write_pack_data(f, [(b, None)])
         commit()
 
 

+ 2 - 4
dulwich/tests/test_pack.py

@@ -276,8 +276,7 @@ class TestPack(PackTests):
         try:
             self.assertSucceeds(origpack.index.check)
             basename = os.path.join(self.tempdir, 'Elch')
-            write_pack(basename, [(x, '') for x in origpack.iterobjects()],
-                       len(origpack))
+            write_pack(basename, [(x, '') for x in origpack.iterobjects()])
             newpack = Pack(basename)
 
             try:
@@ -306,8 +305,7 @@ class TestPack(PackTests):
 
     def _copy_pack(self, origpack):
         basename = os.path.join(self.tempdir, 'somepack')
-        write_pack(basename, [(x, '') for x in origpack.iterobjects()],
-                   len(origpack))
+        write_pack(basename, [(x, '') for x in origpack.iterobjects()])
         return Pack(basename)
 
     def test_keep_no_message(self):