
Merge performance improvements.

Jelmer Vernooij 16 years ago
parent
commit
87269e0ed6

+ 252 - 0
dulwich/lru_cache.py

@@ -0,0 +1,252 @@
+# Copyright (C) 2006, 2008 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+"""A simple least-recently-used (LRU) cache."""
+
+from collections import deque
+
+
+class LRUCache(object):
+    """A class which manages a cache of entries, removing unused ones."""
+
+    def __init__(self, max_cache=100, after_cleanup_count=None):
+        self._cache = {}
+        self._cleanup = {}
+        self._queue = deque() # Track when things are accessed
+        self._refcount = {} # number of entries in self._queue for each key
+        self._update_max_cache(max_cache, after_cleanup_count)
+
+    def __contains__(self, key):
+        return key in self._cache
+
+    def __getitem__(self, key):
+        val = self._cache[key]
+        self._record_access(key)
+        return val
+
+    def __len__(self):
+        return len(self._cache)
+
+    def add(self, key, value, cleanup=None):
+        """Add a new value to the cache.
+
+        Also, if the entry is ever removed from the cache, call cleanup,
+        passing it the key and value being removed.
+
+        :param key: The key to store it under
+        :param value: The object to store
+        :param cleanup: None or a function taking (key, value) to indicate
+                        'value' should be cleaned up.
+        """
+        if key in self._cache:
+            self._remove(key)
+        self._cache[key] = value
+        if cleanup is not None:
+            self._cleanup[key] = cleanup
+        self._record_access(key)
+
+        if len(self._cache) > self._max_cache:
+            # Trigger the cleanup
+            self.cleanup()
+
+    def get(self, key, default=None):
+        if key in self._cache:
+            return self[key]
+        return default
+
+    def keys(self):
+        """Get the list of keys currently cached.
+
+        Note that the keys returned here may no longer be cached by the time
+        you request them later. This is simply meant as a peek into the
+        current state.
+
+        :return: An unordered list of keys that are currently cached.
+        """
+        return self._cache.keys()
+
+    def cleanup(self):
+        """Clear the cache until it shrinks to the requested size.
+
+        This does not completely wipe the cache, just makes sure it is under
+        the after_cleanup_count.
+        """
+        # Make sure the cache is shrunk to the correct size
+        while len(self._cache) > self._after_cleanup_count:
+            self._remove_lru()
+        # No need to compact the queue at this point, because the code that
+        # calls this would have already triggered it based on queue length
+
+    def __setitem__(self, key, value):
+        """Add a value to the cache, there will be no cleanup function."""
+        self.add(key, value, cleanup=None)
+
+    def _record_access(self, key):
+        """Record that key was accessed."""
+        self._queue.append(key)
+        # Can't use setdefault because you can't += 1 the result
+        self._refcount[key] = self._refcount.get(key, 0) + 1
+
+        # If our access queue is too large, clean it up too
+        if len(self._queue) > self._compact_queue_length:
+            self._compact_queue()
+
+    def _compact_queue(self):
+        """Compact the queue, leaving things in sorted last appended order."""
+        new_queue = deque()
+        for item in self._queue:
+            if self._refcount[item] == 1:
+                new_queue.append(item)
+            else:
+                self._refcount[item] -= 1
+        self._queue = new_queue
+        # The queue, cache and refcount dict should now all have the same
+        # length, with one queue entry per cache entry and every refcount == 1
+        if not (len(self._queue) == len(self._cache) ==
+                len(self._refcount) == sum(self._refcount.itervalues())):
+            raise AssertionError()
+
+    def _remove(self, key):
+        """Remove an entry, making sure to maintain the invariants."""
+        cleanup = self._cleanup.pop(key, None)
+        val = self._cache.pop(key)
+        if cleanup is not None:
+            cleanup(key, val)
+        return val
+
+    def _remove_lru(self):
+        """Remove one entry from the lru, and handle consequences.
+
+        If there are no more references to the lru, then this entry should be
+        removed from the cache.
+        """
+        key = self._queue.popleft()
+        self._refcount[key] -= 1
+        if not self._refcount[key]:
+            del self._refcount[key]
+            self._remove(key)
+
+    def clear(self):
+        """Clear out all of the cache."""
+        # Clean up in LRU order
+        while self._cache:
+            self._remove_lru()
+
+    def resize(self, max_cache, after_cleanup_count=None):
+        """Change the number of entries that will be cached."""
+        self._update_max_cache(max_cache,
+                               after_cleanup_count=after_cleanup_count)
+
+    def _update_max_cache(self, max_cache, after_cleanup_count=None):
+        self._max_cache = max_cache
+        if after_cleanup_count is None:
+            self._after_cleanup_count = self._max_cache * 8 / 10
+        else:
+            self._after_cleanup_count = min(after_cleanup_count, self._max_cache)
+
+        self._compact_queue_length = 4*self._max_cache
+        if len(self._queue) > self._compact_queue_length:
+            self._compact_queue()
+        self.cleanup()
+
+
+class LRUSizeCache(LRUCache):
+    """An LRUCache that removes things based on the size of the values.
+
+    This differs in that it doesn't care how many actual items there are;
+    it just restricts the cache to be cleaned up once so much data is stored.
+
+    The values that are added must support len(value).
+    """
+
+    def __init__(self, max_size=1024*1024, after_cleanup_size=None,
+                 compute_size=None):
+        """Create a new LRUSizeCache.
+
+        :param max_size: The max number of bytes to store before we start
+            clearing out entries.
+        :param after_cleanup_size: After cleaning up, shrink everything to this
+            size.
+        :param compute_size: A function to compute the size of the values. We
+            use a function here, so that you can pass 'len' if you are just
+            using simple strings, or a more complex function if you are using
+            something like a list of strings, or even a custom object.
+            The function should take the form "compute_size(value) => integer".
+            If not supplied, it defaults to 'len()'.
+        """
+        self._value_size = 0
+        self._compute_size = compute_size
+        if compute_size is None:
+            self._compute_size = len
+        # This approximates that texts are > 0.5k in size. It only really
+        # matters for when we clean up the queue, so we don't want it to be
+        # too large.
+        self._update_max_size(max_size, after_cleanup_size=after_cleanup_size)
+        LRUCache.__init__(self, max_cache=max(int(max_size/512), 1))
+
+    def add(self, key, value, cleanup=None):
+        """Add a new value to the cache.
+
+        Also, if the entry is ever removed from the cache, call cleanup,
+        passing it the key and value being removed.
+
+        :param key: The key to store it under
+        :param value: The object to store
+        :param cleanup: None or a function taking (key, value) to indicate
+                        'value' should be cleaned up.
+        """
+        if key in self._cache:
+            self._remove(key)
+        value_len = self._compute_size(value)
+        if value_len >= self._after_cleanup_size:
+            return
+        self._value_size += value_len
+        self._cache[key] = value
+        if cleanup is not None:
+            self._cleanup[key] = cleanup
+        self._record_access(key)
+
+        if self._value_size > self._max_size:
+            # Time to cleanup
+            self.cleanup()
+
+    def cleanup(self):
+        """Clear the cache until it shrinks to the requested size.
+
+        This does not completely wipe the cache, just makes sure it is under
+        the after_cleanup_size.
+        """
+        # Make sure the cache is shrunk to the correct size
+        while self._value_size > self._after_cleanup_size:
+            self._remove_lru()
+
+    def _remove(self, key):
+        """Remove an entry, making sure to maintain the invariants."""
+        val = LRUCache._remove(self, key)
+        self._value_size -= self._compute_size(val)
+
+    def resize(self, max_size, after_cleanup_size=None):
+        """Change the number of bytes that will be cached."""
+        self._update_max_size(max_size, after_cleanup_size=after_cleanup_size)
+        max_cache = max(int(max_size/512), 1)
+        self._update_max_cache(max_cache)
+
+    def _update_max_size(self, max_size, after_cleanup_size=None):
+        self._max_size = max_size
+        if after_cleanup_size is None:
+            self._after_cleanup_size = self._max_size * 8 / 10
+        else:
+            self._after_cleanup_size = min(after_cleanup_size, self._max_size)
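
The cache classes above are self-contained, so their behaviour can be seen in isolation. The sketch below is illustrative only (the sizes, keys and callback are made up); it shows the count-based LRUCache evicting down to after_cleanup_count once max_cache is exceeded, a cleanup callback firing on eviction, and the size-based LRUSizeCache evicting by total bytes stored rather than by entry count.

    from dulwich.lru_cache import LRUCache, LRUSizeCache

    # Count-based cache: once more than max_cache entries are stored,
    # cleanup() evicts least-recently-used keys down to after_cleanup_count.
    cache = LRUCache(max_cache=5, after_cleanup_count=3)
    for i in range(10):
        cache[i] = str(i)           # __setitem__ routes through add()
    assert 0 not in cache           # oldest keys were evicted along the way
    assert 9 in cache               # most recent keys survive

    # A cleanup callback registered with add() is invoked when the entry
    # is removed, e.g. by clear().
    removed = []
    cache.add("tmp", "data", cleanup=lambda key, value: removed.append(key))
    cache.clear()
    assert "tmp" in removed

    # Size-based cache: eviction is driven by the summed size of the values.
    scache = LRUSizeCache(max_size=100, after_cleanup_size=80, compute_size=len)
    scache.add("a", "x" * 60)
    scache.add("b", "y" * 60)       # total 120 > max_size, so "a" is evicted
    assert "b" in scache and "a" not in scache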

+ 61 - 32
dulwich/object_store.py

@@ -21,55 +21,66 @@ import tempfile
 import urllib2
 
 from dulwich.objects import (
+    ShaFile,
     hex_to_sha,
     sha_to_hex,
-    ShaFile,
     )
 from dulwich.pack import (
     Pack,
+    PackData, 
     iter_sha1, 
     load_packs, 
     write_pack,
     write_pack_data,
     write_pack_index_v2,
-    PackData, 
     )
 
 PACKDIR = 'pack'
 
 class ObjectStore(object):
+    """Object store."""
 
     def __init__(self, path):
+        """Open an object store.
+
+        :param path: Path of the object store.
+        """
         self.path = path
-        self._packs = None
+        self._pack_cache = None
+        self.pack_dir = os.path.join(self.path, PACKDIR)
 
     def determine_wants_all(self, refs):
 	    return [sha for (ref, sha) in refs.iteritems() if not sha in self and not ref.endswith("^{}")]
 
     def iter_shas(self, shas):
-        return ObjectStoreIterator(self, shas)
+        """Iterate over the objects for the specified shas.
 
-    def pack_dir(self):
-        return os.path.join(self.path, PACKDIR)
+        :param shas: Iterable object with SHAs
+        """
+        return ObjectStoreIterator(self, shas)
 
     def __contains__(self, sha):
-        # TODO: This can be more efficient
-        try:
-            self[sha]
+        for pack in self.packs:
+            if sha in pack:
+                return True
+        ret = self._get_shafile(sha)
+        if ret is not None:
             return True
-        except KeyError:
-            return False
+        return False
 
     @property
     def packs(self):
         """List with pack objects."""
-        if self._packs is None:
-            self._packs = list(load_packs(self.pack_dir()))
-        return self._packs
+        if self._pack_cache is None:
+            self._pack_cache = list(load_packs(self.pack_dir))
+        return self._pack_cache
 
     def _add_known_pack(self, path):
-        if self._packs is not None:
-            self._packs.append(Pack(path))
+        """Add a newly appeared pack to the cache by path.
+
+        """
+        if self._pack_cache is not None:
+            self._pack_cache.append(Pack(path))
 
     def _get_shafile_path(self, sha):
         dir = sha[:2]
@@ -99,8 +110,10 @@ class ObjectStore(object):
         :return: tuple with object type and object contents.
         """
         for pack in self.packs:
-            if sha in pack:
+            try:
                 return pack.get_raw(sha, self.get_raw)
+            except KeyError:
+                pass
         # FIXME: Are thin pack deltas ever against on-disk shafiles ?
         ret = self._get_shafile(sha)
         if ret is not None:
@@ -125,11 +138,11 @@ class ObjectStore(object):
         :param path: Path to the pack file.
         """
         p = PackData(path)
-        temppath = os.path.join(self.pack_dir(), 
+        temppath = os.path.join(self.pack_dir, 
             sha_to_hex(urllib2.randombytes(20))+".temppack")
         write_pack(temppath, p.iterobjects(self.get_raw), len(p))
         pack_sha = PackIndex(temppath+".idx").objects_sha1()
-        newbasename = os.path.join(self.pack_dir(), "pack-%s" % pack_sha)
+        newbasename = os.path.join(self.pack_dir, "pack-%s" % pack_sha)
         os.rename(temppath+".pack", newbasename+".pack")
         os.rename(temppath+".idx", newbasename+".idx")
         self._add_known_pack(newbasename)
@@ -144,7 +157,7 @@ class ObjectStore(object):
         """
         p = PackData(path)
         entries = p.sorted_entries()
-        basename = os.path.join(self.pack_dir(), 
+        basename = os.path.join(self.pack_dir, 
             "pack-%s" % iter_sha1(entry[0] for entry in entries))
         write_pack_index_v2(basename+".idx", entries, p.get_stored_checksum())
         os.rename(path, basename + ".pack")
@@ -156,7 +169,7 @@ class ObjectStore(object):
         Thin packs are packs that contain deltas with parents that exist 
         in a different pack.
         """
-        fd, path = tempfile.mkstemp(dir=self.pack_dir(), suffix=".pack")
+        fd, path = tempfile.mkstemp(dir=self.pack_dir, suffix=".pack")
         f = os.fdopen(fd, 'w')
         def commit():
             os.fsync(fd)
@@ -171,7 +184,7 @@ class ObjectStore(object):
         :return: Fileobject to write to and a commit function to 
             call when the pack is finished.
         """
-        fd, path = tempfile.mkstemp(dir=self.pack_dir(), suffix=".pack")
+        fd, path = tempfile.mkstemp(dir=self.pack_dir, suffix=".pack")
         f = os.fdopen(fd, 'w')
         def commit():
             os.fsync(fd)
@@ -181,6 +194,10 @@ class ObjectStore(object):
         return f, commit
 
     def add_objects(self, objects):
+        """Add a set of objects to this object store.
+
+        :param objects: Iterable over the objects to add.
+        """
         if len(objects) == 0:
             return
         f, commit = self.add_pack()
@@ -189,46 +206,60 @@ class ObjectStore(object):
 
 
 class ObjectImporter(object):
+    """Interface for importing objects."""
 
     def __init__(self, count):
+        """Create a new ObjectImporter.
+
+        :param count: Number of objects that will be imported.
+        """
         self.count = count
 
     def add_object(self, object):
+        """Add an object."""
         raise NotImplementedError(self.add_object)
 
     def finish(self, object):
+        """Finish the imoprt and write objects to disk."""
         raise NotImplementedError(self.finish)
 
 
 class ObjectIterator(object):
+    """Interface for iterating over objects."""
 
     def iterobjects(self):
         raise NotImplementedError(self.iterobjects)
 
 
 class ObjectStoreIterator(ObjectIterator):
+    """ObjectIterator that works on top of an ObjectStore."""
 
     def __init__(self, store, sha_iter):
         self.store = store
-        self.shas = list(sha_iter)
+        self.sha_iter = sha_iter
+        self._shas = []
 
     def __iter__(self):
-        return ((self.store[sha], path) for sha, path in self.shas)
+        for sha, path in self.itershas():
+            yield self.store[sha], path
 
     def iterobjects(self):
         for o, path in self:
             yield o
 
+    def itershas(self):
+        for sha in self._shas:
+            yield sha
+        for sha in self.sha_iter:
+            self._shas.append(sha)
+            yield sha
+
     def __contains__(self, needle):
         """Check if an object is present.
 
         :param needle: SHA1 of the object to check for
         """
-        # FIXME: This could be more efficient
-        for sha, path in self.shas:
-            if sha == needle:
-                return True
-        return False
+        return needle in self.store
 
     def __getitem__(self, key):
         """Find an object by SHA1."""
@@ -236,6 +267,4 @@ class ObjectStoreIterator(ObjectIterator):
 
     def __len__(self):
         """Return the number of objects."""
-        return len(self.shas)
-
-
+        return len(list(self.itershas()))
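
ObjectStoreIterator no longer materialises the full SHA list up front: itershas() consumes the underlying iterator lazily and memoizes what it has already seen, so __iter__, __len__ and repeated passes don't re-run the potentially expensive SHA resolution. Below is a standalone sketch of that memoizing-iterator pattern; the MemoizingIter class and expensive_shas() helper are made up for illustration and are not dulwich API.

    class MemoizingIter(object):
        """Replays already-seen items, consuming the source at most once."""

        def __init__(self, source):
            self._source = iter(source)
            self._seen = []

        def __iter__(self):
            for item in self._seen:
                yield item
            for item in self._source:
                self._seen.append(item)
                yield item

    def expensive_shas():
        for sha in ["aaa", "bbb", "ccc"]:
            print("resolving %s" % sha)          # executed only once per SHA
            yield sha

    shas = MemoizingIter(expensive_shas())
    assert list(shas) == ["aaa", "bbb", "ccc"]   # first pass drains the source
    assert list(shas) == ["aaa", "bbb", "ccc"]   # later passes replay the cache

This is why __len__ above can afford to call itershas() each time: after the first full pass, later calls only replay the cached list.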

+ 53 - 33
dulwich/pack.py

@@ -51,6 +51,9 @@ from dulwich.errors import (
     ApplyDeltaError,
     ChecksumMismatch,
     )
+from dulwich.lru_cache import (
+    LRUSizeCache,
+    )
 from dulwich.objects import (
     ShaFile,
     hex_to_sha,
@@ -93,7 +96,7 @@ def iter_sha1(iter):
     return sha1.hexdigest()
 
 
-MAX_MMAP_SIZE = 256 * 1024 * 1024
+MAX_MMAP_SIZE = 1024 * 1024 * 1024
 
 def simple_mmap(f, offset, size, access=mmap.ACCESS_READ):
     """Simple wrapper for mmap() which always supports the offset parameter.
@@ -134,28 +137,6 @@ def simple_mmap(f, offset, size, access=mmap.ACCESS_READ):
         return ArraySkipper(mem, offset)
 
 
-def resolve_object(offset, type, obj, get_ref, get_offset):
-    """Resolve an object, possibly resolving deltas when necessary."""
-    if not type in (6, 7): # Not a delta
-        return type, obj
-  
-    if type == 6: # offset delta
-        (delta_offset, delta) = obj
-        assert isinstance(delta_offset, int)
-        assert isinstance(delta, str)
-        offset = offset-delta_offset
-        type, base_obj = get_offset(offset)
-        assert isinstance(type, int)
-    elif type == 7: # ref delta
-        (basename, delta) = obj
-        assert isinstance(basename, str) and len(basename) == 20
-        assert isinstance(delta, str)
-        type, base_obj = get_ref(basename)
-        assert isinstance(type, int)
-    type, base_text = resolve_object(offset, type, base_obj, get_ref, get_offset)
-    return type, apply_delta(base_text, delta)
-
-
 class PackIndex(object):
     """An index in to a packfile.
   
@@ -367,6 +348,13 @@ def unpack_object(map):
         return type, uncomp, comp_len+raw_base
 
 
+def compute_object_size((num, obj)):
+    if num in (6, 7):
+        return len(obj[1])
+    assert isinstance(obj, str)
+    return len(obj)
+
+
 class PackData(object):
     """The data contained in a packfile.
   
@@ -409,6 +397,8 @@ class PackData(object):
         self._header_size = 12
         assert self._size >= self._header_size, "%s is too small for a packfile (%d < %d)" % (filename, self._size, self._header_size)
         self._read_header()
+        self._offset_cache = LRUSizeCache(1024*1024*100, 
+            compute_size=compute_object_size)
   
     def _read_header(self):
         f = open(self._filename, 'rb')
@@ -432,6 +422,39 @@ class PackData(object):
             return make_sha(map[:-20]).digest()
         finally:
             f.close()
+
+    def resolve_object(self, offset, type, obj, get_ref, get_offset=None):
+        """Resolve an object, possibly resolving deltas when necessary.
+        
+        :return: Tuple with object type and contents.
+        """
+        if not type in (6, 7): # Not a delta
+            return type, obj
+
+        if get_offset is None:
+            get_offset = self.get_object_at
+      
+        if type == 6: # offset delta
+            (delta_offset, delta) = obj
+            assert isinstance(delta_offset, int)
+            assert isinstance(delta, str)
+            base_offset = offset-delta_offset
+            type, base_obj = get_offset(base_offset)
+            assert isinstance(type, int)
+        elif type == 7: # ref delta
+            (basename, delta) = obj
+            assert isinstance(basename, str) and len(basename) == 20
+            assert isinstance(delta, str)
+            type, base_obj = get_ref(basename)
+            assert isinstance(type, int)
+            # Can't be an ofs delta, as we wouldn't know the base offset
+            assert type != 6
+            base_offset = None
+        type, base_text = self.resolve_object(base_offset, type, base_obj, get_ref)
+        if base_offset is not None:
+            self._offset_cache[base_offset] = type, base_text
+        ret = (type, apply_delta(base_text, delta))
+        return ret
   
     def iterobjects(self):
         offset = self._header_size
@@ -446,7 +469,6 @@ class PackData(object):
   
     def iterentries(self, ext_resolve_ref=None):
         found = {}
-        at = {}
         postponed = defaultdict(list)
         class Postpone(Exception):
             """Raised to postpone delta resolving."""
@@ -463,13 +485,11 @@ class PackData(object):
         todo = list(self.iterobjects())
         while todo:
             (offset, type, obj, crc32) = todo.pop(0)
-            at[offset] = (type, obj)
             assert isinstance(offset, int)
             assert isinstance(type, int)
             assert isinstance(obj, tuple) or isinstance(obj, str)
             try:
-                type, obj = resolve_object(offset, type, obj, get_ref_text,
-                    at.__getitem__)
+                type, obj = self.resolve_object(offset, type, obj, get_ref_text)
             except Postpone, (sha, ):
                 postponed[sha].append((offset, type, obj))
             else:
@@ -507,6 +527,8 @@ class PackData(object):
         then the packfile can be asked directly for that object using this
         function.
         """
+        if offset in self._offset_cache:
+            return self._offset_cache[offset]
         assert isinstance(offset, long) or isinstance(offset, int),\
                 "offset was %r" % offset
         assert offset >= self._header_size
@@ -516,7 +538,8 @@ class PackData(object):
         f = open(self._filename, 'rb')
         try:
             map = simple_mmap(f, offset, size-offset)
-            return unpack_object(map)[:2]
+            ret = unpack_object(map)[:2]
+            return ret
         finally:
             f.close()
 
@@ -884,8 +907,7 @@ class Pack(object):
         if isinstance(offset, long):
           offset = int(offset)
         assert isinstance(offset, int)
-        return resolve_object(offset, type, obj, resolve_ref,
-            self.data.get_object_at)
+        return self.data.resolve_object(offset, type, obj, resolve_ref)
 
     def __getitem__(self, sha1):
         """Retrieve the specified SHA1."""
@@ -899,9 +921,7 @@ class Pack(object):
         for offset, type, obj, crc32 in self.data.iterobjects():
             assert isinstance(offset, int)
             yield ShaFile.from_raw_string(
-                    *resolve_object(offset, type, obj, 
-                        get_raw, 
-                    self.data.get_object_at))
+                    *self.data.resolve_object(offset, type, obj, get_raw))
 
 
 def load_packs(path):
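
The per-PackData offset cache is what delivers most of the speedup here: once an object at a given pack offset has been reconstructed, resolve_object() and get_object_at() can serve it from an LRUSizeCache (bounded at roughly 100 MB of object data via compute_object_size) instead of re-inflating the whole delta chain. The following is a simplified sketch of that memoization, assuming a made-up resolve() helper and toy sizes; only LRUSizeCache itself comes from the code above.

    from dulwich.lru_cache import LRUSizeCache

    # Bound the cache by the amount of reconstructed text it holds (~1 KiB
    # here), not by the number of entries.
    offset_cache = LRUSizeCache(max_size=1024, compute_size=len)

    def resolve(offset):
        """Pretend to reconstruct an object's text at a pack offset."""
        if offset in offset_cache:
            return offset_cache[offset]   # hit: no re-walking of the delta chain
        text = "x" * 200                  # stand-in for an expensive reconstruction
        offset_cache[offset] = text       # kept until ~1 KiB of text accumulates
        return text

    for off in [12, 40, 12, 12, 99]:
        resolve(off)                      # offsets 12, 40 and 99 resolved once each

In the real code the cached values are (type, text) pairs, which is why compute_object_size() above unpacks the pair and measures the text or delta payload rather than calling len() on the tuple itself.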

+ 1 - 1
dulwich/tests/test_object_store.py

@@ -24,7 +24,7 @@ class ObjectStoreTests(TestCase):
 
     def test_pack_dir(self):
         o = ObjectStore("foo")
-        self.assertEquals("foo/pack", o.pack_dir())
+        self.assertEquals("foo/pack", o.pack_dir)
 
     def test_empty_packs(self):
         o = ObjectStore("foo")