Browse Source

Merge pull request #1038 from jelmer/pack-refactoring

Refactoring Pack code to avoid circular references
Jelmer Vernooij 2 years ago
parent
commit
7b7be50750
2 changed files with 124 additions and 89 deletions
  1. 122 87
      dulwich/pack.py
  2. 2 2
      dulwich/tests/test_pack.py

+ 122 - 87
dulwich/pack.py

@@ -46,6 +46,7 @@ from itertools import chain
 
 
 import os
 import os
 import sys
 import sys
+from typing import Optional, Callable, Tuple, List
 
 
 from hashlib import sha1
 from hashlib import sha1
 from os import (
 from os import (
@@ -396,15 +397,7 @@ class PackIndex(object):
         lives at within the corresponding pack file. If the pack file doesn't
         lives at within the corresponding pack file. If the pack file doesn't
         have the object then None will be returned.
         have the object then None will be returned.
         """
         """
-        if len(sha) == 40:
-            sha = hex_to_sha(sha)
-        try:
-            return self._object_index(sha)
-        except ValueError:
-            closed = getattr(self._contents, "closed", None)
-            if closed in (None, True):
-                raise PackFileDisappeared(self)
-            raise
+        raise NotImplementedError(self.object_index)
 
 
     def object_sha1(self, index):
     def object_sha1(self, index):
         """Return the SHA1 corresponding to the index in the pack file."""
         """Return the SHA1 corresponding to the index in the pack file."""
@@ -459,7 +452,9 @@ class MemoryPackIndex(PackIndex):
     def __len__(self):
     def __len__(self):
         return len(self._entries)
         return len(self._entries)
 
 
-    def _object_index(self, sha):
+    def object_index(self, sha):
+        if len(sha) == 40:
+            sha = hex_to_sha(sha)
         return self._by_sha[sha][0]
         return self._by_sha[sha][0]
 
 
     def object_sha1(self, index):
     def object_sha1(self, index):
@@ -484,6 +479,8 @@ class FilePackIndex(PackIndex):
     present.
     present.
     """
     """
 
 
+    _fan_out_table: List[int]
+
     def __init__(self, filename, file=None, contents=None, size=None):
     def __init__(self, filename, file=None, contents=None, size=None):
         """Create a pack index object.
         """Create a pack index object.
 
 
@@ -595,6 +592,23 @@ class FilePackIndex(PackIndex):
         """
         """
         return bytes(self._contents[-20:])
         return bytes(self._contents[-20:])
 
 
+    def object_index(self, sha):
+        """Return the index in to the corresponding packfile for the object.
+
+        Given the name of an object it will return the offset that object
+        lives at within the corresponding pack file. If the pack file doesn't
+        have the object then None will be returned.
+        """
+        if len(sha) == 40:
+            sha = hex_to_sha(sha)
+        try:
+            return self._object_index(sha)
+        except ValueError:
+            closed = getattr(self._contents, "closed", None)
+            if closed in (None, True):
+                raise PackFileDisappeared(self)
+            raise
+
     def _object_index(self, sha):
     def _object_index(self, sha):
         """See object_index.
         """See object_index.
 
 
@@ -1059,7 +1073,6 @@ class PackData(object):
         self._offset_cache = LRUSizeCache(
         self._offset_cache = LRUSizeCache(
             1024 * 1024 * 20, compute_size=_compute_object_size
             1024 * 1024 * 20, compute_size=_compute_object_size
         )
         )
-        self.pack = None
 
 
     @property
     @property
     def filename(self):
     def filename(self):
@@ -1122,65 +1135,6 @@ class PackData(object):
         """
         """
         return compute_file_sha(self._file, end_ofs=-20).digest()
         return compute_file_sha(self._file, end_ofs=-20).digest()
 
 
-    def get_ref(self, sha):
-        """Get the object for a ref SHA, only looking in this pack."""
-        # TODO: cache these results
-        if self.pack is None:
-            raise KeyError(sha)
-        try:
-            offset = self.pack.index.object_index(sha)
-        except KeyError:
-            offset = None
-        if offset:
-            type, obj = self.get_object_at(offset)
-        elif self.pack is not None and self.pack.resolve_ext_ref:
-            type, obj = self.pack.resolve_ext_ref(sha)
-        else:
-            raise KeyError(sha)
-        return offset, type, obj
-
-    def resolve_object(self, offset, type, obj, get_ref=None):
-        """Resolve an object, possibly resolving deltas when necessary.
-
-        Returns: Tuple with object type and contents.
-        """
-        # Walk down the delta chain, building a stack of deltas to reach
-        # the requested object.
-        base_offset = offset
-        base_type = type
-        base_obj = obj
-        delta_stack = []
-        while base_type in DELTA_TYPES:
-            prev_offset = base_offset
-            if get_ref is None:
-                get_ref = self.get_ref
-            if base_type == OFS_DELTA:
-                (delta_offset, delta) = base_obj
-                # TODO: clean up asserts and replace with nicer error messages
-                base_offset = base_offset - delta_offset
-                base_type, base_obj = self.get_object_at(base_offset)
-                assert isinstance(base_type, int)
-            elif base_type == REF_DELTA:
-                (basename, delta) = base_obj
-                assert isinstance(basename, bytes) and len(basename) == 20
-                base_offset, base_type, base_obj = get_ref(basename)
-                assert isinstance(base_type, int)
-            delta_stack.append((prev_offset, base_type, delta))
-
-        # Now grab the base object (mustn't be a delta) and apply the
-        # deltas all the way up the stack.
-        chunks = base_obj
-        for prev_offset, delta_type, delta in reversed(delta_stack):
-            chunks = apply_delta(chunks, delta)
-            # TODO(dborowitz): This can result in poor performance if
-            # large base objects are separated from deltas in the pack.
-            # We should reorganize so that we apply deltas to all
-            # objects in a chain one after the other to optimize cache
-            # performance.
-            if prev_offset is not None:
-                self._offset_cache[prev_offset] = base_type, chunks
-        return base_type, chunks
-
     def iterobjects(self, progress=None, compute_crc32=True):
     def iterobjects(self, progress=None, compute_crc32=True):
         self._file.seek(self._header_size)
         self._file.seek(self._header_size)
         for i in range(1, self._num_objects + 1):
         for i in range(1, self._num_objects + 1):
@@ -1215,7 +1169,7 @@ class PackData(object):
             # Back up over unused data.
             # Back up over unused data.
             self._file.seek(-len(unused), SEEK_CUR)
             self._file.seek(-len(unused), SEEK_CUR)
 
 
-    def iterentries(self, progress=None):
+    def iterentries(self, progress=None, resolve_ext_ref=None):
         """Yield entries summarizing the contents of this pack.
         """Yield entries summarizing the contents of this pack.
 
 
         Args:
         Args:
@@ -1224,25 +1178,24 @@ class PackData(object):
         Returns: iterator of tuples with (sha, offset, crc32)
         Returns: iterator of tuples with (sha, offset, crc32)
         """
         """
         num_objects = self._num_objects
         num_objects = self._num_objects
-        resolve_ext_ref = self.pack.resolve_ext_ref if self.pack is not None else None
         indexer = PackIndexer.for_pack_data(self, resolve_ext_ref=resolve_ext_ref)
         indexer = PackIndexer.for_pack_data(self, resolve_ext_ref=resolve_ext_ref)
         for i, result in enumerate(indexer):
         for i, result in enumerate(indexer):
             if progress is not None:
             if progress is not None:
                 progress(i, num_objects)
                 progress(i, num_objects)
             yield result
             yield result
 
 
-    def sorted_entries(self, progress=None):
+    def sorted_entries(self, progress=None, resolve_ext_ref=None):
         """Return entries in this pack, sorted by SHA.
         """Return entries in this pack, sorted by SHA.
 
 
         Args:
         Args:
           progress: Progress function, called with current and total
           progress: Progress function, called with current and total
             object count
             object count
-        Returns: List of tuples with (sha, offset, crc32)
+        Returns: Iterator of tuples with (sha, offset, crc32)
         """
         """
-        ret = sorted(self.iterentries(progress=progress))
-        return ret
+        return sorted(self.iterentries(
+            progress=progress, resolve_ext_ref=resolve_ext_ref))
 
 
-    def create_index_v1(self, filename, progress=None):
+    def create_index_v1(self, filename, progress=None, resolve_ext_ref=None):
         """Create a version 1 file for this data file.
         """Create a version 1 file for this data file.
 
 
         Args:
         Args:
@@ -1250,11 +1203,12 @@ class PackData(object):
           progress: Progress report function
           progress: Progress report function
         Returns: Checksum of index file
         Returns: Checksum of index file
         """
         """
-        entries = self.sorted_entries(progress=progress)
+        entries = self.sorted_entries(
+            progress=progress, resolve_ext_ref=resolve_ext_ref)
         with GitFile(filename, "wb") as f:
         with GitFile(filename, "wb") as f:
             return write_pack_index_v1(f, entries, self.calculate_checksum())
             return write_pack_index_v1(f, entries, self.calculate_checksum())
 
 
-    def create_index_v2(self, filename, progress=None):
+    def create_index_v2(self, filename, progress=None, resolve_ext_ref=None):
         """Create a version 2 index file for this data file.
         """Create a version 2 index file for this data file.
 
 
         Args:
         Args:
@@ -1262,11 +1216,12 @@ class PackData(object):
           progress: Progress report function
           progress: Progress report function
         Returns: Checksum of index file
         Returns: Checksum of index file
         """
         """
-        entries = self.sorted_entries(progress=progress)
+        entries = self.sorted_entries(
+            progress=progress, resolve_ext_ref=resolve_ext_ref)
         with GitFile(filename, "wb") as f:
         with GitFile(filename, "wb") as f:
             return write_pack_index_v2(f, entries, self.calculate_checksum())
             return write_pack_index_v2(f, entries, self.calculate_checksum())
 
 
-    def create_index(self, filename, progress=None, version=2):
+    def create_index(self, filename, progress=None, version=2, resolve_ext_ref=None):
         """Create an  index file for this data file.
         """Create an  index file for this data file.
 
 
         Args:
         Args:
@@ -1275,9 +1230,11 @@ class PackData(object):
         Returns: Checksum of index file
         Returns: Checksum of index file
         """
         """
         if version == 1:
         if version == 1:
-            return self.create_index_v1(filename, progress)
+            return self.create_index_v1(
+                filename, progress, resolve_ext_ref=resolve_ext_ref)
         elif version == 2:
         elif version == 2:
-            return self.create_index_v2(filename, progress)
+            return self.create_index_v2(
+                filename, progress, resolve_ext_ref=resolve_ext_ref)
         else:
         else:
             raise ValueError("unknown index format %d" % version)
             raise ValueError("unknown index format %d" % version)
 
 
@@ -2010,7 +1967,8 @@ write_pack_index = write_pack_index_v2
 class Pack(object):
 class Pack(object):
     """A Git pack object."""
     """A Git pack object."""
 
 
-    def __init__(self, basename, resolve_ext_ref=None):
+    def __init__(self, basename, resolve_ext_ref: Optional[
+            Callable[[bytes], Tuple[int, UnpackedObject]]] = None):
         self._basename = basename
         self._basename = basename
         self._data = None
         self._data = None
         self._idx = None
         self._idx = None
@@ -2034,7 +1992,6 @@ class Pack(object):
         """Create a new pack object from pack data and index objects."""
         """Create a new pack object from pack data and index objects."""
         ret = cls("")
         ret = cls("")
         ret._data = data
         ret._data = data
-        ret._data.pack = ret
         ret._data_load = None
         ret._data_load = None
         ret._idx = idx
         ret._idx = idx
         ret._idx_load = None
         ret._idx_load = None
@@ -2050,7 +2007,6 @@ class Pack(object):
         """The pack data object being used."""
         """The pack data object being used."""
         if self._data is None:
         if self._data is None:
             self._data = self._data_load()
             self._data = self._data_load()
-            self._data.pack = self
             self.check_length_and_checksum()
             self.check_length_and_checksum()
         return self._data
         return self._data
 
 
@@ -2142,7 +2098,7 @@ class Pack(object):
     def get_raw(self, sha1):
     def get_raw(self, sha1):
         offset = self.index.object_index(sha1)
         offset = self.index.object_index(sha1)
         obj_type, obj = self.data.get_object_at(offset)
         obj_type, obj = self.data.get_object_at(offset)
-        type_num, chunks = self.data.resolve_object(offset, obj_type, obj)
+        type_num, chunks = self.resolve_object(offset, obj_type, obj)
         return type_num, b"".join(chunks)
         return type_num, b"".join(chunks)
 
 
     def __getitem__(self, sha1):
     def __getitem__(self, sha1):
@@ -2190,6 +2146,85 @@ class Pack(object):
                 keepfile.write(b"\n")
                 keepfile.write(b"\n")
         return keepfile_name
         return keepfile_name
 
 
+    def get_ref(self, sha) -> Tuple[int, int, UnpackedObject]:
+        """Get the object for a ref SHA, only looking in this pack."""
+        # TODO: cache these results
+        try:
+            offset = self.index.object_index(sha)
+        except KeyError:
+            offset = None
+        if offset:
+            type, obj = self.data.get_object_at(offset)
+        elif self.resolve_ext_ref:
+            type, obj = self.resolve_ext_ref(sha)
+        else:
+            raise KeyError(sha)
+        return offset, type, obj
+
+    def resolve_object(self, offset, type, obj, get_ref=None):
+        """Resolve an object, possibly resolving deltas when necessary.
+
+        Returns: Tuple with object type and contents.
+        """
+        # Walk down the delta chain, building a stack of deltas to reach
+        # the requested object.
+        base_offset = offset
+        base_type = type
+        base_obj = obj
+        delta_stack = []
+        while base_type in DELTA_TYPES:
+            prev_offset = base_offset
+            if get_ref is None:
+                get_ref = self.get_ref
+            if base_type == OFS_DELTA:
+                (delta_offset, delta) = base_obj
+                # TODO: clean up asserts and replace with nicer error messages
+                base_offset = base_offset - delta_offset
+                base_type, base_obj = self.data.get_object_at(base_offset)
+                assert isinstance(base_type, int)
+            elif base_type == REF_DELTA:
+                (basename, delta) = base_obj
+                assert isinstance(basename, bytes) and len(basename) == 20
+                base_offset, base_type, base_obj = get_ref(basename)
+                assert isinstance(base_type, int)
+            delta_stack.append((prev_offset, base_type, delta))
+
+        # Now grab the base object (mustn't be a delta) and apply the
+        # deltas all the way up the stack.
+        chunks = base_obj
+        for prev_offset, delta_type, delta in reversed(delta_stack):
+            chunks = apply_delta(chunks, delta)
+            # TODO(dborowitz): This can result in poor performance if
+            # large base objects are separated from deltas in the pack.
+            # We should reorganize so that we apply deltas to all
+            # objects in a chain one after the other to optimize cache
+            # performance.
+            if prev_offset is not None:
+                self.data._offset_cache[prev_offset] = base_type, chunks
+        return base_type, chunks
+
+    def entries(self, progress=None):
+        """Yield entries summarizing the contents of this pack.
+
+        Args:
+          progress: Progress function, called with current and total
+            object count.
+        Returns: iterator of tuples with (sha, offset, crc32)
+        """
+        return self.data.iterentries(
+            progress=progress, resolve_ext_ref=self.resolve_ext_ref)
+
+    def sorted_entries(self, progress=None):
+        """Return entries in this pack, sorted by SHA.
+
+        Args:
+          progress: Progress function, called with current and total
+            object count
+        Returns: Iterator of tuples with (sha, offset, crc32)
+        """
+        return self.data.sorted_entries(
+            progress=progress, resolve_ext_ref=self.resolve_ext_ref)
+
 
 
 try:
 try:
     from dulwich._pack import (  # type: ignore # noqa: F811
     from dulwich._pack import (  # type: ignore # noqa: F811

+ 2 - 2
dulwich/tests/test_pack.py

@@ -562,8 +562,8 @@ class TestThinPack(PackTests):
         # Index the new pack.
         # Index the new pack.
         with self.make_pack(True) as pack:
         with self.make_pack(True) as pack:
             with PackData(pack._data_path) as data:
             with PackData(pack._data_path) as data:
-                data.pack = pack
-                data.create_index(self.pack_prefix + ".idx")
+                data.create_index(
+                    self.pack_prefix + ".idx", resolve_ext_ref=pack.resolve_ext_ref)
 
 
         del self.store[self.blobs[b"bar"].id]
         del self.store[self.blobs[b"bar"].id]