Przeglądaj źródła

Refactor BitmapReachability to eliminate code duplication

The get_reachable_commits() and get_reachable_objects() methods had
significant duplication in bitmap combining and exclusion logic.
Jelmer Vernooij 2 miesięcy temu
rodzic
commit
b89cd3454d
2 zmienionych plików z 139 dodań i 161 usunięć
  1. 66 51
      dulwich/bitmap.py
  2. 73 110
      dulwich/object_store.py

+ 66 - 51
dulwich/bitmap.py

@@ -309,6 +309,23 @@ class EWAHBitmap:
         result.bit_count = max(self.bit_count, other.bit_count)
         return result
 
+    def __sub__(self, other: "EWAHBitmap") -> "EWAHBitmap":
+        """Bitwise subtraction (set difference).
+
+        Returns bits that are in self but not in other.
+        Equivalent to: self & ~other
+
+        Args:
+            other: Bitmap to subtract
+
+        Returns:
+            New bitmap with bits in self but not in other
+        """
+        result = EWAHBitmap()
+        result.bits = self.bits - other.bits
+        result.bit_count = self.bit_count
+        return result
+
     def add(self, bit: int) -> None:
         """Set a bit.
 
@@ -789,13 +806,14 @@ def select_bitmap_commits(
     for ref_name, sha in refs.items():
         try:
             obj = object_store[sha]
+        except KeyError:
+            continue
+        else:
             # Dereference tags to get to commits
             while isinstance(obj, Tag):
                 obj = object_store[obj.object[1]]
             if isinstance(obj, Commit):
                 ref_commits.add(obj.id)
-        except KeyError:
-            continue
 
     # Add all ref tips
     selected.update(ref_commits)
@@ -831,7 +849,7 @@ def select_bitmap_commits(
 
 def build_reachability_bitmap(
     commit_sha: bytes,
-    pack_index: "PackIndex",
+    sha_to_pos: dict[bytes, int],
     object_store: "BaseObjectStore",
 ) -> EWAHBitmap:
     """Build a reachability bitmap for a commit.
@@ -841,7 +859,7 @@ def build_reachability_bitmap(
 
     Args:
         commit_sha: The commit to build a bitmap for
-        pack_index: Pack index to get object positions
+        sha_to_pos: Pre-built mapping from SHA to position in pack
         object_store: Object store to traverse objects
 
     Returns:
@@ -849,11 +867,6 @@ def build_reachability_bitmap(
     """
     bitmap = EWAHBitmap()
 
-    # Create mapping from SHA to position in pack index
-    sha_to_pos = {}
-    for pos, (sha, _offset, _crc32) in enumerate(pack_index.iterentries()):
-        sha_to_pos[sha] = pos
-
     # Traverse all objects reachable from the commit
     seen = set()
     queue = deque([commit_sha])
@@ -928,7 +941,7 @@ def apply_xor_compression(
 
 
 def build_type_bitmaps(
-    pack_index: "PackIndex",
+    sha_to_pos: dict[bytes, int],
     object_store: "BaseObjectStore",
 ) -> tuple[EWAHBitmap, EWAHBitmap, EWAHBitmap, EWAHBitmap]:
     """Build type bitmaps for all objects in a pack.
@@ -936,7 +949,7 @@ def build_type_bitmaps(
     Type bitmaps classify objects by type: commit, tree, blob, or tag.
 
     Args:
-        pack_index: Pack index to iterate objects
+        sha_to_pos: Pre-built mapping from SHA to position in pack
         object_store: Object store to read object types
 
     Returns:
@@ -947,7 +960,7 @@ def build_type_bitmaps(
     blob_bitmap = EWAHBitmap()
     tag_bitmap = EWAHBitmap()
 
-    for pos, (sha, _offset, _crc32) in enumerate(pack_index.iterentries()):
+    for sha, pos in sha_to_pos.items():
         try:
             obj = object_store[sha]
         except KeyError:
@@ -968,7 +981,7 @@ def build_type_bitmaps(
 
 
 def build_name_hash_cache(
-    pack_index: "PackIndex",
+    sha_to_pos: dict[bytes, int],
     object_store: "BaseObjectStore",
 ) -> list[int]:
     """Build name-hash cache for all objects in a pack.
@@ -977,22 +990,27 @@ def build_name_hash_cache(
     which can speed up path-based operations.
 
     Args:
-        pack_index: Pack index to iterate objects
+        sha_to_pos: Pre-built mapping from SHA to position in pack
         object_store: Object store to read objects
 
     Returns:
         List of 32-bit hash values, one per object in the pack
     """
-    name_hashes = []
+    # Pre-allocate list with correct size
+    num_objects = len(sha_to_pos)
+    name_hashes = [0] * num_objects
 
-    for _pos, (sha, _offset, _crc32) in enumerate(pack_index.iterentries()):
+    for sha, pos in sha_to_pos.items():
         try:
             obj = object_store[sha]
-
+        except KeyError:
+            # Object not in store, use zero hash
+            pass
+        else:
             # For tree entries, use the tree entry name
             # For commits, use the tree SHA
             # For other objects, use the object SHA
-            if hasattr(obj, "items"):
+            if isinstance(obj, Tree):
                 # Tree object - use the SHA as the name
                 name_hash = _compute_name_hash(sha)
             elif isinstance(obj, Commit):
@@ -1002,10 +1020,7 @@ def build_name_hash_cache(
                 # Other objects - use the SHA as the name
                 name_hash = _compute_name_hash(sha)
 
-            name_hashes.append(name_hash)
-        except KeyError:
-            # Object not in store, use zero hash
-            name_hashes.append(0)
+            name_hashes[pos] = name_hash
 
     return name_hashes
 
@@ -1035,6 +1050,15 @@ def generate_bitmap(
     Returns:
         Complete PackBitmap ready to write to disk
     """
+    if progress:
+        progress("Building pack index mapping")
+
+    # Build mapping from SHA to position in pack index ONCE
+    # This is used by all subsequent operations and avoids repeated enumeration
+    sha_to_pos = {}
+    for pos, (sha, _offset, _crc32) in enumerate(pack_index.iterentries()):
+        sha_to_pos[sha] = pos
+
     if progress:
         progress("Selecting commits for bitmap")
 
@@ -1050,7 +1074,7 @@ def generate_bitmap(
         if progress and i % 10 == 0:
             progress(f"Building bitmap {i + 1}/{len(selected_commits)}")
 
-        bitmap = build_reachability_bitmap(commit_sha, pack_index, object_store)
+        bitmap = build_reachability_bitmap(commit_sha, sha_to_pos, object_store)
         commit_bitmaps.append((commit_sha, bitmap))
 
     if progress:
@@ -1062,9 +1086,9 @@ def generate_bitmap(
     if progress:
         progress("Building type bitmaps")
 
-    # Build type bitmaps
+    # Build type bitmaps (using pre-built sha_to_pos mapping)
     commit_type_bitmap, tree_type_bitmap, blob_type_bitmap, tag_type_bitmap = (
-        build_type_bitmaps(pack_index, object_store)
+        build_type_bitmaps(sha_to_pos, object_store)
     )
 
     # Create PackBitmap
@@ -1081,11 +1105,6 @@ def generate_bitmap(
     pack_bitmap.blob_bitmap = blob_type_bitmap
     pack_bitmap.tag_bitmap = tag_type_bitmap
 
-    # Create mapping from SHA to position in pack index
-    sha_to_pos = {}
-    for pos, (sha, _offset, _crc32) in enumerate(pack_index.iterentries()):
-        sha_to_pos[sha] = pos
-
     # Add bitmap entries
     for commit_sha, xor_bitmap, xor_offset in compressed_bitmaps:
         if commit_sha not in sha_to_pos:
@@ -1100,11 +1119,11 @@ def generate_bitmap(
         pack_bitmap.entries[commit_sha] = entry
         pack_bitmap.entries_list.append((commit_sha, entry))
 
-    # Build optional name-hash cache
+    # Build optional name-hash cache (using pre-built sha_to_pos mapping)
     if include_hash_cache:
         if progress:
             progress("Building name-hash cache")
-        pack_bitmap.name_hash_cache = build_name_hash_cache(pack_index, object_store)
+        pack_bitmap.name_hash_cache = build_name_hash_cache(sha_to_pos, object_store)
 
     # Build optional lookup table
     if include_lookup_table:
@@ -1140,27 +1159,23 @@ def find_commit_bitmaps(
         if not remaining:
             break
 
-        try:
-            pack_bitmap = pack.bitmap
-            if not pack_bitmap:
-                continue
-
-            # Build SHA to position mapping for this pack
-            sha_to_pos = {}
-            for pos, (sha, _offset, _crc32) in enumerate(pack.index.iterentries()):
-                sha_to_pos[sha] = pos
-
-            # Check which commits have bitmaps
-            for commit_sha in list(remaining):
-                if pack_bitmap.has_commit(commit_sha):
-                    if commit_sha in sha_to_pos:
-                        result[commit_sha] = (pack, pack_bitmap, sha_to_pos)
-                        remaining.remove(commit_sha)
-
-        except (FileNotFoundError, ValueError, AttributeError):
-            # No bitmap or corrupt, skip this pack
+        pack_bitmap = pack.bitmap
+        if not pack_bitmap:
+            # No bitmap for this pack
             continue
 
+        # Build SHA to position mapping for this pack
+        sha_to_pos = {}
+        for pos, (sha, _offset, _crc32) in enumerate(pack.index.iterentries()):
+            sha_to_pos[sha] = pos
+
+        # Check which commits have bitmaps
+        for commit_sha in list(remaining):
+            if pack_bitmap.has_commit(commit_sha):
+                if commit_sha in sha_to_pos:
+                    result[commit_sha] = (pack, pack_bitmap, sha_to_pos)
+                    remaining.remove(commit_sha)
+
     return result
 
 

+ 73 - 110
dulwich/object_store.py

@@ -3255,50 +3255,39 @@ class BitmapReachability:
         # Fallback to graph traversal for operations not yet optimized
         self._fallback = GraphTraversalReachability(object_store)
 
-    def get_reachable_commits(
+    def _combine_commit_bitmaps(
         self,
-        heads: Iterable[bytes],
-        exclude: Iterable[bytes] | None = None,
-        shallow: Set[bytes] | None = None,
-    ) -> set[bytes]:
-        """Get all commits reachable from heads using bitmaps where possible.
+        commit_shas: set[bytes],
+        exclude_shas: set[bytes] | None = None,
+    ) -> tuple | None:
+        """Combine bitmaps for multiple commits using OR, with optional exclusion.
 
         Args:
-          heads: Starting commit SHAs
-          exclude: Commit SHAs to exclude (and their ancestors)
-          shallow: Set of shallow commit boundaries
+          commit_shas: Set of commit SHAs to combine
+          exclude_shas: Optional set of commit SHAs to exclude
 
         Returns:
-          Set of commit SHAs reachable from heads but not from exclude
+          Tuple of (combined_bitmap, pack) or None if bitmaps unavailable
         """
-        from .bitmap import bitmap_to_object_shas, find_commit_bitmaps
-
-        heads_set = set(heads)
-        exclude_set = set(exclude) if exclude else set()
-
-        # If shallow is specified, fall back to graph traversal
-        # (bitmap don't support shallow boundaries well)
-        if shallow:
-            return self._fallback.get_reachable_commits(heads, exclude, shallow)
+        from .bitmap import find_commit_bitmaps
 
-        # Try to find bitmaps for the heads
-        head_bitmaps = find_commit_bitmaps(heads_set, self.store.packs)
+        # Find bitmaps for the commits
+        commit_bitmaps = find_commit_bitmaps(commit_shas, self.store.packs)
 
-        # If we can't find bitmaps for all heads, fall back
-        if len(head_bitmaps) < len(heads_set):
-            return self._fallback.get_reachable_commits(heads, exclude, shallow)
+        # If we can't find bitmaps for all commits, return None
+        if len(commit_bitmaps) < len(commit_shas):
+            return None
 
-        # Combine bitmaps for all heads using OR
+        # Combine bitmaps using OR
         combined_bitmap = None
         result_pack = None
 
-        for commit_sha in heads_set:
-            pack, pack_bitmap, _sha_to_pos = head_bitmaps[commit_sha]
+        for commit_sha in commit_shas:
+            pack, pack_bitmap, _sha_to_pos = commit_bitmaps[commit_sha]
             commit_bitmap = pack_bitmap.get_bitmap(commit_sha)
 
             if commit_bitmap is None:
-                # Bitmap not found, fall back
-                return self._fallback.get_reachable_commits(heads, exclude, shallow)
+                return None
 
             if combined_bitmap is None:
                 combined_bitmap = commit_bitmap
@@ -3307,19 +3296,19 @@ class BitmapReachability:
                 # Same pack, can OR directly
                 combined_bitmap = combined_bitmap | commit_bitmap
             else:
-                # Different packs, fall back to traversal
-                return self._fallback.get_reachable_commits(heads, exclude, shallow)
+                # Different packs, can't combine
+                return None
 
         # Handle exclusions if provided
-        if exclude_set and result_pack:
-            exclude_bitmaps = find_commit_bitmaps(exclude_set, [result_pack])
+        if exclude_shas and result_pack and combined_bitmap:
+            exclude_bitmaps = find_commit_bitmaps(exclude_shas, [result_pack])
 
-            if len(exclude_bitmaps) == len(exclude_set):
+            if len(exclude_bitmaps) == len(exclude_shas):
                 # All excludes have bitmaps, compute exclusion
                 exclude_combined = None
 
-                for commit_sha in exclude_set:
-                    pack, pack_bitmap, _sha_to_pos = exclude_bitmaps[commit_sha]
+                for commit_sha in exclude_shas:
+                    _pack, pack_bitmap, _sha_to_pos = exclude_bitmaps[commit_sha]
                     exclude_bitmap = pack_bitmap.get_bitmap(commit_sha)
 
                     if exclude_bitmap is None:
@@ -3330,24 +3319,50 @@ class BitmapReachability:
                     else:
                         exclude_combined = exclude_combined | exclude_bitmap
 
-                # Subtract excludes: combined & ~exclude
+                # Subtract excludes using set difference
                 if exclude_combined:
-                    # Create a bitmap with all bits set in exclude_combined inverted
-                    # Then AND with combined_bitmap
-                    combined_bitmap = combined_bitmap & (
-                        combined_bitmap ^ exclude_combined
-                    )
+                    combined_bitmap = combined_bitmap - exclude_combined
 
-        # Convert bitmap to commit SHAs
-        if combined_bitmap and result_pack:
-            # Filter for commits only using the commit type bitmap
-            commit_type_filter = result_pack.bitmap.commit_bitmap
-            return bitmap_to_object_shas(
-                combined_bitmap, result_pack.index, commit_type_filter
-            )
+        return (combined_bitmap, result_pack) if combined_bitmap else None
+
+    def get_reachable_commits(
+        self,
+        heads: Iterable[bytes],
+        exclude: Iterable[bytes] | None = None,
+        shallow: Set[bytes] | None = None,
+    ) -> set[bytes]:
+        """Get all commits reachable from heads using bitmaps where possible.
+
+        Args:
+          heads: Starting commit SHAs
+          exclude: Commit SHAs to exclude (and their ancestors)
+          shallow: Set of shallow commit boundaries
+
+        Returns:
+          Set of commit SHAs reachable from heads but not from exclude
+        """
+        from .bitmap import bitmap_to_object_shas
+
+        # If shallow is specified, fall back to graph traversal
+        # (bitmaps don't support shallow boundaries well)
+        if shallow:
+            return self._fallback.get_reachable_commits(heads, exclude, shallow)
+
+        heads_set = set(heads)
+        exclude_set = set(exclude) if exclude else None
 
-        # Fallback if anything went wrong
-        return self._fallback.get_reachable_commits(heads, exclude, shallow)
+        # Try to combine bitmaps
+        result = self._combine_commit_bitmaps(heads_set, exclude_set)
+        if result is None:
+            return self._fallback.get_reachable_commits(heads, exclude, shallow)
+
+        combined_bitmap, result_pack = result
+
+        # Convert bitmap to commit SHAs, filtering for commits only
+        commit_type_filter = result_pack.bitmap.commit_bitmap
+        return bitmap_to_object_shas(
+            combined_bitmap, result_pack.index, commit_type_filter
+        )
 
     def get_tree_objects(
         self,
@@ -3378,69 +3393,17 @@ class BitmapReachability:
         Returns:
           Set of all object SHAs (commits, trees, blobs)
         """
-        from .bitmap import bitmap_to_object_shas, find_commit_bitmaps
+        from .bitmap import bitmap_to_object_shas
 
         commits_set = set(commits)
-        exclude_set = set(exclude_commits) if exclude_commits else set()
-
-        # Try to find bitmaps for the commits
-        commit_bitmaps = find_commit_bitmaps(commits_set, self.store.packs)
+        exclude_set = set(exclude_commits) if exclude_commits else None
 
-        # If we can't find bitmaps for all commits, fall back
-        if len(commit_bitmaps) < len(commits_set):
+        # Try to combine bitmaps
+        result = self._combine_commit_bitmaps(commits_set, exclude_set)
+        if result is None:
             return self._fallback.get_reachable_objects(commits, exclude_commits)
 
-        # Combine bitmaps for all commits using OR
-        combined_bitmap = None
-        result_pack = None
-
-        for commit_sha in commits_set:
-            pack, pack_bitmap, _sha_to_pos = commit_bitmaps[commit_sha]
-            commit_bitmap = pack_bitmap.get_bitmap(commit_sha)
-
-            if commit_bitmap is None:
-                # Bitmap not found, fall back
-                return self._fallback.get_reachable_objects(commits, exclude_commits)
-
-            if combined_bitmap is None:
-                combined_bitmap = commit_bitmap
-                result_pack = pack
-            elif pack == result_pack:
-                # Same pack, can OR directly
-                combined_bitmap = combined_bitmap | commit_bitmap
-            else:
-                # Different packs, fall back to traversal
-                return self._fallback.get_reachable_objects(commits, exclude_commits)
-
-        # Handle exclusions if provided
-        if exclude_set and result_pack:
-            exclude_bitmaps = find_commit_bitmaps(exclude_set, [result_pack])
-
-            if len(exclude_bitmaps) == len(exclude_set):
-                # All excludes have bitmaps, compute exclusion
-                exclude_combined = None
-
-                for commit_sha in exclude_set:
-                    pack, pack_bitmap, _sha_to_pos = exclude_bitmaps[commit_sha]
-                    exclude_bitmap = pack_bitmap.get_bitmap(commit_sha)
-
-                    if exclude_bitmap is None:
-                        break
-
-                    if exclude_combined is None:
-                        exclude_combined = exclude_bitmap
-                    else:
-                        exclude_combined = exclude_combined | exclude_bitmap
-
-                # Subtract excludes: combined & ~exclude
-                if exclude_combined:
-                    combined_bitmap = combined_bitmap & (
-                        combined_bitmap ^ exclude_combined
-                    )
+        combined_bitmap, result_pack = result
 
         # Convert bitmap to all object SHAs (no type filter)
-        if combined_bitmap and result_pack:
-            return bitmap_to_object_shas(combined_bitmap, result_pack.index, None)
-
-        # Fallback if anything went wrong
-        return self._fallback.get_reachable_objects(commits, exclude_commits)
+        return bitmap_to_object_shas(combined_bitmap, result_pack.index, None)