2
0
Эх сурвалжийг харах

Add high-level API for generating pack bitmaps

Jelmer Vernooij 2 сар өмнө
parent
commit
023c55cade

+ 43 - 31
dulwich/bitmap.py

@@ -958,27 +958,32 @@ def build_type_bitmaps(
     Returns:
         Tuple of (commit_bitmap, tree_bitmap, blob_bitmap, tag_bitmap)
     """
+    from .objects import sha_to_hex
+
     commit_bitmap = EWAHBitmap()
     tree_bitmap = EWAHBitmap()
     blob_bitmap = EWAHBitmap()
     tag_bitmap = EWAHBitmap()
 
     for sha, pos in sha_to_pos.items():
+        # Pack index returns binary SHA (20 bytes), but object_store expects hex SHA (40 bytes)
+        hex_sha = sha_to_hex(sha) if len(sha) == 20 else sha
         try:
-            obj = object_store[sha]
+            obj = object_store[hex_sha]
         except KeyError:
-            pass
-        else:
-            obj_type = obj.type_num
+            # Object not in store, skip it
+            continue
+
+        obj_type = obj.type_num
 
-            if obj_type == Commit.type_num:
-                commit_bitmap.add(pos)
-            elif obj_type == Tree.type_num:
-                tree_bitmap.add(pos)
-            elif obj_type == Blob.type_num:
-                blob_bitmap.add(pos)
-            elif obj_type == Tag.type_num:
-                tag_bitmap.add(pos)
+        if obj_type == Commit.type_num:
+            commit_bitmap.add(pos)
+        elif obj_type == Tree.type_num:
+            tree_bitmap.add(pos)
+        elif obj_type == Blob.type_num:
+            blob_bitmap.add(pos)
+        elif obj_type == Tag.type_num:
+            tag_bitmap.add(pos)
 
     return commit_bitmap, tree_bitmap, blob_bitmap, tag_bitmap
 
@@ -999,31 +1004,35 @@ def build_name_hash_cache(
     Returns:
         List of 32-bit hash values, one per object in the pack
     """
+    from .objects import sha_to_hex
+
     # Pre-allocate list with correct size
     num_objects = len(sha_to_pos)
     name_hashes = [0] * num_objects
 
     for sha, pos in sha_to_pos.items():
+        # Pack index returns binary SHA (20 bytes), but object_store expects hex SHA (40 bytes)
+        hex_sha = sha_to_hex(sha) if len(sha) == 20 else sha
         try:
-            obj = object_store[sha]
+            obj = object_store[hex_sha]
         except KeyError:
             # Object not in store, use zero hash
-            pass
+            continue
+
+        # For tree entries, use the tree entry name
+        # For commits, use the tree SHA
+        # For other objects, use the object SHA
+        if isinstance(obj, Tree):
+            # Tree object - use the SHA as the name
+            name_hash = _compute_name_hash(sha)
+        elif isinstance(obj, Commit):
+            # Commit - use the tree SHA as the name
+            name_hash = _compute_name_hash(obj.tree)
         else:
-            # For tree entries, use the tree entry name
-            # For commits, use the tree SHA
-            # For other objects, use the object SHA
-            if isinstance(obj, Tree):
-                # Tree object - use the SHA as the name
-                name_hash = _compute_name_hash(sha)
-            elif isinstance(obj, Commit):
-                # Commit - use the tree SHA as the name
-                name_hash = _compute_name_hash(obj.tree)
-            else:
-                # Other objects - use the SHA as the name
-                name_hash = _compute_name_hash(sha)
-
-            name_hashes[pos] = name_hash
+            # Other objects - use the SHA as the name
+            name_hash = _compute_name_hash(sha)
+
+        name_hashes[pos] = name_hash
 
     return name_hashes
 
@@ -1035,7 +1044,7 @@ def generate_bitmap(
     pack_checksum: bytes,
     include_hash_cache: bool = True,
     include_lookup_table: bool = True,
-    commit_interval: int = DEFAULT_COMMIT_INTERVAL,
+    commit_interval: int | None = None,
     progress: Callable[[str], None] | None = None,
 ) -> PackBitmap:
     """Generate a complete bitmap for a pack.
@@ -1047,12 +1056,15 @@ def generate_bitmap(
         pack_checksum: SHA-1 checksum of the pack file
         include_hash_cache: Whether to include name-hash cache
         include_lookup_table: Whether to include lookup table
-        commit_interval: Include every Nth commit in history
+        commit_interval: Include every Nth commit in history (None for default)
         progress: Optional progress reporting callback
 
     Returns:
         Complete PackBitmap ready to write to disk
     """
+    if commit_interval is None:
+        commit_interval = DEFAULT_COMMIT_INTERVAL
+
     if progress:
         progress("Building pack index mapping")
 
@@ -1144,7 +1156,7 @@ def generate_bitmap(
 
 
 def find_commit_bitmaps(
-    commit_shas: set[bytes], packs: Iterable[Pack]
+    commit_shas: set[bytes], packs: Iterable["Pack"]
 ) -> dict[bytes, tuple["Pack", "PackBitmap", dict[bytes, int]]]:
     """Find which packs have bitmaps for the given commits.
 

+ 40 - 4
dulwich/object_store.py

@@ -825,10 +825,14 @@ class PackBasedObjectStore(PackCapableObjectStore, PackedObjectContainer):
             # Check if any packs have bitmaps
             has_bitmap = False
             for pack in self.packs:
-                # Try to access bitmap property
-                if pack.bitmap is not None:
-                    has_bitmap = True
-                    break
+                try:
+                    # Try to access bitmap property
+                    if pack.bitmap is not None:
+                        has_bitmap = True
+                        break
+                except FileNotFoundError:
+                    # Bitmap file doesn't exist for this pack
+                    continue
 
             if has_bitmap:
                 return BitmapReachability(self)
@@ -1075,6 +1079,38 @@ class PackBasedObjectStore(PackCapableObjectStore, PackedObjectContainer):
         self._update_pack_cache()
         return len(objects)
 
+    def generate_pack_bitmaps(
+        self,
+        refs: dict[bytes, bytes],
+        *,
+        commit_interval: int | None = None,
+        progress: Callable[[str], None] | None = None,
+    ) -> int:
+        """Ensure every pack in this store has a bitmap index.
+
+        Calls ensure_bitmap() on each pack. Intended to enable fast reachability
+        queries; equivalent to the bitmap generation part of 'git repack -b'.
+
+        Args:
+          refs: Dictionary of ref names to commit SHAs
+          commit_interval: Include every Nth commit in bitmap index (None for default)
+          progress: Optional progress reporting callback
+
+        Returns:
+          Number of packs processed, including packs whose bitmap already existed
+        """
+        count = 0
+        for pack in self.packs:
+            pack.ensure_bitmap(
+                self, refs, commit_interval=commit_interval, progress=progress
+            )
+            count += 1  # counted for every pack, not only newly written bitmaps
+
+        # Refresh the pack cache after the ensure_bitmap() calls
+        self._update_pack_cache()
+
+        return count
+
     def __iter__(self) -> Iterator[bytes]:
         """Iterate over the SHAs that are present in this store."""
         self._update_pack_cache()

+ 53 - 0
dulwich/pack.py

@@ -83,6 +83,7 @@ if TYPE_CHECKING:
 
     from .bitmap import PackBitmap
     from .commit_graph import CommitGraph
+    from .object_store import BaseObjectStore
 
 # For some reason the above try, except fails to set has_mmap = False for plan9
 if sys.platform == "Plan9":
@@ -3538,6 +3539,58 @@ class Pack:
             self._bitmap = read_bitmap(self._bitmap_path, pack_index=self.index)
         return self._bitmap
 
+    def ensure_bitmap(
+        self,
+        object_store: "BaseObjectStore",
+        refs: dict[bytes, bytes],
+        commit_interval: int | None = None,
+        progress: Callable[[str], None] | None = None,
+    ) -> "PackBitmap":
+        """Return this pack's bitmap, generating and writing one if it is missing.
+
+        Args:
+          object_store: Object store passed to generate_bitmap() to read objects
+          refs: Dictionary of ref names to commit SHAs
+          commit_interval: Include every Nth commit in bitmap index (None for default)
+          progress: Optional progress reporting callback
+
+        Returns:
+          PackBitmap instance (the existing one, or a new one that was also written)
+        """
+        from .bitmap import generate_bitmap, write_bitmap
+
+        # Reuse the bitmap if one can already be loaded for this pack
+        try:
+            existing = self.bitmap
+            if existing is not None:
+                return existing
+        except FileNotFoundError:
+            pass  # Treated as "no bitmap yet"; fall through and generate one
+
+        # Generate new bitmap
+        if progress:
+            progress(f"Generating bitmap for {self.name().decode('utf-8')}...\n")
+
+        pack_bitmap = generate_bitmap(
+            self.index,
+            object_store,
+            refs,
+            self.get_stored_checksum(),
+            commit_interval=commit_interval,
+            progress=progress,
+        )
+
+        # Persist the bitmap at this pack's bitmap path
+        write_bitmap(self._bitmap_path, pack_bitmap)
+
+        if progress:
+            progress(f"Wrote {self._bitmap_path}\n")
+
+        # Cache the freshly generated bitmap on the instance
+        self._bitmap = pack_bitmap
+
+        return pack_bitmap
+
     def close(self) -> None:
         """Close the pack file and index."""
         if self._data is not None:

+ 171 - 10
tests/test_bitmap.py

@@ -22,6 +22,7 @@
 """Tests for bitmap support."""
 
 import os
+import shutil
 import tempfile
 import unittest
 from io import BytesIO
@@ -1078,24 +1079,18 @@ class ReachabilityProviderTests(unittest.TestCase):
 
     def test_get_reachability_provider_without_bitmaps(self):
         """Test get_reachability_provider returns GraphTraversalReachability when no bitmaps."""
-        from dulwich.object_store import (
-            GraphTraversalReachability,
-            get_reachability_provider,
-        )
+        from dulwich.object_store import GraphTraversalReachability
 
-        provider = get_reachability_provider(self.store)
+        provider = self.store.get_reachability_provider()
 
         # Should return GraphTraversalReachability when no bitmaps available
         self.assertIsInstance(provider, GraphTraversalReachability)
 
     def test_get_reachability_provider_prefer_bitmaps_false(self):
         """Test get_reachability_provider with prefer_bitmaps=False."""
-        from dulwich.object_store import (
-            GraphTraversalReachability,
-            get_reachability_provider,
-        )
+        from dulwich.object_store import GraphTraversalReachability
 
-        provider = get_reachability_provider(self.store, prefer_bitmaps=False)
+        provider = self.store.get_reachability_provider(prefer_bitmaps=False)
 
         # Should return GraphTraversalReachability when prefer_bitmaps=False
         self.assertIsInstance(provider, GraphTraversalReachability)
@@ -1162,3 +1157,169 @@ class ReachabilityProviderTests(unittest.TestCase):
             [self.commit3.id], exclude_commits=None
         )
         self.assertEqual(graph_objects, bitmap_objects)
+
+
+class PackEnsureBitmapTests(unittest.TestCase):
+    """Tests for Pack.ensure_bitmap() method."""
+
+    def setUp(self):
+        """Create a temporary store holding one blob, tree and commit, then repack."""
+        from dulwich.object_store import DiskObjectStore
+        from dulwich.objects import Blob, Commit, Tree
+
+        self.temp_dir = tempfile.mkdtemp()
+        self.addCleanup(shutil.rmtree, self.temp_dir)
+
+        # Create pack directory
+        os.makedirs(os.path.join(self.temp_dir, "pack"))
+
+        self.store = DiskObjectStore(self.temp_dir)
+
+        # Create test objects: one blob, a tree referencing it, and a commit
+        self.blob = Blob.from_string(b"test content")
+        self.store.add_object(self.blob)
+
+        self.tree = Tree()
+        self.tree.add(b"file.txt", 0o100644, self.blob.id)
+        self.store.add_object(self.tree)
+
+        self.commit = Commit()
+        self.commit.tree = self.tree.id
+        self.commit.author = self.commit.committer = b"Test <test@example.com>"
+        self.commit.author_time = self.commit.commit_time = 1234567890
+        self.commit.author_timezone = self.commit.commit_timezone = 0
+        self.commit.message = b"Test commit"
+        self.store.add_object(self.commit)
+
+        # Repack to create a pack, then keep the first pack for the tests
+        self.store.repack()
+        self.pack = self.store.packs[0]
+
+    def test_ensure_bitmap_creates_bitmap(self):
+        """Test that ensure_bitmap creates a bitmap file."""
+        # Initially no bitmap
+        self.assertFalse(os.path.exists(self.pack._bitmap_path))
+
+        # Ensure bitmap with commit_interval=1 to ensure our single commit is selected
+        refs = {b"refs/heads/master": self.commit.id}
+        bitmap = self.pack.ensure_bitmap(self.store, refs, commit_interval=1)
+
+        # Bitmap should now exist
+        self.assertIsNotNone(bitmap)
+        self.assertTrue(os.path.exists(self.pack._bitmap_path))
+        # Verify it's a PackBitmap instance
+        from dulwich.bitmap import PackBitmap
+
+        self.assertIsInstance(bitmap, PackBitmap)
+
+    def test_ensure_bitmap_returns_existing(self):
+        """Test that ensure_bitmap returns existing bitmap without regenerating."""
+        refs = {b"refs/heads/master": self.commit.id}
+
+        # Create bitmap with commit_interval=1
+        self.pack.ensure_bitmap(self.store, refs, commit_interval=1)
+        mtime1 = os.path.getmtime(self.pack._bitmap_path)
+
+        # Ensure again - should return existing
+        import time
+
+        time.sleep(0.01)  # Let mtime advance if rewritten; TODO confirm 10ms suffices
+        self.pack.ensure_bitmap(self.store, refs, commit_interval=1)
+        mtime2 = os.path.getmtime(self.pack._bitmap_path)
+
+        # File should not have been regenerated
+        self.assertEqual(mtime1, mtime2)
+
+    def test_ensure_bitmap_with_custom_interval(self):
+        """Test that ensure_bitmap returns a bitmap when commit_interval=50."""
+        refs = {b"refs/heads/master": self.commit.id}
+        bitmap = self.pack.ensure_bitmap(self.store, refs, commit_interval=50)
+        self.assertIsNotNone(bitmap)
+
+
+class GeneratePackBitmapsTests(unittest.TestCase):
+    """Tests for PackBasedObjectStore.generate_pack_bitmaps()."""
+
+    def setUp(self):
+        """Create a temporary store with a chain of three commits, then repack."""
+        from dulwich.object_store import DiskObjectStore
+        from dulwich.objects import Blob, Commit, Tree
+
+        self.temp_dir = tempfile.mkdtemp()
+        self.addCleanup(shutil.rmtree, self.temp_dir)
+
+        # Create pack directory
+        os.makedirs(os.path.join(self.temp_dir, "pack"))
+
+        self.store = DiskObjectStore(self.temp_dir)
+
+        # Create three linearly chained commits, each with its own one-file tree
+        self.commits = []
+        for i in range(3):
+            blob = Blob.from_string(f"content {i}".encode())
+            self.store.add_object(blob)
+
+            tree = Tree()
+            tree.add(f"file{i}.txt".encode(), 0o100644, blob.id)
+            self.store.add_object(tree)
+
+            commit = Commit()
+            commit.tree = tree.id
+            if i > 0:
+                commit.parents = [self.commits[-1].id]
+            commit.author = commit.committer = b"Test <test@example.com>"
+            commit.author_time = commit.commit_time = 1234567890 + i
+            commit.author_timezone = commit.commit_timezone = 0
+            commit.message = f"Commit {i}".encode()
+            self.store.add_object(commit)
+            self.commits.append(commit)
+
+        # Repack to create pack
+        self.store.repack()
+
+    def test_generate_pack_bitmaps(self):
+        """Test generating bitmaps for all packs."""
+        refs = {b"refs/heads/master": self.commits[-1].id}
+
+        # Initially no bitmaps
+        for pack in self.store.packs:
+            self.assertFalse(os.path.exists(pack._bitmap_path))
+
+        # Generate bitmaps
+        count = self.store.generate_pack_bitmaps(refs)
+
+        # Every pack is counted and now has a .bitmap file on disk
+        self.assertEqual(count, len(self.store.packs))
+        for pack in self.store.packs:
+            self.assertTrue(os.path.exists(pack._bitmap_path))
+
+    def test_generate_pack_bitmaps_multiple_calls(self):
+        """Test that calling generate_pack_bitmaps multiple times is safe."""
+        refs = {b"refs/heads/master": self.commits[-1].id}
+
+        # Generate once
+        self.store.generate_pack_bitmaps(refs)
+        mtimes1 = [os.path.getmtime(p._bitmap_path) for p in self.store.packs]
+
+        # Generate again
+        import time
+
+        time.sleep(0.01)  # Let mtime advance if a bitmap were rewritten
+        self.store.generate_pack_bitmaps(refs)
+        mtimes2 = [os.path.getmtime(p._bitmap_path) for p in self.store.packs]
+
+        # Should not regenerate existing bitmaps
+        self.assertEqual(mtimes1, mtimes2)
+
+    def test_generate_pack_bitmaps_with_progress(self):
+        """Test generate_pack_bitmaps with progress callback."""
+        refs = {b"refs/heads/master": self.commits[-1].id}
+        messages = []
+
+        def progress(msg):
+            messages.append(msg)
+
+        self.store.generate_pack_bitmaps(refs, progress=progress)
+
+        # At least one progress message should have been reported
+        self.assertGreater(len(messages), 0)