Implement pack filtering for blob size-based partial clone

Jelmer Vernooij, 3 weeks ago
Commit 2c9705b1df
3 changed files with 180 additions and 1 deletion
  1. dulwich/partial_clone.py (+54, -0)
  2. dulwich/server.py (+14, -1)
  3. tests/test_partial_clone.py (+112, -0)

dulwich/partial_clone.py (+54, -0)

@@ -40,6 +40,7 @@ __all__ = [
     "SparseOidFilter",
     "CombineFilter",
     "parse_filter_spec",
+    "filter_pack_objects",
 ]
 
 from abc import ABC, abstractmethod
@@ -291,3 +292,56 @@ def parse_filter_spec(spec: str | bytes) -> FilterSpec:
         return CombineFilter(filters)
     else:
         raise ValueError(f"Unknown filter specification: {spec}")
+
+
+def filter_pack_objects(
+    object_store: "BaseObjectStore",
+    object_ids: list["ObjectID"],
+    filter_spec: FilterSpec,
+) -> list["ObjectID"]:
+    """Filter a list of object IDs based on a filter specification.
+
+    This function examines each object and excludes those that don't match
+    the filter criteria (e.g., blobs that are too large, trees beyond max depth).
+
+    Args:
+        object_store: Object store to retrieve objects from
+        object_ids: List of object IDs to filter
+        filter_spec: Filter specification to apply
+
+    Returns:
+        Filtered list of object IDs that should be included in the pack
+
+    Note:
+        This function currently supports blob size filtering. Tree depth filtering
+        requires additional path/depth tracking which is not yet implemented.
+    """
+    from .objects import Blob, Commit, Tag, Tree
+
+    filtered_ids = []
+
+    for oid in object_ids:
+        try:
+            obj = object_store[oid]
+        except KeyError:
+            # Object not found, skip it
+            continue
+
+        # Determine object type and apply appropriate filter
+        if isinstance(obj, Blob):
+            # Check if blob should be included based on size
+            blob_size = len(obj.data)
+            if filter_spec.should_include_blob(blob_size):
+                filtered_ids.append(oid)
+            # else: blob is filtered out
+        elif isinstance(obj, (Tree, Commit, Tag)):
+            # For now, include all trees, commits, and tags
+            # Tree depth filtering would require tracking depth during traversal
+            # which needs to be implemented at the object collection stage
+            if filter_spec.should_include_tree(0):  # depth=0 for now
+                filtered_ids.append(oid)
+        else:
+            # Unknown object type, include it to be safe
+            filtered_ids.append(oid)
+
+    return filtered_ids
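
A minimal usage sketch for the new helper, using only names introduced in this
commit (the 1000-byte limit and blob contents are illustrative):

    from dulwich.object_store import MemoryObjectStore
    from dulwich.objects import Blob
    from dulwich.partial_clone import BlobLimitFilter, filter_pack_objects

    store = MemoryObjectStore()
    small = Blob.from_string(b"README contents")
    large = Blob.from_string(b"x" * 5000)
    store.add_object(small)
    store.add_object(large)

    kept = filter_pack_objects(store, [small.id, large.id], BlobLimitFilter(1000))
    assert kept == [small.id]  # the 5000-byte blob is filtered out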

dulwich/server.py (+14, -1)

@@ -100,7 +100,7 @@ from .errors import (
 from .object_store import MissingObjectFinder, PackBasedObjectStore, find_shallow
 from .objects import Commit, ObjectID, Tree, valid_hexsha
 from .pack import ObjectContainer, write_pack_from_container
-from .partial_clone import FilterSpec, parse_filter_spec
+from .partial_clone import FilterSpec, filter_pack_objects, parse_filter_spec
 from .protocol import (
     CAPABILITIES_REF,
     CAPABILITY_AGENT,
@@ -612,6 +612,19 @@ class UploadPackHandler(PackHandler):
             return
 
         self._start_pack_send_phase()
+
+        # Apply filter if specified (partial clone support)
+        if self.filter_spec is not None:
+            original_count = len(object_ids)
+            object_ids = filter_pack_objects(
+                self.repo.object_store, object_ids, self.filter_spec
+            )
+            filtered_count = original_count - len(object_ids)
+            if filtered_count > 0:
+                self.progress(
+                    (f"filtered {filtered_count} objects.\n").encode("ascii")
+                )
+
         self.progress((f"counting objects: {len(object_ids)}, done.\n").encode("ascii"))
 
         write_pack_from_container(
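
For reference, self.filter_spec is a FilterSpec parsed from the client's
"filter" line. A minimal sketch, assuming parse_filter_spec accepts Git-style
spec strings such as "blob:none" and "blob:limit=<n>" (the exact syntax it
accepts is defined in dulwich/partial_clone.py):

    from dulwich.partial_clone import parse_filter_spec

    # assumed Git-style spec string, e.g. from a "filter blob:limit=1000" line
    spec = parse_filter_spec(b"blob:limit=1000")
    # the resulting FilterSpec can be passed directly to filter_pack_objects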

tests/test_partial_clone.py (+112, -0)

@@ -21,14 +21,18 @@
 
 """Tests for partial clone filter specifications."""
 
+from dulwich.object_store import MemoryObjectStore
+from dulwich.objects import Blob, Tree
 from dulwich.partial_clone import (
     BlobLimitFilter,
     BlobNoneFilter,
     CombineFilter,
     SparseOidFilter,
     TreeDepthFilter,
+    filter_pack_objects,
     parse_filter_spec,
 )
+from dulwich.tests.utils import make_commit
 
 from . import TestCase
 
@@ -307,3 +311,111 @@ class CombineFilterTests(TestCase):
         filters = [BlobNoneFilter()]
         filter_spec = CombineFilter(filters)
         self.assertIn("CombineFilter", repr(filter_spec))
+
+
+class FilterPackObjectsTests(TestCase):
+    """Test filter_pack_objects function."""
+
+    def setUp(self):
+        super().setUp()
+        self.store = MemoryObjectStore()
+
+        # Create test objects
+        self.small_blob = Blob.from_string(b"small")
+        self.large_blob = Blob.from_string(b"x" * 2000)
+        self.tree = Tree()
+        self.commit = make_commit(tree=self.tree.id)
+
+        # Add objects to store
+        self.store.add_object(self.small_blob)
+        self.store.add_object(self.large_blob)
+        self.store.add_object(self.tree)
+        self.store.add_object(self.commit)
+
+    def test_filter_blob_none(self):
+        """Test that blob:none filter excludes all blobs."""
+        object_ids = [
+            self.small_blob.id,
+            self.large_blob.id,
+            self.tree.id,
+            self.commit.id,
+        ]
+
+        filter_spec = BlobNoneFilter()
+        filtered = filter_pack_objects(self.store, object_ids, filter_spec)
+
+        # Should exclude both blobs but keep tree and commit
+        self.assertNotIn(self.small_blob.id, filtered)
+        self.assertNotIn(self.large_blob.id, filtered)
+        self.assertIn(self.tree.id, filtered)
+        self.assertIn(self.commit.id, filtered)
+
+    def test_filter_blob_limit(self):
+        """Test that blob:limit filter excludes blobs over size limit."""
+        object_ids = [
+            self.small_blob.id,
+            self.large_blob.id,
+            self.tree.id,
+        ]
+
+        # Set limit to 100 bytes
+        filter_spec = BlobLimitFilter(100)
+        filtered = filter_pack_objects(self.store, object_ids, filter_spec)
+
+        # Should keep small blob but exclude large blob
+        self.assertIn(self.small_blob.id, filtered)
+        self.assertNotIn(self.large_blob.id, filtered)
+        self.assertIn(self.tree.id, filtered)
+
+    def test_permissive_filter_keeps_all(self):
+        """Test that a permissive filter keeps all objects."""
+        # A limit far above every blob's size includes everything
+        filter_spec = BlobLimitFilter(10000)
+
+        object_ids = [
+            self.small_blob.id,
+            self.large_blob.id,
+            self.tree.id,
+            self.commit.id,
+        ]
+
+        filtered = filter_pack_objects(self.store, object_ids, filter_spec)
+
+        # All objects should be included
+        self.assertEqual(len(filtered), len(object_ids))
+        for oid in object_ids:
+            self.assertIn(oid, filtered)
+
+    def test_filter_missing_object(self):
+        """Test that missing objects are skipped without error."""
+        from dulwich.objects import ObjectID
+
+        fake_id = ObjectID(b"0" * 40)
+        object_ids = [fake_id, self.small_blob.id]
+
+        filter_spec = BlobLimitFilter(10000)  # permissive: keeps the real blob
+        filtered = filter_pack_objects(self.store, object_ids, filter_spec)
+
+        # Should skip the missing object but keep the present blob
+        self.assertEqual([self.small_blob.id], filtered)
+
+    def test_filter_combine(self):
+        """Test combined filters."""
+        object_ids = [
+            self.small_blob.id,
+            self.large_blob.id,
+            self.tree.id,
+        ]
+
+        # Combine blob:limit with another filter
+        filter_spec = CombineFilter([
+            BlobLimitFilter(100),
+            BlobNoneFilter(),  # This will exclude ALL blobs
+        ])
+
+        filtered = filter_pack_objects(self.store, object_ids, filter_spec)
+
+        # Should exclude all blobs due to BlobNoneFilter
+        self.assertNotIn(self.small_blob.id, filtered)
+        self.assertNotIn(self.large_blob.id, filtered)
+        self.assertIn(self.tree.id, filtered)
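
The last test pins down CombineFilter's semantics: an object is kept only if
every sub-filter keeps it (an intersection). A standalone sketch of the same
behaviour (blob contents here are illustrative):

    from dulwich.object_store import MemoryObjectStore
    from dulwich.objects import Blob
    from dulwich.partial_clone import (
        BlobLimitFilter,
        BlobNoneFilter,
        CombineFilter,
        filter_pack_objects,
    )

    store = MemoryObjectStore()
    blob = Blob.from_string(b"tiny")
    store.add_object(blob)

    combined = CombineFilter([BlobLimitFilter(100), BlobNoneFilter()])
    # BlobLimitFilter(100) would keep the 4-byte blob, but BlobNoneFilter
    # vetoes every blob, so the combined filter drops it as well
    assert filter_pack_objects(store, [blob.id], combined) == []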