
Add partial clone support (#2044)

Jelmer Vernooij 2 weeks ago
parent
commit
68191f6606

+ 614 - 0
dulwich/object_filters.py

@@ -0,0 +1,614 @@
+# object_filters.py -- Object filtering for partial clone and similar operations
+# Copyright (C) 2024 Jelmer Vernooij <jelmer@jelmer.uk>
+#
+# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
+# General Public License as published by the Free Software Foundation; version 2.0
+# or (at your option) any later version. You can redistribute it and/or
+# modify it under the terms of either of these two licenses.
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# You should have received a copy of the licenses; if not, see
+# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
+# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
+# License, Version 2.0.
+#
+
+"""Object filtering for Git partial clone and pack generation.
+
+This module implements Git's object filter specifications for partial clone,
+as documented in:
+https://git-scm.com/docs/rev-list-options#Documentation/rev-list-options.txt---filterltfilter-specgt
+
+Filter specifications control which objects are included when generating packs,
+enabling partial clone (downloading only needed objects) and similar operations.
+
+Supported filter specs:
+- blob:none - Exclude all blobs
+- blob:limit=<n>[kmg] - Exclude blobs larger than n bytes/KB/MB/GB
+- tree:<depth> - Exclude trees beyond specified depth
+- sparse:oid=<oid> - Use sparse specification from object
+- combine:<filter>+<filter>+... - Combine multiple filters
+"""
+
+__all__ = [
+    "BlobLimitFilter",
+    "BlobNoneFilter",
+    "CombineFilter",
+    "FilterSpec",
+    "SparseOidFilter",
+    "TreeDepthFilter",
+    "filter_pack_objects",
+    "filter_pack_objects_with_paths",
+    "parse_filter_spec",
+]
+
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING
+
+from .objects import S_ISGITLINK, Blob, Commit, ObjectID, Tag, Tree, valid_hexsha
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+    from .object_store import BaseObjectStore
+    from .objects import ObjectID
+
+
+class FilterSpec(ABC):
+    """Base class for all filter specifications."""
+
+    @abstractmethod
+    def should_include_blob(self, blob_size: int) -> bool:
+        """Determine if a blob of given size should be included.
+
+        Args:
+            blob_size: Size of the blob in bytes
+
+        Returns:
+            True if the blob should be included, False otherwise
+        """
+        ...
+
+    @abstractmethod
+    def should_include_tree(self, depth: int) -> bool:
+        """Determine if a tree at given depth should be included.
+
+        Args:
+            depth: Depth of the tree (0 = root)
+
+        Returns:
+            True if the tree should be included, False otherwise
+        """
+        ...
+
+    @abstractmethod
+    def to_spec_string(self) -> str:
+        """Convert filter spec back to string format.
+
+        Returns:
+            Filter specification string (e.g., ``blob:none``, ``blob:limit=1m``)
+        """
+        ...
+
+
+class BlobNoneFilter(FilterSpec):
+    """Filter that excludes all blobs."""
+
+    def should_include_blob(self, blob_size: int) -> bool:
+        """Exclude all blobs."""
+        return False
+
+    def should_include_tree(self, depth: int) -> bool:
+        """Include all trees."""
+        return True
+
+    def to_spec_string(self) -> str:
+        """Return 'blob:none'."""
+        return "blob:none"
+
+    def __repr__(self) -> str:
+        """Return string representation of the filter."""
+        return "BlobNoneFilter()"
+
+
+class BlobLimitFilter(FilterSpec):
+    """Filter that excludes blobs larger than a specified size."""
+
+    def __init__(self, limit: int) -> None:
+        """Initialize blob limit filter.
+
+        Args:
+            limit: Maximum blob size in bytes
+        """
+        self.limit = limit
+
+    def should_include_blob(self, blob_size: int) -> bool:
+        """Include only blobs smaller than or equal to the limit."""
+        return blob_size <= self.limit
+
+    def should_include_tree(self, depth: int) -> bool:
+        """Include all trees."""
+        return True
+
+    def to_spec_string(self) -> str:
+        """Return 'blob:limit=<size>' with appropriate unit."""
+        size = self.limit
+        if size >= 1024 * 1024 * 1024 and size % (1024 * 1024 * 1024) == 0:
+            return f"blob:limit={size // (1024 * 1024 * 1024)}g"
+        elif size >= 1024 * 1024 and size % (1024 * 1024) == 0:
+            return f"blob:limit={size // (1024 * 1024)}m"
+        elif size >= 1024 and size % 1024 == 0:
+            return f"blob:limit={size // 1024}k"
+        else:
+            return f"blob:limit={size}"
+
+    def __repr__(self) -> str:
+        """Return string representation of the filter."""
+        return f"BlobLimitFilter(limit={self.limit})"
+
+
+class TreeDepthFilter(FilterSpec):
+    """Filter that excludes trees beyond a specified depth."""
+
+    def __init__(self, max_depth: int) -> None:
+        """Initialize tree depth filter.
+
+        Args:
+            max_depth: Maximum tree depth (0 = only root tree)
+        """
+        self.max_depth = max_depth
+
+    def should_include_blob(self, blob_size: int) -> bool:
+        """Include all blobs."""
+        return True
+
+    def should_include_tree(self, depth: int) -> bool:
+        """Include only trees up to max_depth."""
+        return depth <= self.max_depth
+
+    def to_spec_string(self) -> str:
+        """Return 'tree:<depth>'."""
+        return f"tree:{self.max_depth}"
+
+    def __repr__(self) -> str:
+        """Return string representation of the filter."""
+        return f"TreeDepthFilter(max_depth={self.max_depth})"
+
+
+class SparseOidFilter(FilterSpec):
+    """Filter that uses a sparse specification from an object.
+
+    This filter reads sparse-checkout patterns from a blob object and uses them
+    to determine which paths should be included in the partial clone.
+    """
+
+    def __init__(
+        self, oid: "ObjectID", object_store: "BaseObjectStore | None" = None
+    ) -> None:
+        """Initialize sparse OID filter.
+
+        Args:
+            oid: Object ID of the sparse specification blob
+            object_store: Optional object store to load the sparse patterns from
+        """
+        self.oid = oid
+        self._patterns: list[tuple[str, bool, bool, bool]] | None = None
+        self._object_store = object_store
+
+    def _load_patterns(self) -> None:
+        """Load and parse sparse patterns from the blob."""
+        if self._patterns is not None:
+            return
+
+        if self._object_store is None:
+            raise ValueError("Cannot load sparse patterns without an object store")
+
+        from .sparse_patterns import parse_sparse_patterns
+
+        try:
+            obj = self._object_store[self.oid]
+        except KeyError:
+            raise ValueError(
+                f"Sparse specification blob {self.oid.decode('ascii') if isinstance(self.oid, bytes) else self.oid} not found"
+            )
+
+        if not isinstance(obj, Blob):
+            raise ValueError(
+                f"Sparse specification {self.oid.decode('ascii') if isinstance(self.oid, bytes) else self.oid} is not a blob"
+            )
+
+        # Parse the blob content as sparse patterns
+        lines = obj.data.decode("utf-8").splitlines()
+        self._patterns = parse_sparse_patterns(lines)
+
+    def should_include_path(self, path: str) -> bool:
+        """Determine if a path should be included based on sparse patterns.
+
+        Args:
+            path: Path to check (e.g., 'src/file.py')
+
+        Returns:
+            True if the path matches the sparse patterns, False otherwise
+        """
+        self._load_patterns()
+        from .sparse_patterns import match_sparse_patterns
+
+        # Determine if path is a directory based on whether it ends with '/'
+        path_is_dir = path.endswith("/")
+        path_str = path.rstrip("/")
+
+        assert self._patterns is not None  # _load_patterns ensures this
+        return match_sparse_patterns(path_str, self._patterns, path_is_dir=path_is_dir)
+
+    def should_include_blob(self, blob_size: int) -> bool:
+        """Include all blobs (sparse filtering is path-based, not size-based)."""
+        return True
+
+    def should_include_tree(self, depth: int) -> bool:
+        """Include all trees (sparse filtering is path-based)."""
+        return True
+
+    def to_spec_string(self) -> str:
+        """Return 'sparse:oid=<oid>'."""
+        return f"sparse:oid={self.oid.decode('ascii') if isinstance(self.oid, bytes) else self.oid}"
+
+    def __repr__(self) -> str:
+        """Return string representation of the filter."""
+        oid_str = self.oid.decode("ascii") if isinstance(self.oid, bytes) else self.oid
+        return f"SparseOidFilter(oid={oid_str!r})"
+
+
+class CombineFilter(FilterSpec):
+    """Filter that combines multiple filters with AND logic."""
+
+    def __init__(self, filters: list[FilterSpec]) -> None:
+        """Initialize combine filter.
+
+        Args:
+            filters: List of filters to combine
+        """
+        self.filters = filters
+
+    def should_include_blob(self, blob_size: int) -> bool:
+        """Include blob only if all filters agree."""
+        return all(f.should_include_blob(blob_size) for f in self.filters)
+
+    def should_include_tree(self, depth: int) -> bool:
+        """Include tree only if all filters agree."""
+        return all(f.should_include_tree(depth) for f in self.filters)
+
+    def to_spec_string(self) -> str:
+        """Return 'combine:<filter1>+<filter2>+...'."""
+        return "combine:" + "+".join(f.to_spec_string() for f in self.filters)
+
+    def __repr__(self) -> str:
+        """Return string representation of the filter."""
+        return f"CombineFilter(filters={self.filters!r})"
+
+
+def _parse_size(size_str: str) -> int:
+    """Parse a size specification like '100', '10k', '5m', '1g'.
+
+    Args:
+        size_str: Size string with optional unit suffix
+
+    Returns:
+        Size in bytes
+
+    Raises:
+        ValueError: If size_str is not a valid size specification
+    """
+    size_str = size_str.lower()
+    multipliers = {"k": 1024, "m": 1024 * 1024, "g": 1024 * 1024 * 1024}
+
+    if size_str[-1] in multipliers:
+        try:
+            value = int(size_str[:-1])
+            return value * multipliers[size_str[-1]]
+        except ValueError:
+            raise ValueError(f"Invalid size specification: {size_str}")
+    else:
+        try:
+            return int(size_str)
+        except ValueError:
+            raise ValueError(f"Invalid size specification: {size_str}")
+
+
+def parse_filter_spec(
+    spec: str | bytes, object_store: "BaseObjectStore | None" = None
+) -> FilterSpec:
+    """Parse a filter specification string.
+
+    Args:
+        spec: Filter specification (e.g., 'blob:none', 'blob:limit=1m')
+        object_store: Optional object store for loading sparse specifications
+
+    Returns:
+        Parsed FilterSpec object
+
+    Raises:
+        ValueError: If spec is not a valid filter specification
+
+    Examples:
+        >>> parse_filter_spec("blob:none")
+        BlobNoneFilter()
+        >>> parse_filter_spec("blob:limit=1m")
+        BlobLimitFilter(limit=1048576)
+        >>> parse_filter_spec("tree:0")
+        TreeDepthFilter(max_depth=0)
+    """
+    if isinstance(spec, bytes):
+        try:
+            spec = spec.decode("utf-8")
+        except UnicodeDecodeError as e:
+            raise ValueError(f"Filter specification must be valid UTF-8: {e}")
+
+    spec = spec.strip()
+
+    if not spec:
+        raise ValueError("Filter specification cannot be empty")
+
+    if spec == "blob:none":
+        return BlobNoneFilter()
+    elif spec.startswith("blob:limit="):
+        limit_str = spec[11:]  # len('blob:limit=') == 11
+        if not limit_str:
+            raise ValueError("blob:limit requires a size value (e.g., blob:limit=1m)")
+        try:
+            limit = _parse_size(limit_str)
+            if limit < 0:
+                raise ValueError(
+                    f"blob:limit size must be non-negative, got {limit_str}"
+                )
+            return BlobLimitFilter(limit)
+        except ValueError as e:
+            raise ValueError(f"Invalid blob:limit specification: {e}")
+    elif spec.startswith("tree:"):
+        depth_str = spec[5:]  # len('tree:') == 5
+        if not depth_str:
+            raise ValueError("tree filter requires a depth value (e.g., tree:0)")
+        try:
+            depth = int(depth_str)
+            if depth < 0:
+                raise ValueError(f"tree depth must be non-negative, got {depth}")
+            return TreeDepthFilter(depth)
+        except ValueError as e:
+            raise ValueError(f"Invalid tree filter: {e}")
+    elif spec.startswith("sparse:oid="):
+        oid_str = spec[11:]  # len('sparse:oid=') == 11
+        if not oid_str:
+            raise ValueError(
+                "sparse:oid requires an object ID (e.g., sparse:oid=abc123...)"
+            )
+        # Validate OID format (should be 40 hex chars for SHA-1 or 64 for SHA-256)
+        if not valid_hexsha(oid_str):
+            raise ValueError(
+                f"sparse:oid requires a valid object ID (40 or 64 hex chars), got {len(oid_str)} chars"
+            )
+
+        oid: ObjectID = ObjectID(oid_str.encode("ascii"))
+        return SparseOidFilter(oid, object_store=object_store)
+    elif spec.startswith("combine:"):
+        filter_str = spec[8:]  # len('combine:') == 8
+        if not filter_str:
+            raise ValueError(
+                "combine filter requires at least one filter (e.g., combine:blob:none+tree:0)"
+            )
+        filter_specs = filter_str.split("+")
+        if len(filter_specs) < 2:
+            raise ValueError(
+                "combine filter requires at least two filters separated by '+'"
+            )
+        try:
+            filters = [
+                parse_filter_spec(f, object_store=object_store) for f in filter_specs
+            ]
+        except ValueError as e:
+            raise ValueError(f"Invalid filter in combine specification: {e}")
+        return CombineFilter(filters)
+    else:
+        # Provide helpful error message with supported formats
+        raise ValueError(
+            f"Unknown filter specification: '{spec}'. "
+            f"Supported formats: blob:none, blob:limit=<n>[kmg], tree:<depth>, "
+            f"sparse:oid=<oid>, combine:<filter>+<filter>+..."
+        )
+
+
+def filter_pack_objects(
+    object_store: "BaseObjectStore",
+    object_ids: list["ObjectID"],
+    filter_spec: FilterSpec,
+) -> list["ObjectID"]:
+    """Filter a list of object IDs based on a filter specification.
+
+    This function examines each object and excludes those that don't match
+    the filter criteria (e.g., blobs that are too large, trees beyond max depth).
+
+    Args:
+        object_store: Object store to retrieve objects from
+        object_ids: List of object IDs to filter
+        filter_spec: Filter specification to apply
+
+    Returns:
+        Filtered list of object IDs that should be included in the pack
+
+    Note:
+        This function currently supports blob size filtering. Tree depth filtering
+        requires additional path/depth tracking which is not yet implemented.
+    """
+    filtered_ids = []
+
+    for oid in object_ids:
+        try:
+            obj = object_store[oid]
+        except KeyError:
+            # Object not found, skip it
+            continue
+
+        # Determine object type and apply appropriate filter
+        if isinstance(obj, Blob):
+            # Check if blob should be included based on size
+            blob_size = len(obj.data)
+            if filter_spec.should_include_blob(blob_size):
+                filtered_ids.append(oid)
+            # else: blob is filtered out
+        elif isinstance(obj, (Tree, Commit, Tag)):
+            # For now, include all trees, commits, and tags
+            # Tree depth filtering would require tracking depth during traversal
+            # which needs to be implemented at the object collection stage
+            if filter_spec.should_include_tree(0):  # depth=0 for now
+                filtered_ids.append(oid)
+        else:
+            # Unknown object type, include it to be safe
+            filtered_ids.append(oid)
+
+    return filtered_ids
+
+
+def filter_pack_objects_with_paths(
+    object_store: "BaseObjectStore",
+    wants: list["ObjectID"],
+    filter_spec: FilterSpec,
+    *,
+    progress: "Callable[[bytes], None] | None" = None,
+) -> list["ObjectID"]:
+    """Filter objects for a pack with full path and depth tracking.
+
+    This function performs a complete tree traversal starting from the wanted
+    commits, tracking paths and depths to enable proper filtering for sparse:oid
+    and tree:<depth> filters.
+
+    Args:
+        object_store: Object store to retrieve objects from
+        wants: List of commit/tree/blob IDs that are wanted
+        filter_spec: Filter specification to apply
+        progress: Optional progress callback
+
+    Returns:
+        Filtered list of object IDs that should be included in the pack
+    """
+    import stat
+
+    included_objects: set[ObjectID] = set()
+    # Track (oid, path, depth) tuples to process
+    to_process: list[tuple[ObjectID, str, int]] = []
+
+    # Start with the wanted commits
+    for want in wants:
+        try:
+            obj = object_store[want]
+        except KeyError:
+            continue
+
+        if isinstance(obj, Commit):
+            # Always include commits
+            included_objects.add(want)
+            # Add the root tree to process with depth 0
+            to_process.append((obj.tree, "", 0))
+        elif isinstance(obj, Tree):
+            # Direct tree wants start at depth 0
+            to_process.append((want, "", 0))
+        elif isinstance(obj, Tag):
+            # Always include tags
+            included_objects.add(want)
+            # Process the tagged object
+            tagged_oid = obj.object[1]
+            to_process.append((tagged_oid, "", 0))
+        elif isinstance(obj, Blob):
+            # Direct blob wants - check size filter
+            blob_size = len(obj.data)
+            if filter_spec.should_include_blob(blob_size):
+                included_objects.add(want)
+
+    # Process trees and their contents
+    processed_trees: set[ObjectID] = set()
+
+    while to_process:
+        oid, current_path, depth = to_process.pop()
+
+        # Skip if already processed
+        if oid in processed_trees:
+            continue
+
+        try:
+            obj = object_store[oid]
+        except KeyError:
+            continue
+
+        if isinstance(obj, Tree):
+            # Check if this tree should be included based on depth
+            if not filter_spec.should_include_tree(depth):
+                continue
+
+            # Include this tree
+            included_objects.add(oid)
+            processed_trees.add(oid)
+
+            # Process tree entries
+            for name, mode, entry_oid in obj.iteritems():
+                assert name is not None
+                assert mode is not None
+                assert entry_oid is not None
+
+                # Skip gitlinks
+                if S_ISGITLINK(mode):
+                    continue
+
+                # Build full path
+                if current_path:
+                    full_path = f"{current_path}/{name.decode('utf-8')}"
+                else:
+                    full_path = name.decode("utf-8")
+
+                if stat.S_ISDIR(mode):
+                    # It's a subdirectory - add to process list with increased depth
+                    to_process.append((entry_oid, full_path, depth + 1))
+                elif stat.S_ISREG(mode):
+                    # It's a blob - check filters
+                    try:
+                        blob = object_store[entry_oid]
+                    except KeyError:
+                        continue
+
+                    if not isinstance(blob, Blob):
+                        continue
+
+                    # Check filters
+                    blob_size = len(blob.data)
+
+                    # For non-path-based filters (size, blob:none), check directly
+                    if not filter_spec.should_include_blob(blob_size):
+                        continue
+
+                    # Check path filter for sparse:oid
+                    path_allowed = True
+                    if isinstance(filter_spec, SparseOidFilter):
+                        path_allowed = filter_spec.should_include_path(full_path)
+                    elif isinstance(filter_spec, CombineFilter):
+                        # Check path filters in combination
+                        for f in filter_spec.filters:
+                            if isinstance(f, SparseOidFilter):
+                                if not f.should_include_path(full_path):
+                                    path_allowed = False
+                                    break
+
+                    if not path_allowed:
+                        continue
+
+                    # Include this blob
+                    included_objects.add(entry_oid)
+
+        elif isinstance(obj, Blob):
+            # Standalone blob (shouldn't normally happen in tree traversal)
+            blob_size = len(obj.data)
+            if filter_spec.should_include_blob(blob_size):
+                included_objects.add(oid)
+
+    return list(included_objects)
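
Taken together, the new API is small: parse a spec string, then run candidate object IDs through it. A minimal usage sketch (not part of the patch itself), using only names this module defines plus an in-memory object store:

from dulwich.object_filters import filter_pack_objects, parse_filter_spec
from dulwich.object_store import MemoryObjectStore
from dulwich.objects import Blob

store = MemoryObjectStore()
small = Blob.from_string(b"small blob")       # 10 bytes
large = Blob.from_string(b"x" * 4096)         # 4 KiB
store.add_object(small)
store.add_object(large)

spec = parse_filter_spec("blob:limit=1k")     # -> BlobLimitFilter(limit=1024)
kept = filter_pack_objects(store, [small.id, large.id], spec)
assert kept == [small.id]                     # the 4 KiB blob is over the limit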

+ 38 - 0
dulwich/protocol.py

@@ -52,6 +52,7 @@ __all__ = [
     "COMMAND_DEEPEN_NOT",
     "COMMAND_DEEPEN_SINCE",
     "COMMAND_DONE",
+    "COMMAND_FILTER",
     "COMMAND_HAVE",
     "COMMAND_SHALLOW",
     "COMMAND_UNSHALLOW",
@@ -84,6 +85,7 @@ __all__ = [
     "extract_capabilities",
     "extract_capability_names",
     "extract_want_line_capabilities",
+    "find_capability",
     "format_ack_line",
     "format_capability_line",
     "format_cmd_pkt",
@@ -201,6 +203,7 @@ KNOWN_UPLOAD_CAPABILITIES = set(
         CAPABILITY_ALLOW_TIP_SHA1_IN_WANT,
         CAPABILITY_ALLOW_REACHABLE_SHA1_IN_WANT,
         CAPABILITY_FETCH,
+        CAPABILITY_FILTER,
     ]
 )
 KNOWN_RECEIVE_CAPABILITIES = set(
@@ -308,6 +311,7 @@ COMMAND_UNSHALLOW = b"unshallow"
 COMMAND_DONE = b"done"
 COMMAND_WANT = b"want"
 COMMAND_HAVE = b"have"
+COMMAND_FILTER = b"filter"
 
 
 def format_cmd_pkt(cmd: bytes, *args: bytes) -> bytes:
@@ -721,6 +725,40 @@ def ack_type(capabilities: Iterable[bytes]) -> int:
     return SINGLE_ACK
 
 
+def find_capability(
+    capabilities: Iterable[bytes], *capability_names: bytes
+) -> bytes | None:
+    """Find a capability value in a list of capabilities.
+
+    This function looks for capabilities that may include arguments after an equals sign
+    and returns only the value part (after the '='). For capabilities without values,
+    returns the capability name itself.
+
+    Args:
+      capabilities: List of capability strings
+      capability_names: Capability name(s) to search for
+
+    Returns:
+      The value after '=' if found, or the capability name if no '=', or None if not found
+
+    Example:
+      >>> caps = [b'filter=blob:none', b'agent=git/2.0', b'thin-pack']
+      >>> find_capability(caps, b'filter')
+      b'blob:none'
+      >>> find_capability(caps, b'thin-pack')
+      b'thin-pack'
+      >>> find_capability(caps, b'missing') is None
+      True
+    """
+    for cap in capabilities:
+        for name in capability_names:
+            if cap == name:
+                return cap
+            elif cap.startswith(name + b"="):
+                return cap[len(name) + 1 :]
+    return None
+
+
 class BufferedPktLineWriter:
     """Writer that wraps its data in pkt-lines and has an independent buffer.
 

+ 101 - 2
dulwich/server.py

@@ -97,6 +97,15 @@ from .errors import (
     ObjectFormatException,
     UnexpectedCommandError,
 )
+from .object_filters import (
+    CombineFilter,
+    FilterSpec,
+    SparseOidFilter,
+    TreeDepthFilter,
+    filter_pack_objects,
+    filter_pack_objects_with_paths,
+    parse_filter_spec,
+)
 from .object_store import MissingObjectFinder, PackBasedObjectStore, find_shallow
 from .objects import Commit, ObjectID, Tree, valid_hexsha
 from .pack import ObjectContainer, write_pack_from_container
@@ -104,6 +113,7 @@ from .protocol import (
     CAPABILITIES_REF,
     CAPABILITY_AGENT,
     CAPABILITY_DELETE_REFS,
+    CAPABILITY_FILTER,
     CAPABILITY_INCLUDE_TAG,
     CAPABILITY_MULTI_ACK,
     CAPABILITY_MULTI_ACK_DETAILED,
@@ -117,6 +127,7 @@ from .protocol import (
     CAPABILITY_THIN_PACK,
     COMMAND_DEEPEN,
     COMMAND_DONE,
+    COMMAND_FILTER,
     COMMAND_HAVE,
     COMMAND_SHALLOW,
     COMMAND_UNSHALLOW,
@@ -138,6 +149,7 @@ from .protocol import (
     capability_object_format,
     extract_capabilities,
     extract_want_line_capabilities,
+    find_capability,
     format_ack_line,
     format_ref_line,
     format_shallow_line,
@@ -378,6 +390,13 @@ class PackHandler(Handler):
         for cap in caps:
             if cap.startswith(CAPABILITY_AGENT + b"="):
                 continue
+            if cap.startswith(CAPABILITY_FILTER + b"="):
+                # Filter capability can have a value (e.g., filter=blob:none)
+                if CAPABILITY_FILTER not in allowable_caps:
+                    raise GitProtocolError(
+                        f"Client asked for capability {cap!r} that was not advertised."
+                    )
+                continue
             if cap not in allowable_caps:
                 raise GitProtocolError(
                     f"Client asked for capability {cap!r} that was not advertised."
@@ -438,6 +457,8 @@ class UploadPackHandler(PackHandler):
         # being processed, and the client is not accepting any other
         # data (such as side-band, see the progress method here).
         self._processing_have_lines = False
+        # Filter specification for partial clone support
+        self.filter_spec: FilterSpec | None = None
 
     def capabilities(self) -> list[bytes]:
         """Return the list of capabilities supported by upload-pack.
@@ -455,6 +476,7 @@ class UploadPackHandler(PackHandler):
             CAPABILITY_INCLUDE_TAG,
             CAPABILITY_SHALLOW,
             CAPABILITY_NO_DONE,
+            CAPABILITY_FILTER,
             capability_object_format(self.repo.object_format.name),
         ]
 
@@ -467,6 +489,26 @@ class UploadPackHandler(PackHandler):
             CAPABILITY_OFS_DELTA,
         )
 
+    def set_client_capabilities(self, caps: Iterable[bytes]) -> None:
+        """Set client capabilities and parse filter specification if present.
+
+        Args:
+            caps: List of capability strings from the client
+        """
+        super().set_client_capabilities(caps)
+        # Parse filter specification if present in capabilities
+        # In protocol v1, filter can be sent as "filter=<spec>" capability
+        # In protocol v2, filter is sent as a separate "filter <spec>" command
+        filter_spec_bytes = find_capability(caps, CAPABILITY_FILTER)
+        if filter_spec_bytes and filter_spec_bytes != CAPABILITY_FILTER:
+            # Only parse if there's an actual spec (not just the capability name)
+            try:
+                self.filter_spec = parse_filter_spec(
+                    filter_spec_bytes, object_store=self.repo.object_store
+                )
+            except ValueError as e:
+                raise GitProtocolError(f"Invalid filter specification: {e}")
+
     def progress(self, message: bytes) -> None:
         """Send a progress message to the client.
 
@@ -584,6 +626,46 @@ class UploadPackHandler(PackHandler):
             return
 
         self._start_pack_send_phase()
+
+        # Apply filter if specified (partial clone support)
+        if self.filter_spec is not None:
+            original_count = len(object_ids)
+
+            # Use path-aware filtering for tree depth and sparse:oid filters
+            # Check if filter requires path tracking
+            def needs_path_tracking(filter_spec: FilterSpec) -> bool:
+                if isinstance(filter_spec, (TreeDepthFilter, SparseOidFilter)):
+                    return True
+                if isinstance(filter_spec, CombineFilter):
+                    return any(needs_path_tracking(f) for f in filter_spec.filters)
+                return False
+
+            if needs_path_tracking(self.filter_spec):
+                # Path-aware filtering returns list of OIDs, convert to tuples for pack generation
+                filtered_oids = filter_pack_objects_with_paths(
+                    self.repo.object_store,
+                    wants,
+                    self.filter_spec,
+                    progress=self.progress,
+                )
+                object_ids = [(oid, None) for oid in filtered_oids]
+            else:
+                # Extract just the object IDs (filter_pack_objects expects list of OIDs)
+                # object_ids is a list of tuples (oid, (depth, path))
+                oid_list = [oid for oid, _hint in object_ids]
+                filtered_oids = filter_pack_objects(
+                    self.repo.object_store, oid_list, self.filter_spec
+                )
+                # Reconstruct tuples with hints for pack generation
+                filtered_oid_set = set(filtered_oids)
+                object_ids = [
+                    (oid, hint) for oid, hint in object_ids if oid in filtered_oid_set
+                ]
+
+            filtered_count = original_count - len(object_ids)
+            if filtered_count > 0:
+                self.progress((f"filtered {filtered_count} objects.\n").encode("ascii"))
+
         self.progress((f"counting objects: {len(object_ids)}, done.\n").encode("ascii"))
 
         write_pack_from_container(
@@ -640,6 +722,10 @@ def _split_proto_line(
         elif command == COMMAND_DEEPEN:
             assert fields[1] is not None
             return command, int(fields[1])
+        elif command == COMMAND_FILTER:
+            # Filter specification (e.g., "filter blob:none")
+            assert fields[1] is not None
+            return (command, fields[1])
     raise GitProtocolError(f"Received invalid line from client: {line!r}")
 
 
@@ -742,7 +828,7 @@ class _ProtocolGraphWalker:
 
     def __init__(
         self,
-        handler: PackHandler,
+        handler: "UploadPackHandler",
         object_store: ObjectContainer,
         get_peeled: Callable[[bytes], ObjectID | None],
         get_symrefs: Callable[[], dict[Ref, Ref]],
@@ -833,7 +919,7 @@ class _ProtocolGraphWalker:
         line, caps = extract_want_line_capabilities(want)
         self.handler.set_client_capabilities(caps)
         self.set_ack_type(ack_type(caps))
-        allowed = (COMMAND_WANT, COMMAND_SHALLOW, COMMAND_DEEPEN, None)
+        allowed = (COMMAND_WANT, COMMAND_SHALLOW, COMMAND_DEEPEN, COMMAND_FILTER, None)
         command, sha_result = _split_proto_line(line, allowed)
 
         want_revs: list[ObjectID] = []
@@ -845,6 +931,19 @@ class _ProtocolGraphWalker:
             command, sha_result = self.read_proto_line(allowed)
 
         self.set_wants(want_revs)
+
+        # Handle filter command if present (protocol v2)
+        if command == COMMAND_FILTER:
+            assert isinstance(sha_result, bytes)
+            try:
+                self.handler.filter_spec = parse_filter_spec(
+                    sha_result, object_store=self.handler.repo.object_store
+                )
+            except ValueError as e:
+                raise GitProtocolError(f"Invalid filter specification: {e}")
+            # Read next command after processing filter
+            command, sha_result = self.read_proto_line(allowed)
+
         if command in (COMMAND_SHALLOW, COMMAND_DEEPEN):
             assert sha_result is not None
             self.unread_proto_line(command, sha_result)
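
For orientation, the filter reaches the handler in one of two wire forms: in protocol v0/v1 it rides on the first want line as a filter=<spec> capability (unwrapped by find_capability in set_client_capabilities), while in the v2-style flow it arrives on its own "filter <spec>" pkt-line and is split by _split_proto_line. A rough sketch of how either payload becomes a FilterSpec, with the literal b"filter" standing in for CAPABILITY_FILTER:

from dulwich.object_filters import parse_filter_spec
from dulwich.protocol import find_capability

# v0/v1: spec carried as a capability on the first want line
caps = [b"multi_ack_detailed", b"thin-pack", b"filter=blob:none"]
spec_bytes = find_capability(caps, b"filter")   # -> b"blob:none"
filter_spec = parse_filter_spec(spec_bytes)     # -> BlobNoneFilter()

# v2-style: spec carried on its own "filter <spec>" line
command, arg = b"filter blob:limit=1m".split(b" ", 1)
filter_spec = parse_filter_spec(arg)            # -> BlobLimitFilter(limit=1048576)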

+ 1 - 0
tests/__init__.py

@@ -169,6 +169,7 @@ def self_test_suite() -> unittest.TestSuite:
         "notes",
         "objects",
         "objectspec",
+        "object_filters",
         "object_store",
         "pack",
         "patch",

+ 1 - 0
tests/compat/__init__.py

@@ -37,6 +37,7 @@ def test_suite() -> unittest.TestSuite:
         "lfs",
         "midx",
         "pack",
+        "partial_clone",
         "patch",
         "porcelain",
         "reftable",

+ 489 - 0
tests/compat/test_partial_clone.py

@@ -0,0 +1,489 @@
+# test_partial_clone.py -- Compatibility tests for partial clone.
+# Copyright (C) 2024 Jelmer Vernooij <jelmer@jelmer.uk>
+#
+# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
+# General Public License as published by the Free Software Foundation; version 2.0
+# or (at your option) any later version. You can redistribute it and/or
+# modify it under the terms of either of these two licenses.
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# You should have received a copy of the licenses; if not, see
+# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
+# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
+# License, Version 2.0.
+#
+
+"""Compatibility tests for partial clone support."""
+
+import os
+import shutil
+import sys
+import tempfile
+import threading
+
+from dulwich.objects import Blob, Tree
+from dulwich.repo import Repo
+from dulwich.server import DictBackend, TCPGitServer
+from dulwich.tests.utils import make_commit
+
+from .. import skipIf
+from .utils import CompatTestCase, require_git_version, run_git_or_fail
+
+
+@skipIf(sys.platform == "win32", "Broken on windows, with very long fail time.")
+class PartialCloneServerTestCase(CompatTestCase):
+    """Tests for partial clone server compatibility with git client."""
+
+    protocol = "git"
+    # Partial clone support was introduced in git 2.17.0
+    min_git_version = (2, 17, 0)
+
+    def setUp(self) -> None:
+        super().setUp()
+        require_git_version(self.min_git_version)
+
+    def _start_server(self, repo):
+        backend = DictBackend({b"/": repo})
+        dul_server = TCPGitServer(backend, b"localhost", 0)
+
+        # Start server in a thread
+        server_thread = threading.Thread(target=dul_server.serve)
+        server_thread.daemon = True
+        server_thread.start()
+
+        # Add cleanup
+        def cleanup_server():
+            dul_server.shutdown()
+            dul_server.server_close()
+            server_thread.join(timeout=1.0)
+
+        self.addCleanup(cleanup_server)
+        self._server = dul_server
+        _, port = self._server.socket.getsockname()
+        return port
+
+    def url(self, port) -> str:
+        return f"{self.protocol}://localhost:{port}/"
+
+    def test_clone_with_blob_none_filter(self) -> None:
+        """Test that git client can clone with blob:none filter."""
+        # Create repository with dulwich
+        repo_path = tempfile.mkdtemp()
+        self.addCleanup(shutil.rmtree, repo_path)
+        source_repo = Repo.init(repo_path, mkdir=False)
+
+        # Create test content with multiple blobs
+        blob1 = Blob.from_string(b"File 1 content - this is a test file")
+        blob2 = Blob.from_string(b"File 2 content - another test file")
+        blob3 = Blob.from_string(b"File 3 content - third test file")
+
+        tree = Tree()
+        tree.add(b"file1.txt", 0o100644, blob1.id)
+        tree.add(b"file2.txt", 0o100644, blob2.id)
+        tree.add(b"file3.txt", 0o100644, blob3.id)
+
+        # Add objects to repo
+        source_repo.object_store.add_object(blob1)
+        source_repo.object_store.add_object(blob2)
+        source_repo.object_store.add_object(blob3)
+        source_repo.object_store.add_object(tree)
+
+        commit = make_commit(tree=tree.id, message=b"Test commit with multiple files")
+        source_repo.object_store.add_object(commit)
+        source_repo.refs[b"refs/heads/master"] = commit.id
+
+        # Start dulwich server
+        port = self._start_server(source_repo)
+
+        # Clone with blob:none filter
+        clone_path = tempfile.mkdtemp()
+        self.addCleanup(shutil.rmtree, clone_path)
+        clone_dir = os.path.join(clone_path, "cloned_repo")
+
+        run_git_or_fail(
+            ["clone", "--filter=blob:none", "--no-checkout", self.url(port), clone_dir],
+            cwd=clone_path,
+        )
+
+        # Verify cloned repo has commit and tree but no blobs
+        cloned_repo = Repo(clone_dir)
+        self.addCleanup(cloned_repo.close)
+
+        # Commit should exist
+        self.assertEqual(cloned_repo.refs[b"refs/heads/master"], commit.id)
+
+        # Tree should exist
+        self.assertIn(tree.id, cloned_repo.object_store)
+
+        # Blobs should NOT be in object store (filtered out)
+        # Note: git may still have the blobs if they're small enough to be inlined
+        # or if it fetched them anyway, so we just verify the filter was accepted
+
+        # Verify git recognizes this as a partial clone
+        config_output = run_git_or_fail(
+            ["config", "--get", "remote.origin.promisor"], cwd=clone_dir
+        )
+        self.assertEqual(config_output.strip(), b"true")
+
+        source_repo.close()
+
+    def test_clone_with_blob_limit_filter(self) -> None:
+        """Test that git client can clone with blob:limit filter."""
+        # Create repository
+        repo_path = tempfile.mkdtemp()
+        self.addCleanup(shutil.rmtree, repo_path)
+        source_repo = Repo.init(repo_path, mkdir=False)
+
+        # Create blobs of different sizes
+        small_blob = Blob.from_string(b"small")  # 5 bytes
+        large_blob = Blob.from_string(b"x" * 1000)  # 1000 bytes
+
+        tree = Tree()
+        tree.add(b"small.txt", 0o100644, small_blob.id)
+        tree.add(b"large.txt", 0o100644, large_blob.id)
+
+        source_repo.object_store.add_object(small_blob)
+        source_repo.object_store.add_object(large_blob)
+        source_repo.object_store.add_object(tree)
+
+        commit = make_commit(tree=tree.id, message=b"Test commit with mixed sizes")
+        source_repo.object_store.add_object(commit)
+        source_repo.refs[b"refs/heads/master"] = commit.id
+
+        # Start server
+        port = self._start_server(source_repo)
+
+        # Clone with blob:limit=100 filter (should exclude large blob)
+        clone_path = tempfile.mkdtemp()
+        self.addCleanup(shutil.rmtree, clone_path)
+        clone_dir = os.path.join(clone_path, "cloned_repo")
+
+        run_git_or_fail(
+            [
+                "clone",
+                "--filter=blob:limit=100",
+                "--no-checkout",
+                self.url(port),
+                clone_dir,
+            ],
+            cwd=clone_path,
+        )
+
+        # Verify it's a partial clone
+        cloned_repo = Repo(clone_dir)
+        self.addCleanup(cloned_repo.close)
+
+        config_output = run_git_or_fail(
+            ["config", "--get", "remote.origin.promisor"], cwd=clone_dir
+        )
+        self.assertEqual(config_output.strip(), b"true")
+
+        source_repo.close()
+
+    def test_clone_with_tree_depth_filter(self) -> None:
+        """Test that git client can clone with tree:0 filter."""
+        # Create repository with nested structure
+        repo_path = tempfile.mkdtemp()
+        self.addCleanup(shutil.rmtree, repo_path)
+        source_repo = Repo.init(repo_path, mkdir=False)
+
+        # Create nested tree structure
+        blob1 = Blob.from_string(b"root file")
+        blob2 = Blob.from_string(b"nested file")
+
+        inner_tree = Tree()
+        inner_tree.add(b"nested.txt", 0o100644, blob2.id)
+
+        outer_tree = Tree()
+        outer_tree.add(b"root.txt", 0o100644, blob1.id)
+        outer_tree.add(b"subdir", 0o040000, inner_tree.id)
+
+        source_repo.object_store.add_object(blob1)
+        source_repo.object_store.add_object(blob2)
+        source_repo.object_store.add_object(inner_tree)
+        source_repo.object_store.add_object(outer_tree)
+
+        commit = make_commit(tree=outer_tree.id, message=b"Test nested structure")
+        source_repo.object_store.add_object(commit)
+        source_repo.refs[b"refs/heads/master"] = commit.id
+
+        # Start server
+        port = self._start_server(source_repo)
+
+        # Clone with tree:0 filter
+        clone_path = tempfile.mkdtemp()
+        self.addCleanup(shutil.rmtree, clone_path)
+        clone_dir = os.path.join(clone_path, "cloned_repo")
+
+        run_git_or_fail(
+            ["clone", "--filter=tree:0", "--no-checkout", self.url(port), clone_dir],
+            cwd=clone_path,
+        )
+
+        # Verify it's a partial clone
+        cloned_repo = Repo(clone_dir)
+        self.addCleanup(cloned_repo.close)
+
+        config_output = run_git_or_fail(
+            ["config", "--get", "remote.origin.promisor"], cwd=clone_dir
+        )
+        self.assertEqual(config_output.strip(), b"true")
+
+        source_repo.close()
+
+    def test_clone_with_filter_protocol_v0(self) -> None:
+        """Test that git client can clone with filter using protocol v0."""
+        # Create repository with dulwich
+        repo_path = tempfile.mkdtemp()
+        self.addCleanup(shutil.rmtree, repo_path)
+        source_repo = Repo.init(repo_path, mkdir=False)
+
+        # Create test content
+        blob = Blob.from_string(b"test content")
+        tree = Tree()
+        tree.add(b"file.txt", 0o100644, blob.id)
+
+        source_repo.object_store.add_object(blob)
+        source_repo.object_store.add_object(tree)
+
+        commit = make_commit(tree=tree.id, message=b"Test commit")
+        source_repo.object_store.add_object(commit)
+        source_repo.refs[b"refs/heads/master"] = commit.id
+
+        # Start server
+        port = self._start_server(source_repo)
+
+        # Clone with protocol v0 and blob:none filter
+        clone_path = tempfile.mkdtemp()
+        self.addCleanup(shutil.rmtree, clone_path)
+        clone_dir = os.path.join(clone_path, "cloned_repo")
+
+        run_git_or_fail(
+            [
+                "-c",
+                "protocol.version=0",
+                "clone",
+                "--filter=blob:none",
+                "--no-checkout",
+                self.url(port),
+                clone_dir,
+            ],
+            cwd=clone_path,
+        )
+
+        # Verify partial clone
+        cloned_repo = Repo(clone_dir)
+        self.addCleanup(cloned_repo.close)
+        self.assertIn(commit.id, cloned_repo.object_store)
+        self.assertIn(tree.id, cloned_repo.object_store)
+
+        source_repo.close()
+
+    def test_clone_with_filter_protocol_v2(self) -> None:
+        """Test that git client can clone with filter using protocol v2."""
+        # Create repository with dulwich
+        repo_path = tempfile.mkdtemp()
+        self.addCleanup(shutil.rmtree, repo_path)
+        source_repo = Repo.init(repo_path, mkdir=False)
+
+        # Create test content
+        blob = Blob.from_string(b"test content")
+        tree = Tree()
+        tree.add(b"file.txt", 0o100644, blob.id)
+
+        source_repo.object_store.add_object(blob)
+        source_repo.object_store.add_object(tree)
+
+        commit = make_commit(tree=tree.id, message=b"Test commit")
+        source_repo.object_store.add_object(commit)
+        source_repo.refs[b"refs/heads/master"] = commit.id
+
+        # Start server
+        port = self._start_server(source_repo)
+
+        # Clone with protocol v2 and blob:none filter
+        clone_path = tempfile.mkdtemp()
+        self.addCleanup(shutil.rmtree, clone_path)
+        clone_dir = os.path.join(clone_path, "cloned_repo")
+
+        run_git_or_fail(
+            [
+                "-c",
+                "protocol.version=2",
+                "clone",
+                "--filter=blob:none",
+                "--no-checkout",
+                self.url(port),
+                clone_dir,
+            ],
+            cwd=clone_path,
+        )
+
+        # Verify partial clone
+        cloned_repo = Repo(clone_dir)
+        self.addCleanup(cloned_repo.close)
+        self.assertIn(commit.id, cloned_repo.object_store)
+        self.assertIn(tree.id, cloned_repo.object_store)
+
+        source_repo.close()
+
+
+@skipIf(sys.platform == "win32", "Broken on windows, with very long fail time.")
+class PartialCloneClientTestCase(CompatTestCase):
+    """Tests for partial clone client compatibility with git server."""
+
+    # Partial clone support was introduced in git 2.17.0
+    min_git_version = (2, 17, 0)
+
+    def setUp(self) -> None:
+        super().setUp()
+        require_git_version(self.min_git_version)
+
+    def test_fetch_with_blob_none_filter(self) -> None:
+        """Test that dulwich client can fetch with blob:none filter."""
+        from dulwich.client import get_transport_and_path
+
+        # Create a git repository using git itself
+        repo_path = tempfile.mkdtemp()
+        self.addCleanup(shutil.rmtree, repo_path)
+
+        # Initialize with git
+        run_git_or_fail(["init"], cwd=repo_path)
+        run_git_or_fail(["config", "user.name", "Test User"], cwd=repo_path)
+        run_git_or_fail(["config", "user.email", "test@example.com"], cwd=repo_path)
+
+        # Create test files
+        file1 = os.path.join(repo_path, "file1.txt")
+        with open(file1, "wb") as f:
+            f.write(b"Content of file 1")
+
+        file2 = os.path.join(repo_path, "file2.txt")
+        with open(file2, "wb") as f:
+            f.write(b"Content of file 2")
+
+        # Commit files
+        run_git_or_fail(["add", "."], cwd=repo_path)
+        run_git_or_fail(["commit", "-m", "Initial commit"], cwd=repo_path)
+
+        # Start git daemon
+        daemon_port = self._start_git_daemon(repo_path)
+
+        # Create destination repo
+        dest_path = tempfile.mkdtemp()
+        self.addCleanup(shutil.rmtree, dest_path)
+        dest_repo = Repo.init(dest_path, mkdir=False)
+        self.addCleanup(dest_repo.close)
+
+        # Fetch with blob:none filter using dulwich client
+        client, path = get_transport_and_path(
+            f"git://localhost:{daemon_port}/",
+            thin_packs=False,
+        )
+
+        def determine_wants(refs, depth=None):
+            # Get all refs
+            return list(refs.values())
+
+        # Fetch with filter
+        result = client.fetch(
+            path,
+            dest_repo,
+            determine_wants=determine_wants,
+            progress=None,
+            filter_spec=b"blob:none",
+        )
+
+        # The fetch should succeed with partial clone
+        self.assertIsNotNone(result)
+
+    def test_clone_with_filter(self) -> None:
+        """Test that dulwich clone function works with filter."""
+        from dulwich.client import get_transport_and_path
+
+        # Create a git repository
+        repo_path = tempfile.mkdtemp()
+        self.addCleanup(shutil.rmtree, repo_path)
+
+        run_git_or_fail(["init"], cwd=repo_path)
+        run_git_or_fail(["config", "user.name", "Test User"], cwd=repo_path)
+        run_git_or_fail(["config", "user.email", "test@example.com"], cwd=repo_path)
+
+        # Create and commit a file
+        test_file = os.path.join(repo_path, "test.txt")
+        with open(test_file, "wb") as f:
+            f.write(b"Test content for partial clone")
+        run_git_or_fail(["add", "."], cwd=repo_path)
+        run_git_or_fail(["commit", "-m", "Test commit"], cwd=repo_path)
+
+        # Start git daemon
+        daemon_port = self._start_git_daemon(repo_path)
+
+        # Clone with dulwich using filter
+        dest_path = tempfile.mkdtemp()
+        self.addCleanup(shutil.rmtree, dest_path)
+
+        client, path = get_transport_and_path(f"git://localhost:{daemon_port}/")
+
+        # Clone with blob:limit filter
+        cloned_repo = client.clone(
+            path,
+            dest_path,
+            mkdir=False,
+            filter_spec=b"blob:limit=100",
+        )
+        self.addCleanup(cloned_repo.close)
+
+        # Verify clone succeeded
+        self.assertTrue(os.path.exists(dest_path))
+        self.assertTrue(os.path.exists(os.path.join(dest_path, ".git")))
+
+    def _start_git_daemon(self, repo_path):
+        """Start git daemon for testing."""
+        import socket
+        import subprocess
+        import time
+
+        # Find an available port
+        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+        sock.bind(("localhost", 0))
+        _, port = sock.getsockname()
+        sock.close()
+
+        # Mark directory as git daemon export
+        export_file = os.path.join(repo_path, "git-daemon-export-ok")
+        with open(export_file, "w") as f:
+            f.write("")
+
+        # Start git daemon
+        daemon_process = subprocess.Popen(
+            [
+                "git",
+                "daemon",
+                "--reuseaddr",
+                f"--port={port}",
+                "--base-path=.",
+                "--export-all",
+                "--enable=receive-pack",
+                ".",
+            ],
+            cwd=repo_path,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+
+        # Give daemon time to start
+        time.sleep(0.5)
+
+        def cleanup_daemon():
+            daemon_process.terminate()
+            daemon_process.wait(timeout=2)
+
+        self.addCleanup(cleanup_daemon)
+
+        return port
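
The compat tests above exercise blob:none, blob:limit and tree:0; sparse:oid only appears in the unit tests below. A small sketch of how that filter resolves paths, under the assumption that a one-line "/src/" entry is a pattern dulwich.sparse_patterns accepts:

from dulwich.object_filters import parse_filter_spec
from dulwich.object_store import MemoryObjectStore
from dulwich.objects import Blob

store = MemoryObjectStore()
# Blob holding sparse-checkout style patterns (assumed pattern syntax)
patterns = Blob.from_string(b"/src/\n")
store.add_object(patterns)

spec = parse_filter_spec(
    "sparse:oid=" + patterns.id.decode("ascii"), object_store=store
)
# Path decisions are delegated to dulwich.sparse_patterns
spec.should_include_path("src/main.py")
spec.should_include_path("docs/readme.md")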

+ 999 - 0
tests/test_object_filters.py

@@ -0,0 +1,999 @@
+# test_object_filters.py -- Tests for object filtering
+# Copyright (C) 2024 Jelmer Vernooij <jelmer@jelmer.uk>
+#
+# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
+# General Public License as published by the Free Software Foundation; version 2.0
+# or (at your option) any later version. You can redistribute it and/or
+# modify it under the terms of either of these two licenses.
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# You should have received a copy of the licenses; if not, see
+# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
+# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
+# License, Version 2.0.
+#
+
+"""Tests for object filtering (partial clone filter specifications)."""
+
+import os
+import tempfile
+
+from dulwich.object_filters import (
+    BlobLimitFilter,
+    BlobNoneFilter,
+    CombineFilter,
+    SparseOidFilter,
+    TreeDepthFilter,
+    filter_pack_objects,
+    parse_filter_spec,
+)
+from dulwich.object_store import MemoryObjectStore
+from dulwich.objects import Blob, Tree
+from dulwich.repo import Repo
+from dulwich.tests.utils import make_commit
+
+from . import TestCase
+
+
+class ParseFilterSpecTests(TestCase):
+    """Test parse_filter_spec function."""
+
+    def test_parse_blob_none(self):
+        """Test parsing 'blob:none' filter."""
+        filter_spec = parse_filter_spec("blob:none")
+        self.assertIsInstance(filter_spec, BlobNoneFilter)
+        self.assertEqual("blob:none", filter_spec.to_spec_string())
+
+    def test_parse_blob_none_bytes(self):
+        """Test parsing 'blob:none' as bytes."""
+        filter_spec = parse_filter_spec(b"blob:none")
+        self.assertIsInstance(filter_spec, BlobNoneFilter)
+
+    def test_parse_blob_limit_bytes(self):
+        """Test parsing 'blob:limit=100' in bytes."""
+        filter_spec = parse_filter_spec("blob:limit=100")
+        self.assertIsInstance(filter_spec, BlobLimitFilter)
+        self.assertEqual(100, filter_spec.limit)
+
+    def test_parse_blob_limit_kb(self):
+        """Test parsing 'blob:limit=10k'."""
+        filter_spec = parse_filter_spec("blob:limit=10k")
+        self.assertIsInstance(filter_spec, BlobLimitFilter)
+        self.assertEqual(10 * 1024, filter_spec.limit)
+
+    def test_parse_blob_limit_mb(self):
+        """Test parsing 'blob:limit=5m'."""
+        filter_spec = parse_filter_spec("blob:limit=5m")
+        self.assertIsInstance(filter_spec, BlobLimitFilter)
+        self.assertEqual(5 * 1024 * 1024, filter_spec.limit)
+
+    def test_parse_blob_limit_gb(self):
+        """Test parsing 'blob:limit=1g'."""
+        filter_spec = parse_filter_spec("blob:limit=1g")
+        self.assertIsInstance(filter_spec, BlobLimitFilter)
+        self.assertEqual(1024 * 1024 * 1024, filter_spec.limit)
+
+    def test_parse_tree_depth(self):
+        """Test parsing 'tree:0' filter."""
+        filter_spec = parse_filter_spec("tree:0")
+        self.assertIsInstance(filter_spec, TreeDepthFilter)
+        self.assertEqual(0, filter_spec.max_depth)
+
+    def test_parse_tree_depth_nonzero(self):
+        """Test parsing 'tree:3' filter."""
+        filter_spec = parse_filter_spec("tree:3")
+        self.assertIsInstance(filter_spec, TreeDepthFilter)
+        self.assertEqual(3, filter_spec.max_depth)
+
+    def test_parse_sparse_oid(self):
+        """Test parsing 'sparse:oid=<oid>' filter."""
+        oid = b"1234567890abcdef1234567890abcdef12345678"
+        filter_spec = parse_filter_spec(f"sparse:oid={oid.decode('ascii')}")
+        self.assertIsInstance(filter_spec, SparseOidFilter)
+        self.assertEqual(oid, filter_spec.oid)
+
+    def test_parse_combine(self):
+        """Test parsing 'combine:blob:none+tree:0' filter."""
+        filter_spec = parse_filter_spec("combine:blob:none+tree:0")
+        self.assertIsInstance(filter_spec, CombineFilter)
+        self.assertEqual(2, len(filter_spec.filters))
+        self.assertIsInstance(filter_spec.filters[0], BlobNoneFilter)
+        self.assertIsInstance(filter_spec.filters[1], TreeDepthFilter)
+
+    def test_parse_combine_multiple(self):
+        """Test parsing combine filter with 3+ filters."""
+        filter_spec = parse_filter_spec("combine:blob:none+tree:0+blob:limit=1m")
+        self.assertIsInstance(filter_spec, CombineFilter)
+        self.assertEqual(3, len(filter_spec.filters))
+
+    def test_parse_unknown_spec(self):
+        """Test that unknown filter specs raise ValueError."""
+        with self.assertRaises(ValueError) as cm:
+            parse_filter_spec("unknown:spec")
+        self.assertIn("Unknown filter specification", str(cm.exception))
+
+    def test_parse_invalid_tree_depth(self):
+        """Test that invalid tree depth raises ValueError."""
+        with self.assertRaises(ValueError) as cm:
+            parse_filter_spec("tree:invalid")
+        self.assertIn("Invalid tree filter", str(cm.exception))
+
+    def test_parse_invalid_blob_limit(self):
+        """Test that invalid blob limit raises ValueError."""
+        with self.assertRaises(ValueError) as cm:
+            parse_filter_spec("blob:limit=invalid")
+        self.assertIn("Invalid", str(cm.exception))
+
+    def test_parse_empty_spec(self):
+        """Test that empty filter spec raises ValueError."""
+        with self.assertRaises(ValueError) as cm:
+            parse_filter_spec("")
+        self.assertIn("cannot be empty", str(cm.exception))
+
+    def test_parse_blob_limit_no_value(self):
+        """Test that blob:limit without value raises ValueError."""
+        with self.assertRaises(ValueError) as cm:
+            parse_filter_spec("blob:limit=")
+        self.assertIn("requires a size value", str(cm.exception))
+
+    def test_parse_tree_no_value(self):
+        """Test that tree: without depth raises ValueError."""
+        with self.assertRaises(ValueError) as cm:
+            parse_filter_spec("tree:")
+        self.assertIn("requires a depth value", str(cm.exception))
+
+    def test_parse_tree_negative_depth(self):
+        """Test that negative tree depth raises ValueError."""
+        with self.assertRaises(ValueError) as cm:
+            parse_filter_spec("tree:-1")
+        self.assertIn("non-negative", str(cm.exception))
+
+    def test_parse_sparse_oid_invalid_length(self):
+        """Test that invalid OID length raises ValueError."""
+        with self.assertRaises(ValueError) as cm:
+            parse_filter_spec("sparse:oid=abc123")
+        self.assertIn("40 or 64 hex chars", str(cm.exception))
+
+    def test_parse_sparse_oid_invalid_hex(self):
+        """Test that non-hex OID raises ValueError."""
+        with self.assertRaises(ValueError) as cm:
+            parse_filter_spec("sparse:oid=" + "x" * 40)
+        self.assertIn("valid object ID", str(cm.exception))
+
+    def test_parse_combine_single_filter(self):
+        """Test that combine with single filter raises ValueError."""
+        with self.assertRaises(ValueError) as cm:
+            parse_filter_spec("combine:blob:none")
+        self.assertIn("at least two filters", str(cm.exception))
+
+    def test_parse_unknown_with_helpful_message(self):
+        """Test that unknown spec gives helpful error message."""
+        with self.assertRaises(ValueError) as cm:
+            parse_filter_spec("unknown:spec")
+        error_msg = str(cm.exception)
+        self.assertIn("Unknown filter specification", error_msg)
+        self.assertIn("Supported formats", error_msg)
+        self.assertIn("blob:none", error_msg)
+
+
+class BlobNoneFilterTests(TestCase):
+    """Test BlobNoneFilter class."""
+
+    def test_should_include_blob(self):
+        """Test that BlobNoneFilter excludes all blobs."""
+        filter_spec = BlobNoneFilter()
+        self.assertFalse(filter_spec.should_include_blob(0))
+        self.assertFalse(filter_spec.should_include_blob(100))
+        self.assertFalse(filter_spec.should_include_blob(1024 * 1024))
+
+    def test_should_include_tree(self):
+        """Test that BlobNoneFilter includes all trees."""
+        filter_spec = BlobNoneFilter()
+        self.assertTrue(filter_spec.should_include_tree(0))
+        self.assertTrue(filter_spec.should_include_tree(1))
+        self.assertTrue(filter_spec.should_include_tree(100))
+
+    def test_to_spec_string(self):
+        """Test conversion back to spec string."""
+        filter_spec = BlobNoneFilter()
+        self.assertEqual("blob:none", filter_spec.to_spec_string())
+
+    def test_repr(self):
+        """Test repr output."""
+        filter_spec = BlobNoneFilter()
+        self.assertEqual("BlobNoneFilter()", repr(filter_spec))
+
+
+class BlobLimitFilterTests(TestCase):
+    """Test BlobLimitFilter class."""
+
+    def test_should_include_blob_under_limit(self):
+        """Test that blobs under limit are included."""
+        filter_spec = BlobLimitFilter(1024)
+        self.assertTrue(filter_spec.should_include_blob(0))
+        self.assertTrue(filter_spec.should_include_blob(512))
+        self.assertTrue(filter_spec.should_include_blob(1024))
+
+    def test_should_include_blob_over_limit(self):
+        """Test that blobs over limit are excluded."""
+        filter_spec = BlobLimitFilter(1024)
+        self.assertFalse(filter_spec.should_include_blob(1025))
+        self.assertFalse(filter_spec.should_include_blob(2048))
+
+    def test_should_include_tree(self):
+        """Test that BlobLimitFilter includes all trees."""
+        filter_spec = BlobLimitFilter(1024)
+        self.assertTrue(filter_spec.should_include_tree(0))
+        self.assertTrue(filter_spec.should_include_tree(100))
+
+    def test_to_spec_string_bytes(self):
+        """Test conversion to spec string with bytes."""
+        filter_spec = BlobLimitFilter(100)
+        self.assertEqual("blob:limit=100", filter_spec.to_spec_string())
+
+    def test_to_spec_string_kb(self):
+        """Test conversion to spec string with KB."""
+        filter_spec = BlobLimitFilter(10 * 1024)
+        self.assertEqual("blob:limit=10k", filter_spec.to_spec_string())
+
+    def test_to_spec_string_mb(self):
+        """Test conversion to spec string with MB."""
+        filter_spec = BlobLimitFilter(5 * 1024 * 1024)
+        self.assertEqual("blob:limit=5m", filter_spec.to_spec_string())
+
+    def test_to_spec_string_gb(self):
+        """Test conversion to spec string with GB."""
+        filter_spec = BlobLimitFilter(2 * 1024 * 1024 * 1024)
+        self.assertEqual("blob:limit=2g", filter_spec.to_spec_string())
+
+    def test_to_spec_string_not_round(self):
+        """Test conversion to spec string with non-round size."""
+        filter_spec = BlobLimitFilter(1500)
+        self.assertEqual("blob:limit=1500", filter_spec.to_spec_string())
+
+    def test_repr(self):
+        """Test repr output."""
+        filter_spec = BlobLimitFilter(1024)
+        self.assertEqual("BlobLimitFilter(limit=1024)", repr(filter_spec))
+
+
+class TreeDepthFilterTests(TestCase):
+    """Test TreeDepthFilter class."""
+
+    def test_should_include_blob(self):
+        """Test that TreeDepthFilter includes all blobs."""
+        filter_spec = TreeDepthFilter(0)
+        self.assertTrue(filter_spec.should_include_blob(0))
+        self.assertTrue(filter_spec.should_include_blob(1024))
+
+    def test_should_include_tree_at_depth(self):
+        """Test that trees at or below max_depth are included."""
+        filter_spec = TreeDepthFilter(2)
+        self.assertTrue(filter_spec.should_include_tree(0))
+        self.assertTrue(filter_spec.should_include_tree(1))
+        self.assertTrue(filter_spec.should_include_tree(2))
+
+    def test_should_include_tree_beyond_depth(self):
+        """Test that trees beyond max_depth are excluded."""
+        filter_spec = TreeDepthFilter(2)
+        self.assertFalse(filter_spec.should_include_tree(3))
+        self.assertFalse(filter_spec.should_include_tree(10))
+
+    def test_to_spec_string(self):
+        """Test conversion back to spec string."""
+        filter_spec = TreeDepthFilter(3)
+        self.assertEqual("tree:3", filter_spec.to_spec_string())
+
+    def test_repr(self):
+        """Test repr output."""
+        filter_spec = TreeDepthFilter(2)
+        self.assertEqual("TreeDepthFilter(max_depth=2)", repr(filter_spec))
+
+
+class SparseOidFilterTests(TestCase):
+    """Test SparseOidFilter class."""
+
+    def test_should_include_blob(self):
+        """Test that SparseOidFilter includes all blobs."""
+        oid = b"1234567890abcdef1234567890abcdef12345678"
+        filter_spec = SparseOidFilter(oid)
+        self.assertTrue(filter_spec.should_include_blob(0))
+        self.assertTrue(filter_spec.should_include_blob(1024))
+
+    def test_should_include_tree(self):
+        """Test that SparseOidFilter includes all trees."""
+        oid = b"1234567890abcdef1234567890abcdef12345678"
+        filter_spec = SparseOidFilter(oid)
+        self.assertTrue(filter_spec.should_include_tree(0))
+        self.assertTrue(filter_spec.should_include_tree(10))
+
+    def test_to_spec_string(self):
+        """Test conversion back to spec string."""
+        oid = b"1234567890abcdef1234567890abcdef12345678"
+        filter_spec = SparseOidFilter(oid)
+        expected = "sparse:oid=1234567890abcdef1234567890abcdef12345678"
+        self.assertEqual(expected, filter_spec.to_spec_string())
+
+    def test_repr(self):
+        """Test repr output."""
+        oid = b"1234567890abcdef1234567890abcdef12345678"
+        filter_spec = SparseOidFilter(oid)
+        self.assertIn("SparseOidFilter", repr(filter_spec))
+        self.assertIn("1234567890abcdef1234567890abcdef12345678", repr(filter_spec))
+
+    def test_load_patterns_from_blob(self):
+        """Test loading sparse patterns from a blob object."""
+        from dulwich.object_store import MemoryObjectStore
+        from dulwich.objects import Blob
+
+        # Create a sparse patterns blob
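+        # (include *.txt, negate *.log, include the /src/ directory)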
+        patterns = b"*.txt\n!*.log\n/src/\n"
+        blob = Blob.from_string(patterns)
+
+        object_store = MemoryObjectStore()
+        object_store.add_object(blob)
+
+        filter_spec = SparseOidFilter(blob.id, object_store=object_store)
+        filter_spec._load_patterns()
+
+        # Verify patterns were loaded
+        self.assertIsNotNone(filter_spec._patterns)
+        self.assertEqual(3, len(filter_spec._patterns))
+
+    def test_load_patterns_missing_blob(self):
+        """Test error when sparse blob is not found."""
+        from dulwich.object_store import MemoryObjectStore
+
+        oid = b"1234567890abcdef1234567890abcdef12345678"
+        object_store = MemoryObjectStore()
+
+        filter_spec = SparseOidFilter(oid, object_store=object_store)
+
+        with self.assertRaises(ValueError) as cm:
+            filter_spec._load_patterns()
+        self.assertIn("not found", str(cm.exception))
+
+    def test_load_patterns_not_a_blob(self):
+        """Test error when sparse OID points to non-blob object."""
+        from dulwich.object_store import MemoryObjectStore
+        from dulwich.objects import Tree
+
+        tree = Tree()
+        object_store = MemoryObjectStore()
+        object_store.add_object(tree)
+
+        filter_spec = SparseOidFilter(tree.id, object_store=object_store)
+
+        with self.assertRaises(ValueError) as cm:
+            filter_spec._load_patterns()
+        self.assertIn("not a blob", str(cm.exception))
+
+    def test_load_patterns_without_object_store(self):
+        """Test error when trying to load patterns without object store."""
+        oid = b"1234567890abcdef1234567890abcdef12345678"
+        filter_spec = SparseOidFilter(oid)
+
+        with self.assertRaises(ValueError) as cm:
+            filter_spec._load_patterns()
+        self.assertIn("without an object store", str(cm.exception))
+
+    def test_should_include_path_matching(self):
+        """Test path matching with sparse patterns."""
+        from dulwich.object_store import MemoryObjectStore
+        from dulwich.objects import Blob
+
+        # Create a sparse patterns blob: include *.txt files
+        patterns = b"*.txt\n"
+        blob = Blob.from_string(patterns)
+
+        object_store = MemoryObjectStore()
+        object_store.add_object(blob)
+
+        filter_spec = SparseOidFilter(blob.id, object_store=object_store)
+
+        # .txt files should be included
+        self.assertTrue(filter_spec.should_include_path("readme.txt"))
+        self.assertTrue(filter_spec.should_include_path("docs/file.txt"))
+
+        # Other files should not be included
+        self.assertFalse(filter_spec.should_include_path("readme.md"))
+        self.assertFalse(filter_spec.should_include_path("script.py"))
+
+    def test_should_include_path_negation(self):
+        """Test path matching with negation patterns."""
+        from dulwich.object_store import MemoryObjectStore
+        from dulwich.objects import Blob
+
+        # Include all .txt files except logs
+        patterns = b"*.txt\n!*.log\n"
+        blob = Blob.from_string(patterns)
+
+        object_store = MemoryObjectStore()
+        object_store.add_object(blob)
+
+        filter_spec = SparseOidFilter(blob.id, object_store=object_store)
+
+        # .txt files should be included
+        self.assertTrue(filter_spec.should_include_path("readme.txt"))
+
+        # .log files should be excluded: they never match the *.txt include
+        # pattern and are explicitly negated by !*.log.
+        # Note: This depends on pattern order and the sparse_patterns implementation.
+        self.assertFalse(filter_spec.should_include_path("debug.log"))
+
+
+class CombineFilterTests(TestCase):
+    """Test CombineFilter class."""
+
+    def test_should_include_blob_all_allow(self):
+        """Test that blob is included when all filters allow it."""
+        filters = [BlobLimitFilter(1024), BlobLimitFilter(2048)]
+        filter_spec = CombineFilter(filters)
+        self.assertTrue(filter_spec.should_include_blob(512))
+
+    def test_should_include_blob_one_denies(self):
+        """Test that blob is excluded when one filter denies it."""
+        filters = [BlobLimitFilter(1024), BlobNoneFilter()]
+        filter_spec = CombineFilter(filters)
+        self.assertFalse(filter_spec.should_include_blob(512))
+
+    def test_should_include_tree_all_allow(self):
+        """Test that tree is included when all filters allow it."""
+        filters = [TreeDepthFilter(2), TreeDepthFilter(3)]
+        filter_spec = CombineFilter(filters)
+        self.assertTrue(filter_spec.should_include_tree(1))
+
+    def test_should_include_tree_one_denies(self):
+        """Test that tree is excluded when one filter denies it."""
+        filters = [TreeDepthFilter(2), TreeDepthFilter(1)]
+        filter_spec = CombineFilter(filters)
+        self.assertFalse(filter_spec.should_include_tree(2))
+
+    def test_to_spec_string(self):
+        """Test conversion back to spec string."""
+        filters = [BlobNoneFilter(), TreeDepthFilter(0)]
+        filter_spec = CombineFilter(filters)
+        self.assertEqual("combine:blob:none+tree:0", filter_spec.to_spec_string())
+
+    def test_repr(self):
+        """Test repr output."""
+        filters = [BlobNoneFilter()]
+        filter_spec = CombineFilter(filters)
+        self.assertIn("CombineFilter", repr(filter_spec))
+
+
+class FilterPackObjectsTests(TestCase):
+    """Test filter_pack_objects function."""
+
+    def setUp(self):
+        super().setUp()
+        self.store = MemoryObjectStore()
+
+        # Create test objects
+        self.small_blob = Blob.from_string(b"small")
+        self.large_blob = Blob.from_string(b"x" * 2000)
+        self.tree = Tree()
+        self.commit = make_commit(tree=self.tree.id)
+
+        # Add objects to store
+        self.store.add_object(self.small_blob)
+        self.store.add_object(self.large_blob)
+        self.store.add_object(self.tree)
+        self.store.add_object(self.commit)
+
+    def test_filter_blob_none(self):
+        """Test that blob:none filter excludes all blobs."""
+        object_ids = [
+            self.small_blob.id,
+            self.large_blob.id,
+            self.tree.id,
+            self.commit.id,
+        ]
+
+        filter_spec = BlobNoneFilter()
+        filtered = filter_pack_objects(self.store, object_ids, filter_spec)
+
+        # Should exclude both blobs but keep tree and commit
+        self.assertNotIn(self.small_blob.id, filtered)
+        self.assertNotIn(self.large_blob.id, filtered)
+        self.assertIn(self.tree.id, filtered)
+        self.assertIn(self.commit.id, filtered)
+
+    def test_filter_blob_limit(self):
+        """Test that blob:limit filter excludes blobs over size limit."""
+        object_ids = [
+            self.small_blob.id,
+            self.large_blob.id,
+            self.tree.id,
+        ]
+
+        # Set limit to 100 bytes
+        filter_spec = BlobLimitFilter(100)
+        filtered = filter_pack_objects(self.store, object_ids, filter_spec)
+
+        # Should keep small blob but exclude large blob
+        self.assertIn(self.small_blob.id, filtered)
+        self.assertNotIn(self.large_blob.id, filtered)
+        self.assertIn(self.tree.id, filtered)
+
+    def test_filter_no_filter_keeps_all(self):
+        """Test that without filtering all objects are kept."""
+        # Create a filter that includes everything
+        filter_spec = BlobLimitFilter(10000)  # Large limit
+
+        object_ids = [
+            self.small_blob.id,
+            self.large_blob.id,
+            self.tree.id,
+            self.commit.id,
+        ]
+
+        filtered = filter_pack_objects(self.store, object_ids, filter_spec)
+
+        # All objects should be included
+        self.assertEqual(len(filtered), len(object_ids))
+        for oid in object_ids:
+            self.assertIn(oid, filtered)
+
+    def test_filter_missing_object(self):
+        """Test that missing objects are skipped without error."""
+        from dulwich.objects import ObjectID
+
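+        # An all-zero SHA that is not present in the object store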
+        fake_id = ObjectID(b"0" * 40)
+        object_ids = [fake_id, self.small_blob.id]
+
+        filter_spec = BlobNoneFilter()
+        filtered = filter_pack_objects(self.store, object_ids, filter_spec)
+
+        # Should skip the missing object
+        self.assertNotIn(fake_id, filtered)
+
+    def test_filter_combine(self):
+        """Test combined filters."""
+        object_ids = [
+            self.small_blob.id,
+            self.large_blob.id,
+            self.tree.id,
+        ]
+
+        # Combine blob:limit with another filter
+        filter_spec = CombineFilter(
+            [
+                BlobLimitFilter(100),
+                BlobNoneFilter(),  # This will exclude ALL blobs
+            ]
+        )
+
+        filtered = filter_pack_objects(self.store, object_ids, filter_spec)
+
+        # Should exclude all blobs due to BlobNoneFilter
+        self.assertNotIn(self.small_blob.id, filtered)
+        self.assertNotIn(self.large_blob.id, filtered)
+        self.assertIn(self.tree.id, filtered)
+
+
+class PartialCloneIntegrationTests(TestCase):
+    """Integration tests for partial clone with real repositories."""
+
+    def setUp(self):
+        super().setUp()
+        self.repo_dir = tempfile.mkdtemp()
+        self.addCleanup(self._cleanup)
+        self.repo = Repo.init(self.repo_dir)
+
+    def _cleanup(self):
+        """Close the repository and remove its temporary directory."""
+        import shutil
+
+        self.repo.close()
+        if os.path.exists(self.repo_dir):
+            shutil.rmtree(self.repo_dir)
+
+    def test_blob_none_filter_with_real_repo(self):
+        """Test blob:none filter excludes blobs in real repository."""
+        # Create a tree with files
+        tree = Tree()
+
+        # Add some blobs to the tree
+        blob1 = Blob.from_string(b"file1 content")
+        blob2 = Blob.from_string(b"file2 content")
+        tree.add(b"file1.txt", 0o100644, blob1.id)
+        tree.add(b"file2.txt", 0o100644, blob2.id)
+
+        # Add objects to repo
+        self.repo.object_store.add_object(blob1)
+        self.repo.object_store.add_object(blob2)
+        self.repo.object_store.add_object(tree)
+
+        # Create commit
+        commit = make_commit(tree=tree.id, message=b"Test commit")
+        self.repo.object_store.add_object(commit)
+
+        # Get all objects
+        object_ids = [blob1.id, blob2.id, tree.id, commit.id]
+
+        # Apply blob:none filter
+        filter_spec = BlobNoneFilter()
+        filtered = filter_pack_objects(self.repo.object_store, object_ids, filter_spec)
+
+        # Verify blobs are excluded
+        self.assertNotIn(blob1.id, filtered)
+        self.assertNotIn(blob2.id, filtered)
+        # But tree and commit are included
+        self.assertIn(tree.id, filtered)
+        self.assertIn(commit.id, filtered)
+
+        # Verify we have only 2 objects (tree + commit)
+        self.assertEqual(2, len(filtered))
+
+    def test_blob_limit_filter_with_mixed_sizes(self):
+        """Test blob:limit filter with mixed blob sizes."""
+        tree = Tree()
+
+        # Create blobs of different sizes
+        small_blob = Blob.from_string(b"small")  # 5 bytes
+        medium_blob = Blob.from_string(b"x" * 50)  # 50 bytes
+        large_blob = Blob.from_string(b"y" * 500)  # 500 bytes
+
+        tree.add(b"small.txt", 0o100644, small_blob.id)
+        tree.add(b"medium.txt", 0o100644, medium_blob.id)
+        tree.add(b"large.txt", 0o100644, large_blob.id)
+
+        # Add to repo
+        self.repo.object_store.add_object(small_blob)
+        self.repo.object_store.add_object(medium_blob)
+        self.repo.object_store.add_object(large_blob)
+        self.repo.object_store.add_object(tree)
+
+        commit = make_commit(tree=tree.id)
+        self.repo.object_store.add_object(commit)
+
+        # Test with 100 byte limit
+        object_ids = [
+            small_blob.id,
+            medium_blob.id,
+            large_blob.id,
+            tree.id,
+            commit.id,
+        ]
+
+        filter_spec = BlobLimitFilter(100)
+        filtered = filter_pack_objects(self.repo.object_store, object_ids, filter_spec)
+
+        # Small and medium should be included
+        self.assertIn(small_blob.id, filtered)
+        self.assertIn(medium_blob.id, filtered)
+        # Large should be excluded
+        self.assertNotIn(large_blob.id, filtered)
+        # Tree and commit included
+        self.assertIn(tree.id, filtered)
+        self.assertIn(commit.id, filtered)
+
+    def test_combined_filter_integration(self):
+        """Test combined filters in real scenario."""
+        tree = Tree()
+
+        blob1 = Blob.from_string(b"content1")
+        blob2 = Blob.from_string(b"x" * 1000)
+
+        tree.add(b"file1.txt", 0o100644, blob1.id)
+        tree.add(b"file2.txt", 0o100644, blob2.id)
+
+        self.repo.object_store.add_object(blob1)
+        self.repo.object_store.add_object(blob2)
+        self.repo.object_store.add_object(tree)
+
+        commit = make_commit(tree=tree.id)
+        self.repo.object_store.add_object(commit)
+
+        # Combine a 500-byte limit with blob:none. An object is kept only if
+        # every sub-filter keeps it, so blob:none excludes ALL blobs regardless
+        # of the size limit.
+        filter_spec = CombineFilter(
+            [
+                BlobLimitFilter(500),
+                BlobNoneFilter(),
+            ]
+        )
+
+        object_ids = [blob1.id, blob2.id, tree.id, commit.id]
+        filtered = filter_pack_objects(self.repo.object_store, object_ids, filter_spec)
+
+        # All blobs excluded
+        self.assertNotIn(blob1.id, filtered)
+        self.assertNotIn(blob2.id, filtered)
+        # Only tree and commit
+        self.assertEqual(2, len(filtered))
+
+
+class FilterPackObjectsWithPathsTests(TestCase):
+    """Test filter_pack_objects_with_paths function."""
+
+    def setUp(self):
+        super().setUp()
+        self.object_store = MemoryObjectStore()
+
+    def test_tree_depth_filtering(self):
+        """Test filtering by tree depth."""
+        from dulwich.object_filters import (
+            TreeDepthFilter,
+            filter_pack_objects_with_paths,
+        )
+        from dulwich.objects import Blob, Tree
+        from dulwich.tests.utils import make_commit
+
+        # Create a nested tree structure:
+        # root/
+        #   file1.txt (blob1)
+        #   dir1/
+        #     file2.txt (blob2)
+        #     dir2/
+        #       file3.txt (blob3)
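+        # (depths: root = 0, dir1 = 1, dir2 = 2)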
+
+        blob1 = Blob.from_string(b"file1 content")
+        blob2 = Blob.from_string(b"file2 content")
+        blob3 = Blob.from_string(b"file3 content")
+
+        # deepest tree (dir2)
+        tree_dir2 = Tree()
+        tree_dir2.add(b"file3.txt", 0o100644, blob3.id)
+
+        # middle tree (dir1)
+        tree_dir1 = Tree()
+        tree_dir1.add(b"file2.txt", 0o100644, blob2.id)
+        tree_dir1.add(b"dir2", 0o040000, tree_dir2.id)
+
+        # root tree
+        tree_root = Tree()
+        tree_root.add(b"file1.txt", 0o100644, blob1.id)
+        tree_root.add(b"dir1", 0o040000, tree_dir1.id)
+
+        # Add all objects to store
+        for obj in [blob1, blob2, blob3, tree_dir2, tree_dir1, tree_root]:
+            self.object_store.add_object(obj)
+
+        commit = make_commit(tree=tree_root.id)
+        self.object_store.add_object(commit)
+
+        # Filter with depth=1 (root + 1 level deep)
+        filter_spec = TreeDepthFilter(1)
+        filtered = filter_pack_objects_with_paths(
+            self.object_store, [commit.id], filter_spec
+        )
+
+        # Should include: commit, tree_root (depth 0), tree_dir1 (depth 1),
+        # blob1 (in root), blob2 (in dir1)
+        # Should exclude: tree_dir2 (depth 2), blob3 (in dir2)
+        self.assertIn(commit.id, filtered)
+        self.assertIn(tree_root.id, filtered)
+        self.assertIn(tree_dir1.id, filtered)
+        self.assertIn(blob1.id, filtered)
+        self.assertIn(blob2.id, filtered)
+        self.assertNotIn(tree_dir2.id, filtered)
+        self.assertNotIn(blob3.id, filtered)
+
+    def test_sparse_oid_path_filtering(self):
+        """Test filtering by sparse checkout patterns."""
+        from dulwich.object_filters import (
+            SparseOidFilter,
+            filter_pack_objects_with_paths,
+        )
+        from dulwich.objects import Blob, Tree
+        from dulwich.tests.utils import make_commit
+
+        # Create sparse patterns blob that includes only *.txt files
+        patterns = b"*.txt\n"
+        patterns_blob = Blob.from_string(patterns)
+        self.object_store.add_object(patterns_blob)
+
+        # Create a tree with mixed file types:
+        # root/
+        #   readme.txt (should be included)
+        #   script.py (should be excluded)
+        #   docs/
+        #     guide.txt (should be included)
+        #     image.png (should be excluded)
+
+        blob_readme = Blob.from_string(b"readme content")
+        blob_script = Blob.from_string(b"script content")
+        blob_guide = Blob.from_string(b"guide content")
+        blob_image = Blob.from_string(b"image content")
+
+        tree_docs = Tree()
+        tree_docs.add(b"guide.txt", 0o100644, blob_guide.id)
+        tree_docs.add(b"image.png", 0o100644, blob_image.id)
+
+        tree_root = Tree()
+        tree_root.add(b"readme.txt", 0o100644, blob_readme.id)
+        tree_root.add(b"script.py", 0o100644, blob_script.id)
+        tree_root.add(b"docs", 0o040000, tree_docs.id)
+
+        # Add all objects
+        for obj in [
+            blob_readme,
+            blob_script,
+            blob_guide,
+            blob_image,
+            tree_docs,
+            tree_root,
+        ]:
+            self.object_store.add_object(obj)
+
+        commit = make_commit(tree=tree_root.id)
+        self.object_store.add_object(commit)
+
+        # Create sparse filter
+        filter_spec = SparseOidFilter(patterns_blob.id, object_store=self.object_store)
+        filtered = filter_pack_objects_with_paths(
+            self.object_store, [commit.id], filter_spec
+        )
+
+        # Should include: commit, trees, and .txt blobs
+        self.assertIn(commit.id, filtered)
+        self.assertIn(tree_root.id, filtered)
+        self.assertIn(tree_docs.id, filtered)
+        self.assertIn(blob_readme.id, filtered)
+        self.assertIn(blob_guide.id, filtered)
+
+        # Should exclude: non-.txt blobs
+        self.assertNotIn(blob_script.id, filtered)
+        self.assertNotIn(blob_image.id, filtered)
+
+    def test_blob_size_filtering_with_paths(self):
+        """Test that blob size filtering still works with path tracking."""
+        from dulwich.object_filters import (
+            BlobLimitFilter,
+            filter_pack_objects_with_paths,
+        )
+        from dulwich.objects import Blob, Tree
+        from dulwich.tests.utils import make_commit
+
+        # Create blobs of different sizes
+        blob_small = Blob.from_string(b"small")  # 5 bytes
+        blob_large = Blob.from_string(b"x" * 1000)  # 1000 bytes
+
+        tree = Tree()
+        tree.add(b"small.txt", 0o100644, blob_small.id)
+        tree.add(b"large.txt", 0o100644, blob_large.id)
+
+        for obj in [blob_small, blob_large, tree]:
+            self.object_store.add_object(obj)
+
+        commit = make_commit(tree=tree.id)
+        self.object_store.add_object(commit)
+
+        # Filter with 100 byte limit
+        filter_spec = BlobLimitFilter(100)
+        filtered = filter_pack_objects_with_paths(
+            self.object_store, [commit.id], filter_spec
+        )
+
+        # Should include small blob but not large
+        self.assertIn(commit.id, filtered)
+        self.assertIn(tree.id, filtered)
+        self.assertIn(blob_small.id, filtered)
+        self.assertNotIn(blob_large.id, filtered)
+
+    def test_combined_sparse_and_size_filter(self):
+        """Test combining sparse patterns with blob size limits."""
+        from dulwich.object_filters import (
+            BlobLimitFilter,
+            CombineFilter,
+            SparseOidFilter,
+            filter_pack_objects_with_paths,
+        )
+        from dulwich.objects import Blob, Tree
+        from dulwich.tests.utils import make_commit
+
+        # Create sparse patterns: only *.txt files
+        patterns = b"*.txt\n"
+        patterns_blob = Blob.from_string(patterns)
+        self.object_store.add_object(patterns_blob)
+
+        # Create files:
+        # - small.txt (5 bytes, .txt) -> should be included
+        # - large.txt (1000 bytes, .txt) -> excluded by size
+        # - small.py (5 bytes, .py) -> excluded by pattern
+        # - large.py (1000 bytes, .py) -> excluded by both
+
+        blob_small_txt = Blob.from_string(b"small txt")
+        blob_large_txt = Blob.from_string(b"x" * 1000)
+        blob_small_py = Blob.from_string(b"small py")
+        blob_large_py = Blob.from_string(b"y" * 1000)
+
+        tree = Tree()
+        tree.add(b"small.txt", 0o100644, blob_small_txt.id)
+        tree.add(b"large.txt", 0o100644, blob_large_txt.id)
+        tree.add(b"small.py", 0o100644, blob_small_py.id)
+        tree.add(b"large.py", 0o100644, blob_large_py.id)
+
+        for obj in [blob_small_txt, blob_large_txt, blob_small_py, blob_large_py, tree]:
+            self.object_store.add_object(obj)
+
+        commit = make_commit(tree=tree.id)
+        self.object_store.add_object(commit)
+
+        # Combine: sparse filter + 100 byte limit
+        filter_spec = CombineFilter(
+            [
+                SparseOidFilter(patterns_blob.id, object_store=self.object_store),
+                BlobLimitFilter(100),
+            ]
+        )
+
+        filtered = filter_pack_objects_with_paths(
+            self.object_store, [commit.id], filter_spec
+        )
+
+        # Only small.txt should be included (matches pattern AND size limit)
+        self.assertIn(commit.id, filtered)
+        self.assertIn(tree.id, filtered)
+        self.assertIn(blob_small_txt.id, filtered)
+        self.assertNotIn(blob_large_txt.id, filtered)  # Too large
+        self.assertNotIn(blob_small_py.id, filtered)  # Wrong pattern
+        self.assertNotIn(blob_large_py.id, filtered)  # Both wrong
+
+    def test_blob_none_filter_with_paths(self):
+        """Test that blob:none excludes all blobs with path tracking."""
+        from dulwich.object_filters import (
+            BlobNoneFilter,
+            filter_pack_objects_with_paths,
+        )
+        from dulwich.objects import Blob, Tree
+        from dulwich.tests.utils import make_commit
+
+        blob1 = Blob.from_string(b"content1")
+        blob2 = Blob.from_string(b"content2")
+
+        tree = Tree()
+        tree.add(b"file1.txt", 0o100644, blob1.id)
+        tree.add(b"file2.txt", 0o100644, blob2.id)
+
+        for obj in [blob1, blob2, tree]:
+            self.object_store.add_object(obj)
+
+        commit = make_commit(tree=tree.id)
+        self.object_store.add_object(commit)
+
+        filter_spec = BlobNoneFilter()
+        filtered = filter_pack_objects_with_paths(
+            self.object_store, [commit.id], filter_spec
+        )
+
+        # Should include commit and tree but no blobs
+        self.assertIn(commit.id, filtered)
+        self.assertIn(tree.id, filtered)
+        self.assertNotIn(blob1.id, filtered)
+        self.assertNotIn(blob2.id, filtered)
+
+    def test_direct_tree_want(self):
+        """Test filtering when a tree (not commit) is wanted."""
+        from dulwich.object_filters import (
+            BlobLimitFilter,
+            filter_pack_objects_with_paths,
+        )
+        from dulwich.objects import Blob, Tree
+
+        blob_small = Blob.from_string(b"small")
+        blob_large = Blob.from_string(b"x" * 1000)
+
+        tree = Tree()
+        tree.add(b"small.txt", 0o100644, blob_small.id)
+        tree.add(b"large.txt", 0o100644, blob_large.id)
+
+        for obj in [blob_small, blob_large, tree]:
+            self.object_store.add_object(obj)
+
+        # Want the tree directly (not via commit)
+        filter_spec = BlobLimitFilter(100)
+        filtered = filter_pack_objects_with_paths(
+            self.object_store, [tree.id], filter_spec
+        )
+
+        # Should include tree and small blob
+        self.assertIn(tree.id, filtered)
+        self.assertIn(blob_small.id, filtered)
+        self.assertNotIn(blob_large.id, filtered)

+ 41 - 0
tests/test_protocol.py

@@ -25,6 +25,8 @@ from io import BytesIO
 
 from dulwich.errors import HangupException
 from dulwich.protocol import (
+    CAPABILITY_FILTER,
+    KNOWN_UPLOAD_CAPABILITIES,
     MULTI_ACK,
     MULTI_ACK_DETAILED,
     SINGLE_ACK,
@@ -36,6 +38,7 @@ from dulwich.protocol import (
     ack_type,
     extract_capabilities,
     extract_want_line_capabilities,
+    find_capability,
     pkt_line,
     pkt_seq,
 )
@@ -348,3 +351,41 @@ class PktLineParserTests(TestCase):
         parser.parse(b"0005z0006aba")
         self.assertEqual(pktlines, [b"z", b"ab"])
         self.assertEqual(b"a", parser.get_tail())
+
+
+class CapabilitiesTests(TestCase):
+    """Tests for protocol capabilities."""
+
+    def test_filter_capability_in_known_upload_capabilities(self) -> None:
+        """Test that CAPABILITY_FILTER is in KNOWN_UPLOAD_CAPABILITIES."""
+        self.assertIn(CAPABILITY_FILTER, KNOWN_UPLOAD_CAPABILITIES)
+
+
+class FindCapabilityTests(TestCase):
+    """Tests for find_capability function."""
+
+    def test_find_capability_with_value(self) -> None:
+        """Test finding a capability with a value."""
+        caps = [b"filter=blob:none", b"agent=git/2.0"]
+        self.assertEqual(b"blob:none", find_capability(caps, b"filter"))
+
+    def test_find_capability_without_value(self) -> None:
+        """Test finding a capability without a value."""
+        caps = [b"thin-pack", b"ofs-delta"]
+        self.assertEqual(b"thin-pack", find_capability(caps, b"thin-pack"))
+
+    def test_find_capability_not_found(self) -> None:
+        """Test finding a capability that doesn't exist."""
+        caps = [b"thin-pack", b"ofs-delta"]
+        self.assertIsNone(find_capability(caps, b"missing"))
+
+    def test_find_capability_multiple_names(self) -> None:
+        """Test finding with multiple capability names."""
+        caps = [b"filter=blob:none", b"agent=git/2.0"]
+        # b"missing" is absent, so the value of the first name that is present
+        # (b"agent") is returned
+        self.assertEqual(b"git/2.0", find_capability(caps, b"missing", b"agent"))
+
+    def test_find_capability_complex_value(self) -> None:
+        """Test finding capability with complex value."""
+        caps = [b"filter=combine:blob:none+tree:0"]
+        self.assertEqual(b"combine:blob:none+tree:0", find_capability(caps, b"filter"))

+ 28 - 0
tests/test_server.py

@@ -180,6 +180,34 @@ class UploadPackHandlerTestCase(TestCase):
         self._handler.progress(b"second message")
         self.assertRaises(IndexError, self._handler.proto.get_received_line, 2)
 
+    def test_filter_capability_advertised(self) -> None:
+        """Test that the filter capability is advertised by UploadPackHandler."""
+        caps = self._handler.capabilities()
+        self.assertIn(b"filter", caps)
+
+    def test_filter_spec_parsed(self) -> None:
+        """Test that filter specification is parsed from client capabilities."""
+        from dulwich.object_filters import BlobNoneFilter
+
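+        # The filter spec is passed as the value of the "filter" capability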
+        caps = [b"filter=blob:none", *list(self._handler.required_capabilities())]
+        self._handler.set_client_capabilities(caps)
+        self.assertIsNotNone(self._handler.filter_spec)
+        self.assertIsInstance(self._handler.filter_spec, BlobNoneFilter)
+
+    def test_filter_spec_not_present(self) -> None:
+        """Test that filter_spec is None when filter capability is not used."""
+        caps = self._handler.required_capabilities()
+        self._handler.set_client_capabilities(caps)
+        self.assertIsNone(self._handler.filter_spec)
+
+    def test_filter_spec_invalid(self) -> None:
+        """Test that invalid filter spec raises GitProtocolError."""
+        from dulwich.errors import GitProtocolError
+
+        caps = [b"filter=invalid:spec", *list(self._handler.required_capabilities())]
+        with self.assertRaises(GitProtocolError):
+            self._handler.set_client_capabilities(caps)
+
     def test_get_tagged(self) -> None:
         refs = {
             b"refs/tags/tag1": ONE,