Browse Source

Add support for reading GitAttributes (#1662)

Jelmer Vernooij 1 month ago
parent
commit
522d961303
7 changed files with 910 additions and 18 deletions
  1. 6 1
      NEWS
  2. 336 0
      dulwich/attrs.py
  3. 22 17
      dulwich/filters.py
  4. 58 0
      dulwich/repo.py
  5. 1 0
      tests/__init__.py
  6. 458 0
      tests/test_attrs.py
  7. 29 0
      tests/test_repository.py

+ 6 - 1
NEWS

@@ -3,7 +3,7 @@
  * Only write Git index extensions when they contain meaningful data.
    Previously, dulwich would write empty extensions to the index file,
    causing unnecessary bloat.
-   (Jelmer Vernooij, #1643)
+   (Andrew Shadura, Jelmer Vernooij, #1643)
 
  * Document that ``porcelain.push`` returns per-ref status information
    in the ``SendPackResult`` object. Added test coverage to verify this
@@ -14,6 +14,11 @@
    CLI command, and ``submodule_update`` CLI command. Add ``--recurse-submodules``
    option to ``clone`` command. (#506, Jelmer Vernooij)
 
+ * Add support for parsing Git attributes from .gitattributes files.
+   This enables proper handling of text/binary detection, line ending
+   normalization, and filter specifications for files.
+   (Jelmer Vernooij, #1211)
+
 0.23.1	2025-06-30
 
  * Support ``untracked_files="normal"`` argument to ``porcelain.status``,

+ 336 - 0
dulwich/attrs.py

@@ -0,0 +1,336 @@
+# attrs.py -- Git attributes for dulwich
+# Copyright (C) 2019-2020 Collabora Ltd
+# Copyright (C) 2019-2020 Andrej Shadura <andrew.shadura@collabora.co.uk>
+#
+# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
+# General Public License as published by the Free Software Foundation; version 2.0
+# or (at your option) any later version. You can redistribute it and/or
+# modify it under the terms of either of these two licenses.
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# You should have received a copy of the licenses; if not, see
+# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
+# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
+# License, Version 2.0.
+#
+
+"""Parse .gitattributes file."""
+
+import os
+import re
+from collections.abc import Generator, Mapping
+from typing import (
+    IO,
+    Optional,
+    Union,
+)
+
+AttributeValue = Union[bytes, bool, None]
+
+
+def _parse_attr(attr: bytes) -> tuple[bytes, AttributeValue]:
+    """Split one attribute token into its name and its state.
+
+    A leading ``!`` marks the attribute as unspecified (``None``), a leading
+    ``-`` marks it as unset (``False``), ``name=value`` carries a bytes value,
+    and a bare name means the attribute is set (``True``).
+
+    >>> _parse_attr(b'attr')
+    (b'attr', True)
+    >>> _parse_attr(b'-attr')
+    (b'attr', False)
+    >>> _parse_attr(b'!attr')
+    (b'attr', None)
+    >>> _parse_attr(b'attr=text')
+    (b'attr', b'text')
+    """
+    marker, remainder = attr[:1], attr[1:]
+    if marker == b"!":
+        return remainder, None
+    if marker == b"-":
+        return remainder, False
+    # partition() cuts at the first "=" only, so a value may itself contain "=".
+    key, separator, text = attr.partition(b"=")
+    if separator:
+        return key, text
+    return attr, True
+
+
+def parse_git_attributes(
+    f: IO[bytes],
+) -> Generator[tuple[bytes, Mapping[bytes, AttributeValue]], None, None]:
+    """Parse the contents of a Git attributes file.
+
+    Blank lines and lines starting with ``#`` are skipped. Every other line is
+    split on whitespace: the first field is the pattern, the remaining fields
+    are attribute tokens handed to ``_parse_attr``.
+
+    Args:
+      f: File-like object to read bytes from
+    Returns:
+      Generator of (pattern, attributes) tuples, in the order in which the
+      patterns are encountered. Names and values are bytes; a state is
+      ``True`` (set), ``False`` (unset), ``None`` (unspecified) or a bytes
+      value.
+    >>> from io import BytesIO
+    >>> list(parse_git_attributes(BytesIO(b'''*.tar.* filter=lfs diff=lfs merge=lfs -text
+    ...
+    ... # store signatures in Git
+    ... *.tar.*.asc -filter -diff merge=binary -text
+    ...
+    ... # store .dsc verbatim
+    ... *.dsc -filter !diff merge=binary !text
+    ... '''))) #doctest: +NORMALIZE_WHITESPACE
+    [(b'*.tar.*', {b'filter': b'lfs', b'diff': b'lfs', b'merge': b'lfs', b'text': False}),
+     (b'*.tar.*.asc', {b'filter': False, b'diff': False, b'merge': b'binary', b'text': False}),
+     (b'*.dsc', {b'filter': False, b'diff': None, b'merge': b'binary', b'text': None})]
+    """
+    for raw_line in f:
+        line = raw_line.strip()
+
+        # Blank lines are only there for readability; "#" starts a comment.
+        if not line or line.startswith(b"#"):
+            continue
+
+        pattern, *attrs = line.split()
+
+        # If a name is repeated on one line, the last occurrence wins.
+        yield (pattern, dict(_parse_attr(a) for a in attrs))
+
+
+def _translate_pattern(pattern: bytes) -> bytes:
+    """Translate a gitattributes pattern to a regular expression.
+
+    Similar to gitignore patterns, but simpler as gitattributes doesn't support
+    all the same features (e.g., no directory-only patterns with trailing /).
+
+    Args:
+        pattern: The glob-style pattern as written in the attributes file
+
+    Returns:
+        Regular expression source as bytes. No ``^``/``$`` anchors are added
+        here; the caller is expected to anchor the expression.
+    """
+    res = b""
+    i = 0
+    n = len(pattern)
+
+    # If pattern doesn't contain /, it can match at any level
+    if b"/" not in pattern:
+        # Optional (lazy) prefix of arbitrary leading directories.
+        res = b"(?:.*/)??"
+    elif pattern.startswith(b"/"):
+        # Leading / means root of repository
+        pattern = pattern[1:]
+        n = len(pattern)
+
+    while i < n:
+        # Slice instead of index so that c is a bytes object, not an int.
+        c = pattern[i : i + 1]
+        i += 1
+
+        if c == b"*":
+            if i < n and pattern[i : i + 1] == b"*":
+                # Double asterisk
+                i += 1
+                if i < n and pattern[i : i + 1] == b"/":
+                    # **/ - match zero or more directories
+                    res += b"(?:.*/)??"
+                    i += 1
+                elif i == n:
+                    # ** at end - match everything
+                    res += b".*"
+                else:
+                    # ** in middle
+                    res += b".*"
+            else:
+                # Single * - match any character except /
+                res += b"[^/]*"
+        elif c == b"?":
+            # Exactly one character, never a path separator.
+            res += b"[^/]"
+        elif c == b"[":
+            # Character class
+            j = i
+            # Step over a leading "!" (negation marker) ...
+            if j < n and pattern[j : j + 1] == b"!":
+                j += 1
+            # ... and over a "]" in first position, which belongs to the class
+            # rather than closing it.
+            if j < n and pattern[j : j + 1] == b"]":
+                j += 1
+            # Scan forward to the closing bracket.
+            while j < n and pattern[j : j + 1] != b"]":
+                j += 1
+            if j >= n:
+                # No closing bracket: treat the "[" as a literal character.
+                res += b"\\["
+            else:
+                # Double backslashes so they stay literal inside the regex class.
+                stuff = pattern[i:j].replace(b"\\", b"\\\\")
+                i = j + 1
+                if stuff.startswith(b"!"):
+                    # Glob negation "!" becomes regex negation "^".
+                    stuff = b"^" + stuff[1:]
+                elif stuff.startswith(b"^"):
+                    # Escape a leading "^" so the regex does not read it as negation.
+                    stuff = b"\\" + stuff
+                res += b"[" + stuff + b"]"
+        else:
+            # Any other byte is matched literally.
+            res += re.escape(c)
+
+    return res
+
+
+class Pattern:
+    """One compiled gitattributes pattern.
+
+    The original pattern bytes stay available as ``pattern``; matching is done
+    against a regular expression built once at construction time.
+    """
+
+    def __init__(self, pattern: bytes):
+        self._regex: Optional[re.Pattern[bytes]] = None
+        self.pattern = pattern
+        self._compile()
+
+    def _compile(self):
+        """Build the anchored regular expression for ``self.pattern``."""
+        # The translation is unanchored; wrap it so the whole path must match.
+        anchored = b"".join((b"^", _translate_pattern(self.pattern), b"$"))
+        self._regex = re.compile(anchored)
+
+    def match(self, path: bytes) -> bool:
+        """Tell whether ``path`` is matched by this pattern.
+
+        Args:
+            path: Path to check (relative to repository root, using / separators)
+
+        Returns:
+            True if path matches this pattern
+        """
+        regex = self._regex
+        assert regex is not None  # Always set by _compile()
+        # A single leading slash is dropped before matching.
+        candidate = path[1:] if path.startswith(b"/") else path
+        return regex.match(candidate) is not None
+
+
+def match_path(
+    patterns: list[tuple[Pattern, Mapping[bytes, AttributeValue]]], path: bytes
+) -> dict[bytes, AttributeValue]:
+    """Collect the attributes that apply to a path.
+
+    Patterns are consulted in list order, so an entry further down the list
+    overrides what an earlier entry set for the same attribute name.
+
+    Args:
+        patterns: List of (Pattern, attributes) tuples
+        path: Path to match (relative to repository root)
+
+    Returns:
+        Dictionary of attributes that apply to this path
+    """
+    resolved: dict[bytes, AttributeValue] = {}
+
+    for matcher, attr_map in patterns:
+        if not matcher.match(path):
+            continue
+        for key, state in attr_map.items():
+            if state is not None:
+                resolved[key] = state
+            else:
+                # None means "unspecified": forget any earlier setting.
+                resolved.pop(key, None)
+
+    return resolved
+
+
+def parse_gitattributes_file(
+    filename: Union[str, bytes],
+) -> list[tuple[Pattern, Mapping[bytes, AttributeValue]]]:
+    """Parse a gitattributes file and return compiled patterns.
+
+    Args:
+        filename: Path to the .gitattributes file, as str or bytes
+
+    Returns:
+        List of (Pattern, attributes) tuples, in file order
+
+    Raises:
+        OSError: If the file cannot be opened
+    """
+    # open() accepts str and bytes paths alike, so the name is passed through
+    # untouched. Re-encoding a str as UTF-8 is unnecessary and raises for
+    # names that carry surrogate-escaped bytes.
+    with open(filename, "rb") as f:
+        return [
+            (Pattern(pattern_bytes), attrs)
+            for pattern_bytes, attrs in parse_git_attributes(f)
+        ]
+
+
+def read_gitattributes(
+    path: Union[str, bytes],
+) -> list[tuple[Pattern, Mapping[bytes, AttributeValue]]]:
+    """Read .gitattributes from a directory.
+
+    Args:
+        path: Directory path to check for .gitattributes, as str or bytes
+
+    Returns:
+        List of (Pattern, attributes) tuples; empty if the directory has no
+        .gitattributes file
+    """
+    # os.fsdecode uses the filesystem encoding and error handler, so bytes
+    # paths that are not valid UTF-8 no longer raise UnicodeDecodeError.
+    gitattributes_path = os.path.join(os.fsdecode(path), ".gitattributes")
+    if not os.path.exists(gitattributes_path):
+        return []
+
+    return parse_gitattributes_file(gitattributes_path)
+
+
+class GitAttributes:
+    """A collection of gitattributes patterns that can match paths."""
+
+    def __init__(
+        self,
+        patterns: Optional[list[tuple[Pattern, Mapping[bytes, AttributeValue]]]] = None,
+    ):
+        """Initialize GitAttributes.
+
+        Args:
+            patterns: Optional list of (Pattern, attributes) tuples. The list
+                is copied, so later ``add_patterns`` calls never modify the
+                list owned by the caller.
+        """
+        # Copy instead of aliasing: add_patterns() extends this list in place.
+        self._patterns: list[tuple[Pattern, Mapping[bytes, AttributeValue]]] = (
+            list(patterns) if patterns is not None else []
+        )
+
+    def match_path(self, path: bytes) -> dict[bytes, AttributeValue]:
+        """Get attributes for a path by matching against patterns.
+
+        Args:
+            path: Path to match (relative to repository root)
+
+        Returns:
+            Dictionary of attributes that apply to this path
+        """
+        return match_path(self._patterns, path)
+
+    def add_patterns(
+        self, patterns: list[tuple[Pattern, Mapping[bytes, AttributeValue]]]
+    ) -> None:
+        """Add patterns to the collection.
+
+        The new entries are appended, so they take precedence over the
+        patterns already present when attributes are resolved.
+
+        Args:
+            patterns: List of (Pattern, attributes) tuples to add
+        """
+        self._patterns.extend(patterns)
+
+    def __len__(self) -> int:
+        """Return the number of patterns."""
+        return len(self._patterns)
+
+    def __iter__(self):
+        """Iterate over the (Pattern, attributes) tuples in order."""
+        return iter(self._patterns)
+
+    @classmethod
+    def from_file(cls, filename: Union[str, bytes]) -> "GitAttributes":
+        """Create GitAttributes from a gitattributes file.
+
+        Args:
+            filename: Path to the .gitattributes file
+
+        Returns:
+            New GitAttributes instance
+        """
+        return cls(parse_gitattributes_file(filename))
+
+    @classmethod
+    def from_path(cls, path: Union[str, bytes]) -> "GitAttributes":
+        """Create GitAttributes from .gitattributes in a directory.
+
+        Args:
+            path: Directory path to check for .gitattributes
+
+        Returns:
+            New GitAttributes instance
+        """
+        return cls(read_gitattributes(path))

+ 22 - 17
dulwich/filters.py

@@ -22,8 +22,10 @@
 """Implementation of Git filter drivers (clean/smudge filters)."""
 
 import subprocess
+from collections.abc import Mapping
 from typing import TYPE_CHECKING, Callable, Optional, Protocol
 
+from .attrs import AttributeValue, Pattern, match_path
 from .objects import Blob
 
 if TYPE_CHECKING:
@@ -175,7 +177,7 @@ class FilterRegistry:
 
 def get_filter_for_path(
     path: bytes,
-    gitattributes: dict[bytes, dict[bytes, bytes]],
+    gitattributes: dict[bytes, dict[bytes, AttributeValue]],
     filter_registry: FilterRegistry,
 ) -> Optional[FilterDriver]:
     """Get the appropriate filter driver for a given path.
@@ -188,21 +190,24 @@ def get_filter_for_path(
     Returns:
         FilterDriver instance or None
     """
-    # For now, this is a simple implementation that does exact path matching
-    # In a real implementation, we'd need to handle glob patterns
-
-    # Check each pattern in gitattributes
-    for pattern, attrs in gitattributes.items():
-        # Simple implementation: just check if path matches pattern exactly
-        # TODO: Implement proper gitattributes pattern matching
-        if pattern == path or (pattern.startswith(b"*") and path.endswith(pattern[1:])):
-            filter_name_bytes = attrs.get(b"filter")
-            if filter_name_bytes is not None:
-                if isinstance(filter_name_bytes, bytes):
-                    filter_name_str = filter_name_bytes.decode("utf-8")
-                else:
-                    filter_name_str = filter_name_bytes
-                return filter_registry.get_driver(filter_name_str)
+    # Convert gitattributes dict to list of (Pattern, attrs) tuples
+    patterns: list[tuple[Pattern, Mapping[bytes, AttributeValue]]] = []
+    for pattern_bytes, attrs in gitattributes.items():
+        pattern = Pattern(pattern_bytes)
+        patterns.append((pattern, attrs))
+
+    # Get all attributes for this path
+    attributes = match_path(patterns, path)
+
+    # Check if there's a filter attribute
+    filter_name = attributes.get(b"filter")
+    if filter_name is not None:
+        if isinstance(filter_name, bool):
+            return None
+        if isinstance(filter_name, bytes):
+            filter_name_str = filter_name.decode("utf-8")
+            return filter_registry.get_driver(filter_name_str)
+        return None
 
     return None
 
@@ -216,7 +221,7 @@ class FilterBlobNormalizer:
     def __init__(
         self,
         config_stack: Optional["StackedConfig"],
-        gitattributes: dict[bytes, dict[bytes, bytes]],
+        gitattributes: dict[bytes, dict[bytes, AttributeValue]],
         filter_registry: Optional[FilterRegistry] = None,
         repo=None,
     ) -> None:

+ 58 - 0
dulwich/repo.py

@@ -49,6 +49,7 @@ if TYPE_CHECKING:
     # There are no circular imports here, but we try to defer imports as long
     # as possible to reduce start-up time for anything that doesn't need
     # these imports.
+    from .attrs import GitAttributes
     from .config import ConditionMatcher, ConfigFile, StackedConfig
     from .index import Index
     from .notes import Notes
@@ -2074,6 +2075,63 @@ class Repo(BaseRepo):
         except KeyError:
             return BlobNormalizer(config_stack, git_attributes)
 
+    def get_gitattributes(self, tree: Optional[bytes] = None) -> "GitAttributes":
+        """Read gitattributes for the repository.
+
+        Patterns are gathered from the top-level ``.gitattributes`` blob of
+        the given tree first and from ``info/attributes`` in the control
+        directory second, so the latter takes precedence when matching.
+
+        Args:
+            tree: Tree SHA to read .gitattributes from (defaults to HEAD)
+
+        Returns:
+            GitAttributes object that can be used to match paths
+        """
+        from .attrs import (
+            GitAttributes,
+            Pattern,
+            parse_git_attributes,
+        )
+
+        def _compile(stream):
+            # Turn every parsed (pattern, attrs) line into a compiled entry.
+            return [
+                (Pattern(pattern_bytes), attrs)
+                for pattern_bytes, attrs in parse_git_attributes(stream)
+            ]
+
+        patterns = []
+
+        # Read system gitattributes (TODO: implement this)
+        # Read global gitattributes (TODO: implement this)
+
+        # Read repository .gitattributes from the tree
+        if tree is None:
+            try:
+                # Try to get from HEAD
+                head = self[b"HEAD"]
+                if isinstance(head, Tag):
+                    _cls, obj = head.object
+                    head = self.get_object(obj)
+                tree = head.tree
+            except KeyError:
+                # No HEAD, no attributes from tree
+                pass
+
+        if tree is not None:
+            try:
+                tree_obj = self[tree]
+                if b".gitattributes" in tree_obj:
+                    _, attrs_sha = tree_obj[b".gitattributes"]
+                    attrs_blob = self[attrs_sha]
+                    # Only a blob entry can hold attribute lines.
+                    if isinstance(attrs_blob, Blob):
+                        patterns.extend(_compile(BytesIO(attrs_blob.data)))
+            except (KeyError, NotTreeError):
+                pass
+
+        # Read .git/info/attributes. Open directly instead of checking for
+        # existence first, so a file removed in between cannot cause a crash.
+        info_attrs_path = os.path.join(self.controldir(), "info", "attributes")
+        try:
+            f = open(info_attrs_path, "rb")
+        except (FileNotFoundError, NotADirectoryError):
+            pass
+        else:
+            with f:
+                patterns.extend(_compile(f))
+
+        return GitAttributes(patterns)
+
     def _sparse_checkout_file_path(self) -> str:
         """Return the path of the sparse-checkout file in this repo's control dir."""
         return os.path.join(self.controldir(), "info", "sparse-checkout")

+ 1 - 0
tests/__init__.py

@@ -117,6 +117,7 @@ def self_test_suite():
     names = [
         "annotate",
         "archive",
+        "attrs",
         "blackbox",
         "bundle",
         "cli",

+ 458 - 0
tests/test_attrs.py

@@ -0,0 +1,458 @@
+# test_attrs.py -- tests for gitattributes
+# Copyright (C) 2019-2020 Collabora Ltd
+# Copyright (C) 2019-2020 Andrej Shadura <andrew.shadura@collabora.co.uk>
+#
+# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
+# General Public License as published by the Free Software Foundation; version 2.0
+# or (at your option) any later version. You can redistribute it and/or
+# modify it under the terms of either of these two licenses.
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# You should have received a copy of the licenses; if not, see
+# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
+# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
+# License, Version 2.0.
+#
+
+"""Tests for gitattributes parsing and matching."""
+
+import os
+import tempfile
+from io import BytesIO
+
+from dulwich.attrs import (
+    GitAttributes,
+    Pattern,
+    _parse_attr,
+    match_path,
+    parse_git_attributes,
+    parse_gitattributes_file,
+    read_gitattributes,
+)
+
+from . import TestCase
+
+
+class ParseAttrTests(TestCase):
+    """Test the _parse_attr function."""
+
+    def test_parse_set_attr(self):
+        """Test parsing a set attribute."""
+        name, value = _parse_attr(b"text")
+        self.assertEqual(name, b"text")
+        # Identity check: assertEqual would also accept 1 in place of True.
+        self.assertIs(value, True)
+
+    def test_parse_unset_attr(self):
+        """Test parsing an unset attribute."""
+        name, value = _parse_attr(b"-text")
+        self.assertEqual(name, b"text")
+        # Identity check: assertEqual would also accept 0 in place of False.
+        self.assertIs(value, False)
+
+    def test_parse_unspecified_attr(self):
+        """Test parsing an unspecified attribute."""
+        name, value = _parse_attr(b"!text")
+        self.assertEqual(name, b"text")
+        self.assertIsNone(value)
+
+    def test_parse_value_attr(self):
+        """Test parsing an attribute with a value."""
+        name, value = _parse_attr(b"diff=python")
+        self.assertEqual(name, b"diff")
+        self.assertEqual(value, b"python")
+
+    def test_parse_value_with_equals(self):
+        """Test parsing an attribute value containing equals."""
+        name, value = _parse_attr(b"filter=foo=bar")
+        self.assertEqual(name, b"filter")
+        self.assertEqual(value, b"foo=bar")
+
+
+class ParseGitAttributesTests(TestCase):
+    """Test the parse_git_attributes function."""
+
+    def _parse(self, content):
+        """Run parse_git_attributes over in-memory bytes and collect the result."""
+        return list(parse_git_attributes(BytesIO(content)))
+
+    def test_parse_empty(self):
+        """An empty file yields no entries."""
+        self.assertEqual(self._parse(b""), [])
+
+    def test_parse_comments(self):
+        """A file made only of comments yields no entries."""
+        content = b"# This is a comment\n" b"# Another comment\n"
+        self.assertEqual(self._parse(content), [])
+
+    def test_parse_single_pattern(self):
+        """One pattern line yields one entry."""
+        self.assertEqual(self._parse(b"*.txt text"), [(b"*.txt", {b"text": True})])
+
+    def test_parse_multiple_attributes(self):
+        """Several attribute tokens on one line end up in one mapping."""
+        self.assertEqual(
+            self._parse(b"*.jpg -text -diff binary"),
+            [(b"*.jpg", {b"text": False, b"diff": False, b"binary": True})],
+        )
+
+    def test_parse_attributes_with_values(self):
+        """``name=value`` tokens keep their value as bytes."""
+        self.assertEqual(
+            self._parse(b"*.c filter=indent diff=cpp text"),
+            [(b"*.c", {b"filter": b"indent", b"diff": b"cpp", b"text": True})],
+        )
+
+    def test_parse_multiple_patterns(self):
+        """Several pattern lines are reported in file order."""
+        content = b"*.txt text\n" b"*.jpg -text binary\n" b"*.py diff=python\n"
+        self.assertEqual(
+            self._parse(content),
+            [
+                (b"*.txt", {b"text": True}),
+                (b"*.jpg", {b"text": False, b"binary": True}),
+                (b"*.py", {b"diff": b"python"}),
+            ],
+        )
+
+    def test_parse_git_lfs_example(self):
+        """The Git LFS example from the parse_git_attributes docstring."""
+        content = (
+            b"*.tar.* filter=lfs diff=lfs merge=lfs -text\n"
+            b"\n"
+            b"# store signatures in Git\n"
+            b"*.tar.*.asc -filter -diff merge=binary -text\n"
+            b"\n"
+            b"# store .dsc verbatim\n"
+            b"*.dsc -filter !diff merge=binary !text\n"
+        )
+        lfs = {b"filter": b"lfs", b"diff": b"lfs", b"merge": b"lfs", b"text": False}
+        signatures = {
+            b"filter": False,
+            b"diff": False,
+            b"merge": b"binary",
+            b"text": False,
+        }
+        dsc = {b"filter": False, b"diff": None, b"merge": b"binary", b"text": None}
+        self.assertEqual(
+            self._parse(content),
+            [(b"*.tar.*", lfs), (b"*.tar.*.asc", signatures), (b"*.dsc", dsc)],
+        )
+
+
+class PatternTests(TestCase):
+    """Test the Pattern class."""
+
+    def _check(self, pattern_bytes, expectations):
+        """Compile a pattern and assert, in order, which paths it matches.
+
+        ``expectations`` maps each path to True (must match) or False (must
+        not match).
+        """
+        pattern = Pattern(pattern_bytes)
+        for path, should_match in expectations.items():
+            verify = self.assertTrue if should_match else self.assertFalse
+            verify(pattern.match(path))
+
+    def test_exact_match(self):
+        """Test exact filename matching without path."""
+        self._check(
+            b"README.txt",
+            {
+                b"README.txt": True,
+                b"readme.txt": False,
+                # Patterns without slashes match at any level
+                b"src/README.txt": True,
+            },
+        )
+
+    def test_wildcard_extension(self):
+        """Test wildcard extension matching."""
+        self._check(
+            b"*.txt",
+            {
+                b"file.txt": True,
+                b"README.txt": True,
+                b"src/doc.txt": True,
+                b"file.txt.bak": False,
+                b"file.md": False,
+            },
+        )
+
+    def test_wildcard_in_name(self):
+        """Test wildcard in filename."""
+        self._check(
+            b"test_*.py",
+            {
+                b"test_foo.py": True,
+                b"test_bar.py": True,
+                b"src/test_baz.py": True,
+                b"test.py": False,
+                b"tests.py": False,
+            },
+        )
+
+    def test_question_mark(self):
+        """Test question mark matching."""
+        self._check(
+            b"file?.txt",
+            {
+                b"file1.txt": True,
+                b"fileA.txt": True,
+                b"file.txt": False,
+                b"file10.txt": False,
+            },
+        )
+
+    def test_character_class(self):
+        """Test character class matching."""
+        self._check(
+            b"file[0-9].txt",
+            {
+                b"file0.txt": True,
+                b"file5.txt": True,
+                b"file9.txt": True,
+                b"fileA.txt": False,
+                b"file10.txt": False,
+            },
+        )
+
+    def test_negated_character_class(self):
+        """Test negated character class."""
+        self._check(
+            b"file[!0-9].txt",
+            {
+                b"fileA.txt": True,
+                b"file_.txt": True,
+                b"file0.txt": False,
+                b"file5.txt": False,
+            },
+        )
+
+    def test_directory_pattern(self):
+        """Test pattern with directory."""
+        self._check(
+            b"src/*.py",
+            {
+                b"src/foo.py": True,
+                b"src/bar.py": True,
+                b"foo.py": False,
+                b"src/sub/foo.py": False,
+                b"other/foo.py": False,
+            },
+        )
+
+    def test_double_asterisk(self):
+        """Test double asterisk matching."""
+        self._check(
+            b"**/foo.txt",
+            {
+                b"foo.txt": True,
+                b"src/foo.txt": True,
+                b"src/sub/foo.txt": True,
+                b"a/b/c/foo.txt": True,
+            },
+        )
+
+    def test_double_asterisk_middle(self):
+        """Test double asterisk in middle."""
+        self._check(
+            b"src/**/foo.txt",
+            {
+                b"src/foo.txt": True,
+                b"src/sub/foo.txt": True,
+                b"src/a/b/foo.txt": True,
+                b"foo.txt": False,
+                b"other/foo.txt": False,
+            },
+        )
+
+    def test_leading_slash(self):
+        """Test pattern with leading slash."""
+        self._check(
+            b"/README.txt",
+            {
+                b"README.txt": True,
+                b"/README.txt": True,
+                b"src/README.txt": False,
+            },
+        )
+
+
+class MatchPathTests(TestCase):
+    """Test the match_path function."""
+
+    def _text_and_binary_rules(self):
+        """Rule list shared by the no-match and single-match tests."""
+        return [
+            (Pattern(b"*.txt"), {b"text": True}),
+            (Pattern(b"*.jpg"), {b"binary": True}),
+        ]
+
+    def test_no_matches(self):
+        """A path matched by no pattern gets no attributes."""
+        self.assertEqual(match_path(self._text_and_binary_rules(), b"file.py"), {})
+
+    def test_single_match(self):
+        """A path matched by one pattern gets that pattern's attributes."""
+        resolved = match_path(self._text_and_binary_rules(), b"README.txt")
+        self.assertEqual(resolved, {b"text": True})
+
+    def test_multiple_matches_override(self):
+        """Later patterns override earlier ones."""
+        rules = [
+            (Pattern(b"*"), {b"text": True}),
+            (Pattern(b"*.jpg"), {b"text": False, b"binary": True}),
+        ]
+        resolved = match_path(rules, b"image.jpg")
+        self.assertEqual(resolved, {b"text": False, b"binary": True})
+
+    def test_unspecified_removes_attribute(self):
+        """An unspecified (None) state removes an attribute set earlier."""
+        rules = [
+            (Pattern(b"*"), {b"text": True, b"diff": True}),
+            (Pattern(b"*.bin"), {b"text": None, b"binary": True}),
+        ]
+        resolved = match_path(rules, b"file.bin")
+        self.assertEqual(resolved, {b"diff": True, b"binary": True})
+        # 'text' should be removed
+        self.assertNotIn(b"text", resolved)
+
+
+class FileOperationsTests(TestCase):
+    """Test file operations."""
+
+    def test_parse_gitattributes_file(self):
+        """parse_gitattributes_file() compiles every line of a file on disk."""
+        with tempfile.NamedTemporaryFile(mode="wb", delete=False) as handle:
+            temp_path = handle.name
+            handle.write(b"*.txt text\n")
+            handle.write(b"*.jpg -text binary\n")
+
+        try:
+            parsed = parse_gitattributes_file(temp_path)
+            self.assertEqual(len(parsed), 2)
+            (txt_pattern, txt_attrs), (jpg_pattern, jpg_attrs) = parsed
+
+            # Check first pattern
+            self.assertEqual(txt_pattern.pattern, b"*.txt")
+            self.assertEqual(txt_attrs, {b"text": True})
+
+            # Check second pattern
+            self.assertEqual(jpg_pattern.pattern, b"*.jpg")
+            self.assertEqual(jpg_attrs, {b"text": False, b"binary": True})
+        finally:
+            os.unlink(temp_path)
+
+    def test_read_gitattributes(self):
+        """read_gitattributes() finds .gitattributes inside a directory."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Create .gitattributes file
+            with open(os.path.join(tmpdir, ".gitattributes"), "wb") as handle:
+                handle.write(b"*.py diff=python\n")
+
+            parsed = read_gitattributes(tmpdir)
+            self.assertEqual(len(parsed), 1)
+
+            py_pattern, py_attrs = parsed[0]
+            self.assertEqual(py_pattern.pattern, b"*.py")
+            self.assertEqual(py_attrs, {b"diff": b"python"})
+
+    def test_read_gitattributes_missing(self):
+        """read_gitattributes() returns an empty list when the file is absent."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            self.assertEqual(read_gitattributes(tmpdir), [])
+
+
+class GitAttributesTests(TestCase):
+    """Test the GitAttributes class."""
+
+    def test_empty_gitattributes(self):
+        """A collection without patterns matches nothing."""
+        collection = GitAttributes()
+        self.assertEqual(collection.match_path(b"file.txt"), {})
+        self.assertEqual(len(collection), 0)
+
+    def test_gitattributes_with_patterns(self):
+        """Patterns handed to the constructor are used for matching."""
+        collection = GitAttributes(
+            [
+                (Pattern(b"*.txt"), {b"text": True}),
+                (Pattern(b"*.jpg"), {b"binary": True, b"text": False}),
+            ]
+        )
+
+        # Test matching .txt file
+        self.assertEqual(collection.match_path(b"README.txt"), {b"text": True})
+
+        # Test matching .jpg file
+        self.assertEqual(
+            collection.match_path(b"image.jpg"), {b"binary": True, b"text": False}
+        )
+
+        # Test non-matching file
+        self.assertEqual(collection.match_path(b"script.py"), {})
+
+        self.assertEqual(len(collection), 2)
+
+    def test_add_patterns(self):
+        """Patterns added after construction take part in matching."""
+        collection = GitAttributes()
+        self.assertEqual(len(collection), 0)
+
+        # Add patterns
+        extra = [
+            (Pattern(b"*.py"), {b"diff": b"python"}),
+            (Pattern(b"*.md"), {b"text": True}),
+        ]
+        collection.add_patterns(extra)
+
+        self.assertEqual(len(collection), 2)
+        self.assertEqual(collection.match_path(b"test.py"), {b"diff": b"python"})
+
+    def test_iteration(self):
+        """Iterating yields the entries in insertion order."""
+        collection = GitAttributes(
+            [
+                (Pattern(b"*.txt"), {b"text": True}),
+                (Pattern(b"*.jpg"), {b"binary": True}),
+            ]
+        )
+
+        entries = list(collection)
+        self.assertEqual(len(entries), 2)
+        self.assertEqual(entries[0][0].pattern, b"*.txt")
+        self.assertEqual(entries[1][0].pattern, b"*.jpg")
+
+    def test_from_file(self):
+        """from_file() builds a collection from a file on disk."""
+        with tempfile.NamedTemporaryFile(mode="wb", delete=False) as handle:
+            temp_path = handle.name
+            handle.write(b"*.txt text\n")
+            handle.write(b"*.bin -text binary\n")
+
+        try:
+            collection = GitAttributes.from_file(temp_path)
+            self.assertEqual(len(collection), 2)
+
+            self.assertEqual(collection.match_path(b"file.txt"), {b"text": True})
+            self.assertEqual(
+                collection.match_path(b"file.bin"), {b"text": False, b"binary": True}
+            )
+        finally:
+            os.unlink(temp_path)
+
+    def test_from_path(self):
+        """from_path() reads .gitattributes from a directory."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Create .gitattributes file
+            with open(os.path.join(tmpdir, ".gitattributes"), "wb") as handle:
+                handle.write(b"*.py diff=python\n")
+                handle.write(b"*.rs diff=rust\n")
+
+            collection = GitAttributes.from_path(tmpdir)
+            self.assertEqual(len(collection), 2)
+
+            self.assertEqual(collection.match_path(b"main.py"), {b"diff": b"python"})
+            self.assertEqual(collection.match_path(b"lib.rs"), {b"diff": b"rust"})

+ 29 - 0
tests/test_repository.py

@@ -341,6 +341,35 @@ class RepositoryRootTests(TestCase):
         r.set_description(description)
         self.assertEqual(description, r.get_description())
 
+    def test_get_gitattributes(self) -> None:
+        """get_gitattributes() picks up patterns from .git/info/attributes."""
+        from dulwich.attrs import GitAttributes
+
+        # Test when no .gitattributes file exists
+        r = self.open_repo("a.git")
+        attrs = r.get_gitattributes()
+
+        self.assertIsInstance(attrs, GitAttributes)
+        self.assertEqual(len(attrs), 0)
+
+        # Create .git/info/attributes file (which is read by get_gitattributes)
+        info_dir = os.path.join(r.controldir(), "info")
+        # exist_ok avoids the separate existence check and its race window.
+        os.makedirs(info_dir, exist_ok=True)
+        attrs_path = os.path.join(info_dir, "attributes")
+        with open(attrs_path, "wb") as f:
+            f.write(b"*.txt text\n")
+            f.write(b"*.jpg -text binary\n")
+
+        # Test with attributes file
+        attrs = r.get_gitattributes()
+        self.assertEqual(len(attrs), 2)
+
+        # Test matching
+        txt_attrs = attrs.match_path(b"file.txt")
+        self.assertEqual(txt_attrs, {b"text": True})
+
+        jpg_attrs = attrs.match_path(b"image.jpg")
+        self.assertEqual(jpg_attrs, {b"text": False, b"binary": True})
+
     def test_contains_missing(self) -> None:
         r = self.open_repo("a.git")
         self.assertNotIn(b"bar", r)