Browse Source

Add support for reading GitAttributes (#1662)

Jelmer Vernooij 1 month ago
parent
commit
522d961303
7 changed files with 910 additions and 18 deletions
  1. 6 1
      NEWS
  2. 336 0
      dulwich/attrs.py
  3. 22 17
      dulwich/filters.py
  4. 58 0
      dulwich/repo.py
  5. 1 0
      tests/__init__.py
  6. 458 0
      tests/test_attrs.py
  7. 29 0
      tests/test_repository.py

+ 6 - 1
NEWS

@@ -3,7 +3,7 @@
  * Only write Git index extensions when they contain meaningful data.
  * Only write Git index extensions when they contain meaningful data.
    Previously, dulwich would write empty extensions to the index file,
    Previously, dulwich would write empty extensions to the index file,
    causing unnecessary bloat.
    causing unnecessary bloat.
-   (Jelmer Vernooij, #1643)
+   (Andrew Shadura, Jelmer Vernooij, #1643)
 
 
  * Document that ``porcelain.push`` returns per-ref status information
  * Document that ``porcelain.push`` returns per-ref status information
    in the ``SendPackResult`` object. Added test coverage to verify this
    in the ``SendPackResult`` object. Added test coverage to verify this
@@ -14,6 +14,11 @@
    CLI command, and ``submodule_update`` CLI command. Add ``--recurse-submodules``
    CLI command, and ``submodule_update`` CLI command. Add ``--recurse-submodules``
    option to ``clone`` command. (#506, Jelmer Vernooij)
    option to ``clone`` command. (#506, Jelmer Vernooij)
 
 
+ * Add support for parsing Git attributes from .gitattributes files.
+   This enables proper handling of text/binary detection, line ending
+   normalization, and filter specifications for files.
+   (Jelmer Vernooij, #1211)
+
 0.23.1	2025-06-30
 0.23.1	2025-06-30
 
 
  * Support ``untracked_files="normal"`` argument to ``porcelain.status``,
  * Support ``untracked_files="normal"`` argument to ``porcelain.status``,

+ 336 - 0
dulwich/attrs.py

@@ -0,0 +1,336 @@
+# attrs.py -- Git attributes for dulwich
+# Copyright (C) 2019-2020 Collabora Ltd
+# Copyright (C) 2019-2020 Andrej Shadura <andrew.shadura@collabora.co.uk>
+#
+# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
+# General Public License as published by the Free Software Foundation; version 2.0
+# or (at your option) any later version. You can redistribute it and/or
+# modify it under the terms of either of these two licenses.
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# You should have received a copy of the licenses; if not, see
+# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
+# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
+# License, Version 2.0.
+#
+
+"""Parse .gitattributes file."""
+
+import os
+import re
+from collections.abc import Generator, Mapping
+from typing import (
+    IO,
+    Optional,
+    Union,
+)
+
+AttributeValue = Union[bytes, bool, None]
+
+
+def _parse_attr(attr: bytes) -> tuple[bytes, AttributeValue]:
+    """Split one attribute token into its name and its state.
+
+    A leading ``!`` marks the attribute as unspecified (``None``), a leading
+    ``-`` marks it as unset (``False``), ``name=value`` carries a bytes value
+    and a bare name means the attribute is set (``True``).
+
+    >>> _parse_attr(b'attr')
+    (b'attr', True)
+    >>> _parse_attr(b'-attr')
+    (b'attr', False)
+    >>> _parse_attr(b'!attr')
+    (b'attr', None)
+    >>> _parse_attr(b'attr=text')
+    (b'attr', b'text')
+    """
+    marker, remainder = attr[:1], attr[1:]
+    if marker == b"!":
+        return remainder, None
+    if marker == b"-":
+        return remainder, False
+    # partition() cuts at the first "=" only, so a value may itself contain "=".
+    key, separator, setting = attr.partition(b"=")
+    if separator:
+        return key, setting
+    return attr, True
+
+
+def parse_git_attributes(
+    f: IO[bytes],
+) -> Generator[tuple[bytes, Mapping[bytes, AttributeValue]], None, None]:
+    """Parse the contents of a Git attributes file.
+
+    Blank lines and lines starting with ``#`` are skipped. Every other line
+    is split on whitespace: the first field is the pattern and the remaining
+    fields are attribute tokens, each parsed with ``_parse_attr``.
+
+    Args:
+      f: File-like object to read bytes from
+    Yields:
+      ``(pattern, attributes)`` tuples in the order in which they are
+      encountered. The pattern and the attribute names are bytes; attribute
+      values are bytes, ``True``, ``False`` or ``None``.
+    >>> from io import BytesIO
+    >>> list(parse_git_attributes(BytesIO(b'''*.tar.* filter=lfs diff=lfs merge=lfs -text
+    ...
+    ... # store signatures in Git
+    ... *.tar.*.asc -filter -diff merge=binary -text
+    ...
+    ... # store .dsc verbatim
+    ... *.dsc -filter !diff merge=binary !text
+    ... '''))) #doctest: +NORMALIZE_WHITESPACE
+    [(b'*.tar.*', {b'filter': b'lfs', b'diff': b'lfs', b'merge': b'lfs', b'text': False}),
+     (b'*.tar.*.asc', {b'filter': False, b'diff': False, b'merge': b'binary', b'text': False}),
+     (b'*.dsc', {b'filter': False, b'diff': None, b'merge': b'binary', b'text': None})]
+    """
+    for raw_line in f:
+        line = raw_line.strip()
+
+        # Blank lines are only there for readability; "#" starts a comment.
+        if not line or line.startswith(b"#"):
+            continue
+
+        pattern, *attrs = line.split()
+
+        # A later token for the same attribute name overrides an earlier one.
+        yield (pattern, dict(map(_parse_attr, attrs)))
+
+
+def _translate_pattern(pattern: bytes) -> bytes:
+    """Convert a gitattributes glob into the source of a regular expression.
+
+    The result carries no anchors. A pattern without any ``/`` is prefixed so
+    that it can match at every directory level; a leading ``/`` is dropped so
+    the pattern is matched from the start of the path. ``*`` and ``?`` never
+    cross a ``/``, ``**/`` matches zero or more directories and any other
+    ``**`` matches arbitrary text. An unterminated ``[`` is taken literally.
+    """
+    any_dirs = b"(?:.*/)??"
+    pieces = []
+
+    if b"/" not in pattern:
+        # No slash at all: the name may live in any directory.
+        pieces.append(any_dirs)
+    elif pattern[:1] == b"/":
+        # Leading slash only pins the pattern to the repository root.
+        pattern = pattern[1:]
+
+    end = len(pattern)
+    pos = 0
+    while pos < end:
+        char = pattern[pos : pos + 1]
+        pos += 1
+
+        if char == b"?":
+            pieces.append(b"[^/]")
+        elif char == b"*":
+            # Slices past the end are empty, so no explicit bounds checks.
+            if pattern[pos : pos + 1] != b"*":
+                pieces.append(b"[^/]*")
+            elif pattern[pos + 1 : pos + 2] == b"/":
+                # "**/" consumes the second star and the slash.
+                pieces.append(any_dirs)
+                pos += 2
+            else:
+                # "**" at the end or in the middle of the pattern.
+                pieces.append(b".*")
+                pos += 1
+        elif char == b"[":
+            close = pos
+            if pattern[close : close + 1] == b"!":
+                close += 1
+            if pattern[close : close + 1] == b"]":
+                # A "]" directly after "[" or "[!" belongs to the class.
+                close += 1
+            close = pattern.find(b"]", close)
+            if close == -1:
+                pieces.append(b"\\[")
+            else:
+                members = pattern[pos:close].replace(b"\\", b"\\\\")
+                pos = close + 1
+                if members[:1] == b"!":
+                    members = b"^" + members[1:]
+                elif members[:1] == b"^":
+                    members = b"\\" + members
+                pieces.append(b"[" + members + b"]")
+        else:
+            pieces.append(re.escape(char))
+
+    return b"".join(pieces)
+
+
+class Pattern:
+    """A single gitattributes pattern, compiled to a regular expression."""
+
+    def __init__(self, pattern: bytes):
+        """Store the raw pattern and compile it.
+
+        Args:
+            pattern: The gitattributes pattern as it appears in the file
+        """
+        self.pattern = pattern
+        self._regex: Optional[re.Pattern[bytes]] = None
+        self._compile()
+
+    def __repr__(self) -> str:
+        """Return a representation showing the raw pattern."""
+        return f"{type(self).__name__}({self.pattern!r})"
+
+    def _compile(self) -> None:
+        """Compile the pattern to a regular expression."""
+        # No "^"/"$" anchors are added: match() uses fullmatch(), which has
+        # to consume the whole path. A "$" anchor would also succeed just
+        # before a trailing newline and wrongly accept b"a.txt\n" for "*.txt".
+        self._regex = re.compile(_translate_pattern(self.pattern))
+
+    def match(self, path: bytes) -> bool:
+        """Check if path matches this pattern.
+
+        Args:
+            path: Path to check (relative to repository root, using / separators)
+
+        Returns:
+            True if the whole path matches this pattern
+        """
+        # Normalize path: a single leading slash is ignored.
+        if path.startswith(b"/"):
+            path = path[1:]
+
+        assert self._regex is not None  # Always set by _compile()
+        return self._regex.fullmatch(path) is not None
+
+
+def match_path(
+    patterns: list[tuple[Pattern, Mapping[bytes, AttributeValue]]], path: bytes
+) -> dict[bytes, AttributeValue]:
+    """Resolve the attributes of a path against an ordered pattern list.
+
+    Patterns are applied in list order, so an entry further down the list
+    overrides what an earlier matching entry stated for the same attribute.
+
+    Args:
+        patterns: List of (Pattern, attributes) tuples
+        path: Path to match (relative to repository root)
+
+    Returns:
+        Dictionary of attributes that apply to this path
+    """
+    resolved: dict[bytes, AttributeValue] = {}
+
+    for candidate, candidate_attrs in patterns:
+        if not candidate.match(path):
+            continue
+        for key, setting in candidate_attrs.items():
+            if setting is not None:
+                resolved[key] = setting
+            else:
+                # None means "unspecified": forget any earlier value.
+                resolved.pop(key, None)
+
+    return resolved
+
+
+def parse_gitattributes_file(
+    filename: Union[str, bytes],
+) -> list[tuple[Pattern, Mapping[bytes, AttributeValue]]]:
+    """Parse a gitattributes file and return compiled patterns.
+
+    Args:
+        filename: Path to the .gitattributes file, as str or bytes
+
+    Returns:
+        List of (Pattern, attributes) tuples, in file order
+
+    Raises:
+        OSError: If the file cannot be opened
+    """
+    # open() accepts str and bytes paths alike. The name is passed through
+    # untouched: re-encoding a str as UTF-8 would point at the wrong file
+    # whenever the filesystem encoding is not UTF-8.
+    with open(filename, "rb") as f:
+        return [
+            (Pattern(pattern_bytes), attrs)
+            for pattern_bytes, attrs in parse_git_attributes(f)
+        ]
+
+
+def read_gitattributes(
+    path: Union[str, bytes],
+) -> list[tuple[Pattern, Mapping[bytes, AttributeValue]]]:
+    """Read .gitattributes from a directory.
+
+    Args:
+        path: Directory path to check for .gitattributes, as str or bytes
+
+    Returns:
+        List of (Pattern, attributes) tuples; empty when the directory has
+        no .gitattributes file
+    """
+    # os.fsdecode() uses the filesystem encoding, so bytes paths that are
+    # not valid UTF-8 still resolve; str paths are returned unchanged.
+    gitattributes_path = os.path.join(os.fsdecode(path), ".gitattributes")
+
+    # Open directly instead of testing for existence first: the file could
+    # disappear between the check and the open.
+    try:
+        return parse_gitattributes_file(gitattributes_path)
+    except (FileNotFoundError, NotADirectoryError):
+        return []
+
+
+class GitAttributes:
+    """A collection of gitattributes patterns that can match paths."""
+
+    def __init__(
+        self,
+        patterns: Optional[list[tuple[Pattern, Mapping[bytes, AttributeValue]]]] = None,
+    ):
+        """Initialize GitAttributes.
+
+        Args:
+            patterns: Optional list of (Pattern, attributes) tuples
+        """
+        # Keep a private copy: add_patterns() extends this list in place and
+        # must not grow the list object the caller handed in.
+        self._patterns = list(patterns) if patterns is not None else []
+
+    def match_path(self, path: bytes) -> dict[bytes, AttributeValue]:
+        """Get attributes for a path by matching against patterns.
+
+        Args:
+            path: Path to match (relative to repository root)
+
+        Returns:
+            Dictionary of attributes that apply to this path
+        """
+        return match_path(self._patterns, path)
+
+    def add_patterns(
+        self, patterns: list[tuple[Pattern, Mapping[bytes, AttributeValue]]]
+    ) -> None:
+        """Add patterns to the end of the collection.
+
+        Args:
+            patterns: List of (Pattern, attributes) tuples to add
+        """
+        self._patterns.extend(patterns)
+
+    def __len__(self) -> int:
+        """Return the number of patterns."""
+        return len(self._patterns)
+
+    def __iter__(self):
+        """Iterate over the (Pattern, attributes) tuples in order."""
+        return iter(self._patterns)
+
+    @classmethod
+    def from_file(cls, filename: Union[str, bytes]) -> "GitAttributes":
+        """Create GitAttributes from a gitattributes file.
+
+        Args:
+            filename: Path to the .gitattributes file
+
+        Returns:
+            New GitAttributes instance
+        """
+        return cls(parse_gitattributes_file(filename))
+
+    @classmethod
+    def from_path(cls, path: Union[str, bytes]) -> "GitAttributes":
+        """Create GitAttributes from .gitattributes in a directory.
+
+        Args:
+            path: Directory path to check for .gitattributes
+
+        Returns:
+            New GitAttributes instance
+        """
+        return cls(read_gitattributes(path))

+ 22 - 17
dulwich/filters.py

@@ -22,8 +22,10 @@
 """Implementation of Git filter drivers (clean/smudge filters)."""
 """Implementation of Git filter drivers (clean/smudge filters)."""
 
 
 import subprocess
 import subprocess
+from collections.abc import Mapping
 from typing import TYPE_CHECKING, Callable, Optional, Protocol
 from typing import TYPE_CHECKING, Callable, Optional, Protocol
 
 
+from .attrs import AttributeValue, Pattern, match_path
 from .objects import Blob
 from .objects import Blob
 
 
 if TYPE_CHECKING:
 if TYPE_CHECKING:
@@ -175,7 +177,7 @@ class FilterRegistry:
 
 
 def get_filter_for_path(
 def get_filter_for_path(
     path: bytes,
     path: bytes,
-    gitattributes: dict[bytes, dict[bytes, bytes]],
+    gitattributes: dict[bytes, dict[bytes, AttributeValue]],
     filter_registry: FilterRegistry,
     filter_registry: FilterRegistry,
 ) -> Optional[FilterDriver]:
 ) -> Optional[FilterDriver]:
     """Get the appropriate filter driver for a given path.
     """Get the appropriate filter driver for a given path.
@@ -188,21 +190,24 @@ def get_filter_for_path(
     Returns:
     Returns:
         FilterDriver instance or None
         FilterDriver instance or None
     """
     """
-    # For now, this is a simple implementation that does exact path matching
-    # In a real implementation, we'd need to handle glob patterns
-
-    # Check each pattern in gitattributes
-    for pattern, attrs in gitattributes.items():
-        # Simple implementation: just check if path matches pattern exactly
-        # TODO: Implement proper gitattributes pattern matching
-        if pattern == path or (pattern.startswith(b"*") and path.endswith(pattern[1:])):
-            filter_name_bytes = attrs.get(b"filter")
-            if filter_name_bytes is not None:
-                if isinstance(filter_name_bytes, bytes):
-                    filter_name_str = filter_name_bytes.decode("utf-8")
-                else:
-                    filter_name_str = filter_name_bytes
-                return filter_registry.get_driver(filter_name_str)
+    # Convert gitattributes dict to list of (Pattern, attrs) tuples
+    patterns: list[tuple[Pattern, Mapping[bytes, AttributeValue]]] = []
+    for pattern_bytes, attrs in gitattributes.items():
+        pattern = Pattern(pattern_bytes)
+        patterns.append((pattern, attrs))
+
+    # Get all attributes for this path
+    attributes = match_path(patterns, path)
+
+    # Check if there's a filter attribute
+    filter_name = attributes.get(b"filter")
+    if filter_name is not None:
+        if isinstance(filter_name, bool):
+            return None
+        if isinstance(filter_name, bytes):
+            filter_name_str = filter_name.decode("utf-8")
+            return filter_registry.get_driver(filter_name_str)
+        return None
 
 
     return None
     return None
 
 
@@ -216,7 +221,7 @@ class FilterBlobNormalizer:
     def __init__(
     def __init__(
         self,
         self,
         config_stack: Optional["StackedConfig"],
         config_stack: Optional["StackedConfig"],
-        gitattributes: dict[bytes, dict[bytes, bytes]],
+        gitattributes: dict[bytes, dict[bytes, AttributeValue]],
         filter_registry: Optional[FilterRegistry] = None,
         filter_registry: Optional[FilterRegistry] = None,
         repo=None,
         repo=None,
     ) -> None:
     ) -> None:

+ 58 - 0
dulwich/repo.py

@@ -49,6 +49,7 @@ if TYPE_CHECKING:
     # There are no circular imports here, but we try to defer imports as long
     # There are no circular imports here, but we try to defer imports as long
     # as possible to reduce start-up time for anything that doesn't need
     # as possible to reduce start-up time for anything that doesn't need
     # these imports.
     # these imports.
+    from .attrs import GitAttributes
     from .config import ConditionMatcher, ConfigFile, StackedConfig
     from .config import ConditionMatcher, ConfigFile, StackedConfig
     from .index import Index
     from .index import Index
     from .notes import Notes
     from .notes import Notes
@@ -2074,6 +2075,63 @@ class Repo(BaseRepo):
         except KeyError:
         except KeyError:
             return BlobNormalizer(config_stack, git_attributes)
             return BlobNormalizer(config_stack, git_attributes)
 
 
+    def get_gitattributes(self, tree: Optional[bytes] = None) -> "GitAttributes":
+        """Collect the gitattributes that apply to this repository.
+
+        Patterns come from the ``.gitattributes`` blob at the top of ``tree``
+        first and from ``info/attributes`` in the control directory second.
+
+        Args:
+            tree: Tree SHA to read .gitattributes from (defaults to HEAD)
+
+        Returns:
+            GitAttributes object that can be used to match paths
+        """
+        from .attrs import (
+            GitAttributes,
+            Pattern,
+            parse_git_attributes,
+        )
+
+        def compile_entries(stream):
+            # Pair every parsed attribute mapping with its compiled pattern.
+            return [
+                (Pattern(raw_pattern), attrs)
+                for raw_pattern, attrs in parse_git_attributes(stream)
+            ]
+
+        collected = []
+
+        # Read system gitattributes (TODO: implement this)
+        # Read global gitattributes (TODO: implement this)
+
+        if tree is None:
+            # Fall back to the tree of HEAD, looking through one tag object.
+            try:
+                head = self[b"HEAD"]
+                if isinstance(head, Tag):
+                    _cls, obj = head.object
+                    head = self.get_object(obj)
+                tree = head.tree
+            except KeyError:
+                # No HEAD, no attributes from tree
+                pass
+
+        if tree is not None:
+            try:
+                tree_obj = self[tree]
+                if b".gitattributes" in tree_obj:
+                    _mode, blob_sha = tree_obj[b".gitattributes"]
+                    attrs_blob = self[blob_sha]
+                    if isinstance(attrs_blob, Blob):
+                        collected.extend(compile_entries(BytesIO(attrs_blob.data)))
+            except (KeyError, NotTreeError):
+                # Missing object or not a tree: nothing to read from it.
+                pass
+
+        # Entries from .git/info/attributes go last.
+        info_attrs_path = os.path.join(self.controldir(), "info", "attributes")
+        if os.path.exists(info_attrs_path):
+            with open(info_attrs_path, "rb") as f:
+                collected.extend(compile_entries(f))
+
+        return GitAttributes(collected)
+
     def _sparse_checkout_file_path(self) -> str:
     def _sparse_checkout_file_path(self) -> str:
         """Return the path of the sparse-checkout file in this repo's control dir."""
         """Return the path of the sparse-checkout file in this repo's control dir."""
         return os.path.join(self.controldir(), "info", "sparse-checkout")
         return os.path.join(self.controldir(), "info", "sparse-checkout")

+ 1 - 0
tests/__init__.py

@@ -117,6 +117,7 @@ def self_test_suite():
     names = [
     names = [
         "annotate",
         "annotate",
         "archive",
         "archive",
+        "attrs",
         "blackbox",
         "blackbox",
         "bundle",
         "bundle",
         "cli",
         "cli",

+ 458 - 0
tests/test_attrs.py

@@ -0,0 +1,458 @@
+# test_attrs.py -- tests for gitattributes
+# Copyright (C) 2019-2020 Collabora Ltd
+# Copyright (C) 2019-2020 Andrej Shadura <andrew.shadura@collabora.co.uk>
+#
+# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
+# General Public License as published by the Free Software Foundation; version 2.0
+# or (at your option) any later version. You can redistribute it and/or
+# modify it under the terms of either of these two licenses.
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# You should have received a copy of the licenses; if not, see
+# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
+# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
+# License, Version 2.0.
+#
+
+"""Tests for gitattributes parsing and matching."""
+
+import os
+import tempfile
+from io import BytesIO
+
+from dulwich.attrs import (
+    GitAttributes,
+    Pattern,
+    _parse_attr,
+    match_path,
+    parse_git_attributes,
+    parse_gitattributes_file,
+    read_gitattributes,
+)
+
+from . import TestCase
+
+
+class ParseAttrTests(TestCase):
+    """Exercise ``_parse_attr`` on every attribute form."""
+
+    def test_parse_set_attr(self):
+        """A bare name is a set attribute."""
+        parsed = _parse_attr(b"text")
+        self.assertEqual(parsed, (b"text", True))
+
+    def test_parse_unset_attr(self):
+        """A ``-`` prefix unsets the attribute."""
+        parsed = _parse_attr(b"-text")
+        self.assertEqual(parsed, (b"text", False))
+
+    def test_parse_unspecified_attr(self):
+        """A ``!`` prefix leaves the attribute unspecified."""
+        parsed = _parse_attr(b"!text")
+        self.assertEqual(parsed, (b"text", None))
+
+    def test_parse_value_attr(self):
+        """``name=value`` yields the value as bytes."""
+        parsed = _parse_attr(b"diff=python")
+        self.assertEqual(parsed, (b"diff", b"python"))
+
+    def test_parse_value_with_equals(self):
+        """Only the first ``=`` separates name and value."""
+        parsed = _parse_attr(b"filter=foo=bar")
+        self.assertEqual(parsed, (b"filter", b"foo=bar"))
+
+
+class ParseGitAttributesTests(TestCase):
+    """Test the parse_git_attributes function.
+
+    Each test feeds the parser an in-memory ``BytesIO`` and checks the list
+    of ``(pattern, attributes)`` tuples it produces.
+    """
+
+    def test_parse_empty(self):
+        """Test parsing empty file."""
+        attrs = list(parse_git_attributes(BytesIO(b"")))
+        self.assertEqual(attrs, [])
+
+    def test_parse_comments(self):
+        """Test parsing file with comments only: nothing is yielded."""
+        content = b"""# This is a comment
# Another comment
"""
+        attrs = list(parse_git_attributes(BytesIO(content)))
+        self.assertEqual(attrs, [])
+
+    def test_parse_single_pattern(self):
+        """Test parsing single pattern."""
+        # No trailing newline on purpose: the last line is still parsed.
+        content = b"*.txt text"
+        attrs = list(parse_git_attributes(BytesIO(content)))
+        self.assertEqual(len(attrs), 1)
+        pattern, attributes = attrs[0]
+        self.assertEqual(pattern, b"*.txt")
+        self.assertEqual(attributes, {b"text": True})
+
+    def test_parse_multiple_attributes(self):
+        """Test parsing pattern with multiple attributes."""
+        content = b"*.jpg -text -diff binary"
+        attrs = list(parse_git_attributes(BytesIO(content)))
+        self.assertEqual(len(attrs), 1)
+        pattern, attributes = attrs[0]
+        self.assertEqual(pattern, b"*.jpg")
+        self.assertEqual(attributes, {b"text": False, b"diff": False, b"binary": True})
+
+    def test_parse_attributes_with_values(self):
+        """Test parsing attributes with values."""
+        content = b"*.c filter=indent diff=cpp text"
+        attrs = list(parse_git_attributes(BytesIO(content)))
+        self.assertEqual(len(attrs), 1)
+        pattern, attributes = attrs[0]
+        self.assertEqual(pattern, b"*.c")
+        self.assertEqual(
+            attributes, {b"filter": b"indent", b"diff": b"cpp", b"text": True}
+        )
+
+    def test_parse_multiple_patterns(self):
+        """Test parsing multiple patterns, one per line, in file order."""
+        content = b"""*.txt text
*.jpg -text binary
*.py diff=python
"""
+        attrs = list(parse_git_attributes(BytesIO(content)))
+        self.assertEqual(len(attrs), 3)
+
+        # First pattern
+        pattern, attributes = attrs[0]
+        self.assertEqual(pattern, b"*.txt")
+        self.assertEqual(attributes, {b"text": True})
+
+        # Second pattern
+        pattern, attributes = attrs[1]
+        self.assertEqual(pattern, b"*.jpg")
+        self.assertEqual(attributes, {b"text": False, b"binary": True})
+
+        # Third pattern
+        pattern, attributes = attrs[2]
+        self.assertEqual(pattern, b"*.py")
+        self.assertEqual(attributes, {b"diff": b"python"})
+
+    def test_parse_git_lfs_example(self):
+        """Test parsing Git LFS example from docstring.
+
+        The input mixes blank lines, comments and all four attribute forms
+        (set by value, unset with ``-``, unspecified with ``!``).
+        """
+        content = b"""*.tar.* filter=lfs diff=lfs merge=lfs -text

# store signatures in Git
*.tar.*.asc -filter -diff merge=binary -text

# store .dsc verbatim
*.dsc -filter !diff merge=binary !text
"""
+        attrs = list(parse_git_attributes(BytesIO(content)))
+        # Blank lines and comment lines do not produce entries.
+        self.assertEqual(len(attrs), 3)
+
+        # LFS pattern
+        pattern, attributes = attrs[0]
+        self.assertEqual(pattern, b"*.tar.*")
+        self.assertEqual(
+            attributes,
+            {b"filter": b"lfs", b"diff": b"lfs", b"merge": b"lfs", b"text": False},
+        )
+
+        # Signatures pattern
+        pattern, attributes = attrs[1]
+        self.assertEqual(pattern, b"*.tar.*.asc")
+        self.assertEqual(
+            attributes,
+            {b"filter": False, b"diff": False, b"merge": b"binary", b"text": False},
+        )
+
+        # .dsc pattern
+        pattern, attributes = attrs[2]
+        self.assertEqual(pattern, b"*.dsc")
+        self.assertEqual(
+            attributes,
+            {b"filter": False, b"diff": None, b"merge": b"binary", b"text": None},
+        )
+
+
+class PatternTests(TestCase):
+    """Check which paths a compiled ``Pattern`` accepts and rejects."""
+
+    def _check(self, raw, accepted=(), rejected=()):
+        """Compile ``raw`` and assert the outcome for every listed path."""
+        compiled = Pattern(raw)
+        for path in accepted:
+            self.assertTrue(compiled.match(path), path)
+        for path in rejected:
+            self.assertFalse(compiled.match(path), path)
+
+    def test_exact_match(self):
+        """A plain filename matches case-sensitively at any level."""
+        self._check(
+            b"README.txt",
+            # Patterns without slashes match at any level
+            accepted=[b"README.txt", b"src/README.txt"],
+            rejected=[b"readme.txt"],
+        )
+
+    def test_wildcard_extension(self):
+        """``*.ext`` matches on the extension only."""
+        self._check(
+            b"*.txt",
+            accepted=[b"file.txt", b"README.txt", b"src/doc.txt"],
+            rejected=[b"file.txt.bak", b"file.md"],
+        )
+
+    def test_wildcard_in_name(self):
+        """A ``*`` inside the filename."""
+        self._check(
+            b"test_*.py",
+            accepted=[b"test_foo.py", b"test_bar.py", b"src/test_baz.py"],
+            rejected=[b"test.py", b"tests.py"],
+        )
+
+    def test_question_mark(self):
+        """``?`` stands for exactly one character."""
+        self._check(
+            b"file?.txt",
+            accepted=[b"file1.txt", b"fileA.txt"],
+            rejected=[b"file.txt", b"file10.txt"],
+        )
+
+    def test_character_class(self):
+        """A ``[0-9]`` character class."""
+        self._check(
+            b"file[0-9].txt",
+            accepted=[b"file0.txt", b"file5.txt", b"file9.txt"],
+            rejected=[b"fileA.txt", b"file10.txt"],
+        )
+
+    def test_negated_character_class(self):
+        """A ``[!...]`` negated character class."""
+        self._check(
+            b"file[!0-9].txt",
+            accepted=[b"fileA.txt", b"file_.txt"],
+            rejected=[b"file0.txt", b"file5.txt"],
+        )
+
+    def test_directory_pattern(self):
+        """A pattern containing a directory is matched from the root."""
+        self._check(
+            b"src/*.py",
+            accepted=[b"src/foo.py", b"src/bar.py"],
+            rejected=[b"foo.py", b"src/sub/foo.py", b"other/foo.py"],
+        )
+
+    def test_double_asterisk(self):
+        """A leading ``**/`` matches at any depth, including the root."""
+        self._check(
+            b"**/foo.txt",
+            accepted=[
+                b"foo.txt",
+                b"src/foo.txt",
+                b"src/sub/foo.txt",
+                b"a/b/c/foo.txt",
+            ],
+        )
+
+    def test_double_asterisk_middle(self):
+        """``/**/`` between components matches zero or more directories."""
+        self._check(
+            b"src/**/foo.txt",
+            accepted=[b"src/foo.txt", b"src/sub/foo.txt", b"src/a/b/foo.txt"],
+            rejected=[b"foo.txt", b"other/foo.txt"],
+        )
+
+    def test_leading_slash(self):
+        """A leading slash restricts the pattern to the root."""
+        self._check(
+            b"/README.txt",
+            accepted=[b"README.txt", b"/README.txt"],
+            rejected=[b"src/README.txt"],
+        )
+
+
+class MatchPathTests(TestCase):
+    """Exercise the module-level ``match_path`` helper."""
+
+    def test_no_matches(self):
+        """A path matched by no pattern has no attributes."""
+        rules = [
+            (Pattern(b"*.txt"), {b"text": True}),
+            (Pattern(b"*.jpg"), {b"binary": True}),
+        ]
+        self.assertEqual(match_path(rules, b"file.py"), {})
+
+    def test_single_match(self):
+        """Only the matching pattern contributes attributes."""
+        rules = [
+            (Pattern(b"*.txt"), {b"text": True}),
+            (Pattern(b"*.jpg"), {b"binary": True}),
+        ]
+        self.assertEqual(match_path(rules, b"README.txt"), {b"text": True})
+
+    def test_multiple_matches_override(self):
+        """A later matching pattern wins over an earlier one."""
+        rules = [
+            (Pattern(b"*"), {b"text": True}),
+            (Pattern(b"*.jpg"), {b"text": False, b"binary": True}),
+        ]
+        found = match_path(rules, b"image.jpg")
+        self.assertEqual(found, {b"text": False, b"binary": True})
+
+    def test_unspecified_removes_attribute(self):
+        """An unspecified (None) value drops the attribute again."""
+        rules = [
+            (Pattern(b"*"), {b"text": True, b"diff": True}),
+            (Pattern(b"*.bin"), {b"text": None, b"binary": True}),
+        ]
+        found = match_path(rules, b"file.bin")
+        self.assertEqual(found, {b"diff": True, b"binary": True})
+        # 'text' should be removed
+        self.assertNotIn(b"text", found)
+
+
+class FileOperationsTests(TestCase):
+    """Exercise the helpers that read gitattributes from disk."""
+
+    def test_parse_gitattributes_file(self):
+        """A file on disk is parsed into compiled patterns, in order."""
+        with tempfile.NamedTemporaryFile(mode="wb", delete=False) as handle:
+            handle.writelines([b"*.txt text\n", b"*.jpg -text binary\n"])
+            temp_path = handle.name
+
+        try:
+            parsed = parse_gitattributes_file(temp_path)
+            self.assertEqual(len(parsed), 2)
+
+            # Compare the raw pattern and attributes of each entry in turn.
+            (first, first_attrs), (second, second_attrs) = parsed
+            self.assertEqual(first.pattern, b"*.txt")
+            self.assertEqual(first_attrs, {b"text": True})
+            self.assertEqual(second.pattern, b"*.jpg")
+            self.assertEqual(second_attrs, {b"text": False, b"binary": True})
+        finally:
+            os.unlink(temp_path)
+
+    def test_read_gitattributes(self):
+        """``.gitattributes`` inside a directory is found and parsed."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Create .gitattributes file
+            with open(os.path.join(tmpdir, ".gitattributes"), "wb") as handle:
+                handle.write(b"*.py diff=python\n")
+
+            parsed = read_gitattributes(tmpdir)
+            self.assertEqual(len(parsed), 1)
+
+            entry, entry_attrs = parsed[0]
+            self.assertEqual(entry.pattern, b"*.py")
+            self.assertEqual(entry_attrs, {b"diff": b"python"})
+
+    def test_read_gitattributes_missing(self):
+        """A directory without ``.gitattributes`` yields an empty list."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            self.assertEqual(read_gitattributes(tmpdir), [])
+
+
+class GitAttributesTests(TestCase):
+    """Tests for the GitAttributes container class."""
+
+    def test_empty_gitattributes(self):
+        """A GitAttributes built without patterns is empty and matches nothing."""
+        ga = GitAttributes()
+        self.assertEqual(ga.match_path(b"file.txt"), {})
+        self.assertEqual(len(ga), 0)
+
+    def test_gitattributes_with_patterns(self):
+        """match_path() and len() reflect the patterns given to the constructor."""
+        ga = GitAttributes(
+            [
+                (Pattern(b"*.txt"), {b"text": True}),
+                (Pattern(b"*.jpg"), {b"binary": True, b"text": False}),
+            ]
+        )
+
+        # One path per pattern, followed by a path that matches neither
+        cases = [
+            (b"README.txt", {b"text": True}),
+            (b"image.jpg", {b"binary": True, b"text": False}),
+            (b"script.py", {}),
+        ]
+        for path, expected in cases:
+            self.assertEqual(ga.match_path(path), expected)
+
+        self.assertEqual(len(ga), 2)
+
+    def test_add_patterns(self):
+        """add_patterns() extends an initially empty GitAttributes."""
+        ga = GitAttributes()
+        self.assertEqual(len(ga), 0)
+
+        extra = [
+            (Pattern(b"*.py"), {b"diff": b"python"}),
+            (Pattern(b"*.md"), {b"text": True}),
+        ]
+        ga.add_patterns(extra)
+
+        self.assertEqual(len(ga), 2)
+        self.assertEqual(ga.match_path(b"test.py"), {b"diff": b"python"})
+
+    def test_iteration(self):
+        """Iterating yields one entry per pattern, in the order given."""
+        ga = GitAttributes(
+            [
+                (Pattern(b"*.txt"), {b"text": True}),
+                (Pattern(b"*.jpg"), {b"binary": True}),
+            ]
+        )
+
+        entries = list(ga)
+        self.assertEqual(len(entries), 2)
+        # The first element of each entry is the Pattern object
+        self.assertEqual(entries[0][0].pattern, b"*.txt")
+        self.assertEqual(entries[1][0].pattern, b"*.jpg")
+
+    def test_from_file(self):
+        """GitAttributes.from_file() parses the named attributes file."""
+        # delete=False keeps the file on disk once the with block closes it;
+        # the finally clause below removes it again.
+        with tempfile.NamedTemporaryFile(mode="wb", delete=False) as tmp:
+            tmp.write(b"*.txt text\n")
+            tmp.write(b"*.bin -text binary\n")
+            temp_path = tmp.name
+
+        try:
+            ga = GitAttributes.from_file(temp_path)
+            self.assertEqual(len(ga), 2)
+            self.assertEqual(ga.match_path(b"file.txt"), {b"text": True})
+            self.assertEqual(
+                ga.match_path(b"file.bin"), {b"text": False, b"binary": True}
+            )
+        finally:
+            os.unlink(temp_path)
+
+    def test_from_path(self):
+        """GitAttributes.from_path() reads .gitattributes inside a directory."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            with open(os.path.join(tmpdir, ".gitattributes"), "wb") as out:
+                out.write(b"*.py diff=python\n")
+                out.write(b"*.rs diff=rust\n")
+
+            ga = GitAttributes.from_path(tmpdir)
+            self.assertEqual(len(ga), 2)
+            self.assertEqual(ga.match_path(b"main.py"), {b"diff": b"python"})
+            self.assertEqual(ga.match_path(b"lib.rs"), {b"diff": b"rust"})

+ 29 - 0
tests/test_repository.py

@@ -341,6 +341,35 @@ class RepositoryRootTests(TestCase):
         r.set_description(description)
         r.set_description(description)
         self.assertEqual(description, r.get_description())
         self.assertEqual(description, r.get_description())
 
 
+    def test_get_gitattributes(self) -> None:
+        """Test get_gitattributes() before and after creating info/attributes."""
+        from dulwich.attrs import GitAttributes
+
+        # No attributes file exists yet: an empty GitAttributes is returned
+        r = self.open_repo("a.git")
+        attrs = r.get_gitattributes()
+        self.assertIsInstance(attrs, GitAttributes)
+        self.assertEqual(len(attrs), 0)
+
+        # Create .git/info/attributes file (which is read by get_gitattributes)
+        info_dir = os.path.join(r.controldir(), "info")
+        os.makedirs(info_dir, exist_ok=True)
+        attrs_path = os.path.join(info_dir, "attributes")
+        with open(attrs_path, "wb") as f:
+            f.write(b"*.txt text\n")
+            f.write(b"*.jpg -text binary\n")
+
+        # Test with attributes file
+        attrs = r.get_gitattributes()
+        self.assertEqual(len(attrs), 2)
+
+        # Test matching
+        txt_attrs = attrs.match_path(b"file.txt")
+        self.assertEqual(txt_attrs, {b"text": True})
+
+        jpg_attrs = attrs.match_path(b"image.jpg")
+        self.assertEqual(jpg_attrs, {b"text": False, b"binary": True})
+
     def test_contains_missing(self) -> None:
     def test_contains_missing(self) -> None:
         r = self.open_repo("a.git")
         r = self.open_repo("a.git")
         self.assertNotIn(b"bar", r)
         self.assertNotIn(b"bar", r)