2 months ago · d1dbb47e89
--- a/NEWS
+++ b/NEWS
@@ -3,6 +3,10 @@
 
															  * Print deprecations on usage, not import.
														
 
															    (Alyssa Coghlan, #1650)
														
 
															+ * Add support for ``core.protectHFS`` configuration setting to protect
														
 
															+   against paths that could be misinterpreted on HFS+ filesystems.
														
 
															+   (Jelmer Vernooĳ, #246)
														
 
															+
														
 
															  * Only write Git index extensions when they contain meaningful data.
														
 
															    Previously, dulwich would write empty extensions to the index file,
														
 
															    causing unnecessary bloat.
														
--- a/dulwich/index.py
+++ b/dulwich/index.py
@@ -1311,6 +1311,60 @@ def validate_path_element_ntfs(element: bytes) -> bool:
 
															     return True
														
 
															+# HFS+ ignorable Unicode codepoints (from Git's utf8.c)
														
 
															+HFS_IGNORABLE_CHARS = {
														
 
															+    0x200C,  # ZERO WIDTH NON-JOINER
														
 
															+    0x200D,  # ZERO WIDTH JOINER
														
 
															+    0x200E,  # LEFT-TO-RIGHT MARK
														
 
															+    0x200F,  # RIGHT-TO-LEFT MARK
														
 
															+    0x202A,  # LEFT-TO-RIGHT EMBEDDING
														
 
															+    0x202B,  # RIGHT-TO-LEFT EMBEDDING
														
 
															+    0x202C,  # POP DIRECTIONAL FORMATTING
														
 
															+    0x202D,  # LEFT-TO-RIGHT OVERRIDE
														
 
															+    0x202E,  # RIGHT-TO-LEFT OVERRIDE
														
 
															+    0x206A,  # INHIBIT SYMMETRIC SWAPPING
														
 
															+    0x206B,  # ACTIVATE SYMMETRIC SWAPPING
														
 
															+    0x206C,  # INHIBIT ARABIC FORM SHAPING
														
 
															+    0x206D,  # ACTIVATE ARABIC FORM SHAPING
														
 
															+    0x206E,  # NATIONAL DIGIT SHAPES
														
 
															+    0x206F,  # NOMINAL DIGIT SHAPES
														
 
															+    0xFEFF,  # ZERO WIDTH NO-BREAK SPACE
														
 
															+}
														
 
															+
														
 
															+
														
 
															+def validate_path_element_hfs(element: bytes) -> bool:
														
 
															+    """Validate path element for HFS+ filesystem.
														
 
															+
														
 
															+    Equivalent to Git's is_hfs_dotgit and related checks.
														
 
															+    Uses NFD normalization and ignores HFS+ ignorable characters.
														
 
															+    """
														
 
															+    import unicodedata
														
 
															+
														
 
															+    try:
														
 
															+        # Decode to Unicode
														
 
															+        element_str = element.decode("utf-8", errors="strict")
														
 
															+    except UnicodeDecodeError:
														
 
															+        # Malformed UTF-8 - be conservative and reject
														
 
															+        return False
														
 
															+
														
 
															+    # Remove HFS+ ignorable characters (like Git's next_hfs_char)
														
 
															+    filtered = "".join(c for c in element_str if ord(c) not in HFS_IGNORABLE_CHARS)
														
 
															+
														
 
															+    # Normalize to NFD (HFS+ uses a variant of NFD)
														
 
															+    normalized = unicodedata.normalize("NFD", filtered)
														
 
															+
														
 
															+    # Check against invalid names (case-insensitive)
														
 
															+    normalized_bytes = normalized.encode("utf-8", errors="strict")
														
 
															+    if normalized_bytes.lower() in INVALID_DOTNAMES:
														
 
															+        return False
														
 
															+
														
 
															+    # Also check for 8.3 short name
														
 
															+    if normalized_bytes.lower() == b"git~1":
														
 
															+        return False
														
 
															+
														
 
															+    return True
														
 
															+
														
 
															+
														
 
															 def validate_path(
														
 
															     path: bytes,
														
 
															     element_validator: Callable[[bytes], bool] = validate_path_element_default,
														
--- a/dulwich/porcelain.py
+++ b/dulwich/porcelain.py
@@ -1728,10 +1728,18 @@ def reset(repo, mode, treeish="HEAD") -> None:
 
															             honor_filemode = config.get_boolean(b"core", b"filemode", os.name != "nt")
														
 
															             # Import validation functions
														
 
															-            from .index import validate_path_element_default, validate_path_element_ntfs
														
 
															+            from .index import (
														
 
															+                validate_path_element_default,
														
 
															+                validate_path_element_hfs,
														
 
															+                validate_path_element_ntfs,
														
 
															+            )
														
 
															             if config.get_boolean(b"core", b"core.protectNTFS", os.name == "nt"):
														
 
															                 validate_path_element = validate_path_element_ntfs
														
 
															+            elif config.get_boolean(
														
 
															+                b"core", b"core.protectHFS", sys.platform == "darwin"
														
 
															+            ):
														
 
															+                validate_path_element = validate_path_element_hfs
														
 
															             else:
														
 
															                 validate_path_element = validate_path_element_default
														
@@ -3328,11 +3336,41 @@ def ls_files(repo):
 
															         return sorted(r.open_index())
														
 
															-def find_unique_abbrev(object_store, object_id):
														
 
															-    """For now, just return 7 characters."""
														
 
															-    # TODO(jelmer): Add some logic here to return a number of characters that
														
 
															-    # scales relative with the size of the repository
														
 
															-    return object_id.decode("ascii")[:7]
														
 
															+def find_unique_abbrev(object_store, object_id, min_length=7):
														
 
															+    """Find the shortest unique abbreviation for an object ID.
														
 
															+
														
 
															+    Args:
														
 
															+      object_store: Object store to search in
														
 
															+      object_id: The full object ID to abbreviate
														
 
															+      min_length: Minimum length of abbreviation (default 7)
														
 
															+
														
 
															+    Returns:
														
 
															+      The shortest unique prefix of the object ID (at least min_length chars)
														
 
															+    """
														
 
															+    if isinstance(object_id, bytes):
														
 
															+        hex_id = object_id.decode("ascii")
														
 
															+    else:
														
 
															+        hex_id = object_id
														
 
															+
														
 
															+    # Start with minimum length
														
 
															+    for length in range(min_length, len(hex_id) + 1):
														
 
															+        prefix = hex_id[:length]
														
 
															+        matches = 0
														
 
															+
														
 
															+        # Check if this prefix is unique
														
 
															+        for obj_id in object_store:
														
 
															+            if obj_id.decode("ascii").startswith(prefix):
														
 
															+                matches += 1
														
 
															+                if matches > 1:
														
 
															+                    # Not unique, need more characters
														
 
															+                    break
														
 
															+
														
 
															+        if matches == 1:
														
 
															+            # Found unique prefix
														
 
															+            return prefix
														
 
															+
														
 
															+    # If we get here, return the full ID
														
 
															+    return hex_id
														
 
															 def describe(repo, abbrev=None):
														
@@ -3397,16 +3435,20 @@ def describe(repo, abbrev=None):
 
															                     if commit_count == 0:
														
 
															                         return tag_name
														
 
															                     else:
														
 
															-                        return "{}-{}-g{}".format(
														
 
															-                            tag_name,
														
 
															-                            commit_count,
														
 
															-                            latest_commit.id.decode("ascii")[abbrev_slice],
														
 
															-                        )
														
 
															+                        if abbrev is not None:
														
 
															+                            abbrev_hash = latest_commit.id.decode("ascii")[abbrev_slice]
														
 
															+                        else:
														
 
															+                            abbrev_hash = find_unique_abbrev(
														
 
															+                                r.object_store, latest_commit.id
														
 
															+                            )
														
 
															+                        return f"{tag_name}-{commit_count}-g{abbrev_hash}"
														
 
															             commit_count += 1
														
 
															         # Return plain commit if no parent tag can be found
														
 
															-        return "g{}".format(latest_commit.id.decode("ascii")[abbrev_slice])
														
 
															+        if abbrev is not None:
														
 
															+            return "g{}".format(latest_commit.id.decode("ascii")[abbrev_slice])
														
 
															+        return f"g{find_unique_abbrev(r.object_store, latest_commit.id)}"
														
 
															 def get_object_by_path(repo, path, committish=None):
														
--- a/dulwich/repo.py
+++ b/dulwich/repo.py
@@ -1719,6 +1719,7 @@ class Repo(BaseRepo):
 
															             build_index_from_tree,
														
 
															             symlink,
														
 
															             validate_path_element_default,
														
 
															+            validate_path_element_hfs,
														
 
															             validate_path_element_ntfs,
														
 
															         )
														
@@ -1732,6 +1733,8 @@ class Repo(BaseRepo):
 
															         honor_filemode = config.get_boolean(b"core", b"filemode", os.name != "nt")
														
 
															         if config.get_boolean(b"core", b"core.protectNTFS", os.name == "nt"):
														
 
															             validate_path_element = validate_path_element_ntfs
														
 
															+        elif config.get_boolean(b"core", b"core.protectHFS", sys.platform == "darwin"):
														
 
															+            validate_path_element = validate_path_element_hfs
														
 
															         else:
														
 
															             validate_path_element = validate_path_element_default
														
 
															         if config.get_boolean(b"core", b"symlinks", True):
														
--- a/dulwich/stash.py
+++ b/dulwich/stash.py
@@ -22,6 +22,7 @@
 
															 """Stash handling."""
														
 
															 import os
														
 
															+import sys
														
 
															 from typing import TYPE_CHECKING, Optional, TypedDict
														
 
															 from .file import GitFile
														
@@ -37,6 +38,7 @@ from .index import (
 
															     update_working_tree,
														
 
															     validate_path,
														
 
															     validate_path_element_default,
														
 
															+    validate_path_element_hfs,
														
 
															     validate_path_element_ntfs,
														
 
															 )
														
 
															 from .objects import S_IFGITLINK, Blob, Commit, ObjectID
														
@@ -139,6 +141,8 @@ class Stash:
 
															         if config.get_boolean(b"core", b"core.protectNTFS", os.name == "nt"):
														
 
															             validate_path_element = validate_path_element_ntfs
														
 
															+        elif config.get_boolean(b"core", b"core.protectHFS", sys.platform == "darwin"):
														
 
															+            validate_path_element = validate_path_element_hfs
														
 
															         else:
														
 
															             validate_path_element = validate_path_element_default
														
--- a/tests/test_index.py
+++ b/tests/test_index.py
@@ -51,6 +51,7 @@ from dulwich.index import (
 
															     read_index_dict,
														
 
															     update_working_tree,
														
 
															     validate_path_element_default,
														
 
															+    validate_path_element_hfs,
														
 
															     validate_path_element_ntfs,
														
 
															     write_cache_time,
														
 
															     write_index,
														
@@ -860,6 +861,35 @@ class TestValidatePathElement(TestCase):
 
															         self.assertFalse(validate_path_element_ntfs(b".."))
														
 
															         self.assertFalse(validate_path_element_ntfs(b"git~1"))
														
 
															+    def test_hfs(self) -> None:
														
 
															+        # Normal paths should pass
														
 
															+        self.assertTrue(validate_path_element_hfs(b"bla"))
														
 
															+        self.assertTrue(validate_path_element_hfs(b".bla"))
														
 
															+
														
 
															+        # Basic .git variations should fail
														
 
															+        self.assertFalse(validate_path_element_hfs(b".git"))
														
 
															+        self.assertFalse(validate_path_element_hfs(b".giT"))
														
 
															+        self.assertFalse(validate_path_element_hfs(b".GIT"))
														
 
															+        self.assertFalse(validate_path_element_hfs(b".."))
														
 
															+
														
 
															+        # git~1 should also fail on HFS+
														
 
															+        self.assertFalse(validate_path_element_hfs(b"git~1"))
														
 
															+
														
 
															+        # Test HFS+ Unicode normalization attacks
														
 
															+        # .g\u200cit (zero-width non-joiner)
														
 
															+        self.assertFalse(validate_path_element_hfs(b".g\xe2\x80\x8cit"))
														
 
															+
														
 
															+        # .gi\u200dt (zero-width joiner)
														
 
															+        self.assertFalse(validate_path_element_hfs(b".gi\xe2\x80\x8dt"))
														
 
															+
														
 
															+        # Test other ignorable characters
														
 
															+        # .g\ufeffit (zero-width no-break space)
														
 
															+        self.assertFalse(validate_path_element_hfs(b".g\xef\xbb\xbfit"))
														
 
															+
														
 
															+        # Valid Unicode that shouldn't be confused with .git
														
 
															+        self.assertTrue(validate_path_element_hfs(b".g\xc3\xaft"))  # .gït
														
 
															+        self.assertTrue(validate_path_element_hfs(b"git"))  # git without dot
														
 
															+
														
 
															 class TestTreeFSPathConversion(TestCase):
														
 
															     def test_tree_to_fs_path(self) -> None:
														
--- a/tests/test_repository.py
+++ b/tests/test_repository.py
@@ -635,6 +635,35 @@ class RepositoryRootTests(TestCase):
 
															         t.close()
														
 
															+    def test_reset_index_protect_hfs(self) -> None:
														
 
															+        tmp_dir = self.mkdtemp()
														
 
															+        self.addCleanup(shutil.rmtree, tmp_dir)
														
 
															+
														
 
															+        repo = Repo.init(tmp_dir)
														
 
															+        self.addCleanup(repo.close)
														
 
															+        config = repo.get_config()
														
 
															+
														
 
															+        # Test with protectHFS enabled
														
 
															+        config.set(b"core", b"core.protectHFS", b"true")
														
 
															+        config.write_to_path()
														
 
															+
														
 
															+        # Create a file with HFS+ Unicode attack vector
														
 
															+        # This uses a zero-width non-joiner to create ".g\u200cit"
														
 
															+        attack_name = b".g\xe2\x80\x8cit"
														
 
															+        attack_path = os.path.join(tmp_dir, attack_name.decode("utf-8"))
														
 
															+        os.mkdir(attack_path)
														
 
															+
														
 
															+        # Try to stage the malicious path - should be rejected
														
 
															+        with self.assertRaises(ValueError):
														
 
															+            repo.stage([attack_name])
														
 
															+
														
 
															+        # Test with protectHFS disabled
														
 
															+        config.set(b"core", b"core.protectHFS", b"false")
														
 
															+        config.write_to_path()
														
 
															+
														
 
															+        # Now it should work (though still dangerous!)
														
 
															+        # We're not actually staging it to avoid creating a dangerous repo
														
 
															+
														
 
															     def test_clone_bare(self) -> None:
														
 
															         r = self.open_repo("a.git")
														
 
															         tmp_dir = self.mkdtemp()