hace 1 mes · d73ccd1cbd
--- a/NEWS
+++ b/NEWS
@@ -3,6 +3,10 @@
 
				  * Print deprecations on usage, not import.
			
 
				    (Alyssa Coghlan, #1650)
			
 
				 
			
 
				+ * Add support for ``core.protectHFS`` configuration setting to protect
			
 
				+   against paths that could be misinterpreted on HFS+ filesystems.
			
 
				+   (Jelmer Vernooĳ, #246)
			
 
				+
			
 
				  * Only write Git index extensions when they contain meaningful data.
			
 
				    Previously, dulwich would write empty extensions to the index file,
			
 
				    causing unnecessary bloat.
			
--- a/dulwich/index.py
+++ b/dulwich/index.py
@@ -1311,6 +1311,60 @@ def validate_path_element_ntfs(element: bytes) -> bool:
 
				     return True
			
 
				 
			
 
				 
			
 
				+# HFS+ ignorable Unicode codepoints (from Git's utf8.c)
			
 
				+HFS_IGNORABLE_CHARS = {
			
 
				+    0x200C,  # ZERO WIDTH NON-JOINER
			
 
				+    0x200D,  # ZERO WIDTH JOINER
			
 
				+    0x200E,  # LEFT-TO-RIGHT MARK
			
 
				+    0x200F,  # RIGHT-TO-LEFT MARK
			
 
				+    0x202A,  # LEFT-TO-RIGHT EMBEDDING
			
 
				+    0x202B,  # RIGHT-TO-LEFT EMBEDDING
			
 
				+    0x202C,  # POP DIRECTIONAL FORMATTING
			
 
				+    0x202D,  # LEFT-TO-RIGHT OVERRIDE
			
 
				+    0x202E,  # RIGHT-TO-LEFT OVERRIDE
			
 
				+    0x206A,  # INHIBIT SYMMETRIC SWAPPING
			
 
				+    0x206B,  # ACTIVATE SYMMETRIC SWAPPING
			
 
				+    0x206C,  # INHIBIT ARABIC FORM SHAPING
			
 
				+    0x206D,  # ACTIVATE ARABIC FORM SHAPING
			
 
				+    0x206E,  # NATIONAL DIGIT SHAPES
			
 
				+    0x206F,  # NOMINAL DIGIT SHAPES
			
 
				+    0xFEFF,  # ZERO WIDTH NO-BREAK SPACE
			
 
				+}
			
 
				+
			
 
				+
			
 
				+def validate_path_element_hfs(element: bytes) -> bool:
			
 
				+    """Validate path element for HFS+ filesystem.
			
 
				+
			
 
				+    Equivalent to Git's is_hfs_dotgit and related checks.
			
 
				+    Uses NFD normalization and ignores HFS+ ignorable characters.
			
 
				+    """
			
 
				+    import unicodedata
			
 
				+
			
 
				+    try:
			
 
				+        # Decode to Unicode
			
 
				+        element_str = element.decode("utf-8", errors="strict")
			
 
				+    except UnicodeDecodeError:
			
 
				+        # Malformed UTF-8 - be conservative and reject
			
 
				+        return False
			
 
				+
			
 
				+    # Remove HFS+ ignorable characters (like Git's next_hfs_char)
			
 
				+    filtered = "".join(c for c in element_str if ord(c) not in HFS_IGNORABLE_CHARS)
			
 
				+
			
 
				+    # Normalize to NFD (HFS+ uses a variant of NFD)
			
 
				+    normalized = unicodedata.normalize("NFD", filtered)
			
 
				+
			
 
				+    # Check against invalid names (case-insensitive)
			
 
				+    normalized_bytes = normalized.encode("utf-8", errors="strict")
			
 
				+    if normalized_bytes.lower() in INVALID_DOTNAMES:
			
 
				+        return False
			
 
				+
			
 
				+    # Also check for 8.3 short name
			
 
				+    if normalized_bytes.lower() == b"git~1":
			
 
				+        return False
			
 
				+
			
 
				+    return True
			
 
				+
			
 
				+
			
 
				 def validate_path(
			
 
				     path: bytes,
			
 
				     element_validator: Callable[[bytes], bool] = validate_path_element_default,
			
--- a/dulwich/porcelain.py
+++ b/dulwich/porcelain.py
@@ -1728,10 +1728,18 @@ def reset(repo, mode, treeish="HEAD") -> None:
 
				             honor_filemode = config.get_boolean(b"core", b"filemode", os.name != "nt")
			
 
				 
			
 
				             # Import validation functions
			
 
				-            from .index import validate_path_element_default, validate_path_element_ntfs
			
 
				+            from .index import (
			
 
				+                validate_path_element_default,
			
 
				+                validate_path_element_hfs,
			
 
				+                validate_path_element_ntfs,
			
 
				+            )
			
 
				 
			
 
				             if config.get_boolean(b"core", b"core.protectNTFS", os.name == "nt"):
			
 
				                 validate_path_element = validate_path_element_ntfs
			
 
				+            elif config.get_boolean(
			
 
				+                b"core", b"core.protectHFS", sys.platform == "darwin"
			
 
				+            ):
			
 
				+                validate_path_element = validate_path_element_hfs
			
 
				             else:
			
 
				                 validate_path_element = validate_path_element_default
			
 
				 
			
@@ -3328,11 +3336,41 @@ def ls_files(repo):
 
				         return sorted(r.open_index())
			
 
				 
			
 
				 
			
 
				-def find_unique_abbrev(object_store, object_id):
			
 
				-    """For now, just return 7 characters."""
			
 
				-    # TODO(jelmer): Add some logic here to return a number of characters that
			
 
				-    # scales relative with the size of the repository
			
 
				-    return object_id.decode("ascii")[:7]
			
 
				+def find_unique_abbrev(object_store, object_id, min_length=7):
			
 
				+    """Find the shortest unique abbreviation for an object ID.
			
 
				+
			
 
				+    Args:
			
 
				+      object_store: Object store to search in
			
 
				+      object_id: The full object ID to abbreviate
			
 
				+      min_length: Minimum length of abbreviation (default 7)
			
 
				+
			
 
				+    Returns:
			
 
				+      The shortest unique prefix of the object ID (at least min_length chars)
			
 
				+    """
			
 
				+    if isinstance(object_id, bytes):
			
 
				+        hex_id = object_id.decode("ascii")
			
 
				+    else:
			
 
				+        hex_id = object_id
			
 
				+
			
 
				+    # Start with minimum length
			
 
				+    for length in range(min_length, len(hex_id) + 1):
			
 
				+        prefix = hex_id[:length]
			
 
				+        matches = 0
			
 
				+
			
 
				+        # Check if this prefix is unique
			
 
				+        for obj_id in object_store:
			
 
				+            if obj_id.decode("ascii").startswith(prefix):
			
 
				+                matches += 1
			
 
				+                if matches > 1:
			
 
				+                    # Not unique, need more characters
			
 
				+                    break
			
 
				+
			
 
				+        if matches == 1:
			
 
				+            # Found unique prefix
			
 
				+            return prefix
			
 
				+
			
 
				+    # If we get here, return the full ID
			
 
				+    return hex_id
			
 
				 
			
 
				 
			
 
				 def describe(repo, abbrev=None):
			
@@ -3397,16 +3435,20 @@ def describe(repo, abbrev=None):
 
				                     if commit_count == 0:
			
 
				                         return tag_name
			
 
				                     else:
			
 
				-                        return "{}-{}-g{}".format(
			
 
				-                            tag_name,
			
 
				-                            commit_count,
			
 
				-                            latest_commit.id.decode("ascii")[abbrev_slice],
			
 
				-                        )
			
 
				+                        if abbrev is not None:
			
 
				+                            abbrev_hash = latest_commit.id.decode("ascii")[abbrev_slice]
			
 
				+                        else:
			
 
				+                            abbrev_hash = find_unique_abbrev(
			
 
				+                                r.object_store, latest_commit.id
			
 
				+                            )
			
 
				+                        return f"{tag_name}-{commit_count}-g{abbrev_hash}"
			
 
				 
			
 
				             commit_count += 1
			
 
				 
			
 
				         # Return plain commit if no parent tag can be found
			
 
				-        return "g{}".format(latest_commit.id.decode("ascii")[abbrev_slice])
			
 
				+        if abbrev is not None:
			
 
				+            return "g{}".format(latest_commit.id.decode("ascii")[abbrev_slice])
			
 
				+        return f"g{find_unique_abbrev(r.object_store, latest_commit.id)}"
			
 
				 
			
 
				 
			
 
				 def get_object_by_path(repo, path, committish=None):
			
--- a/dulwich/repo.py
+++ b/dulwich/repo.py
@@ -1719,6 +1719,7 @@ class Repo(BaseRepo):
 
				             build_index_from_tree,
			
 
				             symlink,
			
 
				             validate_path_element_default,
			
 
				+            validate_path_element_hfs,
			
 
				             validate_path_element_ntfs,
			
 
				         )
			
 
				 
			
@@ -1732,6 +1733,8 @@ class Repo(BaseRepo):
 
				         honor_filemode = config.get_boolean(b"core", b"filemode", os.name != "nt")
			
 
				         if config.get_boolean(b"core", b"core.protectNTFS", os.name == "nt"):
			
 
				             validate_path_element = validate_path_element_ntfs
			
 
				+        elif config.get_boolean(b"core", b"core.protectHFS", sys.platform == "darwin"):
			
 
				+            validate_path_element = validate_path_element_hfs
			
 
				         else:
			
 
				             validate_path_element = validate_path_element_default
			
 
				         if config.get_boolean(b"core", b"symlinks", True):
			
--- a/dulwich/stash.py
+++ b/dulwich/stash.py
@@ -22,6 +22,7 @@
 
				 """Stash handling."""
			
 
				 
			
 
				 import os
			
 
				+import sys
			
 
				 from typing import TYPE_CHECKING, Optional, TypedDict
			
 
				 
			
 
				 from .file import GitFile
			
@@ -37,6 +38,7 @@ from .index import (
 
				     update_working_tree,
			
 
				     validate_path,
			
 
				     validate_path_element_default,
			
 
				+    validate_path_element_hfs,
			
 
				     validate_path_element_ntfs,
			
 
				 )
			
 
				 from .objects import S_IFGITLINK, Blob, Commit, ObjectID
			
@@ -139,6 +141,8 @@ class Stash:
 
				 
			
 
				         if config.get_boolean(b"core", b"core.protectNTFS", os.name == "nt"):
			
 
				             validate_path_element = validate_path_element_ntfs
			
 
				+        elif config.get_boolean(b"core", b"core.protectHFS", sys.platform == "darwin"):
			
 
				+            validate_path_element = validate_path_element_hfs
			
 
				         else:
			
 
				             validate_path_element = validate_path_element_default
			
 
				 
			
--- a/tests/test_index.py
+++ b/tests/test_index.py
@@ -51,6 +51,7 @@ from dulwich.index import (
 
				     read_index_dict,
			
 
				     update_working_tree,
			
 
				     validate_path_element_default,
			
 
				+    validate_path_element_hfs,
			
 
				     validate_path_element_ntfs,
			
 
				     write_cache_time,
			
 
				     write_index,
			
@@ -860,6 +861,35 @@ class TestValidatePathElement(TestCase):
 
				         self.assertFalse(validate_path_element_ntfs(b".."))
			
 
				         self.assertFalse(validate_path_element_ntfs(b"git~1"))
			
 
				 
			
 
    def test_hfs(self) -> None:
        """Check which path elements the HFS+ validator accepts and rejects."""
        accepted = (
            # Ordinary names
            b"bla",
            b".bla",
            # Real Unicode letter, not an ignorable codepoint: ".g\u00eft"
            b".g\xc3\xaft",
            # "git" without the leading dot
            b"git",
        )
        rejected = (
            # Case variants of .git, plus the parent-directory entry
            b".git",
            b".giT",
            b".GIT",
            b"..",
            # Short-name alias of .git
            b"git~1",
            # .g\u200cit (zero-width non-joiner)
            b".g\xe2\x80\x8cit",
            # .gi\u200dt (zero-width joiner)
            b".gi\xe2\x80\x8dt",
            # .g\ufeffit (zero-width no-break space)
            b".g\xef\xbb\xbfit",
        )

        for name in accepted:
            self.assertTrue(validate_path_element_hfs(name))
        for name in rejected:
            self.assertFalse(validate_path_element_hfs(name))
			
 
				+
			
 
				 
			
 
				 class TestTreeFSPathConversion(TestCase):
			
 
				     def test_tree_to_fs_path(self) -> None:
			
--- a/tests/test_repository.py
+++ b/tests/test_repository.py
@@ -635,6 +635,35 @@ class RepositoryRootTests(TestCase):
 
				 
			
 
				         t.close()
			
 
				 
			
 
    def test_reset_index_protect_hfs(self) -> None:
        """Staging an HFS+ ``.git`` look-alike raises while protectHFS is on."""
        workdir = self.mkdtemp()
        self.addCleanup(shutil.rmtree, workdir)

        r = Repo.init(workdir)
        self.addCleanup(r.close)
        cfg = r.get_config()

        # Switch the HFS+ protection on and persist it.
        cfg.set(b"core", b"core.protectHFS", b"true")
        cfg.write_to_path()

        # ".g\u200cit": a zero-width non-joiner hidden inside ".git".
        evil_name = b".g\xe2\x80\x8cit"
        os.mkdir(os.path.join(workdir, evil_name.decode("utf-8")))

        # Staging the look-alike must be refused.
        with self.assertRaises(ValueError):
            r.stage([evil_name])

        # Switch the protection off again. Nothing is staged or asserted
        # after this point, so that no dangerous repository gets created.
        cfg.set(b"core", b"core.protectHFS", b"false")
        cfg.write_to_path()
			
 
				+
			
 
				     def test_clone_bare(self) -> None:
			
 
				         r = self.open_repo("a.git")
			
 
				         tmp_dir = self.mkdtemp()