Explorar el Código

Add support for core.protectHFS configuration setting (#1670)

Implement Git's core.protectHFS setting to protect against paths that
could be misinterpreted on HFS+ filesystems through Unicode
normalization. This includes filtering HFS+ ignorable characters and
applying NFD normalization to detect malicious path components.

Fixes #246
Jelmer Vernooij hace 1 mes
padre
commit
d73ccd1cbd
Se han modificado 7 ficheros con 178 adiciones y 12 borrados
  1. 4 0
      NEWS
  2. 54 0
      dulwich/index.py
  3. 54 12
      dulwich/porcelain.py
  4. 3 0
      dulwich/repo.py
  5. 4 0
      dulwich/stash.py
  6. 30 0
      tests/test_index.py
  7. 29 0
      tests/test_repository.py

+ 4 - 0
NEWS

@@ -3,6 +3,10 @@
  * Print deprecations on usage, not import.
    (Alyssa Coghlan, #1650)
 
+ * Add support for ``core.protectHFS`` configuration setting to protect
+   against paths that could be misinterpreted on HFS+ filesystems.
+   (Jelmer Vernooij, #246)
+
  * Only write Git index extensions when they contain meaningful data.
    Previously, dulwich would write empty extensions to the index file,
    causing unnecessary bloat.

+ 54 - 0
dulwich/index.py

@@ -1311,6 +1311,60 @@ def validate_path_element_ntfs(element: bytes) -> bool:
     return True
 
 
+# HFS+ ignorable Unicode codepoints (from Git's utf8.c)
+HFS_IGNORABLE_CHARS = {
+    0x200C,  # ZERO WIDTH NON-JOINER
+    0x200D,  # ZERO WIDTH JOINER
+    0x200E,  # LEFT-TO-RIGHT MARK
+    0x200F,  # RIGHT-TO-LEFT MARK
+    0x202A,  # LEFT-TO-RIGHT EMBEDDING
+    0x202B,  # RIGHT-TO-LEFT EMBEDDING
+    0x202C,  # POP DIRECTIONAL FORMATTING
+    0x202D,  # LEFT-TO-RIGHT OVERRIDE
+    0x202E,  # RIGHT-TO-LEFT OVERRIDE
+    0x206A,  # INHIBIT SYMMETRIC SWAPPING
+    0x206B,  # ACTIVATE SYMMETRIC SWAPPING
+    0x206C,  # INHIBIT ARABIC FORM SHAPING
+    0x206D,  # ACTIVATE ARABIC FORM SHAPING
+    0x206E,  # NATIONAL DIGIT SHAPES
+    0x206F,  # NOMINAL DIGIT SHAPES
+    0xFEFF,  # ZERO WIDTH NO-BREAK SPACE
+}
+
+
+def validate_path_element_hfs(element: bytes) -> bool:
+    """Validate path element for HFS+ filesystem.
+
+    Equivalent to Git's is_hfs_dotgit and related checks.
+    Uses NFD normalization and ignores HFS+ ignorable characters.
+    """
+    import unicodedata
+
+    try:
+        # Decode to Unicode
+        element_str = element.decode("utf-8", errors="strict")
+    except UnicodeDecodeError:
+        # Malformed UTF-8 - be conservative and reject
+        return False
+
+    # Remove HFS+ ignorable characters (like Git's next_hfs_char)
+    filtered = "".join(c for c in element_str if ord(c) not in HFS_IGNORABLE_CHARS)
+
+    # Normalize to NFD (HFS+ uses a variant of NFD)
+    normalized = unicodedata.normalize("NFD", filtered)
+
+    # Check against invalid names (case-insensitive)
+    normalized_bytes = normalized.encode("utf-8", errors="strict")
+    if normalized_bytes.lower() in INVALID_DOTNAMES:
+        return False
+
+    # Also check for 8.3 short name
+    if normalized_bytes.lower() == b"git~1":
+        return False
+
+    return True
+
+
 def validate_path(
     path: bytes,
     element_validator: Callable[[bytes], bool] = validate_path_element_default,

+ 54 - 12
dulwich/porcelain.py

@@ -1728,10 +1728,18 @@ def reset(repo, mode, treeish="HEAD") -> None:
             honor_filemode = config.get_boolean(b"core", b"filemode", os.name != "nt")
 
             # Import validation functions
-            from .index import validate_path_element_default, validate_path_element_ntfs
+            from .index import (
+                validate_path_element_default,
+                validate_path_element_hfs,
+                validate_path_element_ntfs,
+            )
 
             if config.get_boolean(b"core", b"core.protectNTFS", os.name == "nt"):
                 validate_path_element = validate_path_element_ntfs
+            elif config.get_boolean(
+                b"core", b"core.protectHFS", sys.platform == "darwin"
+            ):
+                validate_path_element = validate_path_element_hfs
             else:
                 validate_path_element = validate_path_element_default
 
@@ -3328,11 +3336,41 @@ def ls_files(repo):
         return sorted(r.open_index())
 
 
-def find_unique_abbrev(object_store, object_id):
-    """For now, just return 7 characters."""
-    # TODO(jelmer): Add some logic here to return a number of characters that
-    # scales relative with the size of the repository
-    return object_id.decode("ascii")[:7]
+def find_unique_abbrev(object_store, object_id, min_length=7):
+    """Find the shortest unique abbreviation for an object ID.
+
+    Args:
+      object_store: Object store to search in
+      object_id: The full object ID to abbreviate
+      min_length: Minimum length of abbreviation (default 7)
+
+    Returns:
+      The shortest unique prefix of the object ID (at least min_length chars)
+    """
+    if isinstance(object_id, bytes):
+        hex_id = object_id.decode("ascii")
+    else:
+        hex_id = object_id
+
+    # Start with minimum length
+    for length in range(min_length, len(hex_id) + 1):
+        prefix = hex_id[:length]
+        matches = 0
+
+        # Check if this prefix is unique
+        for obj_id in object_store:
+            if obj_id.decode("ascii").startswith(prefix):
+                matches += 1
+                if matches > 1:
+                    # Not unique, need more characters
+                    break
+
+        if matches == 1:
+            # Found unique prefix
+            return prefix
+
+    # If we get here, return the full ID
+    return hex_id
 
 
 def describe(repo, abbrev=None):
@@ -3397,16 +3435,20 @@ def describe(repo, abbrev=None):
                     if commit_count == 0:
                         return tag_name
                     else:
-                        return "{}-{}-g{}".format(
-                            tag_name,
-                            commit_count,
-                            latest_commit.id.decode("ascii")[abbrev_slice],
-                        )
+                        if abbrev is not None:
+                            abbrev_hash = latest_commit.id.decode("ascii")[abbrev_slice]
+                        else:
+                            abbrev_hash = find_unique_abbrev(
+                                r.object_store, latest_commit.id
+                            )
+                        return f"{tag_name}-{commit_count}-g{abbrev_hash}"
 
             commit_count += 1
 
         # Return plain commit if no parent tag can be found
-        return "g{}".format(latest_commit.id.decode("ascii")[abbrev_slice])
+        if abbrev is not None:
+            return "g{}".format(latest_commit.id.decode("ascii")[abbrev_slice])
+        return f"g{find_unique_abbrev(r.object_store, latest_commit.id)}"
 
 
 def get_object_by_path(repo, path, committish=None):

+ 3 - 0
dulwich/repo.py

@@ -1719,6 +1719,7 @@ class Repo(BaseRepo):
             build_index_from_tree,
             symlink,
             validate_path_element_default,
+            validate_path_element_hfs,
             validate_path_element_ntfs,
         )
 
@@ -1732,6 +1733,8 @@ class Repo(BaseRepo):
         honor_filemode = config.get_boolean(b"core", b"filemode", os.name != "nt")
         if config.get_boolean(b"core", b"core.protectNTFS", os.name == "nt"):
             validate_path_element = validate_path_element_ntfs
+        elif config.get_boolean(b"core", b"core.protectHFS", sys.platform == "darwin"):
+            validate_path_element = validate_path_element_hfs
         else:
             validate_path_element = validate_path_element_default
         if config.get_boolean(b"core", b"symlinks", True):

+ 4 - 0
dulwich/stash.py

@@ -22,6 +22,7 @@
 """Stash handling."""
 
 import os
+import sys
 from typing import TYPE_CHECKING, Optional, TypedDict
 
 from .file import GitFile
@@ -37,6 +38,7 @@ from .index import (
     update_working_tree,
     validate_path,
     validate_path_element_default,
+    validate_path_element_hfs,
     validate_path_element_ntfs,
 )
 from .objects import S_IFGITLINK, Blob, Commit, ObjectID
@@ -139,6 +141,8 @@ class Stash:
 
         if config.get_boolean(b"core", b"core.protectNTFS", os.name == "nt"):
             validate_path_element = validate_path_element_ntfs
+        elif config.get_boolean(b"core", b"core.protectHFS", sys.platform == "darwin"):
+            validate_path_element = validate_path_element_hfs
         else:
             validate_path_element = validate_path_element_default
 

+ 30 - 0
tests/test_index.py

@@ -51,6 +51,7 @@ from dulwich.index import (
     read_index_dict,
     update_working_tree,
     validate_path_element_default,
+    validate_path_element_hfs,
     validate_path_element_ntfs,
     write_cache_time,
     write_index,
@@ -860,6 +861,35 @@ class TestValidatePathElement(TestCase):
         self.assertFalse(validate_path_element_ntfs(b".."))
         self.assertFalse(validate_path_element_ntfs(b"git~1"))
 
+    def test_hfs(self) -> None:
+        # Normal paths should pass
+        self.assertTrue(validate_path_element_hfs(b"bla"))
+        self.assertTrue(validate_path_element_hfs(b".bla"))
+
+        # Basic .git variations should fail
+        self.assertFalse(validate_path_element_hfs(b".git"))
+        self.assertFalse(validate_path_element_hfs(b".giT"))
+        self.assertFalse(validate_path_element_hfs(b".GIT"))
+        self.assertFalse(validate_path_element_hfs(b".."))
+
+        # git~1 should also fail on HFS+
+        self.assertFalse(validate_path_element_hfs(b"git~1"))
+
+        # Test HFS+ Unicode normalization attacks
+        # .g\u200cit (zero-width non-joiner)
+        self.assertFalse(validate_path_element_hfs(b".g\xe2\x80\x8cit"))
+
+        # .gi\u200dt (zero-width joiner)
+        self.assertFalse(validate_path_element_hfs(b".gi\xe2\x80\x8dt"))
+
+        # Test other ignorable characters
+        # .g\ufeffit (zero-width no-break space)
+        self.assertFalse(validate_path_element_hfs(b".g\xef\xbb\xbfit"))
+
+        # Valid Unicode that shouldn't be confused with .git
+        self.assertTrue(validate_path_element_hfs(b".g\xc3\xaft"))  # .gït
+        self.assertTrue(validate_path_element_hfs(b"git"))  # git without dot
+
 
 class TestTreeFSPathConversion(TestCase):
     def test_tree_to_fs_path(self) -> None:

+ 29 - 0
tests/test_repository.py

@@ -635,6 +635,35 @@ class RepositoryRootTests(TestCase):
 
         t.close()
 
+    def test_reset_index_protect_hfs(self) -> None:
+        tmp_dir = self.mkdtemp()
+        self.addCleanup(shutil.rmtree, tmp_dir)
+
+        repo = Repo.init(tmp_dir)
+        self.addCleanup(repo.close)
+        config = repo.get_config()
+
+        # Test with protectHFS enabled
+        config.set(b"core", b"core.protectHFS", b"true")
+        config.write_to_path()
+
+        # Create a file with HFS+ Unicode attack vector
+        # This uses a zero-width non-joiner to create ".g\u200cit"
+        attack_name = b".g\xe2\x80\x8cit"
+        attack_path = os.path.join(tmp_dir, attack_name.decode("utf-8"))
+        os.mkdir(attack_path)
+
+        # Try to stage the malicious path - should be rejected
+        with self.assertRaises(ValueError):
+            repo.stage([attack_name])
+
+        # Test with protectHFS disabled
+        config.set(b"core", b"core.protectHFS", b"false")
+        config.write_to_path()
+
+        # Now it should work (though still dangerous!)
+        # We're not actually staging it to avoid creating a dangerous repo
+
     def test_clone_bare(self) -> None:
         r = self.open_repo("a.git")
         tmp_dir = self.mkdtemp()