Jelmer Vernooij 7 месяцев назад
Родитель
Commit
a7e3b91227

+ 8 - 2
NEWS

@@ -531,6 +531,13 @@ compatible.
 
  * Add ``gc`` command to ``dulwich.porcelain.`` (Jelmer Vernooij, #92)
 
+ * Add initial support for SHA256 repositories. Dulwich can now read and write Git
+   repositories using SHA256 object format. This includes support for loose
+   objects, pack files (v1 and v2 indexes), and tree parsing with SHA256 hashes.
+   The Rust extensions have been updated to support variable hash lengths.
+   SHA256 repositories require format version 1 and the objectFormat extension.
+   (Jelmer Vernooij, #1115)
+
  * Add ``unpack-objects`` plumbing command to unpack objects from pack files
    into loose objects in the repository. This command extracts all objects
    from a pack file and writes them to the object store as individual files.
@@ -550,8 +557,7 @@ compatible.
  * Add support for pack index format version 3. This format supports variable
    hash sizes to enable future SHA-256 support. The implementation includes
    reading and writing v3 indexes with proper hash algorithm identification
-   (1 for SHA-1, 2 for SHA-256). Note that SHA-256 support itself is not yet
-   implemented and will raise NotImplementedError. (Jelmer Vernooij)
+   (1 for SHA-1, 2 for SHA-256). (Jelmer Vernooij)
 
  * Fix ``LocalGitClient`` assertion error when fetching externally cloned repositories
    into ``MemoryRepo``. Previously, the client would fail with an AssertionError

+ 41 - 10
crates/objects/src/lib.rs

@@ -49,15 +49,13 @@ fn sha_to_pyhex(py: Python, sha: &[u8]) -> PyResult<Py<PyAny>> {
     Ok(PyBytes::new(py, hexsha.as_slice()).into())
 }
 
-#[pyfunction]
-#[pyo3(signature = (text, strict=None))]
-fn parse_tree(
+fn parse_tree_with_length(
     py: Python,
     mut text: &[u8],
-    strict: Option<bool>,
-) -> PyResult<Vec<(Py<PyAny>, u32, Py<PyAny>)>> {
+    strict: bool,
+    hash_len: usize,
+) -> PyResult<Vec<(PyObject, u32, PyObject)>> {
     let mut entries = Vec::new();
-    let strict = strict.unwrap_or(false);
     while !text.is_empty() {
         let mode_end = memchr(b' ', text)
             .ok_or_else(|| ObjectFormatException::new_err(("Missing terminator for mode",)))?;
@@ -73,21 +71,54 @@ fn parse_tree(
         let namelen = memchr(b'\0', text)
             .ok_or_else(|| ObjectFormatException::new_err(("Missing trailing \\0",)))?;
         let name = &text[..namelen];
-        if namelen + 20 >= text.len() {
+
+        // Skip name and null terminator
+        text = &text[namelen + 1..];
+
+        // Check if we have enough bytes for the hash
+        if text.len() < hash_len {
             return Err(ObjectFormatException::new_err(("SHA truncated",)));
         }
-        text = &text[namelen + 1..];
-        let sha = &text[..20];
+
+        let sha = &text[..hash_len];
         entries.push((
             PyBytes::new(py, name).into_pyobject(py)?.unbind().into(),
             mode,
             sha_to_pyhex(py, sha)?,
         ));
-        text = &text[20..];
+        text = &text[hash_len..];
     }
     Ok(entries)
 }
 
+#[pyfunction]
+#[pyo3(signature = (text, strict=None, hash_algorithm=None))]
+fn parse_tree(
+    py: Python,
+    text: &[u8],
+    strict: Option<bool>,
+    hash_algorithm: Option<PyObject>,
+) -> PyResult<Vec<(PyObject, u32, PyObject)>> {
+    let strict = strict.unwrap_or(false);
+
+    // Determine hash length from hash_algorithm if provided
+    if let Some(algo) = hash_algorithm {
+        // Get oid_length attribute from hash algorithm object
+        let oid_length: usize = algo.getattr(py, "oid_length")?.extract(py)?;
+        parse_tree_with_length(py, text, strict, oid_length)
+    } else {
+        // Try to auto-detect by attempting to parse with both lengths
+        // We'll attempt to parse with SHA1 first (20 bytes), then SHA256 (32 bytes)
+        match parse_tree_with_length(py, text, strict, 20) {
+            Ok(entries) => Ok(entries),
+            Err(_) => {
+                // SHA1 failed, try SHA256
+                parse_tree_with_length(py, text, strict, 32)
+            }
+        }
+    }
+}
+
 fn cmp_with_suffix(a: (u32, &[u8]), b: (u32, &[u8])) -> std::cmp::Ordering {
     let len = std::cmp::min(a.1.len(), b.1.len());
     let cmp = a.1[..len].cmp(&b.1[..len]);

+ 8 - 5
crates/pack/src/lib.rs

@@ -30,8 +30,9 @@ pyo3::import_exception!(dulwich.errors, ApplyDeltaError);
 fn py_is_sha(sha: &Py<PyAny>, py: Python) -> PyResult<bool> {
     // Check if the object is a bytes object
     if sha.bind(py).is_instance_of::<PyBytes>() {
-        // Check if the bytes object has a size of 20
-        if sha.extract::<&[u8]>(py)?.len() == 20 {
+        // Check if the bytes object has a size of 20 (SHA1) or 32 (SHA256)
+        let len = sha.extract::<&[u8]>(py)?.len();
+        if len == 20 || len == 32 {
             Ok(true)
         } else {
             Ok(false)
@@ -53,9 +54,11 @@ fn bisect_find_sha(
     let sha = sha.as_bytes(py);
     let sha_len = sha.len();
 
-    // Check if sha is 20 bytes long
-    if sha_len != 20 {
-        return Err(PyValueError::new_err("Sha is not 20 bytes long"));
+    // Check if sha is 20 bytes (SHA1) or 32 bytes (SHA256)
+    if sha_len != 20 && sha_len != 32 {
+        return Err(PyValueError::new_err(
+            "Sha must be 20 (SHA1) or 32 (SHA256) bytes long",
+        ));
     }
 
     // Check if start > end

+ 121 - 0
dulwich/hash.py

@@ -0,0 +1,121 @@
+# hash.py -- Hash algorithm abstraction layer for Git
+# Copyright (C) 2024 The Dulwich contributors
+#
+# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
+# General Public License as public by the Free Software Foundation; version 2.0
+# or (at your option) any later version. You can redistribute it and/or
+# modify it under the terms of either of these two licenses.
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# You should have received a copy of the licenses; if not, see
+# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
+# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
+# License, Version 2.0.
+#
+
+"""Hash algorithm abstraction for Git objects.
+
+This module provides an abstraction layer for different hash algorithms
+used in Git repositories (SHA-1 and SHA-256).
+"""
+
+from hashlib import sha1, sha256
+from typing import Callable, Optional
+
+
+class HashAlgorithm:
+    """Base class for hash algorithms used in Git."""
+
+    def __init__(
+        self, name: str, oid_length: int, hex_length: int, hash_func: Callable
+    ) -> None:
+        """Initialize a hash algorithm.
+
+        Args:
+            name: Name of the algorithm (e.g., "sha1", "sha256")
+            oid_length: Length of the binary object ID in bytes
+            hex_length: Length of the hexadecimal object ID in characters
+            hash_func: Hash function from hashlib
+        """
+        self.name = name
+        self.oid_length = oid_length
+        self.hex_length = hex_length
+        self.hash_func = hash_func
+        self.zero_oid = b"0" * hex_length
+        self.zero_oid_bin = b"\x00" * oid_length
+
+    def __str__(self) -> str:
+        return self.name
+
+    def __repr__(self) -> str:
+        return f"HashAlgorithm({self.name!r})"
+
+    def new_hash(self):
+        """Create a new hash object."""
+        return self.hash_func()
+
+    def hash_object(self, data: bytes) -> bytes:
+        """Hash data and return the digest.
+
+        Args:
+            data: Data to hash
+
+        Returns:
+            Binary digest
+        """
+        h = self.new_hash()
+        h.update(data)
+        return h.digest()
+
+    def hash_object_hex(self, data: bytes) -> bytes:
+        """Hash data and return the hexadecimal digest.
+
+        Args:
+            data: Data to hash
+
+        Returns:
+            Hexadecimal digest as bytes
+        """
+        h = self.new_hash()
+        h.update(data)
+        return h.hexdigest().encode("ascii")
+
+
+# Define the supported hash algorithms
+SHA1 = HashAlgorithm("sha1", 20, 40, sha1)
+SHA256 = HashAlgorithm("sha256", 32, 64, sha256)
+
+# Map of algorithm names to HashAlgorithm instances
+HASH_ALGORITHMS = {
+    "sha1": SHA1,
+    "sha256": SHA256,
+}
+
+# Default algorithm for backward compatibility
+DEFAULT_HASH_ALGORITHM = SHA1
+
+
+def get_hash_algorithm(name: Optional[str] = None) -> HashAlgorithm:
+    """Get a hash algorithm by name.
+
+    Args:
+        name: Algorithm name ("sha1" or "sha256"). If None, returns default.
+
+    Returns:
+        HashAlgorithm instance
+
+    Raises:
+        ValueError: If the algorithm name is not supported
+    """
+    if name is None:
+        return DEFAULT_HASH_ALGORITHM
+    try:
+        return HASH_ALGORITHMS[name.lower()]
+    except KeyError:
+        raise ValueError(f"Unsupported hash algorithm: {name}")

+ 39 - 5
dulwich/object_store.py

@@ -1169,10 +1169,10 @@ class PackBasedObjectStore(PackCapableObjectStore, PackedObjectContainer):
         """
         if name == ZERO_SHA:
             raise KeyError(name)
-        if len(name) == 40:
+        if len(name) in (40, 64):  # Support both SHA1 (40) and SHA256 (64) hex
             sha = hex_to_sha(cast(ObjectID, name))
             hexsha = cast(ObjectID, name)
-        elif len(name) == 20:
+        elif len(name) in (20, 32):  # Support both SHA1 (20) and SHA256 (32) binary
             sha = cast(RawObjectID, name)
             hexsha = None
         else:
@@ -1382,6 +1382,7 @@ class DiskObjectStore(PackBasedObjectStore):
         pack_write_bitmap_lookup_table: bool = True,
         file_mode: int | None = None,
         dir_mode: int | None = None,
+        hash_algorithm=None,
     ) -> None:
         """Open an object store.
 
@@ -1402,6 +1403,7 @@ class DiskObjectStore(PackBasedObjectStore):
           pack_write_bitmap_lookup_table: whether to include lookup table in bitmaps
           file_mode: File permission mask for shared repository
           dir_mode: Directory permission mask for shared repository
+          hash_algorithm: Hash algorithm to use (SHA1 or SHA256)
         """
         super().__init__(
             pack_compression_level=pack_compression_level,
@@ -1426,6 +1428,11 @@ class DiskObjectStore(PackBasedObjectStore):
         self.file_mode = file_mode
         self.dir_mode = dir_mode
 
+        # Import here to avoid circular dependency
+        from .hash import get_hash_algorithm
+
+        self.hash_algorithm = hash_algorithm if hash_algorithm else get_hash_algorithm()
+
         # Commit graph support - lazy loaded
         self._commit_graph = None
         self._use_commit_graph = True  # Default to true
@@ -1540,6 +1547,24 @@ class DiskObjectStore(PackBasedObjectStore):
                 (b"repack",), b"writeBitmaps", False
             )
 
+        # Get hash algorithm from config
+        from .hash import get_hash_algorithm
+
+        hash_algorithm = None
+        try:
+            try:
+                version = int(config.get((b"core",), b"repositoryformatversion"))
+            except KeyError:
+                version = 0
+            if version == 1:
+                try:
+                    object_format = config.get((b"extensions",), b"objectformat")
+                except KeyError:
+                    object_format = b"sha1"
+                hash_algorithm = get_hash_algorithm(object_format.decode("ascii"))
+        except (KeyError, ValueError):
+            pass
+
         instance = cls(
             path,
             loose_compression_level=loose_compression_level,
@@ -1557,6 +1582,7 @@ class DiskObjectStore(PackBasedObjectStore):
             pack_write_bitmap_lookup_table=pack_write_bitmap_lookup_table,
             file_mode=file_mode,
             dir_mode=dir_mode,
+            hash_algorithm=hash_algorithm,
         )
         instance._use_commit_graph = use_commit_graph
         instance._use_midx = use_midx
@@ -1647,6 +1673,7 @@ class DiskObjectStore(PackBasedObjectStore):
                     depth=self.pack_depth,
                     threads=self.pack_threads,
                     big_file_threshold=self.pack_big_file_threshold,
+                    hash_algorithm=self.hash_algorithm,
                 )
                 new_packs.append(pack)
                 self._pack_cache[f] = pack
@@ -1698,7 +1725,9 @@ class DiskObjectStore(PackBasedObjectStore):
     def _get_loose_object(self, sha: ObjectID | RawObjectID) -> ShaFile | None:
         path = self._get_shafile_path(sha)
         try:
-            return ShaFile.from_path(path)
+            # Load the object from path with SHA for hash algorithm detection
+            # sha parameter here is already hex, so pass it directly
+            return ShaFile.from_path(path, sha)
         except FileNotFoundError:
             return None
 
@@ -1885,6 +1914,7 @@ class DiskObjectStore(PackBasedObjectStore):
             depth=self.pack_depth,
             threads=self.pack_threads,
             big_file_threshold=self.pack_big_file_threshold,
+            hash_algorithm=self.hash_algorithm,
         )
         final_pack.check_length_and_checksum()
         self._add_cached_pack(pack_base_name, final_pack)
@@ -1964,7 +1994,9 @@ class DiskObjectStore(PackBasedObjectStore):
         Args:
           obj: Object to add
         """
-        path = self._get_shafile_path(obj.id)
+        # Use the correct hash algorithm for the object ID
+        obj_id = obj.get_id(self.hash_algorithm)
+        path = self._get_shafile_path(obj_id)
         dir = os.path.dirname(path)
         try:
             os.mkdir(dir)
@@ -1987,6 +2019,7 @@ class DiskObjectStore(PackBasedObjectStore):
         *,
         file_mode: int | None = None,
         dir_mode: int | None = None,
+        hash_algorithm=None,
     ) -> "DiskObjectStore":
         """Initialize a new disk object store.
 
@@ -1996,6 +2029,7 @@ class DiskObjectStore(PackBasedObjectStore):
           path: Path where the object store should be created
           file_mode: Optional file permission mask for shared repository
           dir_mode: Optional directory permission mask for shared repository
+          hash_algorithm: Hash algorithm to use (SHA1 or SHA256)
 
         Returns:
           New DiskObjectStore instance
@@ -2013,7 +2047,7 @@ class DiskObjectStore(PackBasedObjectStore):
         if dir_mode is not None:
             os.chmod(info_path, dir_mode)
             os.chmod(pack_path, dir_mode)
-        return cls(path, file_mode=file_mode, dir_mode=dir_mode)
+        return cls(path, file_mode=file_mode, dir_mode=dir_mode, hash_algorithm=hash_algorithm)
 
     def iter_prefix(self, prefix: bytes) -> Iterator[ObjectID]:
         """Iterate over all object SHAs with the given prefix.

+ 141 - 25
dulwich/objects.py

@@ -112,6 +112,24 @@ if TYPE_CHECKING:
 
     from .file import _GitFile
 
+# Zero SHA constants for backward compatibility
+ZERO_SHA = b"0" * 40  # SHA1 - kept for backward compatibility
+
+
+def zero_sha_for(hash_algorithm=None):
+    """Get the zero SHA for a given hash algorithm.
+
+    Args:
+        hash_algorithm: HashAlgorithm instance. If None, returns SHA1 zero.
+
+    Returns:
+        Zero SHA as hex bytes (40 chars for SHA1, 64 for SHA256)
+    """
+    if hash_algorithm is None:
+        return ZERO_SHA
+    return hash_algorithm.zero_oid
+
+
 # Header fields for commits
 _TREE_HEADER = b"tree"
 _PARENT_HEADER = b"parent"
@@ -175,13 +193,17 @@ def _decompress(string: bytes) -> bytes:
 def sha_to_hex(sha: RawObjectID) -> ObjectID:
     """Takes a string and returns the hex of the sha within."""
     hexsha = binascii.hexlify(sha)
-    assert len(hexsha) == 40, f"Incorrect length of sha1 string: {hexsha!r}"
+    # Support both SHA1 (40 chars) and SHA256 (64 chars)
+    if len(hexsha) not in (40, 64):
+        raise ValueError(f"Incorrect length of sha string: {hexsha!r}")
     return ObjectID(hexsha)
 
 
 def hex_to_sha(hex: ObjectID | str) -> RawObjectID:
     """Takes a hex sha and returns a binary sha."""
-    assert len(hex) == 40, f"Incorrect length of hexsha: {hex!r}"
+    # Support both SHA1 (40 chars) and SHA256 (64 chars)
+    if len(hex) not in (40, 64):
+        raise ValueError(f"Incorrect length of hexsha: {hex}")
     try:
         return RawObjectID(binascii.unhexlify(hex))
     except TypeError as exc:
@@ -191,15 +213,15 @@ def hex_to_sha(hex: ObjectID | str) -> RawObjectID:
 
 
 def valid_hexsha(hex: bytes | str) -> bool:
-    """Check if a string is a valid hex SHA.
+    """Check if a hex string is a valid SHA1 or SHA256.
 
     Args:
-      hex: Hex string to check
+        hex: Hex string to validate
 
     Returns:
-      True if valid hex SHA, False otherwise
+        True if valid SHA1 (40 chars) or SHA256 (64 chars), False otherwise
     """
-    if len(hex) != 40:
+    if len(hex) not in (40, 64):
         return False
     try:
         binascii.unhexlify(hex)
@@ -549,11 +571,12 @@ class ShaFile:
     ) -> None:
         """Set the contents of this object from a list of chunks."""
         self._chunked_text = chunks
-        self._deserialize(chunks)
+        # Set SHA before deserialization so Tree can detect hash algorithm
         if sha is None:
             self._sha = None
         else:
-            self._sha = FixedSha(sha)
+            self._sha = FixedSha(sha)  # type: ignore
+        self._deserialize(chunks)
         self._needs_serialization = False
 
     @staticmethod
@@ -613,17 +636,21 @@ class ShaFile:
         raise NotImplementedError(self._serialize)
 
     @classmethod
-    def from_path(cls, path: str | bytes) -> "ShaFile":
+    def from_path(cls, path: str | bytes, sha: ObjectID | None = None) -> "ShaFile":
         """Open a SHA file from disk."""
         with GitFile(path, "rb") as f:
-            return cls.from_file(f)
+            return cls.from_file(f, sha)
 
     @classmethod
-    def from_file(cls, f: BufferedIOBase | IO[bytes] | "_GitFile") -> "ShaFile":
+    def from_file(cls, f: BufferedIOBase | IO[bytes] | "_GitFile", sha: ObjectID | None = None) -> "ShaFile":
         """Get the contents of a SHA file on disk."""
         try:
             obj = cls._parse_file(f)
-            obj._sha = None
+            # Set SHA after parsing but before any further processing
+            if sha is not None:
+                obj._sha = FixedSha(sha)
+            else:
+                obj._sha = None
             return obj
         except (IndexError, ValueError) as exc:
             raise ObjectFormatException("invalid object header") from exc
@@ -713,8 +740,21 @@ class ShaFile:
         """Returns the length of the raw string of this object."""
         return sum(map(len, self.as_raw_chunks()))
 
-    def sha(self) -> "FixedSha | HASH":
-        """The SHA1 object that is the name of this object."""
+    def sha(self, hash_algorithm=None) -> "FixedSha | HASH":
+        """The SHA object that is the name of this object.
+
+        Args:
+            hash_algorithm: Optional HashAlgorithm to use. Defaults to SHA1.
+        """
+        # If using a different hash algorithm, always recalculate
+        if hash_algorithm is not None:
+            new_sha = hash_algorithm.new_hash()
+            new_sha.update(self._header())
+            for chunk in self.as_raw_chunks():
+                new_sha.update(chunk)
+            return new_sha
+
+        # Otherwise use cached SHA1 value
         if self._sha is None or self._needs_serialization:
             # this is a local because as_raw_chunks() overwrites self._sha
             new_sha = sha1()
@@ -733,9 +773,32 @@ class ShaFile:
 
     @property
     def id(self) -> ObjectID:
-        """The hex SHA of this object."""
+        """The hex SHA1 of this object.
+
+        For SHA256 repositories, use get_id(hash_algorithm) instead.
+        This property always returns SHA1 for backward compatibility.
+        """
         return ObjectID(self.sha().hexdigest().encode("ascii"))
 
+    def get_id(self, hash_algorithm=None):
+        """Get the hex SHA of this object using the specified hash algorithm.
+
+        Args:
+            hash_algorithm: Optional HashAlgorithm to use. Defaults to SHA1.
+
+        Example:
+            >>> blob = Blob()
+            >>> blob.data = b"Hello, World!"
+            >>> blob.id  # Always returns SHA1 for backward compatibility
+            b'4ab299c8ad6ed14f31923dd94f8b5f5cb89dfb54'
+            >>> blob.get_id()  # Same as .id
+            b'4ab299c8ad6ed14f31923dd94f8b5f5cb89dfb54'
+            >>> from dulwich.hash import SHA256
+            >>> blob.get_id(SHA256)  # Get SHA256 hash
+            b'03ba204e2f2e707...'  # 64-character SHA256
+        """
+        return self.sha(hash_algorithm).hexdigest().encode("ascii")
+
     def __repr__(self) -> str:
         """Return string representation of this object."""
         return f"<{self.__class__.__name__} {self.id!r}>"
@@ -1247,20 +1310,37 @@ class TreeEntry(NamedTuple):
 
 
 def parse_tree(
-    text: bytes, strict: bool = False
+    text: bytes, strict: bool = False, hash_algorithm=None
 ) -> Iterator[tuple[bytes, int, ObjectID]]:
     """Parse a tree text.
 
     Args:
       text: Serialized text to parse
-      strict: If True, enforce strict validation
+      strict: Whether to be strict about format
+      hash_algorithm: Hash algorithm object (SHA1 or SHA256) - if None, auto-detect
     Returns: iterator of tuples of (name, mode, sha)
 
     Raises:
       ObjectFormatException: if the object was malformed in some way
     """
+    if hash_algorithm is not None:
+        sha_len = hash_algorithm.oid_length
+        return _parse_tree_with_sha_len(text, strict, sha_len)
+
+    # Try both hash lengths and use the one that works
+    try:
+        # Try SHA1 first (more common)
+        return _parse_tree_with_sha_len(text, strict, 20)
+    except ObjectFormatException:
+        # If SHA1 fails, try SHA256
+        return _parse_tree_with_sha_len(text, strict, 32)
+
+
+def _parse_tree_with_sha_len(text, strict, sha_len):
+    """Helper function to parse tree with a specific hash length."""
     count = 0
     length = len(text)
+
     while count < length:
         mode_end = text.index(b" ", count)
         mode_text = text[count:mode_end]
@@ -1272,10 +1352,18 @@ def parse_tree(
             raise ObjectFormatException(f"Invalid mode {mode_text!r}") from exc
         name_end = text.index(b"\0", mode_end)
         name = text[mode_end + 1 : name_end]
-        count = name_end + 21
+
+        count = name_end + 1 + sha_len
+        if count > length:
+            raise ObjectFormatException(
+                f"Tree entry extends beyond tree length: {count} > {length}"
+            )
+
         sha = text[name_end + 1 : count]
-        if len(sha) != 20:
-            raise ObjectFormatException("Sha has invalid length")
+        if len(sha) != sha_len:
+            raise ObjectFormatException(
+                f"Sha has invalid length: {len(sha)} != {sha_len}"
+            )
         hexsha = sha_to_hex(RawObjectID(sha))
         yield (name, mode, hexsha)
 
@@ -1386,12 +1474,34 @@ class Tree(ShaFile):
         super().__init__()
         self._entries: dict[bytes, tuple[int, ObjectID]] = {}
 
+    def _get_hash_algorithm(self):
+        """Get the hash algorithm based on the object's SHA."""
+        if not hasattr(self, "_sha") or self._sha is None:
+            return None
+
+        # Get the raw SHA bytes
+        sha = self._sha.digest() if hasattr(self._sha, "digest") else self._sha
+        if not isinstance(sha, bytes):
+            return None
+
+        # Import hash modules lazily to avoid circular imports
+        if len(sha) == 32:
+            from .hash import SHA256
+
+            return SHA256
+        elif len(sha) == 20:
+            from .hash import SHA1
+
+            return SHA1
+        return None
+
     @classmethod
-    def from_path(cls, filename: str | bytes) -> "Tree":
+    def from_path(cls, filename: str | bytes, sha: ObjectID | None = None) -> "Tree":
         """Read a tree from a file on disk.
 
         Args:
           filename: Path to the tree file
+          sha: Optional known SHA for the object
 
         Returns:
           A Tree object
@@ -1399,7 +1509,7 @@ class Tree(ShaFile):
         Raises:
           NotTreeError: If the file is not a tree
         """
-        tree = ShaFile.from_path(filename)
+        tree = ShaFile.from_path(filename, sha)
         if not isinstance(tree, cls):
             raise NotTreeError(_path_to_bytes(filename))
         return tree
@@ -1470,7 +1580,9 @@ class Tree(ShaFile):
     def _deserialize(self, chunks: list[bytes]) -> None:
         """Grab the entries in the tree."""
         try:
-            parsed_entries = parse_tree(b"".join(chunks))
+            parsed_entries = parse_tree(
+                b"".join(chunks), hash_algorithm=self._get_hash_algorithm()
+            )
         except ValueError as exc:
             raise ObjectFormatException(exc) from exc
         # TODO: list comprehension is for efficiency in the common (small)
@@ -1496,8 +1608,12 @@ class Tree(ShaFile):
             # TODO: optionally exclude as in git fsck --strict
             stat.S_IFREG | 0o664,
         )
-        for name, mode, sha in parse_tree(b"".join(self._chunked_text), True):
-            check_hexsha(sha, f"invalid sha {sha!r}")
+        for name, mode, sha in parse_tree(
+            b"".join(self._chunked_text),
+            strict=True,
+            hash_algorithm=self._get_hash_algorithm(),
+        ):
+            check_hexsha(sha, f"invalid sha {sha}")
             if b"/" in name or name in (b"", b".", b"..", b".git"):
                 raise ObjectFormatException(
                     "invalid name {}".format(name.decode("utf-8", "replace"))

+ 84 - 33
dulwich/pack.py

@@ -530,15 +530,16 @@ def iter_sha1(iter: Iterable[bytes]) -> bytes:
     return sha.hexdigest().encode("ascii")
 
 
-def load_pack_index(path: str | os.PathLike[str]) -> "PackIndex":
+def load_pack_index(path: str | os.PathLike[str], hash_algorithm=None) -> "PackIndex":
     """Load an index file by path.
 
     Args:
       path: Path to the index file
+      hash_algorithm: Hash algorithm used by the repository
     Returns: A PackIndex loaded from the given path
     """
     with GitFile(path, "rb") as f:
-        return load_pack_index_file(path, f)
+        return load_pack_index_file(path, f, hash_algorithm=hash_algorithm)
 
 
 def _load_file_contents(
@@ -574,25 +575,35 @@ def _load_file_contents(
 
 def load_pack_index_file(
-    path: str | os.PathLike[str], f: IO[bytes] | _GitFile
+    path: str | os.PathLike[str],
+    f: IO[bytes] | _GitFile,
+    hash_algorithm=None,
 ) -> "PackIndex":
     """Load an index file from a file-like object.
 
     Args:
       path: Path for the index file
       f: File-like object
+      hash_algorithm: Hash algorithm used by the repository
     Returns: A PackIndex loaded from the given file
     """
     contents, size = _load_file_contents(f)
     if contents[:4] == b"\377tOc":
         version = struct.unpack(b">L", contents[4:8])[0]
         if version == 2:
-            return PackIndex2(path, file=f, contents=contents, size=size)
+            return PackIndex2(
+                path,
+                file=f,
+                contents=contents,
+                size=size,
+                hash_algorithm=hash_algorithm,
+            )
         elif version == 3:
             return PackIndex3(path, file=f, contents=contents, size=size)
         else:
             raise KeyError(f"Unknown pack index format {version}")
     else:
-        return PackIndex1(path, file=f, contents=contents, size=size)
+        return PackIndex1(
+            path, file=f, contents=contents, size=size, hash_algorithm=hash_algorithm
+        )
 
 
 def bisect_find_sha(
@@ -777,7 +788,7 @@ class MemoryPackIndex(PackIndex):
           sha: SHA to look up (binary or hex)
         Returns: Offset in the pack file
         """
-        if len(sha) == 40:
+        if len(sha) in (40, 64):  # Hex string (SHA1 or SHA256)
             sha = hex_to_sha(cast(ObjectID, sha))
         return self._by_sha[cast(RawObjectID, sha)]
 
@@ -976,7 +987,8 @@ class FilePackIndex(PackIndex):
         Args:
           sha: A *binary* SHA string. (20 characters long)_
         """
-        assert len(sha) == 20
+        hash_size = getattr(self, "hash_size", 20)  # Default to SHA1 for v1
+        assert len(sha) == hash_size
         idx = ord(sha[:1])
         if idx == 0:
             start = 0
@@ -1020,6 +1032,7 @@ class PackIndex1(FilePackIndex):
         file: IO[bytes] | _GitFile | None = None,
         contents: bytes | None = None,
         size: int | None = None,
+        hash_algorithm=None,
     ) -> None:
         """Initialize a version 1 pack index.
 
@@ -1028,24 +1041,35 @@ class PackIndex1(FilePackIndex):
             file: Optional file object
             contents: Optional mmap'd contents
             size: Optional size of the index
+            hash_algorithm: Hash algorithm used by the repository
         """
         super().__init__(filename, file, contents, size)
         self.version = 1
         self._fan_out_table = self._read_fan_out_table(0)
+        # Use provided hash algorithm if available, otherwise default to SHA1
+        if hash_algorithm:
+            self.hash_size = hash_algorithm.oid_length
+        else:
+            self.hash_size = 20  # Default to SHA1
+
+        self._entry_size = 4 + self.hash_size
 
     def _unpack_entry(self, i: int) -> tuple[RawObjectID, int, None]:
-        (offset, name) = unpack_from(">L20s", self._contents, (0x100 * 4) + (i * 24))
+        base_offset = (0x100 * 4) + (i * self._entry_size)
+        if self.hash_size == 20:
+            (offset, name) = unpack_from(">L20s", self._contents, base_offset)
+        else:  # SHA256
+            offset = unpack_from(">L", self._contents, base_offset)[0]
+            name = self._contents[base_offset + 4 : base_offset + 4 + self.hash_size]
         return (RawObjectID(name), offset, None)
 
     def _unpack_name(self, i: int) -> bytes:
-        offset = (0x100 * 4) + (i * 24) + 4
-        return self._contents[offset : offset + 20]
+        offset = (0x100 * 4) + (i * self._entry_size) + 4
+        return self._contents[offset : offset + self.hash_size]
 
     def _unpack_offset(self, i: int) -> int:
-        offset = (0x100 * 4) + (i * 24)
-        result = unpack_from(">L", self._contents, offset)[0]
-        assert isinstance(result, int)
-        return result
+        offset = (0x100 * 4) + (i * self._entry_size)
+        return unpack_from(">L", self._contents, offset)[0]
 
     def _unpack_crc32_checksum(self, i: int) -> None:
         # Not stored in v1 index files
@@ -1061,6 +1085,7 @@ class PackIndex2(FilePackIndex):
         file: IO[bytes] | _GitFile | None = None,
         contents: bytes | None = None,
         size: int | None = None,
+        hash_algorithm=None,
     ) -> None:
         """Initialize a version 2 pack index.
 
@@ -1069,6 +1094,7 @@ class PackIndex2(FilePackIndex):
             file: Optional file object
             contents: Optional mmap'd contents
             size: Optional size of the index
+            hash_algorithm: Hash algorithm used by the repository
         """
         super().__init__(filename, file, contents, size)
         if self._contents[:4] != b"\377tOc":
@@ -1077,8 +1103,15 @@ class PackIndex2(FilePackIndex):
         if self.version != 2:
             raise AssertionError(f"Version was {self.version}")
         self._fan_out_table = self._read_fan_out_table(8)
+
+        # Use provided hash algorithm if available, otherwise default to SHA-1.
+        # NOTE(review): the ``hash_algorithm`` parameter is annotated ``int | None``
+        # but is used here as an object exposing ``oid_length`` — the annotation
+        # looks inaccurate; confirm the intended type.
+        if hash_algorithm:
+            self.hash_size = hash_algorithm.oid_length
+        else:
+            self.hash_size = 20  # Default to SHA1
+
         self._name_table_offset = 8 + 0x100 * 4
-        self._crc32_table_offset = self._name_table_offset + 20 * len(self)
+        self._crc32_table_offset = self._name_table_offset + self.hash_size * len(self)
         self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
         self._pack_offset_largetable_offset = self._pack_offset_table_offset + 4 * len(
             self
@@ -1092,25 +1125,27 @@ class PackIndex2(FilePackIndex):
         )
 
     def _unpack_name(self, i: int) -> bytes:
-        offset = self._name_table_offset + i * 20
-        return self._contents[offset : offset + 20]
+        offset = self._name_table_offset + i * self.hash_size
+        return self._contents[offset : offset + self.hash_size]
 
     def _unpack_offset(self, i: int) -> int:
-        offset_pos = self._pack_offset_table_offset + i * 4
-        offset = unpack_from(">L", self._contents, offset_pos)[0]
-        assert isinstance(offset, int)
+        offset = self._pack_offset_table_offset + i * 4
+        offset = unpack_from(">L", self._contents, offset)[0]
         if offset & (2**31):
-            large_offset_pos = (
-                self._pack_offset_largetable_offset + (offset & (2**31 - 1)) * 8
-            )
-            offset = unpack_from(">Q", self._contents, large_offset_pos)[0]
-            assert isinstance(offset, int)
+            offset = self._pack_offset_largetable_offset + (offset & (2**31 - 1)) * 8
+            offset = unpack_from(">Q", self._contents, offset)[0]
         return offset
 
     def _unpack_crc32_checksum(self, i: int) -> int:
-        result = unpack_from(">L", self._contents, self._crc32_table_offset + i * 4)[0]
-        assert isinstance(result, int)
-        return result
+        return unpack_from(">L", self._contents, self._crc32_table_offset + i * 4)[0]
+
+    def get_pack_checksum(self) -> bytes:
+        """Return the checksum stored for the corresponding packfile.
+
+        Returns: binary digest of the pack checksum (20 bytes)
+        """
+        # Assumes a 20-byte (SHA-1) trailer checksum. NOTE(review): per git's
+        # hash-function transition design, SHA-256 pack/index files carry
+        # 32-byte trailer checksums, so the [-40:-20] slice below needs
+        # verification for SHA-256 repositories.
+        return bytes(self._contents[-40:-20])
 
 
 class PackIndex3(FilePackIndex):
@@ -3172,6 +3207,8 @@ def write_pack_index_v1(
         f.write(struct.pack(">L", fan_out_table[i]))
         fan_out_table[i + 1] += fan_out_table[i]
     for name, offset, _entry_checksum in entries:
+        if len(name) != 20:
+            raise TypeError("pack index v1 only supports SHA-1 names")
         if not (offset <= 0xFFFFFFFF):
             raise TypeError("pack format 1 only supports offsets < 2Gb")
         f.write(struct.pack(">L20s", offset, name))
@@ -3250,11 +3287,11 @@ def _create_delta_py(base_buf: bytes, target_buf: bytes) -> Iterator[bytes]:
             o = j1
             while s > 127:
                 yield bytes([127])
-                yield bytes(memoryview(target_buf)[o : o + 127])
+                yield memoryview(target_buf)[o : o + 127]
                 s -= 127
                 o += 127
             yield bytes([s])
-            yield bytes(memoryview(target_buf)[o : o + s])
+            yield memoryview(target_buf)[o : o + s]
 
 
 # Default to pure Python implementation
@@ -3357,12 +3394,20 @@ def write_pack_index_v2(
     fan_out_table: dict[int, int] = defaultdict(lambda: 0)
     for name, offset, entry_checksum in entries:
         fan_out_table[ord(name[:1])] += 1
+    try:
+        hash_size = len(next(iter(entries))[0])
+    except StopIteration:
+        hash_size = 20  # Default to SHA-1 size if no entries
     # Fan-out table
     largetable: list[int] = []
     for i in range(0x100):
         f.write(struct.pack(b">L", fan_out_table[i]))
         fan_out_table[i + 1] += fan_out_table[i]
     for name, offset, entry_checksum in entries:
+        if len(name) != hash_size:
+            raise TypeError(
+                f"Object name has wrong length: expected {hash_size}, got {len(name)}"
+            )
         f.write(name)
     for name, offset, entry_checksum in entries:
         f.write(struct.pack(b">L", entry_checksum))
@@ -3512,6 +3557,7 @@ class Pack:
         depth: int | None = None,
         threads: int | None = None,
         big_file_threshold: int | None = None,
+        hash_algorithm: int | None = None,
     ) -> None:
         """Initialize a Pack object.
 
@@ -3524,6 +3570,7 @@ class Pack:
           depth: Maximum depth for delta chains
           threads: Number of threads to use for operations
           big_file_threshold: Size threshold for big file handling
+          hash_algorithm: Hash algorithm identifier (1 = SHA-1, 2 = SHA-256)
         """
         self._basename = basename
         self._data = None
@@ -3549,21 +3596,25 @@ class Pack:
         )
         self._idx_load = lambda: load_pack_index(self._idx_path)
         self.resolve_ext_ref = resolve_ext_ref
+        self.hash_algorithm = (
+            hash_algorithm if hash_algorithm is not None else DEFAULT_HASH_ALGORITHM
+        )
 
     @classmethod
     def from_lazy_objects(
-        cls, data_fn: Callable[[], PackData], idx_fn: Callable[[], PackIndex]
+        cls, data_fn: Callable[[], PackData], idx_fn: Callable[[], PackIndex],
+        hash_algorithm: int | None = None
     ) -> "Pack":
         """Create a new pack object from callables to load pack data and index objects."""
-        ret = cls("")
+        ret = cls("", hash_algorithm=hash_algorithm)
         ret._data_load = data_fn
         ret._idx_load = idx_fn
         return ret
 
     @classmethod
-    def from_objects(cls, data: PackData, idx: PackIndex) -> "Pack":
+    def from_objects(cls, data: PackData, idx: PackIndex, hash_algorithm: int | None = None) -> "Pack":
         """Create a new pack object from pack data and index objects."""
-        ret = cls("")
+        ret = cls("", hash_algorithm=hash_algorithm)
         ret._data = data
         ret._data_load = None
         ret._idx = idx

+ 5 - 3
dulwich/refs.py

@@ -1000,7 +1000,7 @@ class DiskRefsContainer(RefsContainer):
         """Read a reference file and return its contents.
 
         If the reference file is a symbolic reference, only read the first line of
-        the file. Otherwise, only read the first 40 bytes.
+        the file. Otherwise, read the hash (40 bytes for SHA1, 64 bytes for SHA256).
 
         Args:
           name: the refname to read, relative to refpath
@@ -1018,8 +1018,10 @@ class DiskRefsContainer(RefsContainer):
                     # Read only the first line
                     return header + next(iter(f)).rstrip(b"\r\n")
                 else:
-                    # Read only the first 40 bytes
-                    return header + f.read(40 - len(SYMREF))
+                    # Read the entire line to get the full hash (handles both SHA1 and SHA256)
+                    f.seek(0)
+                    line = f.readline().rstrip(b"\r\n")
+                    return line
         except (OSError, UnicodeError):
             # don't assume anything specific about the error; in
             # particular, invalid or forbidden paths can raise weird

+ 96 - 5
dulwich/repo.py

@@ -515,6 +515,7 @@ class BaseRepo:
 
         self._graftpoints: dict[ObjectID, list[ObjectID]] = {}
         self.hooks: dict[str, Hook] = {}
+        self._hash_algorithm = None  # Cached hash algorithm
 
     def _determine_file_mode(self) -> bool:
         """Probe the file-system to determine whether permissions can be trusted.
@@ -537,6 +538,7 @@ class BaseRepo:
         symlinks: bool | None = None,
         format: int | None = None,
         shared_repository: str | bool | None = None,
+        object_format: str | None = None,
     ) -> None:
         """Initialize a default set of named files."""
         from .config import ConfigFile
@@ -544,11 +546,30 @@ class BaseRepo:
         self._put_named_file("description", b"Unnamed repository")
         f = BytesIO()
         cf = ConfigFile()
-        if format is None:
-            format = 0
+
+        # Determine the appropriate format version
+        if object_format == "sha256":
+            # SHA256 requires format version 1
+            if format is None:
+                format = 1
+            elif format != 1:
+                raise ValueError(
+                    "SHA256 object format requires repository format version 1"
+                )
+        else:
+            # SHA1 (default) can use format 0 or 1
+            if format is None:
+                format = 0
+
         if format not in (0, 1):
             raise ValueError(f"Unsupported repository format version: {format}")
+
         cf.set("core", "repositoryformatversion", str(format))
+
+        # Set object format extension if using SHA256
+        if object_format == "sha256":
+            cf.set("extensions", "objectformat", "sha256")
+
         if self._determine_file_mode():
             cf.set("core", "filemode", True)
         else:
@@ -574,6 +595,19 @@ class BaseRepo:
         self._put_named_file("config", f.getvalue())
         self._put_named_file(os.path.join("info", "exclude"), b"")
 
+        # Allow subclasses to handle config initialization
+        self._init_config(cf)
+
+    def _init_config(self, config: "ConfigFile") -> None:
+        """Initialize repository configuration.
+
+        This method can be overridden by subclasses to handle config initialization.
+
+        Args:
+            config: The ConfigFile object that was just created
+        """
+        # Default implementation does nothing
+
     def get_named_file(self, path: str) -> BinaryIO | None:
         """Get a file from the control dir with a specific name.
 
@@ -912,6 +946,42 @@ class BaseRepo:
         """
         raise NotImplementedError(self.get_config)
 
+    def get_hash_algorithm(self):
+        """Get the hash algorithm used by this repository.
+
+        Returns: HashAlgorithm instance (SHA1 or SHA256)
+        """
+        if self._hash_algorithm is None:
+            from .hash import get_hash_algorithm
+
+            # Check if repository uses SHA256
+            try:
+                config = self.get_config()
+                try:
+                    version = int(config.get(("core",), "repositoryformatversion"))
+                except KeyError:
+                    version = 0  # Default version is 0
+
+                if version == 1:
+                    # Check for SHA256 extension
+                    try:
+                        object_format = config.get(("extensions",), "objectformat")
+                        if object_format == b"sha256":
+                            self._hash_algorithm = get_hash_algorithm("sha256")
+                        else:
+                            self._hash_algorithm = get_hash_algorithm("sha1")
+                    except KeyError:
+                        # No objectformat extension, default to SHA1
+                        self._hash_algorithm = get_hash_algorithm("sha1")
+                else:
+                    # Version 0 always uses SHA1
+                    self._hash_algorithm = get_hash_algorithm("sha1")
+            except (KeyError, ValueError):
+                # If we can't read config, default to SHA1
+                self._hash_algorithm = get_hash_algorithm("sha1")
+
+        return self._hash_algorithm
+
     def get_worktree_config(self) -> "ConfigFile":
         """Retrieve the worktree config object."""
         raise NotImplementedError(self.get_worktree_config)
@@ -1103,7 +1173,7 @@ class BaseRepo:
         """
         if not isinstance(name, bytes):
             raise TypeError(f"'name' must be bytestring, not {type(name).__name__:.80}")
-        if len(name) in (20, 40):
+        if len(name) in (20, 32, 40, 64):  # Support both SHA1 and SHA256
             try:
                 # Try as ObjectID/RawObjectID
                 return self.object_store[
@@ -1424,7 +1494,7 @@ class Repo(BaseRepo):
                     has_reftable_extension = True
                 else:
                     raise UnsupportedExtension(f"refStorage = {value.decode()}")
-            elif extension.lower() not in (b"worktreeconfig",):
+            elif extension.lower() not in (b"worktreeconfig", b"objectformat"):
                 raise UnsupportedExtension(extension.decode("utf-8"))
 
         if object_store is None:
@@ -2055,6 +2125,7 @@ class Repo(BaseRepo):
         symlinks: bool | None = None,
         format: int | None = None,
         shared_repository: str | bool | None = None,
+        object_format: str | None = None,
     ) -> "Repo":
         path = os.fspath(path)
         if isinstance(path, bytes):
@@ -2077,10 +2148,17 @@ class Repo(BaseRepo):
                 os.chmod(dir_path, dir_mode)
 
         if object_store is None:
+            # Get hash algorithm for object store
+            from .hash import get_hash_algorithm
+
+            hash_alg = get_hash_algorithm(
+                "sha256" if object_format == "sha256" else "sha1"
+            )
             object_store = DiskObjectStore.init(
                 os.path.join(controldir, OBJECTDIR),
                 file_mode=file_mode,
                 dir_mode=dir_mode,
+                hash_algorithm=hash_alg,
             )
         ret = cls(path, bare=bare, object_store=object_store)
         if default_branch is None:
@@ -2098,6 +2176,7 @@ class Repo(BaseRepo):
             symlinks=symlinks,
             format=format,
             shared_repository=shared_repository,
+            object_format=object_format,
         )
         return ret
 
@@ -2112,6 +2191,7 @@ class Repo(BaseRepo):
         symlinks: bool | None = None,
         format: int | None = None,
         shared_repository: str | bool | None = None,
+        object_format: str | None = None,
     ) -> "Repo":
         """Create a new repository.
 
@@ -2123,6 +2203,7 @@ class Repo(BaseRepo):
           symlinks: Whether to support symlinks
           format: Repository format version (defaults to 0)
           shared_repository: Shared repository setting (group, all, umask, or octal)
+          object_format: Object format to use ("sha1" or "sha256", defaults to "sha1")
         Returns: `Repo` instance
         """
         path = os.fspath(path)
@@ -2142,6 +2223,7 @@ class Repo(BaseRepo):
             symlinks=symlinks,
             format=format,
             shared_repository=shared_repository,
+            object_format=object_format,
         )
 
     @classmethod
@@ -2213,6 +2295,7 @@ class Repo(BaseRepo):
         default_branch: bytes | None = None,
         format: int | None = None,
         shared_repository: str | bool | None = None,
+        object_format: str | None = None,
     ) -> "Repo":
         """Create a new bare repository.
 
@@ -2226,6 +2309,7 @@ class Repo(BaseRepo):
           default_branch: Default branch name
           format: Repository format version (defaults to 0)
           shared_repository: Shared repository setting (group, all, umask, or octal)
+          object_format: Object format to use ("sha1" or "sha256", defaults to "sha1")
         Returns: a `Repo` instance
         """
         path = os.fspath(path)
@@ -2242,6 +2326,7 @@ class Repo(BaseRepo):
             default_branch=default_branch,
             format=format,
             shared_repository=shared_repository,
+            object_format=object_format,
         )
 
     create = init_bare
@@ -2551,6 +2636,10 @@ class MemoryRepo(BaseRepo):
         """
         raise NoIndexPresent
 
+    def _init_config(self, config: "ConfigFile") -> None:
+        """Initialize repository configuration for MemoryRepo."""
+        self._config = config
+
     def get_config(self) -> "ConfigFile":
         """Retrieve the config object.
 
@@ -2739,6 +2828,7 @@ class MemoryRepo(BaseRepo):
         objects: Iterable[ShaFile],
         refs: Mapping[Ref, ObjectID],
         format: int | None = None,
+        object_format: str | None = None,
     ) -> "MemoryRepo":
         """Create a new bare repository in memory.
 
@@ -2748,11 +2838,12 @@ class MemoryRepo(BaseRepo):
           refs: Refs as dictionary, mapping names
             to object SHA1s
           format: Repository format version (defaults to 0)
+          object_format: Object format to use ("sha1" or "sha256", defaults to "sha1")
         """
         ret = cls()
         for obj in objects:
             ret.object_store.add_object(obj)
         for refname, sha in refs.items():
             ret.refs.add_if_new(refname, sha)
-        ret._init_files(bare=True, format=format)
+        ret._init_files(bare=True, format=format, object_format=object_format)
         return ret

+ 1 - 1
dulwich/tests/utils.py

@@ -114,7 +114,7 @@ def make_object(cls: type[T], **attrs: Any) -> T:
         if name == "id":
             # id property is read-only, so we overwrite sha instead.
             sha = FixedSha(value)
-            obj.sha = lambda: sha
+            obj.sha = lambda hash_algorithm=None: sha
         else:
             setattr(obj, name, value)
     return obj

+ 2 - 0
tests/compat/__init__.py

@@ -41,6 +41,8 @@ def test_suite() -> unittest.TestSuite:
         "reftable",
         "repository",
         "server",
+        "sha256",
+        "sha256_packs",
         "utils",
         "web",
     ]

+ 367 - 0
tests/compat/test_sha256.py

@@ -0,0 +1,367 @@
+# test_sha256.py -- Compatibility tests for SHA256 support
+# Copyright (C) 2024 The Dulwich contributors
+#
+# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
+# General Public License as published by the Free Software Foundation; version 2.0
+# or (at your option) any later version. You can redistribute it and/or
+# modify it under the terms of either of these two licenses.
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# You should have received a copy of the licenses; if not, see
+# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
+# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
+# License, Version 2.0.
+#
+
+"""Compatibility tests for SHA256 support with git command line tools."""
+
+import os
+import tempfile
+
+from dulwich.hash import SHA256
+from dulwich.objects import Blob, Commit, Tree
+from dulwich.repo import Repo
+
+from .utils import CompatTestCase, run_git_or_fail
+
+
+class GitSHA256CompatibilityTests(CompatTestCase):
+    """Test SHA256 compatibility with git command line tools."""
+
+    min_git_version = (2, 29, 0)
+
+    def _run_git(self, args, cwd=None):
+        """Run git command in the specified directory."""
+        return run_git_or_fail(args, cwd=cwd)
+
+    def test_sha256_repo_creation_compat(self):
+        """Test that dulwich-created SHA256 repos are readable by git."""
+        # Create SHA256 repo with dulwich
+        repo_path = tempfile.mkdtemp()
+        self.addCleanup(lambda: __import__("shutil").rmtree(repo_path))
+        repo = Repo.init(repo_path, mkdir=False, object_format="sha256")
+
+        # Add a blob and tree using dulwich
+        blob = Blob.from_string(b"Hello SHA256 world!")
+        tree = Tree()
+        tree.add(b"hello.txt", 0o100644, blob.get_id(SHA256))
+
+        # Create objects in the repository
+        object_store = repo.object_store
+        object_store.add_object(blob)
+        object_store.add_object(tree)
+
+        repo.close()
+
+        # Verify git can read the repository
+        config_output = self._run_git(
+            ["config", "--get", "extensions.objectformat"], cwd=repo_path
+        )
+        self.assertEqual(config_output.strip(), b"sha256")
+
+        # Verify git recognizes it as a SHA256 repository
+        rev_parse_output = self._run_git(
+            ["rev-parse", "--show-object-format"], cwd=repo_path
+        )
+        self.assertEqual(rev_parse_output.strip(), b"sha256")
+
+    def test_git_created_sha256_repo_readable(self):
+        """Test that git-created SHA256 repos are readable by dulwich."""
+        # Create SHA256 repo with git
+        repo_path = tempfile.mkdtemp()
+        self.addCleanup(lambda: __import__("shutil").rmtree(repo_path))
+        self._run_git(["init", "--object-format=sha256", repo_path])
+
+        # Create a file and commit with git
+        test_file = os.path.join(repo_path, "test.txt")
+        with open(test_file, "w") as f:
+            f.write("Test SHA256 content")
+
+        self._run_git(["add", "test.txt"], cwd=repo_path)
+        self._run_git(["commit", "-m", "Test SHA256 commit"], cwd=repo_path)
+
+        # Read with dulwich
+        repo = Repo(repo_path)
+
+        # Verify dulwich detects SHA256
+        hash_alg = repo.get_hash_algorithm()
+        self.assertEqual(hash_alg, SHA256)
+
+        # Verify dulwich can read objects
+        # Try both main and master branches (git default changed over time)
+        try:
+            head_ref = repo.refs[b"refs/heads/main"]
+        except KeyError:
+            head_ref = repo.refs[b"refs/heads/master"]
+        self.assertEqual(len(head_ref), 64)  # SHA256 length
+
+        # Read the commit object
+        commit = repo[head_ref]
+        self.assertIsInstance(commit, Commit)
+        self.assertEqual(len(commit.tree), 64)  # SHA256 tree ID
+
+        repo.close()
+
+    def test_object_hashing_consistency(self):
+        """Test that object hashing is consistent between dulwich and git."""
+        # Create SHA256 repo with git
+        repo_path = tempfile.mkdtemp()
+        self.addCleanup(lambda: __import__("shutil").rmtree(repo_path))
+        self._run_git(["init", "--object-format=sha256", repo_path])
+
+        # Create a test file with known content
+        test_content = b"Test content for SHA256 hashing consistency"
+        test_file = os.path.join(repo_path, "test.txt")
+        with open(test_file, "wb") as f:
+            f.write(test_content)
+
+        # Get git's hash for the content
+        git_hash = self._run_git(["hash-object", "test.txt"], cwd=repo_path)
+        git_hash = git_hash.strip().decode("ascii")
+
+        # Create same blob with dulwich
+        blob = Blob.from_string(test_content)
+        dulwich_hash = blob.get_id(SHA256).decode("ascii")
+
+        # Hashes should match
+        self.assertEqual(git_hash, dulwich_hash)
+
+    def test_tree_hashing_consistency(self):
+        """Test that tree hashing is consistent between dulwich and git."""
+        # Create SHA256 repo with git
+        repo_path = tempfile.mkdtemp()
+        self.addCleanup(lambda: __import__("shutil").rmtree(repo_path))
+        self._run_git(["init", "--object-format=sha256", repo_path])
+
+        # Create a test file and add to index
+        test_content = b"Tree test content"
+        test_file = os.path.join(repo_path, "tree_test.txt")
+        with open(test_file, "wb") as f:
+            f.write(test_content)
+
+        self._run_git(["add", "tree_test.txt"], cwd=repo_path)
+
+        # Get git's tree hash
+        git_tree_hash = self._run_git(["write-tree"], cwd=repo_path)
+        git_tree_hash = git_tree_hash.strip().decode("ascii")
+
+        # Create same tree with dulwich
+        blob = Blob.from_string(test_content)
+        tree = Tree()
+        tree.add(b"tree_test.txt", 0o100644, blob.get_id(SHA256))
+
+        dulwich_tree_hash = tree.get_id(SHA256).decode("ascii")
+
+        # Tree hashes should match
+        self.assertEqual(git_tree_hash, dulwich_tree_hash)
+
+    def test_commit_creation_interop(self):
+        """Test commit creation interoperability between dulwich and git."""
+        # Create SHA256 repo with dulwich
+        repo_path = tempfile.mkdtemp()
+        self.addCleanup(lambda: __import__("shutil").rmtree(repo_path))
+        repo = Repo.init(repo_path, mkdir=False, object_format="sha256")
+
+        # Create objects with dulwich
+        blob = Blob.from_string(b"Interop test content")
+        tree = Tree()
+        tree.add(b"interop.txt", 0o100644, blob.get_id(SHA256))
+
+        commit = Commit()
+        commit.tree = tree.get_id(SHA256)
+        commit.author = commit.committer = b"Test User <test@example.com>"
+        commit.commit_time = commit.author_time = 1234567890
+        commit.commit_timezone = commit.author_timezone = 0
+        commit.message = b"Test SHA256 commit from dulwich"
+
+        # Add objects to repo
+        object_store = repo.object_store
+        object_store.add_object(blob)
+        object_store.add_object(tree)
+        object_store.add_object(commit)
+
+        # Update HEAD
+        commit_id = commit.get_id(SHA256)
+        repo.refs[b"refs/heads/master"] = commit_id
+        repo.close()
+
+        # Verify git can read the commit
+        commit_hash = self._run_git(["rev-parse", "HEAD"], cwd=repo_path)
+        commit_hash = commit_hash.strip().decode("ascii")
+        self.assertEqual(len(commit_hash), 64)  # SHA256 length
+
+        # Verify git can show the commit
+        commit_message = self._run_git(["log", "--format=%s", "-n", "1"], cwd=repo_path)
+        self.assertEqual(commit_message.strip(), b"Test SHA256 commit from dulwich")
+
+        # Verify git can list the tree
+        tree_content = self._run_git(["ls-tree", "HEAD"], cwd=repo_path)
+        self.assertIn(b"interop.txt", tree_content)
+
+    def test_ref_updates_interop(self):
+        """Test that ref updates work between dulwich and git."""
+        # Create repo with git
+        repo_path = tempfile.mkdtemp()
+        self.addCleanup(lambda: __import__("shutil").rmtree(repo_path))
+        self._run_git(["init", "--object-format=sha256", repo_path])
+
+        # Create initial commit with git
+        test_file = os.path.join(repo_path, "initial.txt")
+        with open(test_file, "w") as f:
+            f.write("Initial content")
+
+        self._run_git(["add", "initial.txt"], cwd=repo_path)
+        self._run_git(["commit", "-m", "Initial commit"], cwd=repo_path)
+
+        initial_commit = self._run_git(["rev-parse", "HEAD"], cwd=repo_path)
+        initial_commit = initial_commit.strip()
+
+        # Update ref with dulwich
+        repo = Repo(repo_path)
+
+        # Create new commit with dulwich
+        blob = Blob.from_string(b"New content from dulwich")
+        tree = Tree()
+        tree.add(b"dulwich.txt", 0o100644, blob.get_id(SHA256))
+
+        commit = Commit()
+        commit.tree = tree.get_id(SHA256)
+        commit.parents = [initial_commit]
+        commit.author = commit.committer = b"Dulwich User <dulwich@example.com>"
+        commit.commit_time = commit.author_time = 1234567891
+        commit.commit_timezone = commit.author_timezone = 0
+        commit.message = b"Commit from dulwich"
+
+        # Add objects and update ref
+        object_store = repo.object_store
+        object_store.add_object(blob)
+        object_store.add_object(tree)
+        object_store.add_object(commit)
+
+        new_commit_hash = commit.get_id(SHA256)
+        repo.refs[b"refs/heads/master"] = new_commit_hash
+        repo.close()
+
+        # Verify git sees the update
+        current_commit = self._run_git(["rev-parse", "HEAD"], cwd=repo_path)
+        current_commit = current_commit.strip().decode("ascii")
+        self.assertEqual(current_commit, new_commit_hash.decode("ascii"))
+
+        # Verify git can access the new tree
+        tree_listing = self._run_git(["ls-tree", "HEAD"], cwd=repo_path)
+        self.assertIn(b"dulwich.txt", tree_listing)
+
+    def test_clone_sha256_repo_git_to_dulwich(self):
+        """Test cloning a git SHA256 repository with dulwich."""
+        # Create source repo with git
+        source_path = tempfile.mkdtemp()
+        self.addCleanup(lambda: __import__("shutil").rmtree(source_path))
+        self._run_git(["init", "--object-format=sha256", source_path])
+
+        # Add content
+        test_file = os.path.join(source_path, "clone_test.txt")
+        with open(test_file, "w") as f:
+            f.write("Content to be cloned")
+
+        self._run_git(["add", "clone_test.txt"], cwd=source_path)
+        self._run_git(["commit", "-m", "Initial commit"], cwd=source_path)
+
+        # Clone with dulwich
+        target_path = tempfile.mkdtemp()
+        self.addCleanup(lambda: __import__("shutil").rmtree(target_path))
+
+        target_repo = Repo.init(target_path, mkdir=False, object_format="sha256")
+
+        # Copy objects (simplified clone)
+        source_repo = Repo(source_path)
+
+        # Copy all objects
+        for obj_id in source_repo.object_store:
+            obj = source_repo.object_store[obj_id]
+            target_repo.object_store.add_object(obj)
+
+        # Copy refs
+        for ref_name in source_repo.refs.keys():
+            ref_id = source_repo.refs[ref_name]
+            target_repo.refs[ref_name] = ref_id
+
+        # Set HEAD
+        target_repo.refs.set_symbolic_ref(b"HEAD", b"refs/heads/master")
+
+        source_repo.close()
+        target_repo.close()
+
+        # Verify with git
+        output = self._run_git(["rev-parse", "--show-object-format"], cwd=target_path)
+        self.assertEqual(output.strip(), b"sha256")
+
+        # Verify content
+        self._run_git(["checkout", "HEAD", "--", "."], cwd=target_path)
+        cloned_file = os.path.join(target_path, "clone_test.txt")
+        with open(cloned_file) as f:
+            content = f.read()
+        self.assertEqual(content, "Content to be cloned")
+
+    def test_fsck_sha256_repo(self):
+        """Test that git fsck works on dulwich-created SHA256 repos."""
+        # Create repo with dulwich
+        repo_path = tempfile.mkdtemp()
+        self.addCleanup(lambda: __import__("shutil").rmtree(repo_path))
+        repo = Repo.init(repo_path, mkdir=False, object_format="sha256")
+
+        # Create a more complex object graph
+        # Multiple blobs
+        blobs = []
+        for i in range(5):
+            blob = Blob.from_string(f"Blob content {i}".encode())
+            repo.object_store.add_object(blob)
+            blobs.append(blob)
+
+        # Multiple trees
+        subtree = Tree()
+        subtree.add(b"subfile1.txt", 0o100644, blobs[0].get_id(SHA256))
+        subtree.add(b"subfile2.txt", 0o100644, blobs[1].get_id(SHA256))
+        repo.object_store.add_object(subtree)
+
+        main_tree = Tree()
+        main_tree.add(b"file1.txt", 0o100644, blobs[2].get_id(SHA256))
+        main_tree.add(b"file2.txt", 0o100644, blobs[3].get_id(SHA256))
+        main_tree.add(b"subdir", 0o040000, subtree.get_id(SHA256))
+        repo.object_store.add_object(main_tree)
+
+        # Create commits
+        commit1 = Commit()
+        commit1.tree = main_tree.get_id(SHA256)
+        commit1.author = commit1.committer = b"Test <test@example.com>"
+        commit1.commit_time = commit1.author_time = 1234567890
+        commit1.commit_timezone = commit1.author_timezone = 0
+        commit1.message = b"First commit"
+        repo.object_store.add_object(commit1)
+
+        commit2 = Commit()
+        commit2.tree = main_tree.get_id(SHA256)
+        commit2.parents = [commit1.get_id(SHA256)]
+        commit2.author = commit2.committer = b"Test <test@example.com>"
+        commit2.commit_time = commit2.author_time = 1234567891
+        commit2.commit_timezone = commit2.author_timezone = 0
+        commit2.message = b"Second commit"
+        repo.object_store.add_object(commit2)
+
+        # Set refs
+        repo.refs[b"refs/heads/master"] = commit2.get_id(SHA256)
+        repo.refs[b"refs/heads/branch1"] = commit1.get_id(SHA256)
+
+        repo.close()
+
+        # Run git fsck
+        fsck_output = self._run_git(["fsck", "--full"], cwd=repo_path)
+        # fsck should not report any errors (empty output or success message)
+        self.assertNotIn(b"error", fsck_output.lower())
+        self.assertNotIn(b"missing", fsck_output.lower())
+        self.assertNotIn(b"broken", fsck_output.lower())

+ 330 - 0
tests/compat/test_sha256_packs.py

@@ -0,0 +1,330 @@
+# test_sha256_packs.py -- Compatibility tests for SHA256 pack files
+# Copyright (C) 2024 The Dulwich contributors
+#
+# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
+# General Public License as published by the Free Software Foundation; version 2.0
+# or (at your option) any later version. You can redistribute it and/or
+# modify it under the terms of either of these two licenses.
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# You should have received a copy of the licenses; if not, see
+# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
+# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
+# License, Version 2.0.
+#
+
+"""Compatibility tests for SHA256 pack files with git command line tools."""
+
+import os
+import tempfile
+
+from dulwich.hash import SHA256
+from dulwich.objects import Blob, Commit, Tree
+from dulwich.pack import load_pack_index_file
+from dulwich.repo import Repo
+
+from .utils import CompatTestCase, run_git_or_fail
+
+
+class GitSHA256PackCompatibilityTests(CompatTestCase):
+    """Test SHA256 pack file compatibility with git command line tools."""
+
+    min_git_version = (2, 29, 0)
+
+    def _run_git(self, args, cwd=None):
+        """Run git command in the specified directory."""
+        return run_git_or_fail(args, cwd=cwd)
+
+    def test_git_pack_readable_by_dulwich(self):
+        """Test that git-created SHA256 pack files are readable by dulwich."""
+        # Create SHA256 repo with git
+        repo_path = tempfile.mkdtemp()
+        self.addCleanup(lambda: __import__("shutil").rmtree(repo_path))
+        self._run_git(["init", "--object-format=sha256", repo_path])
+
+        # Create multiple files to ensure pack creation
+        for i in range(20):
+            test_file = os.path.join(repo_path, f"file{i}.txt")
+            with open(test_file, "w") as f:
+                f.write(f"Content for file {i}\n")
+
+        self._run_git(["add", "."], cwd=repo_path)
+        self._run_git(["commit", "-m", "Add 20 files"], cwd=repo_path)
+
+        # Force pack creation
+        self._run_git(["gc"], cwd=repo_path)
+
+        # Open with dulwich
+        repo = Repo(repo_path)
+        self.assertEqual(repo.get_hash_algorithm(), SHA256)
+
+        # Find pack files
+        pack_dir = os.path.join(repo_path, ".git", "objects", "pack")
+        pack_files = [f for f in os.listdir(pack_dir) if f.endswith(".pack")]
+        self.assertGreater(len(pack_files), 0, "No pack files created")
+
+        # Read pack with dulwich
+        for pack_file in pack_files:
+            pack_path = os.path.join(pack_dir, pack_file)
+            idx_path = pack_path[:-5] + ".idx"
+
+            # Load pack index with SHA256 algorithm
+            with open(idx_path, "rb") as f:
+                pack_idx = load_pack_index_file(
+                    idx_path, f, hash_algorithm=repo.get_hash_algorithm()
+                )
+
+            # Verify it's detected as SHA256
+            self.assertEqual(pack_idx.hash_size, 32)
+
+            # Verify we can iterate objects
+            obj_count = 0
+            for sha, offset, crc32 in pack_idx.iterentries():
+                self.assertEqual(len(sha), 32)  # SHA256
+                obj_count += 1
+
+            self.assertGreater(obj_count, 20)  # At least our files + trees + commit
+
+        # Verify we can read all objects through the repo interface
+        head_ref = repo.refs[b"refs/heads/master"]
+        commit = repo[head_ref]
+        self.assertIsInstance(commit, Commit)
+
+        # Read the tree
+        tree = repo[commit.tree]
+        self.assertIsInstance(tree, Tree)
+
+        # Verify all files are there
+        file_count = 0
+        for name, mode, sha in tree.items():
+            if name.startswith(b"file") and name.endswith(b".txt"):
+                file_count += 1
+                # Read the blob
+                blob = repo[sha]
+                self.assertIsInstance(blob, Blob)
+
+        self.assertEqual(file_count, 20)
+        repo.close()
+
+    def test_dulwich_objects_readable_by_git(self):
+        """Test that dulwich-created SHA256 objects are readable by git."""
+        # Create SHA256 repo with dulwich
+        repo_path = tempfile.mkdtemp()
+        self.addCleanup(lambda: __import__("shutil").rmtree(repo_path))
+        repo = Repo.init(repo_path, mkdir=False, object_format="sha256")
+
+        # Create objects
+        blobs = []
+        for i in range(10):
+            blob = Blob.from_string(f"Dulwich blob content {i}".encode())
+            repo.object_store.add_object(blob)
+            blobs.append(blob)
+
+        # Create a tree with all blobs
+        tree = Tree()
+        for i, blob in enumerate(blobs):
+            tree.add(f"blob{i}.txt".encode(), 0o100644, blob.get_id(SHA256))
+        repo.object_store.add_object(tree)
+
+        # Create a commit
+        commit = Commit()
+        commit.tree = tree.get_id(SHA256)
+        commit.author = commit.committer = b"Dulwich Test <test@dulwich.org>"
+        commit.commit_time = commit.author_time = 1234567890
+        commit.commit_timezone = commit.author_timezone = 0
+        commit.message = b"Test commit with blobs"
+        repo.object_store.add_object(commit)
+
+        # Update HEAD
+        repo.refs[b"refs/heads/master"] = commit.get_id(SHA256)
+        repo.close()
+
+        # Verify git can read all objects
+        output = self._run_git(["rev-parse", "HEAD"], cwd=repo_path)
+        self.assertEqual(len(output.strip()), 64)  # SHA256
+
+        # List tree contents
+        tree_output = self._run_git(["ls-tree", "HEAD"], cwd=repo_path)
+        # Count lines instead of occurrences of "blob" since "blob" appears twice per line
+        lines = tree_output.strip().split(b"\n")
+        self.assertEqual(len(lines), 10)
+
+        # Verify git can check out the content
+        self._run_git(["checkout", "HEAD", "--", "."], cwd=repo_path)
+
+        # Verify files exist with correct content
+        for i in range(10):
+            file_path = os.path.join(repo_path, f"blob{i}.txt")
+            self.assertTrue(os.path.exists(file_path))
+            with open(file_path, "rb") as f:
+                content = f.read()
+                self.assertEqual(content, f"Dulwich blob content {i}".encode())
+
+    def test_pack_index_v1_interop(self):
+        """Test pack index v1 interoperability with SHA256."""
+        # Create repo with git using pack index v1
+        repo_path = tempfile.mkdtemp()
+        self.addCleanup(lambda: __import__("shutil").rmtree(repo_path))
+        self._run_git(["init", "--object-format=sha256", repo_path])
+        self._run_git(["config", "pack.indexVersion", "1"], cwd=repo_path)
+
+        # Create files
+        for i in range(10):
+            test_file = os.path.join(repo_path, f"v1test{i}.txt")
+            with open(test_file, "w") as f:
+                f.write(f"Pack v1 test {i}\n")
+
+        self._run_git(["add", "."], cwd=repo_path)
+        self._run_git(["commit", "-m", "Test pack v1"], cwd=repo_path)
+        self._run_git(["gc"], cwd=repo_path)
+
+        # Read with dulwich
+        repo = Repo(repo_path)
+
+        # Find pack index
+        pack_dir = os.path.join(repo_path, ".git", "objects", "pack")
+        idx_files = [f for f in os.listdir(pack_dir) if f.endswith(".idx")]
+
+        for idx_file in idx_files:
+            idx_path = os.path.join(pack_dir, idx_file)
+            with open(idx_path, "rb") as f:
+                pack_idx = load_pack_index_file(
+                    idx_path, f, hash_algorithm=repo.get_hash_algorithm()
+                )
+
+            # Verify it's v1 with SHA256
+            self.assertEqual(pack_idx.version, 1)
+            self.assertEqual(pack_idx.hash_size, 32)
+
+            # Verify we can iterate
+            for sha, offset, crc32 in pack_idx.iterentries():
+                self.assertEqual(len(sha), 32)
+                self.assertIsNone(crc32)  # v1 doesn't store CRC32
+
+        repo.close()
+
+    def test_large_pack_interop(self):
+        """Test large pack file interoperability."""
+        # Create repo with dulwich
+        repo_path = tempfile.mkdtemp()
+        self.addCleanup(lambda: __import__("shutil").rmtree(repo_path))
+        repo = Repo.init(repo_path, mkdir=False, object_format="sha256")
+
+        # Create a large file that will use delta compression
+        large_content = b"A" * 10000
+        blobs = []
+
+        # Create similar blobs to trigger delta compression
+        for i in range(10):
+            content = large_content + f" variation {i}".encode()
+            blob = Blob.from_string(content)
+            repo.object_store.add_object(blob)
+            blobs.append(blob)
+
+        # Create tree
+        tree = Tree()
+        for i, blob in enumerate(blobs):
+            tree.add(f"large{i}.txt".encode(), 0o100644, blob.get_id(SHA256))
+        repo.object_store.add_object(tree)
+
+        # Create commit
+        commit = Commit()
+        commit.tree = tree.get_id(SHA256)
+        commit.author = commit.committer = b"Test <test@example.com>"
+        commit.commit_time = commit.author_time = 1234567890
+        commit.commit_timezone = commit.author_timezone = 0
+        commit.message = b"Large files for delta compression test"
+        repo.object_store.add_object(commit)
+
+        repo.refs[b"refs/heads/master"] = commit.get_id(SHA256)
+        repo.close()
+
+        # Run git gc to create packs with delta compression
+        self._run_git(["gc", "--aggressive"], cwd=repo_path)
+
+        # Verify git created a pack
+        pack_dir = os.path.join(repo_path, ".git", "objects", "pack")
+        pack_files = [f for f in os.listdir(pack_dir) if f.endswith(".pack")]
+        self.assertGreater(len(pack_files), 0)
+
+        # Re-open with dulwich and verify we can read everything
+        repo = Repo(repo_path)
+        head = repo.refs[b"refs/heads/master"]
+        commit = repo[head]
+        tree = repo[commit.tree]
+
+        # Read all blobs
+        for i in range(10):
+            name = f"large{i}.txt".encode()
+            mode, sha = tree[name]
+            blob = repo[sha]
+            expected = large_content + f" variation {i}".encode()
+            self.assertEqual(blob.data, expected)
+
+        repo.close()
+
+    def test_mixed_loose_packed_objects(self):
+        """Test repositories with both loose and packed objects."""
+        # Create repo with git
+        repo_path = tempfile.mkdtemp()
+        self.addCleanup(lambda: __import__("shutil").rmtree(repo_path))
+        self._run_git(["init", "--object-format=sha256", repo_path])
+
+        # Create initial objects that will be packed
+        for i in range(5):
+            test_file = os.path.join(repo_path, f"packed{i}.txt")
+            with open(test_file, "w") as f:
+                f.write(f"Will be packed {i}\n")
+
+        self._run_git(["add", "."], cwd=repo_path)
+        self._run_git(["commit", "-m", "Initial packed objects"], cwd=repo_path)
+        self._run_git(["gc"], cwd=repo_path)
+
+        # Create more objects that will remain loose
+        for i in range(5):
+            test_file = os.path.join(repo_path, f"loose{i}.txt")
+            with open(test_file, "w") as f:
+                f.write(f"Will stay loose {i}\n")
+
+        self._run_git(["add", "."], cwd=repo_path)
+        self._run_git(["commit", "-m", "Loose objects"], cwd=repo_path)
+
+        # Open with dulwich
+        repo = Repo(repo_path)
+
+        # Count objects in packs vs loose
+        pack_dir = os.path.join(repo_path, ".git", "objects", "pack")
+        pack_count = len([f for f in os.listdir(pack_dir) if f.endswith(".pack")])
+        self.assertGreater(pack_count, 0)
+
+        # Verify we can read all objects
+        head = repo.refs[b"refs/heads/master"]
+        commit = repo[head]
+
+        # Walk the commit history
+        commit_count = 0
+        while commit.parents:
+            commit_count += 1
+            tree = repo[commit.tree]
+            # Verify we can read the tree
+            self.assertGreater(len(tree), 0)
+
+            if commit.parents:
+                commit = repo[commit.parents[0]]
+            else:
+                break
+
+        self.assertEqual(commit_count, 1)  # Only the non-root commit is counted (loop requires parents)
+        repo.close()
+
+
+if __name__ == "__main__":
+    import unittest
+
+    unittest.main()

+ 213 - 0
tests/test_sha256.py

@@ -0,0 +1,213 @@
+# test_sha256.py -- Tests for SHA256 support
+# Copyright (C) 2024 The Dulwich contributors
+#
+# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
+# General Public License as published by the Free Software Foundation; version 2.0
+# or (at your option) any later version. You can redistribute it and/or
+# modify it under the terms of either of these two licenses.
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# You should have received a copy of the licenses; if not, see
+# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
+# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
+# License, Version 2.0.
+#
+
+"""Tests for SHA256 support in Dulwich."""
+
+import os
+import shutil
+import tempfile
+import unittest
+
+from dulwich.hash import SHA1, SHA256, get_hash_algorithm
+from dulwich.objects import Blob, Tree, valid_hexsha, zero_sha_for
+from dulwich.repo import MemoryRepo, Repo
+
+
+class HashAlgorithmTests(unittest.TestCase):
+    """Tests for the hash algorithm abstraction."""
+
+    def test_sha1_properties(self):
+        """Test SHA1 algorithm properties."""
+        alg = SHA1
+        self.assertEqual(alg.name, "sha1")
+        self.assertEqual(alg.oid_length, 20)
+        self.assertEqual(alg.hex_length, 40)
+        self.assertEqual(len(alg.zero_oid), 40)
+        self.assertEqual(len(alg.zero_oid_bin), 20)
+
+    def test_sha256_properties(self):
+        """Test SHA256 algorithm properties."""
+        alg = SHA256
+        self.assertEqual(alg.name, "sha256")
+        self.assertEqual(alg.oid_length, 32)
+        self.assertEqual(alg.hex_length, 64)
+        self.assertEqual(len(alg.zero_oid), 64)
+        self.assertEqual(len(alg.zero_oid_bin), 32)
+
+    def test_get_hash_algorithm(self):
+        """Test getting hash algorithms by name."""
+        self.assertEqual(get_hash_algorithm("sha1"), SHA1)
+        self.assertEqual(get_hash_algorithm("sha256"), SHA256)
+        self.assertEqual(get_hash_algorithm(None), SHA1)  # Default
+
+        with self.assertRaises(ValueError):
+            get_hash_algorithm("invalid")
+
+
+class ObjectHashingTests(unittest.TestCase):
+    """Tests for object hashing with different algorithms."""
+
+    def test_blob_sha1(self):
+        """Test blob hashing with SHA1."""
+        blob = Blob()
+        blob.data = b"Hello, World!"
+
+        # Default should be SHA1
+        sha1_id = blob.id
+        self.assertEqual(len(sha1_id), 40)
+        self.assertTrue(valid_hexsha(sha1_id))
+
+    def test_blob_sha256(self):
+        """Test blob hashing with SHA256."""
+        blob = Blob()
+        blob.data = b"Hello, World!"
+
+        # Get SHA256 hash
+        sha256_id = blob.get_id(SHA256)
+        self.assertEqual(len(sha256_id), 64)
+        self.assertTrue(valid_hexsha(sha256_id))
+
+        # SHA256 ID should be different from SHA1
+        sha1_id = blob.id
+        self.assertNotEqual(sha1_id, sha256_id)
+
+        # Verify .id property returns SHA1 for backward compatibility
+        self.assertEqual(blob.id, sha1_id)
+        self.assertEqual(blob.get_id(), sha1_id)  # Default should be SHA1
+
+    def test_tree_sha256(self):
+        """Test tree hashing with SHA256."""
+        tree = Tree()
+        tree.add(b"file.txt", 0o100644, b"a" * 40)  # SHA1 hex
+
+        # Get SHA1 (default)
+        sha1_id = tree.id
+        self.assertEqual(len(sha1_id), 40)
+
+        # Get SHA256
+        sha256_id = tree.get_id(SHA256)
+        self.assertEqual(len(sha256_id), 64)
+
+        # Verify they're different
+        self.assertNotEqual(sha1_id, sha256_id)
+
+    def test_valid_hexsha(self):
+        """Test hex SHA validation for both algorithms."""
+        # Valid SHA1
+        self.assertTrue(valid_hexsha(b"1234567890abcdef1234567890abcdef12345678"))
+
+        # Valid SHA256
+        self.assertTrue(
+            valid_hexsha(
+                b"1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef"
+            )
+        )
+
+        # Invalid lengths
+        self.assertFalse(valid_hexsha(b"1234"))
+        self.assertFalse(
+            valid_hexsha(b"1234567890abcdef1234567890abcdef123456")
+        )  # 38 chars
+
+        # Invalid characters
+        self.assertFalse(valid_hexsha(b"123456789gabcdef1234567890abcdef12345678"))
+
+    def test_zero_sha_for(self):
+        """Test getting zero SHA for different algorithms."""
+        # Default (SHA1)
+        self.assertEqual(zero_sha_for(), b"0" * 40)
+        self.assertEqual(zero_sha_for(None), b"0" * 40)
+
+        # SHA1 explicit
+        self.assertEqual(zero_sha_for(SHA1), b"0" * 40)
+
+        # SHA256
+        self.assertEqual(zero_sha_for(SHA256), b"0" * 64)
+
+
+class RepositorySHA256Tests(unittest.TestCase):
+    """Tests for SHA256 repository support."""
+
+    def setUp(self):
+        """Set up test repository directory."""
+        self.test_dir = tempfile.mkdtemp()
+
+    def tearDown(self):
+        """Clean up test repository."""
+        shutil.rmtree(self.test_dir)
+
+    def test_init_sha256_repo(self):
+        """Test initializing a SHA256 repository."""
+        repo_path = os.path.join(self.test_dir, "sha256_repo")
+        repo = Repo.init(repo_path, mkdir=True, object_format="sha256")
+
+        # Check repository format version
+        config = repo.get_config()
+        self.assertEqual(config.get(("core",), "repositoryformatversion"), b"1")
+
+        # Check object format extension
+        self.assertEqual(config.get(("extensions",), "objectformat"), b"sha256")
+
+        # Check hash algorithm detection
+        hash_alg = repo.get_hash_algorithm()
+        self.assertEqual(hash_alg, SHA256)
+
+        repo.close()
+
+    def test_init_sha1_repo(self):
+        """Test initializing a SHA1 repository (default)."""
+        repo_path = os.path.join(self.test_dir, "sha1_repo")
+        repo = Repo.init(repo_path, mkdir=True)
+
+        # Check repository format version
+        config = repo.get_config()
+        self.assertEqual(config.get(("core",), "repositoryformatversion"), b"0")
+
+        # Object format extension should not exist
+        with self.assertRaises(KeyError):
+            config.get(("extensions",), "objectformat")
+
+        # Check hash algorithm detection
+        hash_alg = repo.get_hash_algorithm()
+        self.assertEqual(hash_alg, SHA1)
+
+        repo.close()
+
+    def test_format_version_validation(self):
+        """Test format version validation for SHA256."""
+        repo_path = os.path.join(self.test_dir, "invalid_repo")
+
+        # SHA256 with format version 0 should fail
+        with self.assertRaises(ValueError) as cm:
+            Repo.init(repo_path, mkdir=True, format=0, object_format="sha256")
+        self.assertIn("SHA256", str(cm.exception))
+
+    def test_memory_repo_sha256(self):
+        """Test SHA256 support in memory repository."""
+        repo = MemoryRepo.init_bare([], {}, object_format="sha256")
+
+        # Check hash algorithm
+        hash_alg = repo.get_hash_algorithm()
+        self.assertEqual(hash_alg, SHA256)
+
+
+if __name__ == "__main__":
+    unittest.main()

+ 128 - 0
tests/test_sha256_pack.py

@@ -0,0 +1,128 @@
+# test_sha256_pack.py -- Tests for SHA256 pack support
+# Copyright (C) 2024 The Dulwich contributors
+#
+# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
+# General Public License as published by the Free Software Foundation; version 2.0
+# or (at your option) any later version. You can redistribute it and/or
+# modify it under the terms of either of these two licenses.
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# You should have received a copy of the licenses; if not, see
+# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
+# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
+# License, Version 2.0.
+#
+
+"""Tests for SHA256 pack support in Dulwich."""
+
+import shutil
+import tempfile
+import unittest
+from io import BytesIO
+
+from dulwich.hash import SHA256
+from dulwich.pack import (
+    load_pack_index_file,
+    write_pack_index_v2,
+)
+
+
+class SHA256PackTests(unittest.TestCase):
+    """Tests for SHA256 pack support."""
+
+    def setUp(self):
+        """Set up test repository directory."""
+        self.test_dir = tempfile.mkdtemp()
+
+    def tearDown(self):
+        """Clean up test repository."""
+        shutil.rmtree(self.test_dir)
+
+    def test_pack_index_v2_with_sha256(self):
+        """Test that pack index v2 correctly handles SHA256 hashes."""
+        # Create SHA256 entries manually (simulating what would happen in a SHA256 repo)
+        entries = []
+        for i in range(5):
+            # Create a fake SHA256 hash
+            sha256_hash = SHA256.hash_func(f"test object {i}".encode()).digest()
+            offset = i * 1000  # Fake offsets
+            crc32 = i  # Fake CRC32
+            entries.append((sha256_hash, offset, crc32))
+
+        # Sort entries by SHA (required for pack index)
+        entries.sort(key=lambda e: e[0])
+
+        # Write SHA256 pack index with a SHA1 pack checksum (NOTE(review): git's hash-function-transition spec uses the repo hash for pack trailers -- confirm SHA1 is intended here)
+        index_buf = BytesIO()
+        from hashlib import sha1
+
+        pack_checksum = sha1(b"fake pack data").digest()
+        write_pack_index_v2(index_buf, entries, pack_checksum)
+
+        # Load and verify the index
+        index_buf.seek(0)
+        pack_idx = load_pack_index_file("<memory>", index_buf)
+
+        # Check that the index loaded correctly
+        self.assertEqual(len(pack_idx), 5)
+        self.assertEqual(pack_idx.version, 2)
+
+        # Verify hash_size detection
+        self.assertEqual(pack_idx.hash_size, 32)
+
+        # Verify we can look up objects by SHA256
+        for sha256_hash, offset, _ in entries:
+            # This should not raise KeyError
+            found_offset = pack_idx.object_offset(sha256_hash)
+            self.assertEqual(found_offset, offset)
+
+    def test_pack_index_v1_with_sha256(self):
+        """Test that pack index v1 correctly handles SHA256 hashes."""
+        # Create SHA256 entries manually
+        entries = []
+        for i in range(5):
+            # Create a fake SHA256 hash
+            sha256_hash = SHA256.hash_func(f"test v1 object {i}".encode()).digest()
+            offset = i * 1000  # Fake offsets
+            crc32 = None  # v1 doesn't store CRC32
+            entries.append((sha256_hash, offset, crc32))
+
+        # Sort entries by SHA (required for pack index)
+        entries.sort(key=lambda e: e[0])
+
+        # Import write_pack_index_v1
+        from dulwich.pack import write_pack_index_v1
+
+        # Write SHA256 pack index v1 with SHA1 pack checksum
+        index_buf = BytesIO()
+        from hashlib import sha1
+
+        pack_checksum = sha1(b"fake v1 pack data").digest()
+        write_pack_index_v1(index_buf, entries, pack_checksum)
+
+        # Load and verify the index
+        index_buf.seek(0)
+        pack_idx = load_pack_index_file("<memory>", index_buf)
+
+        # Check that the index loaded correctly
+        self.assertEqual(len(pack_idx), 5)
+        self.assertEqual(pack_idx.version, 1)
+
+        # Verify hash_size detection
+        self.assertEqual(pack_idx.hash_size, 32)
+
+        # Verify we can look up objects by SHA256
+        for sha256_hash, offset, _ in entries:
+            # This should not raise KeyError
+            found_offset = pack_idx.object_offset(sha256_hash)
+            self.assertEqual(found_offset, offset)
+
+
+if __name__ == "__main__":
+    unittest.main()