Explorar o código

Add Rust implementation of pack delta creation.

This adds a Rust implementation of create_delta that uses the `similar`
crate for efficient diff computation. The implementation is exported
from the _pack extension module and will be used automatically when
the extension is available.
Jelmer Vernooij hai 2 meses
pai
achega
a8f5dbf165
Modificáronse 6 ficheiros con 637 adicións e 9 borrados
  1. 10 3
      Cargo.lock
  2. 4 0
      NEWS
  3. 1 0
      crates/pack/Cargo.toml
  4. 430 1
      crates/pack/src/lib.rs
  5. 21 4
      dulwich/pack.py
  6. 171 1
      tests/test_pack.py

+ 10 - 3
Cargo.lock

@@ -10,7 +10,7 @@ checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
 
 [[package]]
 name = "diff-tree-py"
-version = "0.24.7"
+version = "0.24.8"
 dependencies = [
  "pyo3",
 ]
@@ -50,7 +50,7 @@ dependencies = [
 
 [[package]]
 name = "objects-py"
-version = "0.24.7"
+version = "0.24.8"
 dependencies = [
  "memchr",
  "pyo3",
@@ -64,10 +64,11 @@ checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
 
 [[package]]
 name = "pack-py"
-version = "0.24.7"
+version = "0.24.8"
 dependencies = [
  "memchr",
  "pyo3",
+ "similar",
 ]
 
 [[package]]
@@ -155,6 +156,12 @@ dependencies = [
  "proc-macro2",
 ]
 
+[[package]]
+name = "similar"
+version = "2.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bbbb5d9659141646ae647b42fe094daf6c6192d1620870b449d9557f748b2daa"
+
 [[package]]
 name = "syn"
 version = "2.0.107"

+ 4 - 0
NEWS

@@ -1,5 +1,9 @@
 0.24.8	UNRELEASED
 
+ * Add Rust implementation of pack delta creation (create_delta). The
+   implementation uses the similar crate for efficient diff computation.
+   (Jelmer Vernooij)
+
 0.24.7	2025-10-23
 
  * Add sparse index support for improved performance with large repositories.

+ 1 - 0
crates/pack/Cargo.toml

@@ -9,3 +9,4 @@ crate-type = ["cdylib"]
 [dependencies]
 pyo3 = { workspace = true, features = ["extension-module"]}
 memchr = "2"
+similar = "2"

+ 430 - 1
crates/pack/src/lib.rs

@@ -18,6 +18,9 @@
  * License, Version 2.0.
  */
 
+// Allow PyO3 macro-generated interior mutable constants
+#![allow(clippy::declare_interior_mutable_const)]
+
 use pyo3::exceptions::{PyTypeError, PyValueError};
 use pyo3::prelude::*;
 use pyo3::types::{PyBytes, PyList};
@@ -182,7 +185,12 @@ fn apply_delta(py: Python, py_src_buf: Py<PyAny>, py_delta: Py<PyAny>) -> PyResu
                 cp_size = 0x10000;
             }
 
-            if cp_off + cp_size < cp_size || cp_off + cp_size > src_size || cp_size > dest_size {
+            // Check for overflow and bounds
+            if cp_size > src_size
+                || cp_off > src_size
+                || cp_off > src_size - cp_size
+                || cp_size > dest_size
+            {
                 break;
             }
 
@@ -218,9 +226,430 @@ fn apply_delta(py: Python, py_src_buf: Py<PyAny>, py_delta: Py<PyAny>) -> PyResu
     Ok(vec![PyBytes::new(py, &out).into()])
 }
 
+/// Encode a size for delta headers as a varint: 7 bits per byte, lowest group first, 0x80 set on all but the last byte.
+/// This is intended to match Python's _delta_encode_size function.
+fn delta_encode_size(mut size: usize) -> Vec<u8> {
+    let mut ret = Vec::new();
+    let mut c = (size & 0x7F) as u8;
+    size >>= 7;
+    while size > 0 {
+        ret.push(c | 0x80); // continuation bit: more bytes follow
+        c = (size & 0x7F) as u8;
+        size >>= 7;
+    }
+    ret.push(c); // final byte has the high bit clear
+    ret
+}
+
+/// Largest length emitted for a single copy operation. Copy lengths are encoded in at most
+/// two bytes by this encoder, so to copy more, we use several copy operations.
+const MAX_COPY_LEN: usize = 0xFFFF;
+
+/// Encode a delta copy operation: a command byte (0x80 set) followed by the non-zero bytes of `start`, then of `length`.
+/// This is intended to match Python's _encode_copy_operation function.
+fn encode_copy_operation(start: usize, length: usize) -> Vec<u8> {
+    let mut scratch = vec![0x80u8];
+
+    // Encode offset (4 bytes max, low byte first); zero bytes are omitted, bits 0-3 of the command byte flag the ones present.
+    for i in 0..4 { // NOTE(review): bits of `start` above the low 32 are never encoded
+        if start & (0xFF << (i * 8)) != 0 {
+            scratch.push(((start >> (i * 8)) & 0xFF) as u8);
+            scratch[0] |= 1 << i;
+        }
+    }
+
+    // Encode length (2 bytes for version 2 packs, low byte first); bits 4-5 of the command byte flag the ones present.
+    for i in 0..2 {
+        if length & (0xFF << (i * 8)) != 0 {
+            scratch.push(((length >> (i * 8)) & 0xFF) as u8);
+            scratch[0] |= 1 << (4 + i);
+        }
+    }
+
+    scratch
+}
+
+/// Create a delta (two size headers, then copy/insert operations) that transforms base_buf into target_buf.
+/// Matching byte runs come from the `similar` crate's Myers diff, in the role of
+/// Python's difflib.SequenceMatcher. Panics if the diff ops are not contiguous.
+fn create_delta_internal(base_buf: &[u8], target_buf: &[u8]) -> Vec<u8> {
+    let mut result = Vec::new();
+
+    // Write delta header: encoded base size, then encoded target size
+    result.extend(delta_encode_size(base_buf.len()));
+    result.extend(delta_encode_size(target_buf.len()));
+
+    // Use the `similar` crate to compute the diff at byte level
+    let ops = similar::capture_diff_slices(similar::Algorithm::Myers, base_buf, target_buf);
+
+    let mut old_pos = 0; // bytes of base_buf accounted for so far
+    let mut new_pos = 0; // bytes of target_buf accounted for so far
+
+    for op in ops {
+        match op {
+            similar::DiffOp::Equal {
+                old_index,
+                new_index,
+                len,
+            } => {
+                // Sanity check: the op must start exactly where the previous one ended
+                assert_eq!(old_index, old_pos);
+                assert_eq!(new_index, new_pos);
+
+                // Emit copy operations from base_buf, split into pieces of at most MAX_COPY_LEN
+                let mut copy_start = old_index;
+                let mut copy_len = len;
+
+                while copy_len > 0 {
+                    let to_copy = copy_len.min(MAX_COPY_LEN);
+                    result.extend(encode_copy_operation(copy_start, to_copy));
+                    copy_start += to_copy;
+                    copy_len -= to_copy;
+                }
+
+                old_pos += len;
+                new_pos += len;
+            }
+            similar::DiffOp::Delete {
+                old_index, old_len, ..
+            } => {
+                // Git delta format doesn't care about deletes from base: nothing is emitted, only the base cursor advances
+                assert_eq!(old_index, old_pos);
+                old_pos += old_len;
+            }
+            similar::DiffOp::Insert {
+                new_index, new_len, ..
+            } => {
+                // Emit literal data from target_buf as length-prefixed chunks
+                assert_eq!(new_index, new_pos);
+
+                let data = &target_buf[new_index..new_index + new_len];
+                let mut remaining = data.len();
+                let mut offset = 0;
+
+                while remaining > 0 {
+                    let chunk_size = remaining.min(127); // the length byte must stay below 0x80, which marks a copy command
+                    result.push(chunk_size as u8);
+                    result.extend_from_slice(&data[offset..offset + chunk_size]);
+                    offset += chunk_size;
+                    remaining -= chunk_size;
+                }
+
+                new_pos += new_len;
+            }
+            similar::DiffOp::Replace {
+                old_index,
+                old_len,
+                new_index,
+                new_len,
+            } => {
+                // For replace operations, we delete from old and insert from new.
+                // Git delta format doesn't care about deletes, so just emit the insert (same chunking as Insert above).
+                assert_eq!(old_index, old_pos);
+                assert_eq!(new_index, new_pos);
+
+                let data = &target_buf[new_index..new_index + new_len];
+                let mut remaining = data.len();
+                let mut offset = 0;
+
+                while remaining > 0 {
+                    let chunk_size = remaining.min(127); // the length byte must stay below 0x80, which marks a copy command
+                    result.push(chunk_size as u8);
+                    result.extend_from_slice(&data[offset..offset + chunk_size]);
+                    offset += chunk_size;
+                    remaining -= chunk_size;
+                }
+
+                old_pos += old_len;
+                new_pos += new_len;
+            }
+        }
+    }
+
+    result
+}
+
+#[pyfunction] // Python entry point: builds the delta from base to target and returns it as one bytes object
+fn create_delta(
+    py: Python,
+    py_base_buf: Py<PyAny>,
+    py_target_buf: Py<PyAny>,
+) -> PyResult<Py<PyBytes>> {
+    let base_buf = py_chunked_as_string(py, &py_base_buf)?; // helper defined outside this hunk; presumably accepts bytes or a list of bytes chunks — confirm
+    let target_buf = py_chunked_as_string(py, &py_target_buf)?; // conversion errors propagate to the caller via `?`
+
+    let delta = create_delta_internal(base_buf.as_ref(), target_buf.as_ref());
+
+    Ok(PyBytes::new(py, &delta).into())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_delta_encode_size_zero() {
+        assert_eq!(delta_encode_size(0), vec![0]);
+    }
+
+    #[test]
+    fn test_delta_encode_size_small() {
+        // Values that fit in 7 bits (0-127)
+        assert_eq!(delta_encode_size(1), vec![1]);
+        assert_eq!(delta_encode_size(127), vec![127]);
+    }
+
+    #[test]
+    fn test_delta_encode_size_medium() {
+        // Values that need 2 bytes (128-16383)
+        assert_eq!(delta_encode_size(128), vec![0x80, 0x01]);
+        assert_eq!(delta_encode_size(256), vec![0x80, 0x02]);
+        assert_eq!(delta_encode_size(16383), vec![0xFF, 0x7F]);
+    }
+
+    #[test]
+    fn test_delta_encode_size_large() {
+        // Values that need 3 bytes (16384-2097151)
+        assert_eq!(delta_encode_size(16384), vec![0x80, 0x80, 0x01]);
+        assert_eq!(delta_encode_size(65536), vec![0x80, 0x80, 0x04]);
+    }
+
+    #[test]
+    fn test_delta_encode_size_very_large() {
+        // Larger values: 2^20 still fits in 3 bytes, 2^24 needs 4 bytes
+        assert_eq!(delta_encode_size(1048576), vec![0x80, 0x80, 0x40]); // 1MB = 2^20
+        assert_eq!(delta_encode_size(16777216), vec![0x80, 0x80, 0x80, 0x08]); // 16MB = 2^24
+    }
+
+    #[test]
+    fn test_get_delta_header_size_basic() {
+        // Test decoding various encoded sizes
+        let mut index = 0;
+        let delta = vec![0x00];
+        assert_eq!(get_delta_header_size(&delta, &mut index, delta.len()), 0);
+        assert_eq!(index, 1);
+
+        let mut index = 0;
+        let delta = vec![0x01];
+        assert_eq!(get_delta_header_size(&delta, &mut index, delta.len()), 1);
+        assert_eq!(index, 1);
+
+        let mut index = 0;
+        let delta = vec![127];
+        assert_eq!(get_delta_header_size(&delta, &mut index, delta.len()), 127);
+        assert_eq!(index, 1);
+    }
+
+    #[test]
+    fn test_get_delta_header_size_multibyte() {
+        // Test decoding multi-byte sizes
+        let mut index = 0;
+        let delta = vec![0x80, 0x01];
+        assert_eq!(get_delta_header_size(&delta, &mut index, delta.len()), 128);
+        assert_eq!(index, 2);
+
+        let mut index = 0;
+        let delta = vec![0x80, 0x02];
+        assert_eq!(get_delta_header_size(&delta, &mut index, delta.len()), 256);
+        assert_eq!(index, 2);
+
+        let mut index = 0;
+        let delta = vec![0x80, 0x80, 0x01];
+        assert_eq!(
+            get_delta_header_size(&delta, &mut index, delta.len()),
+            16384
+        );
+        assert_eq!(index, 3);
+    }
+
+    #[test]
+    fn test_delta_encode_decode_roundtrip() {
+        // Test that encoding and decoding are inverse operations
+        let test_values = vec![0, 1, 127, 128, 255, 256, 1000, 16384, 65536, 1048576];
+
+        for value in test_values {
+            let encoded = delta_encode_size(value);
+            let mut index = 0;
+            let decoded = get_delta_header_size(&encoded, &mut index, encoded.len());
+            assert_eq!(
+                decoded, value,
+                "Roundtrip failed for value {}: encoded {:?}, decoded {}",
+                value, encoded, decoded
+            );
+            assert_eq!(index, encoded.len());
+        }
+    }
+
+    #[test]
+    fn test_encode_copy_operation_zero_offset() {
+        // Copy from offset 0
+        let result = encode_copy_operation(0, 10);
+        // Should have copy bit set
+        assert_eq!(result[0] & 0x80, 0x80);
+        // Should encode length 10
+        assert_eq!(result[0] & 0x10, 0x10); // Length bit 0 set
+        assert_eq!(result[1], 10);
+        assert_eq!(result.len(), 2);
+    }
+
+    #[test]
+    fn test_encode_copy_operation_small_offset() {
+        // Copy from offset 100, length 20
+        let result = encode_copy_operation(100, 20);
+        assert_eq!(result[0] & 0x80, 0x80); // Copy bit
+        assert_eq!(result[0] & 0x01, 0x01); // Offset byte 0 present
+        assert_eq!(result[0] & 0x10, 0x10); // Length byte 0 present
+        assert_eq!(result[1], 100); // Offset byte 0
+        assert_eq!(result[2], 20); // Length byte 0
+        assert_eq!(result.len(), 3);
+    }
+
+    #[test]
+    fn test_encode_copy_operation_large_offset() {
+        // Copy from offset 0x12345, length 0x678
+        let result = encode_copy_operation(0x12345, 0x678);
+        assert_eq!(result[0] & 0x80, 0x80); // Copy bit
+        assert_eq!(result[0] & 0x07, 0x07); // Offset bytes 0,1,2 present
+        assert_eq!(result[0] & 0x30, 0x30); // Length bytes 0,1 present
+        assert_eq!(result[1], 0x45); // Offset byte 0
+        assert_eq!(result[2], 0x23); // Offset byte 1
+        assert_eq!(result[3], 0x01); // Offset byte 2
+        assert_eq!(result[4], 0x78); // Length byte 0
+        assert_eq!(result[5], 0x06); // Length byte 1
+        assert_eq!(result.len(), 6);
+    }
+
+    #[test]
+    fn test_encode_copy_operation_max_offset() {
+        // Test maximum offset (needs 4 bytes)
+        let max_offset = 0xFFFFFFFF;
+        let result = encode_copy_operation(max_offset, 1);
+        assert_eq!(result[0] & 0x80, 0x80); // Copy bit
+        assert_eq!(result[0] & 0x0F, 0x0F); // All 4 offset bytes present
+        assert_eq!(result[1], 0xFF); // Offset byte 0
+        assert_eq!(result[2], 0xFF); // Offset byte 1
+        assert_eq!(result[3], 0xFF); // Offset byte 2
+        assert_eq!(result[4], 0xFF); // Offset byte 3
+        assert_eq!(result.len(), 6); // 1 cmd + 4 offset + 1 length
+    }
+
+    #[test]
+    fn test_encode_copy_operation_max_length() {
+        // Test maximum length for version 2 packs (0xFFFF)
+        let result = encode_copy_operation(0, MAX_COPY_LEN);
+        assert_eq!(result[0] & 0x80, 0x80); // Copy bit
+        assert_eq!(result[0] & 0x30, 0x30); // Both length bytes present
+        assert_eq!(result[1], 0xFF); // Length byte 0
+        assert_eq!(result[2], 0xFF); // Length byte 1
+        assert_eq!(result.len(), 3);
+    }
+
+    #[test]
+    fn test_encode_copy_operation_various_lengths() {
+        // Test different length values to ensure correct encoding
+        // Note: only non-zero bytes are encoded
+
+        // Length 1: byte0=1 -> only bit 4 set
+        let result = encode_copy_operation(0, 1);
+        assert_eq!(result[0] & 0x80, 0x80);
+        assert_eq!(result[0] & 0x30, 0x10);
+        assert_eq!(result[1], 1);
+
+        // Length 255 (0xFF): byte0=0xFF, byte1=0 -> only bit 4 set
+        let result = encode_copy_operation(0, 255);
+        assert_eq!(result[0] & 0x80, 0x80);
+        assert_eq!(result[0] & 0x30, 0x10);
+        assert_eq!(result[1], 0xFF);
+
+        // Length 256 (0x100): byte0=0, byte1=1 -> only bit 5 set
+        let result = encode_copy_operation(0, 256);
+        assert_eq!(result[0] & 0x80, 0x80);
+        assert_eq!(result[0] & 0x30, 0x20); // Only second length byte
+        assert_eq!(result[1], 1);
+
+        // Length 1000 (0x3E8): byte0=0xE8, byte1=3 -> both bits set
+        let result = encode_copy_operation(0, 1000);
+        assert_eq!(result[0] & 0x80, 0x80);
+        assert_eq!(result[0] & 0x30, 0x30); // Both length bytes
+        assert_eq!(result[1], 0xE8);
+        assert_eq!(result[2], 0x03);
+
+        // Length 0xFFFF: byte0=0xFF, byte1=0xFF -> both bits set
+        let result = encode_copy_operation(0, 0xFFFF);
+        assert_eq!(result[0] & 0x80, 0x80);
+        assert_eq!(result[0] & 0x30, 0x30); // Both length bytes
+        assert_eq!(result[1], 0xFF);
+        assert_eq!(result[2], 0xFF);
+    }
+
+    #[test]
+    fn test_create_delta_identical() {
+        // Delta between identical buffers should be minimal
+        let base = b"hello world";
+        let target = b"hello world";
+        let delta = create_delta_internal(base, target);
+
+        // Should have header (2 size encodings) plus copy operations
+        assert!(delta.len() < base.len()); // Delta should be smaller than full data
+    }
+
+    #[test]
+    fn test_create_delta_completely_different() {
+        // Delta between completely different buffers
+        let base = b"aaaaaaaaaa";
+        let target = b"bbbbbbbbbb";
+        let delta = create_delta_internal(base, target);
+
+        // Should have header plus insert operations with the new data
+        assert!(delta.len() > 0);
+    }
+
+    #[test]
+    fn test_create_and_apply_delta() {
+        // Check the size headers of a created delta (the delta is not actually applied in this test)
+        let base = b"The quick brown fox jumps over the lazy dog";
+        let target = b"The quick brown cat jumps over the lazy dog";
+
+        // Create delta
+        let delta = create_delta_internal(base, target);
+
+        // Decode the two leading size headers: base size first, then target size
+        let mut index = 0;
+        let src_size = get_delta_header_size(&delta, &mut index, delta.len());
+        assert_eq!(src_size, base.len());
+
+        let dest_size = get_delta_header_size(&delta, &mut index, delta.len());
+        assert_eq!(dest_size, target.len());
+
+        // Only non-emptiness is asserted; the delta's size is not compared against the target
+        assert!(delta.len() > 0);
+    }
+
+    #[test]
+    fn test_create_delta_with_insertion() {
+        let base = b"hello";
+        let target = b"hello world";
+        let delta = create_delta_internal(base, target);
+
+        // Should have a copy operation for "hello" and insert for " world"
+        assert!(delta.len() > 0);
+    }
+
+    #[test]
+    fn test_create_delta_with_deletion() {
+        let base = b"hello world";
+        let target = b"hello";
+        let delta = create_delta_internal(base, target);
+
+        // Should have a copy operation for "hello" only
+        assert!(delta.len() > 0);
+    }
+}
+
 #[pymodule]
 fn _pack(_py: Python, m: &Bound<PyModule>) -> PyResult<()> {
     m.add_function(wrap_pyfunction!(bisect_find_sha, m)?)?;
     m.add_function(wrap_pyfunction!(apply_delta, m)?)?;
+    m.add_function(wrap_pyfunction!(create_delta, m)?)?; // registers the Rust create_delta on the module
     Ok(())
 }

+ 21 - 4
dulwich/pack.py

@@ -2774,11 +2774,11 @@ def pack_objects_to_data(
       progress: Optional progress reporting callback
     Returns: Tuples with (type_num, hexdigest, delta base, object chunks)
     """
-    # TODO(jelmer): support deltaifying
     count = len(objects)
     if deltify is None:
-        # PERFORMANCE/TODO(jelmer): This should be enabled but is *much* too
-        # slow at the moment.
+        # PERFORMANCE/TODO(jelmer): This should be enabled but the python
+        # implementation is *much* too slow at the moment.
+        # Maybe consider enabling it just if the rust extension is available?
         deltify = False
     if deltify:
         return (
@@ -3145,7 +3145,7 @@ def _encode_copy_operation(start: int, length: int) -> bytes:
     return bytes(scratch)
 
 
-def create_delta(base_buf: bytes, target_buf: bytes) -> Iterator[bytes]:
+def _create_delta_py(base_buf: bytes, target_buf: bytes) -> Iterator[bytes]:
     """Use python difflib to work out how to transform base_buf to target_buf.
 
     Args:
@@ -3191,6 +3191,10 @@ def create_delta(base_buf: bytes, target_buf: bytes) -> Iterator[bytes]:
             yield bytes(memoryview(target_buf)[o : o + s])
 
 
+# Default to pure Python implementation
+create_delta = _create_delta_py
+
+
 def apply_delta(
     src_buf: Union[bytes, list[bytes]], delta: Union[bytes, list[bytes]]
 ) -> list[bytes]:
@@ -3913,3 +3917,16 @@ try:
     )
 except ImportError:
     pass
+
+# Try to import the Rust version of create_delta
+try:
+    from dulwich._pack import create_delta as _create_delta_rs
+except ImportError:
+    pass
+else:
+    # Wrap the Rust version to match the Python API: Rust returns a single bytes object, while the Python API is an Iterator[bytes]
+    def _create_delta_rs_wrapper(base_buf: bytes, target_buf: bytes) -> Iterator[bytes]:
+        """Wrapper for Rust create_delta to match Python API."""
+        yield _create_delta_rs(base_buf, target_buf)
+
+    create_delta = _create_delta_rs_wrapper

+ 171 - 1
tests/test_pack.py

@@ -46,6 +46,7 @@ from dulwich.pack import (
     PackStreamReader,
     UnpackedObject,
     UnresolvedDeltas,
+    _create_delta_py,
     _delta_encode_size,
     _encode_copy_operation,
     apply_delta,
@@ -62,7 +63,17 @@ from dulwich.pack import (
     write_pack_index_v3,
     write_pack_object,
 )
-from dulwich.tests.utils import build_pack, make_object
+from dulwich.tests.utils import (
+    build_pack,
+    ext_functest_builder,
+    functest_builder,
+    make_object,
+)
+
+try:
+    from dulwich._pack import create_delta as _create_delta_rs
+except ImportError:
+    _create_delta_rs = None
 
 from . import TestCase
 
@@ -290,6 +301,165 @@ class TestPackDeltas(TestCase):
         ]
         self.assertEqual(b"".join(expected), b"".join(res))
 
+    def _do_test_create_delta_various_cases(self, create_delta_func):
+        """Test create_delta with various input cases for both Python and Rust versions."""
+        import types
+
+        # Helper to normalize delta output (Rust returns bytes, Python returns Iterator[bytes])
+        def get_delta(base, target):
+            result = create_delta_func(base, target)
+            # Check if it's a Rust extension (returns bytes directly)
+            if isinstance(create_delta_func, types.BuiltinFunctionType):
+                return result
+            # Python version returns iterator
+            return b"".join(result)
+
+        # Test case 1: Identical content
+        base = b"hello world"
+        target = b"hello world"
+        delta = get_delta(base, target)
+        result = b"".join(apply_delta(base, delta))
+        self.assertEqual(target, result)
+
+        # Test case 2: Complete rewrite
+        base = b"aaaaaaaaaa"
+        target = b"bbbbbbbbbb"
+        delta = get_delta(base, target)
+        result = b"".join(apply_delta(base, delta))
+        self.assertEqual(target, result)
+
+        # Test case 3: Partial replacement
+        base = b"The quick brown fox jumps over the lazy dog"
+        target = b"The quick brown cat jumps over the lazy dog"
+        delta = get_delta(base, target)
+        result = b"".join(apply_delta(base, delta))
+        self.assertEqual(target, result)
+
+        # Test case 4: Insertion at end
+        base = b"hello"
+        target = b"hello world"
+        delta = get_delta(base, target)
+        result = b"".join(apply_delta(base, delta))
+        self.assertEqual(target, result)
+
+        # Test case 5: Deletion from end
+        base = b"hello world"
+        target = b"hello"
+        delta = get_delta(base, target)
+        result = b"".join(apply_delta(base, delta))
+        self.assertEqual(target, result)
+
+        # Test case 6: Empty base
+        base = b""
+        target = b"new content"
+        delta = get_delta(base, target)
+        result = b"".join(apply_delta(base, delta))
+        self.assertEqual(target, result)
+
+        # Test case 7: Empty target
+        base = b"old content"
+        target = b""
+        delta = get_delta(base, target)
+        result = b"".join(apply_delta(base, delta))
+        self.assertEqual(target, result)
+
+        # Test case 8: Large content
+        base = b"x" * 10000
+        target = b"x" * 9000 + b"y" * 1000
+        delta = get_delta(base, target)
+        result = b"".join(apply_delta(base, delta))
+        self.assertEqual(target, result)
+
+        # Test case 9: Multiple changes
+        base = b"line1\nline2\nline3\nline4\n"
+        target = b"line1\nmodified2\nline3\nmodified4\n"
+        delta = get_delta(base, target)
+        result = b"".join(apply_delta(base, delta))
+        self.assertEqual(target, result)
+
+    # Test both Python and Rust versions
+    test_create_delta_py = functest_builder(
+        _do_test_create_delta_various_cases, _create_delta_py
+    )
+    test_create_delta_extension = ext_functest_builder(
+        _do_test_create_delta_various_cases, _create_delta_rs
+    )
+
+    def _do_test_create_delta_output_consistency(self, create_delta_func):
+        """Test that create_delta produces consistent and valid output."""
+        import types
+
+        # Helper to normalize delta output
+        def get_delta(base, target):
+            result = create_delta_func(base, target)
+            if isinstance(create_delta_func, types.BuiltinFunctionType):
+                return result
+            return b"".join(result)
+
+        test_cases = [
+            (b"", b""),
+            (b"a", b"a"),
+            (b"abc", b"abc"),
+            (b"abc", b"def"),
+            (b"hello world", b"hello rust"),
+            (b"x" * 100, b"y" * 100),
+            (b"same prefix but different suffix", b"same prefix with new suffix"),
+        ]
+
+        for base, target in test_cases:
+            delta = get_delta(base, target)
+
+            # Verify delta can be applied
+            result = b"".join(apply_delta(base, delta))
+            self.assertEqual(
+                target,
+                result,
+                f"Delta failed for base={base[:20]}... target={target[:20]}...",
+            )
+
+            # Verify delta is not empty (should have at least header)
+            self.assertGreater(len(delta), 0)
+
+    test_create_delta_output_consistency_py = functest_builder(
+        _do_test_create_delta_output_consistency, _create_delta_py
+    )
+    test_create_delta_output_consistency_extension = ext_functest_builder(
+        _do_test_create_delta_output_consistency, _create_delta_rs
+    )
+
+    def _do_test_create_delta_produces_valid_deltas(self, create_delta_func):
+        """Test that deltas produced are valid Git delta format."""
+        import types
+
+        # Helper to normalize delta output
+        def get_delta(base, target):
+            result = create_delta_func(base, target)
+            if isinstance(create_delta_func, types.BuiltinFunctionType):
+                return result
+            return b"".join(result)
+
+        base = b"The quick brown fox"
+        target = b"The slow brown fox"
+
+        delta = get_delta(base, target)
+
+        # A valid delta should have:
+        # 1. Base size header
+        # 2. Target size header
+        # 3. Delta operations
+        self.assertGreater(len(delta), 2)  # At minimum 2 header bytes
+
+        # Apply delta to verify it's valid
+        result = b"".join(apply_delta(base, delta))
+        self.assertEqual(target, result)
+
+    test_create_delta_valid_format_py = functest_builder(
+        _do_test_create_delta_produces_valid_deltas, _create_delta_py
+    )
+    test_create_delta_valid_format_extension = ext_functest_builder(
+        _do_test_create_delta_produces_valid_deltas, _create_delta_rs
+    )
+
 
 class TestPackData(PackTests):
     """Tests getting the data from the packfile."""