1 mesiac pred · 298e7f5e82
--- a/NEWS
+++ b/NEWS
@@ -4,6 +4,9 @@
 
				 is ahead of a 1.0 release, after which API changes will be kept backwards
			
 
				 compatible.
			
 
				 
			
 
				+ * Fix UTF-8 decode error in process filter protocol when handling binary files.
			
 
				+   (Jelmer Vernooĳ, #2023)
			
 
				+
			
 
				  * Move greenthreads support to dulwich/contrib.
			
 
				    This code isn't really developed and only used
			
 
				    by the swift support.
			
--- a/dulwich/filters.py
+++ b/dulwich/filters.py
@@ -261,13 +261,16 @@ class ProcessFilterDriver:
 
				                     pkt = self._protocol.read_pkt_line()
			
 
				                     if pkt is None:  # flush packet ends headers
			
 
				                         break
			
 
				-                    key, _, value = pkt.decode().rstrip("\n\r").partition("=")
			
 
				+                    key, _, value = pkt.rstrip(b"\n\r").partition(b"=")
			
 
				                     response_headers[key] = value
			
 
				 
			
 
				                 # Check status
			
 
				-                status = response_headers.get("status", "error")
			
 
				-                if status != "success":
			
 
				-                    raise FilterError(f"Process filter {operation} failed: {status}")
			
 
				+                status = response_headers.get(b"status", b"error")
			
 
				+                if status != b"success":
			
 
				+                    status_str = status.decode("utf-8", errors="replace")
			
 
				+                    raise FilterError(
			
 
				+                        f"Process filter {operation} failed: {status_str}"
			
 
				+                    )
			
 
				 
			
 
				                 # Read result data
			
 
				                 result_chunks = []
			
@@ -284,14 +287,15 @@ class ProcessFilterDriver:
 
				                     pkt = self._protocol.read_pkt_line()
			
 
				                     if pkt is None:  # flush packet ends final headers
			
 
				                         break
			
 
				-                    key, _, value = pkt.decode().rstrip("\n\r").partition("=")
			
 
				+                    key, _, value = pkt.rstrip(b"\n\r").partition(b"=")
			
 
				                     final_headers[key] = value
			
 
				 
			
 
				                 # Check final status (if provided, it overrides the initial status)
			
 
				-                final_status = final_headers.get("status", status)
			
 
				-                if final_status != "success":
			
 
				+                final_status = final_headers.get(b"status", status)
			
 
				+                if final_status != b"success":
			
 
				+                    final_status_str = final_status.decode("utf-8", errors="replace")
			
 
				                     raise FilterError(
			
 
				-                        f"Process filter {operation} failed with final status: {final_status}"
			
 
				+                        f"Process filter {operation} failed with final status: {final_status_str}"
			
 
				                     )
			
 
				 
			
 
				                 return b"".join(result_chunks)
			
--- a/tests/test_filters.py
+++ b/tests/test_filters.py
@@ -985,15 +985,33 @@ while True:
 
				         # Binary data with null bytes, high bytes, etc.
			
 
				         binary_data = bytes(range(256))
			
 
				 
			
 
				-        try:
			
 
				-            result = driver.clean(binary_data)
			
 
				-            # Should handle binary data without crashing
			
 
				-            self.assertIsInstance(result, bytes)
			
 
				-            # Our test filter uppercases, which may not work for all binary data
			
 
				-            # but should not crash
			
 
				-        except UnicodeDecodeError:
			
 
				-            # This might happen with binary data - acceptable
			
 
				-            pass
			
 
				+        result = driver.clean(binary_data)
			
 
				+        # Should handle binary data without crashing
			
 
				+        self.assertIsInstance(result, bytes)
			
 
				+        # Our test filter uppercases bytes directly, which works for binary data
			
 
				+        # The fix ensures headers are kept as bytes, so binary content doesn't cause decode errors
			
 
				+
			
 
				+    def test_binary_data_with_invalid_utf8_sequences(self):
			
 
				+        """Test handling of binary data with invalid UTF-8 sequences.
			
 
				+
			
 
				+        Regression test for https://github.com/jelmer/dulwich/issues/2023
			
 
				+        where binary files (like .ogg, .jpg) caused UTF-8 decode errors.
			
 
				+        """
			
 
				+        import sys
			
 
				+
			
 
				+        driver = ProcessFilterDriver(
			
 
				+            process_cmd=f"{sys.executable} {self.test_filter_path}", required=False
			
 
				+        )
			
 
				+
			
 
				+        # Create binary data with the byte that caused the issue (0xe5, reported at position 14; it is at offset 12 in this sample)
			
 
				+        # plus other invalid UTF-8 sequences
			
 
				+        binary_data = b"some header \xe5\xff\xfe binary data"
			
 
				+
			
 
				+        result = driver.clean(binary_data)
			
 
				+        # Should handle binary data without UTF-8 decode errors
			
 
				+        self.assertIsInstance(result, bytes)
			
 
				+        # The filter should process it successfully
			
 
				+        self.assertEqual(result, binary_data.upper())
			
 
				 
			
 
				     def test_large_file_chunking(self):
			
 
				         """Test proper chunking of large files."""