Prechádzať zdrojové kódy

Fix UTF-8 decode error in process filter protocol handling

Fixes #2023
Jelmer Vernooij pred 1 mesiacom
rodič
commit
298e7f5e82
3 zmenené súbory: 42 pridaní a 17 odobraní
  1. 3 0
      NEWS
  2. 12 8
      dulwich/filters.py
  3. 27 9
      tests/test_filters.py

+ 3 - 0
NEWS

@@ -4,6 +4,9 @@
 is ahead of a 1.0 release, after which API changes will be kept backwards
 compatible.
 
+ * Fix UTF-8 decode error in process filter protocol when handling binary files.
+   (Jelmer Vernooij, #2023)
+
  * Move greenthreads support to dulwich/contrib.
    This code isn't really developed and only used
    by the swift support.

+ 12 - 8
dulwich/filters.py

@@ -261,13 +261,16 @@ class ProcessFilterDriver:
                     pkt = self._protocol.read_pkt_line()
                     if pkt is None:  # flush packet ends headers
                         break
-                    key, _, value = pkt.decode().rstrip("\n\r").partition("=")
+                    key, _, value = pkt.rstrip(b"\n\r").partition(b"=")
                     response_headers[key] = value
 
                 # Check status
-                status = response_headers.get("status", "error")
-                if status != "success":
-                    raise FilterError(f"Process filter {operation} failed: {status}")
+                status = response_headers.get(b"status", b"error")
+                if status != b"success":
+                    status_str = status.decode("utf-8", errors="replace")
+                    raise FilterError(
+                        f"Process filter {operation} failed: {status_str}"
+                    )
 
                 # Read result data
                 result_chunks = []
@@ -284,14 +287,15 @@ class ProcessFilterDriver:
                     pkt = self._protocol.read_pkt_line()
                     if pkt is None:  # flush packet ends final headers
                         break
-                    key, _, value = pkt.decode().rstrip("\n\r").partition("=")
+                    key, _, value = pkt.rstrip(b"\n\r").partition(b"=")
                     final_headers[key] = value
 
                 # Check final status (if provided, it overrides the initial status)
-                final_status = final_headers.get("status", status)
-                if final_status != "success":
+                final_status = final_headers.get(b"status", status)
+                if final_status != b"success":
+                    final_status_str = final_status.decode("utf-8", errors="replace")
                     raise FilterError(
-                        f"Process filter {operation} failed with final status: {final_status}"
+                        f"Process filter {operation} failed with final status: {final_status_str}"
                     )
 
                 return b"".join(result_chunks)

+ 27 - 9
tests/test_filters.py

@@ -985,15 +985,33 @@ while True:
         # Binary data with null bytes, high bytes, etc.
         binary_data = bytes(range(256))
 
-        try:
-            result = driver.clean(binary_data)
-            # Should handle binary data without crashing
-            self.assertIsInstance(result, bytes)
-            # Our test filter uppercases, which may not work for all binary data
-            # but should not crash
-        except UnicodeDecodeError:
-            # This might happen with binary data - acceptable
-            pass
+        result = driver.clean(binary_data)
+        # Should handle binary data without crashing
+        self.assertIsInstance(result, bytes)
+        # Our test filter uppercases bytes directly, which works for binary data.
+        # The fix ensures headers are kept as bytes, so binary content doesn't cause decode errors.
+
+    def test_binary_data_with_invalid_utf8_sequences(self):
+        """Test handling of binary data with invalid UTF-8 sequences.
+
+        Regression test for https://github.com/jelmer/dulwich/issues/2023
+        where binary files (like .ogg, .jpg) caused UTF-8 decode errors.
+        """
+        import sys
+
+        driver = ProcessFilterDriver(
+            process_cmd=f"{sys.executable} {self.test_filter_path}", required=False
+        )
+
+        # Create binary data containing the specific byte that caused the issue
+        # (0xe5, at offset 12 here) plus other bytes that are invalid in UTF-8 (0xff, 0xfe)
+        binary_data = b"some header \xe5\xff\xfe binary data"
+
+        result = driver.clean(binary_data)
+        # Should handle binary data without UTF-8 decode errors
+        self.assertIsInstance(result, bytes)
+        # The filter should process it successfully
+        self.assertEqual(result, binary_data.upper())
 
     def test_large_file_chunking(self):
         """Test proper chunking of large files."""