Ver código fonte

Add support for patiencediff

Fixes #1795
Jelmer Vernooij 4 meses atrás
pai
commit
084f5c6993
7 arquivos alterados com 386 adições e 36 exclusões
  1. 3 0
      NEWS
  2. 56 30
      dulwich/cli.py
  3. 10 1
      dulwich/diff.py
  4. 141 2
      dulwich/patch.py
  5. 13 3
      dulwich/porcelain.py
  6. 1 0
      pyproject.toml
  7. 162 0
      tests/test_patch.py

+ 3 - 0
NEWS

@@ -34,6 +34,9 @@
    headers to the server when communicating over HTTP(S).
    (Jelmer Vernooij, #1769)
 
+ * Add support for ``patiencediff`` algorithm in diff.
+   (Jelmer Vernooij, #1795)
+
 0.24.1	2025-08-01
 
  * Require ``typing_extensions`` on Python 3.10.

+ 56 - 30
dulwich/cli.py

@@ -48,6 +48,7 @@ from .index import Index
 from .objects import Commit, valid_hexsha
 from .objectspec import parse_commit_range
 from .pack import Pack, sha_to_hex
+from .patch import DiffAlgorithmNotAvailable
 from .repo import Repo
 
 
@@ -758,6 +759,17 @@ class cmd_diff(Command):
             default="auto",
             help="Use colored output (requires rich)",
         )
+        parser.add_argument(
+            "--patience",
+            action="store_true",
+            help="Use patience diff algorithm",
+        )
+        parser.add_argument(
+            "--diff-algorithm",
+            choices=["myers", "patience"],
+            default="myers",
+            help="Choose a diff algorithm",
+        )
         parser.add_argument(
             "--", dest="separator", action="store_true", help=argparse.SUPPRESS
         )
@@ -773,6 +785,11 @@ class cmd_diff(Command):
 
         args = parsed_args
 
+        # Determine diff algorithm
+        diff_algorithm = args.diff_algorithm
+        if args.patience:
+            diff_algorithm = "patience"
+
         # Determine if we should use color
         def _should_use_color():
             if args.color == "always":
@@ -806,36 +823,45 @@ class cmd_diff(Command):
             config = repo.get_config_stack()
             with get_pager(config=config, cmd_name="diff") as outstream:
                 output_stream = _create_output_stream(outstream)
-                if len(args.committish) == 0:
-                    # Show diff for working tree or staged changes
-                    porcelain.diff(
-                        repo,
-                        staged=(args.staged or args.cached),
-                        paths=args.paths or None,
-                        outstream=output_stream,
-                    )
-                elif len(args.committish) == 1:
-                    # Show diff between working tree and specified commit
-                    if args.staged or args.cached:
-                        parser.error("--staged/--cached cannot be used with commits")
-                    porcelain.diff(
-                        repo,
-                        commit=args.committish[0],
-                        staged=False,
-                        paths=args.paths or None,
-                        outstream=output_stream,
-                    )
-                elif len(args.committish) == 2:
-                    # Show diff between two commits
-                    porcelain.diff(
-                        repo,
-                        commit=args.committish[0],
-                        commit2=args.committish[1],
-                        paths=args.paths or None,
-                        outstream=output_stream,
-                    )
-                else:
-                    parser.error("Too many arguments - specify at most two commits")
+                try:
+                    if len(args.committish) == 0:
+                        # Show diff for working tree or staged changes
+                        porcelain.diff(
+                            repo,
+                            staged=(args.staged or args.cached),
+                            paths=args.paths or None,
+                            outstream=output_stream,
+                            diff_algorithm=diff_algorithm,
+                        )
+                    elif len(args.committish) == 1:
+                        # Show diff between working tree and specified commit
+                        if args.staged or args.cached:
+                            parser.error(
+                                "--staged/--cached cannot be used with commits"
+                            )
+                        porcelain.diff(
+                            repo,
+                            commit=args.committish[0],
+                            staged=False,
+                            paths=args.paths or None,
+                            outstream=output_stream,
+                            diff_algorithm=diff_algorithm,
+                        )
+                    elif len(args.committish) == 2:
+                        # Show diff between two commits
+                        porcelain.diff(
+                            repo,
+                            commit=args.committish[0],
+                            commit2=args.committish[1],
+                            paths=args.paths or None,
+                            outstream=output_stream,
+                            diff_algorithm=diff_algorithm,
+                        )
+                    else:
+                        parser.error("Too many arguments - specify at most two commits")
+                except DiffAlgorithmNotAvailable as e:
+                    sys.stderr.write(f"fatal: {e}\n")
+                    sys.exit(1)
 
                 # Flush any remaining output
                 if hasattr(output_stream, "flush"):

+ 10 - 1
dulwich/diff.py

@@ -78,6 +78,7 @@ def diff_index_to_tree(
     outstream: BinaryIO,
     commit_sha: Optional[bytes] = None,
     paths: Optional[list[bytes]] = None,
+    diff_algorithm: Optional[str] = None,
 ) -> None:
     """Show staged changes (index vs commit).
 
@@ -86,6 +87,7 @@ def diff_index_to_tree(
         outstream: Stream to write diff to
         commit_sha: SHA of commit to compare against, or None for HEAD
         paths: Optional list of paths to filter (as bytes)
+        diff_algorithm: Algorithm to use for diffing ("myers" or "patience"), defaults to DEFAULT_DIFF_ALGORITHM if None
     """
     if commit_sha is None:
         try:
@@ -112,6 +114,7 @@ def diff_index_to_tree(
             repo.object_store,
             (oldpath, oldmode, oldsha),
             (newpath, newmode, newsha),
+            diff_algorithm=diff_algorithm,
         )
 
 
@@ -120,6 +123,7 @@ def diff_working_tree_to_tree(
     outstream: BinaryIO,
     commit_sha: bytes,
     paths: Optional[list[bytes]] = None,
+    diff_algorithm: Optional[str] = None,
 ) -> None:
     """Compare working tree to a specific commit.
 
@@ -128,6 +132,7 @@ def diff_working_tree_to_tree(
         outstream: Stream to write diff to
         commit_sha: SHA of commit to compare against
         paths: Optional list of paths to filter (as bytes)
+        diff_algorithm: Algorithm to use for diffing ("myers" or "patience"), defaults to DEFAULT_DIFF_ALGORITHM if None
     """
     commit = repo[commit_sha]
     assert isinstance(commit, Commit)
@@ -357,7 +362,10 @@ def diff_working_tree_to_tree(
 
 
 def diff_working_tree_to_index(
-    repo: Repo, outstream: BinaryIO, paths: Optional[list[bytes]] = None
+    repo: Repo,
+    outstream: BinaryIO,
+    paths: Optional[list[bytes]] = None,
+    diff_algorithm: Optional[str] = None,
 ) -> None:
     """Compare working tree to index.
 
@@ -365,6 +373,7 @@ def diff_working_tree_to_index(
         repo: Repository object
         outstream: Stream to write diff to
         paths: Optional list of paths to filter (as bytes)
+        diff_algorithm: Algorithm to use for diffing ("myers" or "patience"), defaults to DEFAULT_DIFF_ALGORITHM if None
     """
     index = repo.open_index()
     normalizer = repo.get_blob_normalizer()

+ 141 - 2
dulwich/patch.py

@@ -47,6 +47,30 @@ from .objects import S_ISGITLINK, Blob, Commit
 
 FIRST_FEW_BYTES = 8000
 
+DEFAULT_DIFF_ALGORITHM = "myers"
+
+
+class DiffAlgorithmNotAvailable(Exception):
+    """Raised when a requested diff algorithm is not available."""
+
+    def __init__(self, algorithm: str, install_hint: str = "") -> None:
+        """Initialize exception.
+
+        Args:
+            algorithm: Name of the unavailable algorithm
+            install_hint: Optional installation hint
+        """
+        self.algorithm = algorithm
+        self.install_hint = install_hint
+        if install_hint:
+            super().__init__(
+                f"Diff algorithm '{algorithm}' requested but not available. {install_hint}"
+            )
+        else:
+            super().__init__(
+                f"Diff algorithm '{algorithm}' requested but not available."
+            )
+
 
 def write_commit_patch(
     f: IO[bytes],
@@ -191,6 +215,107 @@ def unified_diff(
                     yield b"+" + line
 
 
+def _get_sequence_matcher(algorithm: str, a: list[bytes], b: list[bytes]):
+    """Get appropriate sequence matcher for the given algorithm.
+
+    Args:
+        algorithm: Diff algorithm ("myers" or "patience")
+        a: First sequence
+        b: Second sequence
+
+    Returns:
+        Configured sequence matcher instance
+
+    Raises:
+        DiffAlgorithmNotAvailable: If patience requested but not available
+    """
+    if algorithm == "patience":
+        try:
+            from patiencediff import PatienceSequenceMatcher
+
+            return PatienceSequenceMatcher(None, a, b)
+        except ImportError:
+            raise DiffAlgorithmNotAvailable(
+                "patience", "Install with: pip install 'dulwich[patiencediff]'"
+            )
+    else:
+        return SequenceMatcher(a=a, b=b)
+
+
+def unified_diff_with_algorithm(
+    a: list[bytes],
+    b: list[bytes],
+    fromfile: bytes = b"",
+    tofile: bytes = b"",
+    fromfiledate: str = "",
+    tofiledate: str = "",
+    n: int = 3,
+    lineterm: str = "\n",
+    tree_encoding: str = "utf-8",
+    output_encoding: str = "utf-8",
+    algorithm: Optional[str] = None,
+) -> Generator[bytes, None, None]:
+    """Generate unified diff with specified algorithm.
+
+    Args:
+        a: First sequence of lines
+        b: Second sequence of lines
+        fromfile: Name of first file
+        tofile: Name of second file
+        fromfiledate: Date of first file
+        tofiledate: Date of second file
+        n: Number of context lines
+        lineterm: Line terminator
+        tree_encoding: Encoding for tree paths
+        output_encoding: Encoding for output
+        algorithm: Diff algorithm to use ("myers" or "patience")
+
+    Returns:
+        Generator yielding diff lines
+
+    Raises:
+        DiffAlgorithmNotAvailable: If patience algorithm requested but patiencediff not available
+    """
+    if algorithm is None:
+        algorithm = DEFAULT_DIFF_ALGORITHM
+
+    matcher = _get_sequence_matcher(algorithm, a, b)
+
+    started = False
+    for group in matcher.get_grouped_opcodes(n):
+        if not started:
+            started = True
+            fromdate = f"\t{fromfiledate}" if fromfiledate else ""
+            todate = f"\t{tofiledate}" if tofiledate else ""
+            yield f"--- {fromfile.decode(tree_encoding)}{fromdate}{lineterm}".encode(
+                output_encoding
+            )
+            yield f"+++ {tofile.decode(tree_encoding)}{todate}{lineterm}".encode(
+                output_encoding
+            )
+
+        first, last = group[0], group[-1]
+        file1_range = _format_range_unified(first[1], last[2])
+        file2_range = _format_range_unified(first[3], last[4])
+        yield f"@@ -{file1_range} +{file2_range} @@{lineterm}".encode(output_encoding)
+
+        for tag, i1, i2, j1, j2 in group:
+            if tag == "equal":
+                for line in a[i1:i2]:
+                    yield b" " + line
+                continue
+            if tag in ("replace", "delete"):
+                for line in a[i1:i2]:
+                    if not line[-1:] == b"\n":
+                        line += b"\n\\ No newline at end of file\n"
+                    yield b"-" + line
+            if tag in ("replace", "insert"):
+                for line in b[j1:j2]:
+                    if not line[-1:] == b"\n":
+                        line += b"\n\\ No newline at end of file\n"
+                    yield b"+" + line
+
+
 def is_binary(content: bytes) -> bool:
     """See if the first few bytes contain any null characters.
 
@@ -237,6 +362,7 @@ def write_object_diff(
     old_file: tuple[Optional[bytes], Optional[int], Optional[bytes]],
     new_file: tuple[Optional[bytes], Optional[int], Optional[bytes]],
     diff_binary: bool = False,
+    diff_algorithm: Optional[str] = None,
 ) -> None:
     """Write the diff for an object.
 
@@ -247,6 +373,7 @@ def write_object_diff(
       new_file: (path, mode, hexsha) tuple
       diff_binary: Whether to diff files even if they
         are considered binary files by is_binary().
+      diff_algorithm: Algorithm to use for diffing ("myers" or "patience")
 
     Note: the tuple elements should be None for nonexistent files
     """
@@ -307,11 +434,12 @@ def write_object_diff(
         f.write(binary_diff)
     else:
         f.writelines(
-            unified_diff(
+            unified_diff_with_algorithm(
                 lines(old_content),
                 lines(new_content),
                 patched_old_path,
                 patched_new_path,
+                algorithm=diff_algorithm,
             )
         )
 
@@ -358,6 +486,7 @@ def write_blob_diff(
     f: IO[bytes],
     old_file: tuple[Optional[bytes], Optional[int], Optional["Blob"]],
     new_file: tuple[Optional[bytes], Optional[int], Optional["Blob"]],
+    diff_algorithm: Optional[str] = None,
 ) -> None:
     """Write blob diff.
 
@@ -365,6 +494,7 @@ def write_blob_diff(
       f: File-like object to write to
       old_file: (path, mode, hexsha) tuple (None if nonexisting)
       new_file: (path, mode, hexsha) tuple (None if nonexisting)
+      diff_algorithm: Algorithm to use for diffing ("myers" or "patience")
 
     Note: The use of write_object_diff is recommended over this function.
     """
@@ -397,7 +527,13 @@ def write_blob_diff(
     old_contents = lines(old_blob)
     new_contents = lines(new_blob)
     f.writelines(
-        unified_diff(old_contents, new_contents, patched_old_path, patched_new_path)
+        unified_diff_with_algorithm(
+            old_contents,
+            new_contents,
+            patched_old_path,
+            patched_new_path,
+            algorithm=diff_algorithm,
+        )
     )
 
 
@@ -407,6 +543,7 @@ def write_tree_diff(
     old_tree: Optional[bytes],
     new_tree: Optional[bytes],
     diff_binary: bool = False,
+    diff_algorithm: Optional[str] = None,
 ) -> None:
     """Write tree diff.
 
@@ -417,6 +554,7 @@ def write_tree_diff(
       new_tree: New tree id
       diff_binary: Whether to diff files even if they
         are considered binary files by is_binary().
+      diff_algorithm: Algorithm to use for diffing ("myers" or "patience")
     """
     changes = store.tree_changes(old_tree, new_tree)
     for (oldpath, newpath), (oldmode, newmode), (oldsha, newsha) in changes:
@@ -426,6 +564,7 @@ def write_tree_diff(
             (oldpath, oldmode, oldsha),
             (newpath, newmode, newsha),
             diff_binary=diff_binary,
+            diff_algorithm=diff_algorithm,
         )
 
 

+ 13 - 3
dulwich/porcelain.py

@@ -1560,6 +1560,7 @@ def diff(
     staged: bool = False,
     paths: Optional[list[Union[str, bytes]]] = None,
     outstream: BinaryIO = default_bytes_out_stream,
+    diff_algorithm: Optional[str] = None,
 ) -> None:
     """Show diff.
 
@@ -1576,6 +1577,8 @@ def diff(
               Ignored if commit2 is provided.
       paths: Optional list of paths to limit diff
       outstream: Stream to write to
+      diff_algorithm: Algorithm to use for diffing ("myers" or "patience"),
+                      defaults to dulwich.patch.DEFAULT_DIFF_ALGORITHM if None
     """
     from . import diff as diff_module
 
@@ -1637,19 +1640,26 @@ def diff(
                     r.object_store,
                     (oldpath, oldmode, oldsha),
                     (newpath, newmode, newsha),
+                    diff_algorithm=diff_algorithm,
                 )
         elif staged:
             # Show staged changes (index vs commit)
-            diff_module.diff_index_to_tree(r, outstream, commit_sha, byte_paths)
+            diff_module.diff_index_to_tree(
+                r, outstream, commit_sha, byte_paths, diff_algorithm=diff_algorithm
+            )
         elif commit is not None:
             # Compare working tree to a specific commit
             assert (
                 commit_sha is not None
             )  # mypy: commit_sha is set when commit is not None
-            diff_module.diff_working_tree_to_tree(r, outstream, commit_sha, byte_paths)
+            diff_module.diff_working_tree_to_tree(
+                r, outstream, commit_sha, byte_paths, diff_algorithm=diff_algorithm
+            )
         else:
             # Compare working tree to index
-            diff_module.diff_working_tree_to_index(r, outstream, byte_paths)
+            diff_module.diff_working_tree_to_index(
+                r, outstream, byte_paths, diff_algorithm=diff_algorithm
+            )
 
 
 def rev_list(

+ 1 - 0
pyproject.toml

@@ -49,6 +49,7 @@ dev = [
 ]
 merge = ["merge3"]
 fuzzing = ["atheris"]
+patiencediff = ["patiencediff"]
 
 [project.scripts]
 dulwich = "dulwich.cli:main"

+ 162 - 0
tests/test_patch.py

@@ -27,8 +27,10 @@ from typing import NoReturn
 from dulwich.object_store import MemoryObjectStore
 from dulwich.objects import S_IFGITLINK, Blob, Commit, Tree
 from dulwich.patch import (
+    DiffAlgorithmNotAvailable,
     get_summary,
     git_am_patch_split,
+    unified_diff_with_algorithm,
     write_blob_diff,
     write_commit_patch,
     write_object_diff,
@@ -635,3 +637,163 @@ class GetSummaryTests(TestCase):
         c.message = b"This is the first line\nAnd this is the second line.\n"
         c.tree = Tree().id
         self.assertEqual("This-is-the-first-line", get_summary(c))
+
+
+class DiffAlgorithmTests(TestCase):
+    """Tests for diff algorithm selection."""
+
+    def test_unified_diff_with_myers(self) -> None:
+        """Test unified_diff_with_algorithm with default myers algorithm."""
+        a = [b"line1\n", b"line2\n", b"line3\n"]
+        b = [b"line1\n", b"line2 modified\n", b"line3\n"]
+
+        result = list(
+            unified_diff_with_algorithm(
+                a, b, fromfile=b"a.txt", tofile=b"b.txt", algorithm="myers"
+            )
+        )
+
+        # Should contain diff headers and the change
+        self.assertTrue(any(b"---" in line for line in result))
+        self.assertTrue(any(b"+++" in line for line in result))
+        self.assertTrue(any(b"-line2" in line for line in result))
+        self.assertTrue(any(b"+line2 modified" in line for line in result))
+
+    def test_unified_diff_with_patience_not_available(self) -> None:
+        """Test that DiffAlgorithmNotAvailable is raised when patience not available."""
+        # Stub _get_sequence_matcher to raise DiffAlgorithmNotAvailable for "patience"
+        import dulwich.patch
+
+        original = dulwich.patch._get_sequence_matcher
+
+        def mock_get_sequence_matcher(algorithm, a, b):
+            if algorithm == "patience":
+                raise DiffAlgorithmNotAvailable(
+                    "patience", "Install with: pip install 'dulwich[patiencediff]'"
+                )
+            return original(algorithm, a, b)
+
+        try:
+            dulwich.patch._get_sequence_matcher = mock_get_sequence_matcher
+
+            a = [b"line1\n", b"line2\n", b"line3\n"]
+            b = [b"line1\n", b"line2 modified\n", b"line3\n"]
+
+            with self.assertRaises(DiffAlgorithmNotAvailable) as cm:
+                list(
+                    unified_diff_with_algorithm(
+                        a, b, fromfile=b"a.txt", tofile=b"b.txt", algorithm="patience"
+                    )
+                )
+
+            self.assertIn("patience", str(cm.exception))
+            self.assertIn("pip install", str(cm.exception))
+        finally:
+            dulwich.patch._get_sequence_matcher = original
+
+
+class PatienceDiffTests(TestCase):
+    """Tests for patience diff algorithm support."""
+
+    def setUp(self) -> None:
+        super().setUp()
+        # Skip all patience diff tests if patiencediff is not available
+        try:
+            import patiencediff  # noqa: F401
+        except ImportError:
+            raise SkipTest("patiencediff not available")
+
+    def test_unified_diff_with_patience_available(self) -> None:
+        """Test unified_diff_with_algorithm with patience if available."""
+        a = [b"line1\n", b"line2\n", b"line3\n"]
+        b = [b"line1\n", b"line2 modified\n", b"line3\n"]
+
+        result = list(
+            unified_diff_with_algorithm(
+                a, b, fromfile=b"a.txt", tofile=b"b.txt", algorithm="patience"
+            )
+        )
+
+        # Should contain diff headers and the change
+        self.assertTrue(any(b"---" in line for line in result))
+        self.assertTrue(any(b"+++" in line for line in result))
+        self.assertTrue(any(b"-line2" in line for line in result))
+        self.assertTrue(any(b"+line2 modified" in line for line in result))
+
+    def test_unified_diff_with_patience_not_available(self) -> None:
+        """Test that DiffAlgorithmNotAvailable is raised when patience not available."""
+        # Stub _get_sequence_matcher to raise DiffAlgorithmNotAvailable for "patience"
+        import dulwich.patch
+
+        original = dulwich.patch._get_sequence_matcher
+
+        def mock_get_sequence_matcher(algorithm, a, b):
+            if algorithm == "patience":
+                raise DiffAlgorithmNotAvailable(
+                    "patience", "Install with: pip install 'dulwich[patiencediff]'"
+                )
+            return original(algorithm, a, b)
+
+        try:
+            dulwich.patch._get_sequence_matcher = mock_get_sequence_matcher
+
+            a = [b"line1\n", b"line2\n", b"line3\n"]
+            b = [b"line1\n", b"line2 modified\n", b"line3\n"]
+
+            with self.assertRaises(DiffAlgorithmNotAvailable) as cm:
+                list(
+                    unified_diff_with_algorithm(
+                        a, b, fromfile=b"a.txt", tofile=b"b.txt", algorithm="patience"
+                    )
+                )
+
+            self.assertIn("patience", str(cm.exception))
+            self.assertIn("pip install", str(cm.exception))
+        finally:
+            dulwich.patch._get_sequence_matcher = original
+
+    def test_write_blob_diff_with_patience(self) -> None:
+        """Test write_blob_diff with patience algorithm if available."""
+        f = BytesIO()
+        old_blob = Blob()
+        old_blob.data = b"line1\nline2\nline3\n"
+        new_blob = Blob()
+        new_blob.data = b"line1\nline2 modified\nline3\n"
+
+        write_blob_diff(
+            f,
+            (b"file.txt", 0o100644, old_blob),
+            (b"file.txt", 0o100644, new_blob),
+            diff_algorithm="patience",
+        )
+
+        diff = f.getvalue()
+        self.assertIn(b"diff --git", diff)
+        self.assertIn(b"-line2", diff)
+        self.assertIn(b"+line2 modified", diff)
+
+    def test_write_object_diff_with_patience(self) -> None:
+        """Test write_object_diff with patience algorithm if available."""
+        f = BytesIO()
+        store = MemoryObjectStore()
+
+        old_blob = Blob()
+        old_blob.data = b"line1\nline2\nline3\n"
+        store.add_object(old_blob)
+
+        new_blob = Blob()
+        new_blob.data = b"line1\nline2 modified\nline3\n"
+        store.add_object(new_blob)
+
+        write_object_diff(
+            f,
+            store,
+            (b"file.txt", 0o100644, old_blob.id),
+            (b"file.txt", 0o100644, new_blob.id),
+            diff_algorithm="patience",
+        )
+
+        diff = f.getvalue()
+        self.assertIn(b"diff --git", diff)
+        self.assertIn(b"-line2", diff)
+        self.assertIn(b"+line2 modified", diff)