Переглянути джерело

Add support for cherry command

Fixes #1782
Jelmer Vernooij 3 місяців тому
батько
коміт
d6c6ae7ef4
6 змінених файлів з 539 додано та 6 видалено
  1. 11 6
      NEWS
  2. 66 0
      dulwich/cli.py
  3. 112 0
      dulwich/patch.py
  4. 138 0
      dulwich/porcelain.py
  5. 96 0
      tests/test_patch.py
  6. 116 0
      tests/test_porcelain.py

+ 11 - 6
NEWS

@@ -1,13 +1,18 @@
 0.24.6	2025-10-17
 
- * Add support for ``git show-branch`` command to display branches and their
+ * Add ``dulwich cherry`` command to find commits not merged upstream.
+   Compares commits by patch ID to identify equivalent patches regardless of
+   commit metadata. Supports automatic upstream detection from tracking branches
+   and verbose mode to display commit messages. (Jelmer Vernooij, #1782)
+
+ * Add support for ``dulwich show-branch`` command to display branches and their
    commits. Supports filtering by local/remote branches, topological ordering,
    list mode, independent branch detection, and merge base calculation.
    (Jelmer Vernooij, #1829)
 
 0.24.5	2025-10-15
 
- * Add support for ``git show-ref`` command to list references in a local
+ * Add support for ``dulwich show-ref`` command to list references in a local
    repository. Supports filtering by branches/tags, pattern matching,
    dereferencing tags, verification mode, and existence checking. Available
    as ``porcelain.show_ref()`` and ``dulwich show-ref`` CLI command.
@@ -33,7 +38,7 @@
 
  * Add ``dulwich merge-base`` command. (Jelmer Vernooij, #1831)
 
- * Add support for ``git var`` command to display Git's logical variables
+ * Add support for ``dulwich var`` command to display Git's logical variables
    (GIT_AUTHOR_IDENT, GIT_COMMITTER_IDENT, GIT_EDITOR, GIT_SEQUENCE_EDITOR,
    GIT_PAGER, GIT_DEFAULT_BRANCH). Available as ``porcelain.var()`` and
    ``dulwich var`` CLI command. (Jelmer Vernooij, #1841)
@@ -53,7 +58,7 @@
    other filters that send status messages in final headers.
    (Jelmer Vernooij, #1889)
 
- * Add ``git worktree repair`` command to repair worktree administrative files
+ * Add ``dulwich worktree repair`` command to repair worktree administrative files
    after worktrees or the main repository have been moved.
    (Jelmer Vernooij, #1799)
 
@@ -90,7 +95,7 @@
  * Fix Windows config loading to only use current Git config path,
    avoiding loading older config files.  (Jelmer Vernooij, #1732)
 
- * Add interactive rebase support with ``git rebase -i``, including support
+ * Add interactive rebase support with ``dulwich rebase -i``, including support
    for pick, reword, edit, squash, fixup, drop, exec, and break commands.
    (Jelmer Vernooij, #1696)
 
@@ -113,7 +118,7 @@
 
  * Optimize LFS filter performance by avoiding redundant disk writes when
    checking file status. The LFS store now checks if objects already exist
-   before writing them to disk, significantly improving ``git status``
+   before writing them to disk, significantly improving ``dulwich status``
    performance in repositories with many LFS-tracked files.
    (Jelmer Vernooij, #1789)
 

+ 66 - 0
dulwich/cli.py

@@ -3353,6 +3353,71 @@ class cmd_notes(SuperCommand):
     default_command = cmd_notes_list
 
 
+class cmd_cherry(Command):
+    """Find commits not merged upstream."""
+
+    def run(self, args: Sequence[str]) -> Optional[int]:
+        """Execute the cherry command.
+
+        Args:
+            args: Command line arguments
+
+        Returns:
+            Exit code (0 for success, 1 for error)
+        """
+        parser = argparse.ArgumentParser(description="Find commits not merged upstream")
+        parser.add_argument(
+            "-v",
+            "--verbose",
+            action="store_true",
+            help="Show commit messages",
+        )
+        parser.add_argument(
+            "upstream",
+            nargs="?",
+            help="Upstream branch (default: tracking branch or HEAD^)",
+        )
+        parser.add_argument(
+            "head",
+            nargs="?",
+            help="Head branch (default: HEAD)",
+        )
+        parser.add_argument(
+            "limit",
+            nargs="?",
+            help="Limit commits to those after this ref",
+        )
+        parsed_args = parser.parse_args(args)
+
+        try:
+            results = porcelain.cherry(
+                ".",
+                upstream=parsed_args.upstream,
+                head=parsed_args.head,
+                limit=parsed_args.limit,
+                verbose=parsed_args.verbose,
+            )
+        except (NotGitRepository, OSError, FileFormatException, ValueError) as e:
+            logger.error(f"Error: {e}")
+            return 1
+
+        # Output results
+        for status, commit_sha, message in results:
+            # Convert commit_sha to hex string
+            if isinstance(commit_sha, bytes):
+                commit_hex = commit_sha.hex()
+            else:
+                commit_hex = commit_sha
+
+            if parsed_args.verbose and message:
+                message_str = message.decode("utf-8", errors="replace")
+                logger.info(f"{status} {commit_hex} {message_str}")
+            else:
+                logger.info(f"{status} {commit_hex}")
+
+        return 0
+
+
 class cmd_cherry_pick(Command):
     """Apply the changes introduced by some existing commits."""
 
@@ -4886,6 +4951,7 @@ commands = {
     "check-ignore": cmd_check_ignore,
     "check-mailmap": cmd_check_mailmap,
     "checkout": cmd_checkout,
+    "cherry": cmd_cherry,
     "cherry-pick": cmd_cherry_pick,
     "clone": cmd_clone,
     "commit": cmd_commit,

+ 112 - 0
dulwich/patch.py

@@ -648,3 +648,115 @@ def parse_patch_message(
     except StopIteration:
         version = None
     return c, diff, version
+
+
+def patch_id(diff_data: bytes) -> bytes:
+    """Compute patch ID for a diff.
+
+    The patch ID is computed by normalizing the diff and computing a SHA1 hash.
+    This follows git's patch-id algorithm which:
+    1. Removes whitespace from lines starting with + or -
+    2. Replaces line numbers in @@ headers with a canonical form
+    3. Computes SHA1 of the result
+
+    Args:
+        diff_data: Raw diff data as bytes
+
+    Returns:
+        SHA1 hash of normalized diff (40-byte hex string)
+
+    TODO: This implementation uses a simple line-by-line approach. For better
+    compatibility with git's patch-id, consider using proper patch parsing that:
+    - Handles edge cases in diff format (binary diffs, mode changes, etc.)
+    - Properly parses unified diff format according to the spec
+    - Matches git's exact normalization algorithm byte-for-byte
+    See git's patch-id.c for reference implementation.
+    """
+    import hashlib
+    import re
+
+    # Normalize the diff for patch-id computation
+    normalized_lines = []
+
+    for line in diff_data.split(b"\n"):
+        # Skip diff headers (diff --git, index, ---, +++)
+        if line.startswith(
+            (
+                b"diff --git ",
+                b"index ",
+                b"--- ",
+                b"+++ ",
+                b"new file mode ",
+                b"old file mode ",
+                b"deleted file mode ",
+                b"new mode ",
+                b"old mode ",
+                b"similarity index ",
+                b"dissimilarity index ",
+                b"rename from ",
+                b"rename to ",
+                b"copy from ",
+                b"copy to ",
+            )
+        ):
+            continue
+
+        # Normalize @@ headers to a canonical form
+        if line.startswith(b"@@"):
+            # Replace line numbers with canonical form
+            match = re.match(rb"^@@\s+-\d+(?:,\d+)?\s+\+\d+(?:,\d+)?\s+@@", line)
+            if match:
+                # Use canonical hunk header without line numbers
+                normalized_lines.append(b"@@")
+                continue
+
+        # For +/- lines, strip all whitespace
+        if line.startswith((b"+", b"-")):
+            # Keep the +/- prefix but remove all whitespace from the rest
+            if len(line) > 1:
+                # Remove all whitespace from the content
+                content = line[1:].replace(b" ", b"").replace(b"\t", b"")
+                normalized_lines.append(line[:1] + content)
+            else:
+                # Just +/- alone
+                normalized_lines.append(line[:1])
+            continue
+
+        # Keep context lines and other content as-is
+        if line.startswith(b" ") or line == b"":
+            normalized_lines.append(line)
+
+    # Join normalized lines and compute SHA1
+    normalized = b"\n".join(normalized_lines)
+    return hashlib.sha1(normalized).hexdigest().encode("ascii")
+
+
+def commit_patch_id(store: "BaseObjectStore", commit_id: bytes) -> bytes:
+    """Compute patch ID for a commit.
+
+    Args:
+        store: Object store to read objects from
+        commit_id: Commit ID (40-byte hex string)
+
+    Returns:
+        Patch ID (40-byte hex string)
+    """
+    from io import BytesIO
+
+    commit = store[commit_id]
+    assert isinstance(commit, Commit)
+
+    # Get the parent tree (or empty tree for root commit)
+    if commit.parents:
+        parent = store[commit.parents[0]]
+        assert isinstance(parent, Commit)
+        parent_tree = parent.tree
+    else:
+        # Root commit - compare against empty tree
+        parent_tree = None
+
+    # Generate diff
+    diff_output = BytesIO()
+    write_tree_diff(diff_output, store, parent_tree, commit.tree)
+
+    return patch_id(diff_output.getvalue())

+ 138 - 0
dulwich/porcelain.py

@@ -5547,6 +5547,144 @@ def merge_tree(
         return merged_tree.id, conflicts
 
 
+def cherry(
+    repo: Union[str, os.PathLike[str], Repo],
+    upstream: Optional[Union[str, bytes]] = None,
+    head: Optional[Union[str, bytes]] = None,
+    limit: Optional[Union[str, bytes]] = None,
+    verbose: bool = False,
+) -> list[tuple[str, bytes, Optional[bytes]]]:
+    """Find commits not merged upstream.
+
+    Args:
+        repo: Repository path or object
+        upstream: Upstream branch (default: tracking branch or @{upstream})
+        head: Head branch (default: HEAD)
+        limit: Limit commits to those after this ref
+        verbose: Include commit messages in output
+
+    Returns:
+        List of tuples (status, commit_sha, message) where status is '+' or '-'
+        '+' means commit is not in upstream, '-' means equivalent patch exists upstream
+        message is None unless verbose=True
+    """
+    from .patch import commit_patch_id
+
+    with open_repo_closing(repo) as r:
+        # Resolve upstream
+        if upstream is None:
+            # Try to find tracking branch
+            upstream_found = False
+            head_refs, _ = r.refs.follow(b"HEAD")
+            if head_refs:
+                head_ref = head_refs[0]
+                if head_ref.startswith(b"refs/heads/"):
+                    config = r.get_config()
+                    branch_name = head_ref[len(b"refs/heads/") :]
+
+                    try:
+                        upstream_ref = config.get((b"branch", branch_name), b"merge")
+                    except KeyError:
+                        upstream_ref = None
+
+                    if upstream_ref:
+                        try:
+                            remote_name = config.get(
+                                (b"branch", branch_name), b"remote"
+                            )
+                        except KeyError:
+                            remote_name = None
+
+                        if remote_name:
+                            # Build the tracking branch ref
+                            upstream_refname = (
+                                b"refs/remotes/"
+                                + remote_name
+                                + b"/"
+                                + upstream_ref.split(b"/")[-1]
+                            )
+                            if upstream_refname in r.refs:
+                                upstream = upstream_refname
+                                upstream_found = True
+
+            if not upstream_found:
+                # Default to HEAD^ if no tracking branch found
+                head_commit = r[b"HEAD"]
+                if isinstance(head_commit, Commit) and head_commit.parents:
+                    upstream = head_commit.parents[0]
+                else:
+                    raise ValueError("Could not determine upstream branch")
+
+        # Resolve head
+        if head is None:
+            head = b"HEAD"
+
+        # Convert strings to bytes
+        if isinstance(upstream, str):
+            upstream = upstream.encode("utf-8")
+        if isinstance(head, str):
+            head = head.encode("utf-8")
+        if limit is not None and isinstance(limit, str):
+            limit = limit.encode("utf-8")
+
+        # Resolve refs to commit IDs
+        assert upstream is not None
+        upstream_obj = r[upstream]
+        head_obj = r[head]
+        upstream_id = upstream_obj.id
+        head_id = head_obj.id
+
+        # Get limit commit ID if specified
+        limit_id = None
+        if limit is not None:
+            limit_id = r[limit].id
+
+        # Find all commits reachable from head but not from upstream
+        # This is equivalent to: git rev-list ^upstream head
+
+        # Get commits from head that are not in upstream
+        walker = r.get_walker([head_id], exclude=[upstream_id])
+        head_commits = []
+        for entry in walker:
+            commit = entry.commit
+            # Apply limit if specified
+            if limit_id is not None:
+                # Stop when we reach the limit commit
+                if commit.id == limit_id:
+                    break
+            head_commits.append(commit.id)
+
+        # Compute patch IDs for upstream commits
+        upstream_walker = r.get_walker([upstream_id])
+        upstream_patch_ids = {}  # Maps patch_id -> commit_id for debugging
+        for entry in upstream_walker:
+            commit = entry.commit
+            pid = commit_patch_id(r.object_store, commit.id)
+            upstream_patch_ids[pid] = commit.id
+
+        # For each head commit, check if equivalent patch exists in upstream
+        results: list[tuple[str, bytes, Optional[bytes]]] = []
+        for commit_id in reversed(head_commits):  # Show oldest first
+            obj = r.object_store[commit_id]
+            assert isinstance(obj, Commit)
+            commit = obj
+
+            pid = commit_patch_id(r.object_store, commit_id)
+
+            if pid in upstream_patch_ids:
+                status = "-"
+            else:
+                status = "+"
+
+            message = None
+            if verbose:
+                message = commit.message.split(b"\n")[0]  # First line only
+
+            results.append((status, commit_id, message))
+
+        return results
+
+
 def cherry_pick(  # noqa: D417
     repo: Union[str, os.PathLike[str], Repo],
     committish: Union[str, bytes, Commit, Tag, None],

+ 96 - 0
tests/test_patch.py

@@ -28,8 +28,10 @@ from dulwich.object_store import MemoryObjectStore
 from dulwich.objects import S_IFGITLINK, Blob, Commit, Tree
 from dulwich.patch import (
     DiffAlgorithmNotAvailable,
+    commit_patch_id,
     get_summary,
     git_am_patch_split,
+    patch_id,
     unified_diff_with_algorithm,
     write_blob_diff,
     write_commit_patch,
@@ -797,3 +799,97 @@ class PatienceDiffTests(TestCase):
         self.assertIn(b"diff --git", diff)
         self.assertIn(b"-line2", diff)
         self.assertIn(b"+line2 modified", diff)
+
+
+class PatchIdTests(TestCase):
+    """Tests for patch_id and commit_patch_id functions."""
+
+    def test_patch_id_simple(self) -> None:
+        """Test patch_id computation with a simple diff."""
+        diff = b"""diff --git a/file.txt b/file.txt
+index 3b0f961..a116b51 644
+--- a/file.txt
++++ b/file.txt
+@@ -1,2 +1,2 @@
+-old
++new
+ same
+"""
+        pid = patch_id(diff)
+        # Patch ID should be a 40-byte hex string
+        self.assertEqual(40, len(pid))
+        self.assertTrue(all(c in b"0123456789abcdef" for c in pid))
+
+    def test_patch_id_same_for_equivalent_diffs(self) -> None:
+        """Test that equivalent patches have the same ID."""
+        # Two diffs with different line numbers but same changes
+        diff1 = b"""diff --git a/file.txt b/file.txt
+--- a/file.txt
++++ b/file.txt
+@@ -1,3 +1,3 @@
+ context
+-old line
++new line
+ context
+"""
+        diff2 = b"""diff --git a/file.txt b/file.txt
+--- a/file.txt
++++ b/file.txt
+@@ -10,3 +10,3 @@
+ context
+-old line
++new line
+ context
+"""
+        pid1 = patch_id(diff1)
+        pid2 = patch_id(diff2)
+        # Same patch content should give same patch ID
+        self.assertEqual(pid1, pid2)
+
+    def test_commit_patch_id(self) -> None:
+        """Test commit_patch_id computation."""
+        store = MemoryObjectStore()
+
+        # Create two trees
+        blob1 = Blob.from_string(b"content1\n")
+        blob2 = Blob.from_string(b"content2\n")
+        store.add_objects([(blob1, None), (blob2, None)])
+
+        tree1 = Tree()
+        tree1.add(b"file.txt", 0o644, blob1.id)
+        store.add_object(tree1)
+
+        tree2 = Tree()
+        tree2.add(b"file.txt", 0o644, blob2.id)
+        store.add_object(tree2)
+
+        # Create a commit
+        commit = Commit()
+        commit.tree = tree2.id
+        commit.parents = [b"0" * 40]  # Fake parent
+        commit.author = commit.committer = b"Test <test@example.com>"
+        commit.author_time = commit.commit_time = 1234567890
+        commit.author_timezone = commit.commit_timezone = 0
+        commit.message = b"Test commit\n"
+        commit.encoding = b"UTF-8"
+        store.add_object(commit)
+
+        # Create parent commit
+        parent_commit = Commit()
+        parent_commit.tree = tree1.id
+        parent_commit.parents = []
+        parent_commit.author = parent_commit.committer = b"Test <test@example.com>"
+        parent_commit.author_time = parent_commit.commit_time = 1234567880
+        parent_commit.author_timezone = parent_commit.commit_timezone = 0
+        parent_commit.message = b"Parent commit\n"
+        parent_commit.encoding = b"UTF-8"
+        store.add_object(parent_commit)
+
+        # Update commit to have real parent
+        commit.parents = [parent_commit.id]
+        store.add_object(commit)
+
+        # Compute patch ID
+        pid = commit_patch_id(store, commit.id)
+        self.assertEqual(40, len(pid))
+        self.assertTrue(all(c in b"0123456789abcdef" for c in pid))

+ 116 - 0
tests/test_porcelain.py

@@ -10041,3 +10041,119 @@ class MergeBaseTests(PorcelainTestCase):
         """Test independent_commits with empty list."""
         result = porcelain.independent_commits(self.repo.path, committishes=[])
         self.assertEqual([], result)
+
+
+class CherryTests(PorcelainTestCase):
+    """Tests for cherry command."""
+
+    def test_cherry_no_changes(self):
+        """Test cherry when head and upstream are the same."""
+        # Create a simple commit
+        commit_sha = self.repo.do_commit(
+            b"Initial commit", committer=b"Test <test@example.com>"
+        )
+
+        # Cherry should return empty when comparing a commit to itself
+        results = porcelain.cherry(
+            self.repo.path, upstream=commit_sha.decode(), head=commit_sha.decode()
+        )
+        self.assertEqual([], results)
+
+    def test_cherry_unique_commits(self):
+        """Test cherry with commits unique to head."""
+        # Create initial commit
+        with open(os.path.join(self.repo_path, "file1.txt"), "w") as f:
+            f.write("base content\n")
+        self.repo.stage(["file1.txt"])
+        base_commit = self.repo.do_commit(
+            b"Base commit", committer=b"Test <test@example.com>"
+        )
+
+        # Create a new commit on head
+        with open(os.path.join(self.repo_path, "file2.txt"), "w") as f:
+            f.write("new content\n")
+        self.repo.stage(["file2.txt"])
+        head_commit = self.repo.do_commit(
+            b"New commit", committer=b"Test <test@example.com>"
+        )
+
+        # Cherry should show the new commit as unique
+        results = porcelain.cherry(
+            self.repo.path, upstream=base_commit.decode(), head=head_commit.decode()
+        )
+        self.assertEqual(1, len(results))
+        status, commit_sha, message = results[0]
+        self.assertEqual("+", status)
+        self.assertEqual(head_commit, commit_sha)
+        self.assertIsNone(message)
+
+    def test_cherry_verbose(self):
+        """Test cherry with verbose flag."""
+        # Create initial commit
+        with open(os.path.join(self.repo_path, "file1.txt"), "w") as f:
+            f.write("base content\n")
+        self.repo.stage(["file1.txt"])
+        base_commit = self.repo.do_commit(
+            b"Base commit", committer=b"Test <test@example.com>"
+        )
+
+        # Create a new commit on head
+        with open(os.path.join(self.repo_path, "file2.txt"), "w") as f:
+            f.write("new content\n")
+        self.repo.stage(["file2.txt"])
+        head_commit = self.repo.do_commit(
+            b"New commit on head", committer=b"Test <test@example.com>"
+        )
+
+        # Cherry with verbose should include commit message
+        results = porcelain.cherry(
+            self.repo.path,
+            upstream=base_commit.decode(),
+            head=head_commit.decode(),
+            verbose=True,
+        )
+        self.assertEqual(1, len(results))
+        status, commit_sha, message = results[0]
+        self.assertEqual("+", status)
+        self.assertEqual(head_commit, commit_sha)
+        self.assertEqual(b"New commit on head", message)
+
+    def test_cherry_equivalent_patches(self):
+        """Test cherry with equivalent patches (cherry-picked commits)."""
+        # Create base commit
+        with open(os.path.join(self.repo_path, "file.txt"), "w") as f:
+            f.write("line1\n")
+        self.repo.stage(["file.txt"])
+        base_commit = self.repo.do_commit(
+            b"Base commit", committer=b"Test <test@example.com>"
+        )
+
+        # Create upstream branch with a change
+        with open(os.path.join(self.repo_path, "file.txt"), "w") as f:
+            f.write("line1\nline2\n")
+        self.repo.stage(["file.txt"])
+        upstream_commit = self.repo.do_commit(
+            b"Add line2", committer=b"Test <test@example.com>"
+        )
+
+        # Reset to base and create same change on head branch
+        self.repo.refs[b"HEAD"] = base_commit
+        self.repo.reset_index()
+        with open(os.path.join(self.repo_path, "file.txt"), "w") as f:
+            f.write("line1\nline2\n")
+        self.repo.stage(["file.txt"])
+        head_commit = self.repo.do_commit(
+            b"Add line2 (different metadata)",
+            committer=b"Different <different@example.com>",
+        )
+
+        # Cherry should mark this as equivalent (-)
+        results = porcelain.cherry(
+            self.repo.path,
+            upstream=upstream_commit.decode(),
+            head=head_commit.decode(),
+        )
+        self.assertEqual(1, len(results))
+        status, commit_sha, _message = results[0]
+        self.assertEqual("-", status)
+        self.assertEqual(head_commit, commit_sha)