Преглед изворни кода

Support rewriting tag contents

Jelmer Vernooij пре 1 недеља
родитељ
комит
e53ffc3089
1 измењених фајлова са 263 додато и 48 уклоњено
  1. 263 48
      devscripts/fix-history.py

+ 263 - 48
devscripts/fix-history.py

@@ -2,21 +2,72 @@
 
 """Fix dulwich history by removing .git directories and updating old timestamps.
 
-Usage: ./fix-history.py <source-branch> <target-branch> [--update-tags]
-Example: ./fix-history.py master main --update-tags
+Usage: ./fix-history.py <source-branch> <target-branch> [--update-tags] [--rewrite-tag-commits]
+Example: ./fix-history.py master main --update-tags --rewrite-tag-commits
 """
 
 import argparse
 import sys
 import time
+from dataclasses import dataclass
 
-from dulwich.objects import Commit, Tag, Tree
+from dulwich.objects import Commit, ObjectID, Tag, Tree
+from dulwich.refs import Ref
 from dulwich.repo import Repo
 
 BANNED_NAMES = [b".git"]
 
 
-def fix_tree(repo, tree_id, seen_trees=None):
+@dataclass
+class RewriteResult:
+    """Result of rewriting history."""
+
+    commit_map: dict[ObjectID, ObjectID]
+    tree_map: dict[ObjectID, ObjectID]
+    filtered_commits: set[ObjectID]
+
+
+def create_fixed_commit(
+    old_commit: Commit, new_tree_id: ObjectID, new_parents: list[ObjectID]
+) -> Commit:
+    """Create a new commit from an old one with fixes applied.
+
+    Args:
+        old_commit: The original commit
+        new_tree_id: The new tree SHA
+        new_parents: List of parent commit SHAs
+
+    Returns:
+        A new commit with fixes applied
+    """
+    new_commit = Commit()
+    new_commit.tree = new_tree_id
+    new_commit.author = old_commit.author
+    new_commit.committer = old_commit.committer
+    new_commit.author_time = old_commit.author_time
+    new_commit.commit_time = old_commit.commit_time
+    new_commit.author_timezone = old_commit.author_timezone
+    new_commit.commit_timezone = old_commit.commit_timezone
+    new_commit.message = old_commit.message
+    new_commit.encoding = old_commit.encoding
+    new_commit.parents = new_parents
+
+    # Fix dates
+    fix_commit_dates(new_commit)
+
+    # Fix email addresses
+    if b"jvernooij@evroc.com" in old_commit.author:
+        new_commit.author = "Jelmer Vernooij <jelmer@jelmer.uk>".encode()
+
+    if b"jvernooij@evroc.com" in old_commit.committer:
+        new_commit.committer = "Jelmer Vernooij <jelmer@jelmer.uk>".encode()
+
+    return new_commit
+
+
+def fix_tree(
+    repo: Repo, tree_id: ObjectID, seen_trees: set[ObjectID] | None = None
+) -> ObjectID:
     """Recursively fix a tree by removing .git entries."""
     if seen_trees is None:
         seen_trees = set()
@@ -67,7 +118,7 @@ def fix_tree(repo, tree_id, seen_trees=None):
     return new_tree.id
 
 
-def fix_commit_dates(commit):
+def fix_commit_dates(commit: Commit) -> None:
     """Fix commit dates if they're before 1990."""
     # Unix timestamp for 1990-01-01
     min_timestamp = 315532800
@@ -88,27 +139,112 @@ def fix_commit_dates(commit):
             commit.commit_time = new_time
 
 
-def rewrite_history(repo, source_branch, target_branch):
+def rewrite_commit(
+    repo: Repo,
+    commit_sha: ObjectID,
+    commit_map: dict[ObjectID, ObjectID],
+    tree_map: dict[ObjectID, ObjectID],
+    filtered_commits: set[ObjectID],
+) -> ObjectID | None:
+    """Rewrite a single commit and its ancestors.
+
+    This is used to rewrite commits that weren't part of the main branch
+    but are referenced by tags. Uses dulwich's walker for efficient traversal,
+    stopping at commits that have already been rewritten.
+    """
+    # If already mapped, return the mapped version
+    if commit_sha in commit_map:
+        return commit_map[commit_sha]
+
+    # Use walker to efficiently get commits in topological order
+    # Exclude commits that are already mapped to avoid reprocessing
+    exclude = list(commit_map.keys())
+
+    try:
+        # Get commits in reverse topological order (parents before children)
+        walker = repo.get_walker(
+            include=[commit_sha], exclude=exclude, order="topo", reverse=True
+        )
+        commits_to_process = []
+
+        for entry in walker:
+            commits_to_process.append(entry.commit)
+
+        print(
+            f"  Processing {len(commits_to_process)} unmapped commits for tag target {commit_sha.decode()[:8]}"
+        )
+
+    except Exception as e:
+        print(f"Warning: Could not walk commits from {commit_sha.decode()[:8]}: {e}")
+        return commit_sha
+
+    # Process commits in order (parents before children)
+    for old_commit in commits_to_process:
+        commit_sha_current = old_commit.id
+
+        # Skip if already mapped
+        if commit_sha_current in commit_map:
+            continue
+
+        # Handle filtered commits
+        if commit_sha_current in filtered_commits:
+            if old_commit.parents:
+                parent_sha = old_commit.parents[0]
+                if parent_sha in commit_map:
+                    commit_map[commit_sha_current] = commit_map[parent_sha]
+            continue
+
+        # Map parent commits
+        new_parents = []
+        for parent_sha in old_commit.parents:
+            if parent_sha in commit_map:
+                mapped = commit_map[parent_sha]
+                if mapped is not None:
+                    new_parents.append(mapped)
+            else:
+                # Parent should have been processed already due to topological order
+                # Use original as fallback
+                new_parents.append(parent_sha)
+
+        # Fix the tree
+        old_tree_id = old_commit.tree
+        if old_tree_id not in tree_map:
+            tree_map[old_tree_id] = fix_tree(repo, old_tree_id)
+        new_tree_id = tree_map[old_tree_id]
+
+        # Create new commit with fixes
+        new_commit = create_fixed_commit(old_commit, new_tree_id, new_parents)
+
+        # Add new commit to object store
+        repo.object_store.add_object(new_commit)
+        commit_map[commit_sha_current] = new_commit.id
+
+    return commit_map.get(commit_sha)
+
+
+def rewrite_history(
+    repo: Repo, source_branch: str, target_branch: str
+) -> RewriteResult | None:
     """Rewrite history to fix issues."""
     print(f"=== Rewriting history from {source_branch} to {target_branch} ===")
 
     # Commits to filter out completely
-    filtered_commits = {
-        b"336232af1246017ce037b87e913d23e2c2a3bbbd",
-        b"e673babfc11d0b4001d9d08b9b9cef57c6aa67f5",
+    filtered_commits: set[ObjectID] = {
+        ObjectID(b"336232af1246017ce037b87e913d23e2c2a3bbbd"),
+        ObjectID(b"e673babfc11d0b4001d9d08b9b9cef57c6aa67f5"),
     }
 
     # Get the head commit of the source branch
     try:
-        source_ref = f"refs/heads/{source_branch}".encode()
+        source_ref = Ref(f"refs/heads/{source_branch}".encode())
         head_sha = repo.refs[source_ref]
     except KeyError:
         print(f"Error: Branch '{source_branch}' not found")
-        return False
+        return None
 
     # Map old commit SHAs to new ones
-    commit_map = {}
-    tree_map = {}
+    commit_map: dict[ObjectID, ObjectID] = {}
+    tree_map: dict[ObjectID, ObjectID] = {}
 
     # Get all commits in topological order
     walker = repo.get_walker([head_sha], order="topo", reverse=True)
@@ -140,34 +276,14 @@ def rewrite_history(repo, source_branch, target_branch):
             tree_map[old_tree_id] = fix_tree(repo, old_tree_id)
         new_tree_id = tree_map[old_tree_id]
 
-        # Create new commit
-        new_commit = Commit()
-        new_commit.tree = new_tree_id
-        new_commit.author = old_commit.author
-        new_commit.committer = old_commit.committer
-        new_commit.author_time = old_commit.author_time
-        new_commit.commit_time = old_commit.commit_time
-        new_commit.author_timezone = old_commit.author_timezone
-        new_commit.commit_timezone = old_commit.commit_timezone
-        new_commit.message = old_commit.message
-        new_commit.encoding = old_commit.encoding
-        # note: Drop extra fields
-
-        # Fix dates
-        fix_commit_dates(new_commit)
-
-        if b"jvernooij@evroc.com" in old_commit.author:
-            new_commit.author = "Jelmer Vernooij <jelmer@jelmer.uk>".encode()
-
-        if b"jvernooij@evroc.com" in old_commit.committer:
-            new_commit.committer = "Jelmer Vernooij <jelmer@jelmer.uk>".encode()
-
         # Map parent commits
         new_parents = []
         for parent_sha in old_commit.parents:
             parent_sha = commit_map[parent_sha]
             new_parents.append(parent_sha)
-        new_commit.parents = new_parents
+
+        # Create new commit with fixes (note: Drop extra fields)
+        new_commit = create_fixed_commit(old_commit, new_tree_id, new_parents)
 
         if old_commit.parents != new_parents:
             assert old_commit.id != new_commit.id
@@ -179,22 +295,41 @@ def rewrite_history(repo, source_branch, target_branch):
 
     # Update the target branch
     new_head = commit_map[head_sha]
-    target_ref = f"refs/heads/{target_branch}".encode()
+    target_ref = Ref(f"refs/heads/{target_branch}".encode())
     repo.refs[target_ref] = new_head
 
     print(
         f"✓ Created branch '{target_branch}' with {len([k for k, v in commit_map.items() if k != v])} modified commits"
     )
-    return commit_map
+    return RewriteResult(
+        commit_map=commit_map, tree_map=tree_map, filtered_commits=filtered_commits
+    )
+
+
+def update_tags(
+    repo: Repo, rewrite_result: RewriteResult, rewrite_non_branch_commits: bool = False
+) -> list[str]:
+    """Update tags to point to rewritten commits.
 
+    Args:
+        repo: The repository
+        rewrite_result: RewriteResult containing commit_map, tree_map, and filtered_commits
+        rewrite_non_branch_commits: If True, also rewrite commits that tags point to
+                                     even if they weren't part of the main branch rewrite
 
-def update_tags(repo, commit_map):
-    """Update tags to point to rewritten commits."""
+    Returns:
+        List of tag names that were updated or rewritten
+    """
     print("")
     print("=== Updating tags ===")
 
+    commit_map = rewrite_result.commit_map
+    tree_map = rewrite_result.tree_map
+    filtered_commits = rewrite_result.filtered_commits
+
     updated_tags = []
     skipped_tags = []
+    rewritten_tags = []
 
     # Iterate through all refs looking for tags
     for ref_name, ref_value in list(repo.refs.as_dict().items()):
@@ -240,6 +375,37 @@ def update_tags(repo, commit_map):
                     updated_tags.append(tag_name)
                 else:
                     skipped_tags.append(tag_name)
+            elif rewrite_non_branch_commits:
+                # Rewrite this commit and its ancestors
+                print(
+                    f"Rewriting history for tag '{tag_name}' (commit {target_sha.decode()[:8]} not in branch)"
+                )
+                rewritten_sha = rewrite_commit(
+                    repo, target_sha, commit_map, tree_map, filtered_commits
+                )
+
+                if rewritten_sha and rewritten_sha != target_sha:
+                    # Create a new tag object pointing to the rewritten commit
+                    new_tag = Tag()
+                    new_tag.name = tag_obj.name
+                    new_tag.object = (tag_obj.object[0], rewritten_sha)
+                    new_tag.tag_time = tag_obj.tag_time
+                    new_tag.tag_timezone = tag_obj.tag_timezone
+                    new_tag.tagger = tag_obj.tagger
+                    new_tag.message = tag_obj.message
+
+                    # Add the new tag object to the object store
+                    repo.object_store.add_object(new_tag)
+
+                    # Update the ref to point to the new tag object
+                    repo.refs[ref_name] = new_tag.id
+
+                    print(
+                        f"Rewrote and updated annotated tag '{tag_name}': {target_sha.decode()[:8]} -> {rewritten_sha.decode()[:8]}"
+                    )
+                    rewritten_tags.append(tag_name)
+                else:
+                    skipped_tags.append(tag_name)
             else:
                 print(
                     f"Warning: Tag '{tag_name}' points to commit not in history, skipping"
@@ -263,6 +429,25 @@ def update_tags(repo, commit_map):
                     updated_tags.append(tag_name)
                 else:
                     skipped_tags.append(tag_name)
+            elif rewrite_non_branch_commits:
+                # Rewrite this commit and its ancestors
+                print(
+                    f"Rewriting history for tag '{tag_name}' (commit {commit_sha.decode()[:8]} not in branch)"
+                )
+                rewritten_sha = rewrite_commit(
+                    repo, commit_sha, commit_map, tree_map, filtered_commits
+                )
+
+                if rewritten_sha and rewritten_sha != commit_sha:
+                    # Update the ref to point to the new commit
+                    repo.refs[ref_name] = rewritten_sha
+
+                    print(
+                        f"Rewrote and updated lightweight tag '{tag_name}': {commit_sha.decode()[:8]} -> {rewritten_sha.decode()[:8]}"
+                    )
+                    rewritten_tags.append(tag_name)
+                else:
+                    skipped_tags.append(tag_name)
             else:
                 print(
                     f"Warning: Tag '{tag_name}' points to commit not in history, skipping"
@@ -275,15 +460,19 @@ def update_tags(repo, commit_map):
             skipped_tags.append(tag_name)
 
     print(f"✓ Updated {len(updated_tags)} tags")
+    if rewritten_tags:
+        print(
+            f"✓ Rewrote and updated {len(rewritten_tags)} tags (commits not in branch)"
+        )
     if skipped_tags:
         print(
             f"  Skipped {len(skipped_tags)} tags (unchanged or not in rewritten history)"
         )
 
-    return updated_tags
+    return updated_tags + rewritten_tags
 
 
-def main():
+def main() -> None:
     parser = argparse.ArgumentParser(
         description="Fix dulwich history by removing .git directories and updating old timestamps.",
         epilog="This will create a new branch <target-branch> with the rewritten history from <source-branch>",
@@ -297,12 +486,23 @@ def main():
         action="store_true",
         help="Update existing tags to point to rewritten commits",
     )
+    parser.add_argument(
+        "--rewrite-tag-commits",
+        action="store_true",
+        help="Also rewrite commits that tags point to, even if they aren't in the main branch history",
+    )
 
     args = parser.parse_args()
 
     source_branch = args.source_branch
     target_branch = args.target_branch
     update_tags_flag = args.update_tags
+    rewrite_tag_commits_flag = args.rewrite_tag_commits
+
+    # Validate flags
+    if rewrite_tag_commits_flag and not update_tags_flag:
+        print("Error: --rewrite-tag-commits requires --update-tags")
+        sys.exit(1)
 
     print("=== Dulwich History Fix Script ===")
     print("This script will:")
@@ -313,11 +513,17 @@ def main():
     )
     if update_tags_flag:
         print("4. Update existing tags to point to rewritten commits")
+        if rewrite_tag_commits_flag:
+            print(
+                "   - Including rewriting commits that tags point to outside the branch"
+            )
     print("")
     print(f"Source branch: {source_branch}")
     print(f"Target branch: {target_branch}")
     if update_tags_flag:
         print("Update tags: Yes")
+        if rewrite_tag_commits_flag:
+            print("Rewrite tag commits: Yes")
     print("")
 
     # Open the repository
@@ -328,13 +534,13 @@ def main():
         sys.exit(1)
 
     # Check if source branch exists
-    source_ref = f"refs/heads/{source_branch}".encode()
+    source_ref = Ref(f"refs/heads/{source_branch}".encode())
     if source_ref not in repo.refs:
         print(f"Error: Source branch '{source_branch}' does not exist")
         sys.exit(1)
 
     # Check if target branch already exists
-    target_ref = f"refs/heads/{target_branch}".encode()
+    target_ref = Ref(f"refs/heads/{target_branch}".encode())
     if target_ref in repo.refs:
         print(f"Error: Target branch '{target_branch}' already exists")
         print("Please delete it first or choose a different name")
@@ -368,13 +574,13 @@ def main():
 
     # Rewrite history
     print("")
-    commit_map = rewrite_history(repo, source_branch, target_branch)
-    if not commit_map:
+    rewrite_result = rewrite_history(repo, source_branch, target_branch)
+    if not rewrite_result:
         sys.exit(1)
 
     # Update tags if requested
     if update_tags_flag:
-        update_tags(repo, commit_map)
+        update_tags(repo, rewrite_result, rewrite_tag_commits_flag)
 
     print("")
     print("=== Complete ===")
@@ -387,7 +593,12 @@ def main():
     print("- Fixed commit timestamps that were before 1990")
     print(f"- Created clean history in branch '{target_branch}'")
     if update_tags_flag:
-        print("- Updated tags to point to rewritten commits")
+        if rewrite_tag_commits_flag:
+            print(
+                "- Updated tags to point to rewritten commits (including commits outside branch)"
+            )
+        else:
+            print("- Updated tags to point to rewritten commits")
     print("")
     print("IMPORTANT NEXT STEPS:")
     print(f"1. Review the changes: git log --oneline {target_branch}")
@@ -412,6 +623,10 @@ def main():
         print(
             "WARNING: Tags have been updated. You may need to force push them to remote."
         )
+        if rewrite_tag_commits_flag:
+            print(
+                "WARNING: Tag commits outside the branch were also rewritten. Review carefully."
+            )
 
 
 if __name__ == "__main__":