Browse Source

Improve history fix script

Jelmer Vernooij 2 tuần trước cách đây
mục cha
commit
20f30311cd
1 tập tin đã thay đổi với 39 bổ sung28 xóa
  1. 39 28
      devscripts/fix-history.py

+ 39 - 28
devscripts/fix-history.py

@@ -12,6 +12,8 @@ import time
 from dulwich.objects import Commit, Tree
 from dulwich.repo import Repo
 
+BANNED_NAMES = [".git"]
+
 
 def fix_tree(repo, tree_id, seen_trees=None):
     """Recursively fix a tree by removing .git entries."""
@@ -37,8 +39,7 @@ def fix_tree(repo, tree_id, seen_trees=None):
     for item in tree.items():
         name, mode, sha = item
 
-        # Skip .git entries
-        if name == b".git":
+        if name in BANNED_NAMES:
             modified = True
             continue
 
@@ -67,8 +68,6 @@ def fix_tree(repo, tree_id, seen_trees=None):
 
 def fix_commit_dates(commit):
     """Fix commit dates if they're before 1990."""
-    modified = False
-
     # Unix timestamp for 1990-01-01
     min_timestamp = 315532800
     max_timestamp = int(time.time())
@@ -79,7 +78,6 @@ def fix_commit_dates(commit):
         if min_timestamp <= new_time <= max_timestamp:
             print(f"Fixed author date: {commit.author_time} -> {new_time}")
             commit.author_time = new_time
-            modified = True
 
     # Fix committer date
     if commit.commit_time < min_timestamp:
@@ -87,15 +85,18 @@ def fix_commit_dates(commit):
         if min_timestamp <= new_time <= max_timestamp:
             print(f"Fixed committer date: {commit.commit_time} -> {new_time}")
             commit.commit_time = new_time
-            modified = True
-
-    return modified
 
 
 def rewrite_history(repo, source_branch, target_branch):
     """Rewrite history to fix issues."""
     print(f"=== Rewriting history from {source_branch} to {target_branch} ===")
 
+    # Commits to filter out completely
+    filtered_commits = {
+        b"336232af1246017ce037b87e913d23e2c2a3bbbd",
+        b"e673babfc11d0b4001d9d08b9b9cef57c6aa67f5",
+    }
+
     # Get the head commit of the source branch
     try:
         source_ref = f"refs/heads/{source_branch}".encode()
@@ -109,9 +110,8 @@ def rewrite_history(repo, source_branch, target_branch):
     tree_map = {}
 
     # Get all commits in topological order
-    walker = repo.get_walker([head_sha])
+    walker = repo.get_walker([head_sha], order="topo", reverse=True)
     commits = list(walker)
-    commits.reverse()  # Process from oldest to newest
 
     print(f"Processing {len(commits)} commits...")
 
@@ -121,6 +121,18 @@ def rewrite_history(repo, source_branch, target_branch):
         if i % 100 == 0:
             print(f"Processed {i}/{len(commits)} commits...")
 
+        # Skip filtered commits entirely
+        if old_commit.id in filtered_commits:
+            # Map this commit to its parent (skip it in the history)
+            if old_commit.parents:
+                # If the parent has been remapped, use the remapped version
+                parent_sha = old_commit.parents[0]
+                commit_map[old_commit.id] = commit_map[parent_sha]
+            else:
+                # This is a root commit, skip it by not adding to commit_map
+                pass
+            continue
+
         # Fix the tree
         old_tree_id = old_commit.tree
         if old_tree_id not in tree_map:
@@ -141,29 +153,28 @@ def rewrite_history(repo, source_branch, target_branch):
         # note: Drop extra fields
 
         # Fix dates
-        date_modified = fix_commit_dates(new_commit)
+        fix_commit_dates(new_commit)
+
+        if b"jvernooij@evroc.com" in old_commit.author:
+            new_commit.author = "Jelmer Vernooij <jelmer@jelmer.uk>".encode()
+
+        if b"jvernooij@evroc.com" in old_commit.committer:
+            new_commit.committer = "Jelmer Vernooij <jelmer@jelmer.uk>".encode()
 
         # Map parent commits
         new_parents = []
         for parent_sha in old_commit.parents:
-            if parent_sha in commit_map:
-                new_parents.append(commit_map[parent_sha])
-            else:
-                new_parents.append(parent_sha)
+            parent_sha = commit_map[parent_sha]
+            new_parents.append(parent_sha)
         new_commit.parents = new_parents
 
-        # Check if commit actually changed
-        if (
-            new_tree_id == old_tree_id
-            and not date_modified
-            and new_parents == list(old_commit.parents)
-        ):
-            # No changes needed, reuse old commit
-            commit_map[old_commit.id] = old_commit.id
-        else:
-            # Add new commit to object store
-            repo.object_store.add_object(new_commit)
-            commit_map[old_commit.id] = new_commit.id
+        if old_commit.parents != new_parents:
+            assert old_commit.id != new_commit.id
+
+        # Add new commit to object store
+        repo.object_store.add_object(new_commit)
+        assert old_commit.id not in commit_map
+        commit_map[old_commit.id] = new_commit.id
 
     # Update the target branch
     new_head = commit_map[head_sha]
@@ -229,7 +240,7 @@ def main():
         obj = repo[sha]
         if isinstance(obj, Tree):
             for name, mode, item_sha in obj.items():
-                if name == b".git":
+                if name in BANNED_NAMES:
                     bad_trees.append(sha)
                     break