| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633 |
- #!/usr/bin/env python3
- """Fix dulwich history by removing .git directories and updating old timestamps.
- Usage: ./fix-history.py <source-branch> <target-branch> [--update-tags] [--rewrite-tag-commits]
- Example: ./fix-history.py master main --update-tags --rewrite-tag-commits
- """
- import argparse
- import sys
- import time
- from dataclasses import dataclass
- from dulwich.objects import Commit, ObjectID, Tag, Tree
- from dulwich.refs import Ref
- from dulwich.repo import Repo
# Tree entry names that are stripped from every rewritten tree.
# Kept as a tuple so the constant cannot be mutated by accident; it is only
# ever used for membership tests (``name in BANNED_NAMES``).
BANNED_NAMES = (b".git",)
@dataclass
class RewriteResult:
    """Bookkeeping handed from the branch rewrite to the tag update step.

    Attributes:
        commit_map: Original commit ID -> ID of the commit that replaces it.
        tree_map: Original root tree ID -> ID of the tree after fixing.
        filtered_commits: IDs of commits that are dropped from the history.
    """

    commit_map: dict[ObjectID, ObjectID]
    tree_map: dict[ObjectID, ObjectID]
    filtered_commits: set[ObjectID]
def create_fixed_commit(
    old_commit: Commit, new_tree_id: ObjectID, new_parents: list[ObjectID]
) -> Commit:
    """Build the rewritten counterpart of ``old_commit``.

    The identity, timestamps, timezones, message and encoding are carried
    over from the original; the tree and parents are replaced by the values
    given. Timestamps are then passed through ``fix_commit_dates`` and any
    author/committer identity containing the evroc address is replaced.
    Nothing else is copied from the original commit.

    Args:
        old_commit: The original commit
        new_tree_id: The new tree SHA
        new_parents: List of parent commit SHAs

    Returns:
        A new, not yet stored, commit with the fixes applied
    """
    stale_email = b"jvernooij@evroc.com"
    replacement_identity = b"Jelmer Vernooij <jelmer@jelmer.uk>"
    copied_fields = (
        "author",
        "committer",
        "author_time",
        "commit_time",
        "author_timezone",
        "commit_timezone",
        "message",
        "encoding",
    )

    fixed = Commit()
    fixed.tree = new_tree_id
    for field in copied_fields:
        setattr(fixed, field, getattr(old_commit, field))
    fixed.parents = new_parents

    # Repair implausible timestamps on the copy, never on the original.
    fix_commit_dates(fixed)

    # The decision is made on the *original* identity strings.
    for field in ("author", "committer"):
        if stale_email in getattr(old_commit, field):
            setattr(fixed, field, replacement_identity)

    return fixed
def fix_tree(
    repo: Repo, tree_id: ObjectID, seen_trees: set[ObjectID] | None = None
) -> ObjectID:
    """Recursively fix a tree by removing .git entries.

    Entries whose name is in ``BANNED_NAMES`` are dropped, and sub-directories
    are fixed recursively. When anything changed, a new tree object is written
    to the repository's object store.

    Args:
        repo: Repository to read trees from and write fixed trees to
        tree_id: ID of the tree to fix
        seen_trees: Optional set of tree IDs to leave untouched. Every tree
            visited by this call is added to it. Defaults to a fresh set.

    Returns:
        The ID of the fixed tree, or ``tree_id`` itself when nothing had to
        change, the object is missing, or it is not a tree
    """
    if seen_trees is None:
        seen_trees = set()

    # Result cache for this call. The same subtree can be referenced from
    # several directories; without the cache the second reference would hit
    # the ``seen_trees`` guard and keep pointing at the unfixed tree.
    fixed_ids: dict[ObjectID, ObjectID] = {}

    def _fix(current_id: ObjectID) -> ObjectID:
        if current_id in fixed_ids:
            return fixed_ids[current_id]
        if current_id in seen_trees:
            # Either excluded by the caller or currently being processed
            # further up the stack; leave it as it is.
            return current_id
        seen_trees.add(current_id)

        result = current_id
        try:
            tree = repo[current_id]
        except KeyError:
            tree = None

        if isinstance(tree, Tree):
            modified = False
            kept = []
            for name, mode, sha in tree.items():
                if name in BANNED_NAMES:
                    modified = True
                    continue
                if mode == 0o040000:  # Directory mode
                    new_sha = _fix(sha)
                    if new_sha != sha:
                        modified = True
                        sha = new_sha
                kept.append((name, mode, sha))

            if modified:
                print(f"Removing .git entry from tree {current_id.decode()}")
                new_tree = Tree()
                for name, mode, sha in kept:
                    new_tree.add(name, mode, sha)
                repo.object_store.add_object(new_tree)
                result = new_tree.id

        fixed_ids[current_id] = result
        return result

    return _fix(tree_id)
def fix_commit_dates(commit: Commit) -> None:
    """Fix commit dates if they're before 1990.

    An author or committer time earlier than 1990-01-01 UTC is multiplied by
    ten (presumably such values lost their last digit; confirm against the
    affected commits). The new value is only applied when it lies between
    1990-01-01 UTC and the current time; otherwise the timestamp is left as
    it was.

    Args:
        commit: Commit whose ``author_time`` / ``commit_time`` are updated
            in place
    """
    # 1990-01-01T00:00:00Z. The value used previously, 315532800, is
    # 1980-01-01, which let a "fixed" date still fall before 1990.
    min_timestamp = 631152000
    max_timestamp = int(time.time())

    for attr, label in (("author_time", "author"), ("commit_time", "committer")):
        old_time = getattr(commit, attr)
        if old_time >= min_timestamp:
            continue
        new_time = old_time * 10
        if min_timestamp <= new_time <= max_timestamp:
            print(f"Fixed {label} date: {old_time} -> {new_time}")
            setattr(commit, attr, new_time)
def rewrite_commit(
    repo: Repo,
    commit_sha: ObjectID,
    commit_map: dict[ObjectID, ObjectID],
    tree_map: dict[ObjectID, ObjectID],
    filtered_commits: set[ObjectID],
) -> ObjectID | None:
    """Rewrite a single commit and its ancestors.

    This is used to rewrite commits that weren't part of the main branch
    but are referenced by tags. Uses dulwich's walker for traversal,
    excluding commits that have already been rewritten.

    Args:
        repo: Repository to read from and add rewritten objects to
        commit_sha: The commit to rewrite
        commit_map: Old commit ID -> new commit ID; extended in place
        tree_map: Old tree ID -> fixed tree ID; extended in place
        filtered_commits: Commits to leave out of the rewritten history

    Returns:
        The rewritten ID for ``commit_sha``; ``commit_sha`` itself when the
        history could not be walked; ``None`` when the commit ended up with
        no mapping (for example a filtered commit without a mapped parent)
    """
    # If already mapped, return the mapped version
    if commit_sha in commit_map:
        return commit_map[commit_sha]

    short_sha = commit_sha.decode()[:8]
    try:
        # Reverse topological order: parents before children. Commits that
        # are already mapped are excluded to avoid reprocessing them.
        walker = repo.get_walker(
            include=[commit_sha],
            exclude=list(commit_map),
            order="topo",
            reverse=True,
        )
        commits_to_process = [entry.commit for entry in walker]
    except Exception as e:
        # Best effort: a tag whose history cannot be walked is left alone.
        print(f"Warning: Could not walk commits from {short_sha}: {e}")
        return commit_sha

    print(
        f" Processing {len(commits_to_process)} unmapped commits for tag target {short_sha}"
    )

    for old_commit in commits_to_process:
        old_id = old_commit.id

        # Skip if already mapped
        if old_id in commit_map:
            continue

        # A filtered commit is replaced by its (rewritten) first parent.
        if old_id in filtered_commits:
            if old_commit.parents:
                parent_sha = old_commit.parents[0]
                if parent_sha in commit_map:
                    commit_map[old_id] = commit_map[parent_sha]
            continue

        # Map parent commits
        new_parents = []
        for parent_sha in old_commit.parents:
            if parent_sha in commit_map:
                mapped = commit_map[parent_sha]
                if mapped is not None:
                    new_parents.append(mapped)
            elif parent_sha not in filtered_commits:
                # Parent should have been processed already due to
                # topological order; use the original as fallback.
                new_parents.append(parent_sha)
            # else: the parent was filtered out and has nothing to map to.
            # Falling back to the original here would put the filtered
            # commit straight back into the rewritten history.

        # Fix the tree
        old_tree_id = old_commit.tree
        if old_tree_id not in tree_map:
            tree_map[old_tree_id] = fix_tree(repo, old_tree_id)
        new_tree_id = tree_map[old_tree_id]

        # Create the new commit and store it
        new_commit = create_fixed_commit(old_commit, new_tree_id, new_parents)
        repo.object_store.add_object(new_commit)
        commit_map[old_id] = new_commit.id

    return commit_map.get(commit_sha)
def rewrite_history(
    repo: Repo, source_branch: str, target_branch: str
) -> RewriteResult | None:
    """Rewrite history to fix issues.

    Walks every commit reachable from ``source_branch`` (parents first),
    fixes its tree and dates, drops the hard-coded filtered commits, and
    points ``target_branch`` at the rewritten head.

    Args:
        repo: Repository to rewrite
        source_branch: Name of the branch to read (without ``refs/heads/``)
        target_branch: Name of the branch to create or overwrite

    Returns:
        A RewriteResult with the commit and tree mappings, or ``None`` when
        the source branch does not exist or nothing is left after filtering
    """
    print(f"=== Rewriting history from {source_branch} to {target_branch} ===")

    # Commits to filter out completely
    filtered_commits: set[ObjectID] = {
        ObjectID(b"336232af1246017ce037b87e913d23e2c2a3bbbd"),
        ObjectID(b"e673babfc11d0b4001d9d08b9b9cef57c6aa67f5"),
    }

    # Get the head commit of the source branch
    source_ref = Ref(f"refs/heads/{source_branch}".encode())
    try:
        head_sha = repo.refs[source_ref]
    except KeyError:
        print(f"Error: Branch '{source_branch}' not found")
        return None

    # Map old commit/tree SHAs to new ones
    commit_map: dict[ObjectID, ObjectID] = {}
    tree_map: dict[ObjectID, ObjectID] = {}

    # All commits, parents before children
    commits = list(repo.get_walker([head_sha], order="topo", reverse=True))
    total = len(commits)
    print(f"Processing {total} commits...")

    for i, commit_entry in enumerate(commits):
        old_commit = commit_entry.commit
        if i % 100 == 0:
            print(f"Processed {i}/{total} commits...")

        if old_commit.id in filtered_commits:
            # Skip this commit by mapping it onto its rewritten first parent.
            # A filtered root commit (or one whose first parent was itself
            # filtered away) gets no mapping at all.
            if old_commit.parents:
                parent_sha = old_commit.parents[0]
                if parent_sha in commit_map or parent_sha not in filtered_commits:
                    commit_map[old_commit.id] = commit_map[parent_sha]
            continue

        # Fix the tree
        old_tree_id = old_commit.tree
        if old_tree_id not in tree_map:
            tree_map[old_tree_id] = fix_tree(repo, old_tree_id)
        new_tree_id = tree_map[old_tree_id]

        # Map parent commits. A parent that was filtered out without a
        # mapping no longer exists in the new history, so it is dropped
        # instead of raising KeyError. Any other missing parent is a real
        # inconsistency and still raises.
        new_parents = []
        for parent_sha in old_commit.parents:
            if parent_sha not in commit_map and parent_sha in filtered_commits:
                continue
            new_parents.append(commit_map[parent_sha])

        # Create new commit with fixes (note: Drop extra fields)
        new_commit = create_fixed_commit(old_commit, new_tree_id, new_parents)
        if old_commit.parents != new_parents:
            assert old_commit.id != new_commit.id

        # Add new commit to object store
        repo.object_store.add_object(new_commit)
        assert old_commit.id not in commit_map
        commit_map[old_commit.id] = new_commit.id

    # Update the target branch
    if head_sha not in commit_map:
        print(f"Error: Branch '{source_branch}' has no commits left after filtering")
        return None
    target_ref = Ref(f"refs/heads/{target_branch}".encode())
    repo.refs[target_ref] = commit_map[head_sha]

    modified_count = sum(1 for old, new in commit_map.items() if old != new)
    print(
        f"✓ Created branch '{target_branch}' with {modified_count} modified commits"
    )
    return RewriteResult(
        commit_map=commit_map, tree_map=tree_map, filtered_commits=filtered_commits
    )
def _retarget_tag(
    repo: Repo, ref_name: Ref, tag_obj: Tag | Commit, new_sha: ObjectID
) -> None:
    """Point ``ref_name`` at ``new_sha``, re-creating the tag object if annotated."""
    if not isinstance(tag_obj, Tag):
        # Lightweight tag: the ref points straight at the commit.
        repo.refs[ref_name] = new_sha
        return

    # Annotated tag: the tag object embeds its target, so a new object with
    # the same name, tagger, time and message is needed.
    new_tag = Tag()
    new_tag.name = tag_obj.name
    new_tag.object = (tag_obj.object[0], new_sha)
    new_tag.tag_time = tag_obj.tag_time
    new_tag.tag_timezone = tag_obj.tag_timezone
    new_tag.tagger = tag_obj.tagger
    new_tag.message = tag_obj.message
    repo.object_store.add_object(new_tag)
    repo.refs[ref_name] = new_tag.id


def update_tags(
    repo: Repo, rewrite_result: RewriteResult, rewrite_non_branch_commits: bool = False
) -> list[str]:
    """Update tags to point to rewritten commits.

    Args:
        repo: The repository
        rewrite_result: RewriteResult containing commit_map, tree_map, and filtered_commits
        rewrite_non_branch_commits: If True, also rewrite commits that tags point to
            even if they weren't part of the main branch rewrite

    Returns:
        List of tag names that were updated or rewritten
    """
    print("")
    print("=== Updating tags ===")

    commit_map = rewrite_result.commit_map
    tree_map = rewrite_result.tree_map
    filtered_commits = rewrite_result.filtered_commits

    updated_tags: list[str] = []
    skipped_tags: list[str] = []
    rewritten_tags: list[str] = []
    tag_prefix = b"refs/tags/"

    # Work on a snapshot of the refs because tag refs are rewritten below.
    for ref_name, ref_value in list(repo.refs.as_dict().items()):
        if not ref_name.startswith(tag_prefix):
            continue
        tag_name = ref_name[len(tag_prefix) :].decode()

        try:
            tag_obj = repo[ref_value]
        except KeyError:
            print(f"Warning: Could not find object for tag '{tag_name}'")
            continue

        # Work out which commit the tag refers to.
        if isinstance(tag_obj, Tag):
            kind = "annotated"
            old_sha = tag_obj.object[1]
        elif isinstance(tag_obj, Commit):
            kind = "lightweight"
            old_sha = ref_value
        else:
            print(
                f"Warning: Tag '{tag_name}' points to non-commit/non-tag object, skipping"
            )
            skipped_tags.append(tag_name)
            continue

        # Find the replacement commit, rewriting off-branch history on demand.
        if old_sha in commit_map:
            new_sha = commit_map[old_sha]
            verb, bucket = "Updated", updated_tags
        elif rewrite_non_branch_commits:
            print(
                f"Rewriting history for tag '{tag_name}' (commit {old_sha.decode()[:8]} not in branch)"
            )
            new_sha = rewrite_commit(
                repo, old_sha, commit_map, tree_map, filtered_commits
            )
            verb, bucket = "Rewrote and updated", rewritten_tags
        else:
            print(
                f"Warning: Tag '{tag_name}' points to commit not in history, skipping"
            )
            skipped_tags.append(tag_name)
            continue

        # Nothing to do when the commit has no replacement or is unchanged.
        if not new_sha or new_sha == old_sha:
            skipped_tags.append(tag_name)
            continue

        _retarget_tag(repo, ref_name, tag_obj, new_sha)
        print(
            f"{verb} {kind} tag '{tag_name}': {old_sha.decode()[:8]} -> {new_sha.decode()[:8]}"
        )
        bucket.append(tag_name)

    print(f"✓ Updated {len(updated_tags)} tags")
    if rewritten_tags:
        print(
            f"✓ Rewrote and updated {len(rewritten_tags)} tags (commits not in branch)"
        )
    if skipped_tags:
        print(
            f" Skipped {len(skipped_tags)} tags (unchanged or not in rewritten history)"
        )
    return updated_tags + rewritten_tags
def _parse_args() -> argparse.Namespace:
    """Define and parse the command line."""
    parser = argparse.ArgumentParser(
        description="Fix dulwich history by removing .git directories and updating old timestamps.",
        epilog="This will create a new branch <target-branch> with the rewritten history from <source-branch>",
    )
    parser.add_argument("source_branch", help="Source branch to rewrite from")
    parser.add_argument(
        "target_branch", help="Target branch to create with rewritten history"
    )
    parser.add_argument(
        "--update-tags",
        action="store_true",
        help="Update existing tags to point to rewritten commits",
    )
    parser.add_argument(
        "--rewrite-tag-commits",
        action="store_true",
        help="Also rewrite commits that tags point to, even if they aren't in the main branch history",
    )
    return parser.parse_args()


def _print_plan(
    source_branch: str,
    target_branch: str,
    update_tags_flag: bool,
    rewrite_tag_commits_flag: bool,
) -> None:
    """Print what the script is about to do."""
    print("=== Dulwich History Fix Script ===")
    print("This script will:")
    print("1. Remove .git directories from tree objects")
    print("2. Fix any commits with dates before 1990")
    print(
        f"3. Create new branch '{target_branch}' from '{source_branch}' with fixed history"
    )
    if update_tags_flag:
        print("4. Update existing tags to point to rewritten commits")
        if rewrite_tag_commits_flag:
            print(
                " - Including rewriting commits that tags point to outside the branch"
            )
    print("")
    print(f"Source branch: {source_branch}")
    print(f"Target branch: {target_branch}")
    if update_tags_flag:
        print("Update tags: Yes")
        if rewrite_tag_commits_flag:
            print("Rewrite tag commits: Yes")
    print("")


def _scan_object_store(repo: Repo) -> tuple[list, list]:
    """Collect problem objects in a single pass over the object store.

    Returns:
        A pair ``(bad_trees, bad_dates)``: IDs of trees that directly contain
        a banned entry, and IDs of commits whose author or committer time is
        before 1990-01-01 UTC
    """
    # 1990-01-01T00:00:00Z (315532800, used previously, is 1980-01-01).
    min_timestamp = 631152000
    bad_trees = []
    bad_dates = []
    for sha in repo.object_store:
        obj = repo[sha]
        if isinstance(obj, Tree):
            if any(name in BANNED_NAMES for name, _mode, _sha in obj.items()):
                bad_trees.append(sha)
        elif isinstance(obj, Commit):
            if obj.commit_time < min_timestamp or obj.author_time < min_timestamp:
                bad_dates.append(sha)
    return bad_trees, bad_dates


def _print_summary(
    source_branch: str,
    target_branch: str,
    update_tags_flag: bool,
    rewrite_tag_commits_flag: bool,
) -> None:
    """Print the closing summary and the suggested follow-up commands."""
    print("")
    print("=== Complete ===")
    print(
        f"Successfully created branch '{target_branch}' with fixed history from '{source_branch}'"
    )
    print("")
    print("Summary of changes:")
    print("- Removed .git directories from tree objects")
    print("- Fixed commit timestamps that were before 1990")
    print(f"- Created clean history in branch '{target_branch}'")
    if update_tags_flag:
        if rewrite_tag_commits_flag:
            print(
                "- Updated tags to point to rewritten commits (including commits outside branch)"
            )
        else:
            print("- Updated tags to point to rewritten commits")
    print("")
    print("IMPORTANT NEXT STEPS:")
    print(f"1. Review the changes: git log --oneline {target_branch}")
    print(
        f"2. Compare commit count: git rev-list --count {source_branch} vs git rev-list --count {target_branch}"
    )
    if update_tags_flag:
        print("3. Review updated tags: git tag -l")
    print(
        "3. If satisfied, you can:"
        if not update_tags_flag
        else "4. If satisfied, you can:"
    )
    print(f" - Push the new branch: git push origin {target_branch}")
    if update_tags_flag:
        print(" - Force push updated tags: git push origin --tags --force")
    print(" - Set it as default branch on GitHub/GitLab")
    print(f" - Update local checkout: git checkout {target_branch}")
    print("")
    print(f"The original branch '{source_branch}' remains unchanged.")
    if update_tags_flag:
        print(
            "WARNING: Tags have been updated. You may need to force push them to remote."
        )
        if rewrite_tag_commits_flag:
            print(
                "WARNING: Tag commits outside the branch were also rewritten. Review carefully."
            )


def main() -> None:
    """Command-line entry point.

    Parses the arguments, validates them against the repository in the
    current directory, reports problem trees and commits, rewrites
    ``source_branch`` into ``target_branch`` and optionally updates tags.
    Exits with status 1 on any validation or rewrite failure.
    """
    args = _parse_args()
    source_branch = args.source_branch
    target_branch = args.target_branch
    update_tags_flag = args.update_tags
    rewrite_tag_commits_flag = args.rewrite_tag_commits

    # Validate flags
    if rewrite_tag_commits_flag and not update_tags_flag:
        print("Error: --rewrite-tag-commits requires --update-tags")
        sys.exit(1)

    _print_plan(source_branch, target_branch, update_tags_flag, rewrite_tag_commits_flag)

    # Open the repository
    try:
        repo = Repo(".")
    except Exception as e:
        print(f"Error: Could not open repository: {e}")
        sys.exit(1)

    # ``sys.exit`` raises SystemExit, so the ``finally`` below closes the
    # repository on the error paths as well.
    try:
        # Check if source branch exists
        source_ref = Ref(f"refs/heads/{source_branch}".encode())
        if source_ref not in repo.refs:
            print(f"Error: Source branch '{source_branch}' does not exist")
            sys.exit(1)

        # Check if target branch already exists
        target_ref = Ref(f"refs/heads/{target_branch}".encode())
        if target_ref in repo.refs:
            print(f"Error: Target branch '{target_branch}' already exists")
            print("Please delete it first or choose a different name")
            sys.exit(1)

        # Identify problematic trees and commits. Both reports come from one
        # scan, so every object is loaded only once.
        print("")
        print("=== Identifying problematic trees ===")
        bad_trees, bad_dates = _scan_object_store(repo)
        print(f"Found {len(bad_trees)} trees with .git directories")
        print("")
        print("=== Identifying problematic commits ===")
        print(f"Found {len(bad_dates)} commits with dates before 1990")

        # Rewrite history
        print("")
        rewrite_result = rewrite_history(repo, source_branch, target_branch)
        if not rewrite_result:
            sys.exit(1)

        # Update tags if requested
        if update_tags_flag:
            update_tags(repo, rewrite_result, rewrite_tag_commits_flag)
    finally:
        repo.close()

    _print_summary(
        source_branch, target_branch, update_tags_flag, rewrite_tag_commits_flag
    )


if __name__ == "__main__":
    main()
|