2
0

fix-history.py 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291
  1. #!/usr/bin/env python3
  2. """Fix dulwich history by removing .git directories and updating old timestamps.
  3. Usage: ./fix-history.py <source-branch> <target-branch>
  4. Example: ./fix-history.py master main
  5. """
  6. import sys
  7. import time
  8. from dulwich.objects import Commit, Tree
  9. from dulwich.repo import Repo
  10. BANNED_NAMES = [".git"]
  11. def fix_tree(repo, tree_id, seen_trees=None):
  12. """Recursively fix a tree by removing .git entries."""
  13. if seen_trees is None:
  14. seen_trees = set()
  15. if tree_id in seen_trees:
  16. return tree_id
  17. seen_trees.add(tree_id)
  18. try:
  19. tree = repo[tree_id]
  20. except KeyError:
  21. return tree_id
  22. if not isinstance(tree, Tree):
  23. return tree_id
  24. # Check if this tree contains .git entries
  25. modified = False
  26. new_items = []
  27. for item in tree.items():
  28. name, mode, sha = item
  29. if name in BANNED_NAMES:
  30. modified = True
  31. continue
  32. # Recursively fix subtrees
  33. if mode == 0o040000: # Directory mode
  34. new_sha = fix_tree(repo, sha, seen_trees)
  35. if new_sha != sha:
  36. modified = True
  37. sha = new_sha
  38. new_items.append((name, mode, sha))
  39. if not modified:
  40. return tree_id
  41. print(f"Removing .git entry from tree {tree_id.decode()}")
  42. # Create new tree without .git entries
  43. new_tree = Tree()
  44. for name, mode, sha in new_items:
  45. new_tree.add(name, mode, sha)
  46. repo.object_store.add_object(new_tree)
  47. return new_tree.id
  48. def fix_commit_dates(commit):
  49. """Fix commit dates if they're before 1990."""
  50. # Unix timestamp for 1990-01-01
  51. min_timestamp = 315532800
  52. max_timestamp = int(time.time())
  53. # Fix author date
  54. if commit.author_time < min_timestamp:
  55. new_time = commit.author_time * 10
  56. if min_timestamp <= new_time <= max_timestamp:
  57. print(f"Fixed author date: {commit.author_time} -> {new_time}")
  58. commit.author_time = new_time
  59. # Fix committer date
  60. if commit.commit_time < min_timestamp:
  61. new_time = commit.commit_time * 10
  62. if min_timestamp <= new_time <= max_timestamp:
  63. print(f"Fixed committer date: {commit.commit_time} -> {new_time}")
  64. commit.commit_time = new_time
  65. def rewrite_history(repo, source_branch, target_branch):
  66. """Rewrite history to fix issues."""
  67. print(f"=== Rewriting history from {source_branch} to {target_branch} ===")
  68. # Commits to filter out completely
  69. filtered_commits = {
  70. b"336232af1246017ce037b87e913d23e2c2a3bbbd",
  71. b"e673babfc11d0b4001d9d08b9b9cef57c6aa67f5",
  72. }
  73. # Get the head commit of the source branch
  74. try:
  75. source_ref = f"refs/heads/{source_branch}".encode()
  76. head_sha = repo.refs[source_ref]
  77. except KeyError:
  78. print(f"Error: Branch '{source_branch}' not found")
  79. return False
  80. # Map old commit SHAs to new ones
  81. commit_map = {}
  82. tree_map = {}
  83. # Get all commits in topological order
  84. walker = repo.get_walker([head_sha], order="topo", reverse=True)
  85. commits = list(walker)
  86. print(f"Processing {len(commits)} commits...")
  87. for i, commit_entry in enumerate(commits):
  88. old_commit = commit_entry.commit
  89. if i % 100 == 0:
  90. print(f"Processed {i}/{len(commits)} commits...")
  91. # Skip filtered commits entirely
  92. if old_commit.id in filtered_commits:
  93. # Map this commit to its parent (skip it in the history)
  94. if old_commit.parents:
  95. # If the parent has been remapped, use the remapped version
  96. parent_sha = old_commit.parents[0]
  97. commit_map[old_commit.id] = commit_map[parent_sha]
  98. else:
  99. # This is a root commit, skip it by not adding to commit_map
  100. pass
  101. continue
  102. # Fix the tree
  103. old_tree_id = old_commit.tree
  104. if old_tree_id not in tree_map:
  105. tree_map[old_tree_id] = fix_tree(repo, old_tree_id)
  106. new_tree_id = tree_map[old_tree_id]
  107. # Create new commit
  108. new_commit = Commit()
  109. new_commit.tree = new_tree_id
  110. new_commit.author = old_commit.author
  111. new_commit.committer = old_commit.committer
  112. new_commit.author_time = old_commit.author_time
  113. new_commit.commit_time = old_commit.commit_time
  114. new_commit.author_timezone = old_commit.author_timezone
  115. new_commit.commit_timezone = old_commit.commit_timezone
  116. new_commit.message = old_commit.message
  117. new_commit.encoding = old_commit.encoding
  118. # note: Drop extra fields
  119. # Fix dates
  120. fix_commit_dates(new_commit)
  121. if b"jvernooij@evroc.com" in old_commit.author:
  122. new_commit.author = "Jelmer Vernooij <jelmer@jelmer.uk>".encode()
  123. if b"jvernooij@evroc.com" in old_commit.committer:
  124. new_commit.committer = "Jelmer Vernooij <jelmer@jelmer.uk>".encode()
  125. # Map parent commits
  126. new_parents = []
  127. for parent_sha in old_commit.parents:
  128. parent_sha = commit_map[parent_sha]
  129. new_parents.append(parent_sha)
  130. new_commit.parents = new_parents
  131. if old_commit.parents != new_parents:
  132. assert old_commit.id != new_commit.id
  133. # Add new commit to object store
  134. repo.object_store.add_object(new_commit)
  135. assert old_commit.id not in commit_map
  136. commit_map[old_commit.id] = new_commit.id
  137. # Update the target branch
  138. new_head = commit_map[head_sha]
  139. target_ref = f"refs/heads/{target_branch}".encode()
  140. repo.refs[target_ref] = new_head
  141. print(
  142. f"✓ Created branch '{target_branch}' with {len([k for k, v in commit_map.items() if k != v])} modified commits"
  143. )
  144. return True
  145. def main():
  146. if len(sys.argv) != 3:
  147. print(f"Usage: {sys.argv[0]} <source-branch> <target-branch>")
  148. print(f"Example: {sys.argv[0]} master main")
  149. print("")
  150. print(
  151. "This will create a new branch <target-branch> with the rewritten history from <source-branch>"
  152. )
  153. sys.exit(1)
  154. source_branch = sys.argv[1]
  155. target_branch = sys.argv[2]
  156. print("=== Dulwich History Fix Script ===")
  157. print("This script will:")
  158. print("1. Remove .git directories from tree objects")
  159. print("2. Fix any commits with dates before 1990")
  160. print(
  161. f"3. Create new branch '{target_branch}' from '{source_branch}' with fixed history"
  162. )
  163. print("")
  164. print(f"Source branch: {source_branch}")
  165. print(f"Target branch: {target_branch}")
  166. print("")
  167. # Open the repository
  168. try:
  169. repo = Repo(".")
  170. except Exception as e:
  171. print(f"Error: Could not open repository: {e}")
  172. sys.exit(1)
  173. # Check if source branch exists
  174. source_ref = f"refs/heads/{source_branch}".encode()
  175. if source_ref not in repo.refs:
  176. print(f"Error: Source branch '{source_branch}' does not exist")
  177. sys.exit(1)
  178. # Check if target branch already exists
  179. target_ref = f"refs/heads/{target_branch}".encode()
  180. if target_ref in repo.refs:
  181. print(f"Error: Target branch '{target_branch}' already exists")
  182. print("Please delete it first or choose a different name")
  183. sys.exit(1)
  184. # Identify problematic trees
  185. print("")
  186. print("=== Identifying problematic trees ===")
  187. bad_trees = []
  188. for sha in repo.object_store:
  189. obj = repo[sha]
  190. if isinstance(obj, Tree):
  191. for name, mode, item_sha in obj.items():
  192. if name in BANNED_NAMES:
  193. bad_trees.append(sha)
  194. break
  195. print(f"Found {len(bad_trees)} trees with .git directories")
  196. # Check for commits with bad dates
  197. print("")
  198. print("=== Identifying problematic commits ===")
  199. bad_dates = []
  200. for sha in repo.object_store:
  201. obj = repo[sha]
  202. if isinstance(obj, Commit):
  203. if obj.commit_time < 315532800 or obj.author_time < 315532800:
  204. bad_dates.append(sha)
  205. print(f"Found {len(bad_dates)} commits with dates before 1990")
  206. # Rewrite history
  207. print("")
  208. if not rewrite_history(repo, source_branch, target_branch):
  209. sys.exit(1)
  210. print("")
  211. print("=== Complete ===")
  212. print(
  213. f"Successfully created branch '{target_branch}' with fixed history from '{source_branch}'"
  214. )
  215. print("")
  216. print("Summary of changes:")
  217. print("- Removed .git directories from tree objects")
  218. print("- Fixed commit timestamps that were before 1990")
  219. print(f"- Created clean history in branch '{target_branch}'")
  220. print("")
  221. print("IMPORTANT NEXT STEPS:")
  222. print(f"1. Review the changes: git log --oneline {target_branch}")
  223. print(
  224. f"2. Compare commit count: git rev-list --count {source_branch} vs git rev-list --count {target_branch}"
  225. )
  226. print("3. If satisfied, you can:")
  227. print(f" - Push the new branch: git push origin {target_branch}")
  228. print(" - Set it as default branch on GitHub/GitLab")
  229. print(f" - Update local checkout: git checkout {target_branch}")
  230. print("")
  231. print(f"The original branch '{source_branch}' remains unchanged.")
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()