fix-history.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418
  1. #!/usr/bin/env python3
  2. """Fix dulwich history by removing .git directories and updating old timestamps.
  3. Usage: ./fix-history.py <source-branch> <target-branch> [--update-tags]
  4. Example: ./fix-history.py master main --update-tags
  5. """
  6. import argparse
  7. import sys
  8. import time
  9. from dulwich.objects import Commit, Tag, Tree
  10. from dulwich.repo import Repo
# Tree entry names that are stripped from every tree during the rewrite
# (compared against the raw bytes names of tree entries).
BANNED_NAMES = [b".git"]
  12. def fix_tree(repo, tree_id, seen_trees=None):
  13. """Recursively fix a tree by removing .git entries."""
  14. if seen_trees is None:
  15. seen_trees = set()
  16. if tree_id in seen_trees:
  17. return tree_id
  18. seen_trees.add(tree_id)
  19. try:
  20. tree = repo[tree_id]
  21. except KeyError:
  22. return tree_id
  23. if not isinstance(tree, Tree):
  24. return tree_id
  25. # Check if this tree contains .git entries
  26. modified = False
  27. new_items = []
  28. for item in tree.items():
  29. name, mode, sha = item
  30. if name in BANNED_NAMES:
  31. modified = True
  32. continue
  33. # Recursively fix subtrees
  34. if mode == 0o040000: # Directory mode
  35. new_sha = fix_tree(repo, sha, seen_trees)
  36. if new_sha != sha:
  37. modified = True
  38. sha = new_sha
  39. new_items.append((name, mode, sha))
  40. if not modified:
  41. return tree_id
  42. print(f"Removing .git entry from tree {tree_id.decode()}")
  43. # Create new tree without .git entries
  44. new_tree = Tree()
  45. for name, mode, sha in new_items:
  46. new_tree.add(name, mode, sha)
  47. repo.object_store.add_object(new_tree)
  48. return new_tree.id
  49. def fix_commit_dates(commit):
  50. """Fix commit dates if they're before 1990."""
  51. # Unix timestamp for 1990-01-01
  52. min_timestamp = 315532800
  53. max_timestamp = int(time.time())
  54. # Fix author date
  55. if commit.author_time < min_timestamp:
  56. new_time = commit.author_time * 10
  57. if min_timestamp <= new_time <= max_timestamp:
  58. print(f"Fixed author date: {commit.author_time} -> {new_time}")
  59. commit.author_time = new_time
  60. # Fix committer date
  61. if commit.commit_time < min_timestamp:
  62. new_time = commit.commit_time * 10
  63. if min_timestamp <= new_time <= max_timestamp:
  64. print(f"Fixed committer date: {commit.commit_time} -> {new_time}")
  65. commit.commit_time = new_time
  66. def rewrite_history(repo, source_branch, target_branch):
  67. """Rewrite history to fix issues."""
  68. print(f"=== Rewriting history from {source_branch} to {target_branch} ===")
  69. # Commits to filter out completely
  70. filtered_commits = {
  71. b"336232af1246017ce037b87e913d23e2c2a3bbbd",
  72. b"e673babfc11d0b4001d9d08b9b9cef57c6aa67f5",
  73. }
  74. # Get the head commit of the source branch
  75. try:
  76. source_ref = f"refs/heads/{source_branch}".encode()
  77. head_sha = repo.refs[source_ref]
  78. except KeyError:
  79. print(f"Error: Branch '{source_branch}' not found")
  80. return False
  81. # Map old commit SHAs to new ones
  82. commit_map = {}
  83. tree_map = {}
  84. # Get all commits in topological order
  85. walker = repo.get_walker([head_sha], order="topo", reverse=True)
  86. commits = list(walker)
  87. print(f"Processing {len(commits)} commits...")
  88. for i, commit_entry in enumerate(commits):
  89. old_commit = commit_entry.commit
  90. if i % 100 == 0:
  91. print(f"Processed {i}/{len(commits)} commits...")
  92. # Skip filtered commits entirely
  93. if old_commit.id in filtered_commits:
  94. # Map this commit to its parent (skip it in the history)
  95. if old_commit.parents:
  96. # If the parent has been remapped, use the remapped version
  97. parent_sha = old_commit.parents[0]
  98. commit_map[old_commit.id] = commit_map[parent_sha]
  99. else:
  100. # This is a root commit, skip it by not adding to commit_map
  101. pass
  102. continue
  103. # Fix the tree
  104. old_tree_id = old_commit.tree
  105. if old_tree_id not in tree_map:
  106. tree_map[old_tree_id] = fix_tree(repo, old_tree_id)
  107. new_tree_id = tree_map[old_tree_id]
  108. # Create new commit
  109. new_commit = Commit()
  110. new_commit.tree = new_tree_id
  111. new_commit.author = old_commit.author
  112. new_commit.committer = old_commit.committer
  113. new_commit.author_time = old_commit.author_time
  114. new_commit.commit_time = old_commit.commit_time
  115. new_commit.author_timezone = old_commit.author_timezone
  116. new_commit.commit_timezone = old_commit.commit_timezone
  117. new_commit.message = old_commit.message
  118. new_commit.encoding = old_commit.encoding
  119. # note: Drop extra fields
  120. # Fix dates
  121. fix_commit_dates(new_commit)
  122. if b"jvernooij@evroc.com" in old_commit.author:
  123. new_commit.author = "Jelmer Vernooij <jelmer@jelmer.uk>".encode()
  124. if b"jvernooij@evroc.com" in old_commit.committer:
  125. new_commit.committer = "Jelmer Vernooij <jelmer@jelmer.uk>".encode()
  126. # Map parent commits
  127. new_parents = []
  128. for parent_sha in old_commit.parents:
  129. parent_sha = commit_map[parent_sha]
  130. new_parents.append(parent_sha)
  131. new_commit.parents = new_parents
  132. if old_commit.parents != new_parents:
  133. assert old_commit.id != new_commit.id
  134. # Add new commit to object store
  135. repo.object_store.add_object(new_commit)
  136. assert old_commit.id not in commit_map
  137. commit_map[old_commit.id] = new_commit.id
  138. # Update the target branch
  139. new_head = commit_map[head_sha]
  140. target_ref = f"refs/heads/{target_branch}".encode()
  141. repo.refs[target_ref] = new_head
  142. print(
  143. f"✓ Created branch '{target_branch}' with {len([k for k, v in commit_map.items() if k != v])} modified commits"
  144. )
  145. return commit_map
  146. def update_tags(repo, commit_map):
  147. """Update tags to point to rewritten commits."""
  148. print("")
  149. print("=== Updating tags ===")
  150. updated_tags = []
  151. skipped_tags = []
  152. # Iterate through all refs looking for tags
  153. for ref_name, ref_value in list(repo.refs.as_dict().items()):
  154. if not ref_name.startswith(b"refs/tags/"):
  155. continue
  156. tag_name = ref_name[len(b"refs/tags/") :].decode()
  157. # Try to get the tag object
  158. try:
  159. tag_obj = repo[ref_value]
  160. except KeyError:
  161. print(f"Warning: Could not find object for tag '{tag_name}'")
  162. continue
  163. # Handle annotated tags (Tag objects)
  164. if isinstance(tag_obj, Tag):
  165. # Get the commit that the tag points to
  166. target_sha = tag_obj.object[1]
  167. if target_sha in commit_map:
  168. new_target_sha = commit_map[target_sha]
  169. if new_target_sha != target_sha:
  170. # Create a new tag object pointing to the rewritten commit
  171. new_tag = Tag()
  172. new_tag.name = tag_obj.name
  173. new_tag.object = (tag_obj.object[0], new_target_sha)
  174. new_tag.tag_time = tag_obj.tag_time
  175. new_tag.tag_timezone = tag_obj.tag_timezone
  176. new_tag.tagger = tag_obj.tagger
  177. new_tag.message = tag_obj.message
  178. # Add the new tag object to the object store
  179. repo.object_store.add_object(new_tag)
  180. # Update the ref to point to the new tag object
  181. repo.refs[ref_name] = new_tag.id
  182. print(
  183. f"Updated annotated tag '{tag_name}': {target_sha.decode()[:8]} -> {new_target_sha.decode()[:8]}"
  184. )
  185. updated_tags.append(tag_name)
  186. else:
  187. skipped_tags.append(tag_name)
  188. else:
  189. print(
  190. f"Warning: Tag '{tag_name}' points to commit not in history, skipping"
  191. )
  192. skipped_tags.append(tag_name)
  193. # Handle lightweight tags (direct references to commits)
  194. elif isinstance(tag_obj, Commit):
  195. commit_sha = ref_value
  196. if commit_sha in commit_map:
  197. new_commit_sha = commit_map[commit_sha]
  198. if new_commit_sha != commit_sha:
  199. # Update the ref to point to the new commit
  200. repo.refs[ref_name] = new_commit_sha
  201. print(
  202. f"Updated lightweight tag '{tag_name}': {commit_sha.decode()[:8]} -> {new_commit_sha.decode()[:8]}"
  203. )
  204. updated_tags.append(tag_name)
  205. else:
  206. skipped_tags.append(tag_name)
  207. else:
  208. print(
  209. f"Warning: Tag '{tag_name}' points to commit not in history, skipping"
  210. )
  211. skipped_tags.append(tag_name)
  212. else:
  213. print(
  214. f"Warning: Tag '{tag_name}' points to non-commit/non-tag object, skipping"
  215. )
  216. skipped_tags.append(tag_name)
  217. print(f"✓ Updated {len(updated_tags)} tags")
  218. if skipped_tags:
  219. print(
  220. f" Skipped {len(skipped_tags)} tags (unchanged or not in rewritten history)"
  221. )
  222. return updated_tags
  223. def main():
  224. parser = argparse.ArgumentParser(
  225. description="Fix dulwich history by removing .git directories and updating old timestamps.",
  226. epilog="This will create a new branch <target-branch> with the rewritten history from <source-branch>",
  227. )
  228. parser.add_argument("source_branch", help="Source branch to rewrite from")
  229. parser.add_argument(
  230. "target_branch", help="Target branch to create with rewritten history"
  231. )
  232. parser.add_argument(
  233. "--update-tags",
  234. action="store_true",
  235. help="Update existing tags to point to rewritten commits",
  236. )
  237. args = parser.parse_args()
  238. source_branch = args.source_branch
  239. target_branch = args.target_branch
  240. update_tags_flag = args.update_tags
  241. print("=== Dulwich History Fix Script ===")
  242. print("This script will:")
  243. print("1. Remove .git directories from tree objects")
  244. print("2. Fix any commits with dates before 1990")
  245. print(
  246. f"3. Create new branch '{target_branch}' from '{source_branch}' with fixed history"
  247. )
  248. if update_tags_flag:
  249. print("4. Update existing tags to point to rewritten commits")
  250. print("")
  251. print(f"Source branch: {source_branch}")
  252. print(f"Target branch: {target_branch}")
  253. if update_tags_flag:
  254. print("Update tags: Yes")
  255. print("")
  256. # Open the repository
  257. try:
  258. repo = Repo(".")
  259. except Exception as e:
  260. print(f"Error: Could not open repository: {e}")
  261. sys.exit(1)
  262. # Check if source branch exists
  263. source_ref = f"refs/heads/{source_branch}".encode()
  264. if source_ref not in repo.refs:
  265. print(f"Error: Source branch '{source_branch}' does not exist")
  266. sys.exit(1)
  267. # Check if target branch already exists
  268. target_ref = f"refs/heads/{target_branch}".encode()
  269. if target_ref in repo.refs:
  270. print(f"Error: Target branch '{target_branch}' already exists")
  271. print("Please delete it first or choose a different name")
  272. sys.exit(1)
  273. # Identify problematic trees
  274. print("")
  275. print("=== Identifying problematic trees ===")
  276. bad_trees = []
  277. for sha in repo.object_store:
  278. obj = repo[sha]
  279. if isinstance(obj, Tree):
  280. for name, mode, item_sha in obj.items():
  281. if name in BANNED_NAMES:
  282. bad_trees.append(sha)
  283. break
  284. print(f"Found {len(bad_trees)} trees with .git directories")
  285. # Check for commits with bad dates
  286. print("")
  287. print("=== Identifying problematic commits ===")
  288. bad_dates = []
  289. for sha in repo.object_store:
  290. obj = repo[sha]
  291. if isinstance(obj, Commit):
  292. if obj.commit_time < 315532800 or obj.author_time < 315532800:
  293. bad_dates.append(sha)
  294. print(f"Found {len(bad_dates)} commits with dates before 1990")
  295. # Rewrite history
  296. print("")
  297. commit_map = rewrite_history(repo, source_branch, target_branch)
  298. if not commit_map:
  299. sys.exit(1)
  300. # Update tags if requested
  301. if update_tags_flag:
  302. update_tags(repo, commit_map)
  303. print("")
  304. print("=== Complete ===")
  305. print(
  306. f"Successfully created branch '{target_branch}' with fixed history from '{source_branch}'"
  307. )
  308. print("")
  309. print("Summary of changes:")
  310. print("- Removed .git directories from tree objects")
  311. print("- Fixed commit timestamps that were before 1990")
  312. print(f"- Created clean history in branch '{target_branch}'")
  313. if update_tags_flag:
  314. print("- Updated tags to point to rewritten commits")
  315. print("")
  316. print("IMPORTANT NEXT STEPS:")
  317. print(f"1. Review the changes: git log --oneline {target_branch}")
  318. print(
  319. f"2. Compare commit count: git rev-list --count {source_branch} vs git rev-list --count {target_branch}"
  320. )
  321. if update_tags_flag:
  322. print("3. Review updated tags: git tag -l")
  323. print(
  324. "3. If satisfied, you can:"
  325. if not update_tags_flag
  326. else "4. If satisfied, you can:"
  327. )
  328. print(f" - Push the new branch: git push origin {target_branch}")
  329. if update_tags_flag:
  330. print(" - Force push updated tags: git push origin --tags --force")
  331. print(" - Set it as default branch on GitHub/GitLab")
  332. print(f" - Update local checkout: git checkout {target_branch}")
  333. print("")
  334. print(f"The original branch '{source_branch}' remains unchanged.")
  335. if update_tags_flag:
  336. print(
  337. "WARNING: Tags have been updated. You may need to force push them to remote."
  338. )
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()