2
0

fix-history.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633
  1. #!/usr/bin/env python3
  2. """Fix dulwich history by removing .git directories and updating old timestamps.
  3. Usage: ./fix-history.py <source-branch> <target-branch> [--update-tags] [--rewrite-tag-commits]
  4. Example: ./fix-history.py master main --update-tags --rewrite-tag-commits
  5. """
  6. import argparse
  7. import sys
  8. import time
  9. from dataclasses import dataclass
  10. from dulwich.objects import Commit, ObjectID, Tag, Tree
  11. from dulwich.refs import Ref
  12. from dulwich.repo import Repo
  # Tree entry names that fix_tree() removes and that main() counts when
  # reporting problematic trees.
  BANNED_NAMES = [
      b".git",
  ]
  @dataclass
  class RewriteResult:
      """State produced by a history rewrite, handed on to the tag updater.

      Attributes:
          commit_map: Original commit ID -> ID of the commit that replaces it
              (a filtered commit maps to its first parent's replacement).
          tree_map: Original root tree ID -> ID returned by ``fix_tree`` for it.
          filtered_commits: IDs of the commits dropped from the new history.
      """

      commit_map: dict[ObjectID, ObjectID]
      tree_map: dict[ObjectID, ObjectID]
      filtered_commits: set[ObjectID]
  def create_fixed_commit(
      old_commit: Commit, new_tree_id: ObjectID, new_parents: list[ObjectID]
  ) -> Commit:
      """Build the rewritten counterpart of ``old_commit``.

      Only the fields listed in ``copied_fields`` are carried over from the
      original; the tree and parents come from the arguments. Afterwards the
      timestamps are passed through ``fix_commit_dates`` and any identity
      containing the evroc.com address is replaced.

      Args:
          old_commit: The original commit.
          new_tree_id: ID of the tree the new commit should reference.
          new_parents: IDs of the new commit's parents.

      Returns:
          The new, not yet stored, commit object.
      """
      copied_fields = (
          "author",
          "committer",
          "author_time",
          "commit_time",
          "author_timezone",
          "commit_timezone",
          "message",
          "encoding",
      )

      fixed = Commit()
      fixed.tree = new_tree_id
      for field_name in copied_fields:
          setattr(fixed, field_name, getattr(old_commit, field_name))
      fixed.parents = new_parents

      # Repair implausibly old timestamps on the copy.
      fix_commit_dates(fixed)

      # Replace the unwanted address in either identity field.
      stale_email = b"jvernooij@evroc.com"
      canonical_identity = "Jelmer Vernooij <jelmer@jelmer.uk>".encode()
      if stale_email in old_commit.author:
          fixed.author = canonical_identity
      if stale_email in old_commit.committer:
          fixed.committer = canonical_identity

      return fixed
  def fix_tree(
      repo: Repo, tree_id: ObjectID, seen_trees: set[ObjectID] | None = None
  ) -> ObjectID:
      """Recursively fix a tree by removing entries named in ``BANNED_NAMES``.

      Subtrees (entries with mode ``0o040000``) are fixed first; a tree is
      re-created whenever one of its own entries was dropped or one of its
      subtrees changed. New trees are added to ``repo.object_store``.

      A subtree that occurs at several paths is fixed once and the same
      replacement ID is reused for every occurrence. (Previously a repeated
      subtree was returned unfixed the second time it was encountered.)

      Args:
          repo: Repository to read trees from and store new trees in.
          tree_id: ID of the tree to fix.
          seen_trees: Optional set of tree IDs to leave untouched. Every tree
              visited by this call is added to it.

      Returns:
          The ID of the fixed tree, or ``tree_id`` itself when nothing had to
          change, the object is missing from ``repo``, or it is not a tree.
      """
      if seen_trees is None:
          seen_trees = set()

      # Old tree ID -> ID to use in its place, for trees handled by this call.
      replacements: dict = {}

      def visit(current_id):
          if current_id in replacements:
              return replacements[current_id]
          if current_id in seen_trees:
              # Supplied by the caller as already handled: leave it alone.
              return current_id
          seen_trees.add(current_id)
          replacements[current_id] = current_id

          try:
              tree = repo[current_id]
          except KeyError:
              return current_id
          if not isinstance(tree, Tree):
              return current_id

          modified = False
          kept_items = []
          for name, mode, sha in tree.items():
              if name in BANNED_NAMES:
                  modified = True
                  continue
              if mode == 0o040000:  # Directory mode
                  new_sha = visit(sha)
                  if new_sha != sha:
                      modified = True
                      sha = new_sha
              kept_items.append((name, mode, sha))

          if not modified:
              return current_id

          print(f"Removing .git entry from tree {current_id.decode()}")

          new_tree = Tree()
          for name, mode, sha in kept_items:
              new_tree.add(name, mode, sha)
          repo.object_store.add_object(new_tree)

          replacements[current_id] = new_tree.id
          return new_tree.id

      return visit(tree_id)
  def fix_commit_dates(commit: Commit) -> None:
      """Repair author/committer timestamps that are implausibly old, in place.

      A timestamp below the cutoff is multiplied by ten; the result is kept
      only if it lies between the cutoff and the current time (inclusive).
      Otherwise the timestamp is left untouched.

      NOTE(review): the cutoff 315532800 is 1980-01-01 00:00:00 UTC, although
      the script's messages speak of "before 1990" - confirm which is meant.

      Args:
          commit: Commit whose ``author_time`` / ``commit_time`` may be updated.
      """
      earliest = 315532800
      latest = int(time.time())

      for attr, label in (("author_time", "author"), ("commit_time", "committer")):
          original = getattr(commit, attr)
          if original >= earliest:
              continue
          candidate = original * 10
          if earliest <= candidate <= latest:
              print(f"Fixed {label} date: {original} -> {candidate}")
              setattr(commit, attr, candidate)
  def rewrite_commit(
      repo: Repo,
      commit_sha: ObjectID,
      commit_map: dict[ObjectID, ObjectID],
      tree_map: dict[ObjectID, ObjectID],
      filtered_commits: set[ObjectID],
  ) -> ObjectID | None:
      """Rewrite ``commit_sha`` together with its not-yet-rewritten ancestors.

      Used for commits that tags point to but that were not covered by the
      main branch rewrite. The repository walker is asked for the commits
      reachable from ``commit_sha``, excluding everything already present in
      ``commit_map``, parents first. ``commit_map`` and ``tree_map`` are
      updated in place.

      Args:
          repo: Repository to read from and add new objects to.
          commit_sha: ID of the commit to rewrite.
          commit_map: Old commit ID -> new commit ID, shared across calls.
          tree_map: Old root tree ID -> fixed tree ID, shared across calls.
          filtered_commits: Commits to drop; each is mapped to whatever its
              first parent maps to, if that parent is mapped.

      Returns:
          The new ID for ``commit_sha``; ``commit_sha`` itself if the walk
          failed; ``None`` if no mapping was recorded for it.
      """
      try:
          return commit_map[commit_sha]
      except KeyError:
          pass

      # Anything already mapped (and its ancestry) need not be walked again.
      exclude = list(commit_map)

      try:
          walker = repo.get_walker(
              include=[commit_sha], exclude=exclude, order="topo", reverse=True
          )
          commits_to_process = [entry.commit for entry in walker]
          print(
              f" Processing {len(commits_to_process)} unmapped commits for tag target {commit_sha.decode()[:8]}"
          )
      except Exception as e:
          print(f"Warning: Could not walk commits from {commit_sha.decode()[:8]}: {e}")
          return commit_sha

      for old_commit in commits_to_process:
          current_sha = old_commit.id
          if current_sha in commit_map:
              continue

          if current_sha in filtered_commits:
              # Collapse a dropped commit onto its first parent's replacement.
              parents = old_commit.parents
              if parents and parents[0] in commit_map:
                  commit_map[current_sha] = commit_map[parents[0]]
              continue

          new_parents = []
          for parent_sha in old_commit.parents:
              if parent_sha not in commit_map:
                  # Expected to be mapped already given the walk order;
                  # fall back to the original parent.
                  new_parents.append(parent_sha)
                  continue
              mapped = commit_map[parent_sha]
              if mapped is not None:
                  new_parents.append(mapped)

          old_tree_id = old_commit.tree
          try:
              new_tree_id = tree_map[old_tree_id]
          except KeyError:
              new_tree_id = fix_tree(repo, old_tree_id)
              tree_map[old_tree_id] = new_tree_id

          new_commit = create_fixed_commit(old_commit, new_tree_id, new_parents)
          repo.object_store.add_object(new_commit)
          commit_map[current_sha] = new_commit.id

      return commit_map.get(commit_sha)
  def rewrite_history(
      repo: Repo, source_branch: str, target_branch: str
  ) -> RewriteResult | None:
      """Rewrite the history of ``source_branch`` into ``target_branch``.

      Every commit reachable from the source branch head is re-created,
      parents first, with its tree passed through ``fix_tree`` and its
      metadata passed through ``create_fixed_commit``. Commits listed in
      ``filtered_commits`` are dropped: each is mapped to the replacement of
      its first parent, and a dropped root commit is simply left unmapped so
      that its children lose that parent.

      Args:
          repo: Repository to rewrite; new objects and the target ref are
              written to it.
          source_branch: Name of the branch (without ``refs/heads/``) to read.
          target_branch: Name of the branch to point at the rewritten head.

      Returns:
          A ``RewriteResult`` with the commit/tree mappings and the filtered
          commit IDs, or ``None`` if the source branch does not exist or no
          rewritten head could be determined.
      """
      print(f"=== Rewriting history from {source_branch} to {target_branch} ===")

      # Commits to filter out completely
      filtered_commits: set[ObjectID] = {
          ObjectID(b"336232af1246017ce037b87e913d23e2c2a3bbbd"),
          ObjectID(b"e673babfc11d0b4001d9d08b9b9cef57c6aa67f5"),
      }

      # Get the head commit of the source branch
      try:
          source_ref = Ref(f"refs/heads/{source_branch}".encode())
          head_sha = repo.refs[source_ref]
      except KeyError:
          print(f"Error: Branch '{source_branch}' not found")
          return None

      # Map old commit SHAs to new ones
      commit_map: dict[ObjectID, ObjectID] = {}
      tree_map: dict[ObjectID, ObjectID] = {}

      # Get all commits in topological order (parents before children)
      walker = repo.get_walker([head_sha], order="topo", reverse=True)
      commits = list(walker)
      print(f"Processing {len(commits)} commits...")

      for i, commit_entry in enumerate(commits):
          old_commit = commit_entry.commit
          if i % 100 == 0:
              print(f"Processed {i}/{len(commits)} commits...")

          # Skip filtered commits entirely
          if old_commit.id in filtered_commits:
              if old_commit.parents:
                  # Map this commit to its first parent's replacement. The
                  # parent is unmapped only if it was itself dropped without
                  # a replacement; then this commit stays unmapped as well.
                  parent_sha = old_commit.parents[0]
                  if parent_sha in commit_map:
                      commit_map[old_commit.id] = commit_map[parent_sha]
              # A filtered root commit is skipped by not adding it to commit_map.
              continue

          # Fix the tree
          old_tree_id = old_commit.tree
          if old_tree_id not in tree_map:
              tree_map[old_tree_id] = fix_tree(repo, old_tree_id)
          new_tree_id = tree_map[old_tree_id]

          # Map parent commits. Parents are processed before their children,
          # so a parent missing from commit_map was dropped without a
          # replacement and is omitted instead of raising KeyError.
          new_parents = [
              commit_map[parent_sha]
              for parent_sha in old_commit.parents
              if parent_sha in commit_map
          ]

          # Create new commit with fixes (note: Drop extra fields)
          new_commit = create_fixed_commit(old_commit, new_tree_id, new_parents)
          if old_commit.parents != new_parents:
              assert old_commit.id != new_commit.id

          # Add new commit to object store
          repo.object_store.add_object(new_commit)
          assert old_commit.id not in commit_map
          commit_map[old_commit.id] = new_commit.id

      # Update the target branch
      new_head = commit_map.get(head_sha)
      if new_head is None:
          # Only possible when the head itself was dropped with no replacement.
          print(f"Error: No commits left on '{source_branch}' after filtering")
          return None
      target_ref = Ref(f"refs/heads/{target_branch}".encode())
      repo.refs[target_ref] = new_head

      print(
          f"✓ Created branch '{target_branch}' with {len([k for k, v in commit_map.items() if k != v])} modified commits"
      )

      return RewriteResult(
          commit_map=commit_map, tree_map=tree_map, filtered_commits=filtered_commits
      )
  def _retarget_annotated_tag(repo, ref_name, tag_obj, new_target_sha) -> None:
      """Store a copy of ``tag_obj`` aimed at ``new_target_sha`` and repoint ``ref_name`` to it."""
      new_tag = Tag()
      new_tag.name = tag_obj.name
      new_tag.object = (tag_obj.object[0], new_target_sha)
      new_tag.tag_time = tag_obj.tag_time
      new_tag.tag_timezone = tag_obj.tag_timezone
      new_tag.tagger = tag_obj.tagger
      new_tag.message = tag_obj.message
      repo.object_store.add_object(new_tag)
      repo.refs[ref_name] = new_tag.id


  def update_tags(
      repo: Repo, rewrite_result: RewriteResult, rewrite_non_branch_commits: bool = False
  ) -> list[str]:
      """Repoint tags at the rewritten versions of their target commits.

      Annotated tags are re-created (name, target type, time, timezone, tagger
      and message are copied) with the new target; lightweight tags have their
      ref updated directly. Tags whose target is unchanged, unknown to the
      rewrite, or neither a tag nor a commit object are counted as skipped.

      Args:
          repo: The repository.
          rewrite_result: Mappings and filtered commits from the branch rewrite.
          rewrite_non_branch_commits: If True, targets that are missing from
              the commit map are rewritten on demand via ``rewrite_commit``.

      Returns:
          Names of the tags that were updated, followed by the names of the
          tags whose targets were rewritten on demand.
      """
      print("")
      print("=== Updating tags ===")

      commit_map = rewrite_result.commit_map
      tree_map = rewrite_result.tree_map
      filtered_commits = rewrite_result.filtered_commits

      updated_tags = []
      skipped_tags = []
      rewritten_tags = []

      tag_prefix = b"refs/tags/"

      # Snapshot the refs first: the loop body rewrites refs as it goes.
      for ref_name, ref_value in list(repo.refs.as_dict().items()):
          if not ref_name.startswith(tag_prefix):
              continue
          tag_name = ref_name[len(tag_prefix) :].decode()

          try:
              tag_obj = repo[ref_value]
          except KeyError:
              print(f"Warning: Could not find object for tag '{tag_name}'")
              continue

          if isinstance(tag_obj, Tag):
              # Annotated tag: the target is stored inside the tag object.
              target_sha = tag_obj.object[1]

              if target_sha in commit_map:
                  new_target_sha = commit_map[target_sha]
                  if new_target_sha == target_sha:
                      skipped_tags.append(tag_name)
                      continue
                  _retarget_annotated_tag(repo, ref_name, tag_obj, new_target_sha)
                  print(
                      f"Updated annotated tag '{tag_name}': {target_sha.decode()[:8]} -> {new_target_sha.decode()[:8]}"
                  )
                  updated_tags.append(tag_name)
                  continue

              if not rewrite_non_branch_commits:
                  print(
                      f"Warning: Tag '{tag_name}' points to commit not in history, skipping"
                  )
                  skipped_tags.append(tag_name)
                  continue

              print(
                  f"Rewriting history for tag '{tag_name}' (commit {target_sha.decode()[:8]} not in branch)"
              )
              rewritten_sha = rewrite_commit(
                  repo, target_sha, commit_map, tree_map, filtered_commits
              )
              if not rewritten_sha or rewritten_sha == target_sha:
                  skipped_tags.append(tag_name)
                  continue
              _retarget_annotated_tag(repo, ref_name, tag_obj, rewritten_sha)
              print(
                  f"Rewrote and updated annotated tag '{tag_name}': {target_sha.decode()[:8]} -> {rewritten_sha.decode()[:8]}"
              )
              rewritten_tags.append(tag_name)

          elif isinstance(tag_obj, Commit):
              # Lightweight tag: the ref points straight at the commit.
              commit_sha = ref_value

              if commit_sha in commit_map:
                  new_commit_sha = commit_map[commit_sha]
                  if new_commit_sha == commit_sha:
                      skipped_tags.append(tag_name)
                      continue
                  repo.refs[ref_name] = new_commit_sha
                  print(
                      f"Updated lightweight tag '{tag_name}': {commit_sha.decode()[:8]} -> {new_commit_sha.decode()[:8]}"
                  )
                  updated_tags.append(tag_name)
                  continue

              if not rewrite_non_branch_commits:
                  print(
                      f"Warning: Tag '{tag_name}' points to commit not in history, skipping"
                  )
                  skipped_tags.append(tag_name)
                  continue

              print(
                  f"Rewriting history for tag '{tag_name}' (commit {commit_sha.decode()[:8]} not in branch)"
              )
              rewritten_sha = rewrite_commit(
                  repo, commit_sha, commit_map, tree_map, filtered_commits
              )
              if not rewritten_sha or rewritten_sha == commit_sha:
                  skipped_tags.append(tag_name)
                  continue
              repo.refs[ref_name] = rewritten_sha
              print(
                  f"Rewrote and updated lightweight tag '{tag_name}': {commit_sha.decode()[:8]} -> {rewritten_sha.decode()[:8]}"
              )
              rewritten_tags.append(tag_name)

          else:
              print(
                  f"Warning: Tag '{tag_name}' points to non-commit/non-tag object, skipping"
              )
              skipped_tags.append(tag_name)

      print(f"✓ Updated {len(updated_tags)} tags")
      if rewritten_tags:
          print(
              f"✓ Rewrote and updated {len(rewritten_tags)} tags (commits not in branch)"
          )
      if skipped_tags:
          print(
              f" Skipped {len(skipped_tags)} tags (unchanged or not in rewritten history)"
          )

      return updated_tags + rewritten_tags
  def main() -> None:
      """Command-line entry point: rewrite a branch and optionally its tags.

      Parses the arguments, validates the flags and branch names against the
      repository in the current directory, reports how many trees and commits
      look problematic, runs ``rewrite_history`` (and ``update_tags`` when
      requested) and prints a summary with suggested next steps. Exits with
      status 1 on any validation or rewrite failure.
      """
      parser = argparse.ArgumentParser(
          description="Fix dulwich history by removing .git directories and updating old timestamps.",
          epilog="This will create a new branch <target-branch> with the rewritten history from <source-branch>",
      )
      parser.add_argument("source_branch", help="Source branch to rewrite from")
      parser.add_argument(
          "target_branch", help="Target branch to create with rewritten history"
      )
      parser.add_argument(
          "--update-tags",
          action="store_true",
          help="Update existing tags to point to rewritten commits",
      )
      parser.add_argument(
          "--rewrite-tag-commits",
          action="store_true",
          help="Also rewrite commits that tags point to, even if they aren't in the main branch history",
      )
      args = parser.parse_args()

      source_branch = args.source_branch
      target_branch = args.target_branch
      update_tags_flag = args.update_tags
      rewrite_tag_commits_flag = args.rewrite_tag_commits

      # Validate flags
      if rewrite_tag_commits_flag and not update_tags_flag:
          print("Error: --rewrite-tag-commits requires --update-tags")
          sys.exit(1)

      print("=== Dulwich History Fix Script ===")
      print("This script will:")
      print("1. Remove .git directories from tree objects")
      print("2. Fix any commits with dates before 1990")
      print(
          f"3. Create new branch '{target_branch}' from '{source_branch}' with fixed history"
      )
      if update_tags_flag:
          print("4. Update existing tags to point to rewritten commits")
          if rewrite_tag_commits_flag:
              print(
                  " - Including rewriting commits that tags point to outside the branch"
              )
      print("")
      print(f"Source branch: {source_branch}")
      print(f"Target branch: {target_branch}")
      if update_tags_flag:
          print("Update tags: Yes")
          if rewrite_tag_commits_flag:
              print("Rewrite tag commits: Yes")
      print("")

      # Open the repository
      try:
          repo = Repo(".")
      except Exception as e:
          print(f"Error: Could not open repository: {e}")
          sys.exit(1)

      # Check if source branch exists
      source_ref = Ref(f"refs/heads/{source_branch}".encode())
      if source_ref not in repo.refs:
          print(f"Error: Source branch '{source_branch}' does not exist")
          sys.exit(1)

      # Check if target branch already exists
      target_ref = Ref(f"refs/heads/{target_branch}".encode())
      if target_ref in repo.refs:
          print(f"Error: Target branch '{target_branch}' already exists")
          print("Please delete it first or choose a different name")
          sys.exit(1)

      # Identify problematic trees and commits in a single pass over the
      # object store, so every object is loaded only once.
      # NOTE(review): 315532800 is 1980-01-01 00:00:00 UTC although the
      # messages say "before 1990" - confirm which cutoff is intended.
      date_cutoff = 315532800
      print("")
      print("=== Identifying problematic trees ===")
      bad_trees = []
      bad_dates = []
      for sha in repo.object_store:
          obj = repo[sha]
          if isinstance(obj, Tree):
              if any(name in BANNED_NAMES for name, _mode, _item_sha in obj.items()):
                  bad_trees.append(sha)
          elif isinstance(obj, Commit):
              if obj.commit_time < date_cutoff or obj.author_time < date_cutoff:
                  bad_dates.append(sha)
      print(f"Found {len(bad_trees)} trees with .git directories")

      print("")
      print("=== Identifying problematic commits ===")
      print(f"Found {len(bad_dates)} commits with dates before 1990")

      # Rewrite history
      print("")
      rewrite_result = rewrite_history(repo, source_branch, target_branch)
      if not rewrite_result:
          sys.exit(1)

      # Update tags if requested
      if update_tags_flag:
          update_tags(repo, rewrite_result, rewrite_tag_commits_flag)

      print("")
      print("=== Complete ===")
      print(
          f"Successfully created branch '{target_branch}' with fixed history from '{source_branch}'"
      )
      print("")
      print("Summary of changes:")
      print("- Removed .git directories from tree objects")
      print("- Fixed commit timestamps that were before 1990")
      print(f"- Created clean history in branch '{target_branch}'")
      if update_tags_flag:
          if rewrite_tag_commits_flag:
              print(
                  "- Updated tags to point to rewritten commits (including commits outside branch)"
              )
          else:
              print("- Updated tags to point to rewritten commits")
      print("")
      print("IMPORTANT NEXT STEPS:")
      print(f"1. Review the changes: git log --oneline {target_branch}")
      print(
          f"2. Compare commit count: git rev-list --count {source_branch} vs git rev-list --count {target_branch}"
      )
      if update_tags_flag:
          print("3. Review updated tags: git tag -l")
      # The step number depends on whether the tag review step was printed.
      print(
          "3. If satisfied, you can:"
          if not update_tags_flag
          else "4. If satisfied, you can:"
      )
      print(f" - Push the new branch: git push origin {target_branch}")
      if update_tags_flag:
          print(" - Force push updated tags: git push origin --tags --force")
      print(" - Set it as default branch on GitHub/GitLab")
      print(f" - Update local checkout: git checkout {target_branch}")
      print("")
      print(f"The original branch '{source_branch}' remains unchanged.")
      if update_tags_flag:
          print(
              "WARNING: Tags have been updated. You may need to force push them to remote."
          )
          if rewrite_tag_commits_flag:
              print(
                  "WARNING: Tag commits outside the branch were also rewritten. Review carefully."
              )
  # Run the rewrite only when executed as a script, not when imported.
  if __name__ == "__main__":
      main()