fix-history.py 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280
  1. #!/usr/bin/env python3
  2. """Fix dulwich history by removing .git directories and updating old timestamps.
  3. Usage: ./fix-history.py <source-branch> <target-branch>
  4. Example: ./fix-history.py master main
  5. """
  6. import sys
  7. import time
  8. from dulwich.objects import Commit, Tree
  9. from dulwich.repo import Repo
  10. def fix_tree(repo, tree_id, seen_trees=None):
  11. """Recursively fix a tree by removing .git entries."""
  12. if seen_trees is None:
  13. seen_trees = set()
  14. if tree_id in seen_trees:
  15. return tree_id
  16. seen_trees.add(tree_id)
  17. try:
  18. tree = repo[tree_id]
  19. except KeyError:
  20. return tree_id
  21. if not isinstance(tree, Tree):
  22. return tree_id
  23. # Check if this tree contains .git entries
  24. modified = False
  25. new_items = []
  26. for item in tree.items():
  27. name, mode, sha = item
  28. # Skip .git entries
  29. if name == b".git":
  30. modified = True
  31. continue
  32. # Recursively fix subtrees
  33. if mode == 0o040000: # Directory mode
  34. new_sha = fix_tree(repo, sha, seen_trees)
  35. if new_sha != sha:
  36. modified = True
  37. sha = new_sha
  38. new_items.append((name, mode, sha))
  39. if not modified:
  40. return tree_id
  41. print(f"Removing .git entry from tree {tree_id.decode()}")
  42. # Create new tree without .git entries
  43. new_tree = Tree()
  44. for name, mode, sha in new_items:
  45. new_tree.add(name, mode, sha)
  46. repo.object_store.add_object(new_tree)
  47. return new_tree.id
  48. def fix_commit_dates(commit):
  49. """Fix commit dates if they're before 1990."""
  50. modified = False
  51. # Unix timestamp for 1990-01-01
  52. min_timestamp = 315532800
  53. max_timestamp = int(time.time())
  54. # Fix author date
  55. if commit.author_time < min_timestamp:
  56. new_time = commit.author_time * 10
  57. if min_timestamp <= new_time <= max_timestamp:
  58. print(f"Fixed author date: {commit.author_time} -> {new_time}")
  59. commit.author_time = new_time
  60. modified = True
  61. # Fix committer date
  62. if commit.commit_time < min_timestamp:
  63. new_time = commit.commit_time * 10
  64. if min_timestamp <= new_time <= max_timestamp:
  65. print(f"Fixed committer date: {commit.commit_time} -> {new_time}")
  66. commit.commit_time = new_time
  67. modified = True
  68. return modified
  69. def rewrite_history(repo, source_branch, target_branch):
  70. """Rewrite history to fix issues."""
  71. print(f"=== Rewriting history from {source_branch} to {target_branch} ===")
  72. # Get the head commit of the source branch
  73. try:
  74. source_ref = f"refs/heads/{source_branch}".encode()
  75. head_sha = repo.refs[source_ref]
  76. except KeyError:
  77. print(f"Error: Branch '{source_branch}' not found")
  78. return False
  79. # Map old commit SHAs to new ones
  80. commit_map = {}
  81. tree_map = {}
  82. # Get all commits in topological order
  83. walker = repo.get_walker([head_sha])
  84. commits = list(walker)
  85. commits.reverse() # Process from oldest to newest
  86. print(f"Processing {len(commits)} commits...")
  87. for i, commit_entry in enumerate(commits):
  88. old_commit = commit_entry.commit
  89. if i % 100 == 0:
  90. print(f"Processed {i}/{len(commits)} commits...")
  91. # Fix the tree
  92. old_tree_id = old_commit.tree
  93. if old_tree_id not in tree_map:
  94. tree_map[old_tree_id] = fix_tree(repo, old_tree_id)
  95. new_tree_id = tree_map[old_tree_id]
  96. # Create new commit
  97. new_commit = Commit()
  98. new_commit.tree = new_tree_id
  99. new_commit.author = old_commit.author
  100. new_commit.committer = old_commit.committer
  101. new_commit.author_time = old_commit.author_time
  102. new_commit.commit_time = old_commit.commit_time
  103. new_commit.author_timezone = old_commit.author_timezone
  104. new_commit.commit_timezone = old_commit.commit_timezone
  105. new_commit.message = old_commit.message
  106. new_commit.encoding = old_commit.encoding
  107. # note: Drop extra fields
  108. # Fix dates
  109. date_modified = fix_commit_dates(new_commit)
  110. # Map parent commits
  111. new_parents = []
  112. for parent_sha in old_commit.parents:
  113. if parent_sha in commit_map:
  114. new_parents.append(commit_map[parent_sha])
  115. else:
  116. new_parents.append(parent_sha)
  117. new_commit.parents = new_parents
  118. # Check if commit actually changed
  119. if (
  120. new_tree_id == old_tree_id
  121. and not date_modified
  122. and new_parents == list(old_commit.parents)
  123. ):
  124. # No changes needed, reuse old commit
  125. commit_map[old_commit.id] = old_commit.id
  126. else:
  127. # Add new commit to object store
  128. repo.object_store.add_object(new_commit)
  129. commit_map[old_commit.id] = new_commit.id
  130. # Update the target branch
  131. new_head = commit_map[head_sha]
  132. target_ref = f"refs/heads/{target_branch}".encode()
  133. repo.refs[target_ref] = new_head
  134. print(
  135. f"✓ Created branch '{target_branch}' with {len([k for k, v in commit_map.items() if k != v])} modified commits"
  136. )
  137. return True
  138. def main():
  139. if len(sys.argv) != 3:
  140. print(f"Usage: {sys.argv[0]} <source-branch> <target-branch>")
  141. print(f"Example: {sys.argv[0]} master main")
  142. print("")
  143. print(
  144. "This will create a new branch <target-branch> with the rewritten history from <source-branch>"
  145. )
  146. sys.exit(1)
  147. source_branch = sys.argv[1]
  148. target_branch = sys.argv[2]
  149. print("=== Dulwich History Fix Script ===")
  150. print("This script will:")
  151. print("1. Remove .git directories from tree objects")
  152. print("2. Fix any commits with dates before 1990")
  153. print(
  154. f"3. Create new branch '{target_branch}' from '{source_branch}' with fixed history"
  155. )
  156. print("")
  157. print(f"Source branch: {source_branch}")
  158. print(f"Target branch: {target_branch}")
  159. print("")
  160. # Open the repository
  161. try:
  162. repo = Repo(".")
  163. except Exception as e:
  164. print(f"Error: Could not open repository: {e}")
  165. sys.exit(1)
  166. # Check if source branch exists
  167. source_ref = f"refs/heads/{source_branch}".encode()
  168. if source_ref not in repo.refs:
  169. print(f"Error: Source branch '{source_branch}' does not exist")
  170. sys.exit(1)
  171. # Check if target branch already exists
  172. target_ref = f"refs/heads/{target_branch}".encode()
  173. if target_ref in repo.refs:
  174. print(f"Error: Target branch '{target_branch}' already exists")
  175. print("Please delete it first or choose a different name")
  176. sys.exit(1)
  177. # Identify problematic trees
  178. print("")
  179. print("=== Identifying problematic trees ===")
  180. bad_trees = []
  181. for sha in repo.object_store:
  182. obj = repo[sha]
  183. if isinstance(obj, Tree):
  184. for name, mode, item_sha in obj.items():
  185. if name == b".git":
  186. bad_trees.append(sha)
  187. break
  188. print(f"Found {len(bad_trees)} trees with .git directories")
  189. # Check for commits with bad dates
  190. print("")
  191. print("=== Identifying problematic commits ===")
  192. bad_dates = []
  193. for sha in repo.object_store:
  194. obj = repo[sha]
  195. if isinstance(obj, Commit):
  196. if obj.commit_time < 315532800 or obj.author_time < 315532800:
  197. bad_dates.append(sha)
  198. print(f"Found {len(bad_dates)} commits with dates before 1990")
  199. # Rewrite history
  200. print("")
  201. if not rewrite_history(repo, source_branch, target_branch):
  202. sys.exit(1)
  203. print("")
  204. print("=== Complete ===")
  205. print(
  206. f"Successfully created branch '{target_branch}' with fixed history from '{source_branch}'"
  207. )
  208. print("")
  209. print("Summary of changes:")
  210. print("- Removed .git directories from tree objects")
  211. print("- Fixed commit timestamps that were before 1990")
  212. print(f"- Created clean history in branch '{target_branch}'")
  213. print("")
  214. print("IMPORTANT NEXT STEPS:")
  215. print(f"1. Review the changes: git log --oneline {target_branch}")
  216. print(
  217. f"2. Compare commit count: git rev-list --count {source_branch} vs git rev-list --count {target_branch}"
  218. )
  219. print("3. If satisfied, you can:")
  220. print(f" - Push the new branch: git push origin {target_branch}")
  221. print(" - Set it as default branch on GitHub/GitLab")
  222. print(f" - Update local checkout: git checkout {target_branch}")
  223. print("")
  224. print(f"The original branch '{source_branch}' remains unchanged.")
  225. if __name__ == "__main__":
  226. main()