2
0

patch.py 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762
  1. # patch.py -- For dealing with packed-style patches.
  2. # Copyright (C) 2009-2013 Jelmer Vernooij <jelmer@jelmer.uk>
  3. #
  4. # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
  5. # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
  6. # General Public License as published by the Free Software Foundation; version 2.0
  7. # or (at your option) any later version. You can redistribute it and/or
  8. # modify it under the terms of either of these two licenses.
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. # You should have received a copy of the licenses; if not, see
  17. # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  18. # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  19. # License, Version 2.0.
  20. #
  21. """Classes for dealing with git am-style patches.
  22. These patches are basically unified diffs with some extra metadata tacked
  23. on.
  24. """
  25. import email.parser
  26. import time
  27. from collections.abc import Generator, Sequence
  28. from difflib import SequenceMatcher
  29. from typing import (
  30. IO,
  31. TYPE_CHECKING,
  32. BinaryIO,
  33. Optional,
  34. TextIO,
  35. Union,
  36. )
  37. if TYPE_CHECKING:
  38. import email.message
  39. from .object_store import BaseObjectStore
  40. from .objects import S_ISGITLINK, Blob, Commit
# Number of leading bytes that is_binary() scans for a NUL byte.
FIRST_FEW_BYTES = 8000
# Diff algorithm used by unified_diff_with_algorithm() when the caller
# does not request one explicitly.
DEFAULT_DIFF_ALGORITHM = "myers"
  43. class DiffAlgorithmNotAvailable(Exception):
  44. """Raised when a requested diff algorithm is not available."""
  45. def __init__(self, algorithm: str, install_hint: str = "") -> None:
  46. """Initialize exception.
  47. Args:
  48. algorithm: Name of the unavailable algorithm
  49. install_hint: Optional installation hint
  50. """
  51. self.algorithm = algorithm
  52. self.install_hint = install_hint
  53. if install_hint:
  54. super().__init__(
  55. f"Diff algorithm '{algorithm}' requested but not available. {install_hint}"
  56. )
  57. else:
  58. super().__init__(
  59. f"Diff algorithm '{algorithm}' requested but not available."
  60. )
  61. def write_commit_patch(
  62. f: IO[bytes],
  63. commit: "Commit",
  64. contents: Union[str, bytes],
  65. progress: tuple[int, int],
  66. version: Optional[str] = None,
  67. encoding: Optional[str] = None,
  68. ) -> None:
  69. """Write a individual file patch.
  70. Args:
  71. f: File-like object to write to
  72. commit: Commit object
  73. contents: Contents of the patch
  74. progress: tuple with current patch number and total.
  75. version: Version string to include in patch header
  76. encoding: Encoding to use for the patch
  77. Returns:
  78. tuple with filename and contents
  79. """
  80. encoding = encoding or getattr(f, "encoding", "ascii")
  81. if encoding is None:
  82. encoding = "ascii"
  83. if isinstance(contents, str):
  84. contents = contents.encode(encoding)
  85. (num, total) = progress
  86. f.write(
  87. b"From "
  88. + commit.id
  89. + b" "
  90. + time.ctime(commit.commit_time).encode(encoding)
  91. + b"\n"
  92. )
  93. f.write(b"From: " + commit.author + b"\n")
  94. f.write(
  95. b"Date: " + time.strftime("%a, %d %b %Y %H:%M:%S %Z").encode(encoding) + b"\n"
  96. )
  97. f.write(
  98. (f"Subject: [PATCH {num}/{total}] ").encode(encoding) + commit.message + b"\n"
  99. )
  100. f.write(b"\n")
  101. f.write(b"---\n")
  102. try:
  103. import subprocess
  104. p = subprocess.Popen(
  105. ["diffstat"], stdout=subprocess.PIPE, stdin=subprocess.PIPE
  106. )
  107. except (ImportError, OSError):
  108. pass # diffstat not available?
  109. else:
  110. (diffstat, _) = p.communicate(contents)
  111. f.write(diffstat)
  112. f.write(b"\n")
  113. f.write(contents)
  114. f.write(b"-- \n")
  115. if version is None:
  116. from dulwich import __version__ as dulwich_version
  117. f.write(b"Dulwich %d.%d.%d\n" % dulwich_version)
  118. else:
  119. if encoding is None:
  120. encoding = "ascii"
  121. f.write(version.encode(encoding) + b"\n")
  122. def get_summary(commit: "Commit") -> str:
  123. """Determine the summary line for use in a filename.
  124. Args:
  125. commit: Commit
  126. Returns: Summary string
  127. """
  128. decoded = commit.message.decode(errors="replace")
  129. lines = decoded.splitlines()
  130. return lines[0].replace(" ", "-") if lines else ""
  131. # Unified Diff
  132. def _format_range_unified(start: int, stop: int) -> str:
  133. """Convert range to the "ed" format."""
  134. # Per the diff spec at http://www.unix.org/single_unix_specification/
  135. beginning = start + 1 # lines start numbering with one
  136. length = stop - start
  137. if length == 1:
  138. return f"{beginning}"
  139. if not length:
  140. beginning -= 1 # empty ranges begin at line just before the range
  141. return f"{beginning},{length}"
  142. def unified_diff(
  143. a: Sequence[bytes],
  144. b: Sequence[bytes],
  145. fromfile: bytes = b"",
  146. tofile: bytes = b"",
  147. fromfiledate: str = "",
  148. tofiledate: str = "",
  149. n: int = 3,
  150. lineterm: str = "\n",
  151. tree_encoding: str = "utf-8",
  152. output_encoding: str = "utf-8",
  153. ) -> Generator[bytes, None, None]:
  154. """difflib.unified_diff that can detect "No newline at end of file" as original "git diff" does.
  155. Based on the same function in Python2.7 difflib.py
  156. """
  157. started = False
  158. for group in SequenceMatcher(a=a, b=b).get_grouped_opcodes(n):
  159. if not started:
  160. started = True
  161. fromdate = f"\t{fromfiledate}" if fromfiledate else ""
  162. todate = f"\t{tofiledate}" if tofiledate else ""
  163. yield f"--- {fromfile.decode(tree_encoding)}{fromdate}{lineterm}".encode(
  164. output_encoding
  165. )
  166. yield f"+++ {tofile.decode(tree_encoding)}{todate}{lineterm}".encode(
  167. output_encoding
  168. )
  169. first, last = group[0], group[-1]
  170. file1_range = _format_range_unified(first[1], last[2])
  171. file2_range = _format_range_unified(first[3], last[4])
  172. yield f"@@ -{file1_range} +{file2_range} @@{lineterm}".encode(output_encoding)
  173. for tag, i1, i2, j1, j2 in group:
  174. if tag == "equal":
  175. for line in a[i1:i2]:
  176. yield b" " + line
  177. continue
  178. if tag in ("replace", "delete"):
  179. for line in a[i1:i2]:
  180. if not line[-1:] == b"\n":
  181. line += b"\n\\ No newline at end of file\n"
  182. yield b"-" + line
  183. if tag in ("replace", "insert"):
  184. for line in b[j1:j2]:
  185. if not line[-1:] == b"\n":
  186. line += b"\n\\ No newline at end of file\n"
  187. yield b"+" + line
  188. def _get_sequence_matcher(
  189. algorithm: str, a: Sequence[bytes], b: Sequence[bytes]
  190. ) -> SequenceMatcher[bytes]:
  191. """Get appropriate sequence matcher for the given algorithm.
  192. Args:
  193. algorithm: Diff algorithm ("myers" or "patience")
  194. a: First sequence
  195. b: Second sequence
  196. Returns:
  197. Configured sequence matcher instance
  198. Raises:
  199. DiffAlgorithmNotAvailable: If patience requested but not available
  200. """
  201. if algorithm == "patience":
  202. try:
  203. from patiencediff import PatienceSequenceMatcher
  204. return PatienceSequenceMatcher(None, a, b) # type: ignore[no-any-return,unused-ignore]
  205. except ImportError:
  206. raise DiffAlgorithmNotAvailable(
  207. "patience", "Install with: pip install 'dulwich[patiencediff]'"
  208. )
  209. else:
  210. return SequenceMatcher(a=a, b=b)
  211. def unified_diff_with_algorithm(
  212. a: Sequence[bytes],
  213. b: Sequence[bytes],
  214. fromfile: bytes = b"",
  215. tofile: bytes = b"",
  216. fromfiledate: str = "",
  217. tofiledate: str = "",
  218. n: int = 3,
  219. lineterm: str = "\n",
  220. tree_encoding: str = "utf-8",
  221. output_encoding: str = "utf-8",
  222. algorithm: Optional[str] = None,
  223. ) -> Generator[bytes, None, None]:
  224. """Generate unified diff with specified algorithm.
  225. Args:
  226. a: First sequence of lines
  227. b: Second sequence of lines
  228. fromfile: Name of first file
  229. tofile: Name of second file
  230. fromfiledate: Date of first file
  231. tofiledate: Date of second file
  232. n: Number of context lines
  233. lineterm: Line terminator
  234. tree_encoding: Encoding for tree paths
  235. output_encoding: Encoding for output
  236. algorithm: Diff algorithm to use ("myers" or "patience")
  237. Returns:
  238. Generator yielding diff lines
  239. Raises:
  240. DiffAlgorithmNotAvailable: If patience algorithm requested but patiencediff not available
  241. """
  242. if algorithm is None:
  243. algorithm = DEFAULT_DIFF_ALGORITHM
  244. matcher = _get_sequence_matcher(algorithm, a, b)
  245. started = False
  246. for group in matcher.get_grouped_opcodes(n):
  247. if not started:
  248. started = True
  249. fromdate = f"\t{fromfiledate}" if fromfiledate else ""
  250. todate = f"\t{tofiledate}" if tofiledate else ""
  251. yield f"--- {fromfile.decode(tree_encoding)}{fromdate}{lineterm}".encode(
  252. output_encoding
  253. )
  254. yield f"+++ {tofile.decode(tree_encoding)}{todate}{lineterm}".encode(
  255. output_encoding
  256. )
  257. first, last = group[0], group[-1]
  258. file1_range = _format_range_unified(first[1], last[2])
  259. file2_range = _format_range_unified(first[3], last[4])
  260. yield f"@@ -{file1_range} +{file2_range} @@{lineterm}".encode(output_encoding)
  261. for tag, i1, i2, j1, j2 in group:
  262. if tag == "equal":
  263. for line in a[i1:i2]:
  264. yield b" " + line
  265. continue
  266. if tag in ("replace", "delete"):
  267. for line in a[i1:i2]:
  268. if not line[-1:] == b"\n":
  269. line += b"\n\\ No newline at end of file\n"
  270. yield b"-" + line
  271. if tag in ("replace", "insert"):
  272. for line in b[j1:j2]:
  273. if not line[-1:] == b"\n":
  274. line += b"\n\\ No newline at end of file\n"
  275. yield b"+" + line
  276. def is_binary(content: bytes) -> bool:
  277. """See if the first few bytes contain any null characters.
  278. Args:
  279. content: Bytestring to check for binary content
  280. """
  281. return b"\0" in content[:FIRST_FEW_BYTES]
  282. def shortid(hexsha: Optional[bytes]) -> bytes:
  283. """Get short object ID.
  284. Args:
  285. hexsha: Full hex SHA or None
  286. Returns:
  287. 7-character short ID
  288. """
  289. if hexsha is None:
  290. return b"0" * 7
  291. else:
  292. return hexsha[:7]
  293. def patch_filename(p: Optional[bytes], root: bytes) -> bytes:
  294. """Generate patch filename.
  295. Args:
  296. p: Path or None
  297. root: Root directory
  298. Returns:
  299. Full patch filename
  300. """
  301. if p is None:
  302. return b"/dev/null"
  303. else:
  304. return root + b"/" + p
  305. def write_object_diff(
  306. f: IO[bytes],
  307. store: "BaseObjectStore",
  308. old_file: tuple[Optional[bytes], Optional[int], Optional[bytes]],
  309. new_file: tuple[Optional[bytes], Optional[int], Optional[bytes]],
  310. diff_binary: bool = False,
  311. diff_algorithm: Optional[str] = None,
  312. ) -> None:
  313. """Write the diff for an object.
  314. Args:
  315. f: File-like object to write to
  316. store: Store to retrieve objects from, if necessary
  317. old_file: (path, mode, hexsha) tuple
  318. new_file: (path, mode, hexsha) tuple
  319. diff_binary: Whether to diff files even if they
  320. are considered binary files by is_binary().
  321. diff_algorithm: Algorithm to use for diffing ("myers" or "patience")
  322. Note: the tuple elements should be None for nonexistent files
  323. """
  324. (old_path, old_mode, old_id) = old_file
  325. (new_path, new_mode, new_id) = new_file
  326. patched_old_path = patch_filename(old_path, b"a")
  327. patched_new_path = patch_filename(new_path, b"b")
  328. def content(mode: Optional[int], hexsha: Optional[bytes]) -> Blob:
  329. """Get blob content for a file.
  330. Args:
  331. mode: File mode
  332. hexsha: Object SHA
  333. Returns:
  334. Blob object
  335. """
  336. if hexsha is None:
  337. return Blob.from_string(b"")
  338. elif mode is not None and S_ISGITLINK(mode):
  339. return Blob.from_string(b"Subproject commit " + hexsha + b"\n")
  340. else:
  341. obj = store[hexsha]
  342. if isinstance(obj, Blob):
  343. return obj
  344. else:
  345. # Fallback for non-blob objects
  346. return Blob.from_string(obj.as_raw_string())
  347. def lines(content: "Blob") -> list[bytes]:
  348. """Split blob content into lines.
  349. Args:
  350. content: Blob content
  351. Returns:
  352. List of lines
  353. """
  354. if not content:
  355. return []
  356. else:
  357. return content.splitlines()
  358. f.writelines(
  359. gen_diff_header((old_path, new_path), (old_mode, new_mode), (old_id, new_id))
  360. )
  361. old_content = content(old_mode, old_id)
  362. new_content = content(new_mode, new_id)
  363. if not diff_binary and (is_binary(old_content.data) or is_binary(new_content.data)):
  364. binary_diff = (
  365. b"Binary files "
  366. + patched_old_path
  367. + b" and "
  368. + patched_new_path
  369. + b" differ\n"
  370. )
  371. f.write(binary_diff)
  372. else:
  373. f.writelines(
  374. unified_diff_with_algorithm(
  375. lines(old_content),
  376. lines(new_content),
  377. patched_old_path,
  378. patched_new_path,
  379. algorithm=diff_algorithm,
  380. )
  381. )
  382. # TODO(jelmer): Support writing unicode, rather than bytes.
  383. def gen_diff_header(
  384. paths: tuple[Optional[bytes], Optional[bytes]],
  385. modes: tuple[Optional[int], Optional[int]],
  386. shas: tuple[Optional[bytes], Optional[bytes]],
  387. ) -> Generator[bytes, None, None]:
  388. """Write a blob diff header.
  389. Args:
  390. paths: Tuple with old and new path
  391. modes: Tuple with old and new modes
  392. shas: Tuple with old and new shas
  393. """
  394. (old_path, new_path) = paths
  395. (old_mode, new_mode) = modes
  396. (old_sha, new_sha) = shas
  397. if old_path is None and new_path is not None:
  398. old_path = new_path
  399. if new_path is None and old_path is not None:
  400. new_path = old_path
  401. old_path = patch_filename(old_path, b"a")
  402. new_path = patch_filename(new_path, b"b")
  403. yield b"diff --git " + old_path + b" " + new_path + b"\n"
  404. if old_mode != new_mode:
  405. if new_mode is not None:
  406. if old_mode is not None:
  407. yield (f"old file mode {old_mode:o}\n").encode("ascii")
  408. yield (f"new file mode {new_mode:o}\n").encode("ascii")
  409. else:
  410. yield (f"deleted file mode {old_mode:o}\n").encode("ascii")
  411. yield b"index " + shortid(old_sha) + b".." + shortid(new_sha)
  412. if new_mode is not None and old_mode is not None:
  413. yield (f" {new_mode:o}").encode("ascii")
  414. yield b"\n"
  415. # TODO(jelmer): Support writing unicode, rather than bytes.
  416. def write_blob_diff(
  417. f: IO[bytes],
  418. old_file: tuple[Optional[bytes], Optional[int], Optional["Blob"]],
  419. new_file: tuple[Optional[bytes], Optional[int], Optional["Blob"]],
  420. diff_algorithm: Optional[str] = None,
  421. ) -> None:
  422. """Write blob diff.
  423. Args:
  424. f: File-like object to write to
  425. old_file: (path, mode, hexsha) tuple (None if nonexisting)
  426. new_file: (path, mode, hexsha) tuple (None if nonexisting)
  427. diff_algorithm: Algorithm to use for diffing ("myers" or "patience")
  428. Note: The use of write_object_diff is recommended over this function.
  429. """
  430. (old_path, old_mode, old_blob) = old_file
  431. (new_path, new_mode, new_blob) = new_file
  432. patched_old_path = patch_filename(old_path, b"a")
  433. patched_new_path = patch_filename(new_path, b"b")
  434. def lines(blob: Optional["Blob"]) -> list[bytes]:
  435. """Split blob content into lines.
  436. Args:
  437. blob: Blob object or None
  438. Returns:
  439. List of lines
  440. """
  441. if blob is not None:
  442. return blob.splitlines()
  443. else:
  444. return []
  445. f.writelines(
  446. gen_diff_header(
  447. (old_path, new_path),
  448. (old_mode, new_mode),
  449. (getattr(old_blob, "id", None), getattr(new_blob, "id", None)),
  450. )
  451. )
  452. old_contents = lines(old_blob)
  453. new_contents = lines(new_blob)
  454. f.writelines(
  455. unified_diff_with_algorithm(
  456. old_contents,
  457. new_contents,
  458. patched_old_path,
  459. patched_new_path,
  460. algorithm=diff_algorithm,
  461. )
  462. )
  463. def write_tree_diff(
  464. f: IO[bytes],
  465. store: "BaseObjectStore",
  466. old_tree: Optional[bytes],
  467. new_tree: Optional[bytes],
  468. diff_binary: bool = False,
  469. diff_algorithm: Optional[str] = None,
  470. ) -> None:
  471. """Write tree diff.
  472. Args:
  473. f: File-like object to write to.
  474. store: Object store to read from
  475. old_tree: Old tree id
  476. new_tree: New tree id
  477. diff_binary: Whether to diff files even if they
  478. are considered binary files by is_binary().
  479. diff_algorithm: Algorithm to use for diffing ("myers" or "patience")
  480. """
  481. changes = store.tree_changes(old_tree, new_tree)
  482. for (oldpath, newpath), (oldmode, newmode), (oldsha, newsha) in changes:
  483. write_object_diff(
  484. f,
  485. store,
  486. (oldpath, oldmode, oldsha),
  487. (newpath, newmode, newsha),
  488. diff_binary=diff_binary,
  489. diff_algorithm=diff_algorithm,
  490. )
  491. def git_am_patch_split(
  492. f: Union[TextIO, BinaryIO], encoding: Optional[str] = None
  493. ) -> tuple["Commit", bytes, Optional[bytes]]:
  494. """Parse a git-am-style patch and split it up into bits.
  495. Args:
  496. f: File-like object to parse
  497. encoding: Encoding to use when creating Git objects
  498. Returns: Tuple with commit object, diff contents and git version
  499. """
  500. encoding = encoding or getattr(f, "encoding", "ascii")
  501. encoding = encoding or "ascii"
  502. contents = f.read()
  503. if isinstance(contents, bytes):
  504. bparser = email.parser.BytesParser()
  505. msg = bparser.parsebytes(contents)
  506. else:
  507. uparser = email.parser.Parser()
  508. msg = uparser.parsestr(contents)
  509. return parse_patch_message(msg, encoding)
  510. def parse_patch_message(
  511. msg: "email.message.Message", encoding: Optional[str] = None
  512. ) -> tuple["Commit", bytes, Optional[bytes]]:
  513. """Extract a Commit object and patch from an e-mail message.
  514. Args:
  515. msg: An email message (email.message.Message)
  516. encoding: Encoding to use to encode Git commits
  517. Returns: Tuple with commit object, diff contents and git version
  518. """
  519. c = Commit()
  520. if encoding is None:
  521. encoding = "ascii"
  522. c.author = msg["from"].encode(encoding)
  523. c.committer = msg["from"].encode(encoding)
  524. try:
  525. patch_tag_start = msg["subject"].index("[PATCH")
  526. except ValueError:
  527. subject = msg["subject"]
  528. else:
  529. close = msg["subject"].index("] ", patch_tag_start)
  530. subject = msg["subject"][close + 2 :]
  531. c.message = (subject.replace("\n", "") + "\n").encode(encoding)
  532. first = True
  533. body = msg.get_payload(decode=True)
  534. if isinstance(body, str):
  535. body = body.encode(encoding)
  536. if isinstance(body, bytes):
  537. lines = body.splitlines(True)
  538. else:
  539. # Handle other types by converting to string first
  540. lines = str(body).encode(encoding).splitlines(True)
  541. line_iter = iter(lines)
  542. for line in line_iter:
  543. if line == b"---\n":
  544. break
  545. if first:
  546. if line.startswith(b"From: "):
  547. c.author = line[len(b"From: ") :].rstrip()
  548. else:
  549. c.message += b"\n" + line
  550. first = False
  551. else:
  552. c.message += line
  553. diff = b""
  554. for line in line_iter:
  555. if line == b"-- \n":
  556. break
  557. diff += line
  558. try:
  559. version = next(line_iter).rstrip(b"\n")
  560. except StopIteration:
  561. version = None
  562. return c, diff, version
  563. def patch_id(diff_data: bytes) -> bytes:
  564. """Compute patch ID for a diff.
  565. The patch ID is computed by normalizing the diff and computing a SHA1 hash.
  566. This follows git's patch-id algorithm which:
  567. 1. Removes whitespace from lines starting with + or -
  568. 2. Replaces line numbers in @@ headers with a canonical form
  569. 3. Computes SHA1 of the result
  570. Args:
  571. diff_data: Raw diff data as bytes
  572. Returns:
  573. SHA1 hash of normalized diff (40-byte hex string)
  574. TODO: This implementation uses a simple line-by-line approach. For better
  575. compatibility with git's patch-id, consider using proper patch parsing that:
  576. - Handles edge cases in diff format (binary diffs, mode changes, etc.)
  577. - Properly parses unified diff format according to the spec
  578. - Matches git's exact normalization algorithm byte-for-byte
  579. See git's patch-id.c for reference implementation.
  580. """
  581. import hashlib
  582. import re
  583. # Normalize the diff for patch-id computation
  584. normalized_lines = []
  585. for line in diff_data.split(b"\n"):
  586. # Skip diff headers (diff --git, index, ---, +++)
  587. if line.startswith(
  588. (
  589. b"diff --git ",
  590. b"index ",
  591. b"--- ",
  592. b"+++ ",
  593. b"new file mode ",
  594. b"old file mode ",
  595. b"deleted file mode ",
  596. b"new mode ",
  597. b"old mode ",
  598. b"similarity index ",
  599. b"dissimilarity index ",
  600. b"rename from ",
  601. b"rename to ",
  602. b"copy from ",
  603. b"copy to ",
  604. )
  605. ):
  606. continue
  607. # Normalize @@ headers to a canonical form
  608. if line.startswith(b"@@"):
  609. # Replace line numbers with canonical form
  610. match = re.match(rb"^@@\s+-\d+(?:,\d+)?\s+\+\d+(?:,\d+)?\s+@@", line)
  611. if match:
  612. # Use canonical hunk header without line numbers
  613. normalized_lines.append(b"@@")
  614. continue
  615. # For +/- lines, strip all whitespace
  616. if line.startswith((b"+", b"-")):
  617. # Keep the +/- prefix but remove all whitespace from the rest
  618. if len(line) > 1:
  619. # Remove all whitespace from the content
  620. content = line[1:].replace(b" ", b"").replace(b"\t", b"")
  621. normalized_lines.append(line[:1] + content)
  622. else:
  623. # Just +/- alone
  624. normalized_lines.append(line[:1])
  625. continue
  626. # Keep context lines and other content as-is
  627. if line.startswith(b" ") or line == b"":
  628. normalized_lines.append(line)
  629. # Join normalized lines and compute SHA1
  630. normalized = b"\n".join(normalized_lines)
  631. return hashlib.sha1(normalized).hexdigest().encode("ascii")
  632. def commit_patch_id(store: "BaseObjectStore", commit_id: bytes) -> bytes:
  633. """Compute patch ID for a commit.
  634. Args:
  635. store: Object store to read objects from
  636. commit_id: Commit ID (40-byte hex string)
  637. Returns:
  638. Patch ID (40-byte hex string)
  639. """
  640. from io import BytesIO
  641. commit = store[commit_id]
  642. assert isinstance(commit, Commit)
  643. # Get the parent tree (or empty tree for root commit)
  644. if commit.parents:
  645. parent = store[commit.parents[0]]
  646. assert isinstance(parent, Commit)
  647. parent_tree = parent.tree
  648. else:
  649. # Root commit - compare against empty tree
  650. parent_tree = None
  651. # Generate diff
  652. diff_output = BytesIO()
  653. write_tree_diff(diff_output, store, parent_tree, commit.tree)
  654. return patch_id(diff_output.getvalue())