patch.py 31 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005
  1. # patch.py -- For dealing with packed-style patches.
  2. # Copyright (C) 2009-2013 Jelmer Vernooij <jelmer@jelmer.uk>
  3. #
  4. # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
  5. # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
  6. # General Public License as published by the Free Software Foundation; version 2.0
  7. # or (at your option) any later version. You can redistribute it and/or
  8. # modify it under the terms of either of these two licenses.
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. # You should have received a copy of the licenses; if not, see
  17. # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  18. # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  19. # License, Version 2.0.
  20. #
  21. """Classes for dealing with git am-style patches.
  22. These patches are basically unified diffs with some extra metadata tacked
  23. on.
  24. """
  25. import email.message
  26. import email.parser
  27. import email.utils
  28. import re
  29. import time
  30. from collections.abc import Generator, Sequence
  31. from dataclasses import dataclass
  32. from difflib import SequenceMatcher
  33. from typing import (
  34. IO,
  35. TYPE_CHECKING,
  36. BinaryIO,
  37. Optional,
  38. TextIO,
  39. )
  40. if TYPE_CHECKING:
  41. from .object_store import BaseObjectStore
  42. from .objects import S_ISGITLINK, Blob, Commit
# Number of leading bytes scanned by is_binary() when sniffing for NUL bytes.
FIRST_FEW_BYTES = 8000
# Diff algorithm used when callers do not specify one explicitly.
DEFAULT_DIFF_ALGORITHM = "myers"
  45. class DiffAlgorithmNotAvailable(Exception):
  46. """Raised when a requested diff algorithm is not available."""
  47. def __init__(self, algorithm: str, install_hint: str = "") -> None:
  48. """Initialize exception.
  49. Args:
  50. algorithm: Name of the unavailable algorithm
  51. install_hint: Optional installation hint
  52. """
  53. self.algorithm = algorithm
  54. self.install_hint = install_hint
  55. if install_hint:
  56. super().__init__(
  57. f"Diff algorithm '{algorithm}' requested but not available. {install_hint}"
  58. )
  59. else:
  60. super().__init__(
  61. f"Diff algorithm '{algorithm}' requested but not available."
  62. )
  63. def write_commit_patch(
  64. f: IO[bytes],
  65. commit: "Commit",
  66. contents: str | bytes,
  67. progress: tuple[int, int],
  68. version: str | None = None,
  69. encoding: str | None = None,
  70. ) -> None:
  71. """Write a individual file patch.
  72. Args:
  73. f: File-like object to write to
  74. commit: Commit object
  75. contents: Contents of the patch
  76. progress: tuple with current patch number and total.
  77. version: Version string to include in patch header
  78. encoding: Encoding to use for the patch
  79. Returns:
  80. tuple with filename and contents
  81. """
  82. encoding = encoding or getattr(f, "encoding", "ascii")
  83. if encoding is None:
  84. encoding = "ascii"
  85. if isinstance(contents, str):
  86. contents = contents.encode(encoding)
  87. (num, total) = progress
  88. f.write(
  89. b"From "
  90. + commit.id
  91. + b" "
  92. + time.ctime(commit.commit_time).encode(encoding)
  93. + b"\n"
  94. )
  95. f.write(b"From: " + commit.author + b"\n")
  96. f.write(
  97. b"Date: " + time.strftime("%a, %d %b %Y %H:%M:%S %Z").encode(encoding) + b"\n"
  98. )
  99. f.write(
  100. (f"Subject: [PATCH {num}/{total}] ").encode(encoding) + commit.message + b"\n"
  101. )
  102. f.write(b"\n")
  103. f.write(b"---\n")
  104. try:
  105. import subprocess
  106. p = subprocess.Popen(
  107. ["diffstat"], stdout=subprocess.PIPE, stdin=subprocess.PIPE
  108. )
  109. except (ImportError, OSError):
  110. pass # diffstat not available?
  111. else:
  112. (diffstat, _) = p.communicate(contents)
  113. f.write(diffstat)
  114. f.write(b"\n")
  115. f.write(contents)
  116. f.write(b"-- \n")
  117. if version is None:
  118. from dulwich import __version__ as dulwich_version
  119. f.write(b"Dulwich %d.%d.%d\n" % dulwich_version)
  120. else:
  121. if encoding is None:
  122. encoding = "ascii"
  123. f.write(version.encode(encoding) + b"\n")
  124. def get_summary(commit: "Commit") -> str:
  125. """Determine the summary line for use in a filename.
  126. Args:
  127. commit: Commit
  128. Returns: Summary string
  129. """
  130. decoded = commit.message.decode(errors="replace")
  131. lines = decoded.splitlines()
  132. return lines[0].replace(" ", "-") if lines else ""
  133. # Unified Diff
  134. def _format_range_unified(start: int, stop: int) -> str:
  135. """Convert range to the "ed" format."""
  136. # Per the diff spec at http://www.unix.org/single_unix_specification/
  137. beginning = start + 1 # lines start numbering with one
  138. length = stop - start
  139. if length == 1:
  140. return f"{beginning}"
  141. if not length:
  142. beginning -= 1 # empty ranges begin at line just before the range
  143. return f"{beginning},{length}"
  144. def unified_diff(
  145. a: Sequence[bytes],
  146. b: Sequence[bytes],
  147. fromfile: bytes = b"",
  148. tofile: bytes = b"",
  149. fromfiledate: str = "",
  150. tofiledate: str = "",
  151. n: int = 3,
  152. lineterm: str = "\n",
  153. tree_encoding: str = "utf-8",
  154. output_encoding: str = "utf-8",
  155. ) -> Generator[bytes, None, None]:
  156. """difflib.unified_diff that can detect "No newline at end of file" as original "git diff" does.
  157. Based on the same function in Python2.7 difflib.py
  158. """
  159. started = False
  160. for group in SequenceMatcher(a=a, b=b).get_grouped_opcodes(n):
  161. if not started:
  162. started = True
  163. fromdate = f"\t{fromfiledate}" if fromfiledate else ""
  164. todate = f"\t{tofiledate}" if tofiledate else ""
  165. yield f"--- {fromfile.decode(tree_encoding)}{fromdate}{lineterm}".encode(
  166. output_encoding
  167. )
  168. yield f"+++ {tofile.decode(tree_encoding)}{todate}{lineterm}".encode(
  169. output_encoding
  170. )
  171. first, last = group[0], group[-1]
  172. file1_range = _format_range_unified(first[1], last[2])
  173. file2_range = _format_range_unified(first[3], last[4])
  174. yield f"@@ -{file1_range} +{file2_range} @@{lineterm}".encode(output_encoding)
  175. for tag, i1, i2, j1, j2 in group:
  176. if tag == "equal":
  177. for line in a[i1:i2]:
  178. yield b" " + line
  179. continue
  180. if tag in ("replace", "delete"):
  181. for line in a[i1:i2]:
  182. if not line[-1:] == b"\n":
  183. line += b"\n\\ No newline at end of file\n"
  184. yield b"-" + line
  185. if tag in ("replace", "insert"):
  186. for line in b[j1:j2]:
  187. if not line[-1:] == b"\n":
  188. line += b"\n\\ No newline at end of file\n"
  189. yield b"+" + line
  190. def _get_sequence_matcher(
  191. algorithm: str, a: Sequence[bytes], b: Sequence[bytes]
  192. ) -> SequenceMatcher[bytes]:
  193. """Get appropriate sequence matcher for the given algorithm.
  194. Args:
  195. algorithm: Diff algorithm ("myers" or "patience")
  196. a: First sequence
  197. b: Second sequence
  198. Returns:
  199. Configured sequence matcher instance
  200. Raises:
  201. DiffAlgorithmNotAvailable: If patience requested but not available
  202. """
  203. if algorithm == "patience":
  204. try:
  205. from patiencediff import PatienceSequenceMatcher
  206. return PatienceSequenceMatcher(None, a, b) # type: ignore[no-any-return,unused-ignore]
  207. except ImportError:
  208. raise DiffAlgorithmNotAvailable(
  209. "patience", "Install with: pip install 'dulwich[patiencediff]'"
  210. )
  211. else:
  212. return SequenceMatcher(a=a, b=b)
  213. def unified_diff_with_algorithm(
  214. a: Sequence[bytes],
  215. b: Sequence[bytes],
  216. fromfile: bytes = b"",
  217. tofile: bytes = b"",
  218. fromfiledate: str = "",
  219. tofiledate: str = "",
  220. n: int = 3,
  221. lineterm: str = "\n",
  222. tree_encoding: str = "utf-8",
  223. output_encoding: str = "utf-8",
  224. algorithm: str | None = None,
  225. ) -> Generator[bytes, None, None]:
  226. """Generate unified diff with specified algorithm.
  227. Args:
  228. a: First sequence of lines
  229. b: Second sequence of lines
  230. fromfile: Name of first file
  231. tofile: Name of second file
  232. fromfiledate: Date of first file
  233. tofiledate: Date of second file
  234. n: Number of context lines
  235. lineterm: Line terminator
  236. tree_encoding: Encoding for tree paths
  237. output_encoding: Encoding for output
  238. algorithm: Diff algorithm to use ("myers" or "patience")
  239. Returns:
  240. Generator yielding diff lines
  241. Raises:
  242. DiffAlgorithmNotAvailable: If patience algorithm requested but patiencediff not available
  243. """
  244. if algorithm is None:
  245. algorithm = DEFAULT_DIFF_ALGORITHM
  246. matcher = _get_sequence_matcher(algorithm, a, b)
  247. started = False
  248. for group in matcher.get_grouped_opcodes(n):
  249. if not started:
  250. started = True
  251. fromdate = f"\t{fromfiledate}" if fromfiledate else ""
  252. todate = f"\t{tofiledate}" if tofiledate else ""
  253. yield f"--- {fromfile.decode(tree_encoding)}{fromdate}{lineterm}".encode(
  254. output_encoding
  255. )
  256. yield f"+++ {tofile.decode(tree_encoding)}{todate}{lineterm}".encode(
  257. output_encoding
  258. )
  259. first, last = group[0], group[-1]
  260. file1_range = _format_range_unified(first[1], last[2])
  261. file2_range = _format_range_unified(first[3], last[4])
  262. yield f"@@ -{file1_range} +{file2_range} @@{lineterm}".encode(output_encoding)
  263. for tag, i1, i2, j1, j2 in group:
  264. if tag == "equal":
  265. for line in a[i1:i2]:
  266. yield b" " + line
  267. continue
  268. if tag in ("replace", "delete"):
  269. for line in a[i1:i2]:
  270. if not line[-1:] == b"\n":
  271. line += b"\n\\ No newline at end of file\n"
  272. yield b"-" + line
  273. if tag in ("replace", "insert"):
  274. for line in b[j1:j2]:
  275. if not line[-1:] == b"\n":
  276. line += b"\n\\ No newline at end of file\n"
  277. yield b"+" + line
  278. def is_binary(content: bytes) -> bool:
  279. """See if the first few bytes contain any null characters.
  280. Args:
  281. content: Bytestring to check for binary content
  282. """
  283. return b"\0" in content[:FIRST_FEW_BYTES]
  284. def shortid(hexsha: bytes | None) -> bytes:
  285. """Get short object ID.
  286. Args:
  287. hexsha: Full hex SHA or None
  288. Returns:
  289. 7-character short ID
  290. """
  291. if hexsha is None:
  292. return b"0" * 7
  293. else:
  294. return hexsha[:7]
  295. def patch_filename(p: bytes | None, root: bytes) -> bytes:
  296. """Generate patch filename.
  297. Args:
  298. p: Path or None
  299. root: Root directory
  300. Returns:
  301. Full patch filename
  302. """
  303. if p is None:
  304. return b"/dev/null"
  305. else:
  306. return root + b"/" + p
  307. def write_object_diff(
  308. f: IO[bytes],
  309. store: "BaseObjectStore",
  310. old_file: tuple[bytes | None, int | None, bytes | None],
  311. new_file: tuple[bytes | None, int | None, bytes | None],
  312. diff_binary: bool = False,
  313. diff_algorithm: str | None = None,
  314. ) -> None:
  315. """Write the diff for an object.
  316. Args:
  317. f: File-like object to write to
  318. store: Store to retrieve objects from, if necessary
  319. old_file: (path, mode, hexsha) tuple
  320. new_file: (path, mode, hexsha) tuple
  321. diff_binary: Whether to diff files even if they
  322. are considered binary files by is_binary().
  323. diff_algorithm: Algorithm to use for diffing ("myers" or "patience")
  324. Note: the tuple elements should be None for nonexistent files
  325. """
  326. (old_path, old_mode, old_id) = old_file
  327. (new_path, new_mode, new_id) = new_file
  328. patched_old_path = patch_filename(old_path, b"a")
  329. patched_new_path = patch_filename(new_path, b"b")
  330. def content(mode: int | None, hexsha: bytes | None) -> Blob:
  331. """Get blob content for a file.
  332. Args:
  333. mode: File mode
  334. hexsha: Object SHA
  335. Returns:
  336. Blob object
  337. """
  338. if hexsha is None:
  339. return Blob.from_string(b"")
  340. elif mode is not None and S_ISGITLINK(mode):
  341. return Blob.from_string(b"Subproject commit " + hexsha + b"\n")
  342. else:
  343. obj = store[hexsha]
  344. if isinstance(obj, Blob):
  345. return obj
  346. else:
  347. # Fallback for non-blob objects
  348. return Blob.from_string(obj.as_raw_string())
  349. def lines(content: "Blob") -> list[bytes]:
  350. """Split blob content into lines.
  351. Args:
  352. content: Blob content
  353. Returns:
  354. List of lines
  355. """
  356. if not content:
  357. return []
  358. else:
  359. return content.splitlines()
  360. f.writelines(
  361. gen_diff_header((old_path, new_path), (old_mode, new_mode), (old_id, new_id))
  362. )
  363. old_content = content(old_mode, old_id)
  364. new_content = content(new_mode, new_id)
  365. if not diff_binary and (is_binary(old_content.data) or is_binary(new_content.data)):
  366. binary_diff = (
  367. b"Binary files "
  368. + patched_old_path
  369. + b" and "
  370. + patched_new_path
  371. + b" differ\n"
  372. )
  373. f.write(binary_diff)
  374. else:
  375. f.writelines(
  376. unified_diff_with_algorithm(
  377. lines(old_content),
  378. lines(new_content),
  379. patched_old_path,
  380. patched_new_path,
  381. algorithm=diff_algorithm,
  382. )
  383. )
  384. # TODO(jelmer): Support writing unicode, rather than bytes.
  385. def gen_diff_header(
  386. paths: tuple[bytes | None, bytes | None],
  387. modes: tuple[int | None, int | None],
  388. shas: tuple[bytes | None, bytes | None],
  389. ) -> Generator[bytes, None, None]:
  390. """Write a blob diff header.
  391. Args:
  392. paths: Tuple with old and new path
  393. modes: Tuple with old and new modes
  394. shas: Tuple with old and new shas
  395. """
  396. (old_path, new_path) = paths
  397. (old_mode, new_mode) = modes
  398. (old_sha, new_sha) = shas
  399. if old_path is None and new_path is not None:
  400. old_path = new_path
  401. if new_path is None and old_path is not None:
  402. new_path = old_path
  403. old_path = patch_filename(old_path, b"a")
  404. new_path = patch_filename(new_path, b"b")
  405. yield b"diff --git " + old_path + b" " + new_path + b"\n"
  406. if old_mode != new_mode:
  407. if new_mode is not None:
  408. if old_mode is not None:
  409. yield (f"old file mode {old_mode:o}\n").encode("ascii")
  410. yield (f"new file mode {new_mode:o}\n").encode("ascii")
  411. else:
  412. yield (f"deleted file mode {old_mode:o}\n").encode("ascii")
  413. yield b"index " + shortid(old_sha) + b".." + shortid(new_sha)
  414. if new_mode is not None and old_mode is not None:
  415. yield (f" {new_mode:o}").encode("ascii")
  416. yield b"\n"
  417. # TODO(jelmer): Support writing unicode, rather than bytes.
  418. def write_blob_diff(
  419. f: IO[bytes],
  420. old_file: tuple[bytes | None, int | None, Optional["Blob"]],
  421. new_file: tuple[bytes | None, int | None, Optional["Blob"]],
  422. diff_algorithm: str | None = None,
  423. ) -> None:
  424. """Write blob diff.
  425. Args:
  426. f: File-like object to write to
  427. old_file: (path, mode, hexsha) tuple (None if nonexisting)
  428. new_file: (path, mode, hexsha) tuple (None if nonexisting)
  429. diff_algorithm: Algorithm to use for diffing ("myers" or "patience")
  430. Note: The use of write_object_diff is recommended over this function.
  431. """
  432. (old_path, old_mode, old_blob) = old_file
  433. (new_path, new_mode, new_blob) = new_file
  434. patched_old_path = patch_filename(old_path, b"a")
  435. patched_new_path = patch_filename(new_path, b"b")
  436. def lines(blob: Optional["Blob"]) -> list[bytes]:
  437. """Split blob content into lines.
  438. Args:
  439. blob: Blob object or None
  440. Returns:
  441. List of lines
  442. """
  443. if blob is not None:
  444. return blob.splitlines()
  445. else:
  446. return []
  447. f.writelines(
  448. gen_diff_header(
  449. (old_path, new_path),
  450. (old_mode, new_mode),
  451. (getattr(old_blob, "id", None), getattr(new_blob, "id", None)),
  452. )
  453. )
  454. old_contents = lines(old_blob)
  455. new_contents = lines(new_blob)
  456. f.writelines(
  457. unified_diff_with_algorithm(
  458. old_contents,
  459. new_contents,
  460. patched_old_path,
  461. patched_new_path,
  462. algorithm=diff_algorithm,
  463. )
  464. )
  465. def write_tree_diff(
  466. f: IO[bytes],
  467. store: "BaseObjectStore",
  468. old_tree: bytes | None,
  469. new_tree: bytes | None,
  470. diff_binary: bool = False,
  471. diff_algorithm: str | None = None,
  472. ) -> None:
  473. """Write tree diff.
  474. Args:
  475. f: File-like object to write to.
  476. store: Object store to read from
  477. old_tree: Old tree id
  478. new_tree: New tree id
  479. diff_binary: Whether to diff files even if they
  480. are considered binary files by is_binary().
  481. diff_algorithm: Algorithm to use for diffing ("myers" or "patience")
  482. """
  483. changes = store.tree_changes(old_tree, new_tree)
  484. for (oldpath, newpath), (oldmode, newmode), (oldsha, newsha) in changes:
  485. write_object_diff(
  486. f,
  487. store,
  488. (oldpath, oldmode, oldsha),
  489. (newpath, newmode, newsha),
  490. diff_binary=diff_binary,
  491. diff_algorithm=diff_algorithm,
  492. )
  493. def git_am_patch_split(
  494. f: TextIO | BinaryIO, encoding: str | None = None
  495. ) -> tuple["Commit", bytes, bytes | None]:
  496. """Parse a git-am-style patch and split it up into bits.
  497. Args:
  498. f: File-like object to parse
  499. encoding: Encoding to use when creating Git objects
  500. Returns: Tuple with commit object, diff contents and git version
  501. """
  502. encoding = encoding or getattr(f, "encoding", "ascii")
  503. encoding = encoding or "ascii"
  504. contents = f.read()
  505. if isinstance(contents, bytes):
  506. bparser = email.parser.BytesParser()
  507. msg = bparser.parsebytes(contents)
  508. else:
  509. uparser = email.parser.Parser()
  510. msg = uparser.parsestr(contents)
  511. return parse_patch_message(msg, encoding)
  512. def parse_patch_message(
  513. msg: email.message.Message, encoding: str | None = None
  514. ) -> tuple["Commit", bytes, bytes | None]:
  515. """Extract a Commit object and patch from an e-mail message.
  516. Args:
  517. msg: An email message (email.message.Message)
  518. encoding: Encoding to use to encode Git commits
  519. Returns: Tuple with commit object, diff contents and git version
  520. """
  521. c = Commit()
  522. if encoding is None:
  523. encoding = "ascii"
  524. c.author = msg["from"].encode(encoding)
  525. c.committer = msg["from"].encode(encoding)
  526. try:
  527. patch_tag_start = msg["subject"].index("[PATCH")
  528. except ValueError:
  529. subject = msg["subject"]
  530. else:
  531. close = msg["subject"].index("] ", patch_tag_start)
  532. subject = msg["subject"][close + 2 :]
  533. c.message = (subject.replace("\n", "") + "\n").encode(encoding)
  534. first = True
  535. body = msg.get_payload(decode=True)
  536. if isinstance(body, str):
  537. body = body.encode(encoding)
  538. if isinstance(body, bytes):
  539. lines = body.splitlines(True)
  540. else:
  541. # Handle other types by converting to string first
  542. lines = str(body).encode(encoding).splitlines(True)
  543. line_iter = iter(lines)
  544. for line in line_iter:
  545. if line == b"---\n":
  546. break
  547. if first:
  548. if line.startswith(b"From: "):
  549. c.author = line[len(b"From: ") :].rstrip()
  550. else:
  551. c.message += b"\n" + line
  552. first = False
  553. else:
  554. c.message += line
  555. diff = b""
  556. for line in line_iter:
  557. if line == b"-- \n":
  558. break
  559. diff += line
  560. try:
  561. version = next(line_iter).rstrip(b"\n")
  562. except StopIteration:
  563. version = None
  564. return c, diff, version
  565. def patch_id(diff_data: bytes) -> bytes:
  566. """Compute patch ID for a diff.
  567. The patch ID is computed by normalizing the diff and computing a SHA1 hash.
  568. This follows git's patch-id algorithm which:
  569. 1. Removes whitespace from lines starting with + or -
  570. 2. Replaces line numbers in @@ headers with a canonical form
  571. 3. Computes SHA1 of the result
  572. Args:
  573. diff_data: Raw diff data as bytes
  574. Returns:
  575. SHA1 hash of normalized diff (40-byte hex string)
  576. TODO: This implementation uses a simple line-by-line approach. For better
  577. compatibility with git's patch-id, consider using proper patch parsing that:
  578. - Handles edge cases in diff format (binary diffs, mode changes, etc.)
  579. - Properly parses unified diff format according to the spec
  580. - Matches git's exact normalization algorithm byte-for-byte
  581. See git's patch-id.c for reference implementation.
  582. """
  583. import hashlib
  584. import re
  585. # Normalize the diff for patch-id computation
  586. normalized_lines = []
  587. for line in diff_data.split(b"\n"):
  588. # Skip diff headers (diff --git, index, ---, +++)
  589. if line.startswith(
  590. (
  591. b"diff --git ",
  592. b"index ",
  593. b"--- ",
  594. b"+++ ",
  595. b"new file mode ",
  596. b"old file mode ",
  597. b"deleted file mode ",
  598. b"new mode ",
  599. b"old mode ",
  600. b"similarity index ",
  601. b"dissimilarity index ",
  602. b"rename from ",
  603. b"rename to ",
  604. b"copy from ",
  605. b"copy to ",
  606. )
  607. ):
  608. continue
  609. # Normalize @@ headers to a canonical form
  610. if line.startswith(b"@@"):
  611. # Replace line numbers with canonical form
  612. match = re.match(rb"^@@\s+-\d+(?:,\d+)?\s+\+\d+(?:,\d+)?\s+@@", line)
  613. if match:
  614. # Use canonical hunk header without line numbers
  615. normalized_lines.append(b"@@")
  616. continue
  617. # For +/- lines, strip all whitespace
  618. if line.startswith((b"+", b"-")):
  619. # Keep the +/- prefix but remove all whitespace from the rest
  620. if len(line) > 1:
  621. # Remove all whitespace from the content
  622. content = line[1:].replace(b" ", b"").replace(b"\t", b"")
  623. normalized_lines.append(line[:1] + content)
  624. else:
  625. # Just +/- alone
  626. normalized_lines.append(line[:1])
  627. continue
  628. # Keep context lines and other content as-is
  629. if line.startswith(b" ") or line == b"":
  630. normalized_lines.append(line)
  631. # Join normalized lines and compute SHA1
  632. normalized = b"\n".join(normalized_lines)
  633. return hashlib.sha1(normalized).hexdigest().encode("ascii")
  634. def commit_patch_id(store: "BaseObjectStore", commit_id: bytes) -> bytes:
  635. """Compute patch ID for a commit.
  636. Args:
  637. store: Object store to read objects from
  638. commit_id: Commit ID (40-byte hex string)
  639. Returns:
  640. Patch ID (40-byte hex string)
  641. """
  642. from io import BytesIO
  643. commit = store[commit_id]
  644. assert isinstance(commit, Commit)
  645. # Get the parent tree (or empty tree for root commit)
  646. if commit.parents:
  647. parent = store[commit.parents[0]]
  648. assert isinstance(parent, Commit)
  649. parent_tree = parent.tree
  650. else:
  651. # Root commit - compare against empty tree
  652. parent_tree = None
  653. # Generate diff
  654. diff_output = BytesIO()
  655. write_tree_diff(diff_output, store, parent_tree, commit.tree)
  656. return patch_id(diff_output.getvalue())
@dataclass
class MailinfoResult:
    """Result of mailinfo parsing.

    Mirrors the fields emitted by ``git mailinfo``: author identity and
    date from the mail headers, the munged subject, and the body split
    into commit message and patch content.

    Attributes:
      author_name: Author's name
      author_email: Author's email address
      author_date: Author's date (if present in the email)
      subject: Processed subject line
      message: Commit message body
      patch: Patch content
      message_id: Message-ID header (if -m/--message-id was used)
    """

    author_name: str
    author_email: str
    author_date: str | None
    subject: str
    message: str
    patch: str
    message_id: str | None = None
  676. def _munge_subject(subject: str, keep_subject: bool, keep_non_patch: bool) -> str:
  677. """Munge email subject line for commit message.
  678. Args:
  679. subject: Original subject line
  680. keep_subject: If True, keep subject intact (-k option)
  681. keep_non_patch: If True, only strip [PATCH] (-b option)
  682. Returns:
  683. Processed subject line
  684. """
  685. if keep_subject:
  686. return subject
  687. result = subject
  688. # First remove Re: prefixes (they can appear before brackets)
  689. while True:
  690. new_result = re.sub(r"^\s*(?:re|RE|Re):\s*", "", result, flags=re.IGNORECASE)
  691. if new_result == result:
  692. break
  693. result = new_result
  694. # Remove bracketed strings
  695. if keep_non_patch:
  696. # Only remove brackets containing "PATCH"
  697. # Match each bracket individually anywhere in the string
  698. while True:
  699. # Remove PATCH bracket, but be careful with whitespace
  700. new_result = re.sub(
  701. r"\[[^\]]*?PATCH[^\]]*?\](\s+)?", r"\1", result, flags=re.IGNORECASE
  702. )
  703. if new_result == result:
  704. break
  705. result = new_result
  706. else:
  707. # Remove all bracketed strings
  708. while True:
  709. new_result = re.sub(r"^\s*\[.*?\]\s*", "", result)
  710. if new_result == result:
  711. break
  712. result = new_result
  713. # Remove leading/trailing whitespace
  714. result = result.strip()
  715. # Normalize multiple whitespace to single space
  716. result = re.sub(r"\s+", " ", result)
  717. return result
  718. def _find_scissors_line(lines: list[bytes]) -> int | None:
  719. """Find the scissors line in message body.
  720. Args:
  721. lines: List of lines in the message body
  722. Returns:
  723. Index of scissors line, or None if not found
  724. """
  725. scissors_pattern = re.compile(
  726. rb"^(?:>?\s*-+\s*)?(?:8<|>8)?\s*-+\s*$|^(?:>?\s*-+\s*)(?:cut here|scissors)(?:\s*-+)?$",
  727. re.IGNORECASE,
  728. )
  729. for i, line in enumerate(lines):
  730. if scissors_pattern.match(line.strip()):
  731. return i
  732. return None
def mailinfo(
    msg: email.message.Message | BinaryIO | TextIO,
    keep_subject: bool = False,
    keep_non_patch: bool = False,
    encoding: str | None = None,
    scissors: bool = False,
    message_id: bool = False,
) -> MailinfoResult:
    """Extract patch information from an email message.

    This function parses an email message and extracts commit metadata
    (author, email, subject) and separates the commit message from the
    patch content, similar to git mailinfo.

    Args:
      msg: Email message (email.message.Message object) or file handle to read from
      keep_subject: If True, keep subject intact without munging (-k)
      keep_non_patch: If True, only strip [PATCH] from brackets (-b)
      encoding: Character encoding to use (default: detect from message)
      scissors: If True, remove everything before scissors line
      message_id: If True, include Message-ID in commit message (-m)
    Returns:
      MailinfoResult with parsed information
    Raises:
      ValueError: If message is malformed or missing required fields
    """
    # Parse message if given a file handle; a bytes stream goes through
    # BytesParser, a text stream through Parser.
    parsed_msg: email.message.Message
    if not isinstance(msg, email.message.Message):
        if hasattr(msg, "read"):
            content = msg.read()
            if isinstance(content, bytes):
                bparser = email.parser.BytesParser()
                parsed_msg = bparser.parsebytes(content)
            else:
                sparser = email.parser.Parser()
                parsed_msg = sparser.parsestr(content)
        else:
            raise ValueError("msg must be an email.message.Message or file-like object")
    else:
        parsed_msg = msg
    # Detect encoding from message if not specified.
    if encoding is None:
        encoding = parsed_msg.get_content_charset() or "utf-8"
    # Extract author information; "From" is mandatory.
    from_header = parsed_msg.get("From", "")
    if not from_header:
        raise ValueError("Email message missing 'From' header")
    # Parse "Name <email>" format.
    author_name, author_email = email.utils.parseaddr(from_header)
    if not author_email:
        raise ValueError(
            f"Could not parse email address from 'From' header: {from_header}"
        )
    # Extract date (kept as the raw header string, not parsed).
    date_header = parsed_msg.get("Date")
    author_date = date_header if date_header else None
    # Extract and process subject.
    subject = parsed_msg.get("Subject", "")
    if not subject:
        subject = "(no subject)"
    # Convert Header object to string if needed.
    subject = str(subject)
    # Remove newlines from subject before munging.
    subject = subject.replace("\n", " ").replace("\r", " ")
    subject = _munge_subject(subject, keep_subject, keep_non_patch)
    # Extract Message-ID only when requested (-m).
    msg_id = None
    if message_id:
        msg_id = parsed_msg.get("Message-ID")
    # Get message body; get_payload(decode=True) returns None for
    # multipart messages, which we coerce below.
    body = parsed_msg.get_payload(decode=True)
    if body is None:
        body = b""
    elif isinstance(body, str):
        body = body.encode(encoding)
    elif not isinstance(body, bytes):
        # Handle multipart or other types by stringifying first.
        body = str(body).encode(encoding)
    # Split into lines, keeping line endings so the patch round-trips.
    lines = body.splitlines(keepends=True)
    # Handle scissors: everything up to and including the scissors line
    # is discarded (mimics git mailinfo --scissors).
    scissors_idx = None
    if scissors:
        scissors_idx = _find_scissors_line(lines)
        if scissors_idx is not None:
            lines = lines[scissors_idx + 1 :]
    # Separate commit message from patch: the first "---" line switches
    # to patch mode, and a "-- " signature marker terminates the patch.
    message_lines: list[bytes] = []
    patch_lines: list[bytes] = []
    in_patch = False
    for line in lines:
        if not in_patch and line == b"---\n":
            in_patch = True
            patch_lines.append(line)
        elif in_patch:
            # Stop at signature marker "-- ".
            if line == b"-- \n":
                break
            patch_lines.append(line)
        else:
            message_lines.append(line)
    # Build commit message, tolerating undecodable bytes.
    commit_message = b"".join(message_lines).decode(encoding, errors="replace")
    # Clean up commit message.
    commit_message = commit_message.strip()
    # Append Message-ID if requested and present.
    if message_id and msg_id:
        if commit_message:
            commit_message += "\n\n"
        commit_message += f"Message-ID: {msg_id}"
    # Build patch content.
    patch_content = b"".join(patch_lines).decode(encoding, errors="replace")
    return MailinfoResult(
        author_name=author_name,
        author_email=author_email,
        author_date=author_date,
        subject=subject,
        message=commit_message,
        patch=patch_content,
        message_id=msg_id,
    )