# patch.py -- For dealing with packed-style patches.
# Copyright (C) 2009-2013 Jelmer Vernooij <jelmer@jelmer.uk>
#
# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
# General Public License as published by the Free Software Foundation; version 2.0
# or (at your option) any later version. You can redistribute it and/or
# modify it under the terms of either of these two licenses.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# You should have received a copy of the licenses; if not, see
# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
# License, Version 2.0.
#

"""Classes for dealing with git am-style patches.

These patches are basically unified diffs with some extra metadata tacked
on.
"""

__all__ = [
    "DEFAULT_DIFF_ALGORITHM",
    "FIRST_FEW_BYTES",
    "DiffAlgorithmNotAvailable",
    "MailinfoResult",
    "commit_patch_id",
    "gen_diff_header",
    "get_summary",
    "git_am_patch_split",
    "is_binary",
    "mailinfo",
    "parse_patch_message",
    "patch_filename",
    "patch_id",
    "shortid",
    "unified_diff",
    "unified_diff_with_algorithm",
    "write_blob_diff",
    "write_commit_patch",
    "write_object_diff",
    "write_tree_diff",
]

import email.message
import email.parser
import email.utils
import re
import time
from collections.abc import Generator, Sequence
from dataclasses import dataclass
from difflib import SequenceMatcher
from typing import (
    IO,
    TYPE_CHECKING,
    BinaryIO,
    TextIO,
)

if TYPE_CHECKING:
    from .object_store import BaseObjectStore

from .objects import S_ISGITLINK, Blob, Commit, ObjectID, RawObjectID

FIRST_FEW_BYTES = 8000

DEFAULT_DIFF_ALGORITHM = "myers"


class DiffAlgorithmNotAvailable(Exception):
    """Raised when a requested diff algorithm is not available."""

    def __init__(self, algorithm: str, install_hint: str = "") -> None:
        """Initialize exception.

        Args:
          algorithm: Name of the unavailable algorithm
          install_hint: Optional installation hint
        """
        self.algorithm = algorithm
        self.install_hint = install_hint
        if install_hint:
            super().__init__(
                f"Diff algorithm '{algorithm}' requested but not available. {install_hint}"
            )
        else:
            super().__init__(
                f"Diff algorithm '{algorithm}' requested but not available."
            )


def write_commit_patch(
    f: IO[bytes],
    commit: "Commit",
    contents: str | bytes,
    progress: tuple[int, int],
    version: str | None = None,
    encoding: str | None = None,
) -> None:
    """Write an individual file patch for a single commit.

    Args:
      f: File-like object to write to
      commit: Commit object
      contents: Contents of the patch
      progress: Tuple with current patch number and total.
      version: Version string to include at the end of the patch
      encoding: Encoding to use for the patch
    """
    encoding = encoding or getattr(f, "encoding", "ascii")
    if encoding is None:
        encoding = "ascii"
    if isinstance(contents, str):
        contents = contents.encode(encoding)
    (num, total) = progress
    f.write(
        b"From "
        + commit.id
        + b" "
        + time.ctime(commit.commit_time).encode(encoding)
        + b"\n"
    )
    f.write(b"From: " + commit.author + b"\n")
    f.write(
        b"Date: " + time.strftime("%a, %d %b %Y %H:%M:%S %Z").encode(encoding) + b"\n"
    )
    f.write(
        (f"Subject: [PATCH {num}/{total}] ").encode(encoding) + commit.message + b"\n"
    )
    f.write(b"\n")
    f.write(b"---\n")
    try:
        import subprocess

        p = subprocess.Popen(
            ["diffstat"], stdout=subprocess.PIPE, stdin=subprocess.PIPE
        )
    except (ImportError, OSError):
        pass  # diffstat not available?
    else:
        (diffstat, _) = p.communicate(contents)
        f.write(diffstat)
        f.write(b"\n")
    f.write(contents)
    f.write(b"-- \n")
    if version is None:
        from dulwich import __version__ as dulwich_version

        f.write(b"Dulwich %d.%d.%d\n" % dulwich_version)
    else:
        if encoding is None:
            encoding = "ascii"
        f.write(version.encode(encoding) + b"\n")
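

# Illustrative sketch, not part of the original module: render a single commit
# as am-style patch 1/1 into an in-memory buffer. Assumes the caller supplies a
# fully populated Commit and the unified diff bytes for that commit.
def _example_write_commit_patch(commit: "Commit", diff_bytes: bytes) -> bytes:
    """Return the formatted patch for ``commit`` as bytes (hypothetical helper)."""
    from io import BytesIO

    buf = BytesIO()
    # Patch 1 of 1; write_commit_patch appends the diffstat (if available),
    # the diff itself and the version trailer.
    write_commit_patch(buf, commit, diff_bytes, (1, 1))
    return buf.getvalue()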


def get_summary(commit: "Commit") -> str:
    """Determine the summary line for use in a filename.

    Args:
      commit: Commit
    Returns: Summary string
    """
    decoded = commit.message.decode(errors="replace")
    lines = decoded.splitlines()
    return lines[0].replace(" ", "-") if lines else ""


# Unified Diff


def _format_range_unified(start: int, stop: int) -> str:
    """Convert range to the "ed" format."""
    # Per the diff spec at http://www.unix.org/single_unix_specification/
    beginning = start + 1  # lines start numbering with one
    length = stop - start
    if length == 1:
        return f"{beginning}"
    if not length:
        beginning -= 1  # empty ranges begin at line just before the range
    return f"{beginning},{length}"


def unified_diff(
    a: Sequence[bytes],
    b: Sequence[bytes],
    fromfile: bytes = b"",
    tofile: bytes = b"",
    fromfiledate: str = "",
    tofiledate: str = "",
    n: int = 3,
    lineterm: str = "\n",
    tree_encoding: str = "utf-8",
    output_encoding: str = "utf-8",
) -> Generator[bytes, None, None]:
    """difflib.unified_diff that can detect "No newline at end of file" as original "git diff" does.

    Based on the same function in Python 2.7's difflib.py
    """
    started = False
    for group in SequenceMatcher(a=a, b=b).get_grouped_opcodes(n):
        if not started:
            started = True
            fromdate = f"\t{fromfiledate}" if fromfiledate else ""
            todate = f"\t{tofiledate}" if tofiledate else ""
            yield f"--- {fromfile.decode(tree_encoding)}{fromdate}{lineterm}".encode(
                output_encoding
            )
            yield f"+++ {tofile.decode(tree_encoding)}{todate}{lineterm}".encode(
                output_encoding
            )
        first, last = group[0], group[-1]
        file1_range = _format_range_unified(first[1], last[2])
        file2_range = _format_range_unified(first[3], last[4])
        yield f"@@ -{file1_range} +{file2_range} @@{lineterm}".encode(output_encoding)
        for tag, i1, i2, j1, j2 in group:
            if tag == "equal":
                for line in a[i1:i2]:
                    yield b" " + line
                continue
            if tag in ("replace", "delete"):
                for line in a[i1:i2]:
                    if not line[-1:] == b"\n":
                        line += b"\n\\ No newline at end of file\n"
                    yield b"-" + line
            if tag in ("replace", "insert"):
                for line in b[j1:j2]:
                    if not line[-1:] == b"\n":
                        line += b"\n\\ No newline at end of file\n"
                    yield b"+" + line
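

# Illustrative sketch, not part of the original module: diff two small
# in-memory line sequences. The file names and contents below are made up.
def _example_unified_diff() -> bytes:
    """Return a unified diff between two lists of byte lines (hypothetical helper)."""
    old = [b"hello\n", b"world\n"]
    new = [b"hello\n", b"there\n", b"world\n"]
    # unified_diff yields bytes chunks: ---/+++ headers, @@ hunk headers and
    # " ", "-", "+" prefixed lines.
    return b"".join(
        unified_diff(old, new, fromfile=b"a/hello.txt", tofile=b"b/hello.txt")
    )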


def _get_sequence_matcher(
    algorithm: str, a: Sequence[bytes], b: Sequence[bytes]
) -> SequenceMatcher[bytes]:
    """Get appropriate sequence matcher for the given algorithm.

    Args:
      algorithm: Diff algorithm ("myers" or "patience")
      a: First sequence
      b: Second sequence
    Returns:
      Configured sequence matcher instance
    Raises:
      DiffAlgorithmNotAvailable: If patience requested but not available
    """
    if algorithm == "patience":
        try:
            from patiencediff import PatienceSequenceMatcher

            return PatienceSequenceMatcher(None, a, b)  # type: ignore[no-any-return,unused-ignore]
        except ImportError:
            raise DiffAlgorithmNotAvailable(
                "patience", "Install with: pip install 'dulwich[patiencediff]'"
            )
    else:
        return SequenceMatcher(a=a, b=b)


def unified_diff_with_algorithm(
    a: Sequence[bytes],
    b: Sequence[bytes],
    fromfile: bytes = b"",
    tofile: bytes = b"",
    fromfiledate: str = "",
    tofiledate: str = "",
    n: int = 3,
    lineterm: str = "\n",
    tree_encoding: str = "utf-8",
    output_encoding: str = "utf-8",
    algorithm: str | None = None,
) -> Generator[bytes, None, None]:
    """Generate unified diff with specified algorithm.

    Args:
      a: First sequence of lines
      b: Second sequence of lines
      fromfile: Name of first file
      tofile: Name of second file
      fromfiledate: Date of first file
      tofiledate: Date of second file
      n: Number of context lines
      lineterm: Line terminator
      tree_encoding: Encoding for tree paths
      output_encoding: Encoding for output
      algorithm: Diff algorithm to use ("myers" or "patience")
    Returns:
      Generator yielding diff lines
    Raises:
      DiffAlgorithmNotAvailable: If patience algorithm requested but patiencediff not available
    """
    if algorithm is None:
        algorithm = DEFAULT_DIFF_ALGORITHM
    matcher = _get_sequence_matcher(algorithm, a, b)

    started = False
    for group in matcher.get_grouped_opcodes(n):
        if not started:
            started = True
            fromdate = f"\t{fromfiledate}" if fromfiledate else ""
            todate = f"\t{tofiledate}" if tofiledate else ""
            yield f"--- {fromfile.decode(tree_encoding)}{fromdate}{lineterm}".encode(
                output_encoding
            )
            yield f"+++ {tofile.decode(tree_encoding)}{todate}{lineterm}".encode(
                output_encoding
            )
        first, last = group[0], group[-1]
        file1_range = _format_range_unified(first[1], last[2])
        file2_range = _format_range_unified(first[3], last[4])
        yield f"@@ -{file1_range} +{file2_range} @@{lineterm}".encode(output_encoding)
        for tag, i1, i2, j1, j2 in group:
            if tag == "equal":
                for line in a[i1:i2]:
                    yield b" " + line
                continue
            if tag in ("replace", "delete"):
                for line in a[i1:i2]:
                    if not line[-1:] == b"\n":
                        line += b"\n\\ No newline at end of file\n"
                    yield b"-" + line
            if tag in ("replace", "insert"):
                for line in b[j1:j2]:
                    if not line[-1:] == b"\n":
                        line += b"\n\\ No newline at end of file\n"
                    yield b"+" + line
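

# Illustrative sketch, not part of the original module: prefer the patience
# algorithm but fall back to the default when the optional patiencediff
# dependency is missing. DiffAlgorithmNotAvailable is raised lazily, on the
# first iteration of the generator, so list() is kept inside the try block.
def _example_patience_diff(a: Sequence[bytes], b: Sequence[bytes]) -> bytes:
    """Diff two line sequences, using patience when available (hypothetical helper)."""
    try:
        chunks = list(
            unified_diff_with_algorithm(
                a, b, b"a/file", b"b/file", algorithm="patience"
            )
        )
    except DiffAlgorithmNotAvailable:
        chunks = list(
            unified_diff_with_algorithm(
                a, b, b"a/file", b"b/file", algorithm=DEFAULT_DIFF_ALGORITHM
            )
        )
    return b"".join(chunks)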


def is_binary(content: bytes) -> bool:
    """See if the first few bytes contain any null characters.

    Args:
      content: Bytestring to check for binary content
    """
    return b"\0" in content[:FIRST_FEW_BYTES]


def shortid(hexsha: bytes | None) -> bytes:
    """Get short object ID.

    Args:
      hexsha: Full hex SHA or None
    Returns:
      7-character short ID
    """
    if hexsha is None:
        return b"0" * 7
    else:
        return hexsha[:7]


def patch_filename(p: bytes | None, root: bytes) -> bytes:
    """Generate patch filename.

    Args:
      p: Path or None
      root: Root directory
    Returns:
      Full patch filename
    """
    if p is None:
        return b"/dev/null"
    else:
        return root + b"/" + p


def write_object_diff(
    f: IO[bytes],
    store: "BaseObjectStore",
    old_file: tuple[bytes | None, int | None, ObjectID | None],
    new_file: tuple[bytes | None, int | None, ObjectID | None],
    diff_binary: bool = False,
    diff_algorithm: str | None = None,
) -> None:
    """Write the diff for an object.

    Args:
      f: File-like object to write to
      store: Store to retrieve objects from, if necessary
      old_file: (path, mode, hexsha) tuple
      new_file: (path, mode, hexsha) tuple
      diff_binary: Whether to diff files even if they
        are considered binary files by is_binary().
      diff_algorithm: Algorithm to use for diffing ("myers" or "patience")

    Note: the tuple elements should be None for nonexistent files
    """
    (old_path, old_mode, old_id) = old_file
    (new_path, new_mode, new_id) = new_file
    patched_old_path = patch_filename(old_path, b"a")
    patched_new_path = patch_filename(new_path, b"b")

    def content(mode: int | None, hexsha: ObjectID | None) -> Blob:
        """Get blob content for a file.

        Args:
          mode: File mode
          hexsha: Object SHA
        Returns:
          Blob object
        """
        if hexsha is None:
            return Blob.from_string(b"")
        elif mode is not None and S_ISGITLINK(mode):
            return Blob.from_string(b"Subproject commit " + hexsha + b"\n")
        else:
            obj = store[hexsha]
            if isinstance(obj, Blob):
                return obj
            else:
                # Fallback for non-blob objects
                return Blob.from_string(obj.as_raw_string())

    def lines(content: "Blob") -> list[bytes]:
        """Split blob content into lines.

        Args:
          content: Blob content
        Returns:
          List of lines
        """
        if not content:
            return []
        else:
            return content.splitlines()

    f.writelines(
        gen_diff_header((old_path, new_path), (old_mode, new_mode), (old_id, new_id))
    )
    old_content = content(old_mode, old_id)
    new_content = content(new_mode, new_id)
    if not diff_binary and (is_binary(old_content.data) or is_binary(new_content.data)):
        binary_diff = (
            b"Binary files "
            + patched_old_path
            + b" and "
            + patched_new_path
            + b" differ\n"
        )
        f.write(binary_diff)
    else:
        f.writelines(
            unified_diff_with_algorithm(
                lines(old_content),
                lines(new_content),
                patched_old_path,
                patched_new_path,
                algorithm=diff_algorithm,
            )
        )
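

# Illustrative sketch, not part of the original module: diff two versions of a
# file stored as blobs in an in-memory object store. MemoryObjectStore and the
# regular-file mode 0o100644 are standard dulwich/git values; the path and
# contents are made up.
def _example_write_object_diff() -> bytes:
    """Return the diff between two blob versions of one file (hypothetical helper)."""
    from io import BytesIO

    from .object_store import MemoryObjectStore

    store = MemoryObjectStore()
    old_blob = Blob.from_string(b"old contents\n")
    new_blob = Blob.from_string(b"new contents\n")
    store.add_object(old_blob)
    store.add_object(new_blob)
    buf = BytesIO()
    write_object_diff(
        buf,
        store,
        (b"file.txt", 0o100644, old_blob.id),
        (b"file.txt", 0o100644, new_blob.id),
    )
    return buf.getvalue()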


# TODO(jelmer): Support writing unicode, rather than bytes.
def gen_diff_header(
    paths: tuple[bytes | None, bytes | None],
    modes: tuple[int | None, int | None],
    shas: tuple[bytes | None, bytes | None],
) -> Generator[bytes, None, None]:
    """Write a blob diff header.

    Args:
      paths: Tuple with old and new path
      modes: Tuple with old and new modes
      shas: Tuple with old and new shas
    """
    (old_path, new_path) = paths
    (old_mode, new_mode) = modes
    (old_sha, new_sha) = shas
    if old_path is None and new_path is not None:
        old_path = new_path
    if new_path is None and old_path is not None:
        new_path = old_path
    old_path = patch_filename(old_path, b"a")
    new_path = patch_filename(new_path, b"b")
    yield b"diff --git " + old_path + b" " + new_path + b"\n"
    if old_mode != new_mode:
        if new_mode is not None:
            if old_mode is not None:
                yield (f"old file mode {old_mode:o}\n").encode("ascii")
            yield (f"new file mode {new_mode:o}\n").encode("ascii")
        else:
            yield (f"deleted file mode {old_mode:o}\n").encode("ascii")
    yield b"index " + shortid(old_sha) + b".." + shortid(new_sha)
    if new_mode is not None and old_mode is not None:
        yield (f" {new_mode:o}").encode("ascii")
    yield b"\n"


# TODO(jelmer): Support writing unicode, rather than bytes.
def write_blob_diff(
    f: IO[bytes],
    old_file: tuple[bytes | None, int | None, "Blob | None"],
    new_file: tuple[bytes | None, int | None, "Blob | None"],
    diff_algorithm: str | None = None,
) -> None:
    """Write blob diff.

    Args:
      f: File-like object to write to
      old_file: (path, mode, blob) tuple (None if nonexisting)
      new_file: (path, mode, blob) tuple (None if nonexisting)
      diff_algorithm: Algorithm to use for diffing ("myers" or "patience")

    Note: The use of write_object_diff is recommended over this function.
    """
    (old_path, old_mode, old_blob) = old_file
    (new_path, new_mode, new_blob) = new_file
    patched_old_path = patch_filename(old_path, b"a")
    patched_new_path = patch_filename(new_path, b"b")

    def lines(blob: "Blob | None") -> list[bytes]:
        """Split blob content into lines.

        Args:
          blob: Blob object or None
        Returns:
          List of lines
        """
        if blob is not None:
            return blob.splitlines()
        else:
            return []

    f.writelines(
        gen_diff_header(
            (old_path, new_path),
            (old_mode, new_mode),
            (getattr(old_blob, "id", None), getattr(new_blob, "id", None)),
        )
    )
    old_contents = lines(old_blob)
    new_contents = lines(new_blob)
    f.writelines(
        unified_diff_with_algorithm(
            old_contents,
            new_contents,
            patched_old_path,
            patched_new_path,
            algorithm=diff_algorithm,
        )
    )
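

# Illustrative sketch, not part of the original module: diff two loose Blob
# objects directly, without an object store. The path and contents are made up.
def _example_write_blob_diff() -> bytes:
    """Return the diff between two in-memory blobs (hypothetical helper)."""
    from io import BytesIO

    buf = BytesIO()
    write_blob_diff(
        buf,
        (b"README", 0o100644, Blob.from_string(b"Hello\n")),
        (b"README", 0o100644, Blob.from_string(b"Hello, world\n")),
    )
    return buf.getvalue()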


def write_tree_diff(
    f: IO[bytes],
    store: "BaseObjectStore",
    old_tree: ObjectID | None,
    new_tree: ObjectID | None,
    diff_binary: bool = False,
    diff_algorithm: str | None = None,
) -> None:
    """Write tree diff.

    Args:
      f: File-like object to write to.
      store: Object store to read from
      old_tree: Old tree id
      new_tree: New tree id
      diff_binary: Whether to diff files even if they
        are considered binary files by is_binary().
      diff_algorithm: Algorithm to use for diffing ("myers" or "patience")
    """
    changes = store.tree_changes(old_tree, new_tree)
    for (oldpath, newpath), (oldmode, newmode), (oldsha, newsha) in changes:
        write_object_diff(
            f,
            store,
            (oldpath, oldmode, oldsha),
            (newpath, newmode, newsha),
            diff_binary=diff_binary,
            diff_algorithm=diff_algorithm,
        )
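

# Illustrative sketch, not part of the original module: write the diff between
# HEAD and its first parent in a local repository. Repo(".") and the b"HEAD"
# lookup are assumptions about the caller's environment.
def _example_write_tree_diff() -> bytes:
    """Return the diff introduced by the current HEAD commit (hypothetical helper)."""
    from io import BytesIO

    from .repo import Repo

    repo = Repo(".")
    head = repo[b"HEAD"]
    assert isinstance(head, Commit)
    parent = repo[head.parents[0]]
    assert isinstance(parent, Commit)
    buf = BytesIO()
    write_tree_diff(buf, repo.object_store, parent.tree, head.tree)
    return buf.getvalue()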


def git_am_patch_split(
    f: TextIO | BinaryIO, encoding: str | None = None
) -> tuple["Commit", bytes, bytes | None]:
    """Parse a git-am-style patch and split it up into bits.

    Args:
      f: File-like object to parse
      encoding: Encoding to use when creating Git objects
    Returns: Tuple with commit object, diff contents and git version
    """
    encoding = encoding or getattr(f, "encoding", "ascii")
    encoding = encoding or "ascii"
    contents = f.read()
    if isinstance(contents, bytes):
        bparser = email.parser.BytesParser()
        msg = bparser.parsebytes(contents)
    else:
        uparser = email.parser.Parser()
        msg = uparser.parsestr(contents)
    return parse_patch_message(msg, encoding)
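

# Illustrative sketch, not part of the original module: split a patch produced
# by "git format-patch" into commit metadata, diff and version trailer. The
# file name is made up.
def _example_split_am_patch(
    path: str = "0001-example.patch",
) -> tuple["Commit", bytes, bytes | None]:
    """Read an am-style patch from disk and split it (hypothetical helper)."""
    with open(path, "rb") as f:
        commit, diff, version = git_am_patch_split(f)
    return commit, diff, version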


def parse_patch_message(
    msg: email.message.Message, encoding: str | None = None
) -> tuple["Commit", bytes, bytes | None]:
    """Extract a Commit object and patch from an e-mail message.

    Args:
      msg: An email message (email.message.Message)
      encoding: Encoding to use to encode Git commits
    Returns: Tuple with commit object, diff contents and git version
    """
    c = Commit()
    if encoding is None:
        encoding = "ascii"
    c.author = msg["from"].encode(encoding)
    c.committer = msg["from"].encode(encoding)
    try:
        patch_tag_start = msg["subject"].index("[PATCH")
    except ValueError:
        subject = msg["subject"]
    else:
        close = msg["subject"].index("] ", patch_tag_start)
        subject = msg["subject"][close + 2 :]
    c.message = (subject.replace("\n", "") + "\n").encode(encoding)
    first = True

    body = msg.get_payload(decode=True)
    if isinstance(body, str):
        body = body.encode(encoding)
    if isinstance(body, bytes):
        lines = body.splitlines(True)
    else:
        # Handle other types by converting to string first
        lines = str(body).encode(encoding).splitlines(True)
    line_iter = iter(lines)

    for line in line_iter:
        if line == b"---\n":
            break
        if first:
            if line.startswith(b"From: "):
                c.author = line[len(b"From: ") :].rstrip()
            else:
                c.message += b"\n" + line
            first = False
        else:
            c.message += line

    diff = b""
    for line in line_iter:
        if line == b"-- \n":
            break
        diff += line

    try:
        version = next(line_iter).rstrip(b"\n")
    except StopIteration:
        version = None

    return c, diff, version


def patch_id(diff_data: bytes) -> bytes:
    """Compute patch ID for a diff.

    The patch ID is computed by normalizing the diff and computing a SHA1 hash.
    This follows git's patch-id algorithm which:
    1. Removes whitespace from lines starting with + or -
    2. Replaces line numbers in @@ headers with a canonical form
    3. Computes SHA1 of the result

    Args:
      diff_data: Raw diff data as bytes
    Returns:
      SHA1 hash of normalized diff (40-byte hex string)

    TODO: This implementation uses a simple line-by-line approach. For better
    compatibility with git's patch-id, consider using proper patch parsing that:
    - Handles edge cases in diff format (binary diffs, mode changes, etc.)
    - Properly parses unified diff format according to the spec
    - Matches git's exact normalization algorithm byte-for-byte
    See git's patch-id.c for reference implementation.
    """
    import hashlib
    import re

    # Normalize the diff for patch-id computation
    normalized_lines = []
    for line in diff_data.split(b"\n"):
        # Skip diff headers (diff --git, index, ---, +++)
        if line.startswith(
            (
                b"diff --git ",
                b"index ",
                b"--- ",
                b"+++ ",
                b"new file mode ",
                b"old file mode ",
                b"deleted file mode ",
                b"new mode ",
                b"old mode ",
                b"similarity index ",
                b"dissimilarity index ",
                b"rename from ",
                b"rename to ",
                b"copy from ",
                b"copy to ",
            )
        ):
            continue

        # Normalize @@ headers to a canonical form
        if line.startswith(b"@@"):
            # Replace line numbers with canonical form
            match = re.match(rb"^@@\s+-\d+(?:,\d+)?\s+\+\d+(?:,\d+)?\s+@@", line)
            if match:
                # Use canonical hunk header without line numbers
                normalized_lines.append(b"@@")
            continue

        # For +/- lines, strip all whitespace
        if line.startswith((b"+", b"-")):
            # Keep the +/- prefix but remove all whitespace from the rest
            if len(line) > 1:
                # Remove all whitespace from the content
                content = line[1:].replace(b" ", b"").replace(b"\t", b"")
                normalized_lines.append(line[:1] + content)
            else:
                # Just +/- alone
                normalized_lines.append(line[:1])
            continue

        # Keep context lines and other content as-is
        if line.startswith(b" ") or line == b"":
            normalized_lines.append(line)

    # Join normalized lines and compute SHA1
    normalized = b"\n".join(normalized_lines)
    return hashlib.sha1(normalized).hexdigest().encode("ascii")


def commit_patch_id(
    store: "BaseObjectStore", commit_id: ObjectID | RawObjectID
) -> bytes:
    """Compute patch ID for a commit.

    Args:
      store: Object store to read objects from
      commit_id: Commit ID (40-byte hex string)
    Returns:
      Patch ID (40-byte hex string)
    """
    from io import BytesIO

    commit = store[commit_id]
    assert isinstance(commit, Commit)

    # Get the parent tree (or empty tree for root commit)
    if commit.parents:
        parent = store[commit.parents[0]]
        assert isinstance(parent, Commit)
        parent_tree = parent.tree
    else:
        # Root commit - compare against empty tree
        parent_tree = None

    # Generate diff
    diff_output = BytesIO()
    write_tree_diff(diff_output, store, parent_tree, commit.tree)

    return patch_id(diff_output.getvalue())
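

# Illustrative sketch, not part of the original module: compare two commits by
# patch ID to detect cherry-picks or rebased duplicates. Assumes `store`
# contains both commits.
def _example_same_change(
    store: "BaseObjectStore", a: ObjectID, b: ObjectID
) -> bool:
    """Return True if the two commits introduce the same textual change (hypothetical helper)."""
    return commit_patch_id(store, a) == commit_patch_id(store, b)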


@dataclass
class MailinfoResult:
    """Result of mailinfo parsing.

    Attributes:
      author_name: Author's name
      author_email: Author's email address
      author_date: Author's date (if present in the email)
      subject: Processed subject line
      message: Commit message body
      patch: Patch content
      message_id: Message-ID header (if -m/--message-id was used)
    """

    author_name: str
    author_email: str
    author_date: str | None
    subject: str
    message: str
    patch: str
    message_id: str | None = None


def _munge_subject(subject: str, keep_subject: bool, keep_non_patch: bool) -> str:
    """Munge email subject line for commit message.

    Args:
      subject: Original subject line
      keep_subject: If True, keep subject intact (-k option)
      keep_non_patch: If True, only strip [PATCH] (-b option)
    Returns:
      Processed subject line
    """
    if keep_subject:
        return subject

    result = subject

    # First remove Re: prefixes (they can appear before brackets)
    while True:
        new_result = re.sub(r"^\s*(?:re|RE|Re):\s*", "", result, flags=re.IGNORECASE)
        if new_result == result:
            break
        result = new_result

    # Remove bracketed strings
    if keep_non_patch:
        # Only remove brackets containing "PATCH"
        # Match each bracket individually anywhere in the string
        while True:
            # Remove PATCH bracket, but be careful with whitespace
            new_result = re.sub(
                r"\[[^\]]*?PATCH[^\]]*?\](\s+)?", r"\1", result, flags=re.IGNORECASE
            )
            if new_result == result:
                break
            result = new_result
    else:
        # Remove all bracketed strings
        while True:
            new_result = re.sub(r"^\s*\[.*?\]\s*", "", result)
            if new_result == result:
                break
            result = new_result

    # Remove leading/trailing whitespace
    result = result.strip()

    # Normalize multiple whitespace to single space
    result = re.sub(r"\s+", " ", result)

    return result


def _find_scissors_line(lines: list[bytes]) -> int | None:
    """Find the scissors line in message body.

    Args:
      lines: List of lines in the message body
    Returns:
      Index of scissors line, or None if not found
    """
    scissors_pattern = re.compile(
        rb"^(?:>?\s*-+\s*)?(?:8<|>8)?\s*-+\s*$|^(?:>?\s*-+\s*)(?:cut here|scissors)(?:\s*-+)?$",
        re.IGNORECASE,
    )
    for i, line in enumerate(lines):
        if scissors_pattern.match(line.strip()):
            return i
    return None


def mailinfo(
    msg: email.message.Message | BinaryIO | TextIO,
    keep_subject: bool = False,
    keep_non_patch: bool = False,
    encoding: str | None = None,
    scissors: bool = False,
    message_id: bool = False,
) -> MailinfoResult:
    """Extract patch information from an email message.

    This function parses an email message and extracts commit metadata
    (author, email, subject) and separates the commit message from the
    patch content, similar to git mailinfo.

    Args:
      msg: Email message (email.message.Message object) or file handle to read from
      keep_subject: If True, keep subject intact without munging (-k)
      keep_non_patch: If True, only strip [PATCH] from brackets (-b)
      encoding: Character encoding to use (default: detect from message)
      scissors: If True, remove everything before scissors line
      message_id: If True, include Message-ID in commit message (-m)
    Returns:
      MailinfoResult with parsed information
    Raises:
      ValueError: If message is malformed or missing required fields
    """
    # Parse message if given a file handle
    parsed_msg: email.message.Message
    if not isinstance(msg, email.message.Message):
        if hasattr(msg, "read"):
            content = msg.read()
            if isinstance(content, bytes):
                bparser = email.parser.BytesParser()
                parsed_msg = bparser.parsebytes(content)
            else:
                sparser = email.parser.Parser()
                parsed_msg = sparser.parsestr(content)
        else:
            raise ValueError("msg must be an email.message.Message or file-like object")
    else:
        parsed_msg = msg

    # Detect encoding from message if not specified
    if encoding is None:
        encoding = parsed_msg.get_content_charset() or "utf-8"

    # Extract author information
    from_header = parsed_msg.get("From", "")
    if not from_header:
        raise ValueError("Email message missing 'From' header")

    # Parse "Name <email>" format
    author_name, author_email = email.utils.parseaddr(from_header)
    if not author_email:
        raise ValueError(
            f"Could not parse email address from 'From' header: {from_header}"
        )

    # Extract date
    date_header = parsed_msg.get("Date")
    author_date = date_header if date_header else None

    # Extract and process subject
    subject = parsed_msg.get("Subject", "")
    if not subject:
        subject = "(no subject)"
    # Convert Header object to string if needed
    subject = str(subject)
    # Remove newlines from subject
    subject = subject.replace("\n", " ").replace("\r", " ")
    subject = _munge_subject(subject, keep_subject, keep_non_patch)

    # Extract Message-ID if requested
    msg_id = None
    if message_id:
        msg_id = parsed_msg.get("Message-ID")

    # Get message body
    body = parsed_msg.get_payload(decode=True)
    if body is None:
        body = b""
    elif isinstance(body, str):
        body = body.encode(encoding)
    elif not isinstance(body, bytes):
        # Handle multipart or other types
        body = str(body).encode(encoding)

    # Split into lines
    lines = body.splitlines(keepends=True)

    # Handle scissors
    scissors_idx = None
    if scissors:
        scissors_idx = _find_scissors_line(lines)
        if scissors_idx is not None:
            # Remove everything up to and including scissors line
            lines = lines[scissors_idx + 1 :]

    # Separate commit message from patch
    # Look for the "---" separator that indicates start of diffstat/patch
    message_lines: list[bytes] = []
    patch_lines: list[bytes] = []
    in_patch = False

    for line in lines:
        if not in_patch and line == b"---\n":
            in_patch = True
            patch_lines.append(line)
        elif in_patch:
            # Stop at signature marker "-- "
            if line == b"-- \n":
                break
            patch_lines.append(line)
        else:
            message_lines.append(line)

    # Build commit message
    commit_message = b"".join(message_lines).decode(encoding, errors="replace")

    # Clean up commit message
    commit_message = commit_message.strip()

    # Append Message-ID if requested
    if message_id and msg_id:
        if commit_message:
            commit_message += "\n\n"
        commit_message += f"Message-ID: {msg_id}"

    # Build patch content
    patch_content = b"".join(patch_lines).decode(encoding, errors="replace")

    return MailinfoResult(
        author_name=author_name,
        author_email=author_email,
        author_date=author_date,
        subject=subject,
        message=commit_message,
        patch=patch_content,
        message_id=msg_id,
    )
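

# Illustrative sketch, not part of the original module: run mailinfo over a
# saved e-mail and pick apart the result. The file name is made up.
def _example_mailinfo(path: str = "patch.eml") -> MailinfoResult:
    """Parse a patch e-mail from disk (hypothetical helper)."""
    with open(path, "rb") as f:
        info = mailinfo(f, scissors=True)
    # info.subject becomes the first line of the commit message,
    # info.message the body, and info.patch the diff to apply.
    return info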