patch.py 31 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004
  1. # patch.py -- For dealing with packed-style patches.
  2. # Copyright (C) 2009-2013 Jelmer Vernooij <jelmer@jelmer.uk>
  3. #
  4. # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
  5. # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
  6. # General Public License as published by the Free Software Foundation; version 2.0
  7. # or (at your option) any later version. You can redistribute it and/or
  8. # modify it under the terms of either of these two licenses.
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. # You should have received a copy of the licenses; if not, see
  17. # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  18. # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  19. # License, Version 2.0.
  20. #
  21. """Classes for dealing with git am-style patches.
  22. These patches are basically unified diffs with some extra metadata tacked
  23. on.
  24. """
  25. import email.message
  26. import email.parser
  27. import email.utils
  28. import re
  29. import time
  30. from collections.abc import Generator, Sequence
  31. from dataclasses import dataclass
  32. from difflib import SequenceMatcher
  33. from typing import (
  34. IO,
  35. TYPE_CHECKING,
  36. BinaryIO,
  37. TextIO,
  38. )
  39. if TYPE_CHECKING:
  40. from .object_store import BaseObjectStore
  41. from .objects import S_ISGITLINK, Blob, Commit
# Number of leading bytes inspected by is_binary() when sniffing for NUL bytes.
FIRST_FEW_BYTES = 8000
# Algorithm used by unified_diff_with_algorithm() when callers pass algorithm=None.
DEFAULT_DIFF_ALGORITHM = "myers"
  44. class DiffAlgorithmNotAvailable(Exception):
  45. """Raised when a requested diff algorithm is not available."""
  46. def __init__(self, algorithm: str, install_hint: str = "") -> None:
  47. """Initialize exception.
  48. Args:
  49. algorithm: Name of the unavailable algorithm
  50. install_hint: Optional installation hint
  51. """
  52. self.algorithm = algorithm
  53. self.install_hint = install_hint
  54. if install_hint:
  55. super().__init__(
  56. f"Diff algorithm '{algorithm}' requested but not available. {install_hint}"
  57. )
  58. else:
  59. super().__init__(
  60. f"Diff algorithm '{algorithm}' requested but not available."
  61. )
def write_commit_patch(
    f: IO[bytes],
    commit: "Commit",
    contents: str | bytes,
    progress: tuple[int, int],
    version: str | None = None,
    encoding: str | None = None,
) -> None:
    """Write an individual commit as a git am-style patch to *f*.

    Args:
      f: File-like object (binary) to write to
      commit: Commit object the patch is generated from
      contents: Contents of the patch (the diff body)
      progress: tuple with current patch number and total.
      version: Version string to include in the patch trailer; defaults to
        the running Dulwich version
      encoding: Encoding to use for the patch; defaults to the stream's
        encoding, falling back to "ascii"
    """
    encoding = encoding or getattr(f, "encoding", "ascii")
    # getattr can still return None when the stream exposes encoding=None.
    if encoding is None:
        encoding = "ascii"
    if isinstance(contents, str):
        contents = contents.encode(encoding)
    (num, total) = progress
    # mbox-style "From <sha> <date>" separator line expected by git am.
    f.write(
        b"From "
        + commit.id
        + b" "
        + time.ctime(commit.commit_time).encode(encoding)
        + b"\n"
    )
    f.write(b"From: " + commit.author + b"\n")
    # NOTE(review): this uses the current local time rather than the commit's
    # author/commit time — confirm whether that is intentional.
    f.write(
        b"Date: " + time.strftime("%a, %d %b %Y %H:%M:%S %Z").encode(encoding) + b"\n"
    )
    f.write(
        (f"Subject: [PATCH {num}/{total}] ").encode(encoding) + commit.message + b"\n"
    )
    f.write(b"\n")
    f.write(b"---\n")
    # Best effort: include a diffstat if the external "diffstat" tool exists.
    try:
        import subprocess

        p = subprocess.Popen(
            ["diffstat"], stdout=subprocess.PIPE, stdin=subprocess.PIPE
        )
    except (ImportError, OSError):
        pass  # diffstat not available?
    else:
        (diffstat, _) = p.communicate(contents)
        f.write(diffstat)
        f.write(b"\n")
    f.write(contents)
    # "-- \n" is the conventional signature separator; parsers stop here.
    f.write(b"-- \n")
    if version is None:
        from dulwich import __version__ as dulwich_version

        # __version__ is a (major, minor, micro) tuple.
        f.write(b"Dulwich %d.%d.%d\n" % dulwich_version)
    else:
        if encoding is None:
            encoding = "ascii"
        f.write(version.encode(encoding) + b"\n")
  123. def get_summary(commit: "Commit") -> str:
  124. """Determine the summary line for use in a filename.
  125. Args:
  126. commit: Commit
  127. Returns: Summary string
  128. """
  129. decoded = commit.message.decode(errors="replace")
  130. lines = decoded.splitlines()
  131. return lines[0].replace(" ", "-") if lines else ""
  132. # Unified Diff
  133. def _format_range_unified(start: int, stop: int) -> str:
  134. """Convert range to the "ed" format."""
  135. # Per the diff spec at http://www.unix.org/single_unix_specification/
  136. beginning = start + 1 # lines start numbering with one
  137. length = stop - start
  138. if length == 1:
  139. return f"{beginning}"
  140. if not length:
  141. beginning -= 1 # empty ranges begin at line just before the range
  142. return f"{beginning},{length}"
  143. def unified_diff(
  144. a: Sequence[bytes],
  145. b: Sequence[bytes],
  146. fromfile: bytes = b"",
  147. tofile: bytes = b"",
  148. fromfiledate: str = "",
  149. tofiledate: str = "",
  150. n: int = 3,
  151. lineterm: str = "\n",
  152. tree_encoding: str = "utf-8",
  153. output_encoding: str = "utf-8",
  154. ) -> Generator[bytes, None, None]:
  155. """difflib.unified_diff that can detect "No newline at end of file" as original "git diff" does.
  156. Based on the same function in Python2.7 difflib.py
  157. """
  158. started = False
  159. for group in SequenceMatcher(a=a, b=b).get_grouped_opcodes(n):
  160. if not started:
  161. started = True
  162. fromdate = f"\t{fromfiledate}" if fromfiledate else ""
  163. todate = f"\t{tofiledate}" if tofiledate else ""
  164. yield f"--- {fromfile.decode(tree_encoding)}{fromdate}{lineterm}".encode(
  165. output_encoding
  166. )
  167. yield f"+++ {tofile.decode(tree_encoding)}{todate}{lineterm}".encode(
  168. output_encoding
  169. )
  170. first, last = group[0], group[-1]
  171. file1_range = _format_range_unified(first[1], last[2])
  172. file2_range = _format_range_unified(first[3], last[4])
  173. yield f"@@ -{file1_range} +{file2_range} @@{lineterm}".encode(output_encoding)
  174. for tag, i1, i2, j1, j2 in group:
  175. if tag == "equal":
  176. for line in a[i1:i2]:
  177. yield b" " + line
  178. continue
  179. if tag in ("replace", "delete"):
  180. for line in a[i1:i2]:
  181. if not line[-1:] == b"\n":
  182. line += b"\n\\ No newline at end of file\n"
  183. yield b"-" + line
  184. if tag in ("replace", "insert"):
  185. for line in b[j1:j2]:
  186. if not line[-1:] == b"\n":
  187. line += b"\n\\ No newline at end of file\n"
  188. yield b"+" + line
  189. def _get_sequence_matcher(
  190. algorithm: str, a: Sequence[bytes], b: Sequence[bytes]
  191. ) -> SequenceMatcher[bytes]:
  192. """Get appropriate sequence matcher for the given algorithm.
  193. Args:
  194. algorithm: Diff algorithm ("myers" or "patience")
  195. a: First sequence
  196. b: Second sequence
  197. Returns:
  198. Configured sequence matcher instance
  199. Raises:
  200. DiffAlgorithmNotAvailable: If patience requested but not available
  201. """
  202. if algorithm == "patience":
  203. try:
  204. from patiencediff import PatienceSequenceMatcher
  205. return PatienceSequenceMatcher(None, a, b) # type: ignore[no-any-return,unused-ignore]
  206. except ImportError:
  207. raise DiffAlgorithmNotAvailable(
  208. "patience", "Install with: pip install 'dulwich[patiencediff]'"
  209. )
  210. else:
  211. return SequenceMatcher(a=a, b=b)
  212. def unified_diff_with_algorithm(
  213. a: Sequence[bytes],
  214. b: Sequence[bytes],
  215. fromfile: bytes = b"",
  216. tofile: bytes = b"",
  217. fromfiledate: str = "",
  218. tofiledate: str = "",
  219. n: int = 3,
  220. lineterm: str = "\n",
  221. tree_encoding: str = "utf-8",
  222. output_encoding: str = "utf-8",
  223. algorithm: str | None = None,
  224. ) -> Generator[bytes, None, None]:
  225. """Generate unified diff with specified algorithm.
  226. Args:
  227. a: First sequence of lines
  228. b: Second sequence of lines
  229. fromfile: Name of first file
  230. tofile: Name of second file
  231. fromfiledate: Date of first file
  232. tofiledate: Date of second file
  233. n: Number of context lines
  234. lineterm: Line terminator
  235. tree_encoding: Encoding for tree paths
  236. output_encoding: Encoding for output
  237. algorithm: Diff algorithm to use ("myers" or "patience")
  238. Returns:
  239. Generator yielding diff lines
  240. Raises:
  241. DiffAlgorithmNotAvailable: If patience algorithm requested but patiencediff not available
  242. """
  243. if algorithm is None:
  244. algorithm = DEFAULT_DIFF_ALGORITHM
  245. matcher = _get_sequence_matcher(algorithm, a, b)
  246. started = False
  247. for group in matcher.get_grouped_opcodes(n):
  248. if not started:
  249. started = True
  250. fromdate = f"\t{fromfiledate}" if fromfiledate else ""
  251. todate = f"\t{tofiledate}" if tofiledate else ""
  252. yield f"--- {fromfile.decode(tree_encoding)}{fromdate}{lineterm}".encode(
  253. output_encoding
  254. )
  255. yield f"+++ {tofile.decode(tree_encoding)}{todate}{lineterm}".encode(
  256. output_encoding
  257. )
  258. first, last = group[0], group[-1]
  259. file1_range = _format_range_unified(first[1], last[2])
  260. file2_range = _format_range_unified(first[3], last[4])
  261. yield f"@@ -{file1_range} +{file2_range} @@{lineterm}".encode(output_encoding)
  262. for tag, i1, i2, j1, j2 in group:
  263. if tag == "equal":
  264. for line in a[i1:i2]:
  265. yield b" " + line
  266. continue
  267. if tag in ("replace", "delete"):
  268. for line in a[i1:i2]:
  269. if not line[-1:] == b"\n":
  270. line += b"\n\\ No newline at end of file\n"
  271. yield b"-" + line
  272. if tag in ("replace", "insert"):
  273. for line in b[j1:j2]:
  274. if not line[-1:] == b"\n":
  275. line += b"\n\\ No newline at end of file\n"
  276. yield b"+" + line
  277. def is_binary(content: bytes) -> bool:
  278. """See if the first few bytes contain any null characters.
  279. Args:
  280. content: Bytestring to check for binary content
  281. """
  282. return b"\0" in content[:FIRST_FEW_BYTES]
  283. def shortid(hexsha: bytes | None) -> bytes:
  284. """Get short object ID.
  285. Args:
  286. hexsha: Full hex SHA or None
  287. Returns:
  288. 7-character short ID
  289. """
  290. if hexsha is None:
  291. return b"0" * 7
  292. else:
  293. return hexsha[:7]
  294. def patch_filename(p: bytes | None, root: bytes) -> bytes:
  295. """Generate patch filename.
  296. Args:
  297. p: Path or None
  298. root: Root directory
  299. Returns:
  300. Full patch filename
  301. """
  302. if p is None:
  303. return b"/dev/null"
  304. else:
  305. return root + b"/" + p
def write_object_diff(
    f: IO[bytes],
    store: "BaseObjectStore",
    old_file: tuple[bytes | None, int | None, bytes | None],
    new_file: tuple[bytes | None, int | None, bytes | None],
    diff_binary: bool = False,
    diff_algorithm: str | None = None,
) -> None:
    """Write the diff for an object.

    Args:
      f: File-like object to write to
      store: Store to retrieve objects from, if necessary
      old_file: (path, mode, hexsha) tuple
      new_file: (path, mode, hexsha) tuple
      diff_binary: Whether to diff files even if they
        are considered binary files by is_binary().
      diff_algorithm: Algorithm to use for diffing ("myers" or "patience")

    Note: the tuple elements should be None for nonexistent files
    """
    (old_path, old_mode, old_id) = old_file
    (new_path, new_mode, new_id) = new_file
    patched_old_path = patch_filename(old_path, b"a")
    patched_new_path = patch_filename(new_path, b"b")

    def content(mode: int | None, hexsha: bytes | None) -> Blob:
        """Get blob content for a file.

        Args:
          mode: File mode
          hexsha: Object SHA
        Returns:
          Blob object
        """
        if hexsha is None:
            # Nonexistent file: diff against empty content.
            return Blob.from_string(b"")
        elif mode is not None and S_ISGITLINK(mode):
            # Submodule entry: represent it the way git shows gitlinks.
            return Blob.from_string(b"Subproject commit " + hexsha + b"\n")
        else:
            obj = store[hexsha]
            if isinstance(obj, Blob):
                return obj
            else:
                # Fallback for non-blob objects
                return Blob.from_string(obj.as_raw_string())

    def lines(content: "Blob") -> list[bytes]:
        """Split blob content into lines.

        Args:
          content: Blob content
        Returns:
          List of lines
        """
        if not content:
            return []
        else:
            return content.splitlines()

    f.writelines(
        gen_diff_header((old_path, new_path), (old_mode, new_mode), (old_id, new_id))
    )
    old_content = content(old_mode, old_id)
    new_content = content(new_mode, new_id)
    # Match git's default behaviour of not showing binary file contents.
    if not diff_binary and (is_binary(old_content.data) or is_binary(new_content.data)):
        binary_diff = (
            b"Binary files "
            + patched_old_path
            + b" and "
            + patched_new_path
            + b" differ\n"
        )
        f.write(binary_diff)
    else:
        f.writelines(
            unified_diff_with_algorithm(
                lines(old_content),
                lines(new_content),
                patched_old_path,
                patched_new_path,
                algorithm=diff_algorithm,
            )
        )
  383. # TODO(jelmer): Support writing unicode, rather than bytes.
  384. def gen_diff_header(
  385. paths: tuple[bytes | None, bytes | None],
  386. modes: tuple[int | None, int | None],
  387. shas: tuple[bytes | None, bytes | None],
  388. ) -> Generator[bytes, None, None]:
  389. """Write a blob diff header.
  390. Args:
  391. paths: Tuple with old and new path
  392. modes: Tuple with old and new modes
  393. shas: Tuple with old and new shas
  394. """
  395. (old_path, new_path) = paths
  396. (old_mode, new_mode) = modes
  397. (old_sha, new_sha) = shas
  398. if old_path is None and new_path is not None:
  399. old_path = new_path
  400. if new_path is None and old_path is not None:
  401. new_path = old_path
  402. old_path = patch_filename(old_path, b"a")
  403. new_path = patch_filename(new_path, b"b")
  404. yield b"diff --git " + old_path + b" " + new_path + b"\n"
  405. if old_mode != new_mode:
  406. if new_mode is not None:
  407. if old_mode is not None:
  408. yield (f"old file mode {old_mode:o}\n").encode("ascii")
  409. yield (f"new file mode {new_mode:o}\n").encode("ascii")
  410. else:
  411. yield (f"deleted file mode {old_mode:o}\n").encode("ascii")
  412. yield b"index " + shortid(old_sha) + b".." + shortid(new_sha)
  413. if new_mode is not None and old_mode is not None:
  414. yield (f" {new_mode:o}").encode("ascii")
  415. yield b"\n"
  416. # TODO(jelmer): Support writing unicode, rather than bytes.
  417. def write_blob_diff(
  418. f: IO[bytes],
  419. old_file: tuple[bytes | None, int | None, "Blob | None"],
  420. new_file: tuple[bytes | None, int | None, "Blob | None"],
  421. diff_algorithm: str | None = None,
  422. ) -> None:
  423. """Write blob diff.
  424. Args:
  425. f: File-like object to write to
  426. old_file: (path, mode, hexsha) tuple (None if nonexisting)
  427. new_file: (path, mode, hexsha) tuple (None if nonexisting)
  428. diff_algorithm: Algorithm to use for diffing ("myers" or "patience")
  429. Note: The use of write_object_diff is recommended over this function.
  430. """
  431. (old_path, old_mode, old_blob) = old_file
  432. (new_path, new_mode, new_blob) = new_file
  433. patched_old_path = patch_filename(old_path, b"a")
  434. patched_new_path = patch_filename(new_path, b"b")
  435. def lines(blob: "Blob | None") -> list[bytes]:
  436. """Split blob content into lines.
  437. Args:
  438. blob: Blob object or None
  439. Returns:
  440. List of lines
  441. """
  442. if blob is not None:
  443. return blob.splitlines()
  444. else:
  445. return []
  446. f.writelines(
  447. gen_diff_header(
  448. (old_path, new_path),
  449. (old_mode, new_mode),
  450. (getattr(old_blob, "id", None), getattr(new_blob, "id", None)),
  451. )
  452. )
  453. old_contents = lines(old_blob)
  454. new_contents = lines(new_blob)
  455. f.writelines(
  456. unified_diff_with_algorithm(
  457. old_contents,
  458. new_contents,
  459. patched_old_path,
  460. patched_new_path,
  461. algorithm=diff_algorithm,
  462. )
  463. )
  464. def write_tree_diff(
  465. f: IO[bytes],
  466. store: "BaseObjectStore",
  467. old_tree: bytes | None,
  468. new_tree: bytes | None,
  469. diff_binary: bool = False,
  470. diff_algorithm: str | None = None,
  471. ) -> None:
  472. """Write tree diff.
  473. Args:
  474. f: File-like object to write to.
  475. store: Object store to read from
  476. old_tree: Old tree id
  477. new_tree: New tree id
  478. diff_binary: Whether to diff files even if they
  479. are considered binary files by is_binary().
  480. diff_algorithm: Algorithm to use for diffing ("myers" or "patience")
  481. """
  482. changes = store.tree_changes(old_tree, new_tree)
  483. for (oldpath, newpath), (oldmode, newmode), (oldsha, newsha) in changes:
  484. write_object_diff(
  485. f,
  486. store,
  487. (oldpath, oldmode, oldsha),
  488. (newpath, newmode, newsha),
  489. diff_binary=diff_binary,
  490. diff_algorithm=diff_algorithm,
  491. )
  492. def git_am_patch_split(
  493. f: TextIO | BinaryIO, encoding: str | None = None
  494. ) -> tuple["Commit", bytes, bytes | None]:
  495. """Parse a git-am-style patch and split it up into bits.
  496. Args:
  497. f: File-like object to parse
  498. encoding: Encoding to use when creating Git objects
  499. Returns: Tuple with commit object, diff contents and git version
  500. """
  501. encoding = encoding or getattr(f, "encoding", "ascii")
  502. encoding = encoding or "ascii"
  503. contents = f.read()
  504. if isinstance(contents, bytes):
  505. bparser = email.parser.BytesParser()
  506. msg = bparser.parsebytes(contents)
  507. else:
  508. uparser = email.parser.Parser()
  509. msg = uparser.parsestr(contents)
  510. return parse_patch_message(msg, encoding)
def parse_patch_message(
    msg: email.message.Message, encoding: str | None = None
) -> tuple["Commit", bytes, bytes | None]:
    """Extract a Commit object and patch from an e-mail message.

    Args:
      msg: An email message (email.message.Message)
      encoding: Encoding to use to encode Git commits
    Returns: Tuple with commit object, diff contents and git version
    """
    c = Commit()
    if encoding is None:
        encoding = "ascii"
    # NOTE(review): assumes "From" and "Subject" headers are present; a
    # missing header would raise AttributeError here — confirm callers.
    c.author = msg["from"].encode(encoding)
    c.committer = msg["from"].encode(encoding)
    try:
        patch_tag_start = msg["subject"].index("[PATCH")
    except ValueError:
        subject = msg["subject"]
    else:
        # Strip everything up to and including the "] " that closes the
        # "[PATCH n/m]" tag.
        close = msg["subject"].index("] ", patch_tag_start)
        subject = msg["subject"][close + 2 :]
    c.message = (subject.replace("\n", "") + "\n").encode(encoding)
    first = True
    body = msg.get_payload(decode=True)
    if isinstance(body, str):
        body = body.encode(encoding)
    if isinstance(body, bytes):
        lines = body.splitlines(True)
    else:
        # Handle other types by converting to string first
        lines = str(body).encode(encoding).splitlines(True)
    line_iter = iter(lines)
    # Everything before the "---" separator belongs to the commit message.
    for line in line_iter:
        if line == b"---\n":
            break
        if first:
            # A leading "From: " body line overrides the header author.
            if line.startswith(b"From: "):
                c.author = line[len(b"From: ") :].rstrip()
            else:
                c.message += b"\n" + line
            first = False
        else:
            c.message += line
    # Everything after "---" and before the "-- \n" signature is the diff.
    diff = b""
    for line in line_iter:
        if line == b"-- \n":
            break
        diff += line
    # The line after the signature marker carries the git/dulwich version.
    try:
        version = next(line_iter).rstrip(b"\n")
    except StopIteration:
        version = None
    return c, diff, version
  564. def patch_id(diff_data: bytes) -> bytes:
  565. """Compute patch ID for a diff.
  566. The patch ID is computed by normalizing the diff and computing a SHA1 hash.
  567. This follows git's patch-id algorithm which:
  568. 1. Removes whitespace from lines starting with + or -
  569. 2. Replaces line numbers in @@ headers with a canonical form
  570. 3. Computes SHA1 of the result
  571. Args:
  572. diff_data: Raw diff data as bytes
  573. Returns:
  574. SHA1 hash of normalized diff (40-byte hex string)
  575. TODO: This implementation uses a simple line-by-line approach. For better
  576. compatibility with git's patch-id, consider using proper patch parsing that:
  577. - Handles edge cases in diff format (binary diffs, mode changes, etc.)
  578. - Properly parses unified diff format according to the spec
  579. - Matches git's exact normalization algorithm byte-for-byte
  580. See git's patch-id.c for reference implementation.
  581. """
  582. import hashlib
  583. import re
  584. # Normalize the diff for patch-id computation
  585. normalized_lines = []
  586. for line in diff_data.split(b"\n"):
  587. # Skip diff headers (diff --git, index, ---, +++)
  588. if line.startswith(
  589. (
  590. b"diff --git ",
  591. b"index ",
  592. b"--- ",
  593. b"+++ ",
  594. b"new file mode ",
  595. b"old file mode ",
  596. b"deleted file mode ",
  597. b"new mode ",
  598. b"old mode ",
  599. b"similarity index ",
  600. b"dissimilarity index ",
  601. b"rename from ",
  602. b"rename to ",
  603. b"copy from ",
  604. b"copy to ",
  605. )
  606. ):
  607. continue
  608. # Normalize @@ headers to a canonical form
  609. if line.startswith(b"@@"):
  610. # Replace line numbers with canonical form
  611. match = re.match(rb"^@@\s+-\d+(?:,\d+)?\s+\+\d+(?:,\d+)?\s+@@", line)
  612. if match:
  613. # Use canonical hunk header without line numbers
  614. normalized_lines.append(b"@@")
  615. continue
  616. # For +/- lines, strip all whitespace
  617. if line.startswith((b"+", b"-")):
  618. # Keep the +/- prefix but remove all whitespace from the rest
  619. if len(line) > 1:
  620. # Remove all whitespace from the content
  621. content = line[1:].replace(b" ", b"").replace(b"\t", b"")
  622. normalized_lines.append(line[:1] + content)
  623. else:
  624. # Just +/- alone
  625. normalized_lines.append(line[:1])
  626. continue
  627. # Keep context lines and other content as-is
  628. if line.startswith(b" ") or line == b"":
  629. normalized_lines.append(line)
  630. # Join normalized lines and compute SHA1
  631. normalized = b"\n".join(normalized_lines)
  632. return hashlib.sha1(normalized).hexdigest().encode("ascii")
  633. def commit_patch_id(store: "BaseObjectStore", commit_id: bytes) -> bytes:
  634. """Compute patch ID for a commit.
  635. Args:
  636. store: Object store to read objects from
  637. commit_id: Commit ID (40-byte hex string)
  638. Returns:
  639. Patch ID (40-byte hex string)
  640. """
  641. from io import BytesIO
  642. commit = store[commit_id]
  643. assert isinstance(commit, Commit)
  644. # Get the parent tree (or empty tree for root commit)
  645. if commit.parents:
  646. parent = store[commit.parents[0]]
  647. assert isinstance(parent, Commit)
  648. parent_tree = parent.tree
  649. else:
  650. # Root commit - compare against empty tree
  651. parent_tree = None
  652. # Generate diff
  653. diff_output = BytesIO()
  654. write_tree_diff(diff_output, store, parent_tree, commit.tree)
  655. return patch_id(diff_output.getvalue())
@dataclass
class MailinfoResult:
    """Result of mailinfo parsing.

    Attributes:
      author_name: Author's name
      author_email: Author's email address
      author_date: Author's date (if present in the email)
      subject: Processed subject line
      message: Commit message body
      patch: Patch content
      message_id: Message-ID header (if -m/--message-id was used)
    """

    author_name: str
    author_email: str
    author_date: str | None  # raw Date: header value, not parsed/normalized
    subject: str  # subject after Re:/[PATCH] munging
    message: str
    patch: str
    message_id: str | None = None  # only populated when message_id=True was passed
  675. def _munge_subject(subject: str, keep_subject: bool, keep_non_patch: bool) -> str:
  676. """Munge email subject line for commit message.
  677. Args:
  678. subject: Original subject line
  679. keep_subject: If True, keep subject intact (-k option)
  680. keep_non_patch: If True, only strip [PATCH] (-b option)
  681. Returns:
  682. Processed subject line
  683. """
  684. if keep_subject:
  685. return subject
  686. result = subject
  687. # First remove Re: prefixes (they can appear before brackets)
  688. while True:
  689. new_result = re.sub(r"^\s*(?:re|RE|Re):\s*", "", result, flags=re.IGNORECASE)
  690. if new_result == result:
  691. break
  692. result = new_result
  693. # Remove bracketed strings
  694. if keep_non_patch:
  695. # Only remove brackets containing "PATCH"
  696. # Match each bracket individually anywhere in the string
  697. while True:
  698. # Remove PATCH bracket, but be careful with whitespace
  699. new_result = re.sub(
  700. r"\[[^\]]*?PATCH[^\]]*?\](\s+)?", r"\1", result, flags=re.IGNORECASE
  701. )
  702. if new_result == result:
  703. break
  704. result = new_result
  705. else:
  706. # Remove all bracketed strings
  707. while True:
  708. new_result = re.sub(r"^\s*\[.*?\]\s*", "", result)
  709. if new_result == result:
  710. break
  711. result = new_result
  712. # Remove leading/trailing whitespace
  713. result = result.strip()
  714. # Normalize multiple whitespace to single space
  715. result = re.sub(r"\s+", " ", result)
  716. return result
  717. def _find_scissors_line(lines: list[bytes]) -> int | None:
  718. """Find the scissors line in message body.
  719. Args:
  720. lines: List of lines in the message body
  721. Returns:
  722. Index of scissors line, or None if not found
  723. """
  724. scissors_pattern = re.compile(
  725. rb"^(?:>?\s*-+\s*)?(?:8<|>8)?\s*-+\s*$|^(?:>?\s*-+\s*)(?:cut here|scissors)(?:\s*-+)?$",
  726. re.IGNORECASE,
  727. )
  728. for i, line in enumerate(lines):
  729. if scissors_pattern.match(line.strip()):
  730. return i
  731. return None
def mailinfo(
    msg: email.message.Message | BinaryIO | TextIO,
    keep_subject: bool = False,
    keep_non_patch: bool = False,
    encoding: str | None = None,
    scissors: bool = False,
    message_id: bool = False,
) -> MailinfoResult:
    """Extract patch information from an email message.

    This function parses an email message and extracts commit metadata
    (author, email, subject) and separates the commit message from the
    patch content, similar to git mailinfo.

    Args:
      msg: Email message (email.message.Message object) or file handle to read from
      keep_subject: If True, keep subject intact without munging (-k)
      keep_non_patch: If True, only strip [PATCH] from brackets (-b)
      encoding: Character encoding to use (default: detect from message)
      scissors: If True, remove everything before scissors line
      message_id: If True, include Message-ID in commit message (-m)

    Returns:
      MailinfoResult with parsed information

    Raises:
      ValueError: If message is malformed or missing required fields
    """
    # Parse message if given a file handle
    parsed_msg: email.message.Message
    if not isinstance(msg, email.message.Message):
        if hasattr(msg, "read"):
            content = msg.read()
            # Binary streams need BytesParser; text streams use Parser.
            if isinstance(content, bytes):
                bparser = email.parser.BytesParser()
                parsed_msg = bparser.parsebytes(content)
            else:
                sparser = email.parser.Parser()
                parsed_msg = sparser.parsestr(content)
        else:
            raise ValueError("msg must be an email.message.Message or file-like object")
    else:
        parsed_msg = msg
    # Detect encoding from message if not specified
    if encoding is None:
        encoding = parsed_msg.get_content_charset() or "utf-8"
    # Extract author information
    from_header = parsed_msg.get("From", "")
    if not from_header:
        raise ValueError("Email message missing 'From' header")
    # Parse "Name <email>" format
    author_name, author_email = email.utils.parseaddr(from_header)
    if not author_email:
        raise ValueError(
            f"Could not parse email address from 'From' header: {from_header}"
        )
    # Extract date (raw Date: header value, not parsed or normalized)
    date_header = parsed_msg.get("Date")
    author_date = date_header if date_header else None
    # Extract and process subject
    subject = parsed_msg.get("Subject", "")
    if not subject:
        subject = "(no subject)"
    # Convert Header object to string if needed
    subject = str(subject)
    # Remove newlines from subject
    subject = subject.replace("\n", " ").replace("\r", " ")
    subject = _munge_subject(subject, keep_subject, keep_non_patch)
    # Extract Message-ID if requested
    msg_id = None
    if message_id:
        msg_id = parsed_msg.get("Message-ID")
    # Get message body
    body = parsed_msg.get_payload(decode=True)
    if body is None:
        body = b""
    elif isinstance(body, str):
        body = body.encode(encoding)
    elif not isinstance(body, bytes):
        # Handle multipart or other types
        body = str(body).encode(encoding)
    # Split into lines
    lines = body.splitlines(keepends=True)
    # Handle scissors
    scissors_idx = None
    if scissors:
        scissors_idx = _find_scissors_line(lines)
        if scissors_idx is not None:
            # Remove everything up to and including scissors line
            lines = lines[scissors_idx + 1 :]
    # Separate commit message from patch
    # Look for the "---" separator that indicates start of diffstat/patch
    message_lines: list[bytes] = []
    patch_lines: list[bytes] = []
    in_patch = False
    for line in lines:
        if not in_patch and line == b"---\n":
            in_patch = True
            patch_lines.append(line)
        elif in_patch:
            # Stop at signature marker "-- "
            if line == b"-- \n":
                break
            patch_lines.append(line)
        else:
            message_lines.append(line)
    # Build commit message
    commit_message = b"".join(message_lines).decode(encoding, errors="replace")
    # Clean up commit message
    commit_message = commit_message.strip()
    # Append Message-ID if requested
    if message_id and msg_id:
        if commit_message:
            commit_message += "\n\n"
        commit_message += f"Message-ID: {msg_id}"
    # Build patch content
    patch_content = b"".join(patch_lines).decode(encoding, errors="replace")
    return MailinfoResult(
        author_name=author_name,
        author_email=author_email,
        author_date=author_date,
        subject=subject,
        message=commit_message,
        patch=patch_content,
        message_id=msg_id,
    )