  1. # patch.py -- For dealing with packed-style patches.
  2. # Copyright (C) 2009-2013 Jelmer Vernooij <jelmer@jelmer.uk>
  3. #
  4. # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
  5. # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
  6. # General Public License as published by the Free Software Foundation; version 2.0
  7. # or (at your option) any later version. You can redistribute it and/or
  8. # modify it under the terms of either of these two licenses.
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. # You should have received a copy of the licenses; if not, see
  17. # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  18. # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  19. # License, Version 2.0.
  20. #
  21. """Classes for dealing with git am-style patches.
  22. These patches are basically unified diffs with some extra metadata tacked
  23. on.
  24. """
  25. import email.parser
  26. import time
  27. from collections.abc import Generator
  28. from difflib import SequenceMatcher
  29. from typing import (
  30. IO,
  31. TYPE_CHECKING,
  32. BinaryIO,
  33. Optional,
  34. TextIO,
  35. Union,
  36. )
  37. if TYPE_CHECKING:
  38. import email.message
  39. from .object_store import BaseObjectStore
  40. from .objects import S_ISGITLINK, Blob, Commit
# Number of leading bytes inspected by is_binary() when sniffing for NUL bytes.
FIRST_FEW_BYTES = 8000
# Diff algorithm used when a caller passes algorithm=None.
DEFAULT_DIFF_ALGORITHM = "myers"
  43. class DiffAlgorithmNotAvailable(Exception):
  44. """Raised when a requested diff algorithm is not available."""
  45. def __init__(self, algorithm: str, install_hint: str = "") -> None:
  46. """Initialize exception.
  47. Args:
  48. algorithm: Name of the unavailable algorithm
  49. install_hint: Optional installation hint
  50. """
  51. self.algorithm = algorithm
  52. self.install_hint = install_hint
  53. if install_hint:
  54. super().__init__(
  55. f"Diff algorithm '{algorithm}' requested but not available. {install_hint}"
  56. )
  57. else:
  58. super().__init__(
  59. f"Diff algorithm '{algorithm}' requested but not available."
  60. )
  61. def write_commit_patch(
  62. f: IO[bytes],
  63. commit: "Commit",
  64. contents: Union[str, bytes],
  65. progress: tuple[int, int],
  66. version: Optional[str] = None,
  67. encoding: Optional[str] = None,
  68. ) -> None:
  69. """Write a individual file patch.
  70. Args:
  71. f: File-like object to write to
  72. commit: Commit object
  73. contents: Contents of the patch
  74. progress: tuple with current patch number and total.
  75. version: Version string to include in patch header
  76. encoding: Encoding to use for the patch
  77. Returns:
  78. tuple with filename and contents
  79. """
  80. encoding = encoding or getattr(f, "encoding", "ascii")
  81. if encoding is None:
  82. encoding = "ascii"
  83. if isinstance(contents, str):
  84. contents = contents.encode(encoding)
  85. (num, total) = progress
  86. f.write(
  87. b"From "
  88. + commit.id
  89. + b" "
  90. + time.ctime(commit.commit_time).encode(encoding)
  91. + b"\n"
  92. )
  93. f.write(b"From: " + commit.author + b"\n")
  94. f.write(
  95. b"Date: " + time.strftime("%a, %d %b %Y %H:%M:%S %Z").encode(encoding) + b"\n"
  96. )
  97. f.write(
  98. (f"Subject: [PATCH {num}/{total}] ").encode(encoding) + commit.message + b"\n"
  99. )
  100. f.write(b"\n")
  101. f.write(b"---\n")
  102. try:
  103. import subprocess
  104. p = subprocess.Popen(
  105. ["diffstat"], stdout=subprocess.PIPE, stdin=subprocess.PIPE
  106. )
  107. except (ImportError, OSError):
  108. pass # diffstat not available?
  109. else:
  110. (diffstat, _) = p.communicate(contents)
  111. f.write(diffstat)
  112. f.write(b"\n")
  113. f.write(contents)
  114. f.write(b"-- \n")
  115. if version is None:
  116. from dulwich import __version__ as dulwich_version
  117. f.write(b"Dulwich %d.%d.%d\n" % dulwich_version)
  118. else:
  119. if encoding is None:
  120. encoding = "ascii"
  121. f.write(version.encode(encoding) + b"\n")
  122. def get_summary(commit: "Commit") -> str:
  123. """Determine the summary line for use in a filename.
  124. Args:
  125. commit: Commit
  126. Returns: Summary string
  127. """
  128. decoded = commit.message.decode(errors="replace")
  129. lines = decoded.splitlines()
  130. return lines[0].replace(" ", "-") if lines else ""
  131. # Unified Diff
  132. def _format_range_unified(start: int, stop: int) -> str:
  133. """Convert range to the "ed" format."""
  134. # Per the diff spec at http://www.unix.org/single_unix_specification/
  135. beginning = start + 1 # lines start numbering with one
  136. length = stop - start
  137. if length == 1:
  138. return f"{beginning}"
  139. if not length:
  140. beginning -= 1 # empty ranges begin at line just before the range
  141. return f"{beginning},{length}"
  142. def unified_diff(
  143. a: list[bytes],
  144. b: list[bytes],
  145. fromfile: bytes = b"",
  146. tofile: bytes = b"",
  147. fromfiledate: str = "",
  148. tofiledate: str = "",
  149. n: int = 3,
  150. lineterm: str = "\n",
  151. tree_encoding: str = "utf-8",
  152. output_encoding: str = "utf-8",
  153. ) -> Generator[bytes, None, None]:
  154. """difflib.unified_diff that can detect "No newline at end of file" as original "git diff" does.
  155. Based on the same function in Python2.7 difflib.py
  156. """
  157. started = False
  158. for group in SequenceMatcher(a=a, b=b).get_grouped_opcodes(n):
  159. if not started:
  160. started = True
  161. fromdate = f"\t{fromfiledate}" if fromfiledate else ""
  162. todate = f"\t{tofiledate}" if tofiledate else ""
  163. yield f"--- {fromfile.decode(tree_encoding)}{fromdate}{lineterm}".encode(
  164. output_encoding
  165. )
  166. yield f"+++ {tofile.decode(tree_encoding)}{todate}{lineterm}".encode(
  167. output_encoding
  168. )
  169. first, last = group[0], group[-1]
  170. file1_range = _format_range_unified(first[1], last[2])
  171. file2_range = _format_range_unified(first[3], last[4])
  172. yield f"@@ -{file1_range} +{file2_range} @@{lineterm}".encode(output_encoding)
  173. for tag, i1, i2, j1, j2 in group:
  174. if tag == "equal":
  175. for line in a[i1:i2]:
  176. yield b" " + line
  177. continue
  178. if tag in ("replace", "delete"):
  179. for line in a[i1:i2]:
  180. if not line[-1:] == b"\n":
  181. line += b"\n\\ No newline at end of file\n"
  182. yield b"-" + line
  183. if tag in ("replace", "insert"):
  184. for line in b[j1:j2]:
  185. if not line[-1:] == b"\n":
  186. line += b"\n\\ No newline at end of file\n"
  187. yield b"+" + line
  188. def _get_sequence_matcher(algorithm: str, a: list[bytes], b: list[bytes]):
  189. """Get appropriate sequence matcher for the given algorithm.
  190. Args:
  191. algorithm: Diff algorithm ("myers" or "patience")
  192. a: First sequence
  193. b: Second sequence
  194. Returns:
  195. Configured sequence matcher instance
  196. Raises:
  197. DiffAlgorithmNotAvailable: If patience requested but not available
  198. """
  199. if algorithm == "patience":
  200. try:
  201. from patiencediff import PatienceSequenceMatcher
  202. return PatienceSequenceMatcher(None, a, b)
  203. except ImportError:
  204. raise DiffAlgorithmNotAvailable(
  205. "patience", "Install with: pip install 'dulwich[patiencediff]'"
  206. )
  207. else:
  208. return SequenceMatcher(a=a, b=b)
  209. def unified_diff_with_algorithm(
  210. a: list[bytes],
  211. b: list[bytes],
  212. fromfile: bytes = b"",
  213. tofile: bytes = b"",
  214. fromfiledate: str = "",
  215. tofiledate: str = "",
  216. n: int = 3,
  217. lineterm: str = "\n",
  218. tree_encoding: str = "utf-8",
  219. output_encoding: str = "utf-8",
  220. algorithm: Optional[str] = None,
  221. ) -> Generator[bytes, None, None]:
  222. """Generate unified diff with specified algorithm.
  223. Args:
  224. a: First sequence of lines
  225. b: Second sequence of lines
  226. fromfile: Name of first file
  227. tofile: Name of second file
  228. fromfiledate: Date of first file
  229. tofiledate: Date of second file
  230. n: Number of context lines
  231. lineterm: Line terminator
  232. tree_encoding: Encoding for tree paths
  233. output_encoding: Encoding for output
  234. algorithm: Diff algorithm to use ("myers" or "patience")
  235. Returns:
  236. Generator yielding diff lines
  237. Raises:
  238. DiffAlgorithmNotAvailable: If patience algorithm requested but patiencediff not available
  239. """
  240. if algorithm is None:
  241. algorithm = DEFAULT_DIFF_ALGORITHM
  242. matcher = _get_sequence_matcher(algorithm, a, b)
  243. started = False
  244. for group in matcher.get_grouped_opcodes(n):
  245. if not started:
  246. started = True
  247. fromdate = f"\t{fromfiledate}" if fromfiledate else ""
  248. todate = f"\t{tofiledate}" if tofiledate else ""
  249. yield f"--- {fromfile.decode(tree_encoding)}{fromdate}{lineterm}".encode(
  250. output_encoding
  251. )
  252. yield f"+++ {tofile.decode(tree_encoding)}{todate}{lineterm}".encode(
  253. output_encoding
  254. )
  255. first, last = group[0], group[-1]
  256. file1_range = _format_range_unified(first[1], last[2])
  257. file2_range = _format_range_unified(first[3], last[4])
  258. yield f"@@ -{file1_range} +{file2_range} @@{lineterm}".encode(output_encoding)
  259. for tag, i1, i2, j1, j2 in group:
  260. if tag == "equal":
  261. for line in a[i1:i2]:
  262. yield b" " + line
  263. continue
  264. if tag in ("replace", "delete"):
  265. for line in a[i1:i2]:
  266. if not line[-1:] == b"\n":
  267. line += b"\n\\ No newline at end of file\n"
  268. yield b"-" + line
  269. if tag in ("replace", "insert"):
  270. for line in b[j1:j2]:
  271. if not line[-1:] == b"\n":
  272. line += b"\n\\ No newline at end of file\n"
  273. yield b"+" + line
  274. def is_binary(content: bytes) -> bool:
  275. """See if the first few bytes contain any null characters.
  276. Args:
  277. content: Bytestring to check for binary content
  278. """
  279. return b"\0" in content[:FIRST_FEW_BYTES]
  280. def shortid(hexsha: Optional[bytes]) -> bytes:
  281. """Get short object ID.
  282. Args:
  283. hexsha: Full hex SHA or None
  284. Returns:
  285. 7-character short ID
  286. """
  287. if hexsha is None:
  288. return b"0" * 7
  289. else:
  290. return hexsha[:7]
  291. def patch_filename(p: Optional[bytes], root: bytes) -> bytes:
  292. """Generate patch filename.
  293. Args:
  294. p: Path or None
  295. root: Root directory
  296. Returns:
  297. Full patch filename
  298. """
  299. if p is None:
  300. return b"/dev/null"
  301. else:
  302. return root + b"/" + p
def write_object_diff(
    f: IO[bytes],
    store: "BaseObjectStore",
    old_file: tuple[Optional[bytes], Optional[int], Optional[bytes]],
    new_file: tuple[Optional[bytes], Optional[int], Optional[bytes]],
    diff_binary: bool = False,
    diff_algorithm: Optional[str] = None,
) -> None:
    """Write the diff for an object.

    Args:
      f: File-like object to write to
      store: Store to retrieve objects from, if necessary
      old_file: (path, mode, hexsha) tuple
      new_file: (path, mode, hexsha) tuple
      diff_binary: Whether to diff files even if they
        are considered binary files by is_binary().
      diff_algorithm: Algorithm to use for diffing ("myers" or "patience")

    Note: the tuple elements should be None for nonexistent files
    """
    (old_path, old_mode, old_id) = old_file
    (new_path, new_mode, new_id) = new_file
    # Prefix paths with git's conventional a/ and b/ roots (or b"/dev/null"
    # for a missing side).
    patched_old_path = patch_filename(old_path, b"a")
    patched_new_path = patch_filename(new_path, b"b")

    def content(mode: Optional[int], hexsha: Optional[bytes]) -> Blob:
        """Get blob content for a file.

        Args:
          mode: File mode
          hexsha: Object SHA
        Returns:
          Blob object: an empty blob when hexsha is None, a synthetic
          "Subproject commit <sha>" blob for gitlinks (submodules),
          otherwise the object fetched from the store.
        """
        if hexsha is None:
            return Blob.from_string(b"")
        elif mode is not None and S_ISGITLINK(mode):
            return Blob.from_string(b"Subproject commit " + hexsha + b"\n")
        else:
            obj = store[hexsha]
            if isinstance(obj, Blob):
                return obj
            else:
                # Fallback for non-blob objects
                return Blob.from_string(obj.as_raw_string())

    def lines(content: "Blob") -> list[bytes]:
        """Split blob content into lines.

        Args:
          content: Blob content
        Returns:
          List of lines (empty list for an empty/falsy blob)
        """
        if not content:
            return []
        else:
            return content.splitlines()

    # Header is written before the contents are fetched, mirroring git's
    # "diff --git"/"index" preamble.
    f.writelines(
        gen_diff_header((old_path, new_path), (old_mode, new_mode), (old_id, new_id))
    )
    old_content = content(old_mode, old_id)
    new_content = content(new_mode, new_id)
    # Emit a "Binary files ... differ" stub unless the caller forces a text
    # diff via diff_binary=True.
    if not diff_binary and (is_binary(old_content.data) or is_binary(new_content.data)):
        binary_diff = (
            b"Binary files "
            + patched_old_path
            + b" and "
            + patched_new_path
            + b" differ\n"
        )
        f.write(binary_diff)
    else:
        f.writelines(
            unified_diff_with_algorithm(
                lines(old_content),
                lines(new_content),
                patched_old_path,
                patched_new_path,
                algorithm=diff_algorithm,
            )
        )
  380. # TODO(jelmer): Support writing unicode, rather than bytes.
  381. def gen_diff_header(
  382. paths: tuple[Optional[bytes], Optional[bytes]],
  383. modes: tuple[Optional[int], Optional[int]],
  384. shas: tuple[Optional[bytes], Optional[bytes]],
  385. ) -> Generator[bytes, None, None]:
  386. """Write a blob diff header.
  387. Args:
  388. paths: Tuple with old and new path
  389. modes: Tuple with old and new modes
  390. shas: Tuple with old and new shas
  391. """
  392. (old_path, new_path) = paths
  393. (old_mode, new_mode) = modes
  394. (old_sha, new_sha) = shas
  395. if old_path is None and new_path is not None:
  396. old_path = new_path
  397. if new_path is None and old_path is not None:
  398. new_path = old_path
  399. old_path = patch_filename(old_path, b"a")
  400. new_path = patch_filename(new_path, b"b")
  401. yield b"diff --git " + old_path + b" " + new_path + b"\n"
  402. if old_mode != new_mode:
  403. if new_mode is not None:
  404. if old_mode is not None:
  405. yield (f"old file mode {old_mode:o}\n").encode("ascii")
  406. yield (f"new file mode {new_mode:o}\n").encode("ascii")
  407. else:
  408. yield (f"deleted file mode {old_mode:o}\n").encode("ascii")
  409. yield b"index " + shortid(old_sha) + b".." + shortid(new_sha)
  410. if new_mode is not None and old_mode is not None:
  411. yield (f" {new_mode:o}").encode("ascii")
  412. yield b"\n"
  413. # TODO(jelmer): Support writing unicode, rather than bytes.
  414. def write_blob_diff(
  415. f: IO[bytes],
  416. old_file: tuple[Optional[bytes], Optional[int], Optional["Blob"]],
  417. new_file: tuple[Optional[bytes], Optional[int], Optional["Blob"]],
  418. diff_algorithm: Optional[str] = None,
  419. ) -> None:
  420. """Write blob diff.
  421. Args:
  422. f: File-like object to write to
  423. old_file: (path, mode, hexsha) tuple (None if nonexisting)
  424. new_file: (path, mode, hexsha) tuple (None if nonexisting)
  425. diff_algorithm: Algorithm to use for diffing ("myers" or "patience")
  426. Note: The use of write_object_diff is recommended over this function.
  427. """
  428. (old_path, old_mode, old_blob) = old_file
  429. (new_path, new_mode, new_blob) = new_file
  430. patched_old_path = patch_filename(old_path, b"a")
  431. patched_new_path = patch_filename(new_path, b"b")
  432. def lines(blob: Optional["Blob"]) -> list[bytes]:
  433. """Split blob content into lines.
  434. Args:
  435. blob: Blob object or None
  436. Returns:
  437. List of lines
  438. """
  439. if blob is not None:
  440. return blob.splitlines()
  441. else:
  442. return []
  443. f.writelines(
  444. gen_diff_header(
  445. (old_path, new_path),
  446. (old_mode, new_mode),
  447. (getattr(old_blob, "id", None), getattr(new_blob, "id", None)),
  448. )
  449. )
  450. old_contents = lines(old_blob)
  451. new_contents = lines(new_blob)
  452. f.writelines(
  453. unified_diff_with_algorithm(
  454. old_contents,
  455. new_contents,
  456. patched_old_path,
  457. patched_new_path,
  458. algorithm=diff_algorithm,
  459. )
  460. )
  461. def write_tree_diff(
  462. f: IO[bytes],
  463. store: "BaseObjectStore",
  464. old_tree: Optional[bytes],
  465. new_tree: Optional[bytes],
  466. diff_binary: bool = False,
  467. diff_algorithm: Optional[str] = None,
  468. ) -> None:
  469. """Write tree diff.
  470. Args:
  471. f: File-like object to write to.
  472. store: Object store to read from
  473. old_tree: Old tree id
  474. new_tree: New tree id
  475. diff_binary: Whether to diff files even if they
  476. are considered binary files by is_binary().
  477. diff_algorithm: Algorithm to use for diffing ("myers" or "patience")
  478. """
  479. changes = store.tree_changes(old_tree, new_tree)
  480. for (oldpath, newpath), (oldmode, newmode), (oldsha, newsha) in changes:
  481. write_object_diff(
  482. f,
  483. store,
  484. (oldpath, oldmode, oldsha),
  485. (newpath, newmode, newsha),
  486. diff_binary=diff_binary,
  487. diff_algorithm=diff_algorithm,
  488. )
  489. def git_am_patch_split(
  490. f: Union[TextIO, BinaryIO], encoding: Optional[str] = None
  491. ) -> tuple["Commit", bytes, Optional[bytes]]:
  492. """Parse a git-am-style patch and split it up into bits.
  493. Args:
  494. f: File-like object to parse
  495. encoding: Encoding to use when creating Git objects
  496. Returns: Tuple with commit object, diff contents and git version
  497. """
  498. encoding = encoding or getattr(f, "encoding", "ascii")
  499. encoding = encoding or "ascii"
  500. contents = f.read()
  501. if isinstance(contents, bytes):
  502. bparser = email.parser.BytesParser()
  503. msg = bparser.parsebytes(contents)
  504. else:
  505. uparser = email.parser.Parser()
  506. msg = uparser.parsestr(contents)
  507. return parse_patch_message(msg, encoding)
def parse_patch_message(
    msg: "email.message.Message", encoding: Optional[str] = None
) -> tuple["Commit", bytes, Optional[bytes]]:
    """Extract a Commit object and patch from an e-mail message.

    Args:
      msg: An email message (email.message.Message)
      encoding: Encoding to use to encode Git commits
    Returns: Tuple with commit object, diff contents and git version
    """
    c = Commit()
    if encoding is None:
        encoding = "ascii"
    # Author and committer both default to the mail's From header; the
    # author may be overridden by a "From: " line in the body below.
    c.author = msg["from"].encode(encoding)
    c.committer = msg["from"].encode(encoding)
    try:
        patch_tag_start = msg["subject"].index("[PATCH")
    except ValueError:
        subject = msg["subject"]
    else:
        # Strip the "[PATCH n/m] " prefix added by git format-patch.
        close = msg["subject"].index("] ", patch_tag_start)
        subject = msg["subject"][close + 2 :]
    c.message = (subject.replace("\n", "") + "\n").encode(encoding)
    first = True
    body = msg.get_payload(decode=True)
    if isinstance(body, str):
        body = body.encode(encoding)
    if isinstance(body, bytes):
        lines = body.splitlines(True)
    else:
        # Handle other types by converting to string first
        lines = str(body).encode(encoding).splitlines(True)
    line_iter = iter(lines)
    # Everything before the "---" separator belongs to the commit message,
    # except a leading "From: " line which overrides the author.
    for line in line_iter:
        if line == b"---\n":
            break
        if first:
            if line.startswith(b"From: "):
                c.author = line[len(b"From: ") :].rstrip()
            else:
                # Extra "\n" inserts the blank line between subject and body.
                c.message += b"\n" + line
            first = False
        else:
            c.message += line
    # Everything between "---" and the "-- " signature marker is the diff.
    diff = b""
    for line in line_iter:
        if line == b"-- \n":
            break
        diff += line
    try:
        # The line following "-- " carries the generating tool's version.
        version = next(line_iter).rstrip(b"\n")
    except StopIteration:
        version = None
    return c, diff, version