patch.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505
  1. # patch.py -- For dealing with packed-style patches.
  2. # Copyright (C) 2009-2013 Jelmer Vernooij <jelmer@jelmer.uk>
  3. #
  4. # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
  5. # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
  6. # General Public License as published by the Free Software Foundation; version 2.0
  7. # or (at your option) any later version. You can redistribute it and/or
  8. # modify it under the terms of either of these two licenses.
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. # You should have received a copy of the licenses; if not, see
  17. # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  18. # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  19. # License, Version 2.0.
  20. #
  21. """Classes for dealing with git am-style patches.
  22. These patches are basically unified diffs with some extra metadata tacked
  23. on.
  24. """
  25. import email.parser
  26. import time
  27. from collections.abc import Generator
  28. from difflib import SequenceMatcher
  29. from typing import (
  30. TYPE_CHECKING,
  31. BinaryIO,
  32. Optional,
  33. TextIO,
  34. Union,
  35. )
  36. if TYPE_CHECKING:
  37. import email.message
  38. from .object_store import BaseObjectStore
  39. from .objects import S_ISGITLINK, Blob, Commit
  # Size of the leading window that is_binary() scans for NUL bytes.
  FIRST_FEW_BYTES = 8000
  def write_commit_patch(
      f: BinaryIO,
      commit: "Commit",
      contents: Union[str, bytes],
      progress: tuple[int, int],
      version: Optional[str] = None,
      encoding: Optional[str] = None,
  ) -> None:
      """Write a single commit as a git-am style patch.

      The output consists of mbox-like headers (``From``, ``From:``, ``Date:``
      and ``Subject:``), a ``---`` separator, an optional diffstat summary,
      the diff itself and a trailing ``-- `` signature with a version line.

      Args:
        f: Binary file-like object to write the patch to
        commit: Commit object supplying id, commit time, author and message
        contents: Diff text; a str is encoded with ``encoding`` first
        progress: tuple with current patch number and total.
        version: Version string for the signature line; when None the
          Dulwich version is written instead
        encoding: Encoding for the generated header text; falls back to
          ``f.encoding`` when present and then to "ascii"
      """
      # An explicit argument wins, then the stream's own encoding, then ASCII.
      encoding = encoding or getattr(f, "encoding", None) or "ascii"
      if isinstance(contents, str):
          contents = contents.encode(encoding)
      (num, total) = progress
      f.write(
          b"From "
          + commit.id
          + b" "
          + time.ctime(commit.commit_time).encode(encoding)
          + b"\n"
      )
      f.write(b"From: " + commit.author + b"\n")
      # Pass the commit time explicitly: strftime() without a time tuple
      # formats the current time, which would stamp the patch with the moment
      # it was generated rather than the commit's own date.
      commit_date = time.strftime(
          "%a, %d %b %Y %H:%M:%S %Z", time.localtime(commit.commit_time)
      )
      f.write(b"Date: " + commit_date.encode(encoding) + b"\n")
      f.write(
          (f"Subject: [PATCH {num}/{total}] ").encode(encoding) + commit.message + b"\n"
      )
      f.write(b"\n")
      f.write(b"---\n")
      try:
          import subprocess

          p = subprocess.Popen(
              ["diffstat"], stdout=subprocess.PIPE, stdin=subprocess.PIPE
          )
      except (ImportError, OSError):
          pass  # diffstat not available; the summary is optional
      else:
          (diffstat, _) = p.communicate(contents)
          f.write(diffstat)
          f.write(b"\n")
      f.write(contents)
      f.write(b"-- \n")
      if version is None:
          from dulwich import __version__ as dulwich_version

          f.write(b"Dulwich %d.%d.%d\n" % dulwich_version)
      else:
          f.write(version.encode(encoding) + b"\n")
  def get_summary(commit: "Commit") -> str:
      """Build a filename-friendly summary from a commit message.

      The message is decoded with undecodable bytes replaced, and the first
      line is returned with every space turned into a dash.

      Args:
        commit: Commit whose ``message`` bytes are summarised
      Returns: Summary string, or "" when the message has no lines
      """
      message_lines = commit.message.decode(errors="replace").splitlines()
      if not message_lines:
          return ""
      headline = message_lines[0]
      return headline.replace(" ", "-")
  107. # Unified Diff
  108. def _format_range_unified(start: int, stop: int) -> str:
  109. """Convert range to the "ed" format."""
  110. # Per the diff spec at http://www.unix.org/single_unix_specification/
  111. beginning = start + 1 # lines start numbering with one
  112. length = stop - start
  113. if length == 1:
  114. return f"{beginning}"
  115. if not length:
  116. beginning -= 1 # empty ranges begin at line just before the range
  117. return f"{beginning},{length}"
  def unified_diff(
      a: list[bytes],
      b: list[bytes],
      fromfile: bytes = b"",
      tofile: bytes = b"",
      fromfiledate: str = "",
      tofiledate: str = "",
      n: int = 3,
      lineterm: str = "\n",
      tree_encoding: str = "utf-8",
      output_encoding: str = "utf-8",
  ) -> Generator[bytes, None, None]:
      """Yield a unified diff of two lists of byte lines.

      Works like difflib.unified_diff (it is based on the Python 2.7 version)
      but appends "\\ No newline at end of file" to removed or added lines
      that lack a trailing newline, as "git diff" does.

      Args:
        a: Lines of the old content
        b: Lines of the new content
        fromfile: Old file name, decoded with ``tree_encoding``
        tofile: New file name, decoded with ``tree_encoding``
        fromfiledate: Optional date appended to the ``---`` line after a tab
        tofiledate: Optional date appended to the ``+++`` line after a tab
        n: Number of context lines passed to get_grouped_opcodes()
        lineterm: Terminator appended to the ``---``, ``+++`` and ``@@`` lines
        tree_encoding: Encoding used to decode the file names
        output_encoding: Encoding used to encode the generated header lines
      Returns: Generator of byte strings; nothing is yielded when the inputs
        produce no opcode groups
      """
      no_newline_marker = b"\n\\ No newline at end of file\n"
      header_emitted = False
      matcher = SequenceMatcher(a=a, b=b)
      for hunk in matcher.get_grouped_opcodes(n):
          if not header_emitted:
              # File header lines are produced once, ahead of the first hunk.
              header_emitted = True
              old_date = f"\t{fromfiledate}" if fromfiledate else ""
              new_date = f"\t{tofiledate}" if tofiledate else ""
              old_header = f"--- {fromfile.decode(tree_encoding)}{old_date}{lineterm}"
              yield old_header.encode(output_encoding)
              new_header = f"+++ {tofile.decode(tree_encoding)}{new_date}{lineterm}"
              yield new_header.encode(output_encoding)

          head, tail = hunk[0], hunk[-1]
          old_range = _format_range_unified(head[1], tail[2])
          new_range = _format_range_unified(head[3], tail[4])
          hunk_header = f"@@ -{old_range} +{new_range} @@{lineterm}"
          yield hunk_header.encode(output_encoding)

          for op, a_lo, a_hi, b_lo, b_hi in hunk:
              if op == "equal":
                  for text in a[a_lo:a_hi]:
                      yield b" " + text
                  continue
              if op in ("replace", "delete"):
                  for text in a[a_lo:a_hi]:
                      if text[-1:] != b"\n":
                          text += no_newline_marker
                      yield b"-" + text
              if op in ("replace", "insert"):
                  for text in b[b_lo:b_hi]:
                      if text[-1:] != b"\n":
                          text += no_newline_marker
                      yield b"+" + text
  def is_binary(content: bytes) -> bool:
      """Report whether content looks binary.

      Only the first FIRST_FEW_BYTES bytes are inspected; a NUL byte anywhere
      in that window marks the content as binary.

      Args:
        content: Bytestring to check for binary content
      Returns: True if a NUL byte occurs in the inspected prefix
      """
      head = content[:FIRST_FEW_BYTES]
      return b"\0" in head
  def shortid(hexsha: Optional[bytes]) -> bytes:
      """Abbreviate an object ID to its first seven characters.

      Args:
        hexsha: Full hex SHA, or None for a missing object
      Returns:
        The first 7 bytes of ``hexsha``, or seven ASCII zeros for None
      """
      return b"0" * 7 if hexsha is None else hexsha[:7]
  def patch_filename(p: Optional[bytes], root: bytes) -> bytes:
      """Build the file name shown in a patch header.

      Args:
        p: Path inside the tree, or None when the file does not exist
        root: Prefix directory placed in front of the path
      Returns:
        ``root/p``, or ``/dev/null`` when ``p`` is None
      """
      if p is not None:
          return root + b"/" + p
      return b"/dev/null"
  def write_object_diff(
      f: BinaryIO,
      store: "BaseObjectStore",
      old_file: tuple[Optional[bytes], Optional[int], Optional[bytes]],
      new_file: tuple[Optional[bytes], Optional[int], Optional[bytes]],
      diff_binary: bool = False,
  ) -> None:
      """Write the diff between two versions of an object.

      A git diff header is written first.  It is followed either by a
      "Binary files ... differ" line or by a unified diff of the contents.

      Args:
        f: File-like object to write to
        store: Store to retrieve objects from, if necessary
        old_file: (path, mode, hexsha) tuple
        new_file: (path, mode, hexsha) tuple
        diff_binary: Whether to diff files even if they
          are considered binary files by is_binary().

      Note: the tuple elements should be None for nonexistent files
      """
      from typing import cast

      old_path, old_mode, old_id = old_file
      new_path, new_mode, new_id = new_file
      shown_old_path = patch_filename(old_path, b"a")
      shown_new_path = patch_filename(new_path, b"b")

      def load_blob(mode: Optional[int], hexsha: Optional[bytes]) -> Blob:
          """Return the Blob to diff for one side of the change."""
          if hexsha is None:
              # Nonexistent file: diff against empty content.
              return cast(Blob, Blob.from_string(b""))
          if mode is not None and S_ISGITLINK(mode):
              # Gitlinks are represented by a one-line placeholder.
              return cast(Blob, Blob.from_string(b"Subproject commit " + hexsha + b"\n"))
          obj = store[hexsha]
          if isinstance(obj, Blob):
              return obj
          # Fallback for non-blob objects
          return cast(Blob, Blob.from_string(obj.as_raw_string()))

      def split_lines(blob: "Blob") -> list[bytes]:
          """Return the blob's lines, or an empty list for a falsy blob."""
          return blob.splitlines() if blob else []

      header = gen_diff_header(
          (old_path, new_path), (old_mode, new_mode), (old_id, new_id)
      )
      f.writelines(header)

      old_blob = load_blob(old_mode, old_id)
      new_blob = load_blob(new_mode, new_id)
      if diff_binary or not (is_binary(old_blob.data) or is_binary(new_blob.data)):
          f.writelines(
              unified_diff(
                  split_lines(old_blob),
                  split_lines(new_blob),
                  shown_old_path,
                  shown_new_path,
              )
          )
      else:
          f.write(
              b"Binary files "
              + shown_old_path
              + b" and "
              + shown_new_path
              + b" differ\n"
          )
  268. # TODO(jelmer): Support writing unicode, rather than bytes.
  def gen_diff_header(
      paths: tuple[Optional[bytes], Optional[bytes]],
      modes: tuple[Optional[int], Optional[int]],
      shas: tuple[Optional[bytes], Optional[bytes]],
  ) -> Generator[bytes, None, None]:
      """Generate the pieces of a blob diff header.

      Yields the ``diff --git`` line, any mode-change lines and the ``index``
      line.  The index line is produced as separate chunks, ending in a
      newline chunk.

      Args:
        paths: Tuple with old and new path
        modes: Tuple with old and new modes
        shas: Tuple with old and new shas
      """
      old_path, new_path = paths
      old_mode, new_mode = modes
      old_sha, new_sha = shas

      # When only one side has a path, both header names derive from it.
      shown_old = patch_filename(old_path if old_path is not None else new_path, b"a")
      shown_new = patch_filename(new_path if new_path is not None else old_path, b"b")
      yield b"diff --git " + shown_old + b" " + shown_new + b"\n"

      if old_mode != new_mode:
          if new_mode is None:
              yield (f"deleted file mode {old_mode:o}\n").encode("ascii")
          else:
              if old_mode is not None:
                  yield (f"old file mode {old_mode:o}\n").encode("ascii")
              yield (f"new file mode {new_mode:o}\n").encode("ascii")

      yield b"index " + shortid(old_sha) + b".." + shortid(new_sha)
      both_modes_known = new_mode is not None and old_mode is not None
      if both_modes_known:
          yield (f" {new_mode:o}").encode("ascii")
      yield b"\n"
  301. # TODO(jelmer): Support writing unicode, rather than bytes.
  def write_blob_diff(
      f: BinaryIO,
      old_file: tuple[Optional[bytes], Optional[int], Optional["Blob"]],
      new_file: tuple[Optional[bytes], Optional[int], Optional["Blob"]],
  ) -> None:
      """Write the diff between two blobs.

      Args:
        f: File-like object to write to
        old_file: (path, mode, blob) tuple (None if nonexisting)
        new_file: (path, mode, blob) tuple (None if nonexisting)

      Note: The use of write_object_diff is recommended over this function.
      """
      old_path, old_mode, old_blob = old_file
      new_path, new_mode, new_blob = new_file
      shown_old_path = patch_filename(old_path, b"a")
      shown_new_path = patch_filename(new_path, b"b")

      # A missing blob has no "id" attribute, so its header sha is None.
      blob_ids = (getattr(old_blob, "id", None), getattr(new_blob, "id", None))
      f.writelines(
          gen_diff_header((old_path, new_path), (old_mode, new_mode), blob_ids)
      )

      # A missing blob contributes no lines to the diff.
      old_lines = [] if old_blob is None else old_blob.splitlines()
      new_lines = [] if new_blob is None else new_blob.splitlines()
      f.writelines(unified_diff(old_lines, new_lines, shown_old_path, shown_new_path))
  def write_tree_diff(
      f: BinaryIO,
      store: "BaseObjectStore",
      old_tree: Optional[bytes],
      new_tree: Optional[bytes],
      diff_binary: bool = False,
  ) -> None:
      """Write the diff between two trees.

      Every change reported by ``store.tree_changes`` is written in turn
      with write_object_diff.

      Args:
        f: File-like object to write to.
        store: Object store queried for the tree changes and file contents
        old_tree: Old tree id
        new_tree: New tree id
        diff_binary: Whether to diff files even if they
          are considered binary files by is_binary().
      """
      for change_paths, change_modes, change_shas in store.tree_changes(
          old_tree, new_tree
      ):
          old_path, new_path = change_paths
          old_mode, new_mode = change_modes
          old_sha, new_sha = change_shas
          write_object_diff(
              f,
              store,
              (old_path, old_mode, old_sha),
              (new_path, new_mode, new_sha),
              diff_binary=diff_binary,
          )
  def git_am_patch_split(
      f: Union[TextIO, BinaryIO], encoding: Optional[str] = None
  ) -> tuple["Commit", bytes, Optional[bytes]]:
      """Read a git-am-style patch and split it into its parts.

      The whole stream is read and parsed as an e-mail message, using the
      bytes parser or the text parser depending on what ``f.read()`` returns.
      The parsed message is then handed to parse_patch_message.

      Args:
        f: File-like object to parse
        encoding: Encoding to use when creating Git objects; falls back to
          ``f.encoding`` and then to "ascii"
      Returns: Tuple with commit object, diff contents and git version
      """
      if not encoding:
          encoding = getattr(f, "encoding", "ascii")
      if not encoding:
          encoding = "ascii"

      raw = f.read()
      if isinstance(raw, bytes):
          msg = email.parser.BytesParser().parsebytes(raw)
      else:
          msg = email.parser.Parser().parsestr(raw)
      return parse_patch_message(msg, encoding)
  def parse_patch_message(
      msg: "email.message.Message", encoding: Optional[str] = None
  ) -> tuple["Commit", bytes, Optional[bytes]]:
      """Extract a Commit object and patch from an e-mail message.

      The ``From`` header becomes both author and committer.  The subject,
      minus anything up to the ``"] "`` that closes a ``[PATCH`` tag, forms
      the first line of the commit message.  Body lines before a ``---`` line
      extend the message, except that a leading ``From: `` body line replaces
      the author instead.  Everything between ``---`` and the ``-- ``
      signature marker is the diff; the line after the marker is the version.

      Args:
        msg: An email message (email.message.Message)
        encoding: Encoding to use to encode Git commits; defaults to "ascii"
      Returns: Tuple with commit object, diff contents and git version
        (None when no line follows the signature marker)
      """
      if encoding is None:
          encoding = "ascii"
      c = Commit()
      sender = msg["from"].encode(encoding)
      c.author = sender
      c.committer = sender

      # A missing Subject header is returned as None; treat it as empty.
      subject = msg["subject"] or ""
      tag_start = subject.find("[PATCH")
      if tag_start != -1:
          tag_end = subject.find("] ", tag_start)
          # Strip the tag only when it is actually closed.  An unterminated
          # "[PATCH" previously raised ValueError; the subject is now kept whole.
          if tag_end != -1:
              subject = subject[tag_end + 2 :]
      message_parts = [(subject.replace("\n", "") + "\n").encode(encoding)]

      body = msg.get_payload(decode=True)
      if isinstance(body, str):
          body = body.encode(encoding)
      if not isinstance(body, bytes):
          # Handle other types by converting to string first
          body = str(body).encode(encoding)
      line_iter = iter(body.splitlines(True))

      first = True
      for line in line_iter:
          if line == b"---\n":
              break
          if first:
              first = False
              if line.startswith(b"From: "):
                  # An in-body From: line overrides the header author.
                  c.author = line[len(b"From: ") :].rstrip()
                  continue
              # A blank line separates the subject from the message body.
              message_parts.append(b"\n")
          message_parts.append(line)
      # Join once instead of growing the bytes object line by line.
      c.message = b"".join(message_parts)

      diff_parts = []
      for line in line_iter:
          if line == b"-- \n":
              break
          diff_parts.append(line)
      diff = b"".join(diff_parts)

      version = next(line_iter, None)
      if version is not None:
          version = version.rstrip(b"\n")
      return c, diff, version