mbox.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308
  1. # mbox.py -- For dealing with mbox files
  2. # Copyright (C) 2025 Jelmer Vernooij <jelmer@jelmer.uk>
  3. #
  4. # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
  5. # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
  6. # General Public License as published by the Free Software Foundation; version 2.0
  7. # or (at your option) any later version. You can redistribute it and/or
  8. # modify it under the terms of either of these two licenses.
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. # You should have received a copy of the licenses; if not, see
  17. # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  18. # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  19. # License, Version 2.0.
  20. #
  21. """Classes for dealing with mbox files and Maildir.
  22. This module provides functionality to split mbox files and Maildir
  23. into individual message files, similar to git mailsplit, and to extract
  24. patch information from email messages, similar to git mailinfo.
  25. """
  26. __all__ = [
  27. "mailinfo",
  28. "split_maildir",
  29. "split_mbox",
  30. ]
  31. import mailbox
  32. import os
  33. from collections.abc import Iterable, Iterator
  34. from pathlib import Path
  35. from typing import TYPE_CHECKING, BinaryIO, TextIO
  36. if TYPE_CHECKING:
  37. from .patch import MailinfoResult
  38. def split_mbox(
  39. input_file: str | bytes | BinaryIO,
  40. output_dir: str | bytes | Path,
  41. start_number: int = 1,
  42. precision: int = 4,
  43. keep_cr: bool = False,
  44. mboxrd: bool = False,
  45. ) -> list[str]:
  46. r"""Split an mbox file into individual message files.
  47. Args:
  48. input_file: Path to mbox file or file-like object. If None, reads from stdin.
  49. output_dir: Directory where individual messages will be written
  50. start_number: Starting number for output files (default: 1)
  51. precision: Number of digits for output filenames (default: 4)
  52. keep_cr: If True, preserve \r in lines ending with \r\n (default: False)
  53. mboxrd: If True, treat input as mboxrd format and reverse escaping (default: False)
  54. Returns:
  55. List of output file paths that were created
  56. Raises:
  57. ValueError: If output_dir doesn't exist or isn't a directory
  58. OSError: If there are issues reading/writing files
  59. """
  60. # Convert output_dir to Path for easier manipulation
  61. if isinstance(output_dir, bytes):
  62. output_dir = output_dir.decode("utf-8")
  63. output_path = Path(output_dir)
  64. if not output_path.exists():
  65. raise ValueError(f"Output directory does not exist: {output_dir}")
  66. if not output_path.is_dir():
  67. raise ValueError(f"Output path is not a directory: {output_dir}")
  68. # Open the mbox file
  69. mbox_obj: mailbox.mbox | None = None
  70. mbox_iter: Iterable[mailbox.mboxMessage]
  71. if isinstance(input_file, (str, bytes)):
  72. if isinstance(input_file, bytes):
  73. input_file = input_file.decode("utf-8")
  74. mbox_obj = mailbox.mbox(input_file)
  75. mbox_iter = mbox_obj
  76. else:
  77. # For file-like objects, we need to read and parse manually
  78. mbox_iter = _parse_mbox_from_file(input_file)
  79. try:
  80. output_files = []
  81. msg_number = start_number
  82. for message in mbox_iter:
  83. # Format the output filename with the specified precision
  84. output_filename = f"{msg_number:0{precision}d}"
  85. output_file_path = output_path / output_filename
  86. # Write the message to the output file
  87. with open(output_file_path, "wb") as f:
  88. message_bytes = bytes(message)
  89. # Handle mboxrd format - reverse the escaping
  90. if mboxrd:
  91. message_bytes = _reverse_mboxrd_escaping(message_bytes)
  92. # Handle CR/LF if needed
  93. if not keep_cr:
  94. message_bytes = message_bytes.replace(b"\r\n", b"\n")
  95. # Strip trailing newlines (mailbox module adds separator newlines)
  96. message_bytes = message_bytes.rstrip(b"\n")
  97. if message_bytes:
  98. message_bytes += b"\n"
  99. f.write(message_bytes)
  100. output_files.append(str(output_file_path))
  101. msg_number += 1
  102. return output_files
  103. finally:
  104. if mbox_obj is not None:
  105. mbox_obj.close()
  106. def split_maildir(
  107. maildir_path: str | bytes | Path,
  108. output_dir: str | bytes | Path,
  109. start_number: int = 1,
  110. precision: int = 4,
  111. keep_cr: bool = False,
  112. ) -> list[str]:
  113. r"""Split a Maildir into individual message files.
  114. Maildir splitting relies upon filenames being sorted to output
  115. patches in the correct order.
  116. Args:
  117. maildir_path: Path to the Maildir directory (should contain cur, tmp, new subdirectories)
  118. output_dir: Directory where individual messages will be written
  119. start_number: Starting number for output files (default: 1)
  120. precision: Number of digits for output filenames (default: 4)
  121. keep_cr: If True, preserve \r in lines ending with \r\n (default: False)
  122. Returns:
  123. List of output file paths that were created
  124. Raises:
  125. ValueError: If maildir_path or output_dir don't exist or aren't valid
  126. OSError: If there are issues reading/writing files
  127. """
  128. # Convert paths to Path objects
  129. if isinstance(maildir_path, bytes):
  130. maildir_path = maildir_path.decode("utf-8")
  131. if isinstance(output_dir, bytes):
  132. output_dir = output_dir.decode("utf-8")
  133. maildir = Path(maildir_path)
  134. output_path = Path(output_dir)
  135. if not maildir.exists():
  136. raise ValueError(f"Maildir does not exist: {maildir_path}")
  137. if not maildir.is_dir():
  138. raise ValueError(f"Maildir path is not a directory: {maildir_path}")
  139. if not output_path.exists():
  140. raise ValueError(f"Output directory does not exist: {output_dir}")
  141. if not output_path.is_dir():
  142. raise ValueError(f"Output path is not a directory: {output_dir}")
  143. # Open the Maildir
  144. md = mailbox.Maildir(str(maildir), factory=None)
  145. try:
  146. # Get all messages and sort by their keys to ensure consistent ordering
  147. sorted_keys = sorted(md.keys())
  148. output_files = []
  149. msg_number = start_number
  150. for key in sorted_keys:
  151. message = md[key]
  152. # Format the output filename with the specified precision
  153. output_filename = f"{msg_number:0{precision}d}"
  154. output_file_path = output_path / output_filename
  155. # Write the message to the output file
  156. with open(output_file_path, "wb") as f:
  157. message_bytes = bytes(message)
  158. # Handle CR/LF if needed
  159. if not keep_cr:
  160. message_bytes = message_bytes.replace(b"\r\n", b"\n")
  161. f.write(message_bytes)
  162. output_files.append(str(output_file_path))
  163. msg_number += 1
  164. return output_files
  165. finally:
  166. md.close()
  167. def _parse_mbox_from_file(file_obj: BinaryIO) -> Iterator[mailbox.mboxMessage]:
  168. """Parse mbox format from a file-like object.
  169. Args:
  170. file_obj: Binary file-like object containing mbox data
  171. Yields:
  172. Individual mboxMessage objects
  173. """
  174. import tempfile
  175. # Create a temporary file to hold the mbox data
  176. with tempfile.NamedTemporaryFile(mode="wb", delete=False) as tmp:
  177. tmp.write(file_obj.read())
  178. tmp_path = tmp.name
  179. mbox = mailbox.mbox(tmp_path)
  180. try:
  181. yield from mbox
  182. finally:
  183. mbox.close()
  184. os.unlink(tmp_path)
  185. def _reverse_mboxrd_escaping(message_bytes: bytes) -> bytes:
  186. """Reverse mboxrd escaping (^>+From lines).
  187. In mboxrd format, lines matching ^>+From have one leading ">" removed.
  188. Args:
  189. message_bytes: Message content with mboxrd escaping
  190. Returns:
  191. Message content with escaping reversed
  192. """
  193. lines = message_bytes.split(b"\n")
  194. result_lines = []
  195. for line in lines:
  196. # Check if line matches the pattern ^>+From (one or more > followed by From)
  197. if line.startswith(b">") and line.lstrip(b">").startswith(b"From "):
  198. # Remove one leading ">"
  199. result_lines.append(line[1:])
  200. else:
  201. result_lines.append(line)
  202. return b"\n".join(result_lines)
  203. def mailinfo(
  204. input_file: str | bytes | BinaryIO | TextIO,
  205. keep_subject: bool = False,
  206. keep_non_patch: bool = False,
  207. encoding: str | None = None,
  208. scissors: bool = False,
  209. message_id: bool = False,
  210. ) -> "MailinfoResult":
  211. """Extract patch information from an email message.
  212. High-level wrapper around patch.mailinfo() that handles file I/O.
  213. Args:
  214. input_file: Path to email file or file-like object (binary or text)
  215. keep_subject: If True, keep subject intact without munging (-k)
  216. keep_non_patch: If True, only strip [PATCH] from brackets (-b)
  217. encoding: Character encoding to use (default: detect from message)
  218. scissors: If True, remove everything before scissors line
  219. message_id: If True, include Message-ID in commit message (-m)
  220. Returns:
  221. MailinfoResult with parsed information (from patch.mailinfo)
  222. Raises:
  223. ValueError: If message is malformed or missing required fields
  224. OSError: If there are issues reading the file
  225. """
  226. from .patch import mailinfo as patch_mailinfo
  227. # Handle file path input
  228. if isinstance(input_file, (str, bytes)):
  229. if isinstance(input_file, bytes):
  230. input_file = input_file.decode("utf-8")
  231. with open(input_file, "rb") as f:
  232. return patch_mailinfo(
  233. f,
  234. keep_subject=keep_subject,
  235. keep_non_patch=keep_non_patch,
  236. encoding=encoding,
  237. scissors=scissors,
  238. message_id=message_id,
  239. )
  240. # Handle file-like objects
  241. return patch_mailinfo(
  242. input_file,
  243. keep_subject=keep_subject,
  244. keep_non_patch=keep_non_patch,
  245. encoding=encoding,
  246. scissors=scissors,
  247. message_id=message_id,
  248. )