midx.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418
  1. # midx.py -- Multi-Pack-Index (MIDX) support
  2. # Copyright (C) 2025 Jelmer Vernooij <jelmer@jelmer.uk>
  3. #
  4. # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
  5. # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
  6. # General Public License as published by the Free Software Foundation; version 2.0
  7. # or (at your option) any later version. You can redistribute it and/or
  8. # modify it under the terms of either of these two licenses.
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. # You should have received a copy of the licenses; if not, see
  17. # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  18. # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  19. # License, Version 2.0.
  20. #
  21. """Multi-Pack-Index (MIDX) support.
  22. A multi-pack-index (MIDX) provides a single index that covers multiple pack files,
  23. enabling fast object lookup across all packs without opening each pack index.
  24. The MIDX file format consists of:
  25. - A header with signature, version, and hash algorithm
  26. - A chunk lookup table
  27. - Multiple chunks containing pack names, OID fanout, OID lookup, and object offsets
  28. - A trailer with checksum
  29. This module provides:
  30. - Reading MIDX files
  31. - Writing MIDX files
  32. - Integration with pack-based object stores
  33. """
  34. import os
  35. import struct
  36. from collections.abc import Iterator
  37. from typing import IO, Any
  38. from .file import GitFile, _GitFile
  39. # MIDX signature
  40. MIDX_SIGNATURE = b"MIDX"
  41. # MIDX version
  42. MIDX_VERSION = 1
  43. # Chunk identifiers (4 bytes each)
  44. CHUNK_PNAM = b"PNAM" # Packfile names
  45. CHUNK_OIDF = b"OIDF" # OID fanout table
  46. CHUNK_OIDL = b"OIDL" # OID lookup table
  47. CHUNK_OOFF = b"OOFF" # Object offsets
  48. CHUNK_LOFF = b"LOFF" # Large offsets (optional)
  49. CHUNK_BTMP = b"BTMP" # Bitmapped packfiles (optional)
  50. CHUNK_RIDX = b"RIDX" # Reverse index (optional)
  51. # Hash algorithm identifiers
  52. HASH_ALGORITHM_SHA1 = 1
  53. HASH_ALGORITHM_SHA256 = 2
  54. class MultiPackIndex:
  55. """Multi-pack-index for efficient object lookup across multiple pack files."""
  56. def __init__(
  57. self,
  58. filename: str | os.PathLike[str],
  59. file: IO[bytes] | _GitFile | None = None,
  60. contents: bytes | None = None,
  61. size: int | None = None,
  62. ) -> None:
  63. """Initialize a MultiPackIndex.
  64. Args:
  65. filename: Path to the MIDX file
  66. file: Optional file object
  67. contents: Optional mmap'd contents
  68. size: Optional size of the MIDX file
  69. """
  70. self._filename = os.fspath(filename)
  71. self._file = file
  72. self._size = size
  73. # Instance variables that will be set during parsing
  74. self.version: int
  75. self.hash_algorithm: int
  76. self.hash_size: int
  77. self.chunk_count: int
  78. self.base_midx_files: int
  79. self.pack_count: int
  80. self.pack_names: list[str]
  81. self.object_count: int
  82. self._chunks: dict[bytes, int]
  83. self._fanout_table: list[int]
  84. self._oidl_offset: int
  85. self._ooff_offset: int
  86. self._loff_offset: int
  87. # Load file contents
  88. if contents is None:
  89. if file is None:
  90. with GitFile(filename, "rb") as f:
  91. self._contents, self._size = self._load_file_contents(f, size)
  92. else:
  93. self._contents, self._size = self._load_file_contents(file, size)
  94. else:
  95. self._contents = contents
  96. # Parse header
  97. self._parse_header()
  98. # Parse chunk lookup table
  99. self._parse_chunk_table()
  100. def _load_file_contents(
  101. self, f: IO[bytes] | _GitFile, size: int | None = None
  102. ) -> tuple[bytes | Any, int]:
  103. """Load contents from a file, preferring mmap when possible.
  104. Args:
  105. f: File-like object to load
  106. size: Expected size, or None to determine from file
  107. Returns:
  108. Tuple of (contents, size)
  109. """
  110. # Simplified version - similar to pack.py's _load_file_contents
  111. if size is None:
  112. f.seek(0, 2) # SEEK_END
  113. size = f.tell()
  114. f.seek(0)
  115. # For now, just read the entire file into memory
  116. # TODO: Add mmap support for large files
  117. contents = f.read(size)
  118. return contents, size
  119. def _parse_header(self) -> None:
  120. """Parse the MIDX header."""
  121. if len(self._contents) < 12:
  122. raise ValueError("MIDX file too small")
  123. # Check signature
  124. signature = self._contents[0:4]
  125. if signature != MIDX_SIGNATURE:
  126. raise ValueError(f"Invalid MIDX signature: {signature!r}")
  127. # Read version
  128. self.version = self._contents[4]
  129. if self.version != MIDX_VERSION:
  130. raise ValueError(f"Unsupported MIDX version: {self.version}")
  131. # Read object ID version (hash algorithm)
  132. self.hash_algorithm = self._contents[5]
  133. if self.hash_algorithm == HASH_ALGORITHM_SHA1:
  134. self.hash_size = 20
  135. elif self.hash_algorithm == HASH_ALGORITHM_SHA256:
  136. self.hash_size = 32
  137. else:
  138. raise ValueError(f"Unknown hash algorithm: {self.hash_algorithm}")
  139. # Read chunk count
  140. self.chunk_count = self._contents[6]
  141. # Read base MIDX files count (currently always 0)
  142. self.base_midx_files = self._contents[7]
  143. if self.base_midx_files != 0:
  144. raise ValueError("Incremental MIDX not yet supported")
  145. # Read pack file count
  146. (self.pack_count,) = struct.unpack(">L", self._contents[8:12])
  147. def _parse_chunk_table(self) -> None:
  148. """Parse the chunk lookup table."""
  149. self._chunks = {}
  150. # Chunk table starts at offset 12
  151. offset = 12
  152. # Each chunk entry is 12 bytes (4-byte ID + 8-byte offset)
  153. for i in range(self.chunk_count + 1): # +1 for terminator
  154. chunk_id = self._contents[offset : offset + 4]
  155. (chunk_offset,) = struct.unpack(
  156. ">Q", self._contents[offset + 4 : offset + 12]
  157. )
  158. if chunk_id == b"\x00\x00\x00\x00":
  159. # Terminator entry
  160. break
  161. self._chunks[chunk_id] = chunk_offset
  162. offset += 12
  163. # Parse required chunks
  164. self._parse_pnam_chunk()
  165. self._parse_oidf_chunk()
  166. self._parse_oidl_chunk()
  167. self._parse_ooff_chunk()
  168. # Parse optional chunks
  169. if CHUNK_LOFF in self._chunks:
  170. self._parse_loff_chunk()
  171. def _parse_pnam_chunk(self) -> None:
  172. """Parse the Packfile Names (PNAM) chunk."""
  173. if CHUNK_PNAM not in self._chunks:
  174. raise ValueError("Required PNAM chunk not found")
  175. offset = self._chunks[CHUNK_PNAM]
  176. self.pack_names = []
  177. # Find the end of the PNAM chunk (next chunk or end of chunks section)
  178. next_offset = min(
  179. (o for o in self._chunks.values() if o > offset),
  180. default=len(self._contents),
  181. )
  182. # Parse null-terminated pack names
  183. current = offset
  184. while current < next_offset:
  185. # Find the next null terminator
  186. null_pos = self._contents.find(b"\x00", current, next_offset)
  187. if null_pos == -1:
  188. break
  189. pack_name = self._contents[current:null_pos].decode("utf-8")
  190. if pack_name: # Skip empty strings (padding)
  191. self.pack_names.append(pack_name)
  192. current = null_pos + 1
  193. def _parse_oidf_chunk(self) -> None:
  194. """Parse the OID Fanout (OIDF) chunk."""
  195. if CHUNK_OIDF not in self._chunks:
  196. raise ValueError("Required OIDF chunk not found")
  197. offset = self._chunks[CHUNK_OIDF]
  198. self._fanout_table = []
  199. # Read 256 4-byte entries
  200. for i in range(256):
  201. (count,) = struct.unpack(
  202. ">L", self._contents[offset + i * 4 : offset + i * 4 + 4]
  203. )
  204. self._fanout_table.append(count)
  205. # Total object count is the last entry
  206. self.object_count = self._fanout_table[255]
  207. def _parse_oidl_chunk(self) -> None:
  208. """Parse the OID Lookup (OIDL) chunk."""
  209. if CHUNK_OIDL not in self._chunks:
  210. raise ValueError("Required OIDL chunk not found")
  211. self._oidl_offset = self._chunks[CHUNK_OIDL]
  212. def _parse_ooff_chunk(self) -> None:
  213. """Parse the Object Offsets (OOFF) chunk."""
  214. if CHUNK_OOFF not in self._chunks:
  215. raise ValueError("Required OOFF chunk not found")
  216. self._ooff_offset = self._chunks[CHUNK_OOFF]
  217. def _parse_loff_chunk(self) -> None:
  218. """Parse the Large Offsets (LOFF) chunk."""
  219. self._loff_offset = self._chunks[CHUNK_LOFF]
  220. def __len__(self) -> int:
  221. """Return the number of objects in this MIDX."""
  222. return self.object_count
  223. def _get_oid(self, index: int) -> bytes:
  224. """Get the object ID at the given index.
  225. Args:
  226. index: Index of the object
  227. Returns:
  228. Binary object ID
  229. """
  230. if index < 0 or index >= self.object_count:
  231. raise IndexError(f"Index {index} out of range")
  232. offset = self._oidl_offset + index * self.hash_size
  233. return self._contents[offset : offset + self.hash_size]
  234. def _get_pack_info(self, index: int) -> tuple[int, int]:
  235. """Get pack ID and offset for object at the given index.
  236. Args:
  237. index: Index of the object
  238. Returns:
  239. Tuple of (pack_id, offset)
  240. """
  241. if index < 0 or index >= self.object_count:
  242. raise IndexError(f"Index {index} out of range")
  243. # Each entry is 8 bytes (4-byte pack ID + 4-byte offset)
  244. offset = self._ooff_offset + index * 8
  245. (pack_id,) = struct.unpack(">L", self._contents[offset : offset + 4])
  246. (pack_offset,) = struct.unpack(">L", self._contents[offset + 4 : offset + 8])
  247. # Check if this is a large offset (MSB set)
  248. if pack_offset & 0x80000000:
  249. # Look up in LOFF chunk
  250. if CHUNK_LOFF not in self._chunks:
  251. raise ValueError("Large offset found but no LOFF chunk")
  252. large_index = pack_offset & 0x7FFFFFFF
  253. large_offset_pos = self._loff_offset + large_index * 8
  254. (pack_offset,) = struct.unpack(
  255. ">Q", self._contents[large_offset_pos : large_offset_pos + 8]
  256. )
  257. return pack_id, pack_offset
  258. def object_offset(self, sha: bytes) -> tuple[str, int] | None:
  259. """Return the pack name and offset for the given object.
  260. Args:
  261. sha: Binary SHA-1 or SHA-256 hash
  262. Returns:
  263. Tuple of (pack_name, offset) or None if not found
  264. """
  265. if len(sha) != self.hash_size:
  266. raise ValueError(
  267. f"SHA size mismatch: expected {self.hash_size}, got {len(sha)}"
  268. )
  269. # Use fanout table to narrow search range
  270. first_byte = sha[0]
  271. start_idx = 0 if first_byte == 0 else self._fanout_table[first_byte - 1]
  272. end_idx = self._fanout_table[first_byte]
  273. # Binary search within the range
  274. while start_idx < end_idx:
  275. mid = (start_idx + end_idx) // 2
  276. mid_sha = self._get_oid(mid)
  277. if mid_sha == sha:
  278. # Found it!
  279. pack_id, offset = self._get_pack_info(mid)
  280. return self.pack_names[pack_id], offset
  281. elif mid_sha < sha:
  282. start_idx = mid + 1
  283. else:
  284. end_idx = mid
  285. return None
  286. def __contains__(self, sha: bytes) -> bool:
  287. """Check if the given object SHA is in this MIDX.
  288. Args:
  289. sha: Binary SHA hash
  290. Returns:
  291. True if the object is in this MIDX
  292. """
  293. return self.object_offset(sha) is not None
  294. def iterentries(self) -> Iterator[tuple[bytes, str, int]]:
  295. """Iterate over all entries in this MIDX.
  296. Yields:
  297. Tuples of (sha, pack_name, offset)
  298. """
  299. for i in range(self.object_count):
  300. sha = self._get_oid(i)
  301. pack_id, offset = self._get_pack_info(i)
  302. pack_name = self.pack_names[pack_id]
  303. yield sha, pack_name, offset
  304. def close(self) -> None:
  305. """Close the MIDX file."""
  306. if self._file is not None:
  307. self._file.close()
  308. self._file = None
  309. def load_midx(path: str | os.PathLike[str]) -> MultiPackIndex:
  310. """Load a multi-pack-index file by path.
  311. Args:
  312. path: Path to the MIDX file
  313. Returns:
  314. A MultiPackIndex loaded from the given path
  315. """
  316. with GitFile(path, "rb") as f:
  317. return load_midx_file(path, f)
  318. def load_midx_file(
  319. path: str | os.PathLike[str], f: IO[bytes] | _GitFile
  320. ) -> MultiPackIndex:
  321. """Load a multi-pack-index from a file-like object.
  322. Args:
  323. path: Path for the MIDX file
  324. f: File-like object
  325. Returns:
  326. A MultiPackIndex loaded from the given file
  327. """
  328. return MultiPackIndex(path, file=f)
  329. # TODO: Implement MIDX writing functionality
  330. # TODO: Implement integration with object_store.py
  331. # TODO: Add support for incremental MIDX chains
  332. # TODO: Add support for BTMP and RIDX chunks for bitmap integration