# midx.py -- Multi-Pack-Index (MIDX) support
# Copyright (C) 2025 Jelmer Vernooij <jelmer@jelmer.uk>
#
# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
# General Public License as published by the Free Software Foundation; version 2.0
# or (at your option) any later version. You can redistribute it and/or
# modify it under the terms of either of these two licenses.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# You should have received a copy of the licenses; if not, see
# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
# License, Version 2.0.
#

"""Multi-Pack-Index (MIDX) support.

A multi-pack-index (MIDX) provides a single index that covers multiple pack files,
enabling fast object lookup across all packs without opening each pack index.

The MIDX file format consists of:

- A header with signature, version, and hash algorithm
- A chunk lookup table
- Multiple chunks containing pack names, OID fanout, OID lookup, and object offsets
- A trailer with checksum

This module provides:

- Reading MIDX files
- Writing MIDX files
- Integration with pack-based object stores

Limitations:

- Incremental MIDX chains are not yet supported (base_midx_files must be 0)
- BTMP (bitmapped packfiles) chunk is not yet implemented
- RIDX (reverse index) chunk is not yet implemented

Note: Incremental MIDX chains were introduced in Git 2.47 as an experimental
feature, where multiple MIDX files can be chained together. The format includes
a base_midx_files field in the header and uses a multi-pack-index.d/ directory
with a multi-pack-index-chain file. This feature is not yet supported by Dulwich
as the specification is still evolving.
"""

import os
import struct
from collections.abc import Iterator
from typing import IO, Any

from .file import GitFile, _GitFile
from .pack import SHA1Writer

# MIDX signature
MIDX_SIGNATURE = b"MIDX"

# MIDX version
MIDX_VERSION = 1

# Chunk identifiers (4 bytes each)
CHUNK_PNAM = b"PNAM"  # Packfile names
CHUNK_OIDF = b"OIDF"  # OID fanout table
CHUNK_OIDL = b"OIDL"  # OID lookup table
CHUNK_OOFF = b"OOFF"  # Object offsets
CHUNK_LOFF = b"LOFF"  # Large offsets (optional)
CHUNK_BTMP = b"BTMP"  # Bitmapped packfiles (optional)
CHUNK_RIDX = b"RIDX"  # Reverse index (optional)

# Hash algorithm identifiers
HASH_ALGORITHM_SHA1 = 1
HASH_ALGORITHM_SHA256 = 2
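
# On-disk layout (version 1), as parsed and written below:
#
#   bytes 0-3    signature b"MIDX"
#   byte 4       format version (1)
#   byte 5       hash algorithm (1 = SHA-1, 2 = SHA-256)
#   byte 6       number of chunks (excluding the terminator entry)
#   byte 7       number of base MIDX files (always 0; chains unsupported)
#   bytes 8-11   number of pack files (big-endian uint32)
#
# A chunk lookup table of (chunk_count + 1) 12-byte entries follows the
# header: a 4-byte chunk ID plus a big-endian uint64 file offset, ending
# with an all-zero terminator entry. The chunks themselves follow the
# table, and the file ends with a checksum trailer.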


class MultiPackIndex:
    """Multi-pack-index for efficient object lookup across multiple pack files."""

    def __init__(
        self,
        filename: str | os.PathLike[str],
        file: IO[bytes] | _GitFile | None = None,
        contents: bytes | None = None,
        size: int | None = None,
    ) -> None:
        """Initialize a MultiPackIndex.

        Args:
          filename: Path to the MIDX file
          file: Optional file object
          contents: Optional mmap'd contents
          size: Optional size of the MIDX file
        """
        self._filename = os.fspath(filename)
        self._file = file
        self._size = size
        # Instance variables that will be set during parsing
        self.version: int
        self.hash_algorithm: int
        self.hash_size: int
        self.chunk_count: int
        self.base_midx_files: int
        self.pack_count: int
        self.pack_names: list[str]
        self.object_count: int
        self._chunks: dict[bytes, int]
        self._fanout_table: list[int]
        self._oidl_offset: int
        self._ooff_offset: int
        self._loff_offset: int
        # Load file contents
        if contents is None:
            if file is None:
                with GitFile(filename, "rb") as f:
                    self._contents, self._size = self._load_file_contents(f, size)
            else:
                self._contents, self._size = self._load_file_contents(file, size)
        else:
            self._contents = contents
        # Parse header
        self._parse_header()
        # Parse chunk lookup table
        self._parse_chunk_table()

    def _load_file_contents(
        self, f: IO[bytes] | _GitFile, size: int | None = None
    ) -> tuple[bytes | Any, int]:
        """Load contents from a file, preferring mmap when possible.

        Args:
          f: File-like object to load
          size: Expected size, or None to determine from file

        Returns:
          Tuple of (contents, size)
        """
        # Simplified version - similar to pack.py's _load_file_contents
        if size is None:
            f.seek(0, 2)  # SEEK_END
            size = f.tell()
            f.seek(0)
        # For now, just read the entire file into memory
        # TODO: Add mmap support for large files
        contents = f.read(size)
        return contents, size

    def _parse_header(self) -> None:
        """Parse the MIDX header."""
        if len(self._contents) < 12:
            raise ValueError("MIDX file too small")
        # Check signature
        signature = self._contents[0:4]
        if signature != MIDX_SIGNATURE:
            raise ValueError(f"Invalid MIDX signature: {signature!r}")
        # Read version
        self.version = self._contents[4]
        if self.version != MIDX_VERSION:
            raise ValueError(f"Unsupported MIDX version: {self.version}")
        # Read object ID version (hash algorithm)
        self.hash_algorithm = self._contents[5]
        if self.hash_algorithm == HASH_ALGORITHM_SHA1:
            self.hash_size = 20
        elif self.hash_algorithm == HASH_ALGORITHM_SHA256:
            self.hash_size = 32
        else:
            raise ValueError(f"Unknown hash algorithm: {self.hash_algorithm}")
        # Read chunk count
        self.chunk_count = self._contents[6]
        # Read base MIDX files count (currently always 0)
        self.base_midx_files = self._contents[7]
        if self.base_midx_files != 0:
            raise ValueError("Incremental MIDX not yet supported")
        # Read pack file count
        (self.pack_count,) = struct.unpack(">L", self._contents[8:12])

    def _parse_chunk_table(self) -> None:
        """Parse the chunk lookup table."""
        self._chunks = {}
        # Chunk table starts at offset 12
        offset = 12
        # Each chunk entry is 12 bytes (4-byte ID + 8-byte offset)
        for _ in range(self.chunk_count + 1):  # +1 for terminator
            chunk_id = self._contents[offset : offset + 4]
            (chunk_offset,) = struct.unpack(
                ">Q", self._contents[offset + 4 : offset + 12]
            )
            if chunk_id == b"\x00\x00\x00\x00":
                # Terminator entry
                break
            self._chunks[chunk_id] = chunk_offset
            offset += 12
        # Parse required chunks
        self._parse_pnam_chunk()
        self._parse_oidf_chunk()
        self._parse_oidl_chunk()
        self._parse_ooff_chunk()
        # Parse optional chunks
        if CHUNK_LOFF in self._chunks:
            self._parse_loff_chunk()

    def _parse_pnam_chunk(self) -> None:
        """Parse the Packfile Names (PNAM) chunk."""
        if CHUNK_PNAM not in self._chunks:
            raise ValueError("Required PNAM chunk not found")
        offset = self._chunks[CHUNK_PNAM]
        self.pack_names = []
        # Find the end of the PNAM chunk (next chunk or end of chunks section)
        next_offset = min(
            (o for o in self._chunks.values() if o > offset),
            default=len(self._contents),
        )
        # Parse null-terminated pack names
        current = offset
        while current < next_offset:
            # Find the next null terminator
            null_pos = self._contents.find(b"\x00", current, next_offset)
            if null_pos == -1:
                break
            pack_name = self._contents[current:null_pos].decode("utf-8")
            if pack_name:  # Skip empty strings (padding)
                self.pack_names.append(pack_name)
            current = null_pos + 1

    def _parse_oidf_chunk(self) -> None:
        """Parse the OID Fanout (OIDF) chunk."""
        if CHUNK_OIDF not in self._chunks:
            raise ValueError("Required OIDF chunk not found")
        offset = self._chunks[CHUNK_OIDF]
        self._fanout_table = []
        # Read 256 4-byte entries
        for i in range(256):
            (count,) = struct.unpack(
                ">L", self._contents[offset + i * 4 : offset + i * 4 + 4]
            )
            self._fanout_table.append(count)
        # Total object count is the last entry
        self.object_count = self._fanout_table[255]
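
    # Fanout semantics: _fanout_table[b] holds the number of objects whose
    # first OID byte is <= b, so objects with first byte exactly b occupy
    # indices [_fanout_table[b - 1], _fanout_table[b]) in the sorted OID
    # lookup chunk; object_offset() binary-searches exactly that range.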

    def _parse_oidl_chunk(self) -> None:
        """Parse the OID Lookup (OIDL) chunk."""
        if CHUNK_OIDL not in self._chunks:
            raise ValueError("Required OIDL chunk not found")
        self._oidl_offset = self._chunks[CHUNK_OIDL]

    def _parse_ooff_chunk(self) -> None:
        """Parse the Object Offsets (OOFF) chunk."""
        if CHUNK_OOFF not in self._chunks:
            raise ValueError("Required OOFF chunk not found")
        self._ooff_offset = self._chunks[CHUNK_OOFF]

    def _parse_loff_chunk(self) -> None:
        """Parse the Large Offsets (LOFF) chunk."""
        self._loff_offset = self._chunks[CHUNK_LOFF]

    def __len__(self) -> int:
        """Return the number of objects in this MIDX."""
        return self.object_count

    def _get_oid(self, index: int) -> bytes:
        """Get the object ID at the given index.

        Args:
          index: Index of the object

        Returns:
          Binary object ID
        """
        if index < 0 or index >= self.object_count:
            raise IndexError(f"Index {index} out of range")
        offset = self._oidl_offset + index * self.hash_size
        return self._contents[offset : offset + self.hash_size]

    def _get_pack_info(self, index: int) -> tuple[int, int]:
        """Get pack ID and offset for object at the given index.

        Args:
          index: Index of the object

        Returns:
          Tuple of (pack_id, offset)
        """
        if index < 0 or index >= self.object_count:
            raise IndexError(f"Index {index} out of range")
        # Each entry is 8 bytes (4-byte pack ID + 4-byte offset)
        offset = self._ooff_offset + index * 8
        (pack_id,) = struct.unpack(">L", self._contents[offset : offset + 4])
        (pack_offset,) = struct.unpack(">L", self._contents[offset + 4 : offset + 8])
        # Check if this is a large offset (MSB set)
        if pack_offset & 0x80000000:
            # Look up in LOFF chunk
            if CHUNK_LOFF not in self._chunks:
                raise ValueError("Large offset found but no LOFF chunk")
            large_index = pack_offset & 0x7FFFFFFF
            large_offset_pos = self._loff_offset + large_index * 8
            (pack_offset,) = struct.unpack(
                ">Q", self._contents[large_offset_pos : large_offset_pos + 8]
            )
        return pack_id, pack_offset
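
    # Offset encoding example: a stored 32-bit value of 0x00001234 is the
    # literal pack offset 0x1234, while 0x80000002 (MSB set) means "read
    # the 64-bit entry at index 2 of the LOFF chunk". Offsets below 2**31
    # are always stored inline; write_midx() below applies the same rule.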

    def object_offset(self, sha: bytes) -> tuple[str, int] | None:
        """Return the pack name and offset for the given object.

        Args:
          sha: Binary SHA-1 or SHA-256 hash

        Returns:
          Tuple of (pack_name, offset) or None if not found
        """
        if len(sha) != self.hash_size:
            raise ValueError(
                f"SHA size mismatch: expected {self.hash_size}, got {len(sha)}"
            )
        # Use fanout table to narrow search range
        first_byte = sha[0]
        start_idx = 0 if first_byte == 0 else self._fanout_table[first_byte - 1]
        end_idx = self._fanout_table[first_byte]
        # Binary search within the range
        while start_idx < end_idx:
            mid = (start_idx + end_idx) // 2
            mid_sha = self._get_oid(mid)
            if mid_sha == sha:
                # Found it!
                pack_id, offset = self._get_pack_info(mid)
                return self.pack_names[pack_id], offset
            elif mid_sha < sha:
                start_idx = mid + 1
            else:
                end_idx = mid
        return None

    def __contains__(self, sha: bytes) -> bool:
        """Check if the given object SHA is in this MIDX.

        Args:
          sha: Binary SHA hash

        Returns:
          True if the object is in this MIDX
        """
        return self.object_offset(sha) is not None

    def iterentries(self) -> Iterator[tuple[bytes, str, int]]:
        """Iterate over all entries in this MIDX.

        Yields:
          Tuples of (sha, pack_name, offset)
        """
        for i in range(self.object_count):
            sha = self._get_oid(i)
            pack_id, offset = self._get_pack_info(i)
            pack_name = self.pack_names[pack_id]
            yield sha, pack_name, offset

    def close(self) -> None:
        """Close the MIDX file."""
        if self._file is not None:
            self._file.close()
            self._file = None


def load_midx(path: str | os.PathLike[str]) -> MultiPackIndex:
    """Load a multi-pack-index file by path.

    Args:
      path: Path to the MIDX file

    Returns:
      A MultiPackIndex loaded from the given path
    """
    with GitFile(path, "rb") as f:
        return load_midx_file(path, f)


def load_midx_file(
    path: str | os.PathLike[str], f: IO[bytes] | _GitFile
) -> MultiPackIndex:
    """Load a multi-pack-index from a file-like object.

    Args:
      path: Path for the MIDX file
      f: File-like object

    Returns:
      A MultiPackIndex loaded from the given file
    """
    return MultiPackIndex(path, file=f)
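
# Example lookup (a sketch; the repository path and hex_sha are hypothetical):
#
#     midx = load_midx("/path/to/repo/.git/objects/pack/multi-pack-index")
#     try:
#         found = midx.object_offset(bytes.fromhex(hex_sha))  # binary SHA
#         if found is not None:
#             pack_name, offset = found
#     finally:
#         midx.close()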


def write_midx(
    f: IO[bytes],
    pack_index_entries: list[tuple[str, list[tuple[bytes, int, int | None]]]],
    hash_algorithm: int = HASH_ALGORITHM_SHA1,
) -> bytes:
    """Write a multi-pack-index file.

    Args:
      f: File-like object to write to
      pack_index_entries: List of (pack_name, entries) tuples where entries are
        (sha, offset, crc32) tuples, sorted by SHA
      hash_algorithm: Hash algorithm to use (1=SHA-1, 2=SHA-256)

    Returns:
      SHA-1 checksum of the written MIDX file
    """
    if hash_algorithm == HASH_ALGORITHM_SHA1:
        hash_size = 20
    elif hash_algorithm == HASH_ALGORITHM_SHA256:
        hash_size = 32
    else:
        raise ValueError(f"Unknown hash algorithm: {hash_algorithm}")
    # Wrap file in SHA1Writer to compute checksum
    writer = SHA1Writer(f)
    # Collect all objects from all packs
    all_objects: list[tuple[bytes, int, int]] = []  # (sha, pack_id, offset)
    pack_names: list[str] = []
    for pack_id, (pack_name, entries) in enumerate(pack_index_entries):
        pack_names.append(pack_name)
        for sha, offset, _crc32 in entries:
            all_objects.append((sha, pack_id, offset))
    # Sort all objects by SHA
    all_objects.sort(key=lambda x: x[0])
    # Calculate offsets for chunks
    num_packs = len(pack_names)
    num_objects = len(all_objects)
    # Header: 12 bytes
    header_size = 12
    # Chunk count: PNAM, OIDF, OIDL, OOFF, and optionally LOFF
    chunk_count = 4  # PNAM, OIDF, OIDL, OOFF
    # Check if we need a LOFF chunk (for offsets >= 2^31)
    need_loff = any(offset >= 2**31 for _sha, _pack_id, offset in all_objects)
    if need_loff:
        chunk_count += 1
    # Chunk table: (chunk_count + 1) * 12 bytes (including terminator)
    chunk_table_size = (chunk_count + 1) * 12
    # Calculate chunk offsets
    current_offset = header_size + chunk_table_size
    # PNAM chunk: pack names as null-terminated strings, padded to 4-byte boundary
    pnam_data = b"".join(name.encode("utf-8") + b"\x00" for name in pack_names)
    # Pad to 4-byte boundary
    pnam_padding = (4 - len(pnam_data) % 4) % 4
    pnam_data += b"\x00" * pnam_padding
    pnam_offset = current_offset
    current_offset += len(pnam_data)
    # OIDF chunk: 256 * 4 bytes
    oidf_offset = current_offset
    oidf_size = 256 * 4
    current_offset += oidf_size
    # OIDL chunk: num_objects * hash_size bytes
    oidl_offset = current_offset
    oidl_size = num_objects * hash_size
    current_offset += oidl_size
    # OOFF chunk: num_objects * 8 bytes (4 for pack_id + 4 for offset)
    ooff_offset = current_offset
    ooff_size = num_objects * 8
    current_offset += ooff_size
    # LOFF chunk (if needed): variable size, populated while writing OOFF
    loff_offset = current_offset if need_loff else 0
    large_offsets: list[int] = []
    # Write header
    writer.write(MIDX_SIGNATURE)  # 4 bytes: signature
    writer.write(bytes([MIDX_VERSION]))  # 1 byte: version
    writer.write(bytes([hash_algorithm]))  # 1 byte: hash algorithm
    writer.write(bytes([chunk_count]))  # 1 byte: chunk count
    writer.write(bytes([0]))  # 1 byte: base MIDX files (always 0)
    writer.write(struct.pack(">L", num_packs))  # 4 bytes: pack count
    # Write chunk table
    chunk_table = [
        (CHUNK_PNAM, pnam_offset),
        (CHUNK_OIDF, oidf_offset),
        (CHUNK_OIDL, oidl_offset),
        (CHUNK_OOFF, ooff_offset),
    ]
    if need_loff:
        chunk_table.append((CHUNK_LOFF, loff_offset))
    for chunk_id, chunk_offset in chunk_table:
        writer.write(chunk_id)  # 4 bytes
        writer.write(struct.pack(">Q", chunk_offset))  # 8 bytes
    # Write terminator
    writer.write(b"\x00\x00\x00\x00")  # 4 bytes
    writer.write(struct.pack(">Q", 0))  # 8 bytes
    # Write PNAM chunk
    writer.write(pnam_data)
    # Write OIDF chunk (fanout table)
    fanout: list[int] = [0] * 256
    for sha, _pack_id, _offset in all_objects:
        first_byte = sha[0]
        fanout[first_byte] += 1
    # Convert counts to cumulative
    cumulative = 0
    for i in range(256):
        cumulative += fanout[i]
        writer.write(struct.pack(">L", cumulative))
    # Write OIDL chunk (object IDs)
    for sha, _pack_id, _offset in all_objects:
        writer.write(sha)
    # Write OOFF chunk (pack ID and offset for each object)
    for _sha, pack_id, offset in all_objects:
        writer.write(struct.pack(">L", pack_id))
        if offset >= 2**31:
            # Use large offset table
            large_offset_index = len(large_offsets)
            large_offsets.append(offset)
            # Set MSB to indicate large offset
            writer.write(struct.pack(">L", 0x80000000 | large_offset_index))
        else:
            writer.write(struct.pack(">L", offset))
    # Write LOFF chunk if needed
    if need_loff:
        for large_offset in large_offsets:
            writer.write(struct.pack(">Q", large_offset))
    # Write checksum
    return writer.write_sha()
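
# Size check for write_midx() (a worked example): with one pack named
# "pack-abc.pack" (13 bytes + NUL = 14, padded to 16) and 10 SHA-1 objects,
# none needing large offsets, the file is 12 (header) + 60 (chunk table with
# terminator) + 16 (PNAM) + 1024 (OIDF) + 200 (OIDL) + 80 (OOFF) + 20
# (checksum trailer) = 1412 bytes.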


def write_midx_file(
    path: str | os.PathLike[str],
    pack_index_entries: list[tuple[str, list[tuple[bytes, int, int | None]]]],
    hash_algorithm: int = HASH_ALGORITHM_SHA1,
) -> bytes:
    """Write a multi-pack-index file to disk.

    Args:
      path: Path where to write the MIDX file
      pack_index_entries: List of (pack_name, entries) tuples where entries are
        (sha, offset, crc32) tuples, sorted by SHA
      hash_algorithm: Hash algorithm to use (1=SHA-1, 2=SHA-256)

    Returns:
      SHA-1 checksum of the written MIDX file
    """
    with GitFile(path, "wb") as f:
        return write_midx(f, pack_index_entries, hash_algorithm)
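
# Example round trip (a sketch; the pack name and entries are hypothetical):
#
#     entries = [(b"\x11" * 20, 12, None), (b"\xee" * 20, 4567, None)]
#     write_midx_file("multi-pack-index", [("pack-deadbeef.pack", entries)])
#     midx = load_midx("multi-pack-index")
#     assert len(midx) == 2
#     assert midx.object_offset(b"\x11" * 20) == ("pack-deadbeef.pack", 12)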

# TODO: Add support for incremental MIDX chains
# TODO: Add support for BTMP and RIDX chunks for bitmap integration