midx.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643
  1. # midx.py -- Multi-Pack-Index (MIDX) support
  2. # Copyright (C) 2025 Jelmer Vernooij <jelmer@jelmer.uk>
  3. #
  4. # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
  5. # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
  6. # General Public License as published by the Free Software Foundation; version 2.0
  7. # or (at your option) any later version. You can redistribute it and/or
  8. # modify it under the terms of either of these two licenses.
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. # You should have received a copy of the licenses; if not, see
  17. # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  18. # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  19. # License, Version 2.0.
  20. #
  21. """Multi-Pack-Index (MIDX) support.
  22. A multi-pack-index (MIDX) provides a single index that covers multiple pack files,
  23. enabling fast object lookup across all packs without opening each pack index.
  24. The MIDX file format consists of:
  25. - A header with signature, version, and hash algorithm
  26. - A chunk lookup table
  27. - Multiple chunks containing pack names, OID fanout, OID lookup, and object offsets
  28. - A trailer with checksum
  29. This module provides:
  30. - Reading MIDX files
  31. - Writing MIDX files
  32. - Integration with pack-based object stores
  33. Limitations:
  34. - Incremental MIDX chains are not yet supported (base_midx_files must be 0)
  35. - BTMP (bitmapped packfiles) chunk is not yet implemented
  36. - RIDX (reverse index) chunk is not yet implemented
  37. Note: Incremental MIDX chains were introduced in Git 2.47 as an experimental
  38. feature, where multiple MIDX files can be chained together. The format includes
  39. a base_midx_files field in the header and uses a multi-pack-index.d/ directory
  40. with a multi-pack-index-chain file. This feature is not yet supported by Dulwich
  41. as the specification is still evolving.
  42. """
  43. import os
  44. import struct
  45. from collections.abc import Iterator
  46. from io import UnsupportedOperation
  47. from typing import IO, Any
  48. try:
  49. import mmap
  50. except ImportError:
  51. has_mmap = False
  52. else:
  53. has_mmap = True
  54. from .file import GitFile, _GitFile
  55. from .pack import SHA1Writer
# MIDX file signature: the first 4 bytes of every multi-pack-index file.
MIDX_SIGNATURE = b"MIDX"
# MIDX format version; only version 1 is defined and supported here.
MIDX_VERSION = 1
# Chunk identifiers (4 bytes each) from Git's multi-pack-index format.
CHUNK_PNAM = b"PNAM"  # Packfile names (required)
CHUNK_OIDF = b"OIDF"  # OID fanout table (required)
CHUNK_OIDL = b"OIDL"  # OID lookup table (required)
CHUNK_OOFF = b"OOFF"  # Object offsets (required)
CHUNK_LOFF = b"LOFF"  # Large offsets (optional; used for offsets >= 2**31)
CHUNK_BTMP = b"BTMP"  # Bitmapped packfiles (optional; not implemented here)
CHUNK_RIDX = b"RIDX"  # Reverse index (optional; not implemented here)
# Hash algorithm identifiers (the header's "object id version" byte).
HASH_ALGORITHM_SHA1 = 1
HASH_ALGORITHM_SHA256 = 2
  71. class MultiPackIndex:
  72. """Multi-pack-index for efficient object lookup across multiple pack files."""
  73. def __init__(
  74. self,
  75. filename: str | os.PathLike[str],
  76. file: IO[bytes] | _GitFile | None = None,
  77. contents: bytes | None = None,
  78. size: int | None = None,
  79. ) -> None:
  80. """Initialize a MultiPackIndex.
  81. Args:
  82. filename: Path to the MIDX file
  83. file: Optional file object
  84. contents: Optional mmap'd contents
  85. size: Optional size of the MIDX file
  86. """
  87. self._filename = os.fspath(filename)
  88. self._file = file
  89. self._size = size
  90. # Instance variables that will be set during parsing
  91. self.version: int
  92. self.hash_algorithm: int
  93. self.hash_size: int
  94. self.chunk_count: int
  95. self.base_midx_files: int
  96. self.pack_count: int
  97. self.pack_names: list[str]
  98. self.object_count: int
  99. self._chunks: dict[bytes, int]
  100. self._fanout_table: list[int]
  101. self._oidl_offset: int
  102. self._ooff_offset: int
  103. self._loff_offset: int
  104. # Load file contents
  105. if contents is None:
  106. if file is None:
  107. with GitFile(filename, "rb") as f:
  108. self._contents, self._size = self._load_file_contents(f, size)
  109. else:
  110. self._contents, self._size = self._load_file_contents(file, size)
  111. else:
  112. self._contents = contents
  113. # Parse header
  114. self._parse_header()
  115. # Parse chunk lookup table
  116. self._parse_chunk_table()
  117. def _load_file_contents(
  118. self, f: IO[bytes] | _GitFile, size: int | None = None
  119. ) -> tuple[bytes | Any, int]:
  120. """Load contents from a file, preferring mmap when possible.
  121. Args:
  122. f: File-like object to load
  123. size: Expected size, or None to determine from file
  124. Returns:
  125. Tuple of (contents, size)
  126. """
  127. try:
  128. fd = f.fileno()
  129. except (UnsupportedOperation, AttributeError):
  130. fd = None
  131. # Attempt to use mmap if possible
  132. if fd is not None:
  133. if size is None:
  134. size = os.fstat(fd).st_size
  135. if has_mmap:
  136. try:
  137. contents = mmap.mmap(fd, size, access=mmap.ACCESS_READ)
  138. except (OSError, ValueError):
  139. # Can't mmap - perhaps a socket or invalid file descriptor
  140. pass
  141. else:
  142. return contents, size
  143. # Fall back to reading entire file into memory
  144. contents_bytes = f.read()
  145. size = len(contents_bytes)
  146. return contents_bytes, size
  147. def _parse_header(self) -> None:
  148. """Parse the MIDX header."""
  149. if len(self._contents) < 12:
  150. raise ValueError("MIDX file too small")
  151. # Check signature
  152. signature = self._contents[0:4]
  153. if signature != MIDX_SIGNATURE:
  154. raise ValueError(f"Invalid MIDX signature: {signature!r}")
  155. # Read version
  156. self.version = self._contents[4]
  157. if self.version != MIDX_VERSION:
  158. raise ValueError(f"Unsupported MIDX version: {self.version}")
  159. # Read object ID version (hash algorithm)
  160. self.hash_algorithm = self._contents[5]
  161. if self.hash_algorithm == HASH_ALGORITHM_SHA1:
  162. self.hash_size = 20
  163. elif self.hash_algorithm == HASH_ALGORITHM_SHA256:
  164. self.hash_size = 32
  165. else:
  166. raise ValueError(f"Unknown hash algorithm: {self.hash_algorithm}")
  167. # Read chunk count
  168. self.chunk_count = self._contents[6]
  169. # Read base MIDX files count (currently always 0)
  170. self.base_midx_files = self._contents[7]
  171. if self.base_midx_files != 0:
  172. raise ValueError("Incremental MIDX not yet supported")
  173. # Read pack file count
  174. (self.pack_count,) = struct.unpack(">L", self._contents[8:12])
  175. def _parse_chunk_table(self) -> None:
  176. """Parse the chunk lookup table."""
  177. self._chunks = {}
  178. # Chunk table starts at offset 12
  179. offset = 12
  180. # Each chunk entry is 12 bytes (4-byte ID + 8-byte offset)
  181. for i in range(self.chunk_count + 1): # +1 for terminator
  182. chunk_id = self._contents[offset : offset + 4]
  183. (chunk_offset,) = struct.unpack(
  184. ">Q", self._contents[offset + 4 : offset + 12]
  185. )
  186. if chunk_id == b"\x00\x00\x00\x00":
  187. # Terminator entry
  188. break
  189. self._chunks[chunk_id] = chunk_offset
  190. offset += 12
  191. # Parse required chunks
  192. self._parse_pnam_chunk()
  193. self._parse_oidf_chunk()
  194. self._parse_oidl_chunk()
  195. self._parse_ooff_chunk()
  196. # Parse optional chunks
  197. if CHUNK_LOFF in self._chunks:
  198. self._parse_loff_chunk()
  199. def _parse_pnam_chunk(self) -> None:
  200. """Parse the Packfile Names (PNAM) chunk."""
  201. if CHUNK_PNAM not in self._chunks:
  202. raise ValueError("Required PNAM chunk not found")
  203. offset = self._chunks[CHUNK_PNAM]
  204. self.pack_names = []
  205. # Find the end of the PNAM chunk (next chunk or end of chunks section)
  206. next_offset = min(
  207. (o for o in self._chunks.values() if o > offset),
  208. default=len(self._contents),
  209. )
  210. # Parse null-terminated pack names
  211. current = offset
  212. while current < next_offset:
  213. # Find the next null terminator
  214. null_pos = self._contents.find(b"\x00", current, next_offset)
  215. if null_pos == -1:
  216. break
  217. pack_name = self._contents[current:null_pos].decode("utf-8")
  218. if pack_name: # Skip empty strings (padding)
  219. self.pack_names.append(pack_name)
  220. current = null_pos + 1
  221. def _parse_oidf_chunk(self) -> None:
  222. """Parse the OID Fanout (OIDF) chunk."""
  223. if CHUNK_OIDF not in self._chunks:
  224. raise ValueError("Required OIDF chunk not found")
  225. offset = self._chunks[CHUNK_OIDF]
  226. self._fanout_table = []
  227. # Read 256 4-byte entries
  228. for i in range(256):
  229. (count,) = struct.unpack(
  230. ">L", self._contents[offset + i * 4 : offset + i * 4 + 4]
  231. )
  232. self._fanout_table.append(count)
  233. # Total object count is the last entry
  234. self.object_count = self._fanout_table[255]
  235. def _parse_oidl_chunk(self) -> None:
  236. """Parse the OID Lookup (OIDL) chunk."""
  237. if CHUNK_OIDL not in self._chunks:
  238. raise ValueError("Required OIDL chunk not found")
  239. self._oidl_offset = self._chunks[CHUNK_OIDL]
  240. def _parse_ooff_chunk(self) -> None:
  241. """Parse the Object Offsets (OOFF) chunk."""
  242. if CHUNK_OOFF not in self._chunks:
  243. raise ValueError("Required OOFF chunk not found")
  244. self._ooff_offset = self._chunks[CHUNK_OOFF]
  245. def _parse_loff_chunk(self) -> None:
  246. """Parse the Large Offsets (LOFF) chunk."""
  247. self._loff_offset = self._chunks[CHUNK_LOFF]
  248. def __len__(self) -> int:
  249. """Return the number of objects in this MIDX."""
  250. return self.object_count
  251. def _get_oid(self, index: int) -> bytes:
  252. """Get the object ID at the given index.
  253. Args:
  254. index: Index of the object
  255. Returns:
  256. Binary object ID
  257. """
  258. if index < 0 or index >= self.object_count:
  259. raise IndexError(f"Index {index} out of range")
  260. offset = self._oidl_offset + index * self.hash_size
  261. return self._contents[offset : offset + self.hash_size]
  262. def _get_pack_info(self, index: int) -> tuple[int, int]:
  263. """Get pack ID and offset for object at the given index.
  264. Args:
  265. index: Index of the object
  266. Returns:
  267. Tuple of (pack_id, offset)
  268. """
  269. if index < 0 or index >= self.object_count:
  270. raise IndexError(f"Index {index} out of range")
  271. # Each entry is 8 bytes (4-byte pack ID + 4-byte offset)
  272. offset = self._ooff_offset + index * 8
  273. (pack_id,) = struct.unpack(">L", self._contents[offset : offset + 4])
  274. (pack_offset,) = struct.unpack(">L", self._contents[offset + 4 : offset + 8])
  275. # Check if this is a large offset (MSB set)
  276. if pack_offset & 0x80000000:
  277. # Look up in LOFF chunk
  278. if CHUNK_LOFF not in self._chunks:
  279. raise ValueError("Large offset found but no LOFF chunk")
  280. large_index = pack_offset & 0x7FFFFFFF
  281. large_offset_pos = self._loff_offset + large_index * 8
  282. (pack_offset,) = struct.unpack(
  283. ">Q", self._contents[large_offset_pos : large_offset_pos + 8]
  284. )
  285. return pack_id, pack_offset
  286. def object_offset(self, sha: bytes) -> tuple[str, int] | None:
  287. """Return the pack name and offset for the given object.
  288. Args:
  289. sha: Binary SHA-1 or SHA-256 hash
  290. Returns:
  291. Tuple of (pack_name, offset) or None if not found
  292. """
  293. if len(sha) != self.hash_size:
  294. raise ValueError(
  295. f"SHA size mismatch: expected {self.hash_size}, got {len(sha)}"
  296. )
  297. # Use fanout table to narrow search range
  298. first_byte = sha[0]
  299. start_idx = 0 if first_byte == 0 else self._fanout_table[first_byte - 1]
  300. end_idx = self._fanout_table[first_byte]
  301. # Binary search within the range
  302. while start_idx < end_idx:
  303. mid = (start_idx + end_idx) // 2
  304. mid_sha = self._get_oid(mid)
  305. if mid_sha == sha:
  306. # Found it!
  307. pack_id, offset = self._get_pack_info(mid)
  308. return self.pack_names[pack_id], offset
  309. elif mid_sha < sha:
  310. start_idx = mid + 1
  311. else:
  312. end_idx = mid
  313. return None
  314. def __contains__(self, sha: bytes) -> bool:
  315. """Check if the given object SHA is in this MIDX.
  316. Args:
  317. sha: Binary SHA hash
  318. Returns:
  319. True if the object is in this MIDX
  320. """
  321. return self.object_offset(sha) is not None
  322. def iterentries(self) -> Iterator[tuple[bytes, str, int]]:
  323. """Iterate over all entries in this MIDX.
  324. Yields:
  325. Tuples of (sha, pack_name, offset)
  326. """
  327. for i in range(self.object_count):
  328. sha = self._get_oid(i)
  329. pack_id, offset = self._get_pack_info(i)
  330. pack_name = self.pack_names[pack_id]
  331. yield sha, pack_name, offset
  332. def close(self) -> None:
  333. """Close the MIDX file and release mmap resources."""
  334. # Close mmap'd contents first if it's an mmap object
  335. if self._contents is not None and has_mmap:
  336. if isinstance(self._contents, mmap.mmap):
  337. self._contents.close()
  338. self._contents = None
  339. # Close file handle
  340. if self._file is not None:
  341. self._file.close()
  342. self._file = None
  343. def load_midx(path: str | os.PathLike[str]) -> MultiPackIndex:
  344. """Load a multi-pack-index file by path.
  345. Args:
  346. path: Path to the MIDX file
  347. Returns:
  348. A MultiPackIndex loaded from the given path
  349. """
  350. with GitFile(path, "rb") as f:
  351. return load_midx_file(path, f)
  352. def load_midx_file(
  353. path: str | os.PathLike[str], f: IO[bytes] | _GitFile
  354. ) -> MultiPackIndex:
  355. """Load a multi-pack-index from a file-like object.
  356. Args:
  357. path: Path for the MIDX file
  358. f: File-like object
  359. Returns:
  360. A MultiPackIndex loaded from the given file
  361. """
  362. return MultiPackIndex(path, file=f)
  363. def write_midx(
  364. f: IO[bytes],
  365. pack_index_entries: list[tuple[str, list[tuple[bytes, int, int | None]]]],
  366. hash_algorithm: int = HASH_ALGORITHM_SHA1,
  367. ) -> bytes:
  368. """Write a multi-pack-index file.
  369. Args:
  370. f: File-like object to write to
  371. pack_index_entries: List of (pack_name, entries) tuples where entries are
  372. (sha, offset, crc32) tuples, sorted by SHA
  373. hash_algorithm: Hash algorithm to use (1=SHA-1, 2=SHA-256)
  374. Returns:
  375. SHA-1 checksum of the written MIDX file
  376. """
  377. if hash_algorithm == HASH_ALGORITHM_SHA1:
  378. hash_size = 20
  379. elif hash_algorithm == HASH_ALGORITHM_SHA256:
  380. hash_size = 32
  381. else:
  382. raise ValueError(f"Unknown hash algorithm: {hash_algorithm}")
  383. # Wrap file in SHA1Writer to compute checksum
  384. writer = SHA1Writer(f)
  385. # Sort pack entries by pack name (required by Git)
  386. pack_index_entries_sorted = sorted(pack_index_entries, key=lambda x: x[0])
  387. # Collect all objects from all packs
  388. all_objects: list[tuple[bytes, int, int]] = [] # (sha, pack_id, offset)
  389. pack_names: list[str] = []
  390. for pack_id, (pack_name, entries) in enumerate(pack_index_entries_sorted):
  391. pack_names.append(pack_name)
  392. for sha, offset, _crc32 in entries:
  393. all_objects.append((sha, pack_id, offset))
  394. # Sort all objects by SHA
  395. all_objects.sort(key=lambda x: x[0])
  396. # Calculate offsets for chunks
  397. num_packs = len(pack_names)
  398. num_objects = len(all_objects)
  399. # Header: 12 bytes
  400. header_size = 12
  401. # Chunk count: PNAM, OIDF, OIDL, OOFF, and optionally LOFF
  402. # We'll determine if LOFF is needed later
  403. chunk_count = 4 # PNAM, OIDF, OIDL, OOFF
  404. # Check if we need LOFF chunk (for offsets >= 2^31)
  405. need_loff = any(offset >= 2**31 for _sha, _pack_id, offset in all_objects)
  406. if need_loff:
  407. chunk_count += 1
  408. # Chunk table: (chunk_count + 1) * 12 bytes (including terminator)
  409. chunk_table_size = (chunk_count + 1) * 12
  410. # Calculate chunk offsets
  411. current_offset = header_size + chunk_table_size
  412. # PNAM chunk: pack names as null-terminated strings, padded to 4-byte boundary
  413. pnam_data = b"".join(name.encode("utf-8") + b"\x00" for name in pack_names)
  414. # Pad to 4-byte boundary
  415. pnam_padding = (4 - len(pnam_data) % 4) % 4
  416. pnam_data += b"\x00" * pnam_padding
  417. pnam_offset = current_offset
  418. current_offset += len(pnam_data)
  419. # OIDF chunk: 256 * 4 bytes
  420. oidf_offset = current_offset
  421. oidf_size = 256 * 4
  422. current_offset += oidf_size
  423. # OIDL chunk: num_objects * hash_size bytes
  424. oidl_offset = current_offset
  425. oidl_size = num_objects * hash_size
  426. current_offset += oidl_size
  427. # OOFF chunk: num_objects * 8 bytes (4 for pack_id + 4 for offset)
  428. ooff_offset = current_offset
  429. ooff_size = num_objects * 8
  430. current_offset += ooff_size
  431. # LOFF chunk (if needed): variable size
  432. # We'll calculate the exact size when we know how many large offsets we have
  433. loff_offset = current_offset if need_loff else 0
  434. large_offsets: list[int] = []
  435. # Calculate trailer offset (where checksum starts)
  436. # We need to pre-calculate large offset count for accurate trailer offset
  437. if need_loff:
  438. # Count large offsets
  439. large_offset_count = sum(1 for _, _, offset in all_objects if offset >= 2**31)
  440. loff_size = large_offset_count * 8
  441. trailer_offset = current_offset + loff_size
  442. else:
  443. trailer_offset = current_offset
  444. # Write header
  445. writer.write(MIDX_SIGNATURE) # 4 bytes: signature
  446. writer.write(bytes([MIDX_VERSION])) # 1 byte: version
  447. writer.write(bytes([hash_algorithm])) # 1 byte: hash algorithm
  448. writer.write(bytes([chunk_count])) # 1 byte: chunk count
  449. writer.write(bytes([0])) # 1 byte: base MIDX files (always 0)
  450. writer.write(struct.pack(">L", num_packs)) # 4 bytes: pack count
  451. # Write chunk table
  452. chunk_table = [
  453. (CHUNK_PNAM, pnam_offset),
  454. (CHUNK_OIDF, oidf_offset),
  455. (CHUNK_OIDL, oidl_offset),
  456. (CHUNK_OOFF, ooff_offset),
  457. ]
  458. if need_loff:
  459. chunk_table.append((CHUNK_LOFF, loff_offset))
  460. for chunk_id, chunk_offset in chunk_table:
  461. writer.write(chunk_id) # 4 bytes
  462. writer.write(struct.pack(">Q", chunk_offset)) # 8 bytes
  463. # Write terminator (points to where trailer/checksum starts)
  464. writer.write(b"\x00\x00\x00\x00") # 4 bytes
  465. writer.write(struct.pack(">Q", trailer_offset)) # 8 bytes
  466. # Write PNAM chunk
  467. writer.write(pnam_data)
  468. # Write OIDF chunk (fanout table)
  469. fanout: list[int] = [0] * 256
  470. for sha, _pack_id, _offset in all_objects:
  471. first_byte = sha[0]
  472. fanout[first_byte] += 1
  473. # Convert counts to cumulative
  474. cumulative = 0
  475. for i in range(256):
  476. cumulative += fanout[i]
  477. writer.write(struct.pack(">L", cumulative))
  478. # Write OIDL chunk (object IDs)
  479. for sha, _pack_id, _offset in all_objects:
  480. writer.write(sha)
  481. # Write OOFF chunk (pack ID and offset for each object)
  482. for _sha, pack_id, offset in all_objects:
  483. writer.write(struct.pack(">L", pack_id))
  484. if offset >= 2**31:
  485. # Use large offset table
  486. large_offset_index = len(large_offsets)
  487. large_offsets.append(offset)
  488. # Set MSB to indicate large offset
  489. writer.write(struct.pack(">L", 0x80000000 | large_offset_index))
  490. else:
  491. writer.write(struct.pack(">L", offset))
  492. # Write LOFF chunk if needed
  493. if need_loff:
  494. for large_offset in large_offsets:
  495. writer.write(struct.pack(">Q", large_offset))
  496. # Write checksum
  497. return writer.write_sha()
  498. def write_midx_file(
  499. path: str | os.PathLike[str],
  500. pack_index_entries: list[tuple[str, list[tuple[bytes, int, int | None]]]],
  501. hash_algorithm: int = HASH_ALGORITHM_SHA1,
  502. ) -> bytes:
  503. """Write a multi-pack-index file to disk.
  504. Args:
  505. path: Path where to write the MIDX file
  506. pack_index_entries: List of (pack_name, entries) tuples where entries are
  507. (sha, offset, crc32) tuples, sorted by SHA
  508. hash_algorithm: Hash algorithm to use (1=SHA-1, 2=SHA-256)
  509. Returns:
  510. SHA-1 checksum of the written MIDX file
  511. """
  512. with GitFile(path, "wb") as f:
  513. return write_midx(f, pack_index_entries, hash_algorithm)
  514. # TODO: Add support for incremental MIDX chains
  515. # TODO: Add support for BTMP and RIDX chunks for bitmap integration