bitmap.py 36 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191
  1. # bitmap.py -- Packfile bitmap support for git
  2. # Copyright (C) 2025 Jelmer Vernooij <jelmer@jelmer.uk>
  3. #
  4. # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
  5. # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
  6. # General Public License as published by the Free Software Foundation; version 2.0
  7. # or (at your option) any later version. You can redistribute it and/or
  8. # modify it under the terms of either of these two licenses.
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. # You should have received a copy of the licenses; if not, see
  17. # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  18. # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  19. # License, Version 2.0.
  20. #
  21. """Support for Git packfile bitmaps.
  22. Bitmaps store reachability information for packfiles, enabling faster
  23. object counting and enumeration operations without full graph traversal.
  24. The bitmap format uses EWAH (Enhanced Word-Aligned Hybrid) compression
  25. for efficient storage and fast bitwise operations.
  26. """
  27. import os
  28. import struct
  29. from collections import deque
  30. from collections.abc import Callable, Iterable, Iterator
  31. from io import BytesIO
  32. from typing import IO, TYPE_CHECKING, Optional
  33. from .file import GitFile
  34. from .objects import Blob, Commit, Tag, Tree
  35. if TYPE_CHECKING:
  36. from .object_store import BaseObjectStore
  37. from .pack import Pack, PackIndex
  38. # Bitmap file signature
  39. BITMAP_SIGNATURE = b"BITM"
  40. # Bitmap format version
  41. BITMAP_VERSION = 1
  42. # Bitmap flags
  43. BITMAP_OPT_FULL_DAG = 0x1 # Full closure
  44. BITMAP_OPT_HASH_CACHE = 0x4 # Name-hash cache
  45. BITMAP_OPT_LOOKUP_TABLE = 0x10 # Lookup table for random access
  46. BITMAP_OPT_PSEUDO_MERGES = 0x20 # Pseudo-merge bitmaps
  47. DEFAULT_COMMIT_INTERVAL = 100 # Default interval for commit selection
  48. def _encode_ewah_words(words: list[int]) -> list[int]:
  49. """Encode a list of 64-bit words using EWAH run-length compression.
  50. Args:
  51. words: List of 64-bit words to encode
  52. Returns:
  53. List of compressed words (RLWs followed by literals)
  54. """
  55. compressed_words = []
  56. i = 0
  57. while i < len(words):
  58. # Check for runs of all zeros or all ones
  59. if words[i] == 0 or words[i] == 0xFFFFFFFFFFFFFFFF:
  60. # Count consecutive identical words
  61. run_value = words[i]
  62. run_length = 0
  63. while i < len(words) and words[i] == run_value:
  64. run_length += 1
  65. i += 1
  66. # Collect following literal words
  67. literals = []
  68. while i < len(words) and words[i] != 0 and words[i] != 0xFFFFFFFFFFFFFFFF:
  69. literals.append(words[i])
  70. i += 1
  71. if len(literals) >= 0x7FFFFFFF: # Max literal count in RLW
  72. break
  73. # Create RLW with correct bit layout:
  74. # [literal_words(31 bits)][running_len(32 bits)][running_bit(1 bit)]
  75. running_bit = 1 if run_value == 0xFFFFFFFFFFFFFFFF else 0
  76. rlw = (len(literals) << 33) | (run_length << 1) | running_bit
  77. compressed_words.append(rlw)
  78. compressed_words.extend(literals)
  79. else:
  80. # Collect literal words
  81. literals = []
  82. while i < len(words) and words[i] != 0 and words[i] != 0xFFFFFFFFFFFFFFFF:
  83. literals.append(words[i])
  84. i += 1
  85. if len(literals) >= 0x7FFFFFFF: # Max literal count
  86. break
  87. # RLW with no run, just literals
  88. # [literal_words(31 bits)][running_len(32 bits)][running_bit(1 bit)]
  89. rlw = (len(literals) << 33) | (0 << 1) | 0
  90. compressed_words.append(rlw)
  91. compressed_words.extend(literals)
  92. return compressed_words
  93. class EWAHBitmap:
  94. """EWAH (Enhanced Word-Aligned Hybrid) compressed bitmap.
  95. EWAH uses run-length encoding for efficient bitmap storage.
  96. Each bitmap consists of:
  97. - Uncompressed bit count (4 bytes)
  98. - Compressed word count (4 bytes)
  99. - Compressed words (8 bytes each)
  100. - Current RLW position (4 bytes)
  101. Each Run Length Word (RLW) 64-bit layout (LSB to MSB):
  102. - Bit 0: running_bit (1 bit) - value of repeated words (0 or 1)
  103. - Bits 1-32: running_len (32 bits) - count of repeated words
  104. - Bits 33-63: literal_words (31 bits) - count of literal words following this RLW
  105. """
  106. def __init__(self, data: bytes | None = None) -> None:
  107. """Initialize EWAH bitmap.
  108. Args:
  109. data: Optional compressed bitmap data to decode
  110. """
  111. self.bits: set[int] = set()
  112. self.bit_count = 0
  113. if data:
  114. self._decode(data)
  115. def _decode(self, data: bytes) -> None:
  116. """Decode EWAH compressed bitmap data.
  117. Args:
  118. data: Compressed bitmap data (EWAH format with header + words +
  119. RLW position)
  120. """
  121. f = BytesIO(data)
  122. # Read header
  123. bit_count_bytes = f.read(4)
  124. word_count_bytes = f.read(4)
  125. if len(bit_count_bytes) < 4 or len(word_count_bytes) < 4:
  126. return
  127. bit_count = struct.unpack(">I", bit_count_bytes)[0]
  128. word_count = struct.unpack(">I", word_count_bytes)[0]
  129. self.bit_count = bit_count
  130. current_bit = 0
  131. # Read all words first
  132. words = []
  133. for _ in range(word_count):
  134. word_bytes = f.read(8)
  135. if len(word_bytes) < 8:
  136. break
  137. word = struct.unpack(">Q", word_bytes)[0]
  138. words.append(word)
  139. # Process EWAH chunks: RLW followed by literal words
  140. idx = 0
  141. while idx < len(words):
  142. # This is an RLW
  143. # Bit layout: [literal_words(31)][running_len(32)][running_bit(1)]
  144. rlw = words[idx]
  145. running_bit = rlw & 1
  146. running_len = (rlw >> 1) & 0xFFFFFFFF
  147. literal_words = rlw >> 33
  148. idx += 1
  149. # Process running bits
  150. if running_len > 0:
  151. if running_bit == 1:
  152. # Add all bits in the repeated section
  153. for i in range(running_len * 64):
  154. self.bits.add(current_bit + i)
  155. current_bit += running_len * 64
  156. # Process literal words
  157. for _ in range(literal_words):
  158. if idx >= len(words):
  159. break
  160. literal = words[idx]
  161. idx += 1
  162. # Extract set bits from literal word
  163. for i in range(64):
  164. if literal & (1 << i):
  165. self.bits.add(current_bit + i)
  166. current_bit += 64
  167. # Read RLW position (we don't use it currently, but it's part of the format)
  168. f.read(4)
  169. def encode(self) -> bytes:
  170. """Encode bitmap to EWAH compressed format.
  171. Returns:
  172. Compressed bitmap data including header, words, and RLW position
  173. """
  174. if not self.bits:
  175. # Empty bitmap: bit_count=0, word_count=0, rlw_pos=0
  176. return struct.pack(">III", 0, 0, 0)
  177. max_bit = max(self.bits) if self.bits else 0
  178. bit_count = max_bit + 1
  179. word_count = (bit_count + 63) // 64
  180. # Create literal words
  181. words = [0] * word_count
  182. for bit in self.bits:
  183. word_idx = bit // 64
  184. bit_idx = bit % 64
  185. words[word_idx] |= 1 << bit_idx
  186. # Compress using EWAH run-length encoding
  187. compressed_words = _encode_ewah_words(words)
  188. # Build EWAH data
  189. f = BytesIO()
  190. # Header
  191. f.write(struct.pack(">I", bit_count))
  192. f.write(struct.pack(">I", len(compressed_words)))
  193. # Write compressed words
  194. for word in compressed_words:
  195. f.write(struct.pack(">Q", word))
  196. # Write RLW position (position of last RLW in the compressed words)
  197. # For now, we'll use 0 as we don't track this during encoding
  198. # This could be improved in the future if needed
  199. f.write(struct.pack(">I", 0))
  200. return f.getvalue()
  201. def __contains__(self, bit: int) -> bool:
  202. """Check if a bit is set.
  203. Args:
  204. bit: Bit position to check
  205. Returns:
  206. True if bit is set, False otherwise
  207. """
  208. return bit in self.bits
  209. def __len__(self) -> int:
  210. """Return the number of set bits.
  211. Returns:
  212. Count of set bits
  213. """
  214. return len(self.bits)
  215. def __or__(self, other: "EWAHBitmap") -> "EWAHBitmap":
  216. """Bitwise OR operation.
  217. Args:
  218. other: Other bitmap to OR with
  219. Returns:
  220. New bitmap with OR result
  221. """
  222. result = EWAHBitmap()
  223. result.bits = self.bits | other.bits
  224. result.bit_count = max(self.bit_count, other.bit_count)
  225. return result
  226. def __and__(self, other: "EWAHBitmap") -> "EWAHBitmap":
  227. """Bitwise AND operation.
  228. Args:
  229. other: Other bitmap to AND with
  230. Returns:
  231. New bitmap with AND result
  232. """
  233. result = EWAHBitmap()
  234. result.bits = self.bits & other.bits
  235. result.bit_count = max(self.bit_count, other.bit_count)
  236. return result
  237. def __xor__(self, other: "EWAHBitmap") -> "EWAHBitmap":
  238. """Bitwise XOR operation.
  239. Args:
  240. other: Other bitmap to XOR with
  241. Returns:
  242. New bitmap with XOR result
  243. """
  244. result = EWAHBitmap()
  245. result.bits = self.bits ^ other.bits
  246. result.bit_count = max(self.bit_count, other.bit_count)
  247. return result
  248. def add(self, bit: int) -> None:
  249. """Set a bit.
  250. Args:
  251. bit: Bit position to set
  252. """
  253. self.bits.add(bit)
  254. self.bit_count = max(self.bit_count, bit + 1)
  255. class BitmapEntry:
  256. """A single bitmap entry for a commit."""
  257. def __init__(
  258. self,
  259. object_pos: int,
  260. xor_offset: int,
  261. flags: int,
  262. bitmap: EWAHBitmap,
  263. ) -> None:
  264. """Initialize a bitmap entry.
  265. Args:
  266. object_pos: Position of object in pack index
  267. xor_offset: XOR offset for compression (0-160)
  268. flags: Entry flags
  269. bitmap: The EWAH bitmap data
  270. """
  271. self.object_pos = object_pos
  272. self.xor_offset = xor_offset
  273. self.flags = flags
  274. self.bitmap = bitmap
  275. class PackBitmap:
  276. """A pack bitmap index.
  277. Bitmaps store reachability information for commits in a packfile,
  278. allowing fast object enumeration without graph traversal.
  279. """
  280. def __init__(
  281. self,
  282. version: int = BITMAP_VERSION,
  283. flags: int = BITMAP_OPT_FULL_DAG,
  284. ) -> None:
  285. """Initialize a pack bitmap.
  286. Args:
  287. version: Bitmap format version
  288. flags: Bitmap flags
  289. """
  290. self.version = version
  291. self.flags = flags
  292. self.pack_checksum: bytes | None = None
  293. # Type bitmaps for commits, trees, blobs, tags
  294. self.commit_bitmap = EWAHBitmap()
  295. self.tree_bitmap = EWAHBitmap()
  296. self.blob_bitmap = EWAHBitmap()
  297. self.tag_bitmap = EWAHBitmap()
  298. # Bitmap entries indexed by commit SHA
  299. self.entries: dict[bytes, BitmapEntry] = {}
  300. # List of entries in order (for XOR offset resolution)
  301. self.entries_list: list[tuple[bytes, BitmapEntry]] = []
  302. # Optional lookup table for random access
  303. self.lookup_table: list[tuple[int, int, int]] | None = None
  304. # Optional name-hash cache
  305. self.name_hash_cache: list[int] | None = None
  306. def get_bitmap(self, commit_sha: bytes) -> EWAHBitmap | None:
  307. """Get the bitmap for a commit.
  308. Args:
  309. commit_sha: SHA-1 of the commit
  310. Returns:
  311. EWAH bitmap or None if not found
  312. """
  313. entry = self.entries.get(commit_sha)
  314. if entry is None:
  315. return None
  316. # Decompress using XOR if needed
  317. if entry.xor_offset > 0:
  318. # Find the entry at the XOR offset
  319. # The XOR offset tells us how many entries back to look
  320. # We need to find this entry in the ordered list
  321. try:
  322. current_idx = next(
  323. i
  324. for i, (sha, _) in enumerate(self.entries_list)
  325. if sha == commit_sha
  326. )
  327. except StopIteration:
  328. # Entry not found in list, return as-is
  329. return entry.bitmap
  330. # XOR offset is how many positions back to look (max 160)
  331. if current_idx >= entry.xor_offset:
  332. base_sha, _base_entry = self.entries_list[
  333. current_idx - entry.xor_offset
  334. ]
  335. # Get the base bitmap (recursively if it also uses XOR)
  336. base_bitmap = self.get_bitmap(base_sha)
  337. if base_bitmap is not None:
  338. # XOR the current bitmap with the base
  339. return entry.bitmap ^ base_bitmap
  340. return entry.bitmap
  341. def has_commit(self, commit_sha: bytes) -> bool:
  342. """Check if a commit has a bitmap.
  343. Args:
  344. commit_sha: SHA-1 of the commit
  345. Returns:
  346. True if bitmap exists for this commit
  347. """
  348. return commit_sha in self.entries
  349. def iter_commits(self) -> Iterator[bytes]:
  350. """Iterate over all commits with bitmaps.
  351. Returns:
  352. Iterator of commit SHAs
  353. """
  354. return iter(self.entries.keys())
  355. def read_bitmap(
  356. filename: str | os.PathLike[str],
  357. pack_index: Optional["PackIndex"] = None,
  358. ) -> PackBitmap:
  359. """Read a bitmap index file.
  360. Args:
  361. filename: Path to the .bitmap file
  362. pack_index: Optional PackIndex to resolve object positions to SHAs
  363. Returns:
  364. Loaded PackBitmap
  365. Raises:
  366. ValueError: If file format is invalid
  367. ChecksumMismatch: If checksum verification fails
  368. """
  369. with GitFile(filename, "rb") as f:
  370. return read_bitmap_file(f, pack_index=pack_index)
  371. def read_bitmap_file(
  372. f: IO[bytes], pack_index: Optional["PackIndex"] = None
  373. ) -> PackBitmap:
  374. """Read bitmap data from a file object.
  375. Args:
  376. f: File object to read from
  377. pack_index: Optional PackIndex to resolve object positions to SHAs
  378. Returns:
  379. Loaded PackBitmap
  380. Raises:
  381. ValueError: If file format is invalid
  382. """
  383. # Read header
  384. signature = f.read(4)
  385. if signature != BITMAP_SIGNATURE:
  386. raise ValueError(
  387. f"Invalid bitmap signature: {signature!r}, expected {BITMAP_SIGNATURE!r}"
  388. )
  389. version_bytes = f.read(2)
  390. flags_bytes = f.read(2)
  391. if len(version_bytes) < 2 or len(flags_bytes) < 2:
  392. raise ValueError("Incomplete bitmap header")
  393. version = struct.unpack(">H", version_bytes)[0]
  394. flags = struct.unpack(">H", flags_bytes)[0]
  395. if version != BITMAP_VERSION:
  396. raise ValueError(f"Unsupported bitmap version: {version}")
  397. # Read entry count
  398. entry_count_bytes = f.read(4)
  399. if len(entry_count_bytes) < 4:
  400. raise ValueError("Missing entry count")
  401. entry_count = struct.unpack(">I", entry_count_bytes)[0]
  402. # Read pack checksum
  403. pack_checksum = f.read(20)
  404. if len(pack_checksum) < 20:
  405. raise ValueError("Missing pack checksum")
  406. bitmap = PackBitmap(version=version, flags=flags)
  407. bitmap.pack_checksum = pack_checksum
  408. # Read type bitmaps (EWAH bitmaps are self-describing)
  409. for i, type_bitmap in enumerate(
  410. [
  411. bitmap.commit_bitmap,
  412. bitmap.tree_bitmap,
  413. bitmap.blob_bitmap,
  414. bitmap.tag_bitmap,
  415. ]
  416. ):
  417. # EWAH format:
  418. # 4 bytes: bit count
  419. # 4 bytes: word count
  420. # N x 8 bytes: compressed words
  421. # 4 bytes: RLW position
  422. # Read header to determine size
  423. bit_count_bytes = f.read(4)
  424. word_count_bytes = f.read(4)
  425. if len(bit_count_bytes) < 4 or len(word_count_bytes) < 4:
  426. raise ValueError(f"Missing type bitmap {i} header")
  427. word_count = struct.unpack(">I", word_count_bytes)[0]
  428. # Read compressed words
  429. words_data = f.read(word_count * 8)
  430. if len(words_data) < word_count * 8:
  431. raise ValueError(f"Incomplete type bitmap {i} data")
  432. # Read RLW position
  433. rlw_pos_bytes = f.read(4)
  434. if len(rlw_pos_bytes) < 4:
  435. raise ValueError(f"Missing type bitmap {i} RLW position")
  436. # Reconstruct the full EWAH data to pass to _decode
  437. ewah_data = bit_count_bytes + word_count_bytes + words_data + rlw_pos_bytes
  438. type_bitmap._decode(ewah_data)
  439. # Read bitmap entries
  440. for _ in range(entry_count):
  441. # Read object position (4 bytes)
  442. obj_pos_bytes = f.read(4)
  443. if len(obj_pos_bytes) < 4:
  444. raise ValueError("Incomplete bitmap entry")
  445. obj_pos = struct.unpack(">I", obj_pos_bytes)[0]
  446. # Read XOR offset (1 byte)
  447. xor_offset_bytes = f.read(1)
  448. if len(xor_offset_bytes) < 1:
  449. raise ValueError("Missing XOR offset")
  450. xor_offset = xor_offset_bytes[0]
  451. # Read flags (1 byte)
  452. flags_bytes = f.read(1)
  453. if len(flags_bytes) < 1:
  454. raise ValueError("Missing entry flags")
  455. entry_flags = flags_bytes[0]
  456. # Read self-describing EWAH bitmap
  457. # EWAH format: bit_count (4) + word_count (4) + words + rlw_pos (4)
  458. bit_count_bytes = f.read(4)
  459. word_count_bytes = f.read(4)
  460. if len(bit_count_bytes) < 4 or len(word_count_bytes) < 4:
  461. raise ValueError("Incomplete bitmap entry EWAH header")
  462. word_count = struct.unpack(">I", word_count_bytes)[0]
  463. # Read compressed words
  464. words_data = f.read(word_count * 8)
  465. if len(words_data) < word_count * 8:
  466. raise ValueError("Incomplete bitmap entry EWAH words")
  467. # Read RLW position
  468. rlw_pos_bytes = f.read(4)
  469. if len(rlw_pos_bytes) < 4:
  470. raise ValueError("Missing bitmap entry EWAH RLW position")
  471. # Reconstruct full EWAH data
  472. bitmap_data = bit_count_bytes + word_count_bytes + words_data + rlw_pos_bytes
  473. # Create bitmap entry
  474. ewah_bitmap = EWAHBitmap(bitmap_data) if word_count > 0 else EWAHBitmap()
  475. entry = BitmapEntry(
  476. object_pos=obj_pos,
  477. xor_offset=xor_offset,
  478. flags=entry_flags,
  479. bitmap=ewah_bitmap,
  480. )
  481. # Resolve object position to SHA if we have a pack index
  482. if pack_index is not None:
  483. # Get the SHA at the given position in the sorted index
  484. sha = None
  485. for idx, (entry_sha, _offset, _crc32) in enumerate(
  486. pack_index.iterentries()
  487. ):
  488. if idx == obj_pos:
  489. sha = entry_sha
  490. break
  491. if sha is not None:
  492. bitmap.entries[sha] = entry
  493. bitmap.entries_list.append((sha, entry))
  494. else:
  495. # Without pack index, use position as temporary key
  496. temp_key = obj_pos.to_bytes(4, byteorder="big")
  497. bitmap.entries[temp_key] = entry
  498. bitmap.entries_list.append((temp_key, entry))
  499. # Read optional lookup table
  500. if flags & BITMAP_OPT_LOOKUP_TABLE:
  501. # Lookup table contains triplets: (commit_pos, offset, xor_row)
  502. # Number of entries matches the bitmap entry count
  503. lookup_table = []
  504. for _ in range(entry_count):
  505. # Read commit position (4 bytes)
  506. commit_pos_bytes = f.read(4)
  507. if len(commit_pos_bytes) < 4:
  508. break
  509. commit_pos = struct.unpack(">I", commit_pos_bytes)[0]
  510. # Read file offset (8 bytes)
  511. offset_bytes = f.read(8)
  512. if len(offset_bytes) < 8:
  513. break
  514. offset = struct.unpack(">Q", offset_bytes)[0]
  515. # Read XOR row (4 bytes)
  516. xor_row_bytes = f.read(4)
  517. if len(xor_row_bytes) < 4:
  518. break
  519. xor_row = struct.unpack(">I", xor_row_bytes)[0]
  520. lookup_table.append((commit_pos, offset, xor_row))
  521. bitmap.lookup_table = lookup_table
  522. # Read optional name-hash cache
  523. if flags & BITMAP_OPT_HASH_CACHE:
  524. # Name-hash cache contains one 32-bit hash per object in the pack
  525. # The number of hashes depends on the total number of objects
  526. # For now, we'll read what's available
  527. name_hash_cache = []
  528. while True:
  529. hash_bytes = f.read(4)
  530. if len(hash_bytes) < 4:
  531. break
  532. hash_value = struct.unpack(">I", hash_bytes)[0]
  533. name_hash_cache.append(hash_value)
  534. if name_hash_cache:
  535. bitmap.name_hash_cache = name_hash_cache
  536. return bitmap
  537. def write_bitmap(
  538. filename: str | os.PathLike[str],
  539. bitmap: PackBitmap,
  540. ) -> None:
  541. """Write a bitmap index file.
  542. Args:
  543. filename: Path to write the .bitmap file
  544. bitmap: PackBitmap to write
  545. """
  546. with GitFile(filename, "wb") as f:
  547. write_bitmap_file(f, bitmap)
  548. def write_bitmap_file(f: IO[bytes], bitmap: PackBitmap) -> None:
  549. """Write bitmap data to a file object.
  550. Args:
  551. f: File object to write to
  552. bitmap: PackBitmap to write
  553. """
  554. # Write header
  555. f.write(BITMAP_SIGNATURE)
  556. f.write(struct.pack(">H", bitmap.version))
  557. f.write(struct.pack(">H", bitmap.flags))
  558. # Write entry count
  559. f.write(struct.pack(">I", len(bitmap.entries)))
  560. # Write pack checksum
  561. if bitmap.pack_checksum:
  562. f.write(bitmap.pack_checksum)
  563. else:
  564. f.write(b"\x00" * 20)
  565. # Write type bitmaps (self-describing EWAH format, no size prefix needed)
  566. for type_bitmap in [
  567. bitmap.commit_bitmap,
  568. bitmap.tree_bitmap,
  569. bitmap.blob_bitmap,
  570. bitmap.tag_bitmap,
  571. ]:
  572. data = type_bitmap.encode()
  573. f.write(data)
  574. # Write bitmap entries
  575. for _sha, entry in bitmap.entries.items():
  576. # Write object position (4 bytes)
  577. f.write(struct.pack(">I", entry.object_pos))
  578. # Write XOR offset (1 byte)
  579. f.write(bytes([entry.xor_offset]))
  580. # Write flags (1 byte)
  581. f.write(bytes([entry.flags]))
  582. # Write compressed bitmap data (self-describing EWAH format, no size prefix)
  583. bitmap_data = entry.bitmap.encode()
  584. f.write(bitmap_data)
  585. # Write optional lookup table
  586. if bitmap.flags & BITMAP_OPT_LOOKUP_TABLE and bitmap.lookup_table:
  587. for commit_pos, offset, xor_row in bitmap.lookup_table:
  588. f.write(struct.pack(">I", commit_pos)) # 4 bytes
  589. f.write(struct.pack(">Q", offset)) # 8 bytes
  590. f.write(struct.pack(">I", xor_row)) # 4 bytes
  591. # Write optional name-hash cache
  592. if bitmap.flags & BITMAP_OPT_HASH_CACHE and bitmap.name_hash_cache:
  593. for hash_value in bitmap.name_hash_cache:
  594. f.write(struct.pack(">I", hash_value))
  595. def _compute_name_hash(name: bytes) -> int:
  596. """Compute the name hash for a tree entry.
  597. This is the same algorithm Git uses for the name-hash cache.
  598. Args:
  599. name: The name of the tree entry
  600. Returns:
  601. 32-bit hash value
  602. """
  603. hash_value = 0
  604. for byte in name:
  605. hash_value = (hash_value >> 19) | (hash_value << 13)
  606. hash_value += byte
  607. hash_value &= 0xFFFFFFFF
  608. return hash_value
  609. def select_bitmap_commits(
  610. refs: dict[bytes, bytes],
  611. object_store: "BaseObjectStore",
  612. commit_interval: int = DEFAULT_COMMIT_INTERVAL,
  613. ) -> list[bytes]:
  614. """Select commits for bitmap generation.
  615. Uses Git's strategy:
  616. - All branch and tag tips
  617. - Every Nth commit in history
  618. Args:
  619. refs: Dictionary of ref names to commit SHAs
  620. object_store: Object store to read commits from
  621. commit_interval: Include every Nth commit in history
  622. Returns:
  623. List of commit SHAs to create bitmaps for
  624. """
  625. selected = set()
  626. seen = set()
  627. # Start with all refs
  628. ref_commits = set()
  629. for ref_name, sha in refs.items():
  630. try:
  631. obj = object_store[sha]
  632. # Dereference tags to get to commits
  633. while isinstance(obj, Tag):
  634. obj = object_store[obj.object[1]]
  635. if isinstance(obj, Commit):
  636. ref_commits.add(obj.id)
  637. except KeyError:
  638. continue
  639. # Add all ref tips
  640. selected.update(ref_commits)
  641. # Walk the commit graph and select every Nth commit
  642. queue = deque(ref_commits)
  643. commit_count = 0
  644. while queue:
  645. commit_sha = queue.popleft()
  646. if commit_sha in seen:
  647. continue
  648. seen.add(commit_sha)
  649. try:
  650. obj = object_store[commit_sha]
  651. if not isinstance(obj, Commit):
  652. continue
  653. commit_count += 1
  654. if commit_count % commit_interval == 0:
  655. selected.add(commit_sha)
  656. # Add parents to queue
  657. for parent in obj.parents:
  658. if parent not in seen:
  659. queue.append(parent)
  660. except KeyError:
  661. continue
  662. return sorted(selected)
  663. def build_reachability_bitmap(
  664. commit_sha: bytes,
  665. pack_index: "PackIndex",
  666. object_store: "BaseObjectStore",
  667. ) -> EWAHBitmap:
  668. """Build a reachability bitmap for a commit.
  669. The bitmap has a bit set for each object that is reachable from the commit.
  670. The bit position corresponds to the object's position in the pack index.
  671. Args:
  672. commit_sha: The commit to build a bitmap for
  673. pack_index: Pack index to get object positions
  674. object_store: Object store to traverse objects
  675. Returns:
  676. EWAH bitmap with bits set for reachable objects
  677. """
  678. bitmap = EWAHBitmap()
  679. # Create mapping from SHA to position in pack index
  680. sha_to_pos = {}
  681. for pos, (sha, _offset, _crc32) in enumerate(pack_index.iterentries()):
  682. sha_to_pos[sha] = pos
  683. # Traverse all objects reachable from the commit
  684. seen = set()
  685. queue = deque([commit_sha])
  686. while queue:
  687. sha = queue.popleft()
  688. if sha in seen:
  689. continue
  690. seen.add(sha)
  691. # Add this object to the bitmap if it's in the pack
  692. if sha in sha_to_pos:
  693. bitmap.add(sha_to_pos[sha])
  694. # Get the object and traverse its references
  695. try:
  696. obj = object_store[sha]
  697. if isinstance(obj, Commit):
  698. # Add parents and tree
  699. queue.append(obj.tree)
  700. queue.extend(obj.parents)
  701. elif hasattr(obj, "items"):
  702. # Tree object - add all entries
  703. for item in obj.items():
  704. queue.append(item.sha)
  705. except KeyError:
  706. # Object not in store, skip it
  707. continue
  708. return bitmap
  709. def apply_xor_compression(
  710. bitmaps: list[tuple[bytes, EWAHBitmap]],
  711. max_xor_offset: int = 160,
  712. ) -> list[tuple[bytes, EWAHBitmap, int]]:
  713. """Apply XOR compression to bitmaps.
  714. XOR compression stores some bitmaps as XOR differences from previous bitmaps,
  715. reducing storage size when bitmaps are similar.
  716. Args:
  717. bitmaps: List of (commit_sha, bitmap) tuples
  718. max_xor_offset: Maximum offset to search for XOR base (default: 160)
  719. Returns:
  720. List of (commit_sha, bitmap, xor_offset) tuples
  721. """
  722. compressed = []
  723. for i, (sha, bitmap) in enumerate(bitmaps):
  724. best_xor_offset = 0
  725. best_size = len(bitmap.encode())
  726. best_xor_bitmap = bitmap
  727. # Try XORing with previous bitmaps within max_xor_offset
  728. for offset in range(1, min(i + 1, max_xor_offset + 1)):
  729. _prev_sha, prev_bitmap = bitmaps[i - offset]
  730. xor_bitmap = bitmap ^ prev_bitmap
  731. xor_size = len(xor_bitmap.encode())
  732. # Use XOR if it reduces size
  733. if xor_size < best_size:
  734. best_size = xor_size
  735. best_xor_offset = offset
  736. best_xor_bitmap = xor_bitmap
  737. compressed.append((sha, best_xor_bitmap, best_xor_offset))
  738. return compressed
  739. def build_type_bitmaps(
  740. pack_index: "PackIndex",
  741. object_store: "BaseObjectStore",
  742. ) -> tuple[EWAHBitmap, EWAHBitmap, EWAHBitmap, EWAHBitmap]:
  743. """Build type bitmaps for all objects in a pack.
  744. Type bitmaps classify objects by type: commit, tree, blob, or tag.
  745. Args:
  746. pack_index: Pack index to iterate objects
  747. object_store: Object store to read object types
  748. Returns:
  749. Tuple of (commit_bitmap, tree_bitmap, blob_bitmap, tag_bitmap)
  750. """
  751. commit_bitmap = EWAHBitmap()
  752. tree_bitmap = EWAHBitmap()
  753. blob_bitmap = EWAHBitmap()
  754. tag_bitmap = EWAHBitmap()
  755. for pos, (sha, _offset, _crc32) in enumerate(pack_index.iterentries()):
  756. try:
  757. obj = object_store[sha]
  758. except KeyError:
  759. pass
  760. else:
  761. obj_type = obj.type_num
  762. if obj_type == Commit.type_num:
  763. commit_bitmap.add(pos)
  764. elif obj_type == Tree.type_num:
  765. tree_bitmap.add(pos)
  766. elif obj_type == Blob.type_num:
  767. blob_bitmap.add(pos)
  768. elif obj_type == Tag.type_num:
  769. tag_bitmap.add(pos)
  770. return commit_bitmap, tree_bitmap, blob_bitmap, tag_bitmap
  771. def build_name_hash_cache(
  772. pack_index: "PackIndex",
  773. object_store: "BaseObjectStore",
  774. ) -> list[int]:
  775. """Build name-hash cache for all objects in a pack.
  776. The name-hash cache stores a hash of the name for each object,
  777. which can speed up path-based operations.
  778. Args:
  779. pack_index: Pack index to iterate objects
  780. object_store: Object store to read objects
  781. Returns:
  782. List of 32-bit hash values, one per object in the pack
  783. """
  784. name_hashes = []
  785. for _pos, (sha, _offset, _crc32) in enumerate(pack_index.iterentries()):
  786. try:
  787. obj = object_store[sha]
  788. # For tree entries, use the tree entry name
  789. # For commits, use the tree SHA
  790. # For other objects, use the object SHA
  791. if hasattr(obj, "items"):
  792. # Tree object - use the SHA as the name
  793. name_hash = _compute_name_hash(sha)
  794. elif isinstance(obj, Commit):
  795. # Commit - use the tree SHA as the name
  796. name_hash = _compute_name_hash(obj.tree)
  797. else:
  798. # Other objects - use the SHA as the name
  799. name_hash = _compute_name_hash(sha)
  800. name_hashes.append(name_hash)
  801. except KeyError:
  802. # Object not in store, use zero hash
  803. name_hashes.append(0)
  804. return name_hashes
  805. def generate_bitmap(
  806. pack_index: "PackIndex",
  807. object_store: "BaseObjectStore",
  808. refs: dict[bytes, bytes],
  809. pack_checksum: bytes,
  810. include_hash_cache: bool = True,
  811. include_lookup_table: bool = True,
  812. commit_interval: int = DEFAULT_COMMIT_INTERVAL,
  813. progress: Callable[[str], None] | None = None,
  814. ) -> PackBitmap:
  815. """Generate a complete bitmap for a pack.
  816. Args:
  817. pack_index: Pack index for the pack
  818. object_store: Object store to read objects from
  819. refs: Dictionary of ref names to commit SHAs
  820. pack_checksum: SHA-1 checksum of the pack file
  821. include_hash_cache: Whether to include name-hash cache
  822. include_lookup_table: Whether to include lookup table
  823. commit_interval: Include every Nth commit in history
  824. progress: Optional progress reporting callback
  825. Returns:
  826. Complete PackBitmap ready to write to disk
  827. """
  828. if progress:
  829. progress("Selecting commits for bitmap")
  830. # Select commits to create bitmaps for
  831. selected_commits = select_bitmap_commits(refs, object_store, commit_interval)
  832. if progress:
  833. progress(f"Building bitmaps for {len(selected_commits)} commits")
  834. # Build reachability bitmaps for selected commits
  835. commit_bitmaps = []
  836. for i, commit_sha in enumerate(selected_commits):
  837. if progress and i % 10 == 0:
  838. progress(f"Building bitmap {i + 1}/{len(selected_commits)}")
  839. bitmap = build_reachability_bitmap(commit_sha, pack_index, object_store)
  840. commit_bitmaps.append((commit_sha, bitmap))
  841. if progress:
  842. progress("Applying XOR compression")
  843. # Apply XOR compression
  844. compressed_bitmaps = apply_xor_compression(commit_bitmaps)
  845. if progress:
  846. progress("Building type bitmaps")
  847. # Build type bitmaps
  848. commit_type_bitmap, tree_type_bitmap, blob_type_bitmap, tag_type_bitmap = (
  849. build_type_bitmaps(pack_index, object_store)
  850. )
  851. # Create PackBitmap
  852. flags = BITMAP_OPT_FULL_DAG
  853. if include_hash_cache:
  854. flags |= BITMAP_OPT_HASH_CACHE
  855. if include_lookup_table:
  856. flags |= BITMAP_OPT_LOOKUP_TABLE
  857. pack_bitmap = PackBitmap(version=1, flags=flags)
  858. pack_bitmap.pack_checksum = pack_checksum
  859. pack_bitmap.commit_bitmap = commit_type_bitmap
  860. pack_bitmap.tree_bitmap = tree_type_bitmap
  861. pack_bitmap.blob_bitmap = blob_type_bitmap
  862. pack_bitmap.tag_bitmap = tag_type_bitmap
  863. # Create mapping from SHA to position in pack index
  864. sha_to_pos = {}
  865. for pos, (sha, _offset, _crc32) in enumerate(pack_index.iterentries()):
  866. sha_to_pos[sha] = pos
  867. # Add bitmap entries
  868. for commit_sha, xor_bitmap, xor_offset in compressed_bitmaps:
  869. if commit_sha not in sha_to_pos:
  870. continue
  871. entry = BitmapEntry(
  872. object_pos=sha_to_pos[commit_sha],
  873. xor_offset=xor_offset,
  874. flags=0,
  875. bitmap=xor_bitmap,
  876. )
  877. pack_bitmap.entries[commit_sha] = entry
  878. pack_bitmap.entries_list.append((commit_sha, entry))
  879. # Build optional name-hash cache
  880. if include_hash_cache:
  881. if progress:
  882. progress("Building name-hash cache")
  883. pack_bitmap.name_hash_cache = build_name_hash_cache(pack_index, object_store)
  884. # Build optional lookup table
  885. if include_lookup_table:
  886. if progress:
  887. progress("Building lookup table")
  888. # The lookup table is built automatically from the entries
  889. # For now, we'll leave it as None and let the write function handle it
  890. # TODO: Implement lookup table generation if needed
  891. pack_bitmap.lookup_table = None
  892. if progress:
  893. progress("Bitmap generation complete")
  894. return pack_bitmap
  895. def find_commit_bitmaps(
  896. commit_shas: set[bytes], packs: Iterable[Pack]
  897. ) -> dict[bytes, tuple]:
  898. """Find which packs have bitmaps for the given commits.
  899. Args:
  900. commit_shas: Set of commit SHAs to look for
  901. packs: Iterable of Pack objects to search
  902. Returns:
  903. Dict mapping commit SHA to (pack, pack_bitmap, position) tuple
  904. """
  905. result = {}
  906. remaining = set(commit_shas)
  907. for pack in packs:
  908. if not remaining:
  909. break
  910. try:
  911. pack_bitmap = pack.bitmap
  912. if not pack_bitmap:
  913. continue
  914. # Build SHA to position mapping for this pack
  915. sha_to_pos = {}
  916. for pos, (sha, _offset, _crc32) in enumerate(pack.index.iterentries()):
  917. sha_to_pos[sha] = pos
  918. # Check which commits have bitmaps
  919. for commit_sha in list(remaining):
  920. if pack_bitmap.has_commit(commit_sha):
  921. if commit_sha in sha_to_pos:
  922. result[commit_sha] = (pack, pack_bitmap, sha_to_pos)
  923. remaining.remove(commit_sha)
  924. except (FileNotFoundError, ValueError, AttributeError):
  925. # No bitmap or corrupt, skip this pack
  926. continue
  927. return result
  928. def bitmap_to_object_shas(
  929. bitmap: EWAHBitmap,
  930. pack_index: "PackIndex",
  931. type_filter: EWAHBitmap | None = None,
  932. ) -> set[bytes]:
  933. """Convert a bitmap to a set of object SHAs.
  934. Args:
  935. bitmap: The EWAH bitmap with set bits for objects
  936. pack_index: Pack index to map positions to SHAs
  937. type_filter: Optional type bitmap to filter results (e.g., commits only)
  938. Returns:
  939. Set of object SHAs
  940. """
  941. result = set()
  942. for pos, (sha, _offset, _crc32) in enumerate(pack_index.iterentries()):
  943. # Check if this position is in the bitmap
  944. if pos in bitmap:
  945. # Apply type filter if provided
  946. if type_filter is None or pos in type_filter:
  947. result.add(sha)
  948. return result