bitmap.py 37 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206
  1. # bitmap.py -- Packfile bitmap support for git
  2. # Copyright (C) 2025 Jelmer Vernooij <jelmer@jelmer.uk>
  3. #
  4. # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
  5. # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
  6. # General Public License as published by the Free Software Foundation; version 2.0
  7. # or (at your option) any later version. You can redistribute it and/or
  8. # modify it under the terms of either of these two licenses.
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. # You should have received a copy of the licenses; if not, see
  17. # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  18. # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  19. # License, Version 2.0.
  20. #
  21. """Support for Git packfile bitmaps.
  22. Bitmaps store reachability information for packfiles, enabling faster
  23. object counting and enumeration operations without full graph traversal.
  24. The bitmap format uses EWAH (Enhanced Word-Aligned Hybrid) compression
  25. for efficient storage and fast bitwise operations.
  26. """
  27. import os
  28. import struct
  29. from collections import deque
  30. from collections.abc import Callable, Iterable, Iterator
  31. from io import BytesIO
  32. from typing import IO, TYPE_CHECKING, Optional
  33. from .file import GitFile
  34. from .objects import Blob, Commit, Tag, Tree
  35. if TYPE_CHECKING:
  36. from .object_store import BaseObjectStore
  37. from .pack import Pack, PackIndex
  38. # Bitmap file signature
  39. BITMAP_SIGNATURE = b"BITM"
  40. # Bitmap format version
  41. BITMAP_VERSION = 1
  42. # Bitmap flags
  43. BITMAP_OPT_FULL_DAG = 0x1 # Full closure
  44. BITMAP_OPT_HASH_CACHE = 0x4 # Name-hash cache
  45. BITMAP_OPT_LOOKUP_TABLE = 0x10 # Lookup table for random access
  46. BITMAP_OPT_PSEUDO_MERGES = 0x20 # Pseudo-merge bitmaps
  47. DEFAULT_COMMIT_INTERVAL = 100 # Default interval for commit selection
  48. def _encode_ewah_words(words: list[int]) -> list[int]:
  49. """Encode a list of 64-bit words using EWAH run-length compression.
  50. Args:
  51. words: List of 64-bit words to encode
  52. Returns:
  53. List of compressed words (RLWs followed by literals)
  54. """
  55. compressed_words = []
  56. i = 0
  57. while i < len(words):
  58. # Check for runs of all zeros or all ones
  59. if words[i] == 0 or words[i] == 0xFFFFFFFFFFFFFFFF:
  60. # Count consecutive identical words
  61. run_value = words[i]
  62. run_length = 0
  63. while i < len(words) and words[i] == run_value:
  64. run_length += 1
  65. i += 1
  66. # Collect following literal words
  67. literals = []
  68. while i < len(words) and words[i] != 0 and words[i] != 0xFFFFFFFFFFFFFFFF:
  69. literals.append(words[i])
  70. i += 1
  71. if len(literals) >= 0x7FFFFFFF: # Max literal count in RLW
  72. break
  73. # Create RLW with correct bit layout:
  74. # [literal_words(31 bits)][running_len(32 bits)][running_bit(1 bit)]
  75. running_bit = 1 if run_value == 0xFFFFFFFFFFFFFFFF else 0
  76. rlw = (len(literals) << 33) | (run_length << 1) | running_bit
  77. compressed_words.append(rlw)
  78. compressed_words.extend(literals)
  79. else:
  80. # Collect literal words
  81. literals = []
  82. while i < len(words) and words[i] != 0 and words[i] != 0xFFFFFFFFFFFFFFFF:
  83. literals.append(words[i])
  84. i += 1
  85. if len(literals) >= 0x7FFFFFFF: # Max literal count
  86. break
  87. # RLW with no run, just literals
  88. # [literal_words(31 bits)][running_len(32 bits)][running_bit(1 bit)]
  89. rlw = (len(literals) << 33) | (0 << 1) | 0
  90. compressed_words.append(rlw)
  91. compressed_words.extend(literals)
  92. return compressed_words
  93. class EWAHBitmap:
  94. """EWAH (Enhanced Word-Aligned Hybrid) compressed bitmap.
  95. EWAH uses run-length encoding for efficient bitmap storage.
  96. Each bitmap consists of:
  97. - Uncompressed bit count (4 bytes)
  98. - Compressed word count (4 bytes)
  99. - Compressed words (8 bytes each)
  100. - Current RLW position (4 bytes)
  101. Each Run Length Word (RLW) 64-bit layout (LSB to MSB):
  102. - Bit 0: running_bit (1 bit) - value of repeated words (0 or 1)
  103. - Bits 1-32: running_len (32 bits) - count of repeated words
  104. - Bits 33-63: literal_words (31 bits) - count of literal words following this RLW
  105. """
  106. def __init__(self, data: bytes | None = None) -> None:
  107. """Initialize EWAH bitmap.
  108. Args:
  109. data: Optional compressed bitmap data to decode
  110. """
  111. self.bits: set[int] = set()
  112. self.bit_count = 0
  113. if data:
  114. self._decode(data)
  115. def _decode(self, data: bytes) -> None:
  116. """Decode EWAH compressed bitmap data.
  117. Args:
  118. data: Compressed bitmap data (EWAH format with header + words +
  119. RLW position)
  120. """
  121. f = BytesIO(data)
  122. # Read header
  123. bit_count_bytes = f.read(4)
  124. word_count_bytes = f.read(4)
  125. if len(bit_count_bytes) < 4 or len(word_count_bytes) < 4:
  126. return
  127. bit_count = struct.unpack(">I", bit_count_bytes)[0]
  128. word_count = struct.unpack(">I", word_count_bytes)[0]
  129. self.bit_count = bit_count
  130. current_bit = 0
  131. # Read all words first
  132. words = []
  133. for _ in range(word_count):
  134. word_bytes = f.read(8)
  135. if len(word_bytes) < 8:
  136. break
  137. word = struct.unpack(">Q", word_bytes)[0]
  138. words.append(word)
  139. # Process EWAH chunks: RLW followed by literal words
  140. idx = 0
  141. while idx < len(words):
  142. # This is an RLW
  143. # Bit layout: [literal_words(31)][running_len(32)][running_bit(1)]
  144. rlw = words[idx]
  145. running_bit = rlw & 1
  146. running_len = (rlw >> 1) & 0xFFFFFFFF
  147. literal_words = rlw >> 33
  148. idx += 1
  149. # Process running bits
  150. if running_len > 0:
  151. if running_bit == 1:
  152. # Add all bits in the repeated section
  153. for i in range(running_len * 64):
  154. self.bits.add(current_bit + i)
  155. current_bit += running_len * 64
  156. # Process literal words
  157. for _ in range(literal_words):
  158. if idx >= len(words):
  159. break
  160. literal = words[idx]
  161. idx += 1
  162. # Extract set bits from literal word
  163. for i in range(64):
  164. if literal & (1 << i):
  165. self.bits.add(current_bit + i)
  166. current_bit += 64
  167. # Read RLW position (we don't use it currently, but it's part of the format)
  168. f.read(4)
  169. def encode(self) -> bytes:
  170. """Encode bitmap to EWAH compressed format.
  171. Returns:
  172. Compressed bitmap data including header, words, and RLW position
  173. """
  174. if not self.bits:
  175. # Empty bitmap: bit_count=0, word_count=0, rlw_pos=0
  176. return struct.pack(">III", 0, 0, 0)
  177. max_bit = max(self.bits) if self.bits else 0
  178. bit_count = max_bit + 1
  179. word_count = (bit_count + 63) // 64
  180. # Create literal words
  181. words = [0] * word_count
  182. for bit in self.bits:
  183. word_idx = bit // 64
  184. bit_idx = bit % 64
  185. words[word_idx] |= 1 << bit_idx
  186. # Compress using EWAH run-length encoding
  187. compressed_words = _encode_ewah_words(words)
  188. # Build EWAH data
  189. f = BytesIO()
  190. # Header
  191. f.write(struct.pack(">I", bit_count))
  192. f.write(struct.pack(">I", len(compressed_words)))
  193. # Write compressed words
  194. for word in compressed_words:
  195. f.write(struct.pack(">Q", word))
  196. # Write RLW position (position of last RLW in the compressed words)
  197. # For now, we'll use 0 as we don't track this during encoding
  198. # This could be improved in the future if needed
  199. f.write(struct.pack(">I", 0))
  200. return f.getvalue()
  201. def __contains__(self, bit: int) -> bool:
  202. """Check if a bit is set.
  203. Args:
  204. bit: Bit position to check
  205. Returns:
  206. True if bit is set, False otherwise
  207. """
  208. return bit in self.bits
  209. def __len__(self) -> int:
  210. """Return the number of set bits.
  211. Returns:
  212. Count of set bits
  213. """
  214. return len(self.bits)
  215. def __or__(self, other: "EWAHBitmap") -> "EWAHBitmap":
  216. """Bitwise OR operation.
  217. Args:
  218. other: Other bitmap to OR with
  219. Returns:
  220. New bitmap with OR result
  221. """
  222. result = EWAHBitmap()
  223. result.bits = self.bits | other.bits
  224. result.bit_count = max(self.bit_count, other.bit_count)
  225. return result
  226. def __and__(self, other: "EWAHBitmap") -> "EWAHBitmap":
  227. """Bitwise AND operation.
  228. Args:
  229. other: Other bitmap to AND with
  230. Returns:
  231. New bitmap with AND result
  232. """
  233. result = EWAHBitmap()
  234. result.bits = self.bits & other.bits
  235. result.bit_count = max(self.bit_count, other.bit_count)
  236. return result
  237. def __xor__(self, other: "EWAHBitmap") -> "EWAHBitmap":
  238. """Bitwise XOR operation.
  239. Args:
  240. other: Other bitmap to XOR with
  241. Returns:
  242. New bitmap with XOR result
  243. """
  244. result = EWAHBitmap()
  245. result.bits = self.bits ^ other.bits
  246. result.bit_count = max(self.bit_count, other.bit_count)
  247. return result
  248. def __sub__(self, other: "EWAHBitmap") -> "EWAHBitmap":
  249. """Bitwise subtraction (set difference).
  250. Returns bits that are in self but not in other.
  251. Equivalent to: self & ~other
  252. Args:
  253. other: Bitmap to subtract
  254. Returns:
  255. New bitmap with bits in self but not in other
  256. """
  257. result = EWAHBitmap()
  258. result.bits = self.bits - other.bits
  259. result.bit_count = self.bit_count
  260. return result
  261. def add(self, bit: int) -> None:
  262. """Set a bit.
  263. Args:
  264. bit: Bit position to set
  265. """
  266. self.bits.add(bit)
  267. self.bit_count = max(self.bit_count, bit + 1)
  268. class BitmapEntry:
  269. """A single bitmap entry for a commit."""
  270. def __init__(
  271. self,
  272. object_pos: int,
  273. xor_offset: int,
  274. flags: int,
  275. bitmap: EWAHBitmap,
  276. ) -> None:
  277. """Initialize a bitmap entry.
  278. Args:
  279. object_pos: Position of object in pack index
  280. xor_offset: XOR offset for compression (0-160)
  281. flags: Entry flags
  282. bitmap: The EWAH bitmap data
  283. """
  284. self.object_pos = object_pos
  285. self.xor_offset = xor_offset
  286. self.flags = flags
  287. self.bitmap = bitmap
  288. class PackBitmap:
  289. """A pack bitmap index.
  290. Bitmaps store reachability information for commits in a packfile,
  291. allowing fast object enumeration without graph traversal.
  292. """
  293. def __init__(
  294. self,
  295. version: int = BITMAP_VERSION,
  296. flags: int = BITMAP_OPT_FULL_DAG,
  297. ) -> None:
  298. """Initialize a pack bitmap.
  299. Args:
  300. version: Bitmap format version
  301. flags: Bitmap flags
  302. """
  303. self.version = version
  304. self.flags = flags
  305. self.pack_checksum: bytes | None = None
  306. # Type bitmaps for commits, trees, blobs, tags
  307. self.commit_bitmap = EWAHBitmap()
  308. self.tree_bitmap = EWAHBitmap()
  309. self.blob_bitmap = EWAHBitmap()
  310. self.tag_bitmap = EWAHBitmap()
  311. # Bitmap entries indexed by commit SHA
  312. self.entries: dict[bytes, BitmapEntry] = {}
  313. # List of entries in order (for XOR offset resolution)
  314. self.entries_list: list[tuple[bytes, BitmapEntry]] = []
  315. # Optional lookup table for random access
  316. self.lookup_table: list[tuple[int, int, int]] | None = None
  317. # Optional name-hash cache
  318. self.name_hash_cache: list[int] | None = None
  319. def get_bitmap(self, commit_sha: bytes) -> EWAHBitmap | None:
  320. """Get the bitmap for a commit.
  321. Args:
  322. commit_sha: SHA-1 of the commit
  323. Returns:
  324. EWAH bitmap or None if not found
  325. """
  326. entry = self.entries.get(commit_sha)
  327. if entry is None:
  328. return None
  329. # Decompress using XOR if needed
  330. if entry.xor_offset > 0:
  331. # Find the entry at the XOR offset
  332. # The XOR offset tells us how many entries back to look
  333. # We need to find this entry in the ordered list
  334. try:
  335. current_idx = next(
  336. i
  337. for i, (sha, _) in enumerate(self.entries_list)
  338. if sha == commit_sha
  339. )
  340. except StopIteration:
  341. # Entry not found in list, return as-is
  342. return entry.bitmap
  343. # XOR offset is how many positions back to look (max 160)
  344. if current_idx >= entry.xor_offset:
  345. base_sha, _base_entry = self.entries_list[
  346. current_idx - entry.xor_offset
  347. ]
  348. # Get the base bitmap (recursively if it also uses XOR)
  349. base_bitmap = self.get_bitmap(base_sha)
  350. if base_bitmap is not None:
  351. # XOR the current bitmap with the base
  352. return entry.bitmap ^ base_bitmap
  353. return entry.bitmap
  354. def has_commit(self, commit_sha: bytes) -> bool:
  355. """Check if a commit has a bitmap.
  356. Args:
  357. commit_sha: SHA-1 of the commit
  358. Returns:
  359. True if bitmap exists for this commit
  360. """
  361. return commit_sha in self.entries
  362. def iter_commits(self) -> Iterator[bytes]:
  363. """Iterate over all commits with bitmaps.
  364. Returns:
  365. Iterator of commit SHAs
  366. """
  367. return iter(self.entries.keys())
  368. def read_bitmap(
  369. filename: str | os.PathLike[str],
  370. pack_index: Optional["PackIndex"] = None,
  371. ) -> PackBitmap:
  372. """Read a bitmap index file.
  373. Args:
  374. filename: Path to the .bitmap file
  375. pack_index: Optional PackIndex to resolve object positions to SHAs
  376. Returns:
  377. Loaded PackBitmap
  378. Raises:
  379. ValueError: If file format is invalid
  380. ChecksumMismatch: If checksum verification fails
  381. """
  382. with GitFile(filename, "rb") as f:
  383. return read_bitmap_file(f, pack_index=pack_index)
  384. def read_bitmap_file(
  385. f: IO[bytes], pack_index: Optional["PackIndex"] = None
  386. ) -> PackBitmap:
  387. """Read bitmap data from a file object.
  388. Args:
  389. f: File object to read from
  390. pack_index: Optional PackIndex to resolve object positions to SHAs
  391. Returns:
  392. Loaded PackBitmap
  393. Raises:
  394. ValueError: If file format is invalid
  395. """
  396. # Read header
  397. signature = f.read(4)
  398. if signature != BITMAP_SIGNATURE:
  399. raise ValueError(
  400. f"Invalid bitmap signature: {signature!r}, expected {BITMAP_SIGNATURE!r}"
  401. )
  402. version_bytes = f.read(2)
  403. flags_bytes = f.read(2)
  404. if len(version_bytes) < 2 or len(flags_bytes) < 2:
  405. raise ValueError("Incomplete bitmap header")
  406. version = struct.unpack(">H", version_bytes)[0]
  407. flags = struct.unpack(">H", flags_bytes)[0]
  408. if version != BITMAP_VERSION:
  409. raise ValueError(f"Unsupported bitmap version: {version}")
  410. # Read entry count
  411. entry_count_bytes = f.read(4)
  412. if len(entry_count_bytes) < 4:
  413. raise ValueError("Missing entry count")
  414. entry_count = struct.unpack(">I", entry_count_bytes)[0]
  415. # Read pack checksum
  416. pack_checksum = f.read(20)
  417. if len(pack_checksum) < 20:
  418. raise ValueError("Missing pack checksum")
  419. bitmap = PackBitmap(version=version, flags=flags)
  420. bitmap.pack_checksum = pack_checksum
  421. # Read type bitmaps (EWAH bitmaps are self-describing)
  422. for i, type_bitmap in enumerate(
  423. [
  424. bitmap.commit_bitmap,
  425. bitmap.tree_bitmap,
  426. bitmap.blob_bitmap,
  427. bitmap.tag_bitmap,
  428. ]
  429. ):
  430. # EWAH format:
  431. # 4 bytes: bit count
  432. # 4 bytes: word count
  433. # N x 8 bytes: compressed words
  434. # 4 bytes: RLW position
  435. # Read header to determine size
  436. bit_count_bytes = f.read(4)
  437. word_count_bytes = f.read(4)
  438. if len(bit_count_bytes) < 4 or len(word_count_bytes) < 4:
  439. raise ValueError(f"Missing type bitmap {i} header")
  440. word_count = struct.unpack(">I", word_count_bytes)[0]
  441. # Read compressed words
  442. words_data = f.read(word_count * 8)
  443. if len(words_data) < word_count * 8:
  444. raise ValueError(f"Incomplete type bitmap {i} data")
  445. # Read RLW position
  446. rlw_pos_bytes = f.read(4)
  447. if len(rlw_pos_bytes) < 4:
  448. raise ValueError(f"Missing type bitmap {i} RLW position")
  449. # Reconstruct the full EWAH data to pass to _decode
  450. ewah_data = bit_count_bytes + word_count_bytes + words_data + rlw_pos_bytes
  451. type_bitmap._decode(ewah_data)
  452. # Read bitmap entries
  453. for _ in range(entry_count):
  454. # Read object position (4 bytes)
  455. obj_pos_bytes = f.read(4)
  456. if len(obj_pos_bytes) < 4:
  457. raise ValueError("Incomplete bitmap entry")
  458. obj_pos = struct.unpack(">I", obj_pos_bytes)[0]
  459. # Read XOR offset (1 byte)
  460. xor_offset_bytes = f.read(1)
  461. if len(xor_offset_bytes) < 1:
  462. raise ValueError("Missing XOR offset")
  463. xor_offset = xor_offset_bytes[0]
  464. # Read flags (1 byte)
  465. flags_bytes = f.read(1)
  466. if len(flags_bytes) < 1:
  467. raise ValueError("Missing entry flags")
  468. entry_flags = flags_bytes[0]
  469. # Read self-describing EWAH bitmap
  470. # EWAH format: bit_count (4) + word_count (4) + words + rlw_pos (4)
  471. bit_count_bytes = f.read(4)
  472. word_count_bytes = f.read(4)
  473. if len(bit_count_bytes) < 4 or len(word_count_bytes) < 4:
  474. raise ValueError("Incomplete bitmap entry EWAH header")
  475. word_count = struct.unpack(">I", word_count_bytes)[0]
  476. # Read compressed words
  477. words_data = f.read(word_count * 8)
  478. if len(words_data) < word_count * 8:
  479. raise ValueError("Incomplete bitmap entry EWAH words")
  480. # Read RLW position
  481. rlw_pos_bytes = f.read(4)
  482. if len(rlw_pos_bytes) < 4:
  483. raise ValueError("Missing bitmap entry EWAH RLW position")
  484. # Reconstruct full EWAH data
  485. bitmap_data = bit_count_bytes + word_count_bytes + words_data + rlw_pos_bytes
  486. # Create bitmap entry
  487. ewah_bitmap = EWAHBitmap(bitmap_data) if word_count > 0 else EWAHBitmap()
  488. entry = BitmapEntry(
  489. object_pos=obj_pos,
  490. xor_offset=xor_offset,
  491. flags=entry_flags,
  492. bitmap=ewah_bitmap,
  493. )
  494. # Resolve object position to SHA if we have a pack index
  495. if pack_index is not None:
  496. # Get the SHA at the given position in the sorted index
  497. sha = None
  498. for idx, (entry_sha, _offset, _crc32) in enumerate(
  499. pack_index.iterentries()
  500. ):
  501. if idx == obj_pos:
  502. sha = entry_sha
  503. break
  504. if sha is not None:
  505. bitmap.entries[sha] = entry
  506. bitmap.entries_list.append((sha, entry))
  507. else:
  508. # Without pack index, use position as temporary key
  509. temp_key = obj_pos.to_bytes(4, byteorder="big")
  510. bitmap.entries[temp_key] = entry
  511. bitmap.entries_list.append((temp_key, entry))
  512. # Read optional lookup table
  513. if flags & BITMAP_OPT_LOOKUP_TABLE:
  514. # Lookup table contains triplets: (commit_pos, offset, xor_row)
  515. # Number of entries matches the bitmap entry count
  516. lookup_table = []
  517. for _ in range(entry_count):
  518. # Read commit position (4 bytes)
  519. commit_pos_bytes = f.read(4)
  520. if len(commit_pos_bytes) < 4:
  521. break
  522. commit_pos = struct.unpack(">I", commit_pos_bytes)[0]
  523. # Read file offset (8 bytes)
  524. offset_bytes = f.read(8)
  525. if len(offset_bytes) < 8:
  526. break
  527. offset = struct.unpack(">Q", offset_bytes)[0]
  528. # Read XOR row (4 bytes)
  529. xor_row_bytes = f.read(4)
  530. if len(xor_row_bytes) < 4:
  531. break
  532. xor_row = struct.unpack(">I", xor_row_bytes)[0]
  533. lookup_table.append((commit_pos, offset, xor_row))
  534. bitmap.lookup_table = lookup_table
  535. # Read optional name-hash cache
  536. if flags & BITMAP_OPT_HASH_CACHE:
  537. # Name-hash cache contains one 32-bit hash per object in the pack
  538. # The number of hashes depends on the total number of objects
  539. # For now, we'll read what's available
  540. name_hash_cache = []
  541. while True:
  542. hash_bytes = f.read(4)
  543. if len(hash_bytes) < 4:
  544. break
  545. hash_value = struct.unpack(">I", hash_bytes)[0]
  546. name_hash_cache.append(hash_value)
  547. if name_hash_cache:
  548. bitmap.name_hash_cache = name_hash_cache
  549. return bitmap
  550. def write_bitmap(
  551. filename: str | os.PathLike[str],
  552. bitmap: PackBitmap,
  553. ) -> None:
  554. """Write a bitmap index file.
  555. Args:
  556. filename: Path to write the .bitmap file
  557. bitmap: PackBitmap to write
  558. """
  559. with GitFile(filename, "wb") as f:
  560. write_bitmap_file(f, bitmap)
  561. def write_bitmap_file(f: IO[bytes], bitmap: PackBitmap) -> None:
  562. """Write bitmap data to a file object.
  563. Args:
  564. f: File object to write to
  565. bitmap: PackBitmap to write
  566. """
  567. # Write header
  568. f.write(BITMAP_SIGNATURE)
  569. f.write(struct.pack(">H", bitmap.version))
  570. f.write(struct.pack(">H", bitmap.flags))
  571. # Write entry count
  572. f.write(struct.pack(">I", len(bitmap.entries)))
  573. # Write pack checksum
  574. if bitmap.pack_checksum:
  575. f.write(bitmap.pack_checksum)
  576. else:
  577. f.write(b"\x00" * 20)
  578. # Write type bitmaps (self-describing EWAH format, no size prefix needed)
  579. for type_bitmap in [
  580. bitmap.commit_bitmap,
  581. bitmap.tree_bitmap,
  582. bitmap.blob_bitmap,
  583. bitmap.tag_bitmap,
  584. ]:
  585. data = type_bitmap.encode()
  586. f.write(data)
  587. # Write bitmap entries
  588. for _sha, entry in bitmap.entries.items():
  589. # Write object position (4 bytes)
  590. f.write(struct.pack(">I", entry.object_pos))
  591. # Write XOR offset (1 byte)
  592. f.write(bytes([entry.xor_offset]))
  593. # Write flags (1 byte)
  594. f.write(bytes([entry.flags]))
  595. # Write compressed bitmap data (self-describing EWAH format, no size prefix)
  596. bitmap_data = entry.bitmap.encode()
  597. f.write(bitmap_data)
  598. # Write optional lookup table
  599. if bitmap.flags & BITMAP_OPT_LOOKUP_TABLE and bitmap.lookup_table:
  600. for commit_pos, offset, xor_row in bitmap.lookup_table:
  601. f.write(struct.pack(">I", commit_pos)) # 4 bytes
  602. f.write(struct.pack(">Q", offset)) # 8 bytes
  603. f.write(struct.pack(">I", xor_row)) # 4 bytes
  604. # Write optional name-hash cache
  605. if bitmap.flags & BITMAP_OPT_HASH_CACHE and bitmap.name_hash_cache:
  606. for hash_value in bitmap.name_hash_cache:
  607. f.write(struct.pack(">I", hash_value))
  608. def _compute_name_hash(name: bytes) -> int:
  609. """Compute the name hash for a tree entry.
  610. This is the same algorithm Git uses for the name-hash cache.
  611. Args:
  612. name: The name of the tree entry
  613. Returns:
  614. 32-bit hash value
  615. """
  616. hash_value = 0
  617. for byte in name:
  618. hash_value = (hash_value >> 19) | (hash_value << 13)
  619. hash_value += byte
  620. hash_value &= 0xFFFFFFFF
  621. return hash_value
  622. def select_bitmap_commits(
  623. refs: dict[bytes, bytes],
  624. object_store: "BaseObjectStore",
  625. commit_interval: int = DEFAULT_COMMIT_INTERVAL,
  626. ) -> list[bytes]:
  627. """Select commits for bitmap generation.
  628. Uses Git's strategy:
  629. - All branch and tag tips
  630. - Every Nth commit in history
  631. Args:
  632. refs: Dictionary of ref names to commit SHAs
  633. object_store: Object store to read commits from
  634. commit_interval: Include every Nth commit in history
  635. Returns:
  636. List of commit SHAs to create bitmaps for
  637. """
  638. selected = set()
  639. seen = set()
  640. # Start with all refs
  641. ref_commits = set()
  642. for ref_name, sha in refs.items():
  643. try:
  644. obj = object_store[sha]
  645. except KeyError:
  646. continue
  647. else:
  648. # Dereference tags to get to commits
  649. while isinstance(obj, Tag):
  650. obj = object_store[obj.object[1]]
  651. if isinstance(obj, Commit):
  652. ref_commits.add(obj.id)
  653. # Add all ref tips
  654. selected.update(ref_commits)
  655. # Walk the commit graph and select every Nth commit
  656. queue = deque(ref_commits)
  657. commit_count = 0
  658. while queue:
  659. commit_sha = queue.popleft()
  660. if commit_sha in seen:
  661. continue
  662. seen.add(commit_sha)
  663. try:
  664. obj = object_store[commit_sha]
  665. if not isinstance(obj, Commit):
  666. continue
  667. commit_count += 1
  668. if commit_count % commit_interval == 0:
  669. selected.add(commit_sha)
  670. # Add parents to queue
  671. for parent in obj.parents:
  672. if parent not in seen:
  673. queue.append(parent)
  674. except KeyError:
  675. continue
  676. return sorted(selected)
  677. def build_reachability_bitmap(
  678. commit_sha: bytes,
  679. sha_to_pos: dict[bytes, int],
  680. object_store: "BaseObjectStore",
  681. ) -> EWAHBitmap:
  682. """Build a reachability bitmap for a commit.
  683. The bitmap has a bit set for each object that is reachable from the commit.
  684. The bit position corresponds to the object's position in the pack index.
  685. Args:
  686. commit_sha: The commit to build a bitmap for
  687. sha_to_pos: Pre-built mapping from SHA to position in pack
  688. object_store: Object store to traverse objects
  689. Returns:
  690. EWAH bitmap with bits set for reachable objects
  691. """
  692. bitmap = EWAHBitmap()
  693. # Traverse all objects reachable from the commit
  694. seen = set()
  695. queue = deque([commit_sha])
  696. while queue:
  697. sha = queue.popleft()
  698. if sha in seen:
  699. continue
  700. seen.add(sha)
  701. # Add this object to the bitmap if it's in the pack
  702. if sha in sha_to_pos:
  703. bitmap.add(sha_to_pos[sha])
  704. # Get the object and traverse its references
  705. try:
  706. obj = object_store[sha]
  707. if isinstance(obj, Commit):
  708. # Add parents and tree
  709. queue.append(obj.tree)
  710. queue.extend(obj.parents)
  711. elif hasattr(obj, "items"):
  712. # Tree object - add all entries
  713. for item in obj.items():
  714. queue.append(item.sha)
  715. except KeyError:
  716. # Object not in store, skip it
  717. continue
  718. return bitmap
  719. def apply_xor_compression(
  720. bitmaps: list[tuple[bytes, EWAHBitmap]],
  721. max_xor_offset: int = 160,
  722. ) -> list[tuple[bytes, EWAHBitmap, int]]:
  723. """Apply XOR compression to bitmaps.
  724. XOR compression stores some bitmaps as XOR differences from previous bitmaps,
  725. reducing storage size when bitmaps are similar.
  726. Args:
  727. bitmaps: List of (commit_sha, bitmap) tuples
  728. max_xor_offset: Maximum offset to search for XOR base (default: 160)
  729. Returns:
  730. List of (commit_sha, bitmap, xor_offset) tuples
  731. """
  732. compressed = []
  733. for i, (sha, bitmap) in enumerate(bitmaps):
  734. best_xor_offset = 0
  735. best_size = len(bitmap.encode())
  736. best_xor_bitmap = bitmap
  737. # Try XORing with previous bitmaps within max_xor_offset
  738. for offset in range(1, min(i + 1, max_xor_offset + 1)):
  739. _prev_sha, prev_bitmap = bitmaps[i - offset]
  740. xor_bitmap = bitmap ^ prev_bitmap
  741. xor_size = len(xor_bitmap.encode())
  742. # Use XOR if it reduces size
  743. if xor_size < best_size:
  744. best_size = xor_size
  745. best_xor_offset = offset
  746. best_xor_bitmap = xor_bitmap
  747. compressed.append((sha, best_xor_bitmap, best_xor_offset))
  748. return compressed
  749. def build_type_bitmaps(
  750. sha_to_pos: dict[bytes, int],
  751. object_store: "BaseObjectStore",
  752. ) -> tuple[EWAHBitmap, EWAHBitmap, EWAHBitmap, EWAHBitmap]:
  753. """Build type bitmaps for all objects in a pack.
  754. Type bitmaps classify objects by type: commit, tree, blob, or tag.
  755. Args:
  756. sha_to_pos: Pre-built mapping from SHA to position in pack
  757. object_store: Object store to read object types
  758. Returns:
  759. Tuple of (commit_bitmap, tree_bitmap, blob_bitmap, tag_bitmap)
  760. """
  761. commit_bitmap = EWAHBitmap()
  762. tree_bitmap = EWAHBitmap()
  763. blob_bitmap = EWAHBitmap()
  764. tag_bitmap = EWAHBitmap()
  765. for sha, pos in sha_to_pos.items():
  766. try:
  767. obj = object_store[sha]
  768. except KeyError:
  769. pass
  770. else:
  771. obj_type = obj.type_num
  772. if obj_type == Commit.type_num:
  773. commit_bitmap.add(pos)
  774. elif obj_type == Tree.type_num:
  775. tree_bitmap.add(pos)
  776. elif obj_type == Blob.type_num:
  777. blob_bitmap.add(pos)
  778. elif obj_type == Tag.type_num:
  779. tag_bitmap.add(pos)
  780. return commit_bitmap, tree_bitmap, blob_bitmap, tag_bitmap
  781. def build_name_hash_cache(
  782. sha_to_pos: dict[bytes, int],
  783. object_store: "BaseObjectStore",
  784. ) -> list[int]:
  785. """Build name-hash cache for all objects in a pack.
  786. The name-hash cache stores a hash of the name for each object,
  787. which can speed up path-based operations.
  788. Args:
  789. sha_to_pos: Pre-built mapping from SHA to position in pack
  790. object_store: Object store to read objects
  791. Returns:
  792. List of 32-bit hash values, one per object in the pack
  793. """
  794. # Pre-allocate list with correct size
  795. num_objects = len(sha_to_pos)
  796. name_hashes = [0] * num_objects
  797. for sha, pos in sha_to_pos.items():
  798. try:
  799. obj = object_store[sha]
  800. except KeyError:
  801. # Object not in store, use zero hash
  802. pass
  803. else:
  804. # For tree entries, use the tree entry name
  805. # For commits, use the tree SHA
  806. # For other objects, use the object SHA
  807. if isinstance(obj, Tree):
  808. # Tree object - use the SHA as the name
  809. name_hash = _compute_name_hash(sha)
  810. elif isinstance(obj, Commit):
  811. # Commit - use the tree SHA as the name
  812. name_hash = _compute_name_hash(obj.tree)
  813. else:
  814. # Other objects - use the SHA as the name
  815. name_hash = _compute_name_hash(sha)
  816. name_hashes[pos] = name_hash
  817. return name_hashes
  818. def generate_bitmap(
  819. pack_index: "PackIndex",
  820. object_store: "BaseObjectStore",
  821. refs: dict[bytes, bytes],
  822. pack_checksum: bytes,
  823. include_hash_cache: bool = True,
  824. include_lookup_table: bool = True,
  825. commit_interval: int = DEFAULT_COMMIT_INTERVAL,
  826. progress: Callable[[str], None] | None = None,
  827. ) -> PackBitmap:
  828. """Generate a complete bitmap for a pack.
  829. Args:
  830. pack_index: Pack index for the pack
  831. object_store: Object store to read objects from
  832. refs: Dictionary of ref names to commit SHAs
  833. pack_checksum: SHA-1 checksum of the pack file
  834. include_hash_cache: Whether to include name-hash cache
  835. include_lookup_table: Whether to include lookup table
  836. commit_interval: Include every Nth commit in history
  837. progress: Optional progress reporting callback
  838. Returns:
  839. Complete PackBitmap ready to write to disk
  840. """
  841. if progress:
  842. progress("Building pack index mapping")
  843. # Build mapping from SHA to position in pack index ONCE
  844. # This is used by all subsequent operations and avoids repeated enumeration
  845. sha_to_pos = {}
  846. for pos, (sha, _offset, _crc32) in enumerate(pack_index.iterentries()):
  847. sha_to_pos[sha] = pos
  848. if progress:
  849. progress("Selecting commits for bitmap")
  850. # Select commits to create bitmaps for
  851. selected_commits = select_bitmap_commits(refs, object_store, commit_interval)
  852. if progress:
  853. progress(f"Building bitmaps for {len(selected_commits)} commits")
  854. # Build reachability bitmaps for selected commits
  855. commit_bitmaps = []
  856. for i, commit_sha in enumerate(selected_commits):
  857. if progress and i % 10 == 0:
  858. progress(f"Building bitmap {i + 1}/{len(selected_commits)}")
  859. bitmap = build_reachability_bitmap(commit_sha, sha_to_pos, object_store)
  860. commit_bitmaps.append((commit_sha, bitmap))
  861. if progress:
  862. progress("Applying XOR compression")
  863. # Apply XOR compression
  864. compressed_bitmaps = apply_xor_compression(commit_bitmaps)
  865. if progress:
  866. progress("Building type bitmaps")
  867. # Build type bitmaps (using pre-built sha_to_pos mapping)
  868. commit_type_bitmap, tree_type_bitmap, blob_type_bitmap, tag_type_bitmap = (
  869. build_type_bitmaps(sha_to_pos, object_store)
  870. )
  871. # Create PackBitmap
  872. flags = BITMAP_OPT_FULL_DAG
  873. if include_hash_cache:
  874. flags |= BITMAP_OPT_HASH_CACHE
  875. if include_lookup_table:
  876. flags |= BITMAP_OPT_LOOKUP_TABLE
  877. pack_bitmap = PackBitmap(version=1, flags=flags)
  878. pack_bitmap.pack_checksum = pack_checksum
  879. pack_bitmap.commit_bitmap = commit_type_bitmap
  880. pack_bitmap.tree_bitmap = tree_type_bitmap
  881. pack_bitmap.blob_bitmap = blob_type_bitmap
  882. pack_bitmap.tag_bitmap = tag_type_bitmap
  883. # Add bitmap entries
  884. for commit_sha, xor_bitmap, xor_offset in compressed_bitmaps:
  885. if commit_sha not in sha_to_pos:
  886. continue
  887. entry = BitmapEntry(
  888. object_pos=sha_to_pos[commit_sha],
  889. xor_offset=xor_offset,
  890. flags=0,
  891. bitmap=xor_bitmap,
  892. )
  893. pack_bitmap.entries[commit_sha] = entry
  894. pack_bitmap.entries_list.append((commit_sha, entry))
  895. # Build optional name-hash cache (using pre-built sha_to_pos mapping)
  896. if include_hash_cache:
  897. if progress:
  898. progress("Building name-hash cache")
  899. pack_bitmap.name_hash_cache = build_name_hash_cache(sha_to_pos, object_store)
  900. # Build optional lookup table
  901. if include_lookup_table:
  902. if progress:
  903. progress("Building lookup table")
  904. # The lookup table is built automatically from the entries
  905. # For now, we'll leave it as None and let the write function handle it
  906. # TODO: Implement lookup table generation if needed
  907. pack_bitmap.lookup_table = None
  908. if progress:
  909. progress("Bitmap generation complete")
  910. return pack_bitmap
  911. def find_commit_bitmaps(
  912. commit_shas: set[bytes], packs: Iterable[Pack]
  913. ) -> dict[bytes, tuple]:
  914. """Find which packs have bitmaps for the given commits.
  915. Args:
  916. commit_shas: Set of commit SHAs to look for
  917. packs: Iterable of Pack objects to search
  918. Returns:
  919. Dict mapping commit SHA to (pack, pack_bitmap, position) tuple
  920. """
  921. result = {}
  922. remaining = set(commit_shas)
  923. for pack in packs:
  924. if not remaining:
  925. break
  926. pack_bitmap = pack.bitmap
  927. if not pack_bitmap:
  928. # No bitmap for this pack
  929. continue
  930. # Build SHA to position mapping for this pack
  931. sha_to_pos = {}
  932. for pos, (sha, _offset, _crc32) in enumerate(pack.index.iterentries()):
  933. sha_to_pos[sha] = pos
  934. # Check which commits have bitmaps
  935. for commit_sha in list(remaining):
  936. if pack_bitmap.has_commit(commit_sha):
  937. if commit_sha in sha_to_pos:
  938. result[commit_sha] = (pack, pack_bitmap, sha_to_pos)
  939. remaining.remove(commit_sha)
  940. return result
  941. def bitmap_to_object_shas(
  942. bitmap: EWAHBitmap,
  943. pack_index: "PackIndex",
  944. type_filter: EWAHBitmap | None = None,
  945. ) -> set[bytes]:
  946. """Convert a bitmap to a set of object SHAs.
  947. Args:
  948. bitmap: The EWAH bitmap with set bits for objects
  949. pack_index: Pack index to map positions to SHAs
  950. type_filter: Optional type bitmap to filter results (e.g., commits only)
  951. Returns:
  952. Set of object SHAs
  953. """
  954. result = set()
  955. for pos, (sha, _offset, _crc32) in enumerate(pack_index.iterentries()):
  956. # Check if this position is in the bitmap
  957. if pos in bitmap:
  958. # Apply type filter if provided
  959. if type_filter is None or pos in type_filter:
  960. result.add(sha)
  961. return result