bitmap.py 37 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207
  1. # bitmap.py -- Packfile bitmap support for git
  2. # Copyright (C) 2025 Jelmer Vernooij <jelmer@jelmer.uk>
  3. #
  4. # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
  5. # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
  6. # General Public License as published by the Free Software Foundation; version 2.0
  7. # or (at your option) any later version. You can redistribute it and/or
  8. # modify it under the terms of either of these two licenses.
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. # You should have received a copy of the licenses; if not, see
  17. # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  18. # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  19. # License, Version 2.0.
  20. #
  21. """Support for Git packfile bitmaps.
  22. Bitmaps store reachability information for packfiles, enabling faster
  23. object counting and enumeration operations without full graph traversal.
  24. The bitmap format uses EWAH (Enhanced Word-Aligned Hybrid) compression
  25. for efficient storage and fast bitwise operations.
  26. """
  27. import os
  28. import struct
  29. from collections import deque
  30. from collections.abc import Callable, Iterable, Iterator
  31. from io import BytesIO
  32. from typing import IO, TYPE_CHECKING, Callable, Optional
  33. from .file import GitFile
  34. from .objects import Blob, Commit, Tag, Tree
  35. if TYPE_CHECKING:
  36. from .object_store import BaseObjectStore
  37. from .pack import Pack, PackIndex
  38. # Bitmap file signature
  39. BITMAP_SIGNATURE = b"BITM"
  40. # Bitmap format version
  41. BITMAP_VERSION = 1
  42. # Bitmap flags
  43. BITMAP_OPT_FULL_DAG = 0x1 # Full closure
  44. BITMAP_OPT_HASH_CACHE = 0x4 # Name-hash cache
  45. BITMAP_OPT_LOOKUP_TABLE = 0x10 # Lookup table for random access
  46. BITMAP_OPT_PSEUDO_MERGES = 0x20 # Pseudo-merge bitmaps
  47. # EWAH compression constants
  48. MAX_LITERAL_WORDS = 0x7FFFFFFF # Maximum literal words in EWAH format (31 bits)
  49. MAX_XOR_OFFSET = 160 # Maximum distance to search for XOR compression base
  50. DEFAULT_COMMIT_INTERVAL = 100 # Default interval for commit selection
  51. def _encode_ewah_words(words: list[int]) -> list[int]:
  52. """Encode a list of 64-bit words using EWAH run-length compression.
  53. Args:
  54. words: List of 64-bit words to encode
  55. Returns:
  56. List of compressed words (RLWs followed by literals)
  57. """
  58. compressed_words = []
  59. i = 0
  60. while i < len(words):
  61. # Check for runs of all zeros or all ones
  62. if words[i] == 0 or words[i] == 0xFFFFFFFFFFFFFFFF:
  63. # Count consecutive identical words
  64. run_value = words[i]
  65. run_length = 0
  66. while i < len(words) and words[i] == run_value:
  67. run_length += 1
  68. i += 1
  69. # Collect following literal words
  70. literals = []
  71. while i < len(words) and words[i] != 0 and words[i] != 0xFFFFFFFFFFFFFFFF:
  72. literals.append(words[i])
  73. i += 1
  74. if len(literals) >= MAX_LITERAL_WORDS:
  75. break
  76. # Create RLW with correct bit layout:
  77. # [literal_words(31 bits)][running_len(32 bits)][running_bit(1 bit)]
  78. running_bit = 1 if run_value == 0xFFFFFFFFFFFFFFFF else 0
  79. rlw = (len(literals) << 33) | (run_length << 1) | running_bit
  80. compressed_words.append(rlw)
  81. compressed_words.extend(literals)
  82. else:
  83. # Collect literal words
  84. literals = []
  85. while i < len(words) and words[i] != 0 and words[i] != 0xFFFFFFFFFFFFFFFF:
  86. literals.append(words[i])
  87. i += 1
  88. if len(literals) >= MAX_LITERAL_WORDS:
  89. break
  90. # RLW with no run, just literals
  91. # [literal_words(31 bits)][running_len(32 bits)][running_bit(1 bit)]
  92. rlw = (len(literals) << 33) | (0 << 1) | 0
  93. compressed_words.append(rlw)
  94. compressed_words.extend(literals)
  95. return compressed_words
  96. class EWAHBitmap:
  97. """EWAH (Enhanced Word-Aligned Hybrid) compressed bitmap.
  98. EWAH uses run-length encoding for efficient bitmap storage.
  99. Each bitmap consists of:
  100. - Uncompressed bit count (4 bytes)
  101. - Compressed word count (4 bytes)
  102. - Compressed words (8 bytes each)
  103. - Current RLW position (4 bytes)
  104. Each Run Length Word (RLW) 64-bit layout (LSB to MSB):
  105. - Bit 0: running_bit (1 bit) - value of repeated words (0 or 1)
  106. - Bits 1-32: running_len (32 bits) - count of repeated words
  107. - Bits 33-63: literal_words (31 bits) - count of literal words following this RLW
  108. """
  109. def __init__(self, data: bytes | None = None) -> None:
  110. """Initialize EWAH bitmap.
  111. Args:
  112. data: Optional compressed bitmap data to decode
  113. """
  114. self.bits: set[int] = set()
  115. self.bit_count = 0
  116. if data:
  117. self._decode(data)
  118. def _decode(self, data: bytes) -> None:
  119. """Decode EWAH compressed bitmap data.
  120. Args:
  121. data: Compressed bitmap data (EWAH format with header + words +
  122. RLW position)
  123. """
  124. f = BytesIO(data)
  125. # Read header
  126. bit_count_bytes = f.read(4)
  127. word_count_bytes = f.read(4)
  128. if len(bit_count_bytes) < 4 or len(word_count_bytes) < 4:
  129. return
  130. bit_count = struct.unpack(">I", bit_count_bytes)[0]
  131. word_count = struct.unpack(">I", word_count_bytes)[0]
  132. self.bit_count = bit_count
  133. current_bit = 0
  134. # Read all words first
  135. words = []
  136. for _ in range(word_count):
  137. word_bytes = f.read(8)
  138. if len(word_bytes) < 8:
  139. break
  140. word = struct.unpack(">Q", word_bytes)[0]
  141. words.append(word)
  142. # Process EWAH chunks: RLW followed by literal words
  143. idx = 0
  144. while idx < len(words):
  145. # This is an RLW
  146. # Bit layout: [literal_words(31)][running_len(32)][running_bit(1)]
  147. rlw = words[idx]
  148. running_bit = rlw & 1
  149. running_len = (rlw >> 1) & 0xFFFFFFFF
  150. literal_words = rlw >> 33
  151. idx += 1
  152. # Process running bits
  153. if running_len > 0:
  154. if running_bit == 1:
  155. # Add all bits in the repeated section
  156. for i in range(running_len * 64):
  157. self.bits.add(current_bit + i)
  158. current_bit += running_len * 64
  159. # Process literal words
  160. for _ in range(literal_words):
  161. if idx >= len(words):
  162. break
  163. literal = words[idx]
  164. idx += 1
  165. # Extract set bits from literal word
  166. for i in range(64):
  167. if literal & (1 << i):
  168. self.bits.add(current_bit + i)
  169. current_bit += 64
  170. # Read RLW position (we don't use it currently, but it's part of the format)
  171. f.read(4)
  172. def encode(self) -> bytes:
  173. """Encode bitmap to EWAH compressed format.
  174. Returns:
  175. Compressed bitmap data including header, words, and RLW position
  176. """
  177. if not self.bits:
  178. # Empty bitmap: bit_count=0, word_count=0, rlw_pos=0
  179. return struct.pack(">III", 0, 0, 0)
  180. max_bit = max(self.bits) if self.bits else 0
  181. bit_count = max_bit + 1
  182. word_count = (bit_count + 63) // 64
  183. # Create literal words
  184. words = [0] * word_count
  185. for bit in self.bits:
  186. word_idx = bit // 64
  187. bit_idx = bit % 64
  188. words[word_idx] |= 1 << bit_idx
  189. # Compress using EWAH run-length encoding
  190. compressed_words = _encode_ewah_words(words)
  191. # Build EWAH data
  192. f = BytesIO()
  193. # Header
  194. f.write(struct.pack(">I", bit_count))
  195. f.write(struct.pack(">I", len(compressed_words)))
  196. # Write compressed words
  197. for word in compressed_words:
  198. f.write(struct.pack(">Q", word))
  199. # Write RLW position (position of last RLW in the compressed words)
  200. # For now, we'll use 0 as we don't track this during encoding
  201. # This could be improved in the future if needed
  202. f.write(struct.pack(">I", 0))
  203. return f.getvalue()
  204. def __contains__(self, bit: int) -> bool:
  205. """Check if a bit is set.
  206. Args:
  207. bit: Bit position to check
  208. Returns:
  209. True if bit is set, False otherwise
  210. """
  211. return bit in self.bits
  212. def __len__(self) -> int:
  213. """Return the number of set bits.
  214. Returns:
  215. Count of set bits
  216. """
  217. return len(self.bits)
  218. def __or__(self, other: "EWAHBitmap") -> "EWAHBitmap":
  219. """Bitwise OR operation.
  220. Args:
  221. other: Other bitmap to OR with
  222. Returns:
  223. New bitmap with OR result
  224. """
  225. result = EWAHBitmap()
  226. result.bits = self.bits | other.bits
  227. result.bit_count = max(self.bit_count, other.bit_count)
  228. return result
  229. def __and__(self, other: "EWAHBitmap") -> "EWAHBitmap":
  230. """Bitwise AND operation.
  231. Args:
  232. other: Other bitmap to AND with
  233. Returns:
  234. New bitmap with AND result
  235. """
  236. result = EWAHBitmap()
  237. result.bits = self.bits & other.bits
  238. result.bit_count = max(self.bit_count, other.bit_count)
  239. return result
  240. def __xor__(self, other: "EWAHBitmap") -> "EWAHBitmap":
  241. """Bitwise XOR operation.
  242. Args:
  243. other: Other bitmap to XOR with
  244. Returns:
  245. New bitmap with XOR result
  246. """
  247. result = EWAHBitmap()
  248. result.bits = self.bits ^ other.bits
  249. result.bit_count = max(self.bit_count, other.bit_count)
  250. return result
  251. def __sub__(self, other: "EWAHBitmap") -> "EWAHBitmap":
  252. """Bitwise subtraction (set difference).
  253. Returns bits that are in self but not in other.
  254. Equivalent to: self & ~other
  255. Args:
  256. other: Bitmap to subtract
  257. Returns:
  258. New bitmap with bits in self but not in other
  259. """
  260. result = EWAHBitmap()
  261. result.bits = self.bits - other.bits
  262. result.bit_count = self.bit_count
  263. return result
  264. def add(self, bit: int) -> None:
  265. """Set a bit.
  266. Args:
  267. bit: Bit position to set
  268. """
  269. self.bits.add(bit)
  270. self.bit_count = max(self.bit_count, bit + 1)
  271. class BitmapEntry:
  272. """A single bitmap entry for a commit."""
  273. def __init__(
  274. self,
  275. object_pos: int,
  276. xor_offset: int,
  277. flags: int,
  278. bitmap: EWAHBitmap,
  279. ) -> None:
  280. """Initialize a bitmap entry.
  281. Args:
  282. object_pos: Position of object in pack index
  283. xor_offset: XOR offset for compression
  284. flags: Entry flags
  285. bitmap: The EWAH bitmap data
  286. """
  287. self.object_pos = object_pos
  288. self.xor_offset = xor_offset
  289. self.flags = flags
  290. self.bitmap = bitmap
  291. class PackBitmap:
  292. """A pack bitmap index.
  293. Bitmaps store reachability information for commits in a packfile,
  294. allowing fast object enumeration without graph traversal.
  295. """
  296. def __init__(
  297. self,
  298. version: int = BITMAP_VERSION,
  299. flags: int = BITMAP_OPT_FULL_DAG,
  300. ) -> None:
  301. """Initialize a pack bitmap.
  302. Args:
  303. version: Bitmap format version
  304. flags: Bitmap flags
  305. """
  306. self.version = version
  307. self.flags = flags
  308. self.pack_checksum: bytes | None = None
  309. # Type bitmaps for commits, trees, blobs, tags
  310. self.commit_bitmap = EWAHBitmap()
  311. self.tree_bitmap = EWAHBitmap()
  312. self.blob_bitmap = EWAHBitmap()
  313. self.tag_bitmap = EWAHBitmap()
  314. # Bitmap entries indexed by commit SHA
  315. self.entries: dict[bytes, BitmapEntry] = {}
  316. # List of entries in order (for XOR offset resolution)
  317. self.entries_list: list[tuple[bytes, BitmapEntry]] = []
  318. # Optional lookup table for random access
  319. self.lookup_table: list[tuple[int, int, int]] | None = None
  320. # Optional name-hash cache
  321. self.name_hash_cache: list[int] | None = None
  322. def get_bitmap(self, commit_sha: bytes) -> EWAHBitmap | None:
  323. """Get the bitmap for a commit.
  324. Args:
  325. commit_sha: SHA-1 of the commit
  326. Returns:
  327. EWAH bitmap or None if not found
  328. """
  329. entry = self.entries.get(commit_sha)
  330. if entry is None:
  331. return None
  332. # Decompress using XOR if needed
  333. if entry.xor_offset > 0:
  334. # Find the entry at the XOR offset
  335. # The XOR offset tells us how many entries back to look
  336. # We need to find this entry in the ordered list
  337. try:
  338. current_idx = next(
  339. i
  340. for i, (sha, _) in enumerate(self.entries_list)
  341. if sha == commit_sha
  342. )
  343. except StopIteration:
  344. # Entry not found in list, return as-is
  345. return entry.bitmap
  346. # XOR offset is how many positions back to look
  347. if current_idx >= entry.xor_offset:
  348. base_sha, _base_entry = self.entries_list[
  349. current_idx - entry.xor_offset
  350. ]
  351. # Get the base bitmap (recursively if it also uses XOR)
  352. base_bitmap = self.get_bitmap(base_sha)
  353. if base_bitmap is not None:
  354. # XOR the current bitmap with the base
  355. return entry.bitmap ^ base_bitmap
  356. return entry.bitmap
  357. def has_commit(self, commit_sha: bytes) -> bool:
  358. """Check if a commit has a bitmap.
  359. Args:
  360. commit_sha: SHA-1 of the commit
  361. Returns:
  362. True if bitmap exists for this commit
  363. """
  364. return commit_sha in self.entries
  365. def iter_commits(self) -> Iterator[bytes]:
  366. """Iterate over all commits with bitmaps.
  367. Returns:
  368. Iterator of commit SHAs
  369. """
  370. return iter(self.entries.keys())
  371. def read_bitmap(
  372. filename: str | os.PathLike[str],
  373. pack_index: Optional["PackIndex"] = None,
  374. ) -> PackBitmap:
  375. """Read a bitmap index file.
  376. Args:
  377. filename: Path to the .bitmap file
  378. pack_index: Optional PackIndex to resolve object positions to SHAs
  379. Returns:
  380. Loaded PackBitmap
  381. Raises:
  382. ValueError: If file format is invalid
  383. ChecksumMismatch: If checksum verification fails
  384. """
  385. with GitFile(filename, "rb") as f:
  386. return read_bitmap_file(f, pack_index=pack_index)
  387. def read_bitmap_file(
  388. f: IO[bytes], pack_index: Optional["PackIndex"] = None
  389. ) -> PackBitmap:
  390. """Read bitmap data from a file object.
  391. Args:
  392. f: File object to read from
  393. pack_index: Optional PackIndex to resolve object positions to SHAs
  394. Returns:
  395. Loaded PackBitmap
  396. Raises:
  397. ValueError: If file format is invalid
  398. """
  399. # Read header
  400. signature = f.read(4)
  401. if signature != BITMAP_SIGNATURE:
  402. raise ValueError(
  403. f"Invalid bitmap signature: {signature!r}, expected {BITMAP_SIGNATURE!r}"
  404. )
  405. version_bytes = f.read(2)
  406. flags_bytes = f.read(2)
  407. if len(version_bytes) < 2 or len(flags_bytes) < 2:
  408. raise ValueError("Incomplete bitmap header")
  409. version = struct.unpack(">H", version_bytes)[0]
  410. flags = struct.unpack(">H", flags_bytes)[0]
  411. if version != BITMAP_VERSION:
  412. raise ValueError(f"Unsupported bitmap version: {version}")
  413. # Read entry count
  414. entry_count_bytes = f.read(4)
  415. if len(entry_count_bytes) < 4:
  416. raise ValueError("Missing entry count")
  417. entry_count = struct.unpack(">I", entry_count_bytes)[0]
  418. # Read pack checksum
  419. pack_checksum = f.read(20)
  420. if len(pack_checksum) < 20:
  421. raise ValueError("Missing pack checksum")
  422. bitmap = PackBitmap(version=version, flags=flags)
  423. bitmap.pack_checksum = pack_checksum
  424. # Read type bitmaps (EWAH bitmaps are self-describing)
  425. for i, type_bitmap in enumerate(
  426. [
  427. bitmap.commit_bitmap,
  428. bitmap.tree_bitmap,
  429. bitmap.blob_bitmap,
  430. bitmap.tag_bitmap,
  431. ]
  432. ):
  433. # EWAH format:
  434. # 4 bytes: bit count
  435. # 4 bytes: word count
  436. # N x 8 bytes: compressed words
  437. # 4 bytes: RLW position
  438. # Read header to determine size
  439. bit_count_bytes = f.read(4)
  440. word_count_bytes = f.read(4)
  441. if len(bit_count_bytes) < 4 or len(word_count_bytes) < 4:
  442. raise ValueError(f"Missing type bitmap {i} header")
  443. word_count = struct.unpack(">I", word_count_bytes)[0]
  444. # Read compressed words
  445. words_data = f.read(word_count * 8)
  446. if len(words_data) < word_count * 8:
  447. raise ValueError(f"Incomplete type bitmap {i} data")
  448. # Read RLW position
  449. rlw_pos_bytes = f.read(4)
  450. if len(rlw_pos_bytes) < 4:
  451. raise ValueError(f"Missing type bitmap {i} RLW position")
  452. # Reconstruct the full EWAH data to pass to _decode
  453. ewah_data = bit_count_bytes + word_count_bytes + words_data + rlw_pos_bytes
  454. type_bitmap._decode(ewah_data)
  455. # Read bitmap entries
  456. for _ in range(entry_count):
  457. # Read object position (4 bytes)
  458. obj_pos_bytes = f.read(4)
  459. if len(obj_pos_bytes) < 4:
  460. raise ValueError("Incomplete bitmap entry")
  461. obj_pos = struct.unpack(">I", obj_pos_bytes)[0]
  462. # Read XOR offset (1 byte)
  463. xor_offset_bytes = f.read(1)
  464. if len(xor_offset_bytes) < 1:
  465. raise ValueError("Missing XOR offset")
  466. xor_offset = xor_offset_bytes[0]
  467. # Read flags (1 byte)
  468. flags_bytes = f.read(1)
  469. if len(flags_bytes) < 1:
  470. raise ValueError("Missing entry flags")
  471. entry_flags = flags_bytes[0]
  472. # Read self-describing EWAH bitmap
  473. # EWAH format: bit_count (4) + word_count (4) + words + rlw_pos (4)
  474. bit_count_bytes = f.read(4)
  475. word_count_bytes = f.read(4)
  476. if len(bit_count_bytes) < 4 or len(word_count_bytes) < 4:
  477. raise ValueError("Incomplete bitmap entry EWAH header")
  478. word_count = struct.unpack(">I", word_count_bytes)[0]
  479. # Read compressed words
  480. words_data = f.read(word_count * 8)
  481. if len(words_data) < word_count * 8:
  482. raise ValueError("Incomplete bitmap entry EWAH words")
  483. # Read RLW position
  484. rlw_pos_bytes = f.read(4)
  485. if len(rlw_pos_bytes) < 4:
  486. raise ValueError("Missing bitmap entry EWAH RLW position")
  487. # Reconstruct full EWAH data
  488. bitmap_data = bit_count_bytes + word_count_bytes + words_data + rlw_pos_bytes
  489. # Create bitmap entry
  490. ewah_bitmap = EWAHBitmap(bitmap_data) if word_count > 0 else EWAHBitmap()
  491. entry = BitmapEntry(
  492. object_pos=obj_pos,
  493. xor_offset=xor_offset,
  494. flags=entry_flags,
  495. bitmap=ewah_bitmap,
  496. )
  497. # Resolve object position to SHA if we have a pack index
  498. if pack_index is not None:
  499. # Get the SHA at the given position in the sorted index
  500. sha = None
  501. for idx, (entry_sha, _offset, _crc32) in enumerate(
  502. pack_index.iterentries()
  503. ):
  504. if idx == obj_pos:
  505. sha = entry_sha
  506. break
  507. if sha is not None:
  508. bitmap.entries[sha] = entry
  509. bitmap.entries_list.append((sha, entry))
  510. else:
  511. # Without pack index, use position as temporary key
  512. temp_key = obj_pos.to_bytes(4, byteorder="big")
  513. bitmap.entries[temp_key] = entry
  514. bitmap.entries_list.append((temp_key, entry))
  515. # Read optional lookup table
  516. if flags & BITMAP_OPT_LOOKUP_TABLE:
  517. # Lookup table contains triplets: (commit_pos, offset, xor_row)
  518. # Number of entries matches the bitmap entry count
  519. lookup_table = []
  520. for _ in range(entry_count):
  521. # Read commit position (4 bytes)
  522. commit_pos_bytes = f.read(4)
  523. if len(commit_pos_bytes) < 4:
  524. break
  525. commit_pos = struct.unpack(">I", commit_pos_bytes)[0]
  526. # Read file offset (8 bytes)
  527. offset_bytes = f.read(8)
  528. if len(offset_bytes) < 8:
  529. break
  530. offset = struct.unpack(">Q", offset_bytes)[0]
  531. # Read XOR row (4 bytes)
  532. xor_row_bytes = f.read(4)
  533. if len(xor_row_bytes) < 4:
  534. break
  535. xor_row = struct.unpack(">I", xor_row_bytes)[0]
  536. lookup_table.append((commit_pos, offset, xor_row))
  537. bitmap.lookup_table = lookup_table
  538. # Read optional name-hash cache
  539. if flags & BITMAP_OPT_HASH_CACHE:
  540. # Name-hash cache contains one 32-bit hash per object in the pack
  541. # The number of hashes depends on the total number of objects
  542. # For now, we'll read what's available
  543. name_hash_cache = []
  544. while True:
  545. hash_bytes = f.read(4)
  546. if len(hash_bytes) < 4:
  547. break
  548. hash_value = struct.unpack(">I", hash_bytes)[0]
  549. name_hash_cache.append(hash_value)
  550. if name_hash_cache:
  551. bitmap.name_hash_cache = name_hash_cache
  552. return bitmap
  553. def write_bitmap(
  554. filename: str | os.PathLike[str],
  555. bitmap: PackBitmap,
  556. ) -> None:
  557. """Write a bitmap index file.
  558. Args:
  559. filename: Path to write the .bitmap file
  560. bitmap: PackBitmap to write
  561. """
  562. with GitFile(filename, "wb") as f:
  563. write_bitmap_file(f, bitmap)
  564. def write_bitmap_file(f: IO[bytes], bitmap: PackBitmap) -> None:
  565. """Write bitmap data to a file object.
  566. Args:
  567. f: File object to write to
  568. bitmap: PackBitmap to write
  569. """
  570. # Write header
  571. f.write(BITMAP_SIGNATURE)
  572. f.write(struct.pack(">H", bitmap.version))
  573. f.write(struct.pack(">H", bitmap.flags))
  574. # Write entry count
  575. f.write(struct.pack(">I", len(bitmap.entries)))
  576. # Write pack checksum
  577. if bitmap.pack_checksum:
  578. f.write(bitmap.pack_checksum)
  579. else:
  580. f.write(b"\x00" * 20)
  581. # Write type bitmaps (self-describing EWAH format, no size prefix needed)
  582. for type_bitmap in [
  583. bitmap.commit_bitmap,
  584. bitmap.tree_bitmap,
  585. bitmap.blob_bitmap,
  586. bitmap.tag_bitmap,
  587. ]:
  588. data = type_bitmap.encode()
  589. f.write(data)
  590. # Write bitmap entries
  591. for _sha, entry in bitmap.entries.items():
  592. # Write object position (4 bytes)
  593. f.write(struct.pack(">I", entry.object_pos))
  594. # Write XOR offset (1 byte)
  595. f.write(bytes([entry.xor_offset]))
  596. # Write flags (1 byte)
  597. f.write(bytes([entry.flags]))
  598. # Write compressed bitmap data (self-describing EWAH format, no size prefix)
  599. bitmap_data = entry.bitmap.encode()
  600. f.write(bitmap_data)
  601. # Write optional lookup table
  602. if bitmap.flags & BITMAP_OPT_LOOKUP_TABLE and bitmap.lookup_table:
  603. for commit_pos, offset, xor_row in bitmap.lookup_table:
  604. f.write(struct.pack(">I", commit_pos)) # 4 bytes
  605. f.write(struct.pack(">Q", offset)) # 8 bytes
  606. f.write(struct.pack(">I", xor_row)) # 4 bytes
  607. # Write optional name-hash cache
  608. if bitmap.flags & BITMAP_OPT_HASH_CACHE and bitmap.name_hash_cache:
  609. for hash_value in bitmap.name_hash_cache:
  610. f.write(struct.pack(">I", hash_value))
  611. def _compute_name_hash(name: bytes) -> int:
  612. """Compute the name hash for a tree entry.
  613. This is the same algorithm Git uses for the name-hash cache.
  614. Args:
  615. name: The name of the tree entry
  616. Returns:
  617. 32-bit hash value
  618. """
  619. hash_value = 0
  620. for byte in name:
  621. hash_value = (hash_value >> 19) | (hash_value << 13)
  622. hash_value += byte
  623. hash_value &= 0xFFFFFFFF
  624. return hash_value
  625. def select_bitmap_commits(
  626. refs: dict[bytes, bytes],
  627. object_store: "BaseObjectStore",
  628. commit_interval: int = DEFAULT_COMMIT_INTERVAL,
  629. ) -> list[bytes]:
  630. """Select commits for bitmap generation.
  631. Uses Git's strategy:
  632. - All branch and tag tips
  633. - Every Nth commit in history
  634. Args:
  635. refs: Dictionary of ref names to commit SHAs
  636. object_store: Object store to read commits from
  637. commit_interval: Include every Nth commit in history
  638. Returns:
  639. List of commit SHAs to create bitmaps for
  640. """
  641. selected = set()
  642. seen = set()
  643. # Start with all refs
  644. ref_commits = set()
  645. for ref_name, sha in refs.items():
  646. try:
  647. obj = object_store[sha]
  648. except KeyError:
  649. continue
  650. else:
  651. # Dereference tags to get to commits
  652. while isinstance(obj, Tag):
  653. obj = object_store[obj.object[1]]
  654. if isinstance(obj, Commit):
  655. ref_commits.add(obj.id)
  656. # Add all ref tips
  657. selected.update(ref_commits)
  658. # Walk the commit graph and select every Nth commit
  659. queue = deque(ref_commits)
  660. commit_count = 0
  661. while queue:
  662. commit_sha = queue.popleft()
  663. if commit_sha in seen:
  664. continue
  665. seen.add(commit_sha)
  666. try:
  667. obj = object_store[commit_sha]
  668. if not isinstance(obj, Commit):
  669. continue
  670. commit_count += 1
  671. if commit_count % commit_interval == 0:
  672. selected.add(commit_sha)
  673. # Add parents to queue
  674. for parent in obj.parents:
  675. if parent not in seen:
  676. queue.append(parent)
  677. except KeyError:
  678. continue
  679. return sorted(selected)
  680. def build_reachability_bitmap(
  681. commit_sha: bytes,
  682. sha_to_pos: dict[bytes, int],
  683. object_store: "BaseObjectStore",
  684. ) -> EWAHBitmap:
  685. """Build a reachability bitmap for a commit.
  686. The bitmap has a bit set for each object that is reachable from the commit.
  687. The bit position corresponds to the object's position in the pack index.
  688. Args:
  689. commit_sha: The commit to build a bitmap for
  690. sha_to_pos: Pre-built mapping from SHA to position in pack
  691. object_store: Object store to traverse objects
  692. Returns:
  693. EWAH bitmap with bits set for reachable objects
  694. """
  695. bitmap = EWAHBitmap()
  696. # Traverse all objects reachable from the commit
  697. seen = set()
  698. queue = deque([commit_sha])
  699. while queue:
  700. sha = queue.popleft()
  701. if sha in seen:
  702. continue
  703. seen.add(sha)
  704. # Add this object to the bitmap if it's in the pack
  705. if sha in sha_to_pos:
  706. bitmap.add(sha_to_pos[sha])
  707. # Get the object and traverse its references
  708. try:
  709. obj = object_store[sha]
  710. if isinstance(obj, Commit):
  711. # Add parents and tree
  712. queue.append(obj.tree)
  713. queue.extend(obj.parents)
  714. elif hasattr(obj, "items"):
  715. # Tree object - add all entries
  716. for item in obj.items():
  717. queue.append(item.sha)
  718. except KeyError:
  719. # Object not in store, skip it
  720. continue
  721. return bitmap
  722. def apply_xor_compression(
  723. bitmaps: list[tuple[bytes, EWAHBitmap]],
  724. max_xor_offset: int = MAX_XOR_OFFSET,
  725. ) -> list[tuple[bytes, EWAHBitmap, int]]:
  726. """Apply XOR compression to bitmaps.
  727. XOR compression stores some bitmaps as XOR differences from previous bitmaps,
  728. reducing storage size when bitmaps are similar.
  729. Args:
  730. bitmaps: List of (commit_sha, bitmap) tuples
  731. max_xor_offset: Maximum offset to search for XOR base
  732. Returns:
  733. List of (commit_sha, bitmap, xor_offset) tuples
  734. """
  735. compressed = []
  736. for i, (sha, bitmap) in enumerate(bitmaps):
  737. best_xor_offset = 0
  738. best_size = len(bitmap.encode())
  739. best_xor_bitmap = bitmap
  740. # Try XORing with previous bitmaps within max_xor_offset
  741. for offset in range(1, min(i + 1, max_xor_offset + 1)):
  742. _prev_sha, prev_bitmap = bitmaps[i - offset]
  743. xor_bitmap = bitmap ^ prev_bitmap
  744. xor_size = len(xor_bitmap.encode())
  745. # Use XOR if it reduces size
  746. if xor_size < best_size:
  747. best_size = xor_size
  748. best_xor_offset = offset
  749. best_xor_bitmap = xor_bitmap
  750. compressed.append((sha, best_xor_bitmap, best_xor_offset))
  751. return compressed
  752. def build_type_bitmaps(
  753. sha_to_pos: dict[bytes, int],
  754. object_store: "BaseObjectStore",
  755. ) -> tuple[EWAHBitmap, EWAHBitmap, EWAHBitmap, EWAHBitmap]:
  756. """Build type bitmaps for all objects in a pack.
  757. Type bitmaps classify objects by type: commit, tree, blob, or tag.
  758. Args:
  759. sha_to_pos: Pre-built mapping from SHA to position in pack
  760. object_store: Object store to read object types
  761. Returns:
  762. Tuple of (commit_bitmap, tree_bitmap, blob_bitmap, tag_bitmap)
  763. """
  764. commit_bitmap = EWAHBitmap()
  765. tree_bitmap = EWAHBitmap()
  766. blob_bitmap = EWAHBitmap()
  767. tag_bitmap = EWAHBitmap()
  768. for sha, pos in sha_to_pos.items():
  769. try:
  770. obj = object_store[sha]
  771. except KeyError:
  772. pass
  773. else:
  774. obj_type = obj.type_num
  775. if obj_type == Commit.type_num:
  776. commit_bitmap.add(pos)
  777. elif obj_type == Tree.type_num:
  778. tree_bitmap.add(pos)
  779. elif obj_type == Blob.type_num:
  780. blob_bitmap.add(pos)
  781. elif obj_type == Tag.type_num:
  782. tag_bitmap.add(pos)
  783. return commit_bitmap, tree_bitmap, blob_bitmap, tag_bitmap
  784. def build_name_hash_cache(
  785. sha_to_pos: dict[bytes, int],
  786. object_store: "BaseObjectStore",
  787. ) -> list[int]:
  788. """Build name-hash cache for all objects in a pack.
  789. The name-hash cache stores a hash of the name for each object,
  790. which can speed up path-based operations.
  791. Args:
  792. sha_to_pos: Pre-built mapping from SHA to position in pack
  793. object_store: Object store to read objects
  794. Returns:
  795. List of 32-bit hash values, one per object in the pack
  796. """
  797. # Pre-allocate list with correct size
  798. num_objects = len(sha_to_pos)
  799. name_hashes = [0] * num_objects
  800. for sha, pos in sha_to_pos.items():
  801. try:
  802. obj = object_store[sha]
  803. except KeyError:
  804. # Object not in store, use zero hash
  805. pass
  806. else:
  807. # For tree entries, use the tree entry name
  808. # For commits, use the tree SHA
  809. # For other objects, use the object SHA
  810. if isinstance(obj, Tree):
  811. # Tree object - use the SHA as the name
  812. name_hash = _compute_name_hash(sha)
  813. elif isinstance(obj, Commit):
  814. # Commit - use the tree SHA as the name
  815. name_hash = _compute_name_hash(obj.tree)
  816. else:
  817. # Other objects - use the SHA as the name
  818. name_hash = _compute_name_hash(sha)
  819. name_hashes[pos] = name_hash
  820. return name_hashes
  821. def generate_bitmap(
  822. pack_index: "PackIndex",
  823. object_store: "BaseObjectStore",
  824. refs: dict[bytes, bytes],
  825. pack_checksum: bytes,
  826. include_hash_cache: bool = True,
  827. include_lookup_table: bool = True,
  828. commit_interval: int = DEFAULT_COMMIT_INTERVAL,
  829. progress: Callable[[str], None] | None = None,
  830. ) -> PackBitmap:
  831. """Generate a complete bitmap for a pack.
  832. Args:
  833. pack_index: Pack index for the pack
  834. object_store: Object store to read objects from
  835. refs: Dictionary of ref names to commit SHAs
  836. pack_checksum: SHA-1 checksum of the pack file
  837. include_hash_cache: Whether to include name-hash cache
  838. include_lookup_table: Whether to include lookup table
  839. commit_interval: Include every Nth commit in history
  840. progress: Optional progress reporting callback
  841. Returns:
  842. Complete PackBitmap ready to write to disk
  843. """
  844. if progress:
  845. progress("Building pack index mapping")
  846. # Build mapping from SHA to position in pack index ONCE
  847. # This is used by all subsequent operations and avoids repeated enumeration
  848. sha_to_pos = {}
  849. for pos, (sha, _offset, _crc32) in enumerate(pack_index.iterentries()):
  850. sha_to_pos[sha] = pos
  851. if progress:
  852. progress("Selecting commits for bitmap")
  853. # Select commits to create bitmaps for
  854. selected_commits = select_bitmap_commits(refs, object_store, commit_interval)
  855. if progress:
  856. progress(f"Building bitmaps for {len(selected_commits)} commits")
  857. # Build reachability bitmaps for selected commits
  858. commit_bitmaps = []
  859. for i, commit_sha in enumerate(selected_commits):
  860. if progress and i % 10 == 0:
  861. progress(f"Building bitmap {i + 1}/{len(selected_commits)}")
  862. bitmap = build_reachability_bitmap(commit_sha, sha_to_pos, object_store)
  863. commit_bitmaps.append((commit_sha, bitmap))
  864. if progress:
  865. progress("Applying XOR compression")
  866. # Apply XOR compression
  867. compressed_bitmaps = apply_xor_compression(commit_bitmaps)
  868. if progress:
  869. progress("Building type bitmaps")
  870. # Build type bitmaps (using pre-built sha_to_pos mapping)
  871. commit_type_bitmap, tree_type_bitmap, blob_type_bitmap, tag_type_bitmap = (
  872. build_type_bitmaps(sha_to_pos, object_store)
  873. )
  874. # Create PackBitmap
  875. flags = BITMAP_OPT_FULL_DAG
  876. if include_hash_cache:
  877. flags |= BITMAP_OPT_HASH_CACHE
  878. if include_lookup_table:
  879. flags |= BITMAP_OPT_LOOKUP_TABLE
  880. pack_bitmap = PackBitmap(version=1, flags=flags)
  881. pack_bitmap.pack_checksum = pack_checksum
  882. pack_bitmap.commit_bitmap = commit_type_bitmap
  883. pack_bitmap.tree_bitmap = tree_type_bitmap
  884. pack_bitmap.blob_bitmap = blob_type_bitmap
  885. pack_bitmap.tag_bitmap = tag_type_bitmap
  886. # Add bitmap entries
  887. for commit_sha, xor_bitmap, xor_offset in compressed_bitmaps:
  888. if commit_sha not in sha_to_pos:
  889. continue
  890. entry = BitmapEntry(
  891. object_pos=sha_to_pos[commit_sha],
  892. xor_offset=xor_offset,
  893. flags=0,
  894. bitmap=xor_bitmap,
  895. )
  896. pack_bitmap.entries[commit_sha] = entry
  897. pack_bitmap.entries_list.append((commit_sha, entry))
  898. # Build optional name-hash cache (using pre-built sha_to_pos mapping)
  899. if include_hash_cache:
  900. if progress:
  901. progress("Building name-hash cache")
  902. pack_bitmap.name_hash_cache = build_name_hash_cache(sha_to_pos, object_store)
  903. # Build optional lookup table
  904. if include_lookup_table:
  905. if progress:
  906. progress("Building lookup table")
  907. # The lookup table is built automatically from the entries
  908. # For now, we'll leave it as None and let the write function handle it
  909. # TODO: Implement lookup table generation if needed
  910. pack_bitmap.lookup_table = None
  911. if progress:
  912. progress("Bitmap generation complete")
  913. return pack_bitmap
  914. def find_commit_bitmaps(commit_shas: set[bytes], packs: Iterable[Pack]) -> dict[bytes, tuple]:
  915. """Find which packs have bitmaps for the given commits.
  916. Args:
  917. commit_shas: Set of commit SHAs to look for
  918. packs: Iterable of Pack objects to search
  919. Returns:
  920. Dict mapping commit SHA to (pack, pack_bitmap, position) tuple
  921. """
  922. result = {}
  923. remaining = set(commit_shas)
  924. for pack in packs:
  925. if not remaining:
  926. break
  927. pack_bitmap = pack.bitmap
  928. if not pack_bitmap:
  929. # No bitmap for this pack
  930. continue
  931. # Build SHA to position mapping for this pack
  932. sha_to_pos = {}
  933. for pos, (sha, _offset, _crc32) in enumerate(pack.index.iterentries()):
  934. sha_to_pos[sha] = pos
  935. # Check which commits have bitmaps
  936. for commit_sha in list(remaining):
  937. if pack_bitmap.has_commit(commit_sha):
  938. if commit_sha in sha_to_pos:
  939. result[commit_sha] = (pack, pack_bitmap, sha_to_pos)
  940. remaining.remove(commit_sha)
  941. return result
  942. def bitmap_to_object_shas(
  943. bitmap: EWAHBitmap,
  944. pack_index: "PackIndex",
  945. type_filter: EWAHBitmap | None = None,
  946. ) -> set[bytes]:
  947. """Convert a bitmap to a set of object SHAs.
  948. Args:
  949. bitmap: The EWAH bitmap with set bits for objects
  950. pack_index: Pack index to map positions to SHAs
  951. type_filter: Optional type bitmap to filter results (e.g., commits only)
  952. Returns:
  953. Set of object SHAs
  954. """
  955. result = set()
  956. for pos, (sha, _offset, _crc32) in enumerate(pack_index.iterentries()):
  957. # Check if this position is in the bitmap
  958. if pos in bitmap:
  959. # Apply type filter if provided
  960. if type_filter is None or pos in type_filter:
  961. result.add(sha)
  962. return result