bitmap.py 38 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256
  1. # bitmap.py -- Packfile bitmap support for git
  2. # Copyright (C) 2025 Jelmer Vernooij <jelmer@jelmer.uk>
  3. #
  4. # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
  5. # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
  6. # General Public License as published by the Free Software Foundation; version 2.0
  7. # or (at your option) any later version. You can redistribute it and/or
  8. # modify it under the terms of either of these two licenses.
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. # You should have received a copy of the licenses; if not, see
  17. # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  18. # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  19. # License, Version 2.0.
  20. #
  21. """Support for Git packfile bitmaps.
  22. Bitmaps store reachability information for packfiles, enabling faster
  23. object counting and enumeration operations without full graph traversal.
  24. The bitmap format uses EWAH (Enhanced Word-Aligned Hybrid) compression
  25. for efficient storage and fast bitwise operations.
  26. """
  27. __all__ = [
  28. "BITMAP_OPT_FULL_DAG",
  29. "BITMAP_OPT_HASH_CACHE",
  30. "BITMAP_OPT_LOOKUP_TABLE",
  31. "BITMAP_OPT_PSEUDO_MERGES",
  32. "BITMAP_SIGNATURE",
  33. "BITMAP_VERSION",
  34. "DEFAULT_COMMIT_INTERVAL",
  35. "MAX_LITERAL_WORDS",
  36. "MAX_XOR_OFFSET",
  37. "BitmapEntry",
  38. "EWAHBitmap",
  39. "PackBitmap",
  40. "apply_xor_compression",
  41. "bitmap_to_object_shas",
  42. "build_name_hash_cache",
  43. "build_reachability_bitmap",
  44. "build_type_bitmaps",
  45. "find_commit_bitmaps",
  46. "generate_bitmap",
  47. "read_bitmap",
  48. "read_bitmap_file",
  49. "select_bitmap_commits",
  50. "write_bitmap",
  51. "write_bitmap_file",
  52. ]
  53. import os
  54. import struct
  55. from collections import deque
  56. from collections.abc import Callable, Iterable, Iterator
  57. from io import BytesIO
  58. from typing import IO, TYPE_CHECKING
  59. from .file import GitFile
  60. from .objects import (
  61. Blob,
  62. Commit,
  63. ObjectID,
  64. RawObjectID,
  65. Tag,
  66. Tree,
  67. hex_to_sha,
  68. sha_to_hex,
  69. )
  70. if TYPE_CHECKING:
  71. from .object_store import BaseObjectStore
  72. from .pack import Pack, PackIndex
  73. from .refs import Ref
  74. # Bitmap file signature
  75. BITMAP_SIGNATURE = b"BITM"
  76. # Bitmap format version
  77. BITMAP_VERSION = 1
  78. # Bitmap flags
  79. BITMAP_OPT_FULL_DAG = 0x1 # Full closure
  80. BITMAP_OPT_HASH_CACHE = 0x4 # Name-hash cache
  81. BITMAP_OPT_LOOKUP_TABLE = 0x10 # Lookup table for random access
  82. BITMAP_OPT_PSEUDO_MERGES = 0x20 # Pseudo-merge bitmaps
  83. # EWAH compression constants
  84. MAX_LITERAL_WORDS = 0x7FFFFFFF # Maximum literal words in EWAH format (31 bits)
  85. MAX_XOR_OFFSET = 160 # Maximum distance to search for XOR compression base
  86. DEFAULT_COMMIT_INTERVAL = 100 # Default interval for commit selection
  87. def _encode_ewah_words(words: list[int]) -> list[int]:
  88. """Encode a list of 64-bit words using EWAH run-length compression.
  89. Args:
  90. words: List of 64-bit words to encode
  91. Returns:
  92. List of compressed words (RLWs followed by literals)
  93. """
  94. compressed_words = []
  95. i = 0
  96. while i < len(words):
  97. # Check for runs of all zeros or all ones
  98. if words[i] == 0 or words[i] == 0xFFFFFFFFFFFFFFFF:
  99. # Count consecutive identical words
  100. run_value = words[i]
  101. run_length = 0
  102. while i < len(words) and words[i] == run_value:
  103. run_length += 1
  104. i += 1
  105. # Collect following literal words
  106. literals = []
  107. while i < len(words) and words[i] != 0 and words[i] != 0xFFFFFFFFFFFFFFFF:
  108. literals.append(words[i])
  109. i += 1
  110. if len(literals) >= MAX_LITERAL_WORDS:
  111. break
  112. # Create RLW with correct bit layout:
  113. # [literal_words(31 bits)][running_len(32 bits)][running_bit(1 bit)]
  114. running_bit = 1 if run_value == 0xFFFFFFFFFFFFFFFF else 0
  115. rlw = (len(literals) << 33) | (run_length << 1) | running_bit
  116. compressed_words.append(rlw)
  117. compressed_words.extend(literals)
  118. else:
  119. # Collect literal words
  120. literals = []
  121. while i < len(words) and words[i] != 0 and words[i] != 0xFFFFFFFFFFFFFFFF:
  122. literals.append(words[i])
  123. i += 1
  124. if len(literals) >= MAX_LITERAL_WORDS:
  125. break
  126. # RLW with no run, just literals
  127. # [literal_words(31 bits)][running_len(32 bits)][running_bit(1 bit)]
  128. rlw = (len(literals) << 33) | (0 << 1) | 0
  129. compressed_words.append(rlw)
  130. compressed_words.extend(literals)
  131. return compressed_words
  132. class EWAHBitmap:
  133. """EWAH (Enhanced Word-Aligned Hybrid) compressed bitmap.
  134. EWAH uses run-length encoding for efficient bitmap storage.
  135. Each bitmap consists of:
  136. - Uncompressed bit count (4 bytes)
  137. - Compressed word count (4 bytes)
  138. - Compressed words (8 bytes each)
  139. - Current RLW position (4 bytes)
  140. Each Run Length Word (RLW) 64-bit layout (LSB to MSB):
  141. - Bit 0: running_bit (1 bit) - value of repeated words (0 or 1)
  142. - Bits 1-32: running_len (32 bits) - count of repeated words
  143. - Bits 33-63: literal_words (31 bits) - count of literal words following this RLW
  144. """
  145. def __init__(self, data: bytes | None = None) -> None:
  146. """Initialize EWAH bitmap.
  147. Args:
  148. data: Optional compressed bitmap data to decode
  149. """
  150. self.bits: set[int] = set()
  151. self.bit_count = 0
  152. if data:
  153. self._decode(data)
  154. def _decode(self, data: bytes) -> None:
  155. """Decode EWAH compressed bitmap data.
  156. Args:
  157. data: Compressed bitmap data (EWAH format with header + words +
  158. RLW position)
  159. """
  160. f = BytesIO(data)
  161. # Read header
  162. bit_count_bytes = f.read(4)
  163. word_count_bytes = f.read(4)
  164. if len(bit_count_bytes) < 4 or len(word_count_bytes) < 4:
  165. return
  166. bit_count = struct.unpack(">I", bit_count_bytes)[0]
  167. word_count = struct.unpack(">I", word_count_bytes)[0]
  168. self.bit_count = bit_count
  169. current_bit = 0
  170. # Read all words first
  171. words = []
  172. for _ in range(word_count):
  173. word_bytes = f.read(8)
  174. if len(word_bytes) < 8:
  175. break
  176. word = struct.unpack(">Q", word_bytes)[0]
  177. words.append(word)
  178. # Process EWAH chunks: RLW followed by literal words
  179. idx = 0
  180. while idx < len(words):
  181. # This is an RLW
  182. # Bit layout: [literal_words(31)][running_len(32)][running_bit(1)]
  183. rlw = words[idx]
  184. running_bit = rlw & 1
  185. running_len = (rlw >> 1) & 0xFFFFFFFF
  186. literal_words = rlw >> 33
  187. idx += 1
  188. # Process running bits
  189. if running_len > 0:
  190. if running_bit == 1:
  191. # Add all bits in the repeated section
  192. for i in range(running_len * 64):
  193. self.bits.add(current_bit + i)
  194. current_bit += running_len * 64
  195. # Process literal words
  196. for _ in range(literal_words):
  197. if idx >= len(words):
  198. break
  199. literal = words[idx]
  200. idx += 1
  201. # Extract set bits from literal word
  202. for i in range(64):
  203. if literal & (1 << i):
  204. self.bits.add(current_bit + i)
  205. current_bit += 64
  206. # Read RLW position (we don't use it currently, but it's part of the format)
  207. f.read(4)
  208. def encode(self) -> bytes:
  209. """Encode bitmap to EWAH compressed format.
  210. Returns:
  211. Compressed bitmap data including header, words, and RLW position
  212. """
  213. if not self.bits:
  214. # Empty bitmap: bit_count=0, word_count=0, rlw_pos=0
  215. return struct.pack(">III", 0, 0, 0)
  216. max_bit = max(self.bits) if self.bits else 0
  217. bit_count = max_bit + 1
  218. word_count = (bit_count + 63) // 64
  219. # Create literal words
  220. words = [0] * word_count
  221. for bit in self.bits:
  222. word_idx = bit // 64
  223. bit_idx = bit % 64
  224. words[word_idx] |= 1 << bit_idx
  225. # Compress using EWAH run-length encoding
  226. compressed_words = _encode_ewah_words(words)
  227. # Build EWAH data
  228. f = BytesIO()
  229. # Header
  230. f.write(struct.pack(">I", bit_count))
  231. f.write(struct.pack(">I", len(compressed_words)))
  232. # Write compressed words
  233. for word in compressed_words:
  234. f.write(struct.pack(">Q", word))
  235. # Write RLW position (position of last RLW in the compressed words)
  236. # For now, we'll use 0 as we don't track this during encoding
  237. # This could be improved in the future if needed
  238. f.write(struct.pack(">I", 0))
  239. return f.getvalue()
  240. def __contains__(self, bit: int) -> bool:
  241. """Check if a bit is set.
  242. Args:
  243. bit: Bit position to check
  244. Returns:
  245. True if bit is set, False otherwise
  246. """
  247. return bit in self.bits
  248. def __len__(self) -> int:
  249. """Return the number of set bits.
  250. Returns:
  251. Count of set bits
  252. """
  253. return len(self.bits)
  254. def __or__(self, other: "EWAHBitmap") -> "EWAHBitmap":
  255. """Bitwise OR operation.
  256. Args:
  257. other: Other bitmap to OR with
  258. Returns:
  259. New bitmap with OR result
  260. """
  261. result = EWAHBitmap()
  262. result.bits = self.bits | other.bits
  263. result.bit_count = max(self.bit_count, other.bit_count)
  264. return result
  265. def __and__(self, other: "EWAHBitmap") -> "EWAHBitmap":
  266. """Bitwise AND operation.
  267. Args:
  268. other: Other bitmap to AND with
  269. Returns:
  270. New bitmap with AND result
  271. """
  272. result = EWAHBitmap()
  273. result.bits = self.bits & other.bits
  274. result.bit_count = max(self.bit_count, other.bit_count)
  275. return result
  276. def __xor__(self, other: "EWAHBitmap") -> "EWAHBitmap":
  277. """Bitwise XOR operation.
  278. Args:
  279. other: Other bitmap to XOR with
  280. Returns:
  281. New bitmap with XOR result
  282. """
  283. result = EWAHBitmap()
  284. result.bits = self.bits ^ other.bits
  285. result.bit_count = max(self.bit_count, other.bit_count)
  286. return result
  287. def __sub__(self, other: "EWAHBitmap") -> "EWAHBitmap":
  288. """Bitwise subtraction (set difference).
  289. Returns bits that are in self but not in other.
  290. Equivalent to: self & ~other
  291. Args:
  292. other: Bitmap to subtract
  293. Returns:
  294. New bitmap with bits in self but not in other
  295. """
  296. result = EWAHBitmap()
  297. result.bits = self.bits - other.bits
  298. result.bit_count = self.bit_count
  299. return result
  300. def add(self, bit: int) -> None:
  301. """Set a bit.
  302. Args:
  303. bit: Bit position to set
  304. """
  305. self.bits.add(bit)
  306. self.bit_count = max(self.bit_count, bit + 1)
  307. class BitmapEntry:
  308. """A single bitmap entry for a commit."""
  309. def __init__(
  310. self,
  311. object_pos: int,
  312. xor_offset: int,
  313. flags: int,
  314. bitmap: EWAHBitmap,
  315. ) -> None:
  316. """Initialize a bitmap entry.
  317. Args:
  318. object_pos: Position of object in pack index
  319. xor_offset: XOR offset for compression
  320. flags: Entry flags
  321. bitmap: The EWAH bitmap data
  322. """
  323. self.object_pos = object_pos
  324. self.xor_offset = xor_offset
  325. self.flags = flags
  326. self.bitmap = bitmap
  327. class PackBitmap:
  328. """A pack bitmap index.
  329. Bitmaps store reachability information for commits in a packfile,
  330. allowing fast object enumeration without graph traversal.
  331. """
  332. def __init__(
  333. self,
  334. version: int = BITMAP_VERSION,
  335. flags: int = BITMAP_OPT_FULL_DAG,
  336. ) -> None:
  337. """Initialize a pack bitmap.
  338. Args:
  339. version: Bitmap format version
  340. flags: Bitmap flags
  341. """
  342. self.version = version
  343. self.flags = flags
  344. self.pack_checksum: bytes | None = None
  345. # Type bitmaps for commits, trees, blobs, tags
  346. self.commit_bitmap = EWAHBitmap()
  347. self.tree_bitmap = EWAHBitmap()
  348. self.blob_bitmap = EWAHBitmap()
  349. self.tag_bitmap = EWAHBitmap()
  350. # Bitmap entries indexed by commit SHA
  351. self.entries: dict[bytes, BitmapEntry] = {}
  352. # List of entries in order (for XOR offset resolution)
  353. self.entries_list: list[tuple[bytes, BitmapEntry]] = []
  354. # Optional lookup table for random access
  355. self.lookup_table: list[tuple[int, int, int]] | None = None
  356. # Optional name-hash cache
  357. self.name_hash_cache: list[int] | None = None
  358. def get_bitmap(self, commit_sha: bytes) -> EWAHBitmap | None:
  359. """Get the bitmap for a commit.
  360. Args:
  361. commit_sha: SHA-1 of the commit
  362. Returns:
  363. EWAH bitmap or None if not found
  364. """
  365. entry = self.entries.get(commit_sha)
  366. if entry is None:
  367. return None
  368. # Decompress using XOR if needed
  369. if entry.xor_offset > 0:
  370. # Find the entry at the XOR offset
  371. # The XOR offset tells us how many entries back to look
  372. # We need to find this entry in the ordered list
  373. try:
  374. current_idx = next(
  375. i
  376. for i, (sha, _) in enumerate(self.entries_list)
  377. if sha == commit_sha
  378. )
  379. except StopIteration:
  380. # Entry not found in list, return as-is
  381. return entry.bitmap
  382. # XOR offset is how many positions back to look
  383. if current_idx >= entry.xor_offset:
  384. base_sha, _base_entry = self.entries_list[
  385. current_idx - entry.xor_offset
  386. ]
  387. # Get the base bitmap (recursively if it also uses XOR)
  388. base_bitmap = self.get_bitmap(base_sha)
  389. if base_bitmap is not None:
  390. # XOR the current bitmap with the base
  391. return entry.bitmap ^ base_bitmap
  392. return entry.bitmap
  393. def has_commit(self, commit_sha: bytes) -> bool:
  394. """Check if a commit has a bitmap.
  395. Args:
  396. commit_sha: SHA-1 of the commit
  397. Returns:
  398. True if bitmap exists for this commit
  399. """
  400. return commit_sha in self.entries
  401. def iter_commits(self) -> Iterator[bytes]:
  402. """Iterate over all commits with bitmaps.
  403. Returns:
  404. Iterator of commit SHAs
  405. """
  406. return iter(self.entries.keys())
  407. def read_bitmap(
  408. filename: str | os.PathLike[str],
  409. pack_index: "PackIndex | None" = None,
  410. ) -> PackBitmap:
  411. """Read a bitmap index file.
  412. Args:
  413. filename: Path to the .bitmap file
  414. pack_index: Optional PackIndex to resolve object positions to SHAs
  415. Returns:
  416. Loaded PackBitmap
  417. Raises:
  418. ValueError: If file format is invalid
  419. ChecksumMismatch: If checksum verification fails
  420. """
  421. with GitFile(filename, "rb") as f:
  422. return read_bitmap_file(f, pack_index=pack_index)
  423. def read_bitmap_file(f: IO[bytes], pack_index: "PackIndex | None" = None) -> PackBitmap:
  424. """Read bitmap data from a file object.
  425. Args:
  426. f: File object to read from
  427. pack_index: Optional PackIndex to resolve object positions to SHAs
  428. Returns:
  429. Loaded PackBitmap
  430. Raises:
  431. ValueError: If file format is invalid
  432. """
  433. # Read header
  434. signature = f.read(4)
  435. if signature != BITMAP_SIGNATURE:
  436. raise ValueError(
  437. f"Invalid bitmap signature: {signature!r}, expected {BITMAP_SIGNATURE!r}"
  438. )
  439. version_bytes = f.read(2)
  440. flags_bytes = f.read(2)
  441. if len(version_bytes) < 2 or len(flags_bytes) < 2:
  442. raise ValueError("Incomplete bitmap header")
  443. version = struct.unpack(">H", version_bytes)[0]
  444. flags = struct.unpack(">H", flags_bytes)[0]
  445. if version != BITMAP_VERSION:
  446. raise ValueError(f"Unsupported bitmap version: {version}")
  447. # Read entry count
  448. entry_count_bytes = f.read(4)
  449. if len(entry_count_bytes) < 4:
  450. raise ValueError("Missing entry count")
  451. entry_count = struct.unpack(">I", entry_count_bytes)[0]
  452. # Read pack checksum
  453. pack_checksum = f.read(20)
  454. if len(pack_checksum) < 20:
  455. raise ValueError("Missing pack checksum")
  456. bitmap = PackBitmap(version=version, flags=flags)
  457. bitmap.pack_checksum = pack_checksum
  458. # Read type bitmaps (EWAH bitmaps are self-describing)
  459. for i, type_bitmap in enumerate(
  460. [
  461. bitmap.commit_bitmap,
  462. bitmap.tree_bitmap,
  463. bitmap.blob_bitmap,
  464. bitmap.tag_bitmap,
  465. ]
  466. ):
  467. # EWAH format:
  468. # 4 bytes: bit count
  469. # 4 bytes: word count
  470. # N x 8 bytes: compressed words
  471. # 4 bytes: RLW position
  472. # Read header to determine size
  473. bit_count_bytes = f.read(4)
  474. word_count_bytes = f.read(4)
  475. if len(bit_count_bytes) < 4 or len(word_count_bytes) < 4:
  476. raise ValueError(f"Missing type bitmap {i} header")
  477. word_count = struct.unpack(">I", word_count_bytes)[0]
  478. # Read compressed words
  479. words_data = f.read(word_count * 8)
  480. if len(words_data) < word_count * 8:
  481. raise ValueError(f"Incomplete type bitmap {i} data")
  482. # Read RLW position
  483. rlw_pos_bytes = f.read(4)
  484. if len(rlw_pos_bytes) < 4:
  485. raise ValueError(f"Missing type bitmap {i} RLW position")
  486. # Reconstruct the full EWAH data to pass to _decode
  487. ewah_data = bit_count_bytes + word_count_bytes + words_data + rlw_pos_bytes
  488. type_bitmap._decode(ewah_data)
  489. # Read bitmap entries
  490. for _ in range(entry_count):
  491. # Read object position (4 bytes)
  492. obj_pos_bytes = f.read(4)
  493. if len(obj_pos_bytes) < 4:
  494. raise ValueError("Incomplete bitmap entry")
  495. obj_pos = struct.unpack(">I", obj_pos_bytes)[0]
  496. # Read XOR offset (1 byte)
  497. xor_offset_bytes = f.read(1)
  498. if len(xor_offset_bytes) < 1:
  499. raise ValueError("Missing XOR offset")
  500. xor_offset = xor_offset_bytes[0]
  501. # Read flags (1 byte)
  502. flags_bytes = f.read(1)
  503. if len(flags_bytes) < 1:
  504. raise ValueError("Missing entry flags")
  505. entry_flags = flags_bytes[0]
  506. # Read self-describing EWAH bitmap
  507. # EWAH format: bit_count (4) + word_count (4) + words + rlw_pos (4)
  508. bit_count_bytes = f.read(4)
  509. word_count_bytes = f.read(4)
  510. if len(bit_count_bytes) < 4 or len(word_count_bytes) < 4:
  511. raise ValueError("Incomplete bitmap entry EWAH header")
  512. word_count = struct.unpack(">I", word_count_bytes)[0]
  513. # Read compressed words
  514. words_data = f.read(word_count * 8)
  515. if len(words_data) < word_count * 8:
  516. raise ValueError("Incomplete bitmap entry EWAH words")
  517. # Read RLW position
  518. rlw_pos_bytes = f.read(4)
  519. if len(rlw_pos_bytes) < 4:
  520. raise ValueError("Missing bitmap entry EWAH RLW position")
  521. # Reconstruct full EWAH data
  522. bitmap_data = bit_count_bytes + word_count_bytes + words_data + rlw_pos_bytes
  523. # Create bitmap entry
  524. ewah_bitmap = EWAHBitmap(bitmap_data) if word_count > 0 else EWAHBitmap()
  525. entry = BitmapEntry(
  526. object_pos=obj_pos,
  527. xor_offset=xor_offset,
  528. flags=entry_flags,
  529. bitmap=ewah_bitmap,
  530. )
  531. # Resolve object position to SHA if we have a pack index
  532. if pack_index is not None:
  533. # Get the SHA at the given position in the sorted index
  534. sha = None
  535. for idx, (entry_sha, _offset, _crc32) in enumerate(
  536. pack_index.iterentries()
  537. ):
  538. if idx == obj_pos:
  539. sha = entry_sha
  540. break
  541. if sha is not None:
  542. bitmap.entries[sha] = entry
  543. bitmap.entries_list.append((sha, entry))
  544. else:
  545. # Without pack index, use position as temporary key
  546. temp_key = obj_pos.to_bytes(4, byteorder="big")
  547. bitmap.entries[temp_key] = entry
  548. bitmap.entries_list.append((temp_key, entry))
  549. # Read optional lookup table
  550. if flags & BITMAP_OPT_LOOKUP_TABLE:
  551. # Lookup table contains triplets: (commit_pos, offset, xor_row)
  552. # Number of entries matches the bitmap entry count
  553. lookup_table = []
  554. for _ in range(entry_count):
  555. # Read commit position (4 bytes)
  556. commit_pos_bytes = f.read(4)
  557. if len(commit_pos_bytes) < 4:
  558. break
  559. commit_pos = struct.unpack(">I", commit_pos_bytes)[0]
  560. # Read file offset (8 bytes)
  561. offset_bytes = f.read(8)
  562. if len(offset_bytes) < 8:
  563. break
  564. offset = struct.unpack(">Q", offset_bytes)[0]
  565. # Read XOR row (4 bytes)
  566. xor_row_bytes = f.read(4)
  567. if len(xor_row_bytes) < 4:
  568. break
  569. xor_row = struct.unpack(">I", xor_row_bytes)[0]
  570. lookup_table.append((commit_pos, offset, xor_row))
  571. bitmap.lookup_table = lookup_table
  572. # Read optional name-hash cache
  573. if flags & BITMAP_OPT_HASH_CACHE:
  574. # Name-hash cache contains one 32-bit hash per object in the pack
  575. # The number of hashes depends on the total number of objects
  576. # For now, we'll read what's available
  577. name_hash_cache = []
  578. while True:
  579. hash_bytes = f.read(4)
  580. if len(hash_bytes) < 4:
  581. break
  582. hash_value = struct.unpack(">I", hash_bytes)[0]
  583. name_hash_cache.append(hash_value)
  584. if name_hash_cache:
  585. bitmap.name_hash_cache = name_hash_cache
  586. return bitmap
  587. def write_bitmap(
  588. filename: str | os.PathLike[str],
  589. bitmap: PackBitmap,
  590. ) -> None:
  591. """Write a bitmap index file.
  592. Args:
  593. filename: Path to write the .bitmap file
  594. bitmap: PackBitmap to write
  595. """
  596. with GitFile(filename, "wb") as f:
  597. write_bitmap_file(f, bitmap)
  598. def write_bitmap_file(f: IO[bytes], bitmap: PackBitmap) -> None:
  599. """Write bitmap data to a file object.
  600. Args:
  601. f: File object to write to
  602. bitmap: PackBitmap to write
  603. """
  604. # Write header
  605. f.write(BITMAP_SIGNATURE)
  606. f.write(struct.pack(">H", bitmap.version))
  607. f.write(struct.pack(">H", bitmap.flags))
  608. # Write entry count
  609. f.write(struct.pack(">I", len(bitmap.entries)))
  610. # Write pack checksum
  611. if bitmap.pack_checksum:
  612. f.write(bitmap.pack_checksum)
  613. else:
  614. f.write(b"\x00" * 20)
  615. # Write type bitmaps (self-describing EWAH format, no size prefix needed)
  616. for type_bitmap in [
  617. bitmap.commit_bitmap,
  618. bitmap.tree_bitmap,
  619. bitmap.blob_bitmap,
  620. bitmap.tag_bitmap,
  621. ]:
  622. data = type_bitmap.encode()
  623. f.write(data)
  624. # Write bitmap entries
  625. for _sha, entry in bitmap.entries.items():
  626. # Write object position (4 bytes)
  627. f.write(struct.pack(">I", entry.object_pos))
  628. # Write XOR offset (1 byte)
  629. f.write(bytes([entry.xor_offset]))
  630. # Write flags (1 byte)
  631. f.write(bytes([entry.flags]))
  632. # Write compressed bitmap data (self-describing EWAH format, no size prefix)
  633. bitmap_data = entry.bitmap.encode()
  634. f.write(bitmap_data)
  635. # Write optional lookup table
  636. if bitmap.flags & BITMAP_OPT_LOOKUP_TABLE and bitmap.lookup_table:
  637. for commit_pos, offset, xor_row in bitmap.lookup_table:
  638. f.write(struct.pack(">I", commit_pos)) # 4 bytes
  639. f.write(struct.pack(">Q", offset)) # 8 bytes
  640. f.write(struct.pack(">I", xor_row)) # 4 bytes
  641. # Write optional name-hash cache
  642. if bitmap.flags & BITMAP_OPT_HASH_CACHE and bitmap.name_hash_cache:
  643. for hash_value in bitmap.name_hash_cache:
  644. f.write(struct.pack(">I", hash_value))
  645. def _compute_name_hash(name: bytes) -> int:
  646. """Compute the name hash for a tree entry.
  647. This is the same algorithm Git uses for the name-hash cache.
  648. Args:
  649. name: The name of the tree entry
  650. Returns:
  651. 32-bit hash value
  652. """
  653. hash_value = 0
  654. for byte in name:
  655. hash_value = (hash_value >> 19) | (hash_value << 13)
  656. hash_value += byte
  657. hash_value &= 0xFFFFFFFF
  658. return hash_value
  659. def select_bitmap_commits(
  660. refs: dict["Ref", ObjectID],
  661. object_store: "BaseObjectStore",
  662. commit_interval: int = DEFAULT_COMMIT_INTERVAL,
  663. ) -> list[ObjectID]:
  664. """Select commits for bitmap generation.
  665. Uses Git's strategy:
  666. - All branch and tag tips
  667. - Every Nth commit in history
  668. Args:
  669. refs: Dictionary of ref names to commit SHAs
  670. object_store: Object store to read commits from
  671. commit_interval: Include every Nth commit in history
  672. Returns:
  673. List of commit SHAs to create bitmaps for
  674. """
  675. selected = set()
  676. seen = set()
  677. # Start with all refs
  678. ref_commits = set()
  679. for ref_name, sha in refs.items():
  680. try:
  681. obj = object_store[sha]
  682. except KeyError:
  683. continue
  684. else:
  685. # Dereference tags to get to commits
  686. while isinstance(obj, Tag):
  687. obj = object_store[obj.object[1]]
  688. if isinstance(obj, Commit):
  689. ref_commits.add(obj.id)
  690. # Add all ref tips
  691. selected.update(ref_commits)
  692. # Walk the commit graph and select every Nth commit
  693. queue = deque(ref_commits)
  694. commit_count = 0
  695. while queue:
  696. commit_sha = queue.popleft()
  697. if commit_sha in seen:
  698. continue
  699. seen.add(commit_sha)
  700. try:
  701. obj = object_store[commit_sha]
  702. if not isinstance(obj, Commit):
  703. continue
  704. commit_count += 1
  705. if commit_count % commit_interval == 0:
  706. selected.add(commit_sha)
  707. # Add parents to queue
  708. for parent in obj.parents:
  709. if parent not in seen:
  710. queue.append(parent)
  711. except KeyError:
  712. continue
  713. return sorted(selected)
  714. def build_reachability_bitmap(
  715. commit_sha: ObjectID,
  716. sha_to_pos: dict[RawObjectID, int],
  717. object_store: "BaseObjectStore",
  718. ) -> EWAHBitmap:
  719. """Build a reachability bitmap for a commit.
  720. The bitmap has a bit set for each object that is reachable from the commit.
  721. The bit position corresponds to the object's position in the pack index.
  722. Args:
  723. commit_sha: The commit to build a bitmap for
  724. sha_to_pos: Pre-built mapping from SHA to position in pack
  725. object_store: Object store to traverse objects
  726. Returns:
  727. EWAH bitmap with bits set for reachable objects
  728. """
  729. bitmap = EWAHBitmap()
  730. # Traverse all objects reachable from the commit
  731. seen = set()
  732. queue = deque([commit_sha])
  733. while queue:
  734. sha = queue.popleft()
  735. if sha in seen:
  736. continue
  737. seen.add(sha)
  738. # Add this object to the bitmap if it's in the pack
  739. # Convert hex SHA to binary for pack index lookup
  740. raw_sha = hex_to_sha(sha)
  741. if raw_sha in sha_to_pos:
  742. bitmap.add(sha_to_pos[raw_sha])
  743. # Get the object and traverse its references
  744. try:
  745. obj = object_store[sha]
  746. if isinstance(obj, Commit):
  747. # Add parents and tree
  748. queue.append(obj.tree)
  749. queue.extend(obj.parents)
  750. elif hasattr(obj, "items"):
  751. # Tree object - add all entries
  752. for item in obj.items():
  753. queue.append(item.sha)
  754. except KeyError:
  755. # Object not in store, skip it
  756. continue
  757. return bitmap
  758. def apply_xor_compression(
  759. bitmaps: list[tuple[ObjectID, EWAHBitmap]],
  760. max_xor_offset: int = MAX_XOR_OFFSET,
  761. ) -> list[tuple[ObjectID, EWAHBitmap, int]]:
  762. """Apply XOR compression to bitmaps.
  763. XOR compression stores some bitmaps as XOR differences from previous bitmaps,
  764. reducing storage size when bitmaps are similar.
  765. Args:
  766. bitmaps: List of (commit_sha, bitmap) tuples
  767. max_xor_offset: Maximum offset to search for XOR base
  768. Returns:
  769. List of (commit_sha, bitmap, xor_offset) tuples
  770. """
  771. compressed = []
  772. for i, (sha, bitmap) in enumerate(bitmaps):
  773. best_xor_offset = 0
  774. best_size = len(bitmap.encode())
  775. best_xor_bitmap = bitmap
  776. # Try XORing with previous bitmaps within max_xor_offset
  777. for offset in range(1, min(i + 1, max_xor_offset + 1)):
  778. _prev_sha, prev_bitmap = bitmaps[i - offset]
  779. xor_bitmap = bitmap ^ prev_bitmap
  780. xor_size = len(xor_bitmap.encode())
  781. # Use XOR if it reduces size
  782. if xor_size < best_size:
  783. best_size = xor_size
  784. best_xor_offset = offset
  785. best_xor_bitmap = xor_bitmap
  786. compressed.append((sha, best_xor_bitmap, best_xor_offset))
  787. return compressed
  788. def build_type_bitmaps(
  789. sha_to_pos: dict["RawObjectID", int],
  790. object_store: "BaseObjectStore",
  791. ) -> tuple[EWAHBitmap, EWAHBitmap, EWAHBitmap, EWAHBitmap]:
  792. """Build type bitmaps for all objects in a pack.
  793. Type bitmaps classify objects by type: commit, tree, blob, or tag.
  794. Args:
  795. sha_to_pos: Pre-built mapping from SHA to position in pack
  796. object_store: Object store to read object types
  797. Returns:
  798. Tuple of (commit_bitmap, tree_bitmap, blob_bitmap, tag_bitmap)
  799. """
  800. commit_bitmap = EWAHBitmap()
  801. tree_bitmap = EWAHBitmap()
  802. blob_bitmap = EWAHBitmap()
  803. tag_bitmap = EWAHBitmap()
  804. for sha, pos in sha_to_pos.items():
  805. # Pack index returns binary SHA (20 bytes), but object_store expects hex SHA (40 bytes)
  806. hex_sha = sha_to_hex(sha) if len(sha) == 20 else ObjectID(sha)
  807. try:
  808. obj = object_store[hex_sha]
  809. except KeyError:
  810. # Object not in store, skip it
  811. continue
  812. obj_type = obj.type_num
  813. if obj_type == Commit.type_num:
  814. commit_bitmap.add(pos)
  815. elif obj_type == Tree.type_num:
  816. tree_bitmap.add(pos)
  817. elif obj_type == Blob.type_num:
  818. blob_bitmap.add(pos)
  819. elif obj_type == Tag.type_num:
  820. tag_bitmap.add(pos)
  821. return commit_bitmap, tree_bitmap, blob_bitmap, tag_bitmap
  822. def build_name_hash_cache(
  823. sha_to_pos: dict["RawObjectID", int],
  824. object_store: "BaseObjectStore",
  825. ) -> list[int]:
  826. """Build name-hash cache for all objects in a pack.
  827. The name-hash cache stores a hash of the name for each object,
  828. which can speed up path-based operations.
  829. Args:
  830. sha_to_pos: Pre-built mapping from SHA to position in pack
  831. object_store: Object store to read objects
  832. Returns:
  833. List of 32-bit hash values, one per object in the pack
  834. """
  835. # Pre-allocate list with correct size
  836. num_objects = len(sha_to_pos)
  837. name_hashes = [0] * num_objects
  838. for sha, pos in sha_to_pos.items():
  839. # Pack index returns binary SHA (20 bytes), but object_store expects hex SHA (40 bytes)
  840. hex_sha = sha_to_hex(sha) if len(sha) == 20 else ObjectID(sha)
  841. try:
  842. obj = object_store[hex_sha]
  843. except KeyError:
  844. # Object not in store, use zero hash
  845. continue
  846. # For tree entries, use the tree entry name
  847. # For commits, use the tree SHA
  848. # For other objects, use the object SHA
  849. if isinstance(obj, Tree):
  850. # Tree object - use the SHA as the name
  851. name_hash = _compute_name_hash(sha)
  852. elif isinstance(obj, Commit):
  853. # Commit - use the tree SHA as the name
  854. name_hash = _compute_name_hash(obj.tree)
  855. else:
  856. # Other objects - use the SHA as the name
  857. name_hash = _compute_name_hash(sha)
  858. name_hashes[pos] = name_hash
  859. return name_hashes
  860. def generate_bitmap(
  861. pack_index: "PackIndex",
  862. object_store: "BaseObjectStore",
  863. refs: dict["Ref", ObjectID],
  864. pack_checksum: bytes,
  865. include_hash_cache: bool = True,
  866. include_lookup_table: bool = True,
  867. commit_interval: int | None = None,
  868. progress: Callable[[str], None] | None = None,
  869. ) -> PackBitmap:
  870. """Generate a complete bitmap for a pack.
  871. Args:
  872. pack_index: Pack index for the pack
  873. object_store: Object store to read objects from
  874. refs: Dictionary of ref names to commit SHAs
  875. pack_checksum: SHA-1 checksum of the pack file
  876. include_hash_cache: Whether to include name-hash cache
  877. include_lookup_table: Whether to include lookup table
  878. commit_interval: Include every Nth commit in history (None for default)
  879. progress: Optional progress reporting callback
  880. Returns:
  881. Complete PackBitmap ready to write to disk
  882. """
  883. if commit_interval is None:
  884. commit_interval = DEFAULT_COMMIT_INTERVAL
  885. if progress:
  886. progress("Building pack index mapping")
  887. # Build mapping from SHA to position in pack index ONCE
  888. # This is used by all subsequent operations and avoids repeated enumeration
  889. sha_to_pos: dict[RawObjectID, int] = {}
  890. for pos, (sha, _offset, _crc32) in enumerate(pack_index.iterentries()):
  891. sha_to_pos[sha] = pos
  892. if progress:
  893. progress("Selecting commits for bitmap")
  894. # Select commits to create bitmaps for
  895. selected_commits = select_bitmap_commits(refs, object_store, commit_interval)
  896. if progress:
  897. progress(f"Building bitmaps for {len(selected_commits)} commits")
  898. # Build reachability bitmaps for selected commits
  899. commit_bitmaps = []
  900. for i, commit_sha in enumerate(selected_commits):
  901. if progress and i % 10 == 0:
  902. progress(f"Building bitmap {i + 1}/{len(selected_commits)}")
  903. bitmap = build_reachability_bitmap(commit_sha, sha_to_pos, object_store)
  904. commit_bitmaps.append((commit_sha, bitmap))
  905. if progress:
  906. progress("Applying XOR compression")
  907. # Apply XOR compression
  908. compressed_bitmaps = apply_xor_compression(commit_bitmaps)
  909. if progress:
  910. progress("Building type bitmaps")
  911. # Build type bitmaps (using pre-built sha_to_pos mapping)
  912. commit_type_bitmap, tree_type_bitmap, blob_type_bitmap, tag_type_bitmap = (
  913. build_type_bitmaps(sha_to_pos, object_store)
  914. )
  915. # Create PackBitmap
  916. flags = BITMAP_OPT_FULL_DAG
  917. if include_hash_cache:
  918. flags |= BITMAP_OPT_HASH_CACHE
  919. if include_lookup_table:
  920. flags |= BITMAP_OPT_LOOKUP_TABLE
  921. pack_bitmap = PackBitmap(version=1, flags=flags)
  922. pack_bitmap.pack_checksum = pack_checksum
  923. pack_bitmap.commit_bitmap = commit_type_bitmap
  924. pack_bitmap.tree_bitmap = tree_type_bitmap
  925. pack_bitmap.blob_bitmap = blob_type_bitmap
  926. pack_bitmap.tag_bitmap = tag_type_bitmap
  927. # Add bitmap entries
  928. for commit_sha, xor_bitmap, xor_offset in compressed_bitmaps:
  929. raw_commit_sha = hex_to_sha(commit_sha)
  930. if raw_commit_sha not in sha_to_pos:
  931. continue
  932. entry = BitmapEntry(
  933. object_pos=sha_to_pos[raw_commit_sha],
  934. xor_offset=xor_offset,
  935. flags=0,
  936. bitmap=xor_bitmap,
  937. )
  938. pack_bitmap.entries[commit_sha] = entry
  939. pack_bitmap.entries_list.append((commit_sha, entry))
  940. # Build optional name-hash cache (using pre-built sha_to_pos mapping)
  941. if include_hash_cache:
  942. if progress:
  943. progress("Building name-hash cache")
  944. pack_bitmap.name_hash_cache = build_name_hash_cache(sha_to_pos, object_store)
  945. # Build optional lookup table
  946. if include_lookup_table:
  947. if progress:
  948. progress("Building lookup table")
  949. # The lookup table is built automatically from the entries
  950. # For now, we'll leave it as None and let the write function handle it
  951. # TODO: Implement lookup table generation if needed
  952. pack_bitmap.lookup_table = None
  953. if progress:
  954. progress("Bitmap generation complete")
  955. return pack_bitmap
  956. def find_commit_bitmaps(
  957. commit_shas: set["ObjectID"], packs: Iterable["Pack"]
  958. ) -> dict["ObjectID", tuple["Pack", "PackBitmap", dict[RawObjectID, int]]]:
  959. """Find which packs have bitmaps for the given commits.
  960. Args:
  961. commit_shas: Set of commit SHAs to look for
  962. packs: Iterable of Pack objects to search
  963. Returns:
  964. Dict mapping commit SHA to (pack, pack_bitmap, position) tuple
  965. """
  966. result = {}
  967. remaining = set(commit_shas)
  968. for pack in packs:
  969. if not remaining:
  970. break
  971. pack_bitmap = pack.bitmap
  972. if not pack_bitmap:
  973. # No bitmap for this pack
  974. continue
  975. # Build SHA to position mapping for this pack
  976. sha_to_pos: dict[RawObjectID, int] = {}
  977. for pos, (sha, _offset, _crc32) in enumerate(pack.index.iterentries()):
  978. sha_to_pos[sha] = pos
  979. # Check which commits have bitmaps
  980. for commit_sha in list(remaining):
  981. if pack_bitmap.has_commit(commit_sha):
  982. raw_commit_sha = hex_to_sha(commit_sha)
  983. if raw_commit_sha in sha_to_pos:
  984. result[commit_sha] = (pack, pack_bitmap, sha_to_pos)
  985. remaining.remove(commit_sha)
  986. return result
  987. def bitmap_to_object_shas(
  988. bitmap: EWAHBitmap,
  989. pack_index: "PackIndex",
  990. type_filter: EWAHBitmap | None = None,
  991. ) -> set[ObjectID]:
  992. """Convert a bitmap to a set of object SHAs.
  993. Args:
  994. bitmap: The EWAH bitmap with set bits for objects
  995. pack_index: Pack index to map positions to SHAs
  996. type_filter: Optional type bitmap to filter results (e.g., commits only)
  997. Returns:
  998. Set of object SHAs (hex format)
  999. """
  1000. result: set[ObjectID] = set()
  1001. for pos, (sha, _offset, _crc32) in enumerate(pack_index.iterentries()):
  1002. # Check if this position is in the bitmap
  1003. if pos in bitmap:
  1004. # Apply type filter if provided
  1005. if type_filter is None or pos in type_filter:
  1006. result.add(sha_to_hex(sha))
  1007. return result