| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207 |
- # bitmap.py -- Packfile bitmap support for git
- # Copyright (C) 2025 Jelmer Vernooij <jelmer@jelmer.uk>
- #
- # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
- # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
- # General Public License as published by the Free Software Foundation; version 2.0
- # or (at your option) any later version. You can redistribute it and/or
- # modify it under the terms of either of these two licenses.
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- #
- # You should have received a copy of the licenses; if not, see
- # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
- # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
- # License, Version 2.0.
- #
- """Support for Git packfile bitmaps.
- Bitmaps store reachability information for packfiles, enabling faster
- object counting and enumeration operations without full graph traversal.
- The bitmap format uses EWAH (Enhanced Word-Aligned Hybrid) compression
- for efficient storage and fast bitwise operations.
- """
- import os
- import struct
- from collections import deque
- from collections.abc import Callable, Iterable, Iterator
- from io import BytesIO
- from typing import IO, TYPE_CHECKING, Callable, Optional
- from .file import GitFile
- from .objects import Blob, Commit, Tag, Tree
- if TYPE_CHECKING:
- from .object_store import BaseObjectStore
- from .pack import Pack, PackIndex
- # Bitmap file signature
- BITMAP_SIGNATURE = b"BITM"
- # Bitmap format version
- BITMAP_VERSION = 1
- # Bitmap flags
- BITMAP_OPT_FULL_DAG = 0x1 # Full closure
- BITMAP_OPT_HASH_CACHE = 0x4 # Name-hash cache
- BITMAP_OPT_LOOKUP_TABLE = 0x10 # Lookup table for random access
- BITMAP_OPT_PSEUDO_MERGES = 0x20 # Pseudo-merge bitmaps
- # EWAH compression constants
- MAX_LITERAL_WORDS = 0x7FFFFFFF # Maximum literal words in EWAH format (31 bits)
- MAX_XOR_OFFSET = 160 # Maximum distance to search for XOR compression base
- DEFAULT_COMMIT_INTERVAL = 100 # Default interval for commit selection
- def _encode_ewah_words(words: list[int]) -> list[int]:
- """Encode a list of 64-bit words using EWAH run-length compression.
- Args:
- words: List of 64-bit words to encode
- Returns:
- List of compressed words (RLWs followed by literals)
- """
- compressed_words = []
- i = 0
- while i < len(words):
- # Check for runs of all zeros or all ones
- if words[i] == 0 or words[i] == 0xFFFFFFFFFFFFFFFF:
- # Count consecutive identical words
- run_value = words[i]
- run_length = 0
- while i < len(words) and words[i] == run_value:
- run_length += 1
- i += 1
- # Collect following literal words
- literals = []
- while i < len(words) and words[i] != 0 and words[i] != 0xFFFFFFFFFFFFFFFF:
- literals.append(words[i])
- i += 1
- if len(literals) >= MAX_LITERAL_WORDS:
- break
- # Create RLW with correct bit layout:
- # [literal_words(31 bits)][running_len(32 bits)][running_bit(1 bit)]
- running_bit = 1 if run_value == 0xFFFFFFFFFFFFFFFF else 0
- rlw = (len(literals) << 33) | (run_length << 1) | running_bit
- compressed_words.append(rlw)
- compressed_words.extend(literals)
- else:
- # Collect literal words
- literals = []
- while i < len(words) and words[i] != 0 and words[i] != 0xFFFFFFFFFFFFFFFF:
- literals.append(words[i])
- i += 1
- if len(literals) >= MAX_LITERAL_WORDS:
- break
- # RLW with no run, just literals
- # [literal_words(31 bits)][running_len(32 bits)][running_bit(1 bit)]
- rlw = (len(literals) << 33) | (0 << 1) | 0
- compressed_words.append(rlw)
- compressed_words.extend(literals)
- return compressed_words
- class EWAHBitmap:
- """EWAH (Enhanced Word-Aligned Hybrid) compressed bitmap.
- EWAH uses run-length encoding for efficient bitmap storage.
- Each bitmap consists of:
- - Uncompressed bit count (4 bytes)
- - Compressed word count (4 bytes)
- - Compressed words (8 bytes each)
- - Current RLW position (4 bytes)
- Each Run Length Word (RLW) 64-bit layout (LSB to MSB):
- - Bit 0: running_bit (1 bit) - value of repeated words (0 or 1)
- - Bits 1-32: running_len (32 bits) - count of repeated words
- - Bits 33-63: literal_words (31 bits) - count of literal words following this RLW
- """
- def __init__(self, data: bytes | None = None) -> None:
- """Initialize EWAH bitmap.
- Args:
- data: Optional compressed bitmap data to decode
- """
- self.bits: set[int] = set()
- self.bit_count = 0
- if data:
- self._decode(data)
- def _decode(self, data: bytes) -> None:
- """Decode EWAH compressed bitmap data.
- Args:
- data: Compressed bitmap data (EWAH format with header + words +
- RLW position)
- """
- f = BytesIO(data)
- # Read header
- bit_count_bytes = f.read(4)
- word_count_bytes = f.read(4)
- if len(bit_count_bytes) < 4 or len(word_count_bytes) < 4:
- return
- bit_count = struct.unpack(">I", bit_count_bytes)[0]
- word_count = struct.unpack(">I", word_count_bytes)[0]
- self.bit_count = bit_count
- current_bit = 0
- # Read all words first
- words = []
- for _ in range(word_count):
- word_bytes = f.read(8)
- if len(word_bytes) < 8:
- break
- word = struct.unpack(">Q", word_bytes)[0]
- words.append(word)
- # Process EWAH chunks: RLW followed by literal words
- idx = 0
- while idx < len(words):
- # This is an RLW
- # Bit layout: [literal_words(31)][running_len(32)][running_bit(1)]
- rlw = words[idx]
- running_bit = rlw & 1
- running_len = (rlw >> 1) & 0xFFFFFFFF
- literal_words = rlw >> 33
- idx += 1
- # Process running bits
- if running_len > 0:
- if running_bit == 1:
- # Add all bits in the repeated section
- for i in range(running_len * 64):
- self.bits.add(current_bit + i)
- current_bit += running_len * 64
- # Process literal words
- for _ in range(literal_words):
- if idx >= len(words):
- break
- literal = words[idx]
- idx += 1
- # Extract set bits from literal word
- for i in range(64):
- if literal & (1 << i):
- self.bits.add(current_bit + i)
- current_bit += 64
- # Read RLW position (we don't use it currently, but it's part of the format)
- f.read(4)
- def encode(self) -> bytes:
- """Encode bitmap to EWAH compressed format.
- Returns:
- Compressed bitmap data including header, words, and RLW position
- """
- if not self.bits:
- # Empty bitmap: bit_count=0, word_count=0, rlw_pos=0
- return struct.pack(">III", 0, 0, 0)
- max_bit = max(self.bits) if self.bits else 0
- bit_count = max_bit + 1
- word_count = (bit_count + 63) // 64
- # Create literal words
- words = [0] * word_count
- for bit in self.bits:
- word_idx = bit // 64
- bit_idx = bit % 64
- words[word_idx] |= 1 << bit_idx
- # Compress using EWAH run-length encoding
- compressed_words = _encode_ewah_words(words)
- # Build EWAH data
- f = BytesIO()
- # Header
- f.write(struct.pack(">I", bit_count))
- f.write(struct.pack(">I", len(compressed_words)))
- # Write compressed words
- for word in compressed_words:
- f.write(struct.pack(">Q", word))
- # Write RLW position (position of last RLW in the compressed words)
- # For now, we'll use 0 as we don't track this during encoding
- # This could be improved in the future if needed
- f.write(struct.pack(">I", 0))
- return f.getvalue()
- def __contains__(self, bit: int) -> bool:
- """Check if a bit is set.
- Args:
- bit: Bit position to check
- Returns:
- True if bit is set, False otherwise
- """
- return bit in self.bits
- def __len__(self) -> int:
- """Return the number of set bits.
- Returns:
- Count of set bits
- """
- return len(self.bits)
- def __or__(self, other: "EWAHBitmap") -> "EWAHBitmap":
- """Bitwise OR operation.
- Args:
- other: Other bitmap to OR with
- Returns:
- New bitmap with OR result
- """
- result = EWAHBitmap()
- result.bits = self.bits | other.bits
- result.bit_count = max(self.bit_count, other.bit_count)
- return result
- def __and__(self, other: "EWAHBitmap") -> "EWAHBitmap":
- """Bitwise AND operation.
- Args:
- other: Other bitmap to AND with
- Returns:
- New bitmap with AND result
- """
- result = EWAHBitmap()
- result.bits = self.bits & other.bits
- result.bit_count = max(self.bit_count, other.bit_count)
- return result
- def __xor__(self, other: "EWAHBitmap") -> "EWAHBitmap":
- """Bitwise XOR operation.
- Args:
- other: Other bitmap to XOR with
- Returns:
- New bitmap with XOR result
- """
- result = EWAHBitmap()
- result.bits = self.bits ^ other.bits
- result.bit_count = max(self.bit_count, other.bit_count)
- return result
- def __sub__(self, other: "EWAHBitmap") -> "EWAHBitmap":
- """Bitwise subtraction (set difference).
- Returns bits that are in self but not in other.
- Equivalent to: self & ~other
- Args:
- other: Bitmap to subtract
- Returns:
- New bitmap with bits in self but not in other
- """
- result = EWAHBitmap()
- result.bits = self.bits - other.bits
- result.bit_count = self.bit_count
- return result
- def add(self, bit: int) -> None:
- """Set a bit.
- Args:
- bit: Bit position to set
- """
- self.bits.add(bit)
- self.bit_count = max(self.bit_count, bit + 1)
- class BitmapEntry:
- """A single bitmap entry for a commit."""
- def __init__(
- self,
- object_pos: int,
- xor_offset: int,
- flags: int,
- bitmap: EWAHBitmap,
- ) -> None:
- """Initialize a bitmap entry.
- Args:
- object_pos: Position of object in pack index
- xor_offset: XOR offset for compression
- flags: Entry flags
- bitmap: The EWAH bitmap data
- """
- self.object_pos = object_pos
- self.xor_offset = xor_offset
- self.flags = flags
- self.bitmap = bitmap
- class PackBitmap:
- """A pack bitmap index.
- Bitmaps store reachability information for commits in a packfile,
- allowing fast object enumeration without graph traversal.
- """
- def __init__(
- self,
- version: int = BITMAP_VERSION,
- flags: int = BITMAP_OPT_FULL_DAG,
- ) -> None:
- """Initialize a pack bitmap.
- Args:
- version: Bitmap format version
- flags: Bitmap flags
- """
- self.version = version
- self.flags = flags
- self.pack_checksum: bytes | None = None
- # Type bitmaps for commits, trees, blobs, tags
- self.commit_bitmap = EWAHBitmap()
- self.tree_bitmap = EWAHBitmap()
- self.blob_bitmap = EWAHBitmap()
- self.tag_bitmap = EWAHBitmap()
- # Bitmap entries indexed by commit SHA
- self.entries: dict[bytes, BitmapEntry] = {}
- # List of entries in order (for XOR offset resolution)
- self.entries_list: list[tuple[bytes, BitmapEntry]] = []
- # Optional lookup table for random access
- self.lookup_table: list[tuple[int, int, int]] | None = None
- # Optional name-hash cache
- self.name_hash_cache: list[int] | None = None
- def get_bitmap(self, commit_sha: bytes) -> EWAHBitmap | None:
- """Get the bitmap for a commit.
- Args:
- commit_sha: SHA-1 of the commit
- Returns:
- EWAH bitmap or None if not found
- """
- entry = self.entries.get(commit_sha)
- if entry is None:
- return None
- # Decompress using XOR if needed
- if entry.xor_offset > 0:
- # Find the entry at the XOR offset
- # The XOR offset tells us how many entries back to look
- # We need to find this entry in the ordered list
- try:
- current_idx = next(
- i
- for i, (sha, _) in enumerate(self.entries_list)
- if sha == commit_sha
- )
- except StopIteration:
- # Entry not found in list, return as-is
- return entry.bitmap
- # XOR offset is how many positions back to look
- if current_idx >= entry.xor_offset:
- base_sha, _base_entry = self.entries_list[
- current_idx - entry.xor_offset
- ]
- # Get the base bitmap (recursively if it also uses XOR)
- base_bitmap = self.get_bitmap(base_sha)
- if base_bitmap is not None:
- # XOR the current bitmap with the base
- return entry.bitmap ^ base_bitmap
- return entry.bitmap
- def has_commit(self, commit_sha: bytes) -> bool:
- """Check if a commit has a bitmap.
- Args:
- commit_sha: SHA-1 of the commit
- Returns:
- True if bitmap exists for this commit
- """
- return commit_sha in self.entries
- def iter_commits(self) -> Iterator[bytes]:
- """Iterate over all commits with bitmaps.
- Returns:
- Iterator of commit SHAs
- """
- return iter(self.entries.keys())
- def read_bitmap(
- filename: str | os.PathLike[str],
- pack_index: Optional["PackIndex"] = None,
- ) -> PackBitmap:
- """Read a bitmap index file.
- Args:
- filename: Path to the .bitmap file
- pack_index: Optional PackIndex to resolve object positions to SHAs
- Returns:
- Loaded PackBitmap
- Raises:
- ValueError: If file format is invalid
- ChecksumMismatch: If checksum verification fails
- """
- with GitFile(filename, "rb") as f:
- return read_bitmap_file(f, pack_index=pack_index)
- def read_bitmap_file(
- f: IO[bytes], pack_index: Optional["PackIndex"] = None
- ) -> PackBitmap:
- """Read bitmap data from a file object.
- Args:
- f: File object to read from
- pack_index: Optional PackIndex to resolve object positions to SHAs
- Returns:
- Loaded PackBitmap
- Raises:
- ValueError: If file format is invalid
- """
- # Read header
- signature = f.read(4)
- if signature != BITMAP_SIGNATURE:
- raise ValueError(
- f"Invalid bitmap signature: {signature!r}, expected {BITMAP_SIGNATURE!r}"
- )
- version_bytes = f.read(2)
- flags_bytes = f.read(2)
- if len(version_bytes) < 2 or len(flags_bytes) < 2:
- raise ValueError("Incomplete bitmap header")
- version = struct.unpack(">H", version_bytes)[0]
- flags = struct.unpack(">H", flags_bytes)[0]
- if version != BITMAP_VERSION:
- raise ValueError(f"Unsupported bitmap version: {version}")
- # Read entry count
- entry_count_bytes = f.read(4)
- if len(entry_count_bytes) < 4:
- raise ValueError("Missing entry count")
- entry_count = struct.unpack(">I", entry_count_bytes)[0]
- # Read pack checksum
- pack_checksum = f.read(20)
- if len(pack_checksum) < 20:
- raise ValueError("Missing pack checksum")
- bitmap = PackBitmap(version=version, flags=flags)
- bitmap.pack_checksum = pack_checksum
- # Read type bitmaps (EWAH bitmaps are self-describing)
- for i, type_bitmap in enumerate(
- [
- bitmap.commit_bitmap,
- bitmap.tree_bitmap,
- bitmap.blob_bitmap,
- bitmap.tag_bitmap,
- ]
- ):
- # EWAH format:
- # 4 bytes: bit count
- # 4 bytes: word count
- # N x 8 bytes: compressed words
- # 4 bytes: RLW position
- # Read header to determine size
- bit_count_bytes = f.read(4)
- word_count_bytes = f.read(4)
- if len(bit_count_bytes) < 4 or len(word_count_bytes) < 4:
- raise ValueError(f"Missing type bitmap {i} header")
- word_count = struct.unpack(">I", word_count_bytes)[0]
- # Read compressed words
- words_data = f.read(word_count * 8)
- if len(words_data) < word_count * 8:
- raise ValueError(f"Incomplete type bitmap {i} data")
- # Read RLW position
- rlw_pos_bytes = f.read(4)
- if len(rlw_pos_bytes) < 4:
- raise ValueError(f"Missing type bitmap {i} RLW position")
- # Reconstruct the full EWAH data to pass to _decode
- ewah_data = bit_count_bytes + word_count_bytes + words_data + rlw_pos_bytes
- type_bitmap._decode(ewah_data)
- # Read bitmap entries
- for _ in range(entry_count):
- # Read object position (4 bytes)
- obj_pos_bytes = f.read(4)
- if len(obj_pos_bytes) < 4:
- raise ValueError("Incomplete bitmap entry")
- obj_pos = struct.unpack(">I", obj_pos_bytes)[0]
- # Read XOR offset (1 byte)
- xor_offset_bytes = f.read(1)
- if len(xor_offset_bytes) < 1:
- raise ValueError("Missing XOR offset")
- xor_offset = xor_offset_bytes[0]
- # Read flags (1 byte)
- flags_bytes = f.read(1)
- if len(flags_bytes) < 1:
- raise ValueError("Missing entry flags")
- entry_flags = flags_bytes[0]
- # Read self-describing EWAH bitmap
- # EWAH format: bit_count (4) + word_count (4) + words + rlw_pos (4)
- bit_count_bytes = f.read(4)
- word_count_bytes = f.read(4)
- if len(bit_count_bytes) < 4 or len(word_count_bytes) < 4:
- raise ValueError("Incomplete bitmap entry EWAH header")
- word_count = struct.unpack(">I", word_count_bytes)[0]
- # Read compressed words
- words_data = f.read(word_count * 8)
- if len(words_data) < word_count * 8:
- raise ValueError("Incomplete bitmap entry EWAH words")
- # Read RLW position
- rlw_pos_bytes = f.read(4)
- if len(rlw_pos_bytes) < 4:
- raise ValueError("Missing bitmap entry EWAH RLW position")
- # Reconstruct full EWAH data
- bitmap_data = bit_count_bytes + word_count_bytes + words_data + rlw_pos_bytes
- # Create bitmap entry
- ewah_bitmap = EWAHBitmap(bitmap_data) if word_count > 0 else EWAHBitmap()
- entry = BitmapEntry(
- object_pos=obj_pos,
- xor_offset=xor_offset,
- flags=entry_flags,
- bitmap=ewah_bitmap,
- )
- # Resolve object position to SHA if we have a pack index
- if pack_index is not None:
- # Get the SHA at the given position in the sorted index
- sha = None
- for idx, (entry_sha, _offset, _crc32) in enumerate(
- pack_index.iterentries()
- ):
- if idx == obj_pos:
- sha = entry_sha
- break
- if sha is not None:
- bitmap.entries[sha] = entry
- bitmap.entries_list.append((sha, entry))
- else:
- # Without pack index, use position as temporary key
- temp_key = obj_pos.to_bytes(4, byteorder="big")
- bitmap.entries[temp_key] = entry
- bitmap.entries_list.append((temp_key, entry))
- # Read optional lookup table
- if flags & BITMAP_OPT_LOOKUP_TABLE:
- # Lookup table contains triplets: (commit_pos, offset, xor_row)
- # Number of entries matches the bitmap entry count
- lookup_table = []
- for _ in range(entry_count):
- # Read commit position (4 bytes)
- commit_pos_bytes = f.read(4)
- if len(commit_pos_bytes) < 4:
- break
- commit_pos = struct.unpack(">I", commit_pos_bytes)[0]
- # Read file offset (8 bytes)
- offset_bytes = f.read(8)
- if len(offset_bytes) < 8:
- break
- offset = struct.unpack(">Q", offset_bytes)[0]
- # Read XOR row (4 bytes)
- xor_row_bytes = f.read(4)
- if len(xor_row_bytes) < 4:
- break
- xor_row = struct.unpack(">I", xor_row_bytes)[0]
- lookup_table.append((commit_pos, offset, xor_row))
- bitmap.lookup_table = lookup_table
- # Read optional name-hash cache
- if flags & BITMAP_OPT_HASH_CACHE:
- # Name-hash cache contains one 32-bit hash per object in the pack
- # The number of hashes depends on the total number of objects
- # For now, we'll read what's available
- name_hash_cache = []
- while True:
- hash_bytes = f.read(4)
- if len(hash_bytes) < 4:
- break
- hash_value = struct.unpack(">I", hash_bytes)[0]
- name_hash_cache.append(hash_value)
- if name_hash_cache:
- bitmap.name_hash_cache = name_hash_cache
- return bitmap
- def write_bitmap(
- filename: str | os.PathLike[str],
- bitmap: PackBitmap,
- ) -> None:
- """Write a bitmap index file.
- Args:
- filename: Path to write the .bitmap file
- bitmap: PackBitmap to write
- """
- with GitFile(filename, "wb") as f:
- write_bitmap_file(f, bitmap)
- def write_bitmap_file(f: IO[bytes], bitmap: PackBitmap) -> None:
- """Write bitmap data to a file object.
- Args:
- f: File object to write to
- bitmap: PackBitmap to write
- """
- # Write header
- f.write(BITMAP_SIGNATURE)
- f.write(struct.pack(">H", bitmap.version))
- f.write(struct.pack(">H", bitmap.flags))
- # Write entry count
- f.write(struct.pack(">I", len(bitmap.entries)))
- # Write pack checksum
- if bitmap.pack_checksum:
- f.write(bitmap.pack_checksum)
- else:
- f.write(b"\x00" * 20)
- # Write type bitmaps (self-describing EWAH format, no size prefix needed)
- for type_bitmap in [
- bitmap.commit_bitmap,
- bitmap.tree_bitmap,
- bitmap.blob_bitmap,
- bitmap.tag_bitmap,
- ]:
- data = type_bitmap.encode()
- f.write(data)
- # Write bitmap entries
- for _sha, entry in bitmap.entries.items():
- # Write object position (4 bytes)
- f.write(struct.pack(">I", entry.object_pos))
- # Write XOR offset (1 byte)
- f.write(bytes([entry.xor_offset]))
- # Write flags (1 byte)
- f.write(bytes([entry.flags]))
- # Write compressed bitmap data (self-describing EWAH format, no size prefix)
- bitmap_data = entry.bitmap.encode()
- f.write(bitmap_data)
- # Write optional lookup table
- if bitmap.flags & BITMAP_OPT_LOOKUP_TABLE and bitmap.lookup_table:
- for commit_pos, offset, xor_row in bitmap.lookup_table:
- f.write(struct.pack(">I", commit_pos)) # 4 bytes
- f.write(struct.pack(">Q", offset)) # 8 bytes
- f.write(struct.pack(">I", xor_row)) # 4 bytes
- # Write optional name-hash cache
- if bitmap.flags & BITMAP_OPT_HASH_CACHE and bitmap.name_hash_cache:
- for hash_value in bitmap.name_hash_cache:
- f.write(struct.pack(">I", hash_value))
- def _compute_name_hash(name: bytes) -> int:
- """Compute the name hash for a tree entry.
- This is the same algorithm Git uses for the name-hash cache.
- Args:
- name: The name of the tree entry
- Returns:
- 32-bit hash value
- """
- hash_value = 0
- for byte in name:
- hash_value = (hash_value >> 19) | (hash_value << 13)
- hash_value += byte
- hash_value &= 0xFFFFFFFF
- return hash_value
- def select_bitmap_commits(
- refs: dict[bytes, bytes],
- object_store: "BaseObjectStore",
- commit_interval: int = DEFAULT_COMMIT_INTERVAL,
- ) -> list[bytes]:
- """Select commits for bitmap generation.
- Uses Git's strategy:
- - All branch and tag tips
- - Every Nth commit in history
- Args:
- refs: Dictionary of ref names to commit SHAs
- object_store: Object store to read commits from
- commit_interval: Include every Nth commit in history
- Returns:
- List of commit SHAs to create bitmaps for
- """
- selected = set()
- seen = set()
- # Start with all refs
- ref_commits = set()
- for ref_name, sha in refs.items():
- try:
- obj = object_store[sha]
- except KeyError:
- continue
- else:
- # Dereference tags to get to commits
- while isinstance(obj, Tag):
- obj = object_store[obj.object[1]]
- if isinstance(obj, Commit):
- ref_commits.add(obj.id)
- # Add all ref tips
- selected.update(ref_commits)
- # Walk the commit graph and select every Nth commit
- queue = deque(ref_commits)
- commit_count = 0
- while queue:
- commit_sha = queue.popleft()
- if commit_sha in seen:
- continue
- seen.add(commit_sha)
- try:
- obj = object_store[commit_sha]
- if not isinstance(obj, Commit):
- continue
- commit_count += 1
- if commit_count % commit_interval == 0:
- selected.add(commit_sha)
- # Add parents to queue
- for parent in obj.parents:
- if parent not in seen:
- queue.append(parent)
- except KeyError:
- continue
- return sorted(selected)
- def build_reachability_bitmap(
- commit_sha: bytes,
- sha_to_pos: dict[bytes, int],
- object_store: "BaseObjectStore",
- ) -> EWAHBitmap:
- """Build a reachability bitmap for a commit.
- The bitmap has a bit set for each object that is reachable from the commit.
- The bit position corresponds to the object's position in the pack index.
- Args:
- commit_sha: The commit to build a bitmap for
- sha_to_pos: Pre-built mapping from SHA to position in pack
- object_store: Object store to traverse objects
- Returns:
- EWAH bitmap with bits set for reachable objects
- """
- bitmap = EWAHBitmap()
- # Traverse all objects reachable from the commit
- seen = set()
- queue = deque([commit_sha])
- while queue:
- sha = queue.popleft()
- if sha in seen:
- continue
- seen.add(sha)
- # Add this object to the bitmap if it's in the pack
- if sha in sha_to_pos:
- bitmap.add(sha_to_pos[sha])
- # Get the object and traverse its references
- try:
- obj = object_store[sha]
- if isinstance(obj, Commit):
- # Add parents and tree
- queue.append(obj.tree)
- queue.extend(obj.parents)
- elif hasattr(obj, "items"):
- # Tree object - add all entries
- for item in obj.items():
- queue.append(item.sha)
- except KeyError:
- # Object not in store, skip it
- continue
- return bitmap
- def apply_xor_compression(
- bitmaps: list[tuple[bytes, EWAHBitmap]],
- max_xor_offset: int = MAX_XOR_OFFSET,
- ) -> list[tuple[bytes, EWAHBitmap, int]]:
- """Apply XOR compression to bitmaps.
- XOR compression stores some bitmaps as XOR differences from previous bitmaps,
- reducing storage size when bitmaps are similar.
- Args:
- bitmaps: List of (commit_sha, bitmap) tuples
- max_xor_offset: Maximum offset to search for XOR base
- Returns:
- List of (commit_sha, bitmap, xor_offset) tuples
- """
- compressed = []
- for i, (sha, bitmap) in enumerate(bitmaps):
- best_xor_offset = 0
- best_size = len(bitmap.encode())
- best_xor_bitmap = bitmap
- # Try XORing with previous bitmaps within max_xor_offset
- for offset in range(1, min(i + 1, max_xor_offset + 1)):
- _prev_sha, prev_bitmap = bitmaps[i - offset]
- xor_bitmap = bitmap ^ prev_bitmap
- xor_size = len(xor_bitmap.encode())
- # Use XOR if it reduces size
- if xor_size < best_size:
- best_size = xor_size
- best_xor_offset = offset
- best_xor_bitmap = xor_bitmap
- compressed.append((sha, best_xor_bitmap, best_xor_offset))
- return compressed
- def build_type_bitmaps(
- sha_to_pos: dict[bytes, int],
- object_store: "BaseObjectStore",
- ) -> tuple[EWAHBitmap, EWAHBitmap, EWAHBitmap, EWAHBitmap]:
- """Build type bitmaps for all objects in a pack.
- Type bitmaps classify objects by type: commit, tree, blob, or tag.
- Args:
- sha_to_pos: Pre-built mapping from SHA to position in pack
- object_store: Object store to read object types
- Returns:
- Tuple of (commit_bitmap, tree_bitmap, blob_bitmap, tag_bitmap)
- """
- commit_bitmap = EWAHBitmap()
- tree_bitmap = EWAHBitmap()
- blob_bitmap = EWAHBitmap()
- tag_bitmap = EWAHBitmap()
- for sha, pos in sha_to_pos.items():
- try:
- obj = object_store[sha]
- except KeyError:
- pass
- else:
- obj_type = obj.type_num
- if obj_type == Commit.type_num:
- commit_bitmap.add(pos)
- elif obj_type == Tree.type_num:
- tree_bitmap.add(pos)
- elif obj_type == Blob.type_num:
- blob_bitmap.add(pos)
- elif obj_type == Tag.type_num:
- tag_bitmap.add(pos)
- return commit_bitmap, tree_bitmap, blob_bitmap, tag_bitmap
- def build_name_hash_cache(
- sha_to_pos: dict[bytes, int],
- object_store: "BaseObjectStore",
- ) -> list[int]:
- """Build name-hash cache for all objects in a pack.
- The name-hash cache stores a hash of the name for each object,
- which can speed up path-based operations.
- Args:
- sha_to_pos: Pre-built mapping from SHA to position in pack
- object_store: Object store to read objects
- Returns:
- List of 32-bit hash values, one per object in the pack
- """
- # Pre-allocate list with correct size
- num_objects = len(sha_to_pos)
- name_hashes = [0] * num_objects
- for sha, pos in sha_to_pos.items():
- try:
- obj = object_store[sha]
- except KeyError:
- # Object not in store, use zero hash
- pass
- else:
- # For tree entries, use the tree entry name
- # For commits, use the tree SHA
- # For other objects, use the object SHA
- if isinstance(obj, Tree):
- # Tree object - use the SHA as the name
- name_hash = _compute_name_hash(sha)
- elif isinstance(obj, Commit):
- # Commit - use the tree SHA as the name
- name_hash = _compute_name_hash(obj.tree)
- else:
- # Other objects - use the SHA as the name
- name_hash = _compute_name_hash(sha)
- name_hashes[pos] = name_hash
- return name_hashes
- def generate_bitmap(
- pack_index: "PackIndex",
- object_store: "BaseObjectStore",
- refs: dict[bytes, bytes],
- pack_checksum: bytes,
- include_hash_cache: bool = True,
- include_lookup_table: bool = True,
- commit_interval: int = DEFAULT_COMMIT_INTERVAL,
- progress: Callable[[str], None] | None = None,
- ) -> PackBitmap:
- """Generate a complete bitmap for a pack.
- Args:
- pack_index: Pack index for the pack
- object_store: Object store to read objects from
- refs: Dictionary of ref names to commit SHAs
- pack_checksum: SHA-1 checksum of the pack file
- include_hash_cache: Whether to include name-hash cache
- include_lookup_table: Whether to include lookup table
- commit_interval: Include every Nth commit in history
- progress: Optional progress reporting callback
- Returns:
- Complete PackBitmap ready to write to disk
- """
- if progress:
- progress("Building pack index mapping")
- # Build mapping from SHA to position in pack index ONCE
- # This is used by all subsequent operations and avoids repeated enumeration
- sha_to_pos = {}
- for pos, (sha, _offset, _crc32) in enumerate(pack_index.iterentries()):
- sha_to_pos[sha] = pos
- if progress:
- progress("Selecting commits for bitmap")
- # Select commits to create bitmaps for
- selected_commits = select_bitmap_commits(refs, object_store, commit_interval)
- if progress:
- progress(f"Building bitmaps for {len(selected_commits)} commits")
- # Build reachability bitmaps for selected commits
- commit_bitmaps = []
- for i, commit_sha in enumerate(selected_commits):
- if progress and i % 10 == 0:
- progress(f"Building bitmap {i + 1}/{len(selected_commits)}")
- bitmap = build_reachability_bitmap(commit_sha, sha_to_pos, object_store)
- commit_bitmaps.append((commit_sha, bitmap))
- if progress:
- progress("Applying XOR compression")
- # Apply XOR compression
- compressed_bitmaps = apply_xor_compression(commit_bitmaps)
- if progress:
- progress("Building type bitmaps")
- # Build type bitmaps (using pre-built sha_to_pos mapping)
- commit_type_bitmap, tree_type_bitmap, blob_type_bitmap, tag_type_bitmap = (
- build_type_bitmaps(sha_to_pos, object_store)
- )
- # Create PackBitmap
- flags = BITMAP_OPT_FULL_DAG
- if include_hash_cache:
- flags |= BITMAP_OPT_HASH_CACHE
- if include_lookup_table:
- flags |= BITMAP_OPT_LOOKUP_TABLE
- pack_bitmap = PackBitmap(version=1, flags=flags)
- pack_bitmap.pack_checksum = pack_checksum
- pack_bitmap.commit_bitmap = commit_type_bitmap
- pack_bitmap.tree_bitmap = tree_type_bitmap
- pack_bitmap.blob_bitmap = blob_type_bitmap
- pack_bitmap.tag_bitmap = tag_type_bitmap
- # Add bitmap entries
- for commit_sha, xor_bitmap, xor_offset in compressed_bitmaps:
- if commit_sha not in sha_to_pos:
- continue
- entry = BitmapEntry(
- object_pos=sha_to_pos[commit_sha],
- xor_offset=xor_offset,
- flags=0,
- bitmap=xor_bitmap,
- )
- pack_bitmap.entries[commit_sha] = entry
- pack_bitmap.entries_list.append((commit_sha, entry))
- # Build optional name-hash cache (using pre-built sha_to_pos mapping)
- if include_hash_cache:
- if progress:
- progress("Building name-hash cache")
- pack_bitmap.name_hash_cache = build_name_hash_cache(sha_to_pos, object_store)
- # Build optional lookup table
- if include_lookup_table:
- if progress:
- progress("Building lookup table")
- # The lookup table is built automatically from the entries
- # For now, we'll leave it as None and let the write function handle it
- # TODO: Implement lookup table generation if needed
- pack_bitmap.lookup_table = None
- if progress:
- progress("Bitmap generation complete")
- return pack_bitmap
- def find_commit_bitmaps(commit_shas: set[bytes], packs: Iterable[Pack]) -> dict[bytes, tuple]:
- """Find which packs have bitmaps for the given commits.
- Args:
- commit_shas: Set of commit SHAs to look for
- packs: Iterable of Pack objects to search
- Returns:
- Dict mapping commit SHA to (pack, pack_bitmap, position) tuple
- """
- result = {}
- remaining = set(commit_shas)
- for pack in packs:
- if not remaining:
- break
- pack_bitmap = pack.bitmap
- if not pack_bitmap:
- # No bitmap for this pack
- continue
- # Build SHA to position mapping for this pack
- sha_to_pos = {}
- for pos, (sha, _offset, _crc32) in enumerate(pack.index.iterentries()):
- sha_to_pos[sha] = pos
- # Check which commits have bitmaps
- for commit_sha in list(remaining):
- if pack_bitmap.has_commit(commit_sha):
- if commit_sha in sha_to_pos:
- result[commit_sha] = (pack, pack_bitmap, sha_to_pos)
- remaining.remove(commit_sha)
- return result
- def bitmap_to_object_shas(
- bitmap: EWAHBitmap,
- pack_index: "PackIndex",
- type_filter: EWAHBitmap | None = None,
- ) -> set[bytes]:
- """Convert a bitmap to a set of object SHAs.
- Args:
- bitmap: The EWAH bitmap with set bits for objects
- pack_index: Pack index to map positions to SHAs
- type_filter: Optional type bitmap to filter results (e.g., commits only)
- Returns:
- Set of object SHAs
- """
- result = set()
- for pos, (sha, _offset, _crc32) in enumerate(pack_index.iterentries()):
- # Check if this position is in the bitmap
- if pos in bitmap:
- # Apply type filter if provided
- if type_filter is None or pos in type_filter:
- result.add(sha)
- return result
|