
Add multi-pack-index (MIDX) writing support

Implement write_midx() and write_midx_file() functions to create MIDX files
from pack index entries.
Jelmer Vernooij committed 2 months ago
commit b8a7dc62f6
1 changed file with 178 additions and 2 deletions
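
For orientation, here is a minimal usage sketch of the new API. The pack name, SHAs, and output path are made up for illustration; entries follow the (binary sha, offset, crc32) shape the docstrings below describe.

from dulwich.midx import write_midx_file

# Two fabricated entries, pre-sorted by SHA; crc32 is not used by the writer
entries = [
    (bytes.fromhex("11" * 20), 12, None),
    (bytes.fromhex("ab" * 20), 345, None),
]

# Hypothetical pack name and path (git keeps its MIDX at
# .git/objects/pack/multi-pack-index)
checksum = write_midx_file(
    ".git/objects/pack/multi-pack-index",
    [("pack-0123456789abcdef.pack", entries)],
)
print("MIDX trailing checksum:", checksum.hex())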

dulwich/midx.py  +178 −2

@@ -42,6 +42,7 @@ from collections.abc import Iterator
 from typing import IO, Any
 
 from .file import GitFile, _GitFile
+from .pack import SHA1Writer
 
 # MIDX signature
 MIDX_SIGNATURE = b"MIDX"
@@ -412,7 +413,182 @@ def load_midx_file(
     return MultiPackIndex(path, file=f)
 
 
-# TODO: Implement MIDX writing functionality
-# TODO: Implement integration with object_store.py
+def write_midx(
+    f: IO[bytes],
+    pack_index_entries: list[tuple[str, list[tuple[bytes, int, int | None]]]],
+    hash_algorithm: int = HASH_ALGORITHM_SHA1,
+) -> bytes:
+    """Write a multi-pack-index file.
+
+    Args:
+        f: File-like object to write to
+        pack_index_entries: List of (pack_name, entries) tuples, where entries
+            are (binary sha, offset, crc32) tuples sorted by SHA
+        hash_algorithm: Hash algorithm to use (1=SHA-1, 2=SHA-256)
+
+    Returns:
+        SHA-1 checksum of the written MIDX file
+    """
+    if hash_algorithm == HASH_ALGORITHM_SHA1:
+        hash_size = 20
+    elif hash_algorithm == HASH_ALGORITHM_SHA256:
+        hash_size = 32
+    else:
+        raise ValueError(f"Unknown hash algorithm: {hash_algorithm}")
+
+    # Wrap file in SHA1Writer to compute checksum
+    writer = SHA1Writer(f)
+
+    # Collect all objects from all packs
+    all_objects: list[tuple[bytes, int, int]] = []  # (sha, pack_id, offset)
+    pack_names: list[str] = []
+
+    for pack_id, (pack_name, entries) in enumerate(pack_index_entries):
+        pack_names.append(pack_name)
+        for sha, offset, _crc32 in entries:
+            all_objects.append((sha, pack_id, offset))
+
+    # Sort all objects by SHA
+    all_objects.sort(key=lambda x: x[0])
+
+    # Calculate offsets for chunks
+    num_packs = len(pack_names)
+    num_objects = len(all_objects)
+
+    # Header: 12 bytes
+    header_size = 12
+
+    # Chunk count: PNAM, OIDF, OIDL, OOFF, plus LOFF if any offset is large
+    chunk_count = 4
+
+    # Check if we need LOFF chunk (for offsets >= 2^31)
+    need_loff = any(offset >= 2**31 for _sha, _pack_id, offset in all_objects)
+    if need_loff:
+        chunk_count += 1
+
+    # Chunk table: (chunk_count + 1) * 12 bytes (including terminator)
+    chunk_table_size = (chunk_count + 1) * 12
+
+    # Calculate chunk offsets
+    current_offset = header_size + chunk_table_size
+
+    # PNAM chunk: pack names as null-terminated strings, padded to 4-byte boundary
+    pnam_data = b"".join(name.encode("utf-8") + b"\x00" for name in pack_names)
+    # Pad to 4-byte boundary
+    pnam_padding = (4 - len(pnam_data) % 4) % 4
+    pnam_data += b"\x00" * pnam_padding
+    pnam_offset = current_offset
+    current_offset += len(pnam_data)
+
+    # OIDF chunk: 256 * 4 bytes
+    oidf_offset = current_offset
+    oidf_size = 256 * 4
+    current_offset += oidf_size
+
+    # OIDL chunk: num_objects * hash_size bytes
+    oidl_offset = current_offset
+    oidl_size = num_objects * hash_size
+    current_offset += oidl_size
+
+    # OOFF chunk: num_objects * 8 bytes (4 for pack_id + 4 for offset)
+    ooff_offset = current_offset
+    ooff_size = num_objects * 8
+    current_offset += ooff_size
+
+    # LOFF chunk (if needed): 8 bytes per offset too large for 31 bits
+    loff_offset = current_offset if need_loff else 0
+    large_offsets: list[int] = []  # populated while writing the OOFF chunk
+    if need_loff:
+        num_large = sum(
+            1 for _sha, _pack_id, offset in all_objects if offset >= 2**31
+        )
+        current_offset += num_large * 8
+
+    # Write header
+    writer.write(MIDX_SIGNATURE)  # 4 bytes: signature
+    writer.write(bytes([MIDX_VERSION]))  # 1 byte: version
+    writer.write(bytes([hash_algorithm]))  # 1 byte: hash algorithm
+    writer.write(bytes([chunk_count]))  # 1 byte: chunk count
+    writer.write(bytes([0]))  # 1 byte: base MIDX files (always 0)
+    writer.write(struct.pack(">L", num_packs))  # 4 bytes: pack count
+
+    # Write chunk table
+    chunk_table = [
+        (CHUNK_PNAM, pnam_offset),
+        (CHUNK_OIDF, oidf_offset),
+        (CHUNK_OIDL, oidl_offset),
+        (CHUNK_OOFF, ooff_offset),
+    ]
+    if need_loff:
+        chunk_table.append((CHUNK_LOFF, loff_offset))
+
+    for chunk_id, chunk_offset in chunk_table:
+        writer.write(chunk_id)  # 4 bytes
+        writer.write(struct.pack(">Q", chunk_offset))  # 8 bytes
+
+    # Write terminating entry: a zero chunk ID whose offset marks the end of
+    # the last chunk, so readers can compute every chunk's length
+    writer.write(b"\x00\x00\x00\x00")  # 4 bytes: zero chunk ID
+    writer.write(struct.pack(">Q", current_offset))  # 8 bytes: end of chunks
+
+    # Write PNAM chunk
+    writer.write(pnam_data)
+
+    # Write OIDF chunk (fanout table)
+    fanout: list[int] = [0] * 256
+    for sha, _pack_id, _offset in all_objects:
+        first_byte = sha[0]
+        fanout[first_byte] += 1
+
+    # Convert counts to cumulative
+    cumulative = 0
+    for i in range(256):
+        cumulative += fanout[i]
+        writer.write(struct.pack(">L", cumulative))
+
+    # Write OIDL chunk (object IDs)
+    for sha, _pack_id, _offset in all_objects:
+        writer.write(sha)
+
+    # Write OOFF chunk (pack ID and offset for each object)
+    for _sha, pack_id, offset in all_objects:
+        writer.write(struct.pack(">L", pack_id))
+
+        if offset >= 2**31:
+            # Use large offset table
+            large_offset_index = len(large_offsets)
+            large_offsets.append(offset)
+            # Set MSB to indicate large offset
+            writer.write(struct.pack(">L", 0x80000000 | large_offset_index))
+        else:
+            writer.write(struct.pack(">L", offset))
+
+    # Write LOFF chunk if needed
+    if need_loff:
+        for large_offset in large_offsets:
+            writer.write(struct.pack(">Q", large_offset))
+
+    # Write checksum
+    return writer.write_sha()
+
+
+def write_midx_file(
+    path: str | os.PathLike[str],
+    pack_index_entries: list[tuple[str, list[tuple[bytes, int, int | None]]]],
+    hash_algorithm: int = HASH_ALGORITHM_SHA1,
+) -> bytes:
+    """Write a multi-pack-index file to disk.
+
+    Args:
+        path: Path where to write the MIDX file
+        pack_index_entries: List of (pack_name, entries) tuples, where entries
+            are (binary sha, offset, crc32) tuples sorted by SHA
+        hash_algorithm: Hash algorithm to use (1=SHA-1, 2=SHA-256)
+
+    Returns:
+        SHA-1 checksum of the written MIDX file
+    """
+    with GitFile(path, "wb") as f:
+        return write_midx(f, pack_index_entries, hash_algorithm)
+
+
 # TODO: Add support for incremental MIDX chains
 # TODO: Add support for BTMP and RIDX chunks for bitmap integration
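
As a quick sanity check of the layout write_midx() produces, the following standalone sketch parses the fixed 12-byte header back out of a written file. The field order mirrors the writer above; read_midx_header and midx_path are hypothetical names for illustration.

import struct

def read_midx_header(midx_path):
    """Parse the 12-byte header that write_midx() emits."""
    with open(midx_path, "rb") as f:
        header = f.read(12)
    if header[:4] != b"MIDX":  # MIDX_SIGNATURE
        raise ValueError("not a multi-pack-index file")
    # Four single-byte fields follow the signature
    version, hash_algorithm, chunk_count, base_midx_count = header[4:8]
    (num_packs,) = struct.unpack(">L", header[8:12])
    return {
        "version": version,                  # MIDX_VERSION
        "hash_algorithm": hash_algorithm,    # 1=SHA-1, 2=SHA-256
        "chunk_count": chunk_count,
        "base_midx_count": base_midx_count,  # this writer always emits 0
        "num_packs": num_packs,
    }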