- # dumb.py -- Support for dumb HTTP(S) git repositories
- # Copyright (C) 2025 Dulwich contributors
- #
- # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
- # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
- # General Public License as published by the Free Software Foundation; version 2.0
- # or (at your option) any later version. You can redistribute it and/or
- # modify it under the terms of either of these two licenses.
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- #
- # You should have received a copy of the licenses; if not, see
- # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
- # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
- # License, Version 2.0.
- #
- """Support for dumb HTTP(S) git repositories."""
- import os
- import tempfile
- import zlib
- from collections.abc import Iterator
- from io import BytesIO
- from typing import Optional
- from urllib.parse import urljoin
- from .errors import NotGitRepository, ObjectFormatException
- from .object_store import BaseObjectStore
- from .objects import (
- ZERO_SHA,
- Blob,
- Commit,
- ObjectID,
- ShaFile,
- Tag,
- Tree,
- hex_to_sha,
- sha_to_hex,
- )
- from .pack import Pack, PackIndex, UnpackedObject, load_pack_index_file
- from .refs import Ref, read_info_refs, split_peeled_refs
- from .repo import BaseRepo
- class DumbHTTPObjectStore(BaseObjectStore):
- """Object store implementation that fetches objects over dumb HTTP."""
- def __init__(self, base_url: str, http_request_func):
- """Initialize a DumbHTTPObjectStore.
- Args:
- base_url: Base URL of the remote repository (e.g. "https://example.com/repo.git/")
- http_request_func: Function used to make HTTP requests; it should accept
- (url, headers) and return a (response, read_func) pair (see the sketch
- after this method).
- """
- self.base_url = base_url.rstrip("/") + "/"
- self._http_request = http_request_func
- self._packs: Optional[list[tuple[str, Optional[PackIndex]]]] = None
- self._cached_objects: dict[bytes, tuple[int, bytes]] = {}
- self._temp_pack_dir = None
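- # A minimal sketch (not part of this module) of an http_request_func that
- # satisfies the contract documented above, assuming urllib3 is available;
- # the pool name and the lack of error handling are illustrative choices only:
- #
- #     import urllib3
- #     _pool = urllib3.PoolManager()
- #
- #     def http_request(url, headers):
- #         resp = _pool.request("GET", url, headers=headers, preload_content=False)
- #         # resp exposes .status and .close(); resp.read(n) returns up to n bytes
- #         return resp, resp.read
- #
- #     store = DumbHTTPObjectStore("https://example.com/repo.git/", http_request)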
- def _ensure_temp_pack_dir(self):
- """Ensure we have a temporary directory for storing pack files."""
- if self._temp_pack_dir is None:
- self._temp_pack_dir = tempfile.mkdtemp(prefix="dulwich-dumb-")
- def _fetch_url(self, path: str) -> bytes:
- """Fetch content from a URL path relative to base_url.
- Args:
- path: Path relative to base URL
- Returns:
- Content as bytes
- Raises:
- IOError: If the URL cannot be fetched
- """
- url = urljoin(self.base_url, path)
- resp, read = self._http_request(url, {})
- try:
- if resp.status == 404:
- raise OSError(f"Not found: {url}")
- elif resp.status != 200:
- raise OSError(f"HTTP error {resp.status}: {url}")
- # Read all content
- chunks = []
- while True:
- chunk = read(4096)
- if not chunk:
- break
- chunks.append(chunk)
- return b"".join(chunks)
- finally:
- resp.close()
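- # Because __init__ forces a trailing slash on base_url, urljoin appends
- # relative paths as expected, e.g.
- #     urljoin("https://example.com/repo.git/", "info/refs")
- #     == "https://example.com/repo.git/info/refs"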
- def _fetch_loose_object(self, sha: bytes) -> tuple[int, bytes]:
- """Fetch a loose object by SHA.
- Args:
- sha: SHA1 of the object (hex string as bytes)
- Returns:
- Tuple of (type_num, content)
- Raises:
- KeyError: If object not found
- """
- hex_sha = sha.decode("ascii")
- path = f"objects/{hex_sha[:2]}/{hex_sha[2:]}"
- try:
- compressed = self._fetch_url(path)
- except OSError:
- raise KeyError(sha)
- # Decompress and parse the object
- decompressed = zlib.decompress(compressed)
- # Parse header
- header_end = decompressed.find(b"\x00")
- if header_end == -1:
- raise ObjectFormatException("Invalid object header")
- header = decompressed[:header_end]
- content = decompressed[header_end + 1 :]
- parts = header.split(b" ", 1)
- if len(parts) != 2:
- raise ObjectFormatException("Invalid object header")
- obj_type = parts[0]
- obj_size = int(parts[1])
- if len(content) != obj_size:
- raise ObjectFormatException("Object size mismatch")
- # Convert type name to type number
- type_map = {
- b"blob": Blob.type_num,
- b"tree": Tree.type_num,
- b"commit": Commit.type_num,
- b"tag": Tag.type_num,
- }
- if obj_type not in type_map:
- raise ObjectFormatException(f"Unknown object type: {obj_type!r}")
- return type_map[obj_type], content
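- # Illustrative example of the loose-object encoding parsed above: a 5-byte
- # blob is served as zlib-compressed b"blob 5\x00hello", so
- #     zlib.decompress(compressed) == b"blob 5\x00hello"
- # and this method returns (Blob.type_num, b"hello") for it.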
- def _load_packs(self):
- """Load the list of available packs from the remote."""
- if self._packs is not None:
- return
- self._packs = []
- try:
- packs_data = self._fetch_url("objects/info/packs")
- except OSError:
- # No packs file; the repository may only have loose objects
- return
- for line in packs_data.strip().split(b"\n"):
- if line.startswith(b"P "):
- pack_name = line[2:].decode("utf-8")
- # Extract just the pack name without path
- if "/" in pack_name:
- pack_name = pack_name.split("/")[-1]
- if pack_name.endswith(".pack"):
- pack_name = pack_name[:-5] # Remove .pack extension
- self._packs.append((pack_name, None))
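- # The objects/info/packs file parsed above is a plain-text listing with one
- # "P <pack file>" entry per pack, e.g. (pack names abbreviated):
- #     P pack-<sha1>.pack
- #     P pack-<sha1>.pack
- # Only the basename matters here; the ".pack" suffix is stripped before the
- # name is recorded alongside its (not yet fetched) index.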
- def _get_pack_index(self, pack_name: str) -> PackIndex:
- """Get or fetch a pack index.
- Args:
- pack_name: Name of the pack (without .idx extension)
- Returns:
- PackIndex object
- """
- # Find the pack in our list
- for i, (name, idx) in enumerate(self._packs or []):
- if name == pack_name:
- if idx is None:
- # Fetch and cache the index
- idx_data = self._fetch_url(f"objects/pack/{pack_name}.idx")
- idx = load_pack_index_file("<http>", BytesIO(idx_data))
- if self._packs is not None:
- self._packs[i] = (name, idx)
- return idx
- raise KeyError(f"Pack not found: {pack_name}")
- def _fetch_from_pack(self, sha: bytes) -> tuple[int, bytes]:
- """Try to fetch an object from pack files.
- Args:
- sha: SHA1 of the object (hex string as bytes)
- Returns:
- Tuple of (type_num, content)
- Raises:
- KeyError: If object not found in any pack
- """
- self._load_packs()
- # Convert hex to binary for pack operations
- binsha = hex_to_sha(sha)
- for pack_name, idx in self._packs or []:
- if idx is None:
- idx = self._get_pack_index(pack_name)
- try:
- # Check if object is in this pack
- idx.object_offset(binsha)
- except KeyError:
- continue
- # We found the object; now fetch the pack data. For efficiency we could
- # fetch only the needed portion, but for simplicity we download the whole
- # pack and cache it locally.
- self._ensure_temp_pack_dir()
- if self._temp_pack_dir is None:
- raise RuntimeError("Temp pack directory not initialized")
- pack_path = os.path.join(self._temp_pack_dir, f"{pack_name}.pack")
- if not os.path.exists(pack_path):
- # Download the pack file
- pack_data = self._fetch_url(f"objects/pack/{pack_name}.pack")
- with open(pack_path, "wb") as f:
- f.write(pack_data)
- # Open the pack and get the object
- pack = Pack(pack_path[:-5]) # Remove .pack extension
- try:
- return pack.get_raw(binsha)
- finally:
- pack.close()
- raise KeyError(sha)
- def get_raw(self, sha: bytes) -> tuple[int, bytes]:
- """Obtain the raw text for an object.
- Args:
- sha: SHA1 of the object
- Returns:
- Tuple with numeric type and object contents
- """
- # Check cache first
- if sha in self._cached_objects:
- return self._cached_objects[sha]
- # Try loose object first
- try:
- result = self._fetch_loose_object(sha)
- self._cached_objects[sha] = result
- return result
- except KeyError:
- pass
- # Try packs
- result = self._fetch_from_pack(sha)
- self._cached_objects[sha] = result
- return result
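- # Illustrative usage (the SHA below is a placeholder): raw contents can be
- # turned back into dulwich objects with ShaFile.from_raw_string, e.g.
- #     type_num, content = store.get_raw(b"aa" * 20)
- #     obj = ShaFile.from_raw_string(type_num, content)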
- def contains_loose(self, sha: bytes) -> bool:
- """Check if a particular object is present by SHA1 and is loose."""
- try:
- self._fetch_loose_object(sha)
- return True
- except KeyError:
- return False
- def __contains__(self, sha: bytes) -> bool:
- """Check if a particular object is present by SHA1."""
- if sha in self._cached_objects:
- return True
- # Try loose object
- try:
- self._fetch_loose_object(sha)
- return True
- except KeyError:
- pass
- # Try packs
- try:
- self._fetch_from_pack(sha)
- return True
- except KeyError:
- return False
- def __iter__(self) -> Iterator[bytes]:
- """Iterate over all SHAs in the store.
- Note: This is inefficient for dumb HTTP as it requires
- downloading all pack indices.
- """
- seen = set()
- # Loose objects cannot be enumerated efficiently over dumb HTTP,
- # so only packed objects are iterated.
- self._load_packs()
- for pack_name, idx in self._packs or []:
- if idx is None:
- idx = self._get_pack_index(pack_name)
- for sha in idx:
- if sha not in seen:
- seen.add(sha)
- yield sha_to_hex(sha)
- @property
- def packs(self):
- """Iterable of pack objects.
- Note: Returns an empty list; packs fetched over dumb HTTP are not
- exposed as Pack objects here.
- """
- return []
- def add_object(self, obj) -> None:
- """Add a single object to this object store."""
- raise NotImplementedError("Cannot add objects to dumb HTTP repository")
- def add_objects(self, objects, progress=None) -> None:
- """Add a set of objects to this object store."""
- raise NotImplementedError("Cannot add objects to dumb HTTP repository")
- def __del__(self):
- """Clean up temporary directory on deletion."""
- if self._temp_pack_dir and os.path.exists(self._temp_pack_dir):
- import shutil
- shutil.rmtree(self._temp_pack_dir, ignore_errors=True)
- class DumbRemoteHTTPRepo(BaseRepo):
- """Repository implementation for dumb HTTP remotes."""
- def __init__(self, base_url: str, http_request_func):
- """Initialize a DumbRemoteHTTPRepo.
- Args:
- base_url: Base URL of the remote repository
- http_request_func: Function to make HTTP requests.
- """
- self.base_url = base_url.rstrip("/") + "/"
- self._http_request = http_request_func
- self._refs: Optional[dict[Ref, ObjectID]] = None
- self._peeled: Optional[dict[Ref, ObjectID]] = None
- self._object_store = DumbHTTPObjectStore(base_url, http_request_func)
- @property
- def object_store(self):
- """ObjectStore for this repository."""
- return self._object_store
- def _fetch_url(self, path: str) -> bytes:
- """Fetch content from a URL path relative to base_url."""
- url = urljoin(self.base_url, path)
- resp, read = self._http_request(url, {})
- try:
- if resp.status == 404:
- raise OSError(f"Not found: {url}")
- elif resp.status != 200:
- raise OSError(f"HTTP error {resp.status}: {url}")
- chunks = []
- while True:
- chunk = read(4096)
- if not chunk:
- break
- chunks.append(chunk)
- return b"".join(chunks)
- finally:
- resp.close()
- def get_refs(self) -> dict[Ref, ObjectID]:
- """Get dictionary with all refs."""
- if self._refs is None:
- # Fetch info/refs
- try:
- refs_data = self._fetch_url("info/refs")
- except OSError:
- raise NotGitRepository(f"Cannot read refs from {self.base_url}")
- refs_hex = read_info_refs(BytesIO(refs_data))
- # Keep SHAs as hex
- self._refs, self._peeled = split_peeled_refs(refs_hex)
- return dict(self._refs)
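- # The info/refs file read above consists of tab-separated
- # "<hex sha>\t<ref name>" lines, with peeled annotated tags advertised via
- # a "^{}" suffix, e.g. (SHAs abbreviated to placeholders):
- #     <sha1>\trefs/heads/master
- #     <sha1>\trefs/tags/v1.0
- #     <sha1>\trefs/tags/v1.0^{}
- # split_peeled_refs moves the "^{}" entries into the peeled mapping.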
- def get_peeled(self, ref: Ref) -> ObjectID:
- """Get the peeled value of a ref."""
- # Peeled values for annotated tags would require fetching and parsing
- # the tag objects; fall back to the SHA the ref points at directly.
- sha = self.get_refs().get(ref, None)
- return sha if sha is not None else ZERO_SHA
- def fetch_pack_data(self, graph_walker, determine_wants, progress=None, depth=None):
- """Fetch pack data from the remote.
- This is the main method for fetching objects from a dumb HTTP remote.
- Since dumb HTTP doesn't support negotiation, we need to download
- all objects reachable from the wanted refs.
- Args:
- graph_walker: GraphWalker instance (not used for dumb HTTP)
- determine_wants: Function that returns list of wanted SHAs
- progress: Optional progress callback
- depth: Depth for shallow clones (not supported for dumb HTTP)
- Returns:
- Iterator of UnpackedObject instances
- """
- refs = self.get_refs()
- wants = determine_wants(refs)
- if not wants:
- return
- # For dumb HTTP, we traverse the object graph starting from wants
- to_fetch = set(wants)
- seen = set()
- while to_fetch:
- sha = to_fetch.pop()
- if sha in seen:
- continue
- seen.add(sha)
- # Fetch the object
- try:
- type_num, content = self._object_store.get_raw(sha)
- except KeyError:
- # Object not found; skip it
- continue
- unpacked = UnpackedObject(type_num, sha=sha, decomp_chunks=[content])
- yield unpacked
- # Parse the object to find references to other objects
- obj = ShaFile.from_raw_string(type_num, content)
- if isinstance(obj, Commit): # Commit
- to_fetch.add(obj.tree)
- for parent in obj.parents:
- to_fetch.add(parent)
- elif isinstance(obj, Tag): # Tag
- to_fetch.add(obj.object[1])
- elif isinstance(obj, Tree): # Tree
- for _, _, item_sha in obj.items():
- to_fetch.add(item_sha)
- if progress:
- progress(f"Fetching objects: {len(seen)} done\n".encode())