# dumb.py -- Support for dumb HTTP(S) git repositories
# Copyright (C) 2025 Dulwich contributors
#
# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
# General Public License as published by the Free Software Foundation; version 2.0
# or (at your option) any later version. You can redistribute it and/or
# modify it under the terms of either of these two licenses.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# You should have received a copy of the licenses; if not, see
# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
# License, Version 2.0.
#

"""Support for dumb HTTP(S) git repositories."""

__all__ = [
    "DumbHTTPObjectStore",
    "DumbRemoteHTTPRepo",
]

import os
import tempfile
import zlib
from collections.abc import Callable, Iterator, Mapping, Sequence
from io import BytesIO
from typing import Any
from urllib.parse import urljoin

from .errors import NotGitRepository, ObjectFormatException
from .object_store import BaseObjectStore
from .objects import (
    ZERO_SHA,
    Blob,
    Commit,
    ObjectID,
    RawObjectID,
    ShaFile,
    Tag,
    Tree,
    hex_to_sha,
    sha_to_hex,
)
from .pack import Pack, PackData, PackIndex, UnpackedObject, load_pack_index_file
from .protocol import split_peeled_refs
from .refs import Ref, read_info_refs
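
# A "dumb" HTTP server exposes a git repository as plain static files, so this
# module only ever issues GET requests for well-known paths relative to the
# repository URL:
#
#   info/refs                       -- one "<hex-sha>\t<refname>" entry per line
#   HEAD                            -- "ref: <refname>" (or, historically, a bare commit id)
#   objects/info/packs              -- "P <pack-name>.pack" lines listing available packs
#   objects/xx/yyyy...              -- zlib-compressed loose objects
#   objects/pack/pack-*.pack / .idx -- pack data and the matching index
#
# There is no server-side negotiation, so the classes below reconstruct the
# object graph client-side by walking commits, trees and tags themselves.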


class DumbHTTPObjectStore(BaseObjectStore):
    """Object store implementation that fetches objects over dumb HTTP."""

    def __init__(
        self,
        base_url: str,
        http_request_func: Callable[
            [str, dict[str, str]], tuple[Any, Callable[..., bytes]]
        ],
        object_format=None,
    ) -> None:
        """Initialize a DumbHTTPObjectStore.

        Args:
          base_url: Base URL of the remote repository
            (e.g. "https://example.com/repo.git/")
          http_request_func: Function to make HTTP requests, should accept
            (url, headers) and return (response, read_func).
          object_format: Object format to use (defaults to DEFAULT_OBJECT_FORMAT)
        """
        super().__init__(object_format=object_format)
        self.base_url = base_url.rstrip("/") + "/"
        self._http_request = http_request_func
        self._packs: list[tuple[str, PackIndex | None]] | None = None
        self._cached_objects: dict[bytes, tuple[int, bytes]] = {}
        self._temp_pack_dir: str | None = None
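
    # The injected ``http_request_func`` is the only I/O dependency of this
    # class. As used in ``_fetch_url`` below, the returned response object must
    # expose a ``status`` attribute and a ``close()`` method, and the returned
    # ``read_func(n)`` must yield up to ``n`` bytes, returning an empty bytes
    # object at end of stream.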

    def _ensure_temp_pack_dir(self) -> None:
        """Ensure we have a temporary directory for storing pack files."""
        if self._temp_pack_dir is None:
            self._temp_pack_dir = tempfile.mkdtemp(prefix="dulwich-dumb-")

    def _fetch_url(self, path: str) -> bytes:
        """Fetch content from a URL path relative to base_url.

        Args:
          path: Path relative to base URL

        Returns:
          Content as bytes

        Raises:
          IOError: If the URL cannot be fetched
        """
        url = urljoin(self.base_url, path)
        resp, read = self._http_request(url, {})
        try:
            if resp.status == 404:
                raise OSError(f"Not found: {url}")
            elif resp.status != 200:
                raise OSError(f"HTTP error {resp.status}: {url}")
            # Read all content
            chunks = []
            while True:
                chunk = read(4096)
                if not chunk:
                    break
                chunks.append(chunk)
            return b"".join(chunks)
        finally:
            resp.close()

    def _fetch_loose_object(self, sha: bytes) -> tuple[int, bytes]:
        """Fetch a loose object by SHA.

        Args:
          sha: SHA1 of the object (hex string as bytes)

        Returns:
          Tuple of (type_num, content)

        Raises:
          KeyError: If object not found
        """
        hex_sha = sha.decode("ascii")
        path = f"objects/{hex_sha[:2]}/{hex_sha[2:]}"
        try:
            compressed = self._fetch_url(path)
        except OSError:
            raise KeyError(sha)
        # Decompress and parse the object
        decompressed = zlib.decompress(compressed)
        # Parse header
        header_end = decompressed.find(b"\x00")
        if header_end == -1:
            raise ObjectFormatException("Invalid object header")
        header = decompressed[:header_end]
        content = decompressed[header_end + 1 :]
        parts = header.split(b" ", 1)
        if len(parts) != 2:
            raise ObjectFormatException("Invalid object header")
        obj_type = parts[0]
        obj_size = int(parts[1])
        if len(content) != obj_size:
            raise ObjectFormatException("Object size mismatch")
        # Convert type name to type number
        type_map = {
            b"blob": Blob.type_num,
            b"tree": Tree.type_num,
            b"commit": Commit.type_num,
            b"tag": Tag.type_num,
        }
        if obj_type not in type_map:
            raise ObjectFormatException(f"Unknown object type: {obj_type!r}")
        return type_map[obj_type], content
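
    # For reference, a loose object is just ``zlib.compress(b"<type> <size>\x00"
    # + body)``. For example, decompressing the object for the 12-byte blob
    # "hello world\n" yields ``b"blob 12\x00hello world\n"``, which the parser
    # above splits into the type name, the declared size and the payload.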

    def _load_packs(self) -> None:
        """Load the list of available packs from the remote."""
        if self._packs is not None:
            return
        self._packs = []
        try:
            packs_data = self._fetch_url("objects/info/packs")
        except OSError:
            # No packs file, repository might only have loose objects
            return
        for line in packs_data.strip().split(b"\n"):
            if line.startswith(b"P "):
                pack_name = line[2:].decode("utf-8")
                # Extract just the pack name without path
                if "/" in pack_name:
                    pack_name = pack_name.split("/")[-1]
                if pack_name.endswith(".pack"):
                    pack_name = pack_name[:-5]  # Remove .pack extension
                self._packs.append((pack_name, None))
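
    # ``objects/info/packs`` is maintained by ``git update-server-info`` and
    # simply lists the available packs, one per line, e.g.:
    #
    #   P pack-6c6a5a42a26ec9a9b3b2a7dd5b0d5a02a53fd7c9.pack
    #
    # Only the file name matters here; the matching ``.idx`` lives next to it
    # under ``objects/pack/``.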

    def _get_pack_index(self, pack_name: str) -> PackIndex:
        """Get or fetch a pack index.

        Args:
          pack_name: Name of the pack (without .idx extension)

        Returns:
          PackIndex object
        """
        # Find the pack in our list
        for i, (name, idx) in enumerate(self._packs or []):
            if name == pack_name:
                if idx is None:
                    # Fetch and cache the index
                    idx_data = self._fetch_url(f"objects/pack/{pack_name}.idx")
                    idx = load_pack_index_file("<http>", BytesIO(idx_data))
                    if self._packs is not None:
                        self._packs[i] = (name, idx)
                return idx
        raise KeyError(f"Pack not found: {pack_name}")

    def _fetch_from_pack(self, sha: RawObjectID | ObjectID) -> tuple[int, bytes]:
        """Try to fetch an object from pack files.

        Args:
          sha: SHA1 of the object (hex or binary, as bytes)

        Returns:
          Tuple of (type_num, content)

        Raises:
          KeyError: If object not found in any pack
        """
        self._load_packs()
        # Convert hex to binary for pack operations
        if len(sha) == 20:
            binsha = RawObjectID(sha)  # Already binary
        else:
            binsha = hex_to_sha(ObjectID(sha))  # Convert hex to binary
        for pack_name, pack_idx in self._packs or []:
            if pack_idx is None:
                pack_idx = self._get_pack_index(pack_name)
            try:
                # Check if object is in this pack
                pack_idx.object_offset(binsha)
            except KeyError:
                continue
            # We found the object, now we need to fetch the pack data.
            # For efficiency, we could fetch just the needed portion, but for
            # simplicity we'll fetch the whole pack and cache it.
            self._ensure_temp_pack_dir()
            if self._temp_pack_dir is None:
                raise RuntimeError("Temp pack directory not initialized")
            pack_path = os.path.join(self._temp_pack_dir, f"{pack_name}.pack")
            if not os.path.exists(pack_path):
                # Download the pack file
                data = self._fetch_url(f"objects/pack/{pack_name}.pack")
                with open(pack_path, "wb") as f:
                    f.write(data)
            # Open the pack and get the object
            pack_data = PackData(pack_path, object_format=self.object_format)
            pack = Pack.from_objects(pack_data, pack_idx)
            try:
                return pack.get_raw(binsha)
            finally:
                pack.close()
        raise KeyError(sha)

    def get_raw(self, sha: RawObjectID | ObjectID) -> tuple[int, bytes]:
        """Obtain the raw text for an object.

        Args:
          sha: SHA1 of the object

        Returns:
          Tuple with numeric type and object contents
        """
        # Check cache first
        if sha in self._cached_objects:
            return self._cached_objects[sha]
        # Try packs first
        try:
            result = self._fetch_from_pack(sha)
            self._cached_objects[sha] = result
            return result
        except KeyError:
            pass
        # Try loose object
        result = self._fetch_loose_object(sha)
        self._cached_objects[sha] = result
        return result

    def contains_loose(self, sha: RawObjectID | ObjectID) -> bool:
        """Check if a particular object is present by SHA1 and is loose."""
        try:
            self._fetch_loose_object(sha)
            return True
        except KeyError:
            return False

    def __contains__(self, sha: RawObjectID | ObjectID) -> bool:
        """Check if a particular object is present by SHA1."""
        if sha in self._cached_objects:
            return True
        # Try packs
        try:
            self._fetch_from_pack(sha)
            return True
        except KeyError:
            pass
        # Try loose object
        try:
            self._fetch_loose_object(sha)
            return True
        except KeyError:
            return False

    def __iter__(self) -> Iterator[ObjectID]:
        """Iterate over all SHAs in the store.

        Note: This is inefficient for dumb HTTP as it requires
        downloading all pack indices.
        """
        seen = set()
        # We can't efficiently list loose objects over dumb HTTP,
        # so we only iterate pack objects.
        self._load_packs()
        for pack_name, idx in self._packs or []:
            if idx is None:
                idx = self._get_pack_index(pack_name)
            for sha in idx:
                if sha not in seen:
                    seen.add(sha)
                    yield sha_to_hex(RawObjectID(sha))

    @property
    def packs(self) -> list[Any]:
        """Iterable of pack objects.

        Note: Returns empty list as we don't have actual Pack objects.
        """
        return []

    def add_object(self, obj: ShaFile) -> None:
        """Add a single object to this object store."""
        raise NotImplementedError("Cannot add objects to dumb HTTP repository")

    def add_objects(
        self,
        objects: Sequence[tuple[ShaFile, str | None]],
        progress: Callable[[str], None] | None = None,
    ) -> "Pack | None":
        """Add a set of objects to this object store."""
        raise NotImplementedError("Cannot add objects to dumb HTTP repository")

    def __del__(self) -> None:
        """Clean up temporary directory on deletion."""
        if self._temp_pack_dir and os.path.exists(self._temp_pack_dir):
            import shutil

            shutil.rmtree(self._temp_pack_dir, ignore_errors=True)


class DumbRemoteHTTPRepo:
    """Repository implementation for dumb HTTP remotes."""

    def __init__(
        self,
        base_url: str,
        http_request_func: Callable[
            [str, dict[str, str]], tuple[Any, Callable[..., bytes]]
        ],
    ) -> None:
        """Initialize a DumbRemoteHTTPRepo.

        Args:
          base_url: Base URL of the remote repository
          http_request_func: Function to make HTTP requests.
        """
        self.base_url = base_url.rstrip("/") + "/"
        self._http_request = http_request_func
        self._refs: dict[Ref, ObjectID] | None = None
        self._peeled: dict[Ref, ObjectID] | None = None
        self.object_store = DumbHTTPObjectStore(base_url, http_request_func)

    def _fetch_url(self, path: str) -> bytes:
        """Fetch content from a URL path relative to base_url."""
        url = urljoin(self.base_url, path)
        resp, read = self._http_request(url, {})
        try:
            if resp.status == 404:
                raise OSError(f"Not found: {url}")
            elif resp.status != 200:
                raise OSError(f"HTTP error {resp.status}: {url}")
            chunks = []
            while True:
                chunk = read(4096)
                if not chunk:
                    break
                chunks.append(chunk)
            return b"".join(chunks)
        finally:
            resp.close()

    def get_refs(self) -> dict[Ref, ObjectID]:
        """Get dictionary with all refs."""
        if self._refs is None:
            # Fetch info/refs
            try:
                refs_data = self._fetch_url("info/refs")
            except OSError:
                raise NotGitRepository(f"Cannot read refs from {self.base_url}")
            refs_hex = read_info_refs(BytesIO(refs_data))
            # Keep SHAs as hex
            refs_raw, peeled_raw = split_peeled_refs(refs_hex)
            # Convert to typed dicts
            self._refs = {Ref(k): ObjectID(v) for k, v in refs_raw.items()}
            self._peeled = peeled_raw
        return dict(self._refs)
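
    # ``info/refs`` (also produced by ``git update-server-info``) is a plain
    # text file with one tab-separated "<hex-sha>\t<refname>" pair per line;
    # annotated tags may be followed by a peeled "<refname>^{}" entry, which
    # ``split_peeled_refs`` separates out above.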

    def get_head(self) -> Ref:
        """Get the current HEAD reference.

        Returns:
          HEAD reference name or commit ID
        """
        head_resp_bytes = self._fetch_url("HEAD")
        head_split = head_resp_bytes.replace(b"\n", b"").split(b" ")
        head_target_bytes = head_split[1] if len(head_split) > 1 else head_split[0]
        # Handle the legacy HEAD format containing a commit id instead of a
        # ref name by mapping the id back to the ref that points at it.
        for ref_name, ref_target in self.get_refs().items():
            if ref_target == head_target_bytes:
                return ref_name
        return Ref(head_target_bytes)
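
    # A modern dumb remote serves HEAD as a symref, e.g. ``ref: refs/heads/main``,
    # in which case the split above yields the ref name directly; very old
    # layouts store a bare commit id, which the loop above maps back to a ref
    # when possible.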

    def get_peeled(self, ref: Ref) -> ObjectID:
        """Get the peeled value of a ref."""
        # For dumb HTTP, we don't have peeled refs readily available;
        # we would need to fetch and parse tag objects.
        sha = self.get_refs().get(ref, None)
        return sha if sha is not None else ZERO_SHA

    def fetch_pack_data(
        self,
        determine_wants: Callable[
            [Mapping[Ref, ObjectID], int | None], list[ObjectID]
        ],
        graph_walker: object,
        progress: Callable[[bytes], None] | None = None,
        *,
        get_tagged: bool | None = None,
        depth: int | None = None,
    ) -> Iterator[UnpackedObject]:
        """Fetch pack data from the remote.

        This is the main method for fetching objects from a dumb HTTP remote.
        Since dumb HTTP doesn't support negotiation, we need to download
        all objects reachable from the wanted refs.

        Args:
          determine_wants: Function that returns list of wanted SHAs
          graph_walker: GraphWalker instance (not used for dumb HTTP)
          progress: Optional progress callback
          get_tagged: Whether to get tagged objects
          depth: Depth for shallow clones (not supported for dumb HTTP)

        Returns:
          Iterator of UnpackedObject instances
        """
        refs = self.get_refs()
        wants = determine_wants(refs, depth)
        if not wants:
            return
        # For dumb HTTP, we traverse the object graph starting from wants
        to_fetch = set(wants)
        seen = set()
        while to_fetch:
            sha = to_fetch.pop()
            if sha in seen:
                continue
            seen.add(sha)
            # Fetch the object
            try:
                type_num, content = self.object_store.get_raw(sha)
            except KeyError:
                # Object not found, skip it
                continue
            unpacked = UnpackedObject(type_num, sha=sha, decomp_chunks=[content])
            yield unpacked
            # Parse the object to find references to other objects
            obj = ShaFile.from_raw_string(type_num, content)
            if isinstance(obj, Commit):
                to_fetch.add(obj.tree)
                for parent in obj.parents:
                    to_fetch.add(parent)
            elif isinstance(obj, Tag):
                to_fetch.add(obj.object[1])
            elif isinstance(obj, Tree):
                for _, _, item_sha in obj.items():
                    assert item_sha is not None
                    to_fetch.add(item_sha)
            if progress:
                progress(f"Fetching objects: {len(seen)} done\n".encode())
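

# The sketch below is illustrative only and is not part of the module's API:
# it shows one way to satisfy the ``http_request_func`` contract using urllib3
# (which the example assumes is installed), then lists refs and walks all
# reachable objects from a dumb remote. The URL is a placeholder; run it, if
# at all, as ``python -m dulwich.dumb`` so the relative imports resolve.
if __name__ == "__main__":  # pragma: no cover
    import urllib3

    pool = urllib3.PoolManager()

    def _http_request(url, headers):
        # Return (response, read_func) as the classes above expect; the
        # urllib3 response exposes .status, .read(n) and .close().
        resp = pool.request("GET", url, headers=headers, preload_content=False)
        return resp, resp.read

    repo = DumbRemoteHTTPRepo("https://example.com/repo.git/", _http_request)
    print(repo.get_refs())
    # Download every object reachable from all advertised refs.
    for _ in repo.fetch_pack_data(
        lambda refs, depth=None: list(refs.values()), graph_walker=None
    ):
        pass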