dumb.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513
  1. # dumb.py -- Support for dumb HTTP(S) git repositories
  2. # Copyright (C) 2025 Dulwich contributors
  3. #
  4. # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
  5. # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
  6. # General Public License as published by the Free Software Foundation; version 2.0
  7. # or (at your option) any later version. You can redistribute it and/or
  8. # modify it under the terms of either of these two licenses.
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. # You should have received a copy of the licenses; if not, see
  17. # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  18. # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  19. # License, Version 2.0.
  20. #
  21. """Support for dumb HTTP(S) git repositories."""
  22. __all__ = [
  23. "DumbHTTPObjectStore",
  24. "DumbRemoteHTTPRepo",
  25. ]
  26. import os
  27. import tempfile
  28. import zlib
  29. from collections.abc import Callable, Iterator, Mapping, Sequence
  30. from io import BytesIO
  31. from typing import Any
  32. from urllib.parse import urljoin
  33. from .errors import NotGitRepository, ObjectFormatException
  34. from .object_store import BaseObjectStore
  35. from .objects import (
  36. ZERO_SHA,
  37. Blob,
  38. Commit,
  39. ObjectID,
  40. RawObjectID,
  41. ShaFile,
  42. Tag,
  43. Tree,
  44. hex_to_sha,
  45. sha_to_hex,
  46. )
  47. from .pack import Pack, PackData, PackIndex, UnpackedObject, load_pack_index_file
  48. from .protocol import split_peeled_refs
  49. from .refs import Ref, read_info_refs
  50. class DumbHTTPObjectStore(BaseObjectStore):
  51. """Object store implementation that fetches objects over dumb HTTP."""
  52. def __init__(
  53. self,
  54. base_url: str,
  55. http_request_func: Callable[
  56. [str, dict[str, str]], tuple[Any, Callable[..., bytes]]
  57. ],
  58. ) -> None:
  59. """Initialize a DumbHTTPObjectStore.
  60. Args:
  61. base_url: Base URL of the remote repository (e.g. "https://example.com/repo.git/")
  62. http_request_func: Function to make HTTP requests, should accept (url, headers)
  63. and return (response, read_func).
  64. """
  65. self.base_url = base_url.rstrip("/") + "/"
  66. self._http_request = http_request_func
  67. self._packs: list[tuple[str, PackIndex | None]] | None = None
  68. self._cached_objects: dict[bytes, tuple[int, bytes]] = {}
  69. self._temp_pack_dir: str | None = None
  70. def _ensure_temp_pack_dir(self) -> None:
  71. """Ensure we have a temporary directory for storing pack files."""
  72. if self._temp_pack_dir is None:
  73. self._temp_pack_dir = tempfile.mkdtemp(prefix="dulwich-dumb-")
  74. def _fetch_url(self, path: str) -> bytes:
  75. """Fetch content from a URL path relative to base_url.
  76. Args:
  77. path: Path relative to base URL
  78. Returns:
  79. Content as bytes
  80. Raises:
  81. IOError: If the URL cannot be fetched
  82. """
  83. url = urljoin(self.base_url, path)
  84. resp, read = self._http_request(url, {})
  85. try:
  86. if resp.status == 404:
  87. raise OSError(f"Not found: {url}")
  88. elif resp.status != 200:
  89. raise OSError(f"HTTP error {resp.status}: {url}")
  90. # Read all content
  91. chunks = []
  92. while True:
  93. chunk = read(4096)
  94. if not chunk:
  95. break
  96. chunks.append(chunk)
  97. return b"".join(chunks)
  98. finally:
  99. resp.close()
  100. def _fetch_loose_object(self, sha: bytes) -> tuple[int, bytes]:
  101. """Fetch a loose object by SHA.
  102. Args:
  103. sha: SHA1 of the object (hex string as bytes)
  104. Returns:
  105. Tuple of (type_num, content)
  106. Raises:
  107. KeyError: If object not found
  108. """
  109. hex_sha = sha.decode("ascii")
  110. path = f"objects/{hex_sha[:2]}/{hex_sha[2:]}"
  111. try:
  112. compressed = self._fetch_url(path)
  113. except OSError:
  114. raise KeyError(sha)
  115. # Decompress and parse the object
  116. decompressed = zlib.decompress(compressed)
  117. # Parse header
  118. header_end = decompressed.find(b"\x00")
  119. if header_end == -1:
  120. raise ObjectFormatException("Invalid object header")
  121. header = decompressed[:header_end]
  122. content = decompressed[header_end + 1 :]
  123. parts = header.split(b" ", 1)
  124. if len(parts) != 2:
  125. raise ObjectFormatException("Invalid object header")
  126. obj_type = parts[0]
  127. obj_size = int(parts[1])
  128. if len(content) != obj_size:
  129. raise ObjectFormatException("Object size mismatch")
  130. # Convert type name to type number
  131. type_map = {
  132. b"blob": Blob.type_num,
  133. b"tree": Tree.type_num,
  134. b"commit": Commit.type_num,
  135. b"tag": Tag.type_num,
  136. }
  137. if obj_type not in type_map:
  138. raise ObjectFormatException(f"Unknown object type: {obj_type!r}")
  139. return type_map[obj_type], content
  140. def _load_packs(self) -> None:
  141. """Load the list of available packs from the remote."""
  142. if self._packs is not None:
  143. return
  144. self._packs = []
  145. try:
  146. packs_data = self._fetch_url("objects/info/packs")
  147. except OSError:
  148. # No packs file, repository might only have loose objects
  149. return
  150. for line in packs_data.strip().split(b"\n"):
  151. if line.startswith(b"P "):
  152. pack_name = line[2:].decode("utf-8")
  153. # Extract just the pack name without path
  154. if "/" in pack_name:
  155. pack_name = pack_name.split("/")[-1]
  156. if pack_name.endswith(".pack"):
  157. pack_name = pack_name[:-5] # Remove .pack extension
  158. self._packs.append((pack_name, None))
  159. def _get_pack_index(self, pack_name: str) -> PackIndex:
  160. """Get or fetch a pack index.
  161. Args:
  162. pack_name: Name of the pack (without .idx extension)
  163. Returns:
  164. PackIndex object
  165. """
  166. # Find the pack in our list
  167. for i, (name, idx) in enumerate(self._packs or []):
  168. if name == pack_name:
  169. if idx is None:
  170. # Fetch and cache the index
  171. idx_data = self._fetch_url(f"objects/pack/{pack_name}.idx")
  172. idx = load_pack_index_file("<http>", BytesIO(idx_data))
  173. if self._packs is not None:
  174. self._packs[i] = (name, idx)
  175. return idx
  176. raise KeyError(f"Pack not found: {pack_name}")
  177. def _fetch_from_pack(self, sha: RawObjectID | ObjectID) -> tuple[int, bytes]:
  178. """Try to fetch an object from pack files.
  179. Args:
  180. sha: SHA1 of the object (hex string as bytes)
  181. Returns:
  182. Tuple of (type_num, content)
  183. Raises:
  184. KeyError: If object not found in any pack
  185. """
  186. self._load_packs()
  187. # Convert hex to binary for pack operations
  188. if len(sha) == 20:
  189. binsha = RawObjectID(sha) # Already binary
  190. else:
  191. binsha = hex_to_sha(ObjectID(sha)) # Convert hex to binary
  192. for pack_name, pack_idx in self._packs or []:
  193. if pack_idx is None:
  194. pack_idx = self._get_pack_index(pack_name)
  195. try:
  196. # Check if object is in this pack
  197. pack_idx.object_offset(binsha)
  198. except KeyError:
  199. continue
  200. # We found the object, now we need to fetch the pack data
  201. # For efficiency, we could fetch just the needed portion, but for
  202. # simplicity we'll fetch the whole pack and cache it
  203. self._ensure_temp_pack_dir()
  204. if self._temp_pack_dir is None:
  205. raise RuntimeError("Temp pack directory not initialized")
  206. pack_path = os.path.join(self._temp_pack_dir, f"{pack_name}.pack")
  207. if not os.path.exists(pack_path):
  208. # Download the pack file
  209. data = self._fetch_url(f"objects/pack/{pack_name}.pack")
  210. with open(pack_path, "wb") as f:
  211. f.write(data)
  212. # Open the pack and get the object
  213. pack_data = PackData(pack_path)
  214. pack = Pack.from_objects(pack_data, pack_idx)
  215. try:
  216. return pack.get_raw(binsha)
  217. finally:
  218. pack.close()
  219. raise KeyError(sha)
  220. def get_raw(self, sha: RawObjectID | ObjectID) -> tuple[int, bytes]:
  221. """Obtain the raw text for an object.
  222. Args:
  223. sha: SHA1 of the object
  224. Returns:
  225. Tuple with numeric type and object contents
  226. """
  227. # Check cache first
  228. if sha in self._cached_objects:
  229. return self._cached_objects[sha]
  230. # Try packs first
  231. try:
  232. result = self._fetch_from_pack(sha)
  233. self._cached_objects[sha] = result
  234. return result
  235. except KeyError:
  236. pass
  237. # Try loose object
  238. result = self._fetch_loose_object(sha)
  239. self._cached_objects[sha] = result
  240. return result
  241. def contains_loose(self, sha: RawObjectID | ObjectID) -> bool:
  242. """Check if a particular object is present by SHA1 and is loose."""
  243. try:
  244. self._fetch_loose_object(sha)
  245. return True
  246. except KeyError:
  247. return False
  248. def __contains__(self, sha: RawObjectID | ObjectID) -> bool:
  249. """Check if a particular object is present by SHA1."""
  250. if sha in self._cached_objects:
  251. return True
  252. # Try packs
  253. try:
  254. self._fetch_from_pack(sha)
  255. return True
  256. except KeyError:
  257. pass
  258. # Try loose object
  259. try:
  260. self._fetch_loose_object(sha)
  261. return True
  262. except KeyError:
  263. return False
  264. def __iter__(self) -> Iterator[ObjectID]:
  265. """Iterate over all SHAs in the store.
  266. Note: This is inefficient for dumb HTTP as it requires
  267. downloading all pack indices.
  268. """
  269. seen = set()
  270. # We can't efficiently list loose objects over dumb HTTP
  271. # So we only iterate pack objects
  272. self._load_packs()
  273. for pack_name, idx in self._packs or []:
  274. if idx is None:
  275. idx = self._get_pack_index(pack_name)
  276. for sha in idx:
  277. if sha not in seen:
  278. seen.add(sha)
  279. yield sha_to_hex(RawObjectID(sha))
  280. @property
  281. def packs(self) -> list[Any]:
  282. """Iterable of pack objects.
  283. Note: Returns empty list as we don't have actual Pack objects.
  284. """
  285. return []
  286. def add_object(self, obj: ShaFile) -> None:
  287. """Add a single object to this object store."""
  288. raise NotImplementedError("Cannot add objects to dumb HTTP repository")
  289. def add_objects(
  290. self,
  291. objects: Sequence[tuple[ShaFile, str | None]],
  292. progress: Callable[[str], None] | None = None,
  293. ) -> "Pack | None":
  294. """Add a set of objects to this object store."""
  295. raise NotImplementedError("Cannot add objects to dumb HTTP repository")
  296. def __del__(self) -> None:
  297. """Clean up temporary directory on deletion."""
  298. if self._temp_pack_dir and os.path.exists(self._temp_pack_dir):
  299. import shutil
  300. shutil.rmtree(self._temp_pack_dir, ignore_errors=True)
  301. class DumbRemoteHTTPRepo:
  302. """Repository implementation for dumb HTTP remotes."""
  303. def __init__(
  304. self,
  305. base_url: str,
  306. http_request_func: Callable[
  307. [str, dict[str, str]], tuple[Any, Callable[..., bytes]]
  308. ],
  309. ) -> None:
  310. """Initialize a DumbRemoteHTTPRepo.
  311. Args:
  312. base_url: Base URL of the remote repository
  313. http_request_func: Function to make HTTP requests.
  314. """
  315. self.base_url = base_url.rstrip("/") + "/"
  316. self._http_request = http_request_func
  317. self._refs: dict[Ref, ObjectID] | None = None
  318. self._peeled: dict[Ref, ObjectID] | None = None
  319. self.object_store = DumbHTTPObjectStore(base_url, http_request_func)
  320. def _fetch_url(self, path: str) -> bytes:
  321. """Fetch content from a URL path relative to base_url."""
  322. url = urljoin(self.base_url, path)
  323. resp, read = self._http_request(url, {})
  324. try:
  325. if resp.status == 404:
  326. raise OSError(f"Not found: {url}")
  327. elif resp.status != 200:
  328. raise OSError(f"HTTP error {resp.status}: {url}")
  329. chunks = []
  330. while True:
  331. chunk = read(4096)
  332. if not chunk:
  333. break
  334. chunks.append(chunk)
  335. return b"".join(chunks)
  336. finally:
  337. resp.close()
  338. def get_refs(self) -> dict[Ref, ObjectID]:
  339. """Get dictionary with all refs."""
  340. if self._refs is None:
  341. # Fetch info/refs
  342. try:
  343. refs_data = self._fetch_url("info/refs")
  344. except OSError:
  345. raise NotGitRepository(f"Cannot read refs from {self.base_url}")
  346. refs_hex = read_info_refs(BytesIO(refs_data))
  347. # Keep SHAs as hex
  348. refs_raw, peeled_raw = split_peeled_refs(refs_hex)
  349. # Convert to typed dicts
  350. self._refs = {Ref(k): ObjectID(v) for k, v in refs_raw.items()}
  351. self._peeled = peeled_raw
  352. return dict(self._refs)
  353. def get_head(self) -> Ref:
  354. """Get the current HEAD reference.
  355. Returns:
  356. HEAD reference name or commit ID
  357. """
  358. head_resp_bytes = self._fetch_url("HEAD")
  359. head_split = head_resp_bytes.replace(b"\n", b"").split(b" ")
  360. head_target_bytes = head_split[1] if len(head_split) > 1 else head_split[0]
  361. # handle HEAD legacy format containing a commit id instead of a ref name
  362. for ref_name, ret_target in self.get_refs().items():
  363. if ret_target == head_target_bytes:
  364. return ref_name
  365. return Ref(head_target_bytes)
  366. def get_peeled(self, ref: Ref) -> ObjectID:
  367. """Get the peeled value of a ref."""
  368. # For dumb HTTP, we don't have peeled refs readily available
  369. # We would need to fetch and parse tag objects
  370. sha = self.get_refs().get(ref, None)
  371. return sha if sha is not None else ZERO_SHA
  372. def fetch_pack_data(
  373. self,
  374. determine_wants: Callable[[Mapping[Ref, ObjectID], int | None], list[ObjectID]],
  375. graph_walker: object,
  376. progress: Callable[[bytes], None] | None = None,
  377. *,
  378. get_tagged: bool | None = None,
  379. depth: int | None = None,
  380. ) -> Iterator[UnpackedObject]:
  381. """Fetch pack data from the remote.
  382. This is the main method for fetching objects from a dumb HTTP remote.
  383. Since dumb HTTP doesn't support negotiation, we need to download
  384. all objects reachable from the wanted refs.
  385. Args:
  386. determine_wants: Function that returns list of wanted SHAs
  387. graph_walker: GraphWalker instance (not used for dumb HTTP)
  388. progress: Optional progress callback
  389. get_tagged: Whether to get tagged objects
  390. depth: Depth for shallow clones (not supported for dumb HTTP)
  391. Returns:
  392. Iterator of UnpackedObject instances
  393. """
  394. refs = self.get_refs()
  395. wants = determine_wants(refs, depth)
  396. if not wants:
  397. return
  398. # For dumb HTTP, we traverse the object graph starting from wants
  399. to_fetch = set(wants)
  400. seen = set()
  401. while to_fetch:
  402. sha = to_fetch.pop()
  403. if sha in seen:
  404. continue
  405. seen.add(sha)
  406. # Fetch the object
  407. try:
  408. type_num, content = self.object_store.get_raw(sha)
  409. except KeyError:
  410. # Object not found, skip it
  411. continue
  412. unpacked = UnpackedObject(type_num, sha=sha, decomp_chunks=[content])
  413. yield unpacked
  414. # Parse the object to find references to other objects
  415. obj = ShaFile.from_raw_string(type_num, content)
  416. if isinstance(obj, Commit): # Commit
  417. to_fetch.add(obj.tree)
  418. for parent in obj.parents:
  419. to_fetch.add(parent)
  420. elif isinstance(obj, Tag): # Tag
  421. to_fetch.add(obj.object[1])
  422. elif isinstance(obj, Tree): # Tree
  423. for _, _, item_sha in obj.items():
  424. assert item_sha is not None
  425. to_fetch.add(item_sha)
  426. if progress:
  427. progress(f"Fetching objects: {len(seen)} done\n".encode())