# dumb.py -- Support for dumb HTTP(S) git repositories
# Copyright (C) 2025 Dulwich contributors
#
# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
# General Public License as published by the Free Software Foundation; version 2.0
# or (at your option) any later version. You can redistribute it and/or
# modify it under the terms of either of these two licenses.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# You should have received a copy of the licenses; if not, see
# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
# License, Version 2.0.
#
  21. """Support for dumb HTTP(S) git repositories."""
  22. import os
  23. import tempfile
  24. import zlib
  25. from collections.abc import Iterator, Sequence
  26. from io import BytesIO
  27. from typing import Any, Callable, Optional
  28. from urllib.parse import urljoin
  29. from .errors import NotGitRepository, ObjectFormatException
  30. from .object_store import BaseObjectStore
  31. from .objects import (
  32. ZERO_SHA,
  33. Blob,
  34. Commit,
  35. ObjectID,
  36. ShaFile,
  37. Tag,
  38. Tree,
  39. hex_to_sha,
  40. sha_to_hex,
  41. )
  42. from .pack import Pack, PackData, PackIndex, UnpackedObject, load_pack_index_file
  43. from .refs import Ref, read_info_refs, split_peeled_refs


class DumbHTTPObjectStore(BaseObjectStore):
    """Object store implementation that fetches objects over dumb HTTP."""

    def __init__(
        self,
        base_url: str,
        http_request_func: Callable[
            [str, dict[str, str]], tuple[Any, Callable[..., bytes]]
        ],
    ) -> None:
        """Initialize a DumbHTTPObjectStore.

        Args:
          base_url: Base URL of the remote repository (e.g. "https://example.com/repo.git/")
          http_request_func: Function to make HTTP requests; it should accept
            (url, headers) and return (response, read_func).
        """
        self.base_url = base_url.rstrip("/") + "/"
        self._http_request = http_request_func
        self._packs: Optional[list[tuple[str, Optional[PackIndex]]]] = None
        self._cached_objects: dict[bytes, tuple[int, bytes]] = {}
        self._temp_pack_dir: Optional[str] = None
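
    # Illustrative sketch (not part of this module's API): ``http_request_func``
    # is assumed to behave like a urllib3-based transport, returning a response
    # object with ``status`` and ``close()`` plus a ``read(size)`` callable.
    # One possible implementation:
    #
    #     import urllib3
    #
    #     pool = urllib3.PoolManager()
    #
    #     def http_request(url, headers):
    #         resp = pool.request("GET", url, headers=headers, preload_content=False)
    #         return resp, resp.read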

    def _ensure_temp_pack_dir(self) -> None:
        """Ensure we have a temporary directory for storing pack files."""
        if self._temp_pack_dir is None:
            self._temp_pack_dir = tempfile.mkdtemp(prefix="dulwich-dumb-")

    def _fetch_url(self, path: str) -> bytes:
        """Fetch content from a URL path relative to base_url.

        Args:
          path: Path relative to base URL
        Returns:
          Content as bytes
        Raises:
          IOError: If the URL cannot be fetched
        """
        url = urljoin(self.base_url, path)
        resp, read = self._http_request(url, {})
        try:
            if resp.status == 404:
                raise OSError(f"Not found: {url}")
            elif resp.status != 200:
                raise OSError(f"HTTP error {resp.status}: {url}")
            # Read all content
            chunks = []
            while True:
                chunk = read(4096)
                if not chunk:
                    break
                chunks.append(chunk)
            return b"".join(chunks)
        finally:
            resp.close()

    def _fetch_loose_object(self, sha: bytes) -> tuple[int, bytes]:
        """Fetch a loose object by SHA.

        Args:
          sha: SHA1 of the object (hex string as bytes)
        Returns:
          Tuple of (type_num, content)
        Raises:
          KeyError: If object not found
        """
        hex_sha = sha.decode("ascii")
        path = f"objects/{hex_sha[:2]}/{hex_sha[2:]}"
        try:
            compressed = self._fetch_url(path)
        except OSError:
            raise KeyError(sha)

        # Decompress and parse the object
        decompressed = zlib.decompress(compressed)

        # Parse header
        header_end = decompressed.find(b"\x00")
        if header_end == -1:
            raise ObjectFormatException("Invalid object header")
        header = decompressed[:header_end]
        content = decompressed[header_end + 1 :]

        parts = header.split(b" ", 1)
        if len(parts) != 2:
            raise ObjectFormatException("Invalid object header")
        obj_type = parts[0]
        obj_size = int(parts[1])
        if len(content) != obj_size:
            raise ObjectFormatException("Object size mismatch")

        # Convert type name to type number
        type_map = {
            b"blob": Blob.type_num,
            b"tree": Tree.type_num,
            b"commit": Commit.type_num,
            b"tag": Tag.type_num,
        }
        if obj_type not in type_map:
            raise ObjectFormatException(f"Unknown object type: {obj_type!r}")
        return type_map[obj_type], content
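
    # For reference: a loose object is the zlib-deflated bytes
    # "<type> <size>\x00<content>", stored at
    # objects/<first two hex chars>/<remaining hex chars>. For example, the
    # blob containing b"hello world!" decompresses to b"blob 12\x00hello world!".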

    def _load_packs(self) -> None:
        """Load the list of available packs from the remote."""
        if self._packs is not None:
            return
        self._packs = []
        try:
            packs_data = self._fetch_url("objects/info/packs")
        except OSError:
            # No packs file, repository might only have loose objects
            return
        for line in packs_data.strip().split(b"\n"):
            if line.startswith(b"P "):
                pack_name = line[2:].decode("utf-8")
                # Extract just the pack name without path
                if "/" in pack_name:
                    pack_name = pack_name.split("/")[-1]
                if pack_name.endswith(".pack"):
                    pack_name = pack_name[:-5]  # Remove .pack extension
                self._packs.append((pack_name, None))
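
    # For reference, objects/info/packs (written by ``git update-server-info``)
    # is a plain-text listing with one "P <pack file name>" entry per pack, e.g.:
    #
    #     P pack-<hex sha>.pack
    #     P pack-<hex sha>.pack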

    def _get_pack_index(self, pack_name: str) -> PackIndex:
        """Get or fetch a pack index.

        Args:
          pack_name: Name of the pack (without .idx extension)
        Returns:
          PackIndex object
        """
        # Find the pack in our list
        for i, (name, idx) in enumerate(self._packs or []):
            if name == pack_name:
                if idx is None:
                    # Fetch and cache the index
                    idx_data = self._fetch_url(f"objects/pack/{pack_name}.idx")
                    idx = load_pack_index_file("<http>", BytesIO(idx_data))
                    if self._packs is not None:
                        self._packs[i] = (name, idx)
                return idx
        raise KeyError(f"Pack not found: {pack_name}")

    def _fetch_from_pack(self, sha: bytes) -> tuple[int, bytes]:
        """Try to fetch an object from pack files.

        Args:
          sha: SHA1 of the object (hex string as bytes)
        Returns:
          Tuple of (type_num, content)
        Raises:
          KeyError: If object not found in any pack
        """
        self._load_packs()
        # Convert hex to binary for pack operations
        binsha = hex_to_sha(sha)
        for pack_name, pack_idx in self._packs or []:
            if pack_idx is None:
                pack_idx = self._get_pack_index(pack_name)
            try:
                # Check if object is in this pack
                pack_idx.object_offset(binsha)
            except KeyError:
                continue

            # We found the object, now we need to fetch the pack data.
            # For efficiency, we could fetch just the needed portion, but for
            # simplicity we'll fetch the whole pack and cache it.
            self._ensure_temp_pack_dir()
            if self._temp_pack_dir is None:
                raise RuntimeError("Temp pack directory not initialized")
            pack_path = os.path.join(self._temp_pack_dir, f"{pack_name}.pack")
            if not os.path.exists(pack_path):
                # Download the pack file
                data = self._fetch_url(f"objects/pack/{pack_name}.pack")
                with open(pack_path, "wb") as f:
                    f.write(data)

            # Open the pack and get the object
            pack_data = PackData(pack_path)
            pack = Pack.from_objects(pack_data, pack_idx)
            try:
                return pack.get_raw(binsha)
            finally:
                pack.close()
        raise KeyError(sha)

    def get_raw(self, sha: bytes) -> tuple[int, bytes]:
        """Obtain the raw text for an object.

        Args:
          sha: SHA1 of the object
        Returns:
          Tuple with numeric type and object contents
        """
        # Check cache first
        if sha in self._cached_objects:
            return self._cached_objects[sha]

        # Try packs first
        try:
            result = self._fetch_from_pack(sha)
            self._cached_objects[sha] = result
            return result
        except KeyError:
            pass

        # Try loose object
        result = self._fetch_loose_object(sha)
        self._cached_objects[sha] = result
        return result

    def contains_loose(self, sha: bytes) -> bool:
        """Check if a particular object is present by SHA1 and is loose."""
        try:
            self._fetch_loose_object(sha)
            return True
        except KeyError:
            return False

    def __contains__(self, sha: bytes) -> bool:
        """Check if a particular object is present by SHA1."""
        if sha in self._cached_objects:
            return True
        # Try packs
        try:
            self._fetch_from_pack(sha)
            return True
        except KeyError:
            pass
        # Try loose object
        try:
            self._fetch_loose_object(sha)
            return True
        except KeyError:
            return False

    def __iter__(self) -> Iterator[bytes]:
        """Iterate over all SHAs in the store.

        Note: This is inefficient for dumb HTTP as it requires
        downloading all pack indices.
        """
        seen = set()
        # We can't efficiently list loose objects over dumb HTTP,
        # so we only iterate pack objects.
        self._load_packs()
        for pack_name, idx in self._packs or []:
            if idx is None:
                idx = self._get_pack_index(pack_name)
            for sha in idx:
                if sha not in seen:
                    seen.add(sha)
                    yield sha_to_hex(sha)

    @property
    def packs(self) -> list[Any]:
        """Iterable of pack objects.

        Note: Returns an empty list as we don't have actual Pack objects.
        """
        return []

    def add_object(self, obj: ShaFile) -> None:
        """Add a single object to this object store."""
        raise NotImplementedError("Cannot add objects to dumb HTTP repository")

    def add_objects(
        self,
        objects: Sequence[tuple[ShaFile, Optional[str]]],
        progress: Optional[Callable[[str], None]] = None,
    ) -> Optional["Pack"]:
        """Add a set of objects to this object store."""
        raise NotImplementedError("Cannot add objects to dumb HTTP repository")

    def __del__(self) -> None:
        """Clean up temporary directory on deletion."""
        if self._temp_pack_dir and os.path.exists(self._temp_pack_dir):
            import shutil

            shutil.rmtree(self._temp_pack_dir, ignore_errors=True)


class DumbRemoteHTTPRepo:
    """Repository implementation for dumb HTTP remotes."""

    def __init__(
        self,
        base_url: str,
        http_request_func: Callable[
            [str, dict[str, str]], tuple[Any, Callable[..., bytes]]
        ],
    ) -> None:
        """Initialize a DumbRemoteHTTPRepo.

        Args:
          base_url: Base URL of the remote repository
          http_request_func: Function to make HTTP requests.
        """
        self.base_url = base_url.rstrip("/") + "/"
        self._http_request = http_request_func
        self._refs: Optional[dict[Ref, ObjectID]] = None
        self._peeled: Optional[dict[Ref, ObjectID]] = None
        self.object_store = DumbHTTPObjectStore(base_url, http_request_func)

    def _fetch_url(self, path: str) -> bytes:
        """Fetch content from a URL path relative to base_url."""
        url = urljoin(self.base_url, path)
        resp, read = self._http_request(url, {})
        try:
            if resp.status == 404:
                raise OSError(f"Not found: {url}")
            elif resp.status != 200:
                raise OSError(f"HTTP error {resp.status}: {url}")
            chunks = []
            while True:
                chunk = read(4096)
                if not chunk:
                    break
                chunks.append(chunk)
            return b"".join(chunks)
        finally:
            resp.close()

    def get_refs(self) -> dict[Ref, ObjectID]:
        """Get dictionary with all refs."""
        if self._refs is None:
            # Fetch info/refs
            try:
                refs_data = self._fetch_url("info/refs")
            except OSError:
                raise NotGitRepository(f"Cannot read refs from {self.base_url}")
            refs_hex = read_info_refs(BytesIO(refs_data))
            # Keep SHAs as hex
            self._refs, self._peeled = split_peeled_refs(refs_hex)
        return dict(self._refs)
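
    # For reference, a dumb server's info/refs (written by ``git
    # update-server-info``) has one "<hex sha> TAB <ref name>" line per ref;
    # peeled tag values appear as extra entries whose name ends in "^{}",
    # which split_peeled_refs() separates out, e.g.:
    #
    #     <commit sha>   refs/heads/master
    #     <tag sha>      refs/tags/v1.0
    #     <commit sha>   refs/tags/v1.0^{}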

    def get_head(self) -> Ref:
        """Get the current HEAD reference.

        Returns:
          HEAD reference name or commit ID
        """
        head_resp_bytes = self._fetch_url("HEAD")
        head_split = head_resp_bytes.replace(b"\n", b"").split(b" ")
        head_target = head_split[1] if len(head_split) > 1 else head_split[0]
        # Handle the legacy HEAD format that contains a commit id instead of a ref name
        for ref_name, ref_target in self.get_refs().items():
            if ref_target == head_target:
                head_target = ref_name
                break
        return head_target
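
    # For reference, a remote HEAD file is normally a symref such as
    # "ref: refs/heads/main\n"; very old servers may instead expose a bare
    # commit id, which the loop above maps back to a ref name when possible.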

    def get_peeled(self, ref: Ref) -> ObjectID:
        """Get the peeled value of a ref."""
        # For dumb HTTP, we don't have peeled refs readily available;
        # we would need to fetch and parse tag objects.
        sha = self.get_refs().get(ref, None)
        return sha if sha is not None else ZERO_SHA

    def fetch_pack_data(
        self,
        determine_wants: Callable[[dict[Ref, ObjectID], Optional[int]], list[ObjectID]],
        graph_walker: object,
        progress: Optional[Callable[[bytes], None]] = None,
        *,
        get_tagged: Optional[bool] = None,
        depth: Optional[int] = None,
    ) -> Iterator[UnpackedObject]:
        """Fetch pack data from the remote.

        This is the main method for fetching objects from a dumb HTTP remote.
        Since dumb HTTP doesn't support negotiation, we need to download
        all objects reachable from the wanted refs.

        Args:
          determine_wants: Function that returns list of wanted SHAs
          graph_walker: GraphWalker instance (not used for dumb HTTP)
          progress: Optional progress callback
          get_tagged: Whether to get tagged objects
          depth: Depth for shallow clones (not supported for dumb HTTP)
        Returns:
          Iterator of UnpackedObject instances
        """
        refs = self.get_refs()
        wants = determine_wants(refs, depth)
        if not wants:
            return

        # For dumb HTTP, we traverse the object graph starting from wants
        to_fetch = set(wants)
        seen = set()

        while to_fetch:
            sha = to_fetch.pop()
            if sha in seen:
                continue
            seen.add(sha)

            # Fetch the object
            try:
                type_num, content = self.object_store.get_raw(sha)
            except KeyError:
                # Object not found, skip it
                continue

            unpacked = UnpackedObject(type_num, sha=sha, decomp_chunks=[content])
            yield unpacked

            # Parse the object to find references to other objects
            obj = ShaFile.from_raw_string(type_num, content)
            if isinstance(obj, Commit):  # Commit
                to_fetch.add(obj.tree)
                for parent in obj.parents:
                    to_fetch.add(parent)
            elif isinstance(obj, Tag):  # Tag
                to_fetch.add(obj.object[1])
            elif isinstance(obj, Tree):  # Tree
                for _, _, item_sha in obj.items():
                    assert item_sha is not None
                    to_fetch.add(item_sha)

            if progress:
                progress(f"Fetching objects: {len(seen)} done\n".encode())
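
# Illustrative sketch (not part of this module): fetching everything reachable
# from a remote's refs might look roughly like the following, assuming an
# ``http_request`` callable as described above. ``graph_walker`` is unused by
# the dumb protocol, so any object can be passed.
#
#     repo = DumbRemoteHTTPRepo("https://example.com/repo.git/", http_request)
#     wants = lambda refs, depth: list(set(refs.values()))
#     objects = list(repo.fetch_pack_data(wants, graph_walker=None))
#     print(f"fetched {len(objects)} objects")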