  1. # dumb.py -- Support for dumb HTTP(S) git repositories
  2. # Copyright (C) 2025 Dulwich contributors
  3. #
  4. # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
  5. # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
  6. # General Public License as published by the Free Software Foundation; version 2.0
  7. # or (at your option) any later version. You can redistribute it and/or
  8. # modify it under the terms of either of these two licenses.
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. # You should have received a copy of the licenses; if not, see
  17. # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  18. # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  19. # License, Version 2.0.
  20. #
  21. """Support for dumb HTTP(S) git repositories."""
# Explicit public API of this module.
__all__ = [
    "DumbHTTPObjectStore",
    "DumbRemoteHTTPRepo",
]
  26. import os
  27. import tempfile
  28. import zlib
  29. from collections.abc import Callable, Iterator, Mapping, Sequence
  30. from io import BytesIO
  31. from typing import TYPE_CHECKING, Any
  32. from urllib.parse import urljoin
  33. if TYPE_CHECKING:
  34. from .object_format import ObjectFormat
  35. from .errors import NotGitRepository, ObjectFormatException
  36. from .object_store import BaseObjectStore
  37. from .objects import (
  38. ZERO_SHA,
  39. Blob,
  40. Commit,
  41. ObjectID,
  42. RawObjectID,
  43. ShaFile,
  44. Tag,
  45. Tree,
  46. hex_to_sha,
  47. sha_to_hex,
  48. )
  49. from .pack import Pack, PackData, PackIndex, UnpackedObject, load_pack_index_file
  50. from .protocol import split_peeled_refs
  51. from .refs import Ref, read_info_refs
class DumbHTTPObjectStore(BaseObjectStore):
    """Object store implementation that fetches objects over dumb HTTP.

    A "dumb" HTTP server exposes the repository as plain static files
    (loose objects, pack files, and ``objects/info/packs``) with no
    server-side negotiation, so objects are fetched one at a time or by
    downloading whole pack files.
    """

    def __init__(
        self,
        base_url: str,
        http_request_func: Callable[
            [str, dict[str, str]], tuple[Any, Callable[..., bytes]]
        ],
        object_format: "ObjectFormat | None" = None,
    ) -> None:
        """Initialize a DumbHTTPObjectStore.

        Args:
          base_url: Base URL of the remote repository
            (e.g. "https://example.com/repo.git/")
          http_request_func: Function to make HTTP requests, should accept
            (url, headers) and return (response, read_func).
          object_format: Object format to use (defaults to
            DEFAULT_OBJECT_FORMAT)
        """
        super().__init__(object_format=object_format)
        # Normalize to a trailing slash so urljoin() treats it as a directory.
        self.base_url = base_url.rstrip("/") + "/"
        self._http_request = http_request_func
        # List of (pack_name, PackIndex | None); the index entry stays None
        # until that pack's .idx file has been downloaded.  The list itself
        # is None until objects/info/packs has been fetched.
        self._packs: list[tuple[str, PackIndex | None]] | None = None
        # Cache of fetched objects: sha -> (type_num, raw content).
        self._cached_objects: dict[bytes, tuple[int, bytes]] = {}
        # Lazily-created directory holding downloaded .pack files.
        self._temp_pack_dir: str | None = None

    def _ensure_temp_pack_dir(self) -> None:
        """Ensure we have a temporary directory for storing pack files."""
        if self._temp_pack_dir is None:
            self._temp_pack_dir = tempfile.mkdtemp(prefix="dulwich-dumb-")

    def _fetch_url(self, path: str) -> bytes:
        """Fetch content from a URL path relative to base_url.

        Args:
          path: Path relative to base URL

        Returns:
          Content as bytes

        Raises:
          IOError: If the URL cannot be fetched (404 or any non-200 status)
        """
        url = urljoin(self.base_url, path)
        resp, read = self._http_request(url, {})
        try:
            if resp.status == 404:
                raise OSError(f"Not found: {url}")
            elif resp.status != 200:
                raise OSError(f"HTTP error {resp.status}: {url}")
            # Read all content in fixed-size chunks.
            chunks = []
            while True:
                chunk = read(4096)
                if not chunk:
                    break
                chunks.append(chunk)
            return b"".join(chunks)
        finally:
            resp.close()

    def _fetch_loose_object(self, sha: bytes) -> tuple[int, bytes]:
        """Fetch a loose object by SHA.

        Args:
          sha: SHA1 of the object (hex string as bytes)

        Returns:
          Tuple of (type_num, content)

        Raises:
          KeyError: If object not found
          ObjectFormatException: If the object header or size is invalid
        """
        hex_sha = sha.decode("ascii")
        # Loose objects live at objects/ab/cdef... (first two hex chars
        # are the fan-out directory).
        path = f"objects/{hex_sha[:2]}/{hex_sha[2:]}"
        try:
            compressed = self._fetch_url(path)
        except OSError:
            # Map transport-level "not found" to the store's KeyError contract.
            raise KeyError(sha)
        # Decompress and parse the object
        decompressed = zlib.decompress(compressed)
        # Parse header: b"<type> <size>\x00<content>"
        header_end = decompressed.find(b"\x00")
        if header_end == -1:
            raise ObjectFormatException("Invalid object header")
        header = decompressed[:header_end]
        content = decompressed[header_end + 1 :]
        parts = header.split(b" ", 1)
        if len(parts) != 2:
            raise ObjectFormatException("Invalid object header")
        obj_type = parts[0]
        obj_size = int(parts[1])
        if len(content) != obj_size:
            raise ObjectFormatException("Object size mismatch")
        # Convert type name to type number
        type_map = {
            b"blob": Blob.type_num,
            b"tree": Tree.type_num,
            b"commit": Commit.type_num,
            b"tag": Tag.type_num,
        }
        if obj_type not in type_map:
            raise ObjectFormatException(f"Unknown object type: {obj_type!r}")
        return type_map[obj_type], content

    def _load_packs(self) -> None:
        """Load the list of available packs from the remote.

        Idempotent: does nothing if the pack list has already been fetched.
        """
        if self._packs is not None:
            return
        self._packs = []
        try:
            packs_data = self._fetch_url("objects/info/packs")
        except OSError:
            # No packs file, repository might only have loose objects
            return
        # Each line of objects/info/packs looks like "P pack-<sha>.pack".
        for line in packs_data.strip().split(b"\n"):
            if line.startswith(b"P "):
                pack_name = line[2:].decode("utf-8")
                # Extract just the pack name without path
                if "/" in pack_name:
                    pack_name = pack_name.split("/")[-1]
                if pack_name.endswith(".pack"):
                    pack_name = pack_name[:-5]  # Remove .pack extension
                self._packs.append((pack_name, None))

    def _get_pack_index(self, pack_name: str) -> PackIndex:
        """Get or fetch a pack index.

        Args:
          pack_name: Name of the pack (without .idx extension)

        Returns:
          PackIndex object

        Raises:
          KeyError: If the pack name is not in the remote's pack list
        """
        # Find the pack in our list
        for i, (name, idx) in enumerate(self._packs or []):
            if name == pack_name:
                if idx is None:
                    # Fetch and cache the index
                    idx_data = self._fetch_url(f"objects/pack/{pack_name}.idx")
                    idx = load_pack_index_file(
                        "<http>", BytesIO(idx_data), self.object_format
                    )
                    if self._packs is not None:
                        self._packs[i] = (name, idx)
                return idx
        raise KeyError(f"Pack not found: {pack_name}")

    def _fetch_from_pack(self, sha: RawObjectID | ObjectID) -> tuple[int, bytes]:
        """Try to fetch an object from pack files.

        Args:
          sha: SHA1 of the object (hex string as bytes)

        Returns:
          Tuple of (type_num, content)

        Raises:
          KeyError: If object not found in any pack
        """
        self._load_packs()
        # Convert hex to binary for pack operations
        # NOTE(review): the length-20 test assumes SHA-1-sized raw ids;
        # verify against how object_format handles other hash sizes.
        if len(sha) == 20:
            binsha = RawObjectID(sha)  # Already binary
        else:
            binsha = hex_to_sha(ObjectID(sha))  # Convert hex to binary
        for pack_name, pack_idx in self._packs or []:
            if pack_idx is None:
                pack_idx = self._get_pack_index(pack_name)
            try:
                # Check if object is in this pack
                pack_idx.object_offset(binsha)
            except KeyError:
                continue
            # We found the object, now we need to fetch the pack data
            # For efficiency, we could fetch just the needed portion, but for
            # simplicity we'll fetch the whole pack and cache it
            self._ensure_temp_pack_dir()
            if self._temp_pack_dir is None:
                raise RuntimeError("Temp pack directory not initialized")
            pack_path = os.path.join(self._temp_pack_dir, f"{pack_name}.pack")
            if not os.path.exists(pack_path):
                # Download the pack file
                data = self._fetch_url(f"objects/pack/{pack_name}.pack")
                with open(pack_path, "wb") as f:
                    f.write(data)
            # Open the pack and get the object
            pack_data = PackData(pack_path, object_format=self.object_format)
            pack = Pack.from_objects(pack_data, pack_idx)
            try:
                return pack.get_raw(binsha)
            finally:
                pack.close()
        raise KeyError(sha)

    def get_raw(self, sha: RawObjectID | ObjectID) -> tuple[int, bytes]:
        """Obtain the raw text for an object.

        Args:
          sha: SHA1 of the object

        Returns:
          Tuple with numeric type and object contents

        Raises:
          KeyError: If the object is in neither a pack nor loose storage
        """
        # Check cache first
        if sha in self._cached_objects:
            return self._cached_objects[sha]
        # Try packs first
        try:
            result = self._fetch_from_pack(sha)
            self._cached_objects[sha] = result
            return result
        except KeyError:
            pass
        # Try loose object
        result = self._fetch_loose_object(sha)
        self._cached_objects[sha] = result
        return result

    def contains_loose(self, sha: RawObjectID | ObjectID) -> bool:
        """Check if a particular object is present by SHA1 and is loose."""
        try:
            self._fetch_loose_object(sha)
            return True
        except KeyError:
            return False

    def __contains__(self, sha: RawObjectID | ObjectID) -> bool:
        """Check if a particular object is present by SHA1.

        Note: may trigger network requests (pack index and loose-object
        fetches) on a cache miss.
        """
        if sha in self._cached_objects:
            return True
        # Try packs
        try:
            self._fetch_from_pack(sha)
            return True
        except KeyError:
            pass
        # Try loose object
        try:
            self._fetch_loose_object(sha)
            return True
        except KeyError:
            return False

    def __iter__(self) -> Iterator[ObjectID]:
        """Iterate over all SHAs in the store.

        Note: This is inefficient for dumb HTTP as it requires
        downloading all pack indices.  Loose objects are not listed at
        all, since dumb HTTP provides no way to enumerate them.
        """
        seen = set()
        # We can't efficiently list loose objects over dumb HTTP
        # So we only iterate pack objects
        self._load_packs()
        for pack_name, idx in self._packs or []:
            if idx is None:
                idx = self._get_pack_index(pack_name)
            for sha in idx:
                if sha not in seen:
                    seen.add(sha)
                    yield sha_to_hex(RawObjectID(sha))

    @property
    def packs(self) -> list[Any]:
        """Iterable of pack objects.

        Note: Returns empty list as we don't have actual Pack objects.
        """
        return []

    def add_object(self, obj: ShaFile) -> None:
        """Add a single object to this object store.

        Always raises: dumb HTTP is read-only.
        """
        raise NotImplementedError("Cannot add objects to dumb HTTP repository")

    def add_objects(
        self,
        objects: Sequence[tuple[ShaFile, str | None]],
        progress: Callable[[str], None] | None = None,
    ) -> "Pack | None":
        """Add a set of objects to this object store.

        Always raises: dumb HTTP is read-only.
        """
        raise NotImplementedError("Cannot add objects to dumb HTTP repository")

    def __del__(self) -> None:
        """Clean up temporary directory on deletion."""
        if self._temp_pack_dir and os.path.exists(self._temp_pack_dir):
            # Imported locally: __del__ may run during interpreter shutdown.
            import shutil

            shutil.rmtree(self._temp_pack_dir, ignore_errors=True)
class DumbRemoteHTTPRepo:
    """Repository implementation for dumb HTTP remotes.

    Wraps a :class:`DumbHTTPObjectStore` and adds ref discovery (via
    ``info/refs`` and ``HEAD``) plus a simple object-graph walk used to
    emulate pack fetching.
    """

    def __init__(
        self,
        base_url: str,
        http_request_func: Callable[
            [str, dict[str, str]], tuple[Any, Callable[..., bytes]]
        ],
    ) -> None:
        """Initialize a DumbRemoteHTTPRepo.

        Args:
          base_url: Base URL of the remote repository
          http_request_func: Function to make HTTP requests.
        """
        # Normalize to a trailing slash so urljoin() treats it as a directory.
        self.base_url = base_url.rstrip("/") + "/"
        self._http_request = http_request_func
        # Lazily populated by get_refs().
        self._refs: dict[Ref, ObjectID] | None = None
        self._peeled: dict[Ref, ObjectID] | None = None
        self.object_store = DumbHTTPObjectStore(base_url, http_request_func)

    def _fetch_url(self, path: str) -> bytes:
        """Fetch content from a URL path relative to base_url.

        Raises OSError on 404 or any other non-200 status.
        """
        url = urljoin(self.base_url, path)
        resp, read = self._http_request(url, {})
        try:
            if resp.status == 404:
                raise OSError(f"Not found: {url}")
            elif resp.status != 200:
                raise OSError(f"HTTP error {resp.status}: {url}")
            chunks = []
            while True:
                chunk = read(4096)
                if not chunk:
                    break
                chunks.append(chunk)
            return b"".join(chunks)
        finally:
            resp.close()

    def get_refs(self) -> dict[Ref, ObjectID]:
        """Get dictionary with all refs.

        Fetches and caches ``info/refs`` on first call; later calls
        return a copy of the cached mapping.

        Raises:
          NotGitRepository: If ``info/refs`` cannot be fetched.
        """
        if self._refs is None:
            # Fetch info/refs
            try:
                refs_data = self._fetch_url("info/refs")
            except OSError:
                raise NotGitRepository(f"Cannot read refs from {self.base_url}")
            refs_hex = read_info_refs(BytesIO(refs_data))
            # Keep SHAs as hex
            refs_raw, peeled_raw = split_peeled_refs(refs_hex)
            # Convert to typed dicts
            self._refs = {Ref(k): ObjectID(v) for k, v in refs_raw.items()}
            self._peeled = peeled_raw
        return dict(self._refs)

    def get_head(self) -> Ref:
        """Get the current HEAD reference.

        Returns:
          HEAD reference name or commit ID
        """
        head_resp_bytes = self._fetch_url("HEAD")
        # "ref: refs/heads/main\n" -> take the target after the space;
        # a bare commit id line has no space and is used as-is.
        head_split = head_resp_bytes.replace(b"\n", b"").split(b" ")
        head_target_bytes = head_split[1] if len(head_split) > 1 else head_split[0]
        # handle HEAD legacy format containing a commit id instead of a ref name
        for ref_name, ret_target in self.get_refs().items():
            if ret_target == head_target_bytes:
                return ref_name
        return Ref(head_target_bytes)

    def get_peeled(self, ref: Ref) -> ObjectID:
        """Get the peeled value of a ref.

        Note: returns the ref's own SHA (or ZERO_SHA when absent), not a
        truly peeled tag target — peeling would require fetching and
        parsing tag objects, which dumb HTTP makes expensive.
        """
        # For dumb HTTP, we don't have peeled refs readily available
        # We would need to fetch and parse tag objects
        sha: ObjectID | None = self.get_refs().get(ref, None)
        return sha if sha is not None else ZERO_SHA

    def fetch_pack_data(
        self,
        determine_wants: Callable[[Mapping[Ref, ObjectID], int | None], list[ObjectID]],
        graph_walker: object,
        progress: Callable[[bytes], None] | None = None,
        *,
        get_tagged: bool | None = None,
        depth: int | None = None,
    ) -> Iterator[UnpackedObject]:
        """Fetch pack data from the remote.

        This is the main method for fetching objects from a dumb HTTP remote.
        Since dumb HTTP doesn't support negotiation, we need to download
        all objects reachable from the wanted refs.

        Args:
          determine_wants: Function that returns list of wanted SHAs
          graph_walker: GraphWalker instance (not used for dumb HTTP)
          progress: Optional progress callback
          get_tagged: Whether to get tagged objects
          depth: Depth for shallow clones (not supported for dumb HTTP)

        Returns:
          Iterator of UnpackedObject instances
        """
        refs = self.get_refs()
        wants = determine_wants(refs, depth)
        if not wants:
            return
        # For dumb HTTP, we traverse the object graph starting from wants
        to_fetch = set(wants)
        seen = set()
        while to_fetch:
            sha = to_fetch.pop()
            if sha in seen:
                continue
            seen.add(sha)
            # Fetch the object
            try:
                type_num, content = self.object_store.get_raw(sha)
            except KeyError:
                # Object not found, skip it
                continue
            unpacked = UnpackedObject(type_num, sha=sha, decomp_chunks=[content])
            yield unpacked
            # Parse the object to find references to other objects
            obj = ShaFile.from_raw_string(type_num, content)
            if isinstance(obj, Commit):  # Commit
                to_fetch.add(obj.tree)
                for parent in obj.parents:
                    to_fetch.add(parent)
            elif isinstance(obj, Tag):  # Tag
                to_fetch.add(obj.object[1])
            elif isinstance(obj, Tree):  # Tree
                for _, _, item_sha in obj.items():
                    assert item_sha is not None
                    to_fetch.add(item_sha)
            # Report progress once per processed object.
            if progress:
                progress(f"Fetching objects: {len(seen)} done\n".encode())