dumb.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464
  1. # dumb.py -- Support for dumb HTTP(S) git repositories
  2. # Copyright (C) 2025 Dulwich contributors
  3. #
  4. # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
  5. # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
  6. # General Public License as published by the Free Software Foundation; version 2.0
  7. # or (at your option) any later version. You can redistribute it and/or
  8. # modify it under the terms of either of these two licenses.
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. # You should have received a copy of the licenses; if not, see
  17. # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  18. # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  19. # License, Version 2.0.
  20. #
  21. """Support for dumb HTTP(S) git repositories."""
  22. import os
  23. import tempfile
  24. import zlib
  25. from collections.abc import Iterator
  26. from io import BytesIO
  27. from typing import Optional
  28. from urllib.parse import urljoin
  29. from .errors import NotGitRepository, ObjectFormatException
  30. from .object_store import BaseObjectStore
  31. from .objects import (
  32. ZERO_SHA,
  33. Blob,
  34. Commit,
  35. ObjectID,
  36. ShaFile,
  37. Tag,
  38. Tree,
  39. hex_to_sha,
  40. sha_to_hex,
  41. )
  42. from .pack import Pack, PackIndex, UnpackedObject, load_pack_index_file
  43. from .refs import Ref, read_info_refs, split_peeled_refs
  44. from .repo import BaseRepo
  45. class DumbHTTPObjectStore(BaseObjectStore):
  46. """Object store implementation that fetches objects over dumb HTTP."""
  47. def __init__(self, base_url: str, http_request_func):
  48. """Initialize a DumbHTTPObjectStore.
  49. Args:
  50. base_url: Base URL of the remote repository (e.g. "https://example.com/repo.git/")
  51. http_request_func: Function to make HTTP requests, should accept (url, headers)
  52. and return (response, read_func).
  53. """
  54. self.base_url = base_url.rstrip("/") + "/"
  55. self._http_request = http_request_func
  56. self._packs: Optional[list[tuple[str, Optional[PackIndex]]]] = None
  57. self._cached_objects: dict[bytes, tuple[int, bytes]] = {}
  58. self._temp_pack_dir = None
  59. def _ensure_temp_pack_dir(self):
  60. """Ensure we have a temporary directory for storing pack files."""
  61. if self._temp_pack_dir is None:
  62. self._temp_pack_dir = tempfile.mkdtemp(prefix="dulwich-dumb-")
  63. def _fetch_url(self, path: str) -> bytes:
  64. """Fetch content from a URL path relative to base_url.
  65. Args:
  66. path: Path relative to base URL
  67. Returns:
  68. Content as bytes
  69. Raises:
  70. IOError: If the URL cannot be fetched
  71. """
  72. url = urljoin(self.base_url, path)
  73. resp, read = self._http_request(url, {})
  74. try:
  75. if resp.status == 404:
  76. raise OSError(f"Not found: {url}")
  77. elif resp.status != 200:
  78. raise OSError(f"HTTP error {resp.status}: {url}")
  79. # Read all content
  80. chunks = []
  81. while True:
  82. chunk = read(4096)
  83. if not chunk:
  84. break
  85. chunks.append(chunk)
  86. return b"".join(chunks)
  87. finally:
  88. resp.close()
  89. def _fetch_loose_object(self, sha: bytes) -> tuple[int, bytes]:
  90. """Fetch a loose object by SHA.
  91. Args:
  92. sha: SHA1 of the object (hex string as bytes)
  93. Returns:
  94. Tuple of (type_num, content)
  95. Raises:
  96. KeyError: If object not found
  97. """
  98. hex_sha = sha.decode("ascii")
  99. path = f"objects/{hex_sha[:2]}/{hex_sha[2:]}"
  100. try:
  101. compressed = self._fetch_url(path)
  102. except OSError:
  103. raise KeyError(sha)
  104. # Decompress and parse the object
  105. decompressed = zlib.decompress(compressed)
  106. # Parse header
  107. header_end = decompressed.find(b"\x00")
  108. if header_end == -1:
  109. raise ObjectFormatException("Invalid object header")
  110. header = decompressed[:header_end]
  111. content = decompressed[header_end + 1 :]
  112. parts = header.split(b" ", 1)
  113. if len(parts) != 2:
  114. raise ObjectFormatException("Invalid object header")
  115. obj_type = parts[0]
  116. obj_size = int(parts[1])
  117. if len(content) != obj_size:
  118. raise ObjectFormatException("Object size mismatch")
  119. # Convert type name to type number
  120. type_map = {
  121. b"blob": Blob.type_num,
  122. b"tree": Tree.type_num,
  123. b"commit": Commit.type_num,
  124. b"tag": Tag.type_num,
  125. }
  126. if obj_type not in type_map:
  127. raise ObjectFormatException(f"Unknown object type: {obj_type!r}")
  128. return type_map[obj_type], content
  129. def _load_packs(self):
  130. """Load the list of available packs from the remote."""
  131. if self._packs is not None:
  132. return
  133. self._packs = []
  134. try:
  135. packs_data = self._fetch_url("objects/info/packs")
  136. except OSError:
  137. # No packs file, repository might only have loose objects
  138. return
  139. for line in packs_data.strip().split(b"\n"):
  140. if line.startswith(b"P "):
  141. pack_name = line[2:].decode("utf-8")
  142. # Extract just the pack name without path
  143. if "/" in pack_name:
  144. pack_name = pack_name.split("/")[-1]
  145. if pack_name.endswith(".pack"):
  146. pack_name = pack_name[:-5] # Remove .pack extension
  147. self._packs.append((pack_name, None))
  148. def _get_pack_index(self, pack_name: str) -> PackIndex:
  149. """Get or fetch a pack index.
  150. Args:
  151. pack_name: Name of the pack (without .idx extension)
  152. Returns:
  153. PackIndex object
  154. """
  155. # Find the pack in our list
  156. for i, (name, idx) in enumerate(self._packs or []):
  157. if name == pack_name:
  158. if idx is None:
  159. # Fetch and cache the index
  160. idx_data = self._fetch_url(f"objects/pack/{pack_name}.idx")
  161. idx = load_pack_index_file("<http>", BytesIO(idx_data))
  162. if self._packs is not None:
  163. self._packs[i] = (name, idx)
  164. return idx
  165. raise KeyError(f"Pack not found: {pack_name}")
  166. def _fetch_from_pack(self, sha: bytes) -> tuple[int, bytes]:
  167. """Try to fetch an object from pack files.
  168. Args:
  169. sha: SHA1 of the object (hex string as bytes)
  170. Returns:
  171. Tuple of (type_num, content)
  172. Raises:
  173. KeyError: If object not found in any pack
  174. """
  175. self._load_packs()
  176. # Convert hex to binary for pack operations
  177. binsha = hex_to_sha(sha)
  178. for pack_name, idx in self._packs or []:
  179. if idx is None:
  180. idx = self._get_pack_index(pack_name)
  181. try:
  182. # Check if object is in this pack
  183. idx.object_offset(binsha)
  184. except KeyError:
  185. continue
  186. # We found the object, now we need to fetch the pack data
  187. # For efficiency, we could fetch just the needed portion, but for
  188. # simplicity we'll fetch the whole pack and cache it
  189. self._ensure_temp_pack_dir()
  190. if self._temp_pack_dir is None:
  191. raise RuntimeError("Temp pack directory not initialized")
  192. pack_path = os.path.join(self._temp_pack_dir, f"{pack_name}.pack")
  193. if not os.path.exists(pack_path):
  194. # Download the pack file
  195. pack_data = self._fetch_url(f"objects/pack/{pack_name}.pack")
  196. with open(pack_path, "wb") as f:
  197. f.write(pack_data)
  198. # Open the pack and get the object
  199. pack = Pack(pack_path[:-5]) # Remove .pack extension
  200. try:
  201. return pack.get_raw(binsha)
  202. finally:
  203. pack.close()
  204. raise KeyError(sha)
  205. def get_raw(self, sha: bytes) -> tuple[int, bytes]:
  206. """Obtain the raw text for an object.
  207. Args:
  208. sha: SHA1 of the object
  209. Returns:
  210. Tuple with numeric type and object contents
  211. """
  212. # Check cache first
  213. if sha in self._cached_objects:
  214. return self._cached_objects[sha]
  215. # Try loose object first
  216. try:
  217. result = self._fetch_loose_object(sha)
  218. self._cached_objects[sha] = result
  219. return result
  220. except KeyError:
  221. pass
  222. # Try packs
  223. result = self._fetch_from_pack(sha)
  224. self._cached_objects[sha] = result
  225. return result
  226. def contains_loose(self, sha: bytes) -> bool:
  227. """Check if a particular object is present by SHA1 and is loose."""
  228. try:
  229. self._fetch_loose_object(sha)
  230. return True
  231. except KeyError:
  232. return False
  233. def __contains__(self, sha: bytes) -> bool:
  234. """Check if a particular object is present by SHA1."""
  235. if sha in self._cached_objects:
  236. return True
  237. # Try loose object
  238. try:
  239. self._fetch_loose_object(sha)
  240. return True
  241. except KeyError:
  242. pass
  243. # Try packs
  244. try:
  245. self._fetch_from_pack(sha)
  246. return True
  247. except KeyError:
  248. return False
  249. def __iter__(self) -> Iterator[bytes]:
  250. """Iterate over all SHAs in the store.
  251. Note: This is inefficient for dumb HTTP as it requires
  252. downloading all pack indices.
  253. """
  254. seen = set()
  255. # We can't efficiently list loose objects over dumb HTTP
  256. # So we only iterate pack objects
  257. self._load_packs()
  258. for pack_name, idx in self._packs or []:
  259. if idx is None:
  260. idx = self._get_pack_index(pack_name)
  261. for sha in idx:
  262. if sha not in seen:
  263. seen.add(sha)
  264. yield sha_to_hex(sha)
  265. @property
  266. def packs(self):
  267. """Iterable of pack objects.
  268. Note: Returns empty list as we don't have actual Pack objects.
  269. """
  270. return []
  271. def add_object(self, obj) -> None:
  272. """Add a single object to this object store."""
  273. raise NotImplementedError("Cannot add objects to dumb HTTP repository")
  274. def add_objects(self, objects, progress=None) -> None:
  275. """Add a set of objects to this object store."""
  276. raise NotImplementedError("Cannot add objects to dumb HTTP repository")
  277. def __del__(self):
  278. """Clean up temporary directory on deletion."""
  279. if self._temp_pack_dir and os.path.exists(self._temp_pack_dir):
  280. import shutil
  281. shutil.rmtree(self._temp_pack_dir, ignore_errors=True)
  282. class DumbRemoteHTTPRepo(BaseRepo):
  283. """Repository implementation for dumb HTTP remotes."""
  284. def __init__(self, base_url: str, http_request_func):
  285. """Initialize a DumbRemoteHTTPRepo.
  286. Args:
  287. base_url: Base URL of the remote repository
  288. http_request_func: Function to make HTTP requests.
  289. """
  290. self.base_url = base_url.rstrip("/") + "/"
  291. self._http_request = http_request_func
  292. self._refs: Optional[dict[Ref, ObjectID]] = None
  293. self._peeled: Optional[dict[Ref, ObjectID]] = None
  294. self._object_store = DumbHTTPObjectStore(base_url, http_request_func)
  295. @property
  296. def object_store(self):
  297. """ObjectStore for this repository."""
  298. return self._object_store
  299. def _fetch_url(self, path: str) -> bytes:
  300. """Fetch content from a URL path relative to base_url."""
  301. url = urljoin(self.base_url, path)
  302. resp, read = self._http_request(url, {})
  303. try:
  304. if resp.status == 404:
  305. raise OSError(f"Not found: {url}")
  306. elif resp.status != 200:
  307. raise OSError(f"HTTP error {resp.status}: {url}")
  308. chunks = []
  309. while True:
  310. chunk = read(4096)
  311. if not chunk:
  312. break
  313. chunks.append(chunk)
  314. return b"".join(chunks)
  315. finally:
  316. resp.close()
  317. def get_refs(self) -> dict[Ref, ObjectID]:
  318. """Get dictionary with all refs."""
  319. if self._refs is None:
  320. # Fetch info/refs
  321. try:
  322. refs_data = self._fetch_url("info/refs")
  323. except OSError:
  324. raise NotGitRepository(f"Cannot read refs from {self.base_url}")
  325. refs_hex = read_info_refs(BytesIO(refs_data))
  326. # Keep SHAs as hex
  327. self._refs, self._peeled = split_peeled_refs(refs_hex)
  328. return dict(self._refs)
  329. def get_peeled(self, ref: Ref) -> ObjectID:
  330. """Get the peeled value of a ref."""
  331. # For dumb HTTP, we don't have peeled refs readily available
  332. # We would need to fetch and parse tag objects
  333. sha = self.get_refs().get(ref, None)
  334. return sha if sha is not None else ZERO_SHA
  335. def fetch_pack_data(self, graph_walker, determine_wants, progress=None, depth=None):
  336. """Fetch pack data from the remote.
  337. This is the main method for fetching objects from a dumb HTTP remote.
  338. Since dumb HTTP doesn't support negotiation, we need to download
  339. all objects reachable from the wanted refs.
  340. Args:
  341. graph_walker: GraphWalker instance (not used for dumb HTTP)
  342. determine_wants: Function that returns list of wanted SHAs
  343. progress: Optional progress callback
  344. depth: Depth for shallow clones (not supported for dumb HTTP)
  345. Returns:
  346. Iterator of UnpackedObject instances
  347. """
  348. refs = self.get_refs()
  349. wants = determine_wants(refs)
  350. if not wants:
  351. return
  352. # For dumb HTTP, we traverse the object graph starting from wants
  353. to_fetch = set(wants)
  354. seen = set()
  355. while to_fetch:
  356. sha = to_fetch.pop()
  357. if sha in seen:
  358. continue
  359. seen.add(sha)
  360. # Fetch the object
  361. try:
  362. type_num, content = self._object_store.get_raw(sha)
  363. except KeyError:
  364. # Object not found, skip it
  365. continue
  366. unpacked = UnpackedObject(type_num, sha=sha, decomp_chunks=[content])
  367. yield unpacked
  368. # Parse the object to find references to other objects
  369. obj = ShaFile.from_raw_string(type_num, content)
  370. if isinstance(obj, Commit): # Commit
  371. to_fetch.add(obj.tree)
  372. for parent in obj.parents:
  373. to_fetch.add(parent)
  374. elif isinstance(obj, Tag): # Tag
  375. to_fetch.add(obj.object[1])
  376. elif isinstance(obj, Tree): # Tree
  377. for _, _, item_sha in obj.items():
  378. to_fetch.add(item_sha)
  379. if progress:
  380. progress(f"Fetching objects: {len(seen)} done\n".encode())