web.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612
  1. # web.py -- WSGI smart-http server
  2. # Copyright (C) 2010 Google, Inc.
  3. # Copyright (C) 2012 Jelmer Vernooij <jelmer@jelmer.uk>
  4. #
  5. # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
  6. # General Public License as published by the Free Software Foundation; version 2.0
  7. # or (at your option) any later version. You can redistribute it and/or
  8. # modify it under the terms of either of these two licenses.
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. # You should have received a copy of the licenses; if not, see
  17. # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  18. # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  19. # License, Version 2.0.
  20. #
  21. """HTTP server for dulwich that implements the git smart HTTP protocol."""
  22. import os
  23. import re
  24. import sys
  25. import time
  26. from collections.abc import Iterator
  27. from io import BytesIO
  28. from typing import Callable, ClassVar, Optional
  29. from urllib.parse import parse_qs
  30. from wsgiref.simple_server import (
  31. ServerHandler,
  32. WSGIRequestHandler,
  33. WSGIServer,
  34. make_server,
  35. )
  36. from dulwich import log_utils
  37. from .protocol import ReceivableProtocol
  38. from .repo import BaseRepo, NotGitRepository, Repo
  39. from .server import (
  40. DEFAULT_HANDLERS,
  41. Backend,
  42. DictBackend,
  43. generate_info_refs,
  44. generate_objects_info_packs,
  45. )
  # Module-wide logger, named after this module.
  logger = log_utils.getLogger(__name__)

  # HTTP status lines handed to the WSGI ``start_response`` callable.
  HTTP_OK = "200 OK"
  HTTP_NOT_FOUND = "404 Not Found"
  HTTP_FORBIDDEN = "403 Forbidden"
  HTTP_ERROR = "500 Internal Server Error"

  # Response headers instructing clients never to cache the response.
  NO_CACHE_HEADERS = [
      ("Expires", "Fri, 01 Jan 1980 00:00:00 GMT"),
      ("Pragma", "no-cache"),
      ("Cache-Control", "no-cache, max-age=0, must-revalidate"),
  ]
  def cache_forever_headers(now=None):
      """Build response headers that let clients cache for one year.

      Args:
        now: Reference time in seconds since the epoch; the current time is
          used when omitted.
      Returns: A list of (name, value) tuples holding the Date, Expires and
        Cache-Control headers.
      """
      one_year = 31536000  # 365 days expressed in seconds
      issued = time.time() if now is None else now
      date_header = ("Date", date_time_string(issued))
      expires_header = ("Expires", date_time_string(issued + one_year))
      return [
          date_header,
          expires_header,
          ("Cache-Control", "public, max-age=31536000"),
      ]
  def date_time_string(timestamp: Optional[float] = None) -> str:
      """Format a timestamp as an HTTP date string.

      Args:
        timestamp: Seconds since the epoch; the current time is used when
          omitted.
      Returns: The time in UTC formatted like
        ``Thu, 01 Jan 1970 00:00:00 GMT``.
      """
      # From BaseHTTPRequestHandler.date_time_string in BaseHTTPServer.py in the
      # Python 2.6.5 standard library, following modifications:
      #  - Made a global rather than an instance method.
      #  - weekdayname and monthname are renamed and locals rather than class
      #    variables.
      # Copyright (c) 2001-2010 Python Software Foundation; All Rights Reserved
      weekdays = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
      # Index 0 is a placeholder so that month numbers (1-12) index directly.
      months = [
          None,
          "Jan",
          "Feb",
          "Mar",
          "Apr",
          "May",
          "Jun",
          "Jul",
          "Aug",
          "Sep",
          "Oct",
          "Nov",
          "Dec",
      ]
      if timestamp is None:
          timestamp = time.time()
      year, month, day, hh, mm, ss, wd = time.gmtime(timestamp)[:7]
      # The zone designator of an HTTP date is the literal "GMT".
      return "%s, %02d %3s %4d %02d:%02d:%02d GMT" % (
          weekdays[wd],
          day,
          months[month],
          year,
          hh,
          mm,
          ss,
      )
  def url_prefix(mat) -> str:
      """Return the part of the URL that precedes a regex match.

      Args:
        mat: A regex match object.
      Returns: The text before the match in the searched string, normalized
        to start with exactly one slash and to carry no trailing slash.
      """
      before_match = mat.string[: mat.start()]
      return "/" + before_match.strip("/")
  def get_repo(backend, mat) -> BaseRepo:
      """Open the repository addressed by a URL regex match.

      Args:
        backend: Object providing ``open_repository``.
        mat: A regex match object whose prefix names the repository.
      Returns: Whatever ``backend.open_repository`` returns for that prefix.
      """
      repo_path = url_prefix(mat)
      return backend.open_repository(repo_path)
  def send_file(req, f, content_type):
      """Stream a file-like object to the client.

      Args:
        req: The HTTPGitRequest object to send output to.
        f: An open file-like object to send, or None; it is closed when
          streaming ends.
        content_type: The MIME type for the file.
      Returns: Iterator over the contents of the file, as chunks. When ``f``
        is None the only item is the body of a 404 response.
      """
      if f is None:
          yield req.not_found("File not found")
          return
      chunk_size = 10240
      try:
          req.respond(HTTP_OK, content_type)
          # An empty read marks the end of the file.
          while chunk := f.read(chunk_size):
              yield chunk
      except OSError:
          yield req.error("Error reading file")
      finally:
          f.close()
  def _url_to_path(url):
      """Turn a URL path into a native path by swapping "/" for os.path.sep."""
      return os.path.sep.join(url.split("/"))
  def get_text_file(req, backend, mat):
      """Serve the file named by the matched URL as uncached ``text/plain``.

      Args:
        req: The HTTPGitRequest object to send output to.
        backend: Backend used to open the repository.
        mat: Regex match whose matched text is the file's URL path.
      Returns: Iterator over the response body chunks.
      """
      req.nocache()
      path = _url_to_path(mat.group())
      logger.info("Sending plain text file %s", path)
      named_file = get_repo(backend, mat).get_named_file(path)
      return send_file(req, named_file, "text/plain")
  def get_loose_object(req, backend, mat):
      """Serve a single loose object, cacheable forever.

      Args:
        req: The HTTPGitRequest object to send output to.
        backend: Backend used to open the repository.
        mat: Regex match whose two groups concatenate to the object id.
      Returns: Iterator yielding the object's legacy serialization, or the
        body of a 404/500 response.
      """
      sha = "".join(mat.group(1, 2)).encode("ascii")
      logger.info("Sending loose object %s", sha)
      store = get_repo(backend, mat).object_store
      if not store.contains_loose(sha):
          yield req.not_found("Object not found")
          return
      try:
          payload = store[sha].as_legacy_object()
      except OSError:
          yield req.error("Error reading object")
      else:
          req.cache_forever()
          req.respond(HTTP_OK, "application/x-git-loose-object")
          yield payload
  def get_pack_file(req, backend, mat):
      """Serve the pack file named by the matched URL, cacheable forever.

      Args:
        req: The HTTPGitRequest object to send output to.
        backend: Backend used to open the repository.
        mat: Regex match whose matched text is the pack's URL path.
      Returns: Iterator over the response body chunks.
      """
      req.cache_forever()
      path = _url_to_path(mat.group())
      logger.info("Sending pack file %s", path)
      pack = get_repo(backend, mat).get_named_file(path)
      return send_file(req, pack, "application/x-git-packed-objects")
  def get_idx_file(req, backend, mat):
      """Serve the pack index file named by the matched URL, cacheable forever.

      Args:
        req: The HTTPGitRequest object to send output to.
        backend: Backend used to open the repository.
        mat: Regex match whose matched text is the index file's URL path.
      Returns: Iterator over the response body chunks.
      """
      req.cache_forever()
      path = _url_to_path(mat.group())
      # Say "index" so log lines for .idx and .pack requests can be told apart.
      logger.info("Sending pack index file %s", path)
      return send_file(
          req,
          get_repo(backend, mat).get_named_file(path),
          "application/x-git-packed-objects-toc",
      )
  def get_info_refs(req, backend, mat):
      """Serve ``info/refs`` as a smart advertisement or a dumb ref listing.

      When the query string names a service and the request is not in dumb
      mode, the matching handler from ``req.handlers`` is run in
      ref-advertisement mode; otherwise the output of ``generate_info_refs``
      is sent as ``text/plain``.

      Args:
        req: The HTTPGitRequest object to send output to.
        backend: Backend used to open the repository.
        mat: Regex match for the request path.
      Returns: Iterator over response body chunks: the body of a 404 when the
        repository is missing, of a 403 when the service is unsupported, or
        the dumb ref listing. The smart path sends its output through the
        writer returned by ``req.respond``.
      """
      # PEP 3333 allows QUERY_STRING to be absent when the URL has no query.
      params = parse_qs(req.environ.get("QUERY_STRING", ""))
      service = params.get("service", [None])[0]
      try:
          repo = get_repo(backend, mat)
      except NotGitRepository as e:
          yield req.not_found(str(e))
          return
      if service and not req.dumb:
          try:
              service_name = service.encode("ascii")
          except UnicodeEncodeError:
              # A non-ASCII name cannot match any handler key; reject it as
              # unsupported instead of failing the request with an exception.
              handler_cls = None
          else:
              handler_cls = req.handlers.get(service_name, None)
          if handler_cls is None:
              yield req.forbidden("Unsupported service")
              return
          req.nocache()
          write = req.respond(HTTP_OK, f"application/x-{service}-advertisement")
          # No request body is read here, so the protocol reads from an
          # empty buffer.
          proto = ReceivableProtocol(BytesIO().read, write)
          handler = handler_cls(
              backend,
              [url_prefix(mat)],
              proto,
              stateless_rpc=True,
              advertise_refs=True,
          )
          handler.proto.write_pkt_line(b"# service=" + service_name + b"\n")
          handler.proto.write_pkt_line(None)
          handler.handle()
      else:
          # non-smart fallback
          # TODO: select_getanyfile() (see http-backend.c)
          req.nocache()
          req.respond(HTTP_OK, "text/plain")
          logger.info("Emulating dumb info/refs")
          yield from generate_info_refs(repo)
  def get_info_packs(req, backend, mat):
      """Serve an emulated ``objects/info/packs`` listing as uncached text.

      Args:
        req: The HTTPGitRequest object to send output to.
        backend: Backend used to open the repository.
        mat: Regex match for the request path.
      Returns: The result of ``generate_objects_info_packs`` for the
        repository.
      """
      req.nocache()
      req.respond(HTTP_OK, "text/plain")
      logger.info("Emulating dumb info/packs")
      repo = get_repo(backend, mat)
      return generate_objects_info_packs(repo)
  def _chunk_iter(f):
      """Yield the payload of each chunk read from a chunked stream.

      Args:
        f: File-like object providing ``readline`` and ``read``.
      Returns: Iterator over chunk payloads; it ends at the first chunk whose
        size is zero.
      """
      while True:
          size_line = f.readline()
          # The size line is a hexadecimal number.
          size = int(size_line.rstrip(), 16)
          # Two extra bytes follow each payload and are dropped below.
          raw = f.read(size + 2)
          if size == 0:
              return
          yield raw[:-2]
  class ChunkReader:
      """File-like reader that exposes a chunked stream through ``read(n)``."""

      def __init__(self, f) -> None:
          # Source of decoded chunk payloads.
          self._iter = _chunk_iter(f)
          # Payload bytes fetched from the iterator but not yet returned.
          self._buffer: list[bytes] = []

      def read(self, n):
          """Return up to ``n`` bytes, pulling more chunks as needed.

          Fewer than ``n`` bytes are returned only once the chunk iterator is
          exhausted.
          """
          buffered = sum(len(piece) for piece in self._buffer)
          while buffered < n:
              chunk = next(self._iter, None)
              if chunk is None:
                  break
              self._buffer.append(chunk)
              buffered += len(chunk)
          data = b"".join(self._buffer)
          # Keep whatever was not handed out for the next call.
          self._buffer = [data[n:]]
          return data[:n]
  class _LengthLimitedFile:
      """Wrapper class to limit the length of reads from a file-like object.

      This is used to ensure EOF is read from the wsgi.input object once
      Content-Length bytes are read. This behavior is required by the WSGI spec
      but not implemented in wsgiref as of 2.5.
      """

      def __init__(self, input, max_bytes) -> None:
          self._input = input
          # Number of bytes that may still be requested from the input.
          self._bytes_avail = max_bytes

      def read(self, size=-1):
          """Read at most ``size`` bytes without exceeding the byte limit.

          Args:
            size: Maximum number of bytes to read. ``None`` or any negative
              value means "everything that is still allowed".
          Returns: The bytes read from the wrapped input, or ``b""`` once the
            limit has been used up.
          """
          if self._bytes_avail <= 0:
              return b""
          # Treat None and every negative size as "read the remainder";
          # otherwise a negative size would enlarge the remaining budget
          # and None would fail the comparison.
          if size is None or size < 0 or size > self._bytes_avail:
              size = self._bytes_avail
          self._bytes_avail -= size
          return self._input.read(size)

      # TODO: support more methods as necessary
  def handle_service_request(req, backend, mat):
      """Run the handler for a smart-HTTP service POST request.

      Args:
        req: The HTTPGitRequest object to send output to.
        backend: Backend used to open the repository.
        mat: Regex match whose matched text names the service.
      Returns: Iterator yielding the body of a 403 (unsupported service) or
        404 (no repository) response; on success the handler's output goes
        through the writer returned by ``req.respond``.
      """
      service = mat.group().lstrip("/")
      logger.info("Handling service request for %s", service)
      handler_cls = req.handlers.get(service.encode("ascii"), None)
      if handler_cls is None:
          yield req.forbidden("Unsupported service")
          return
      # Opening the repository here only verifies that it exists.
      try:
          get_repo(backend, mat)
      except NotGitRepository as e:
          yield req.not_found(str(e))
          return
      req.nocache()
      write = req.respond(HTTP_OK, f"application/x-{service}-result")
      chunked = req.environ.get("HTTP_TRANSFER_ENCODING") == "chunked"
      body = req.environ["wsgi.input"]
      read = ChunkReader(body).read if chunked else body.read
      proto = ReceivableProtocol(read, write)
      # TODO(jelmer): Find a way to pass in repo, rather than having handler_cls
      # reopen.
      handler = handler_cls(backend, [url_prefix(mat)], proto, stateless_rpc=True)
      handler.handle()
  class HTTPGitRequest:
      """Class encapsulating the state of a single git HTTP request.

      Attributes:
        environ: the WSGI environment for the request.
      """

      def __init__(
          self, environ, start_response, dumb: bool = False, handlers=None
      ) -> None:
          self._start_response = start_response
          self.environ = environ
          self.dumb = dumb
          self.handlers = handlers
          # Headers collected so far, and the cache policy appended on respond().
          self._headers: list[tuple[str, str]] = []
          self._cache_headers: list[tuple[str, str]] = []

      def add_header(self, name, value) -> None:
          """Queue a header for the response."""
          self._headers.append((name, value))

      def respond(
          self,
          status: str = HTTP_OK,
          content_type: Optional[str] = None,
          headers: Optional[list[tuple[str, str]]] = None,
      ):
          """Start the response with the given status and collected headers.

          Returns: Whatever the WSGI ``start_response`` callable returns.
          """
          self._headers.extend(headers or [])
          if content_type:
              self._headers.append(("Content-Type", content_type))
          self._headers.extend(self._cache_headers)
          return self._start_response(status, self._headers)

      def _plain_text_reply(self, status: str, message: str) -> bytes:
          """Start a text/plain response and return the message as bytes."""
          self.respond(status, "text/plain")
          return message.encode("ascii")

      def not_found(self, message: str) -> bytes:
          """Start an HTTP 404 response and return the message body."""
          self._cache_headers = []
          logger.info("Not found: %s", message)
          return self._plain_text_reply(HTTP_NOT_FOUND, message)

      def forbidden(self, message: str) -> bytes:
          """Start an HTTP 403 response and return the message body."""
          self._cache_headers = []
          logger.info("Forbidden: %s", message)
          return self._plain_text_reply(HTTP_FORBIDDEN, message)

      def error(self, message: str) -> bytes:
          """Start an HTTP 500 response and return the message body."""
          self._cache_headers = []
          logger.error("Error: %s", message)
          return self._plain_text_reply(HTTP_ERROR, message)

      def nocache(self) -> None:
          """Mark the response as never cacheable by the client."""
          self._cache_headers = NO_CACHE_HEADERS

      def cache_forever(self) -> None:
          """Mark the response as cacheable forever by the client."""
          self._cache_headers = cache_forever_headers()
  class HTTPGitApplication:
      """Class encapsulating the state of a git WSGI application.

      Attributes:
        backend: the Backend object backing this application
      """

      # Routing table: (HTTP method, path pattern) -> request handler.
      services: ClassVar[
          dict[
              tuple[str, re.Pattern],
              Callable[[HTTPGitRequest, Backend, re.Match], Iterator[bytes]],
          ]
      ] = {
          ("GET", re.compile("/HEAD$")): get_text_file,
          ("GET", re.compile("/info/refs$")): get_info_refs,
          ("GET", re.compile("/objects/info/alternates$")): get_text_file,
          ("GET", re.compile("/objects/info/http-alternates$")): get_text_file,
          ("GET", re.compile("/objects/info/packs$")): get_info_packs,
          (
              "GET",
              re.compile("/objects/([0-9a-f]{2})/([0-9a-f]{38})$"),
          ): get_loose_object,
          (
              "GET",
              re.compile("/objects/pack/pack-([0-9a-f]{40})\\.pack$"),
          ): get_pack_file,
          (
              "GET",
              re.compile("/objects/pack/pack-([0-9a-f]{40})\\.idx$"),
          ): get_idx_file,
          ("POST", re.compile("/git-upload-pack$")): handle_service_request,
          ("POST", re.compile("/git-receive-pack$")): handle_service_request,
      }

      def __init__(
          self, backend, dumb: bool = False, handlers=None, fallback_app=None
      ) -> None:
          self.backend = backend
          self.dumb = dumb
          self.fallback_app = fallback_app
          # Start from the default handlers; caller-supplied ones override.
          self.handlers = dict(DEFAULT_HANDLERS)
          if handlers is not None:
              self.handlers.update(handlers)

      def __call__(self, environ, start_response):
          """Dispatch a WSGI request to the first matching service handler.

          Requests matching no route go to ``fallback_app`` when one is
          configured, and otherwise receive a 404 response.
          """
          path = environ["PATH_INFO"]
          method = environ["REQUEST_METHOD"]
          req = HTTPGitRequest(
              environ, start_response, dumb=self.dumb, handlers=self.handlers
          )
          # environ['QUERY_STRING'] has qs args
          for (smethod, spath), handler in self.services.items():
              if smethod != method:
                  continue
              mat = spath.search(path)
              if mat:
                  return handler(req, self.backend, mat)

          if self.fallback_app is not None:
              return self.fallback_app(environ, start_response)
          return [req.not_found("Sorry, that method is not supported")]
  class GunzipFilter:
      """WSGI middleware that decompresses gzip-encoded request bodies before
      handing the request to the wrapped application.
      """

      def __init__(self, application) -> None:
          self.app = application

      def __call__(self, environ, start_response):
          import gzip

          if environ.get("HTTP_CONTENT_ENCODING", "") == "gzip":
              compressed = environ["wsgi.input"]
              environ["wsgi.input"] = gzip.GzipFile(
                  filename=None, fileobj=compressed, mode="rb"
              )
              del environ["HTTP_CONTENT_ENCODING"]
              # The declared length described the compressed body, so drop it.
              environ.pop("CONTENT_LENGTH", None)

          return self.app(environ, start_response)
  class LimitedInputFilter:
      """WSGI middleware that caps the readable request body at the length
      given by Content-Length.
      """

      def __init__(self, application) -> None:
          self.app = application

      def __call__(self, environ, start_response):
          # This is not necessary if this app is run from a conforming WSGI
          # server. Unfortunately, there's no way to tell that at this point.
          # TODO: git may use HTTP/1.1 chunked encoding instead of specifying
          # content-length
          declared_length = environ.get("CONTENT_LENGTH", "")
          if declared_length:
              stream = environ["wsgi.input"]
              environ["wsgi.input"] = _LengthLimitedFile(
                  stream, int(declared_length)
              )
          return self.app(environ, start_response)
  def make_wsgi_chain(*args, **kwargs):
      """Create an HTTPGitApplication wrapped in the middleware it needs.

      All arguments are passed on to HTTPGitApplication. The application is
      wrapped in GunzipFilter, which is in turn wrapped in
      LimitedInputFilter.
      """
      git_app = HTTPGitApplication(*args, **kwargs)
      unzipping_app = GunzipFilter(git_app)
      return LimitedInputFilter(unzipping_app)
  class ServerHandlerLogger(ServerHandler):
      """ServerHandler that routes its log output through dulwich's logger."""

      def log_exception(self, exc_info) -> None:
          """Log an exception raised while processing a request."""
          message = "Exception happened during processing of request"
          logger.exception(message, exc_info=exc_info)

      def log_message(self, format, *args) -> None:
          """Log a message at INFO level."""
          logger.info(format, *args)

      def log_error(self, *args) -> None:
          """Log a message at ERROR level."""
          logger.error(*args)
  class WSGIRequestHandlerLogger(WSGIRequestHandler):
      """WSGIRequestHandler that routes its log output through dulwich's logger."""

      def log_exception(self, exc_info) -> None:
          """Log an exception raised while processing a request."""
          message = "Exception happened during processing of request"
          logger.exception(message, exc_info=exc_info)

      def log_message(self, format, *args) -> None:
          """Log a message at INFO level."""
          logger.info(format, *args)

      def log_error(self, *args) -> None:
          """Log a message at ERROR level."""
          logger.error(*args)

      def handle(self) -> None:
          """Handle a single HTTP request."""
          self.raw_requestline = self.rfile.readline()
          parsed = self.parse_request()
          if not parsed:
              # An error code has been sent, just exit
              return

          handler = ServerHandlerLogger(
              self.rfile,
              self.wfile,  # type: ignore
              self.get_stderr(),
              self.get_environ(),
          )
          # backpointer for logging
          handler.request_handler = self  # type: ignore
          application = self.server.get_app()  # type: ignore
          handler.run(application)
  class WSGIServerLogger(WSGIServer):
      """WSGIServer that reports request errors through dulwich's logger."""

      def handle_error(self, request, client_address) -> None:
          """Log the exception raised while serving ``client_address``."""
          origin = str(client_address)
          logger.exception(
              "Exception happened during processing of request from " + origin
          )
  def main(argv=sys.argv) -> None:
      """Entry point for starting an HTTP git server.

      Args:
        argv: Command-line arguments, program name first. An optional
          positional argument names the git directory to serve; the current
          working directory is used when it is omitted.
      """
      import optparse

      parser = optparse.OptionParser()
      parser.add_option(
          "-l",
          "--listen_address",
          dest="listen_address",
          default="localhost",
          help="Binding IP address.",
      )
      parser.add_option(
          "-p",
          "--port",
          dest="port",
          type=int,
          default=8000,
          help="Port to listen on.",
      )
      options, args = parser.parse_args(argv)

      # args[0] is the program name; the repository path, if any, follows it.
      gitdir = args[1] if len(args) > 1 else os.getcwd()

      log_utils.default_logging_config()
      backend = DictBackend({"/": Repo(gitdir)})
      app = make_wsgi_chain(backend)
      address, port = options.listen_address, options.port
      server = make_server(
          address,
          port,
          app,
          handler_class=WSGIRequestHandlerLogger,
          server_class=WSGIServerLogger,
      )
      logger.info("Listening for HTTP connections on %s:%d", address, port)
      server.serve_forever()


  if __name__ == "__main__":
      main()