web.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487
  1. # web.py -- WSGI smart-http server
  2. # Copyright (C) 2010 Google, Inc.
  3. # Copyright (C) 2012 Jelmer Vernooij <jelmer@samba.org>
  4. #
  5. # This program is free software; you can redistribute it and/or
  6. # modify it under the terms of the GNU General Public License
  7. # as published by the Free Software Foundation; version 2
  8. # or (at your option) any later version of the License.
  9. #
  10. # This program is distributed in the hope that it will be useful,
  11. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. # GNU General Public License for more details.
  14. #
  15. # You should have received a copy of the GNU General Public License
  16. # along with this program; if not, write to the Free Software
  17. # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  18. # MA 02110-1301, USA.
  19. """HTTP server for dulwich that implements the git smart HTTP protocol."""
  20. from cStringIO import StringIO
  21. import gzip
  22. import os
  23. import re
  24. import sys
  25. import time
  26. try:
  27. from urlparse import parse_qs
  28. except ImportError:
  29. from dulwich._compat import parse_qs
  30. from dulwich import log_utils
  31. from dulwich.protocol import (
  32. ReceivableProtocol,
  33. )
  34. from dulwich.repo import (
  35. Repo,
  36. )
  37. from dulwich.server import (
  38. DictBackend,
  39. DEFAULT_HANDLERS,
  40. generate_info_refs,
  41. generate_objects_info_packs,
  42. )
# Module-level logger obtained from dulwich's log_utils for this module.
logger = log_utils.getLogger(__name__)

# HTTP status lines handed to the WSGI start_response callable (these cover
# the success case as well as the error cases).
HTTP_OK = '200 OK'
HTTP_NOT_FOUND = '404 Not Found'
HTTP_FORBIDDEN = '403 Forbidden'
HTTP_ERROR = '500 Internal Server Error'
  49. def date_time_string(timestamp=None):
  50. # From BaseHTTPRequestHandler.date_time_string in BaseHTTPServer.py in the
  51. # Python 2.6.5 standard library, following modifications:
  52. # - Made a global rather than an instance method.
  53. # - weekdayname and monthname are renamed and locals rather than class
  54. # variables.
  55. # Copyright (c) 2001-2010 Python Software Foundation; All Rights Reserved
  56. weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
  57. months = [None,
  58. 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
  59. 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
  60. if timestamp is None:
  61. timestamp = time.time()
  62. year, month, day, hh, mm, ss, wd, y, z = time.gmtime(timestamp)
  63. return '%s, %02d %3s %4d %02d:%02d:%02d GMD' % (
  64. weekdays[wd], day, months[month], year, hh, mm, ss)
  65. def url_prefix(mat):
  66. """Extract the URL prefix from a regex match.
  67. :param mat: A regex match object.
  68. :returns: The URL prefix, defined as the text before the match in the
  69. original string. Normalized to start with one leading slash and end with
  70. zero.
  71. """
  72. return '/' + mat.string[:mat.start()].strip('/')
  73. def get_repo(backend, mat):
  74. """Get a Repo instance for the given backend and URL regex match."""
  75. return backend.open_repository(url_prefix(mat))
  76. def send_file(req, f, content_type):
  77. """Send a file-like object to the request output.
  78. :param req: The HTTPGitRequest object to send output to.
  79. :param f: An open file-like object to send; will be closed.
  80. :param content_type: The MIME type for the file.
  81. :return: Iterator over the contents of the file, as chunks.
  82. """
  83. if f is None:
  84. yield req.not_found('File not found')
  85. return
  86. try:
  87. req.respond(HTTP_OK, content_type)
  88. while True:
  89. data = f.read(10240)
  90. if not data:
  91. break
  92. yield data
  93. f.close()
  94. except IOError:
  95. f.close()
  96. yield req.error('Error reading file')
  97. except:
  98. f.close()
  99. raise
  100. def _url_to_path(url):
  101. return url.replace('/', os.path.sep)
  102. def get_text_file(req, backend, mat):
  103. req.nocache()
  104. path = _url_to_path(mat.group())
  105. logger.info('Sending plain text file %s', path)
  106. return send_file(req, get_repo(backend, mat).get_named_file(path),
  107. 'text/plain')
  108. def get_loose_object(req, backend, mat):
  109. sha = mat.group(1) + mat.group(2)
  110. logger.info('Sending loose object %s', sha)
  111. object_store = get_repo(backend, mat).object_store
  112. if not object_store.contains_loose(sha):
  113. yield req.not_found('Object not found')
  114. return
  115. try:
  116. data = object_store[sha].as_legacy_object()
  117. except IOError:
  118. yield req.error('Error reading object')
  119. return
  120. req.cache_forever()
  121. req.respond(HTTP_OK, 'application/x-git-loose-object')
  122. yield data
  123. def get_pack_file(req, backend, mat):
  124. req.cache_forever()
  125. path = _url_to_path(mat.group())
  126. logger.info('Sending pack file %s', path)
  127. return send_file(req, get_repo(backend, mat).get_named_file(path),
  128. 'application/x-git-packed-objects')
  129. def get_idx_file(req, backend, mat):
  130. req.cache_forever()
  131. path = _url_to_path(mat.group())
  132. logger.info('Sending pack file %s', path)
  133. return send_file(req, get_repo(backend, mat).get_named_file(path),
  134. 'application/x-git-packed-objects-toc')
  135. def get_info_refs(req, backend, mat):
  136. params = parse_qs(req.environ['QUERY_STRING'])
  137. service = params.get('service', [None])[0]
  138. if service and not req.dumb:
  139. handler_cls = req.handlers.get(service, None)
  140. if handler_cls is None:
  141. yield req.forbidden('Unsupported service %s' % service)
  142. return
  143. req.nocache()
  144. write = req.respond(HTTP_OK, 'application/x-%s-advertisement' % service)
  145. proto = ReceivableProtocol(StringIO().read, write)
  146. handler = handler_cls(backend, [url_prefix(mat)], proto,
  147. http_req=req, advertise_refs=True)
  148. handler.proto.write_pkt_line('# service=%s\n' % service)
  149. handler.proto.write_pkt_line(None)
  150. handler.handle()
  151. else:
  152. # non-smart fallback
  153. # TODO: select_getanyfile() (see http-backend.c)
  154. req.nocache()
  155. req.respond(HTTP_OK, 'text/plain')
  156. logger.info('Emulating dumb info/refs')
  157. repo = get_repo(backend, mat)
  158. for text in generate_info_refs(repo):
  159. yield text
  160. def get_info_packs(req, backend, mat):
  161. req.nocache()
  162. req.respond(HTTP_OK, 'text/plain')
  163. logger.info('Emulating dumb info/packs')
  164. return generate_objects_info_packs(get_repo(backend, mat))
  165. class _LengthLimitedFile(object):
  166. """Wrapper class to limit the length of reads from a file-like object.
  167. This is used to ensure EOF is read from the wsgi.input object once
  168. Content-Length bytes are read. This behavior is required by the WSGI spec
  169. but not implemented in wsgiref as of 2.5.
  170. """
  171. def __init__(self, input, max_bytes):
  172. self._input = input
  173. self._bytes_avail = max_bytes
  174. def read(self, size=-1):
  175. if self._bytes_avail <= 0:
  176. return ''
  177. if size == -1 or size > self._bytes_avail:
  178. size = self._bytes_avail
  179. self._bytes_avail -= size
  180. return self._input.read(size)
  181. # TODO: support more methods as necessary
  182. def handle_service_request(req, backend, mat):
  183. service = mat.group().lstrip('/')
  184. logger.info('Handling service request for %s', service)
  185. handler_cls = req.handlers.get(service, None)
  186. if handler_cls is None:
  187. yield req.forbidden('Unsupported service %s' % service)
  188. return
  189. req.nocache()
  190. write = req.respond(HTTP_OK, 'application/x-%s-result' % service)
  191. proto = ReceivableProtocol(req.environ['wsgi.input'].read, write)
  192. handler = handler_cls(backend, [url_prefix(mat)], proto, http_req=req)
  193. handler.handle()
  194. class HTTPGitRequest(object):
  195. """Class encapsulating the state of a single git HTTP request.
  196. :ivar environ: the WSGI environment for the request.
  197. """
  198. def __init__(self, environ, start_response, dumb=False, handlers=None):
  199. self.environ = environ
  200. self.dumb = dumb
  201. self.handlers = handlers
  202. self._start_response = start_response
  203. self._cache_headers = []
  204. self._headers = []
  205. def add_header(self, name, value):
  206. """Add a header to the response."""
  207. self._headers.append((name, value))
  208. def respond(self, status=HTTP_OK, content_type=None, headers=None):
  209. """Begin a response with the given status and other headers."""
  210. if headers:
  211. self._headers.extend(headers)
  212. if content_type:
  213. self._headers.append(('Content-Type', content_type))
  214. self._headers.extend(self._cache_headers)
  215. return self._start_response(status, self._headers)
  216. def not_found(self, message):
  217. """Begin a HTTP 404 response and return the text of a message."""
  218. self._cache_headers = []
  219. logger.info('Not found: %s', message)
  220. self.respond(HTTP_NOT_FOUND, 'text/plain')
  221. return message
  222. def forbidden(self, message):
  223. """Begin a HTTP 403 response and return the text of a message."""
  224. self._cache_headers = []
  225. logger.info('Forbidden: %s', message)
  226. self.respond(HTTP_FORBIDDEN, 'text/plain')
  227. return message
  228. def error(self, message):
  229. """Begin a HTTP 500 response and return the text of a message."""
  230. self._cache_headers = []
  231. logger.error('Error: %s', message)
  232. self.respond(HTTP_ERROR, 'text/plain')
  233. return message
  234. def nocache(self):
  235. """Set the response to never be cached by the client."""
  236. self._cache_headers = [
  237. ('Expires', 'Fri, 01 Jan 1980 00:00:00 GMT'),
  238. ('Pragma', 'no-cache'),
  239. ('Cache-Control', 'no-cache, max-age=0, must-revalidate'),
  240. ]
  241. def cache_forever(self):
  242. """Set the response to be cached forever by the client."""
  243. now = time.time()
  244. self._cache_headers = [
  245. ('Date', date_time_string(now)),
  246. ('Expires', date_time_string(now + 31536000)),
  247. ('Cache-Control', 'public, max-age=31536000'),
  248. ]
  249. class HTTPGitApplication(object):
  250. """Class encapsulating the state of a git WSGI application.
  251. :ivar backend: the Backend object backing this application
  252. """
  253. services = {
  254. ('GET', re.compile('/HEAD$')): get_text_file,
  255. ('GET', re.compile('/info/refs$')): get_info_refs,
  256. ('GET', re.compile('/objects/info/alternates$')): get_text_file,
  257. ('GET', re.compile('/objects/info/http-alternates$')): get_text_file,
  258. ('GET', re.compile('/objects/info/packs$')): get_info_packs,
  259. ('GET', re.compile('/objects/([0-9a-f]{2})/([0-9a-f]{38})$')): get_loose_object,
  260. ('GET', re.compile('/objects/pack/pack-([0-9a-f]{40})\\.pack$')): get_pack_file,
  261. ('GET', re.compile('/objects/pack/pack-([0-9a-f]{40})\\.idx$')): get_idx_file,
  262. ('POST', re.compile('/git-upload-pack$')): handle_service_request,
  263. ('POST', re.compile('/git-receive-pack$')): handle_service_request,
  264. }
  265. def __init__(self, backend, dumb=False, handlers=None, fallback_app=None):
  266. self.backend = backend
  267. self.dumb = dumb
  268. self.handlers = dict(DEFAULT_HANDLERS)
  269. self.fallback_app = fallback_app
  270. if handlers is not None:
  271. self.handlers.update(handlers)
  272. def __call__(self, environ, start_response):
  273. path = environ['PATH_INFO']
  274. method = environ['REQUEST_METHOD']
  275. req = HTTPGitRequest(environ, start_response, dumb=self.dumb,
  276. handlers=self.handlers)
  277. # environ['QUERY_STRING'] has qs args
  278. handler = None
  279. for smethod, spath in self.services.iterkeys():
  280. if smethod != method:
  281. continue
  282. mat = spath.search(path)
  283. if mat:
  284. handler = self.services[smethod, spath]
  285. break
  286. if handler is None:
  287. if self.fallback_app is not None:
  288. return self.fallback_app(environ, start_response)
  289. else:
  290. return req.not_found('Sorry, that method is not supported')
  291. return handler(req, self.backend, mat)
  292. class GunzipFilter(object):
  293. """WSGI middleware that unzips gzip-encoded requests before
  294. passing on to the underlying application.
  295. """
  296. def __init__(self, application):
  297. self.app = application
  298. def __call__(self, environ, start_response):
  299. if environ.get('HTTP_CONTENT_ENCODING', '') == 'gzip':
  300. environ.pop('HTTP_CONTENT_ENCODING')
  301. if 'CONTENT_LENGTH' in environ:
  302. del environ['CONTENT_LENGTH']
  303. environ['wsgi.input'] = gzip.GzipFile(filename=None,
  304. fileobj=environ['wsgi.input'], mode='r')
  305. return self.app(environ, start_response)
  306. class LimitedInputFilter(object):
  307. """WSGI middleware that limits the input length of a request to that
  308. specified in Content-Length.
  309. """
  310. def __init__(self, application):
  311. self.app = application
  312. def __call__(self, environ, start_response):
  313. # This is not necessary if this app is run from a conforming WSGI
  314. # server. Unfortunately, there's no way to tell that at this point.
  315. # TODO: git may used HTTP/1.1 chunked encoding instead of specifying
  316. # content-length
  317. content_length = environ.get('CONTENT_LENGTH', '')
  318. if content_length:
  319. environ['wsgi.input'] = _LengthLimitedFile(
  320. environ['wsgi.input'], int(content_length))
  321. return self.app(environ, start_response)
  322. def make_wsgi_chain(*args, **kwargs):
  323. """Factory function to create an instance of HTTPGitApplication,
  324. correctly wrapped with needed middleware.
  325. """
  326. app = HTTPGitApplication(*args, **kwargs)
  327. wrapped_app = GunzipFilter(LimitedInputFilter(app))
  328. return wrapped_app
  329. # The reference server implementation is based on wsgiref, which is not
  330. # distributed with python 2.4. If wsgiref is not present, users will not be
  331. # able to use the HTTP server without a little extra work.
  332. try:
  333. from wsgiref.simple_server import (
  334. WSGIRequestHandler,
  335. ServerHandler,
  336. WSGIServer,
  337. make_server,
  338. )
  339. class ServerHandlerLogger(ServerHandler):
  340. """ServerHandler that uses dulwich's logger for logging exceptions."""
  341. def log_exception(self, exc_info):
  342. logger.exception('Exception happened during processing of request',
  343. exc_info=exc_info)
  344. def log_message(self, format, *args):
  345. logger.info(format, *args)
  346. def log_error(self, *args):
  347. logger.error(*args)
  348. class WSGIRequestHandlerLogger(WSGIRequestHandler):
  349. """WSGIRequestHandler that uses dulwich's logger for logging exceptions."""
  350. def log_exception(self, exc_info):
  351. logger.exception('Exception happened during processing of request',
  352. exc_info=exc_info)
  353. def log_message(self, format, *args):
  354. logger.info(format, *args)
  355. def log_error(self, *args):
  356. logger.error(*args)
  357. def handle(self):
  358. """Handle a single HTTP request"""
  359. self.raw_requestline = self.rfile.readline()
  360. if not self.parse_request(): # An error code has been sent, just exit
  361. return
  362. handler = ServerHandlerLogger(
  363. self.rfile, self.wfile, self.get_stderr(), self.get_environ()
  364. )
  365. handler.request_handler = self # backpointer for logging
  366. handler.run(self.server.get_app())
  367. class WSGIServerLogger(WSGIServer):
  368. def handle_error(self, request, client_address):
  369. """Handle an error. """
  370. logger.exception('Exception happened during processing of request from %s' % str(client_address))
  371. def main(argv=sys.argv):
  372. """Entry point for starting an HTTP git server."""
  373. if len(argv) > 1:
  374. gitdir = argv[1]
  375. else:
  376. gitdir = os.getcwd()
  377. # TODO: allow serving on other addresses/ports via command-line flag
  378. listen_addr = ''
  379. port = 8000
  380. log_utils.default_logging_config()
  381. backend = DictBackend({'/': Repo(gitdir)})
  382. app = make_wsgi_chain(backend)
  383. server = make_server(listen_addr, port, app,
  384. handler_class=WSGIRequestHandlerLogger,
  385. server_class=WSGIServerLogger)
  386. logger.info('Listening for HTTP connections on %s:%d', listen_addr,
  387. port)
  388. server.serve_forever()
  389. except ImportError:
  390. # No wsgiref found; don't provide the reference functionality, but leave
  391. # the rest of the WSGI-based implementation.
  392. def main(argv=sys.argv):
  393. """Stub entry point for failing to start a server without wsgiref."""
  394. sys.stderr.write(
  395. 'Sorry, the wsgiref module is required for dul-web.\n')
  396. sys.exit(1)
# Start the server (or the stub that reports the missing wsgiref module) when
# this file is executed as a script rather than imported.
if __name__ == '__main__':
    main()