Procházet zdrojové kódy

Some fixes for the dumb transfer protocol (#1646)

I noticed the dumb transport protocol was added in latest dulwich
release and tried to use that new feature in the Software Heritage git
loader so we could remove our [homemade
support](https://gitlab.softwareheritage.org/swh/devel/swh-loader-git/-/blob/67b322effb00e81d7696a0ce676ec0d3241143ef/swh/loader/git/dumb.py)
of it and only relies on dulwich.

I stumbled across a couple of bugs after plugging dulwich dumb transfer
protocol into the SWH loader, this pull request fixes the issue I
observed and adds more tests.

After these fixes, the loader works fine when encountering a repository
that only offers dumb transfer protocol. Almost all of our git loader
tests are still passing apart a couple that need some mocking adaptation
due to the changes in the loader implementation.
Jelmer Vernooij před 1 měsícem
rodič
revize
3f58b0f69d
5 změnil soubory, kde provedl 87 přidání a 32 odebrání
  1. 19 13
      dulwich/client.py
  2. 21 9
      dulwich/dumb.py
  3. 35 9
      tests/compat/test_dumb.py
  4. 6 0
      tests/test_client.py
  5. 6 1
      tests/test_dumb.py

+ 19 - 13
dulwich/client.py

@@ -40,6 +40,7 @@ Known capabilities that are not supported:
 """
 
 import copy
+import functools
 import logging
 import os
 import select
@@ -2506,7 +2507,7 @@ class AbstractHttpGitClient(GitClient):
         self.dumb = dumb
         GitClient.__init__(self, **kwargs)
 
-    def _http_request(self, url, headers=None, data=None):
+    def _http_request(self, url, headers=None, data=None, raise_for_status=True):
         """Perform HTTP request.
 
         Args:
@@ -2813,14 +2814,16 @@ class AbstractHttpGitClient(GitClient):
             wants = determine_wants(refs)
         if wants is not None:
             wants = [cid for cid in wants if cid != ZERO_SHA]
-        if not wants:
+        if not wants and not self.dumb:
             return FetchPackResult(refs, symrefs, agent)
-        if self.dumb:
+        elif self.dumb:
             # Use dumb HTTP protocol
             from .dumb import DumbRemoteHTTPRepo
 
             # Pass http_request function
-            dumb_repo = DumbRemoteHTTPRepo(url, self._http_request)
+            dumb_repo = DumbRemoteHTTPRepo(
+                url, functools.partial(self._http_request, raise_for_status=False)
+            )
 
             # Fetch pack data from dumb remote
             pack_data_list = list(
@@ -2829,6 +2832,8 @@ class AbstractHttpGitClient(GitClient):
                 )
             )
 
+            symrefs[b"HEAD"] = dumb_repo.get_head()
+
             # Write pack data
             if pack_data:
                 from .pack import write_pack_data
@@ -2983,7 +2988,7 @@ class Urllib3HttpGitClient(AbstractHttpGitClient):
             path = path.decode("utf-8")
         return urljoin(self._base_url, path).rstrip("/") + "/"
 
-    def _http_request(self, url, headers=None, data=None):
+    def _http_request(self, url, headers=None, data=None, raise_for_status=True):
         import urllib3.exceptions
 
         req_headers = self.pool_manager.headers.copy()
@@ -3007,14 +3012,15 @@ class Urllib3HttpGitClient(AbstractHttpGitClient):
         except urllib3.exceptions.HTTPError as e:
             raise GitProtocolError(str(e)) from e
 
-        if resp.status == 404:
-            raise NotGitRepository
-        if resp.status == 401:
-            raise HTTPUnauthorized(resp.headers.get("WWW-Authenticate"), url)
-        if resp.status == 407:
-            raise HTTPProxyUnauthorized(resp.headers.get("Proxy-Authenticate"), url)
-        if resp.status != 200:
-            raise GitProtocolError(f"unexpected http resp {resp.status} for {url}")
+        if raise_for_status:
+            if resp.status == 404:
+                raise NotGitRepository
+            if resp.status == 401:
+                raise HTTPUnauthorized(resp.headers.get("WWW-Authenticate"), url)
+            if resp.status == 407:
+                raise HTTPProxyUnauthorized(resp.headers.get("Proxy-Authenticate"), url)
+            if resp.status != 200:
+                raise GitProtocolError(f"unexpected http resp {resp.status} for {url}")
 
         resp.content_type = resp.headers.get("Content-Type")
         # Check if geturl() is available (urllib3 version >= 1.23)

+ 21 - 9
dulwich/dumb.py

@@ -42,7 +42,7 @@ from .objects import (
     hex_to_sha,
     sha_to_hex,
 )
-from .pack import Pack, PackIndex, UnpackedObject, load_pack_index_file
+from .pack import Pack, PackData, PackIndex, UnpackedObject, load_pack_index_file
 from .refs import Ref, read_info_refs, split_peeled_refs
 from .repo import BaseRepo
 
@@ -212,13 +212,13 @@ class DumbHTTPObjectStore(BaseObjectStore):
         # Convert hex to binary for pack operations
         binsha = hex_to_sha(sha)
 
-        for pack_name, idx in self._packs or []:
-            if idx is None:
-                idx = self._get_pack_index(pack_name)
+        for pack_name, pack_idx in self._packs or []:
+            if pack_idx is None:
+                pack_idx = self._get_pack_index(pack_name)
 
             try:
                 # Check if object is in this pack
-                idx.object_offset(binsha)
+                pack_idx.object_offset(binsha)
             except KeyError:
                 continue
 
@@ -232,12 +232,13 @@ class DumbHTTPObjectStore(BaseObjectStore):
 
             if not os.path.exists(pack_path):
                 # Download the pack file
-                pack_data = self._fetch_url(f"objects/pack/{pack_name}.pack")
+                data = self._fetch_url(f"objects/pack/{pack_name}.pack")
                 with open(pack_path, "wb") as f:
-                    f.write(pack_data)
+                    f.write(data)
 
             # Open the pack and get the object
-            pack = Pack(pack_path[:-5])  # Remove .pack extension
+            pack_data = PackData(pack_path)
+            pack = Pack.from_objects(pack_data, pack_idx)
             try:
                 return pack.get_raw(binsha)
             finally:
@@ -398,6 +399,17 @@ class DumbRemoteHTTPRepo(BaseRepo):
 
         return dict(self._refs)
 
+    def get_head(self) -> Ref:
+        head_resp_bytes = self._fetch_url("HEAD")
+        head_split = head_resp_bytes.replace(b"\n", b"").split(b" ")
+        head_target = head_split[1] if len(head_split) > 1 else head_split[0]
+        # handle HEAD legacy format containing a commit id instead of a ref name
+        for ref_name, ret_target in self.get_refs().items():
+            if ret_target == head_target:
+                head_target = ref_name
+                break
+        return head_target
+
     def get_peeled(self, ref: Ref) -> ObjectID:
         """Get the peeled value of a ref."""
         # For dumb HTTP, we don't have peeled refs readily available
@@ -461,4 +473,4 @@ class DumbRemoteHTTPRepo(BaseRepo):
                     to_fetch.add(item_sha)
 
             if progress:
-                progress(f"Fetching objects: {len(seen)} done")
+                progress(f"Fetching objects: {len(seen)} done\n".encode())

+ 35 - 9
tests/compat/test_dumb.py

@@ -29,6 +29,7 @@ from http.server import HTTPServer, SimpleHTTPRequestHandler
 from unittest import skipUnless
 
 from dulwich.client import HttpGitClient
+from dulwich.porcelain import clone
 from dulwich.repo import Repo
 from tests.compat.utils import (
     CompatTestCase,
@@ -100,9 +101,11 @@ class DumbHTTPGitServer:
         return f"http://127.0.0.1:{self.port}"
 
 
-class DumbHTTPClientTests(CompatTestCase):
+class DumbHTTPClientNoPackTests(CompatTestCase):
     """Tests for dumb HTTP client against real git repositories."""
 
+    with_pack = False
+
     def setUp(self):
         super().setUp()
         # Create a temporary directory for test repos
@@ -123,12 +126,17 @@ class DumbHTTPClientTests(CompatTestCase):
         )
         run_git_or_fail(["config", "user.name", "Test User"], cwd=self.work_path)
 
-        # Create initial commit
-        test_file = os.path.join(self.work_path, "test.txt")
-        with open(test_file, "w") as f:
-            f.write("Hello, world!\n")
-        run_git_or_fail(["add", "test.txt"], cwd=self.work_path)
-        run_git_or_fail(["commit", "-m", "Initial commit"], cwd=self.work_path)
+        nb_files = 10
+        if self.with_pack:
+            # adding more files will create a pack file in the repository
+            nb_files = 50
+
+        for i in range(nb_files):
+            test_file = os.path.join(self.work_path, f"test{i}.txt")
+            with open(test_file, "w") as f:
+                f.write(f"Hello, world {i}!\n")
+            run_git_or_fail(["add", f"test{i}.txt"], cwd=self.work_path)
+            run_git_or_fail(["commit", "-m", f"Commit {i}"], cwd=self.work_path)
 
         # Push to origin
         run_git_or_fail(
@@ -144,6 +152,20 @@ class DumbHTTPClientTests(CompatTestCase):
         self.server.start()
         self.addCleanup(self.server.stop)
 
+        pack_dir = os.path.join(self.origin_path, "objects", "pack")
+        if self.with_pack:
+            assert os.listdir(pack_dir)
+        else:
+            assert not os.listdir(pack_dir)
+
+    @skipUnless(
+        sys.platform != "win32", "git clone from Python HTTPServer fails on Windows"
+    )
+    def test_clone_dumb(self):
+        dest_path = os.path.join(self.temp_dir, "cloned")
+        repo = clone(self.server.url, dest_path)
+        assert b"HEAD" in repo
+
     def test_clone_from_dumb_http(self):
         """Test cloning from a dumb HTTP server."""
         dest_path = os.path.join(self.temp_dir, "cloned")
@@ -172,10 +194,10 @@ class DumbHTTPClientTests(CompatTestCase):
             dest_repo.reset_index()
 
             # Verify the clone
-            test_file = os.path.join(dest_path, "test.txt")
+            test_file = os.path.join(dest_path, "test0.txt")
             self.assertTrue(os.path.exists(test_file))
             with open(test_file) as f:
-                self.assertEqual("Hello, world!\n", f.read())
+                self.assertEqual("Hello, world 0!\n", f.read())
         finally:
             # Ensure repo is closed before cleanup
             dest_repo.close()
@@ -276,3 +298,7 @@ class DumbHTTPClientTests(CompatTestCase):
         finally:
             # Ensure repo is closed before cleanup
             dest_repo.close()
+
+
+class DumbHTTPClientWithPackTests(DumbHTTPClientNoPackTests):
+    with_pack = True

+ 6 - 0
tests/test_client.py

@@ -1305,6 +1305,7 @@ class HttpGitClientTests(TestCase):
         info_refs_content = (
             b"0123456789abcdef0123456789abcdef01234567\trefs/heads/master\n"
         )
+        head_content = b"ref: refs/heads/master"
 
         # Create a blob object for testing
         blob_content = b"Hello, dumb HTTP!"
@@ -1316,6 +1317,11 @@ class HttpGitClientTests(TestCase):
         blob_compressed = zlib.compress(blob_obj_data)
 
         responses = {
+            "/HEAD": {
+                "status": 200,
+                "content": head_content,
+                "content_type": "text/plain",
+            },
             "/git-upload-pack": {
                 "status": 404,
                 "content": b"Not Found",

+ 6 - 1
tests/test_dumb.py

@@ -280,7 +280,12 @@ fedcba9876543210fedcba9876543210fedcba98\trefs/tags/v1.0
         def determine_wants(refs):
             return [blob_sha]
 
-        result = list(self.repo.fetch_pack_data(graph_walker, determine_wants))
+        def progress(msg):
+            assert isinstance(msg, bytes)
+
+        result = list(
+            self.repo.fetch_pack_data(graph_walker, determine_wants, progress)
+        )
         self.assertEqual(1, len(result))
         self.assertEqual(Blob.type_num, result[0].pack_type_num)
         self.assertEqual([blob.as_raw_string()], result[0].obj_chunks)