lfs.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599
  1. # lfs.py -- Implementation of the LFS
  2. # Copyright (C) 2020 Jelmer Vernooij
  3. #
  4. # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
  5. # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
  6. # General Public License as published by the Free Software Foundation; version 2.0
  7. # or (at your option) any later version. You can redistribute it and/or
  8. # modify it under the terms of either of these two licenses.
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. # You should have received a copy of the licenses; if not, see
  17. # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  18. # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  19. # License, Version 2.0.
  20. #
  21. """Git Large File Storage (LFS) support.
  22. This module provides support for Git LFS, which is a Git extension for
  23. versioning large files. It replaces large files with text pointers inside Git,
  24. while storing the file contents on a remote server.
  25. Key components:
  26. - LFS pointer file parsing and creation
  27. - LFS object storage and retrieval
  28. - HTTP client for LFS server communication
  29. - Integration with dulwich repositories
  30. """
  31. import hashlib
  32. import json
  33. import logging
  34. logger = logging.getLogger(__name__)
  35. import os
  36. import tempfile
  37. from collections.abc import Iterable
  38. from dataclasses import dataclass
  39. from typing import TYPE_CHECKING, BinaryIO, Optional, Union
  40. from urllib.parse import urljoin, urlparse
  41. from urllib.request import Request, urlopen
  42. if TYPE_CHECKING:
  43. import urllib3
  44. from .config import Config
  45. from .repo import Repo
  46. @dataclass
  47. class LFSAction:
  48. """LFS action structure."""
  49. href: str
  50. header: Optional[dict[str, str]] = None
  51. expires_at: Optional[str] = None
  52. @dataclass
  53. class LFSErrorInfo:
  54. """LFS error structure."""
  55. code: int
  56. message: str
  57. @dataclass
  58. class LFSBatchObject:
  59. """LFS batch object structure."""
  60. oid: str
  61. size: int
  62. authenticated: Optional[bool] = None
  63. actions: Optional[dict[str, LFSAction]] = None
  64. error: Optional[LFSErrorInfo] = None
  65. @dataclass
  66. class LFSBatchResponse:
  67. """LFS batch response structure."""
  68. transfer: str
  69. objects: list[LFSBatchObject]
  70. hash_algo: Optional[str] = None
  71. class LFSStore:
  72. """Stores objects on disk, indexed by SHA256."""
  73. def __init__(self, path: str) -> None:
  74. """Initialize LFSStore."""
  75. self.path = path
  76. @classmethod
  77. def create(cls, lfs_dir: str) -> "LFSStore":
  78. """Create a new LFS store."""
  79. if not os.path.isdir(lfs_dir):
  80. os.mkdir(lfs_dir)
  81. tmp_dir = os.path.join(lfs_dir, "tmp")
  82. if not os.path.isdir(tmp_dir):
  83. os.mkdir(tmp_dir)
  84. objects_dir = os.path.join(lfs_dir, "objects")
  85. if not os.path.isdir(objects_dir):
  86. os.mkdir(objects_dir)
  87. return cls(lfs_dir)
  88. @classmethod
  89. def from_repo(cls, repo: "Repo", create: bool = False) -> "LFSStore":
  90. """Create LFS store from repository."""
  91. lfs_dir = os.path.join(repo.controldir(), "lfs")
  92. if create:
  93. return cls.create(lfs_dir)
  94. return cls(lfs_dir)
  95. @classmethod
  96. def from_controldir(cls, controldir: str, create: bool = False) -> "LFSStore":
  97. """Create LFS store from control directory."""
  98. lfs_dir = os.path.join(controldir, "lfs")
  99. if create:
  100. return cls.create(lfs_dir)
  101. return cls(lfs_dir)
  102. def _sha_path(self, sha: str) -> str:
  103. return os.path.join(self.path, "objects", sha[0:2], sha[2:4], sha)
  104. def open_object(self, sha: str) -> BinaryIO:
  105. """Open an object by sha."""
  106. try:
  107. return open(self._sha_path(sha), "rb")
  108. except FileNotFoundError as exc:
  109. raise KeyError(sha) from exc
  110. def write_object(self, chunks: Iterable[bytes]) -> str:
  111. """Write an object.
  112. Returns: object SHA
  113. """
  114. sha = hashlib.sha256()
  115. tmpdir = os.path.join(self.path, "tmp")
  116. with tempfile.NamedTemporaryFile(dir=tmpdir, mode="wb", delete=False) as f:
  117. for chunk in chunks:
  118. sha.update(chunk)
  119. f.write(chunk)
  120. f.flush()
  121. tmppath = f.name
  122. path = self._sha_path(sha.hexdigest())
  123. if not os.path.exists(os.path.dirname(path)):
  124. os.makedirs(os.path.dirname(path))
  125. # Handle concurrent writes - if file already exists, just remove temp file
  126. if os.path.exists(path):
  127. os.remove(tmppath)
  128. else:
  129. os.rename(tmppath, path)
  130. return sha.hexdigest()
  131. class LFSPointer:
  132. """Represents an LFS pointer file."""
  133. def __init__(self, oid: str, size: int) -> None:
  134. """Initialize LFSPointer."""
  135. self.oid = oid
  136. self.size = size
  137. @classmethod
  138. def from_bytes(cls, data: bytes) -> Optional["LFSPointer"]:
  139. """Parse LFS pointer from bytes.
  140. Returns None if data is not a valid LFS pointer.
  141. """
  142. try:
  143. text = data.decode("utf-8")
  144. except UnicodeDecodeError:
  145. return None
  146. # LFS pointer files have a specific format
  147. lines = text.strip().split("\n")
  148. if len(lines) < 3:
  149. return None
  150. # Must start with version
  151. if not lines[0].startswith("version https://git-lfs.github.com/spec/v1"):
  152. return None
  153. oid = None
  154. size = None
  155. for line in lines[1:]:
  156. if line.startswith("oid sha256:"):
  157. oid = line[11:].strip()
  158. elif line.startswith("size "):
  159. try:
  160. size = int(line[5:].strip())
  161. # Size must be non-negative
  162. if size < 0:
  163. return None
  164. except ValueError:
  165. return None
  166. if oid is None or size is None:
  167. return None
  168. return cls(oid, size)
  169. def to_bytes(self) -> bytes:
  170. """Convert LFS pointer to bytes."""
  171. return (
  172. f"version https://git-lfs.github.com/spec/v1\n"
  173. f"oid sha256:{self.oid}\n"
  174. f"size {self.size}\n"
  175. ).encode()
  176. def is_valid_oid(self) -> bool:
  177. """Check if the OID is valid SHA256."""
  178. if len(self.oid) != 64:
  179. return False
  180. try:
  181. int(self.oid, 16)
  182. return True
  183. except ValueError:
  184. return False
  185. class LFSFilterDriver:
  186. """LFS filter driver implementation."""
  187. def __init__(
  188. self, lfs_store: "LFSStore", config: Optional["Config"] = None
  189. ) -> None:
  190. """Initialize LFSFilterDriver."""
  191. self.lfs_store = lfs_store
  192. self.config = config
  193. def clean(self, data: bytes) -> bytes:
  194. """Convert file content to LFS pointer (clean filter)."""
  195. # Check if data is already an LFS pointer
  196. pointer = LFSPointer.from_bytes(data)
  197. if pointer is not None:
  198. return data
  199. # Store the file content in LFS
  200. sha = self.lfs_store.write_object([data])
  201. # Create and return LFS pointer
  202. pointer = LFSPointer(sha, len(data))
  203. return pointer.to_bytes()
  204. def smudge(self, data: bytes, path: bytes = b"") -> bytes:
  205. """Convert LFS pointer to file content (smudge filter)."""
  206. # Try to parse as LFS pointer
  207. pointer = LFSPointer.from_bytes(data)
  208. if pointer is None:
  209. # Not an LFS pointer, return as-is
  210. return data
  211. # Validate the pointer
  212. if not pointer.is_valid_oid():
  213. return data
  214. try:
  215. # Read the actual content from LFS store
  216. with self.lfs_store.open_object(pointer.oid) as f:
  217. return f.read()
  218. except KeyError:
  219. # Object not found in LFS store, try to download it
  220. try:
  221. content = self._download_object(pointer)
  222. return content
  223. except LFSError as e:
  224. # Download failed, fall back to returning pointer
  225. logger.warning("LFS object download failed for %s: %s", pointer.oid, e)
  226. # Return pointer as-is when object is missing and download failed
  227. return data
  228. def _download_object(self, pointer: LFSPointer) -> bytes:
  229. """Download an LFS object from the server.
  230. Args:
  231. pointer: LFS pointer containing OID and size
  232. Returns:
  233. Downloaded content
  234. Raises:
  235. LFSError: If download fails for any reason
  236. """
  237. if self.config is None:
  238. raise LFSError("No configuration available for LFS download")
  239. # Create LFS client and download
  240. client = LFSClient.from_config(self.config)
  241. if client is None:
  242. raise LFSError("No LFS client available from configuration")
  243. content = client.download(pointer.oid, pointer.size)
  244. # Store the downloaded content in local LFS store
  245. stored_oid = self.lfs_store.write_object([content])
  246. # Verify the stored OID matches what we expected
  247. if stored_oid != pointer.oid:
  248. raise LFSError(
  249. f"Downloaded OID mismatch: expected {pointer.oid}, got {stored_oid}"
  250. )
  251. return content
  252. def _get_lfs_user_agent(config: Optional["Config"]) -> str:
  253. """Get User-Agent string for LFS requests, respecting git config."""
  254. try:
  255. if config:
  256. # Use configured user agent verbatim if set
  257. return config.get(b"http", b"useragent").decode()
  258. except KeyError:
  259. pass
  260. # Default LFS user agent (similar to git-lfs format)
  261. from . import __version__
  262. version_str = ".".join([str(x) for x in __version__])
  263. return f"git-lfs/dulwich/{version_str}"
  264. class LFSClient:
  265. """LFS client for network operations."""
  266. def __init__(self, url: str, config: Optional["Config"] = None) -> None:
  267. """Initialize LFS client.
  268. Args:
  269. url: LFS server URL
  270. config: Optional git config for authentication/proxy settings
  271. """
  272. self._base_url = url.rstrip("/") + "/" # Ensure trailing slash for urljoin
  273. self.config = config
  274. self._pool_manager: Optional[urllib3.PoolManager] = None
  275. @classmethod
  276. def from_config(cls, config: "Config") -> Optional["LFSClient"]:
  277. """Create LFS client from git config."""
  278. # Try to get LFS URL from config first
  279. try:
  280. url = config.get((b"lfs",), b"url").decode()
  281. except KeyError:
  282. pass
  283. else:
  284. return cls(url, config)
  285. # Fall back to deriving from remote URL (same as git-lfs)
  286. try:
  287. remote_url = config.get((b"remote", b"origin"), b"url").decode()
  288. except KeyError:
  289. pass
  290. else:
  291. # Convert SSH URLs to HTTPS if needed
  292. if remote_url.startswith("git@"):
  293. # Convert git@host:user/repo.git to https://host/user/repo.git
  294. if ":" in remote_url and "/" in remote_url:
  295. host_and_path = remote_url[4:] # Remove "git@"
  296. if ":" in host_and_path:
  297. host, path = host_and_path.split(":", 1)
  298. remote_url = f"https://{host}/{path}"
  299. # Ensure URL ends with .git for consistent LFS endpoint
  300. if not remote_url.endswith(".git"):
  301. remote_url = f"{remote_url}.git"
  302. # Standard LFS endpoint is remote_url + "/info/lfs"
  303. lfs_url = f"{remote_url}/info/lfs"
  304. parsed = urlparse(lfs_url)
  305. if not parsed.scheme or not parsed.netloc:
  306. return None
  307. return LFSClient(lfs_url, config)
  308. return None
  309. @property
  310. def url(self) -> str:
  311. """Get the LFS server URL without trailing slash."""
  312. return self._base_url.rstrip("/")
  313. def _get_pool_manager(self) -> "urllib3.PoolManager":
  314. """Get urllib3 pool manager with git config applied."""
  315. if self._pool_manager is None:
  316. from dulwich.client import default_urllib3_manager
  317. self._pool_manager = default_urllib3_manager(self.config) # type: ignore[assignment]
  318. return self._pool_manager
  319. def _make_request(
  320. self,
  321. method: str,
  322. path: str,
  323. data: Optional[bytes] = None,
  324. headers: Optional[dict[str, str]] = None,
  325. ) -> bytes:
  326. """Make an HTTP request to the LFS server."""
  327. url = urljoin(self._base_url, path)
  328. req_headers = {
  329. "Accept": "application/vnd.git-lfs+json",
  330. "Content-Type": "application/vnd.git-lfs+json",
  331. "User-Agent": _get_lfs_user_agent(self.config),
  332. }
  333. if headers:
  334. req_headers.update(headers)
  335. # Use urllib3 pool manager with git config applied
  336. pool_manager = self._get_pool_manager()
  337. response = pool_manager.request(method, url, headers=req_headers, body=data)
  338. if response.status >= 400:
  339. raise ValueError(
  340. f"HTTP {response.status}: {response.data.decode('utf-8', errors='ignore')}"
  341. )
  342. return response.data # type: ignore[return-value]
  343. def batch(
  344. self,
  345. operation: str,
  346. objects: list[dict[str, Union[str, int]]],
  347. ref: Optional[str] = None,
  348. ) -> LFSBatchResponse:
  349. """Perform batch operation to get transfer URLs.
  350. Args:
  351. operation: "download" or "upload"
  352. objects: List of {"oid": str, "size": int} dicts
  353. ref: Optional ref name
  354. Returns:
  355. Batch response from server
  356. """
  357. data: dict[
  358. str, Union[str, list[str], list[dict[str, Union[str, int]]], dict[str, str]]
  359. ] = {
  360. "operation": operation,
  361. "transfers": ["basic"],
  362. "objects": objects,
  363. }
  364. if ref:
  365. data["ref"] = {"name": ref}
  366. response = self._make_request(
  367. "POST", "objects/batch", json.dumps(data).encode("utf-8")
  368. )
  369. if not response:
  370. raise ValueError("Empty response from LFS server")
  371. response_data = json.loads(response)
  372. return self._parse_batch_response(response_data)
  373. def _parse_batch_response(self, data: dict) -> LFSBatchResponse:
  374. """Parse JSON response into LFSBatchResponse dataclass."""
  375. objects = []
  376. for obj_data in data.get("objects", []):
  377. actions = None
  378. if "actions" in obj_data:
  379. actions = {}
  380. for action_name, action_data in obj_data["actions"].items():
  381. actions[action_name] = LFSAction(
  382. href=action_data["href"],
  383. header=action_data.get("header"),
  384. expires_at=action_data.get("expires_at"),
  385. )
  386. error = None
  387. if "error" in obj_data:
  388. error = LFSErrorInfo(
  389. code=obj_data["error"]["code"], message=obj_data["error"]["message"]
  390. )
  391. batch_obj = LFSBatchObject(
  392. oid=obj_data["oid"],
  393. size=obj_data["size"],
  394. authenticated=obj_data.get("authenticated"),
  395. actions=actions,
  396. error=error,
  397. )
  398. objects.append(batch_obj)
  399. return LFSBatchResponse(
  400. transfer=data.get("transfer", "basic"),
  401. objects=objects,
  402. hash_algo=data.get("hash_algo"),
  403. )
  404. def download(self, oid: str, size: int, ref: Optional[str] = None) -> bytes:
  405. """Download an LFS object.
  406. Args:
  407. oid: Object ID (SHA256)
  408. size: Expected size
  409. ref: Optional ref name
  410. Returns:
  411. Object content
  412. """
  413. # Get download URL via batch API
  414. batch_resp = self.batch("download", [{"oid": oid, "size": size}], ref)
  415. if not batch_resp.objects:
  416. raise LFSError(f"No objects returned for {oid}")
  417. obj = batch_resp.objects[0]
  418. if obj.error:
  419. raise LFSError(f"Server error for {oid}: {obj.error.message}")
  420. if not obj.actions or "download" not in obj.actions:
  421. raise LFSError(f"No download actions for {oid}")
  422. download_action = obj.actions["download"]
  423. download_url = download_action.href
  424. # Download the object using urllib3 with git config
  425. download_headers = {"User-Agent": _get_lfs_user_agent(self.config)}
  426. if download_action.header:
  427. download_headers.update(download_action.header)
  428. pool_manager = self._get_pool_manager()
  429. response = pool_manager.request("GET", download_url, headers=download_headers)
  430. content = response.data
  431. # Verify size
  432. if len(content) != size:
  433. raise LFSError(f"Downloaded size {len(content)} != expected {size}")
  434. # Verify SHA256
  435. actual_oid = hashlib.sha256(content).hexdigest()
  436. if actual_oid != oid:
  437. raise LFSError(f"Downloaded OID {actual_oid} != expected {oid}")
  438. return content # type: ignore[return-value]
  439. def upload(
  440. self, oid: str, size: int, content: bytes, ref: Optional[str] = None
  441. ) -> None:
  442. """Upload an LFS object.
  443. Args:
  444. oid: Object ID (SHA256)
  445. size: Object size
  446. content: Object content
  447. ref: Optional ref name
  448. """
  449. # Get upload URL via batch API
  450. batch_resp = self.batch("upload", [{"oid": oid, "size": size}], ref)
  451. if not batch_resp.objects:
  452. raise LFSError(f"No objects returned for {oid}")
  453. obj = batch_resp.objects[0]
  454. if obj.error:
  455. raise LFSError(f"Server error for {oid}: {obj.error.message}")
  456. # If no actions, object already exists
  457. if not obj.actions:
  458. return
  459. if "upload" not in obj.actions:
  460. raise LFSError(f"No upload action for {oid}")
  461. upload_action = obj.actions["upload"]
  462. upload_url = upload_action.href
  463. # Upload the object
  464. req = Request(upload_url, data=content, method="PUT")
  465. if upload_action.header:
  466. for name, value in upload_action.header.items():
  467. req.add_header(name, value)
  468. with urlopen(req) as response:
  469. if response.status >= 400:
  470. raise LFSError(f"Upload failed with status {response.status}")
  471. # Verify if needed
  472. if obj.actions and "verify" in obj.actions:
  473. verify_action = obj.actions["verify"]
  474. verify_data = json.dumps({"oid": oid, "size": size}).encode("utf-8")
  475. req = Request(verify_action.href, data=verify_data, method="POST")
  476. req.add_header("Content-Type", "application/vnd.git-lfs+json")
  477. if verify_action.header:
  478. for name, value in verify_action.header.items():
  479. req.add_header(name, value)
  480. with urlopen(req) as response:
  481. if response.status >= 400:
  482. raise LFSError(f"Verification failed with status {response.status}")
  483. class LFSError(Exception):
  484. """LFS-specific error."""