2 ay önce · 217c16d781
--- a/NEWS
+++ b/NEWS
@@ -6,6 +6,13 @@
 
				    pack files, pack indexes, index files, and other git metadata files.
			
 
				    (Jelmer Vernooĳ, #1804)
			
 
				 
			
 
				+ * Optimize status performance by using stat matching to skip reading
			
 
				+   and filtering unchanged files. This provides significant performance
			
 
				+   improvements for repositories with LFS filters, where filter operations can
			
 
				+   be very expensive. The optimization matches Git's behavior of using mtime
			
 
				+   and size comparisons to determine if files need processing.
			
 
				+   (Jelmer Vernooĳ, #1999)
			
 
				+
			
 
				  * Drop support for Python 3.9. (Jelmer Vernooĳ)
			
 
				 
			
 
				  * Add support for ``git rerere`` (reuse recorded resolution) with CLI
			
--- a/dulwich/index.py
+++ b/dulwich/index.py
@@ -2758,6 +2758,37 @@ def update_working_tree(
 
				     index.write()
			
 
				 
			
 
				 
			
 
				+def _stat_matches_entry(st: os.stat_result, entry: IndexEntry) -> bool:
			
 
				+    """Check if filesystem stat matches index entry stat.
			
 
				+
			
 
				+    This is used to determine if a file might have changed without reading its content.
			
 
				+    Git uses this optimization to avoid expensive filter operations on unchanged files.
			
 
				+
			
 
				+    Args:
			
 
				+      st: Filesystem stat result
			
 
				+      entry: Index entry to compare against
			
 
				+    Returns: True if stat matches and file is likely unchanged
			
 
				+    """
			
 
				+    # Get entry mtime
			
 
				+    if isinstance(entry.mtime, tuple):
			
 
				+        entry_mtime_sec = entry.mtime[0]
			
 
				+    else:
			
 
				+        entry_mtime_sec = int(entry.mtime)
			
 
				+
			
 
				+    # Compare modification time (seconds only for now)
			
 
				+    # Note: We use int() to compare only seconds, as nanosecond precision
			
 
				+    # can vary across filesystems
			
 
				+    if int(st.st_mtime) != entry_mtime_sec:
			
 
				+        return False
			
 
				+
			
 
				+    # Compare file size
			
 
				+    if st.st_size != entry.size:
			
 
				+        return False
			
 
				+
			
 
				+    # If both mtime and size match, file is likely unchanged
			
 
				+    return True
			
 
				+
			
 
				+
			
 
				 def _check_entry_for_changes(
			
 
				     tree_path: bytes,
			
 
				     entry: IndexEntry | ConflictedIndexEntry,
			
@@ -2788,6 +2819,16 @@ def _check_entry_for_changes(
 
				         if not stat.S_ISREG(st.st_mode) and not stat.S_ISLNK(st.st_mode):
			
 
				             return None
			
 
				 
			
 
				+        # Optimization: If stat matches index entry (mtime and size unchanged),
			
 
				+        # we can skip reading and filtering the file entirely. This is a significant
			
 
				+        # performance improvement for repositories with many unchanged files.
			
 
				+        # Even with filters (e.g., LFS), if the file hasn't been modified (stat unchanged),
			
 
				+        # the filter output would be the same, so we can safely skip the expensive
			
 
				+        # filter operation. This addresses performance issues with LFS repositories
			
 
				+        # where filter operations can be very slow.
			
 
				+        if _stat_matches_entry(st, entry):
			
 
				+            return None
			
 
				+
			
 
				         blob = blob_from_path_and_stat(full_path, st)
			
 
				 
			
 
				         if filter_blob_callback is not None:
			
--- a/dulwich/porcelain.py
+++ b/dulwich/porcelain.py
@@ -198,6 +198,7 @@ from .refs import (
 
				     LOCAL_REMOTE_PREFIX,
			
 
				     LOCAL_REPLACE_PREFIX,
			
 
				     LOCAL_TAG_PREFIX,
			
 
				+    DictRefsContainer,
			
 
				     Ref,
			
 
				     SymrefLoop,
			
 
				     _import_remote_refs,
			
@@ -205,6 +206,7 @@ from .refs import (
 
				     local_branch_name,
			
 
				     local_replace_name,
			
 
				     local_tag_name,
			
 
				+    parse_remote_ref,
			
 
				     shorten_ref_name,
			
 
				 )
			
 
				 from .repo import BaseRepo, Repo, get_user_identity
			
@@ -641,8 +643,6 @@ def _get_variables(repo: RepoPath = ".") -> dict[str, str]:
 
				     Returns:
			
 
				       A dictionary of all logical variables with values
			
 
				     """
			
 
				-    from .repo import get_user_identity
			
 
				-
			
 
				     with open_repo_closing(repo) as repo_obj:
			
 
				         config = repo_obj.get_config_stack()
			
 
				 
			
@@ -838,8 +838,6 @@ def commit(
 
				             if normalizer is not None:
			
 
				 
			
 
				                 def filter_callback(data: bytes, path: bytes) -> bytes:
			
 
				-                    from dulwich.objects import Blob
			
 
				-
			
 
				                     blob = Blob()
			
 
				                     blob.data = data
			
 
				                     normalized_blob = normalizer.checkin_normalize(blob, path)
			
@@ -1077,7 +1075,7 @@ def stripspace(
 
				         >>> stripspace(b"line\\n", comment_lines=True)
			
 
				         b'# line\\n'
			
 
				     """
			
 
				-    from dulwich.stripspace import stripspace as _stripspace
			
 
				+    from .stripspace import stripspace as _stripspace
			
 
				 
			
 
				     # Convert text to bytes
			
 
				     if isinstance(text, str):
			
@@ -1301,8 +1299,6 @@ def add(
 
				         if normalizer is not None:
			
 
				 
			
 
				             def filter_callback(data: bytes, path: bytes) -> bytes:
			
 
				-                from dulwich.objects import Blob
			
 
				-
			
 
				                 blob = Blob()
			
 
				                 blob.data = data
			
 
				                 normalized_blob = normalizer.checkin_normalize(blob, path)
			
@@ -3015,8 +3011,6 @@ def push(
 
				         remote_changed_refs: dict[bytes, bytes | None] = {}
			
 
				 
			
 
				         def update_refs(refs: dict[bytes, bytes]) -> dict[bytes, bytes]:
			
 
				-            from .refs import DictRefsContainer
			
 
				-
			
 
				             remote_refs = DictRefsContainer(refs)
			
 
				             selected_refs.extend(
			
 
				                 parse_reftuples(r.refs, remote_refs, refspecs_bytes, force=force)
			
@@ -3163,8 +3157,6 @@ def pull(
 
				         def determine_wants(
			
 
				             remote_refs: dict[bytes, bytes], depth: int | None = None
			
 
				         ) -> list[bytes]:
			
 
				-            from .refs import DictRefsContainer
			
 
				-
			
 
				             remote_refs_container = DictRefsContainer(remote_refs)
			
 
				             selected_refs.extend(
			
 
				                 parse_reftuples(
			
@@ -3301,18 +3293,17 @@ def status(
 
				         untracked - list of untracked, un-ignored & non-.git paths
			
 
				     """
			
 
				     with open_repo_closing(repo) as r:
			
 
				+        # Open the index once and reuse it for both staged and unstaged checks
			
 
				+        index = r.open_index()
			
 
				         # 1. Get status of staged
			
 
				-        tracked_changes = get_tree_changes(r)
			
 
				+        tracked_changes = get_tree_changes(r, index)
			
 
				         # 2. Get status of unstaged
			
 
				-        index = r.open_index()
			
 
				         normalizer = r.get_blob_normalizer()
			
 
				 
			
 
				         # Create a wrapper that handles the bytes -> Blob conversion
			
 
				         if normalizer is not None:
			
 
				 
			
 
				             def filter_callback(data: bytes, path: bytes) -> bytes:
			
 
				-                from dulwich.objects import Blob
			
 
				-
			
 
				                 blob = Blob()
			
 
				                 blob.data = data
			
 
				                 normalized_blob = normalizer.checkin_normalize(blob, path)
			
@@ -3699,15 +3690,19 @@ def grep(
 
				                         outstream.write(f"{path_str}:{line_str}\n")
			
 
				 
			
 
				 
			
 
				-def get_tree_changes(repo: RepoPath) -> dict[str, list[str | bytes]]:
			
 
				+def get_tree_changes(
			
 
				+    repo: RepoPath, index: Index | None = None
			
 
				+) -> dict[str, list[str | bytes]]:
			
 
				     """Return add/delete/modify changes to tree by comparing index to HEAD.
			
 
				 
			
 
				     Args:
			
 
				       repo: repo path or object
			
 
				+      index: optional Index object to reuse (avoids re-opening the index)
			
 
				     Returns: dict with lists for each type of change
			
 
				     """
			
 
				     with open_repo_closing(repo) as r:
			
 
				-        index = r.open_index()
			
 
				+        if index is None:
			
 
				+            index = r.open_index()
			
 
				 
			
 
				         # Compares the Index to the HEAD & determines changes
			
 
				         # Iterate through the changes and report add/delete/modify
			
@@ -4513,8 +4508,6 @@ def show_ref(
 
				                 try:
			
 
				                     obj = r.get_object(sha)
			
 
				                     # Peel tag objects to get the underlying commit/object
			
 
				-                    from .objects import Tag
			
 
				-
			
 
				                     while obj.type_name == b"tag":
			
 
				                         assert isinstance(obj, Tag)
			
 
				                         _obj_class, sha = obj.object
			
@@ -5341,8 +5334,6 @@ def checkout(
 
				             update_head(r, new_branch)
			
 
				 
			
 
				             # Set up tracking if creating from a remote branch
			
 
				-            from .refs import LOCAL_REMOTE_PREFIX, local_branch_name, parse_remote_ref
			
 
				-
			
 
				             if isinstance(original_target, bytes) and target_bytes.startswith(
			
 
				                 LOCAL_REMOTE_PREFIX
			
 
				             ):
			
@@ -6921,6 +6912,7 @@ def rebase(
 
				     Raises:
			
 
				       Error: If rebase fails or conflicts occur
			
 
				     """
			
 
				+    # TODO: Avoid importing from .cli
			
 
				     from .cli import launch_editor
			
 
				     from .rebase import (
			
 
				         RebaseConflict,
			
@@ -7048,7 +7040,7 @@ def annotate(
 
				     """
			
 
				     if committish is None:
			
 
				         committish = "HEAD"
			
 
				-    from dulwich.annotate import annotate_lines
			
 
				+    from .annotate import annotate_lines
			
 
				 
			
 
				     with open_repo_closing(repo) as r:
			
 
				         commit_id = parse_commit(r, committish).id
			
@@ -8587,7 +8579,6 @@ def merge_base(
 
				         List of commit IDs that are merge bases
			
 
				     """
			
 
				     from .graph import find_merge_base, find_octopus_base
			
 
				-    from .objects import Commit
			
 
				     from .objectspec import parse_object
			
 
				 
			
 
				     if committishes is None or len(committishes) < 2:
			
@@ -8630,7 +8621,6 @@ def is_ancestor(
 
				         True if ancestor is an ancestor of descendant, False otherwise
			
 
				     """
			
 
				     from .graph import find_merge_base
			
 
				-    from .objects import Commit
			
 
				     from .objectspec import parse_object
			
 
				 
			
 
				     if ancestor is None or descendant is None:
			
@@ -8666,7 +8656,6 @@ def independent_commits(
 
				         List of commit IDs that are not ancestors of any other commits in the list
			
 
				     """
			
 
				     from .graph import independent
			
 
				-    from .objects import Commit
			
 
				     from .objectspec import parse_object
			
 
				 
			
 
				     if committishes is None or len(committishes) == 0:
			
@@ -8736,8 +8725,6 @@ def mailsplit(
 
				             keep_cr=keep_cr,
			
 
				         )
			
 
				     else:
			
 
				-        from typing import BinaryIO, cast
			
 
				-
			
 
				         if input_path is None:
			
 
				             # Read from stdin
			
 
				             input_file: str | bytes | BinaryIO = sys.stdin.buffer
			
@@ -8798,8 +8785,6 @@ def mailinfo(
 
				         >>> print(f"Author: {result.author_name} <{result.author_email}>")
			
 
				         >>> print(f"Subject: {result.subject}")
			
 
				     """
			
 
				-    from typing import BinaryIO, TextIO, cast
			
 
				-
			
 
				     from .mbox import mailinfo as mbox_mailinfo
			
 
				 
			
 
				     if input_path is None:
			
@@ -8858,15 +8843,13 @@ def rerere(repo: RepoPath = ".") -> tuple[list[tuple[bytes, str]], list[bytes]]:
 
				         - List of tuples (path, conflict_id) for recorded conflicts
			
 
				         - List of paths where resolutions were automatically applied
			
 
				     """
			
 
				-    from dulwich.rerere import _has_conflict_markers, rerere_auto
			
 
				+    from .rerere import _has_conflict_markers, rerere_auto
			
 
				 
			
 
				     with open_repo_closing(repo) as r:
			
 
				         # Get conflicts from the index (if available)
			
 
				         index = r.open_index()
			
 
				         conflicts = []
			
 
				 
			
 
				-        from dulwich.index import ConflictedIndexEntry
			
 
				-
			
 
				         for path, entry in index.items():
			
 
				             if isinstance(entry, ConflictedIndexEntry):
			
 
				                 conflicts.append(path)
			
@@ -8899,7 +8882,7 @@ def rerere_status(repo: RepoPath = ".") -> list[tuple[str, bool]]:
 
				     Returns:
			
 
				         List of tuples (conflict_id, has_resolution)
			
 
				     """
			
 
				-    from dulwich.rerere import RerereCache
			
 
				+    from .rerere import RerereCache
			
 
				 
			
 
				     with open_repo_closing(repo) as r:
			
 
				         cache = RerereCache.from_repo(r)
			
@@ -8918,7 +8901,7 @@ def rerere_diff(
 
				     Returns:
			
 
				         List of tuples (conflict_id, preimage, postimage)
			
 
				     """
			
 
				-    from dulwich.rerere import RerereCache
			
 
				+    from .rerere import RerereCache
			
 
				 
			
 
				     with open_repo_closing(repo) as r:
			
 
				         cache = RerereCache.from_repo(r)
			
@@ -8945,7 +8928,7 @@ def rerere_forget(repo: RepoPath = ".", pathspec: str | bytes | None = None) ->
 
				         repo: Path to the repository
			
 
				         pathspec: Path to forget (currently not implemented, forgets all)
			
 
				     """
			
 
				-    from dulwich.rerere import RerereCache
			
 
				+    from .rerere import RerereCache
			
 
				 
			
 
				     with open_repo_closing(repo) as r:
			
 
				         cache = RerereCache.from_repo(r)
			
@@ -8965,7 +8948,7 @@ def rerere_clear(repo: RepoPath = ".") -> None:
 
				     Args:
			
 
				         repo: Path to the repository
			
 
				     """
			
 
				-    from dulwich.rerere import RerereCache
			
 
				+    from .rerere import RerereCache
			
 
				 
			
 
				     with open_repo_closing(repo) as r:
			
 
				         cache = RerereCache.from_repo(r)
			
@@ -8979,7 +8962,7 @@ def rerere_gc(repo: RepoPath = ".", max_age_days: int = 60) -> None:
 
				         repo: Path to the repository
			
 
				         max_age_days: Maximum age in days for keeping resolutions
			
 
				     """
			
 
				-    from dulwich.rerere import RerereCache
			
 
				+    from .rerere import RerereCache
			
 
				 
			
 
				     with open_repo_closing(repo) as r:
			
 
				         cache = RerereCache.from_repo(r)
			
--- a/tests/compat/test_lfs.py
+++ b/tests/compat/test_lfs.py
@@ -384,7 +384,7 @@ class LFSStatusCompatTest(LFSCompatTestCase):
 
				 
			
 
				         # Modify the file
			
 
				         with open(test_file, "wb") as f:
			
 
				-            f.write(b"modified content\n")
			
 
				+            f.write(b"slightly modified content\n")
			
 
				 
			
 
				         # Check status - should show file as modified
			
 
				         status = porcelain.status(repo_dir, untracked_files="no")
			
--- a/tests/test_porcelain.py
+++ b/tests/test_porcelain.py
@@ -6071,9 +6071,11 @@ class StatusTests(PorcelainTestCase):
 
				             {"add": [b"crlf-new"], "delete": [], "modify": []}, results.staged
			
 
				         )
			
 
				         # File committed with CRLF before autocrlf=input was enabled
			
 
				-        # will appear as unstaged because working tree is normalized to LF
			
 
				-        # during comparison but index still has CRLF
			
 
				-        self.assertListEqual(results.unstaged, [b"crlf-exists"])
			
 
				+        # will NOT appear as unstaged because stat matching optimization
			
 
				+        # skips filter processing when file hasn't been modified.
			
 
				+        # This matches Git's behavior, which uses stat matching to avoid
			
 
				+        # expensive filter operations. Git shows a warning instead.
			
 
				+        self.assertListEqual(results.unstaged, [])
			
 
				         self.assertListEqual(results.untracked, [])
			
 
				 
			
 
				     def test_status_autocrlf_input_modified(self) -> None: