
Add support for core.preloadIndex configuration setting

This setting enables parallel stat operations when checking for unstaged
changes, improving performance on slow filesystems like NFS.

When core.preloadIndex is enabled, get_unstaged_changes() uses a thread
pool to parallelize filesystem operations. The implementation:
- Uses ThreadPoolExecutor with up to 8 workers (or CPU count if smaller)
- Falls back to serial processing if threading is unavailable
- Maintains compatibility by defaulting to serial processing

The setting is respected in both porcelain.status() and porcelain.add()
operations.
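
A minimal sketch of enabling the setting through dulwich's config API
(repository path is a placeholder):

    from dulwich import porcelain
    from dulwich.repo import Repo

    repo = Repo("/path/to/repo")

    # Turn on core.preloadIndex in the repository config; status() and
    # add() read it from the config stack and pass preload_index=True
    # down to get_unstaged_changes().
    config = repo.get_config()
    config.set(b"core", b"preloadIndex", b"true")
    config.write_to_path()

    results = porcelain.status(repo)
    print(results.unstaged)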
Jelmer Vernooij, 1 month ago
commit ebeb83a3fd
5 files changed, 193 insertions(+), 28 deletions(-)
  1. NEWS                      +4   -0
  2. dulwich/index.py          +88  -26
  3. dulwich/porcelain.py      +16  -2
  4. tests/test_index.py       +45  -0
  5. tests/test_porcelain.py   +40  -0

+ 4 - 0
NEWS

@@ -2,6 +2,10 @@
 
  * Add ``reflog`` command in porcelain. (Jelmer Vernooij)
 
+ * Add support for ``core.preloadIndex`` configuration setting to enable
+   parallel stat operations when checking for unstaged changes. This improves
+   performance on slow filesystems like NFS. (Jelmer Vernooij)
+
 0.23.2	2025-07-07
 
  * Print deprecations on usage, not import.

+ 88 - 26
dulwich/index.py

@@ -1999,50 +1999,112 @@ def update_working_tree(
     index.write()
 
 
+def _check_entry_for_changes(
+    tree_path: bytes,
+    entry: Union[IndexEntry, ConflictedIndexEntry],
+    root_path: bytes,
+    filter_blob_callback: Optional[Callable] = None,
+) -> Optional[bytes]:
+    """Check a single index entry for changes.
+
+    Args:
+      tree_path: Path in the tree
+      entry: Index entry to check
+      root_path: Root filesystem path
+      filter_blob_callback: Optional callback to filter blobs
+    Returns: tree_path if changed, None otherwise
+    """
+    if isinstance(entry, ConflictedIndexEntry):
+        # Conflicted files are always unstaged
+        return tree_path
+
+    full_path = _tree_to_fs_path(root_path, tree_path)
+    try:
+        st = os.lstat(full_path)
+        if stat.S_ISDIR(st.st_mode):
+            if _has_directory_changed(tree_path, entry):
+                return tree_path
+            return None
+
+        if not stat.S_ISREG(st.st_mode) and not stat.S_ISLNK(st.st_mode):
+            return None
+
+        blob = blob_from_path_and_stat(full_path, st)
+
+        if filter_blob_callback is not None:
+            blob = filter_blob_callback(blob, tree_path)
+    except FileNotFoundError:
+        # The file was removed, so we assume that counts as
+        # different from whatever file used to exist.
+        return tree_path
+    else:
+        if blob.id != entry.sha:
+            return tree_path
+    return None
+
+
 def get_unstaged_changes(
     index: Index,
     root_path: Union[str, bytes],
     filter_blob_callback: Optional[Callable] = None,
+    preload_index: bool = False,
 ) -> Generator[bytes, None, None]:
     """Walk through an index and check for differences against working tree.
 
     Args:
       index: index to check
       root_path: path in which to find files
+      filter_blob_callback: Optional callback to filter blobs
+      preload_index: If True, use parallel threads to check files (requires threading support)
     Returns: iterator over paths with unstaged changes
     """
     # For each entry in the index check the sha1 & ensure not staged
     if not isinstance(root_path, bytes):
         root_path = os.fsencode(root_path)
 
-    for tree_path, entry in index.iteritems():
-        full_path = _tree_to_fs_path(root_path, tree_path)
-        if isinstance(entry, ConflictedIndexEntry):
-            # Conflicted files are always unstaged
-            yield tree_path
-            continue
-
+    if preload_index:
+        # Use parallel processing for better performance on slow filesystems
         try:
-            st = os.lstat(full_path)
-            if stat.S_ISDIR(st.st_mode):
-                if _has_directory_changed(tree_path, entry):
-                    yield tree_path
-                continue
-
-            if not stat.S_ISREG(st.st_mode) and not stat.S_ISLNK(st.st_mode):
-                continue
-
-            blob = blob_from_path_and_stat(full_path, st)
-
-            if filter_blob_callback is not None:
-                blob = filter_blob_callback(blob, tree_path)
-        except FileNotFoundError:
-            # The file was removed, so we assume that counts as
-            # different from whatever file used to exist.
-            yield tree_path
+            import multiprocessing
+            from concurrent.futures import ThreadPoolExecutor
+        except ImportError:
+            # If threading is not available, fall back to serial processing
+            preload_index = False
         else:
-            if blob.id != entry.sha:
-                yield tree_path
+            # Collect all entries first
+            entries = list(index.iteritems())
+
+            # Use number of CPUs but cap at 8 threads to avoid overhead
+            num_workers = min(multiprocessing.cpu_count(), 8)
+
+            # Process entries in parallel
+            with ThreadPoolExecutor(max_workers=num_workers) as executor:
+                # Submit all tasks
+                futures = [
+                    executor.submit(
+                        _check_entry_for_changes,
+                        tree_path,
+                        entry,
+                        root_path,
+                        filter_blob_callback,
+                    )
+                    for tree_path, entry in entries
+                ]
+
+                # Collect results in submission order; result() blocks
+                # until each entry's check has completed
+                for future in futures:
+                    result = future.result()
+                    if result is not None:
+                        yield result
+
+    if not preload_index:
+        # Serial processing
+        for tree_path, entry in index.iteritems():
+            result = _check_entry_for_changes(
+                tree_path, entry, root_path, filter_blob_callback
+            )
+            if result is not None:
+                yield result
 
 
 def _tree_to_fs_path(root_path: bytes, tree_path: bytes) -> bytes:
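
For callers that use dulwich.index directly, the new parameter can be passed
explicitly; a small sketch (repository path is a placeholder):

    from dulwich.index import get_unstaged_changes
    from dulwich.repo import Repo

    repo = Repo("/path/to/repo")
    index = repo.open_index()

    # preload_index=True runs the per-entry checks in a thread pool;
    # the set of reported paths is the same as with the serial path.
    for path in get_unstaged_changes(index, repo.path, preload_index=True):
        print(path.decode())

Threads help here because the per-entry work is I/O-bound (an lstat plus
reading and hashing the file contents), so the pool can overlap filesystem
waits even under the GIL.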

+ 16 - 2
dulwich/porcelain.py

@@ -656,7 +656,14 @@ def add(repo: Union[str, os.PathLike, BaseRepo] = ".", paths=None):
         index = r.open_index()
         normalizer = r.get_blob_normalizer()
         filter_callback = normalizer.checkin_normalize
-        all_unstaged_paths = list(get_unstaged_changes(index, r.path, filter_callback))
+
+        # Check if core.preloadIndex is enabled
+        config = r.get_config_stack()
+        preload_index = config.get_boolean(b"core", b"preloadIndex", False)
+
+        all_unstaged_paths = list(
+            get_unstaged_changes(index, r.path, filter_callback, preload_index)
+        )
 
         if not paths:
             # When no paths specified, add all untracked and modified files from repo root
@@ -2058,7 +2065,14 @@ def status(repo=".", ignored=False, untracked_files="normal"):
         index = r.open_index()
         normalizer = r.get_blob_normalizer()
         filter_callback = normalizer.checkin_normalize
-        unstaged_changes = list(get_unstaged_changes(index, r.path, filter_callback))
+
+        # Check if core.preloadIndex is enabled
+        config = r.get_config_stack()
+        preload_index = config.get_boolean(b"core", b"preloadIndex", False)
+
+        unstaged_changes = list(
+            get_unstaged_changes(index, r.path, filter_callback, preload_index)
+        )
 
         untracked_paths = get_untracked_paths(
             r.path,
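
Because porcelain reads the value from the repository's config stack, the
same behaviour can also be enabled with stock git:

    git config core.preloadIndex true

dulwich then picks the setting up the next time porcelain.status() or
porcelain.add() runs.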

+ 45 - 0
tests/test_index.py

@@ -771,6 +771,51 @@ class GetUnstagedChangesTests(TestCase):
 
             self.assertEqual(list(changes), [b"foo1"])
 
+    def test_get_unstaged_changes_with_preload(self) -> None:
+        """Unit test for get_unstaged_changes with preload_index=True."""
+        repo_dir = tempfile.mkdtemp()
+        self.addCleanup(shutil.rmtree, repo_dir)
+        with Repo.init(repo_dir) as repo:
+            # Create multiple files to test parallel processing
+            files = []
+            for i in range(10):
+                filename = f"foo{i}"
+                fullpath = os.path.join(repo_dir, filename)
+                with open(fullpath, "wb") as f:
+                    f.write(b"origstuff" + str(i).encode())
+                files.append(filename)
+
+            repo.stage(files)
+            repo.do_commit(
+                b"test status",
+                author=b"author <email>",
+                committer=b"committer <email>",
+            )
+
+            # Modify some files
+            modified_files = [b"foo1", b"foo3", b"foo5", b"foo7"]
+            for filename in modified_files:
+                fullpath = os.path.join(repo_dir, filename.decode())
+                with open(fullpath, "wb") as f:
+                    f.write(b"newstuff")
+                os.utime(fullpath, (0, 0))
+
+            # Test with preload_index=False (serial)
+            changes_serial = list(
+                get_unstaged_changes(repo.open_index(), repo_dir, preload_index=False)
+            )
+            changes_serial.sort()
+
+            # Test with preload_index=True (parallel)
+            changes_parallel = list(
+                get_unstaged_changes(repo.open_index(), repo_dir, preload_index=True)
+            )
+            changes_parallel.sort()
+
+            # Both should return the same results
+            self.assertEqual(changes_serial, changes_parallel)
+            self.assertEqual(changes_serial, sorted(modified_files))
+
     def test_get_unstaged_deleted_changes(self) -> None:
         """Unit test for get_unstaged_changes."""
         repo_dir = tempfile.mkdtemp()

+ 40 - 0
tests/test_porcelain.py

@@ -4787,6 +4787,46 @@ class StatusTests(PorcelainTestCase):
         self.assertEqual(results.staged["add"][0], filename_add.encode("ascii"))
         self.assertEqual(results.unstaged, [b"foo"])
 
+    def test_status_with_core_preloadindex(self) -> None:
+        """Test status with core.preloadIndex enabled."""
+        # Set core.preloadIndex to true
+        config = self.repo.get_config()
+        config.set(b"core", b"preloadIndex", b"true")
+        config.write_to_path()
+
+        # Create multiple files
+        files = []
+        for i in range(10):
+            filename = f"file{i}"
+            fullpath = os.path.join(self.repo.path, filename)
+            with open(fullpath, "w") as f:
+                f.write(f"content{i}")
+            files.append(fullpath)
+
+        porcelain.add(repo=self.repo.path, paths=files)
+        porcelain.commit(
+            repo=self.repo.path,
+            message=b"test preload status",
+            author=b"author <email>",
+            committer=b"committer <email>",
+        )
+
+        # Modify some files
+        modified_files = ["file1", "file3", "file5", "file7"]
+        for filename in modified_files:
+            fullpath = os.path.join(self.repo.path, filename)
+            with open(fullpath, "w") as f:
+                f.write("modified content")
+            os.utime(fullpath, (0, 0))
+
+        # Status should work correctly with preloadIndex enabled
+        results = porcelain.status(self.repo)
+
+        # Check that we detected the correct unstaged changes
+        unstaged_sorted = sorted(results.unstaged)
+        expected_sorted = sorted([f.encode("ascii") for f in modified_files])
+        self.assertEqual(unstaged_sorted, expected_sorted)
+
     def test_status_all(self) -> None:
         del_path = os.path.join(self.repo.path, "foo")
         mod_path = os.path.join(self.repo.path, "bar")