Browse Source

Merge remote-tracking branch 'comet-ml/line-ending-convert-support'

Jelmer Vernooij 6 years ago
parent
commit
9c0ae13b08

+ 1 - 1
dulwich/config.py

@@ -129,7 +129,7 @@ class Config(object):
     def get_boolean(self, section, name, default=None):
         """Retrieve a configuration setting as boolean.
 
-        :param section: Tuple with section name and optional subsection namee
+        :param section: Tuple with section name and optional subsection name
         :param name: Name of the setting, including section and possible
             subsection.
         :return: Contents of the setting

+ 7 - 2
dulwich/index.py

@@ -604,7 +604,7 @@ def read_submodule_head(path):
         return None
 
 
-def get_unstaged_changes(index, root_path):
+def get_unstaged_changes(index, root_path, filter_blob_callback=None):
     """Walk through an index and check for differences against working tree.
 
     :param index: index to check
@@ -618,7 +618,12 @@ def get_unstaged_changes(index, root_path):
     for tree_path, entry in index.iteritems():
         full_path = _tree_to_fs_path(root_path, tree_path)
         try:
-            blob = blob_from_path_and_stat(full_path, os.lstat(full_path))
+            blob = blob_from_path_and_stat(
+                full_path, os.lstat(full_path)
+            )
+
+            if filter_blob_callback is not None:
+                blob = filter_blob_callback(blob, tree_path)
         except OSError as e:
             if e.errno != errno.ENOENT:
                 raise

+ 93 - 0
dulwich/line_ending.py

@@ -126,6 +126,9 @@ Sources:
 - https://adaptivepatchwork.com/2012/03/01/mind-the-end-of-your-line/
 """
 
+from dulwich.objects import Blob
+from dulwich.patch import is_binary
+
 CRLF = b"\r\n"
 LF = b"\n"
 
@@ -150,6 +153,24 @@ def convert_lf_to_crlf(text_hunk):
     return intermediary.replace(LF, CRLF)
 
 
+def get_checkout_filter(core_eol, core_autocrlf, git_attributes):
+    """ Returns the correct checkout filter based on the passed arguments
+    """
+    # TODO this function should process the git_attributes for the path and if
+    # the text attribute is not defined, fall back on the
+    # get_checkout_filter_autocrlf function with the autocrlf value
+    return get_checkout_filter_autocrlf(core_autocrlf)
+
+
+def get_checkin_filter(core_eol, core_autocrlf, git_attributes):
+    """ Returns the correct checkin filter based on the passed arguments
+    """
+    # TODO this function should process the git_attributes for the path and if
+    # the text attribute is not defined, fall back on the
+    # get_checkin_filter_autocrlf function with the autocrlf value
+    return get_checkin_filter_autocrlf(core_autocrlf)
+
+
 def get_checkout_filter_autocrlf(core_autocrlf):
     """ Returns the correct checkout filter based on autocrlf value
 
@@ -179,3 +200,75 @@ def get_checkin_filter_autocrlf(core_autocrlf):
 
     # Checkin filter should never be `convert_lf_to_crlf`
     return None
+
+
+class BlobNormalizer(object):
+    """ An object to store the computed result of which filter to apply based
+    on configuration, gitattributes, path and operation (checkin or checkout)
+    """
+
+    def __init__(self, config_stack, gitattributes):
+        self.config_stack = config_stack
+        self.gitattributes = gitattributes
+
+        # Compute which filters we need based on parameters
+        try:
+            core_eol = config_stack.get("core", "eol")
+        except KeyError:
+            core_eol = "native"
+
+        try:
+            core_autocrlf = config_stack.get("core", "autocrlf").lower()
+        except KeyError:
+            core_autocrlf = False
+
+        self.fallback_read_filter = get_checkout_filter(
+            core_eol, core_autocrlf, self.gitattributes
+        )
+        self.fallback_write_filter = get_checkin_filter(
+            core_eol, core_autocrlf, self.gitattributes
+        )
+
+    def checkin_normalize(self, blob, tree_path):
+        """ Normalize a blob during a checkin operation
+        """
+        if self.fallback_write_filter is not None:
+            return normalize_blob(
+                blob, self.fallback_write_filter, binary_detection=False
+            )
+
+        return blob
+
+    def checkout_normalize(self, blob, tree_path):
+        """ Normalize a blob during a checkout operation
+        """
+        if self.fallback_read_filter is not None:
+            return normalize_blob(
+                blob, self.fallback_read_filter, binary_detection=False
+            )
+
+        return blob
+
+
+def normalize_blob(blob, conversion, binary_detection):
+    """ Takes a blob as input and returns either the original blob, if
+    binary_detection is True and the blob content looks binary, or a new
+    blob with the converted data
+    """
+    # Read the original blob
+    data = blob.data
+
+    # If we need to detect if a file is binary and the file is detected as
+    # binary, do not apply the conversion function and return the original
+    # blob unchanged
+    if binary_detection is True:
+        if is_binary(data):
+            return blob
+
+    # Now apply the conversion
+    converted_data = conversion(data)
+
+    new_blob = Blob()
+    new_blob.data = converted_data
+
+    return new_blob

+ 5 - 1
dulwich/porcelain.py

@@ -882,7 +882,11 @@ def status(repo=".", ignored=False):
         tracked_changes = get_tree_changes(r)
         # 2. Get status of unstaged
         index = r.open_index()
-        unstaged_changes = list(get_unstaged_changes(index, r.path))
+        normalizer = r.get_blob_normalizer()
+        filter_callback = normalizer.checkin_normalize
+        unstaged_changes = list(
+            get_unstaged_changes(index, r.path, filter_callback)
+        )
         ignore_manager = IgnoreFilterManager.from_repo(r)
         untracked_paths = get_untracked_paths(r.path, r.path, index)
         if ignored:

+ 11 - 0
dulwich/repo.py

@@ -72,6 +72,8 @@ from dulwich.hooks import (
     CommitMsgShellHook,
     )
 
+from dulwich.line_ending import BlobNormalizer
+
 from dulwich.refs import (  # noqa: F401
     ANNOTATED_TAG_SUFFIX,
     check_ref_format,
@@ -1020,6 +1022,7 @@ class Repo(BaseRepo):
             _fs_to_tree_path,
             )
         index = self.open_index()
+        blob_normalizer = self.get_blob_normalizer()
         for fs_path in fs_paths:
             if not isinstance(fs_path, bytes):
                 fs_path = fs_path.encode(sys.getfilesystemencoding())
@@ -1040,6 +1043,7 @@ class Repo(BaseRepo):
             else:
                 if not stat.S_ISDIR(st.st_mode):
                     blob = blob_from_path_and_stat(full_path, st)
+                    blob = blob_normalizer.checkin_normalize(blob, fs_path)
                     self.object_store.add_object(blob)
                     index[tree_path] = index_entry_from_stat(st, blob.id, 0)
                 else:
@@ -1261,6 +1265,13 @@ class Repo(BaseRepo):
     def __exit__(self, exc_type, exc_val, exc_tb):
         self.close()
 
+    def get_blob_normalizer(self):
+        """ Return a BlobNormalizer object
+        """
+        # TODO Parse the git attributes files
+        git_attributes = {}
+        return BlobNormalizer(self.get_config_stack(), git_attributes)
+
 
 class MemoryRepo(BaseRepo):
     """Repo that stores refs, objects, and named files in memory.

+ 112 - 0
dulwich/tests/test_line_ending.py

@@ -23,11 +23,13 @@
 """Tests for the line ending conversion."""
 
 from dulwich.line_ending import (
+    normalize_blob,
     convert_crlf_to_lf,
     convert_lf_to_crlf,
     get_checkin_filter_autocrlf,
     get_checkout_filter_autocrlf,
 )
+from dulwich.objects import Blob
 from dulwich.tests import TestCase
 
 
@@ -92,3 +94,113 @@ class GetLineEndingAutocrlfFilters(TestCase):
         checkout_filter = get_checkout_filter_autocrlf(b"input")
 
         self.assertEqual(checkout_filter, None)
+
+
+class NormalizeBlobTestCase(TestCase):
+    def test_normalize_to_lf_no_op(self):
+        base_content = b"line1\nline2"
+        base_sha = "f8be7bb828880727816015d21abcbc37d033f233"
+
+        base_blob = Blob()
+        base_blob.set_raw_string(base_content)
+
+        self.assertEqual(base_blob.as_raw_chunks(), [base_content])
+        self.assertEqual(base_blob.sha().hexdigest(), base_sha)
+
+        filtered_blob = normalize_blob(
+            base_blob, convert_crlf_to_lf, binary_detection=False
+        )
+
+        self.assertEqual(filtered_blob.as_raw_chunks(), [base_content])
+        self.assertEqual(filtered_blob.sha().hexdigest(), base_sha)
+
+    def test_normalize_to_lf(self):
+        base_content = b"line1\r\nline2"
+        base_sha = "3a1bd7a52799fe5cf6411f1d35f4c10bacb1db96"
+
+        base_blob = Blob()
+        base_blob.set_raw_string(base_content)
+
+        self.assertEqual(base_blob.as_raw_chunks(), [base_content])
+        self.assertEqual(base_blob.sha().hexdigest(), base_sha)
+
+        filtered_blob = normalize_blob(
+            base_blob, convert_crlf_to_lf, binary_detection=False
+        )
+
+        normalized_content = b"line1\nline2"
+        normalized_sha = "f8be7bb828880727816015d21abcbc37d033f233"
+
+        self.assertEqual(filtered_blob.as_raw_chunks(), [normalized_content])
+        self.assertEqual(filtered_blob.sha().hexdigest(), normalized_sha)
+
+    def test_normalize_to_lf_binary(self):
+        base_content = b"line1\r\nline2\0"
+        base_sha = "b44504193b765f7cd79673812de8afb55b372ab2"
+
+        base_blob = Blob()
+        base_blob.set_raw_string(base_content)
+
+        self.assertEqual(base_blob.as_raw_chunks(), [base_content])
+        self.assertEqual(base_blob.sha().hexdigest(), base_sha)
+
+        filtered_blob = normalize_blob(
+            base_blob, convert_crlf_to_lf, binary_detection=True
+        )
+
+        self.assertEqual(filtered_blob.as_raw_chunks(), [base_content])
+        self.assertEqual(filtered_blob.sha().hexdigest(), base_sha)
+
+    def test_normalize_to_crlf_no_op(self):
+        base_content = b"line1\r\nline2"
+        base_sha = "3a1bd7a52799fe5cf6411f1d35f4c10bacb1db96"
+
+        base_blob = Blob()
+        base_blob.set_raw_string(base_content)
+
+        self.assertEqual(base_blob.as_raw_chunks(), [base_content])
+        self.assertEqual(base_blob.sha().hexdigest(), base_sha)
+
+        filtered_blob = normalize_blob(
+            base_blob, convert_lf_to_crlf, binary_detection=False
+        )
+
+        self.assertEqual(filtered_blob.as_raw_chunks(), [base_content])
+        self.assertEqual(filtered_blob.sha().hexdigest(), base_sha)
+
+    def test_normalize_to_crlf(self):
+        base_content = b"line1\nline2"
+        base_sha = "f8be7bb828880727816015d21abcbc37d033f233"
+
+        base_blob = Blob()
+        base_blob.set_raw_string(base_content)
+
+        self.assertEqual(base_blob.as_raw_chunks(), [base_content])
+        self.assertEqual(base_blob.sha().hexdigest(), base_sha)
+
+        filtered_blob = normalize_blob(
+            base_blob, convert_lf_to_crlf, binary_detection=False
+        )
+
+        normalized_content = b"line1\r\nline2"
+        normalized_sha = "3a1bd7a52799fe5cf6411f1d35f4c10bacb1db96"
+
+        self.assertEqual(filtered_blob.as_raw_chunks(), [normalized_content])
+        self.assertEqual(filtered_blob.sha().hexdigest(), normalized_sha)
+
+    def test_normalize_to_crlf_binary(self):
+        base_content = b"line1\r\nline2\0"
+        base_sha = "b44504193b765f7cd79673812de8afb55b372ab2"
+
+        base_blob = Blob()
+        base_blob.set_raw_string(base_content)
+
+        self.assertEqual(base_blob.as_raw_chunks(), [base_content])
+        self.assertEqual(base_blob.sha().hexdigest(), base_sha)
+
+        filtered_blob = normalize_blob(
+            base_blob, convert_lf_to_crlf, binary_detection=True
+        )
+
+        self.assertEqual(filtered_blob.as_raw_chunks(), [base_content])
+        self.assertEqual(filtered_blob.sha().hexdigest(), base_sha)

+ 71 - 0
dulwich/tests/test_porcelain.py

@@ -341,6 +341,26 @@ class AddTests(PorcelainTestCase):
             paths=["../foo"])
         self.assertEqual([], list(self.repo.open_index()))
 
+    def test_add_file_clrf_conversion(self):
+        # Set the right configuration to the repo
+        c = self.repo.get_config()
+        c.set("core", "autocrlf", "input")
+        c.write_to_path()
+
+        # Add a file with CRLF line-ending
+        fullpath = os.path.join(self.repo.path, 'foo')
+        with open(fullpath, 'wb') as f:
+            f.write(b"line1\r\nline2")
+        porcelain.add(self.repo.path, paths=[fullpath])
+
+        # The line-endings should have been converted to LF
+        index = self.repo.open_index()
+        self.assertIn(b"foo", index)
+
+        entry = index[b"foo"]
+        blob = self.repo[entry.sha]
+        self.assertEqual(blob.data, b"line1\nline2")
+
 
 class RemoveTests(PorcelainTestCase):
 
@@ -908,6 +928,57 @@ class StatusTests(PorcelainTestCase):
         self.assertListEqual(results.unstaged, [b'blye'])
         self.assertListEqual(results.untracked, ['blyat'])
 
+    def test_status_crlf_mismatch(self):
+        # First make a commit as if the file has been added on a Linux system
+        # or with core.autocrlf=True
+        file_path = os.path.join(self.repo.path, 'crlf')
+        with open(file_path, 'wb') as f:
+            f.write(b'line1\nline2')
+        porcelain.add(repo=self.repo.path, paths=[file_path])
+        porcelain.commit(repo=self.repo.path, message=b'test status',
+                         author=b'author <email>',
+                         committer=b'committer <email>')
+
+        # Then update the file as if it was created by CGit on a Windows
+        # system with core.autocrlf=true
+        with open(file_path, 'wb') as f:
+            f.write(b'line1\r\nline2')
+
+        results = porcelain.status(self.repo)
+        self.assertDictEqual(
+            {'add': [], 'delete': [], 'modify': []},
+            results.staged)
+        self.assertListEqual(results.unstaged, [b'crlf'])
+        self.assertListEqual(results.untracked, [])
+
+    def test_status_crlf_convert(self):
+        # First make a commit as if the file has been added on a Linux system
+        # or with core.autocrlf=True
+        file_path = os.path.join(self.repo.path, 'crlf')
+        with open(file_path, 'wb') as f:
+            f.write(b'line1\nline2')
+        porcelain.add(repo=self.repo.path, paths=[file_path])
+        porcelain.commit(repo=self.repo.path, message=b'test status',
+                         author=b'author <email>',
+                         committer=b'committer <email>')
+
+        # Then update the file as if it was created by CGit on a Windows
+        # system with core.autocrlf=true
+        with open(file_path, 'wb') as f:
+            f.write(b'line1\r\nline2')
+
+        # TODO: It should be set automatically by looking at the configuration
+        c = self.repo.get_config()
+        c.set("core", "autocrlf", True)
+        c.write_to_path()
+
+        results = porcelain.status(self.repo)
+        self.assertDictEqual(
+            {'add': [], 'delete': [], 'modify': []},
+            results.staged)
+        self.assertListEqual(results.unstaged, [])
+        self.assertListEqual(results.untracked, [])
+
     def test_get_tree_changes_add(self):
         """Unit test for get_tree_changes add."""