Jelajahi Sumber

diff_tree: Use hashcodes as block keys instead of strings.

This avoids storing essentially the entire contents of every blob in its
block dict, at the cost of some accuracy on hash conflicts.

Change-Id: I54df3f7e8ac32e7147f9b64d9944a514f7700a4f
Signed-off-by: Jelmer Vernooij <jelmer@samba.org>
Dave Borowitz 14 tahun lalu
induk
melakukan
20ab821ca4
2 mengubah file dengan 18 tambahan dan 9 penghapusan
  1. 13 5
      dulwich/diff_tree.py
  2. 5 4
      dulwich/tests/test_diff_tree.py

+ 13 - 5
dulwich/diff_tree.py

@@ -200,7 +200,7 @@ def _count_blocks(obj):
     Splits the data into blocks either on lines or <=64-byte chunks of lines.
 
     :param obj: The object to count blocks for.
-    :return: A dict of block -> number of occurrences.
+    :return: A dict of block hashcode -> total bytes occurring.
     """
     block_counts = defaultdict(int)
     block = StringIO()
@@ -216,17 +216,25 @@ def _count_blocks(obj):
         block_write(c)
         n += 1
         if c == '\n' or n == _BLOCK_SIZE:
-            block_counts[block_getvalue()] += 1
+            value = block_getvalue()
+            block_counts[hash(value)] += len(value)
             block_seek(0)
             block_truncate()
             n = 0
     if n > 0:
-        block_counts[block_getvalue()] += 1
+        last_block = block_getvalue()
+        block_counts[hash(last_block)] += len(last_block)
     return block_counts
 
 
 def _common_bytes(blocks1, blocks2):
-    """Count the number of common bytes in two block count dicts."""
+    """Count the number of common bytes in two block count dicts.
+
+    :param block1: The first dict of block hashcode -> total bytes.
+    :param block2: The second dict of block hashcode -> total bytes.
+    :return: The number of bytes in common between blocks1 and blocks2. This is
+        only approximate due to possible hash collisions.
+    """
     # Iterate over the smaller of the two dicts, since this is symmetrical.
     if len(blocks1) > len(blocks2):
         blocks1, blocks2 = blocks2, blocks1
@@ -234,7 +242,7 @@ def _common_bytes(blocks1, blocks2):
     for block, count1 in blocks1.iteritems():
         count2 = blocks2.get(block)
         if count2:
-            score += min(count1, count2) * len(block)
+            score += min(count1, count2)
     return score
 
 

+ 5 - 4
dulwich/tests/test_diff_tree.py

@@ -301,21 +301,22 @@ class RenameDetectionTest(DiffTestCase):
 
     def test_count_blocks(self):
         blob = make_object(Blob, data='a\nb\na\n')
-        self.assertEqual({'a\n': 2, 'b\n': 1}, _count_blocks(blob))
+        self.assertEqual({hash('a\n'): 4, hash('b\n'): 2}, _count_blocks(blob))
 
     def test_count_blocks_no_newline(self):
         blob = make_object(Blob, data='a\na')
-        self.assertEqual({'a\n': 1, 'a': 1}, _count_blocks(blob))
+        self.assertEqual({hash('a\n'): 2, hash('a'): 1}, _count_blocks(blob))
 
     def test_count_blocks_chunks(self):
         blob = ShaFile.from_raw_chunks(Blob.type_num, ['a\nb', '\na\n'])
-        self.assertEqual({'a\n': 2, 'b\n': 1}, _count_blocks(blob))
+        self.assertEqual({hash('a\n'): 4, hash('b\n'): 2}, _count_blocks(blob))
 
     def test_count_blocks_long_lines(self):
         a = 'a' * 64
         data = a + 'xxx\ny\n' + a + 'zzz\n'
         blob = make_object(Blob, data=data)
-        self.assertEqual({'a' * 64: 2, 'xxx\n': 1, 'y\n': 1, 'zzz\n': 1},
+        self.assertEqual({hash('a' * 64): 128, hash('xxx\n'): 4, hash('y\n'): 2,
+                          hash('zzz\n'): 4},
                          _count_blocks(blob))
 
     def assertSimilar(self, expected_score, blob1, blob2):