2
0
Jelmer Vernooij 2 сар өмнө
parent
commit
baa23738f8

+ 3 - 51
dulwich/index.py

@@ -48,6 +48,7 @@ from .objects import (
     sha_to_hex,
 )
 from .pack import ObjectContainer, SHA1Reader, SHA1Writer
+from .varint import decode_varint, encode_varint
 
 # 2-bit stage (during merge)
 FLAG_STAGEMASK = 0x3000
@@ -76,55 +77,6 @@ EOIE_EXTENSION = b"EOIE"
 IEOT_EXTENSION = b"IEOT"
 
 
-def _encode_varint(value: int) -> bytes:
-    """Encode an integer using variable-width encoding.
-
-    Same format as used for OFS_DELTA pack entries and index v4 path compression.
-    Uses 7 bits per byte, with the high bit indicating continuation.
-
-    Args:
-      value: Integer to encode
-    Returns:
-      Encoded bytes
-    """
-    if value == 0:
-        return b"\x00"
-
-    result = []
-    while value > 0:
-        byte = value & 0x7F  # Take lower 7 bits
-        value >>= 7
-        if value > 0:
-            byte |= 0x80  # Set continuation bit
-        result.append(byte)
-
-    return bytes(result)
-
-
-def _decode_varint(data: bytes, offset: int = 0) -> tuple[int, int]:
-    """Decode a variable-width encoded integer.
-
-    Args:
-      data: Bytes to decode from
-      offset: Starting offset in data
-    Returns:
-      tuple of (decoded_value, new_offset)
-    """
-    value = 0
-    shift = 0
-    pos = offset
-
-    while pos < len(data):
-        byte = data[pos]
-        pos += 1
-        value |= (byte & 0x7F) << shift
-        shift += 7
-        if not (byte & 0x80):  # No continuation bit
-            break
-
-    return value, pos
-
-
 def _compress_path(path: bytes, previous_path: bytes) -> bytes:
     """Compress a path relative to the previous path for index version 4.
 
@@ -152,7 +104,7 @@ def _compress_path(path: bytes, previous_path: bytes) -> bytes:
     suffix = path[common_len:]
 
     # Encode: varint(remove_len) + suffix + NUL
-    return _encode_varint(remove_len) + suffix + b"\x00"
+    return encode_varint(remove_len) + suffix + b"\x00"
 
 
 def _decompress_path(
@@ -168,7 +120,7 @@ def _decompress_path(
       tuple of (decompressed_path, new_offset)
     """
     # Decode the number of bytes to remove from previous path
-    remove_len, new_offset = _decode_varint(data, offset)
+    remove_len, new_offset = decode_varint(data, offset)
 
     # Find the NUL terminator for the suffix
     suffix_start = new_offset

+ 105 - 0
dulwich/varint.py

@@ -0,0 +1,105 @@
+# varint.py -- Variable-width integer encoding/decoding
+# Copyright (C) 2008-2013 Jelmer Vernooij <jelmer@jelmer.uk>
+#
+# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
+# General Public License as published by the Free Software Foundation; version 2.0
+# or (at your option) any later version. You can redistribute it and/or
+# modify it under the terms of either of these two licenses.
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# You should have received a copy of the licenses; if not, see
+# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
+# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
+# License, Version 2.0.
+#
+
+"""Variable-width integer encoding/decoding.
+
+This format is used in multiple places in Git:
+- Git index file format version 4 for path compression
+- Git pack files for OFS_DELTA entries
+- Git reftable format for various fields
+"""
+
+from typing import BinaryIO, Optional
+
+
+def encode_varint(value: int) -> bytes:
+    """Encode an integer using variable-width encoding.
+
+    Same format as used for OFS_DELTA pack entries and index v4 path compression.
+    Uses 7 bits per byte, with the high bit indicating continuation.
+
+    Args:
+      value: Integer to encode
+    Returns:
+      Encoded bytes
+    """
+    if value == 0:
+        return b"\x00"
+
+    result = []
+    while value > 0:
+        byte = value & 0x7F  # Take lower 7 bits
+        value >>= 7
+        if value > 0:
+            byte |= 0x80  # Set continuation bit
+        result.append(byte)
+
+    return bytes(result)
+
+
+def decode_varint(data: bytes, offset: int = 0) -> tuple[int, int]:
+    """Decode a variable-width encoded integer from bytes.
+
+    Args:
+      data: Bytes to decode from
+      offset: Starting offset in data
+    Returns:
+      tuple of (decoded_value, new_offset)
+    """
+    value = 0
+    shift = 0
+    pos = offset
+
+    while pos < len(data):
+        byte = data[pos]
+        pos += 1
+        value |= (byte & 0x7F) << shift
+        shift += 7
+        if not (byte & 0x80):  # No continuation bit
+            break
+
+    return value, pos
+
+
+def decode_varint_from_stream(stream: BinaryIO) -> Optional[int]:
+    """Decode a variable-width encoded integer from a stream.
+
+    Args:
+      stream: Stream to read from
+    Returns:
+      Decoded integer, or None if end of stream
+    """
+    value = 0
+    shift = 0
+
+    while True:
+        byte_data = stream.read(1)
+        if not byte_data:
+            return None  # End of stream
+
+        byte = byte_data[0]
+        value |= (byte & 0x7F) << shift
+        shift += 7
+
+        if not (byte & 0x80):  # No continuation bit
+            break
+
+    return value

+ 1 - 0
tests/__init__.py

@@ -153,6 +153,7 @@ def self_test_suite():
         "stash",
         "submodule",
         "utils",
+        "varint",
         "walk",
         "web",
     ]

+ 3 - 3
tests/test_index.py

@@ -1430,13 +1430,13 @@ class TestPathPrefixCompression(TestCase):
 
     def test_varint_encoding_decoding(self):
         """Test variable-width integer encoding and decoding."""
-        from dulwich.index import _decode_varint, _encode_varint
+        from dulwich.varint import decode_varint, encode_varint
 
         test_values = [0, 1, 127, 128, 255, 256, 16383, 16384, 65535, 65536]
 
         for value in test_values:
-            encoded = _encode_varint(value)
-            decoded, _ = _decode_varint(encoded, 0)
+            encoded = encode_varint(value)
+            decoded, _ = decode_varint(encoded, 0)
             self.assertEqual(value, decoded, f"Failed for value {value}")
 
     def test_path_compression_simple(self):

+ 120 - 0
tests/test_varint.py

@@ -0,0 +1,120 @@
+"""Tests for variable-width integer encoding/decoding."""
+
+import unittest
+from io import BytesIO
+
+from dulwich.varint import (
+    decode_varint,
+    decode_varint_from_stream,
+    encode_varint,
+)
+
+
+class TestVarint(unittest.TestCase):
+    """Test variable-width integer encoding and decoding."""
+
+    def test_encode_decode_basic(self):
+        """Test basic varint encoding/decoding."""
+        test_values = [0, 1, 127, 128, 255, 256, 16383, 16384, 65535, 65536]
+
+        for value in test_values:
+            encoded = encode_varint(value)
+            decoded, new_offset = decode_varint(encoded, 0)
+            self.assertEqual(value, decoded, f"Failed for value {value}")
+            self.assertEqual(
+                len(encoded), new_offset, f"Offset mismatch for value {value}"
+            )
+
+    def test_encode_decode_stream(self):
+        """Test varint encoding/decoding with streams."""
+        test_values = [0, 1, 127, 128, 255, 256, 16383, 16384, 65535, 65536]
+
+        for value in test_values:
+            encoded = encode_varint(value)
+            stream = BytesIO(encoded)
+            decoded = decode_varint_from_stream(stream)
+            self.assertEqual(
+                value, decoded, f"Failed for stream decode of value {value}"
+            )
+
+    def test_multiple_varints(self):
+        """Test encoding/decoding multiple varints in sequence."""
+        values = [42, 127, 128, 1000, 0]
+
+        # Encode all values
+        encoded_data = b""
+        for value in values:
+            encoded_data += encode_varint(value)
+
+        # Decode all values using byte offset
+        offset = 0
+        decoded_values = []
+        for _ in values:
+            decoded, offset = decode_varint(encoded_data, offset)
+            decoded_values.append(decoded)
+
+        self.assertEqual(values, decoded_values)
+
+        # Decode all values using stream
+        stream = BytesIO(encoded_data)
+        decoded_values_stream = []
+        for _ in values:
+            decoded = decode_varint_from_stream(stream)
+            self.assertIsNotNone(decoded)
+            decoded_values_stream.append(decoded)
+
+        self.assertEqual(values, decoded_values_stream)
+
+    def test_stream_end_of_data(self):
+        """Test that stream decode returns None at end of data."""
+        stream = BytesIO(b"")
+        result = decode_varint_from_stream(stream)
+        self.assertIsNone(result)
+
+        # Test with some data followed by end
+        stream = BytesIO(encode_varint(42))
+        result1 = decode_varint_from_stream(stream)
+        self.assertEqual(42, result1)
+
+        result2 = decode_varint_from_stream(stream)
+        self.assertIsNone(result2)
+
+    def test_specific_encoding_values(self):
+        """Test specific encoding patterns."""
+        # Single byte values (0-127)
+        for i in range(128):
+            encoded = encode_varint(i)
+            self.assertEqual(1, len(encoded))
+            self.assertEqual(i, encoded[0])
+
+        # Two byte values (128-16383)
+        encoded_128 = encode_varint(128)
+        self.assertEqual(b"\x80\x01", encoded_128)
+
+        encoded_255 = encode_varint(255)
+        self.assertEqual(b"\xff\x01", encoded_255)
+
+        encoded_256 = encode_varint(256)
+        self.assertEqual(b"\x80\x02", encoded_256)
+
+    def test_large_values(self):
+        """Test encoding/decoding of large values."""
+        large_values = [
+            (1 << 7) - 1,  # 127
+            (1 << 7),  # 128
+            (1 << 14) - 1,  # 16383
+            (1 << 14),  # 16384
+            (1 << 21) - 1,  # 2097151
+            (1 << 21),  # 2097152
+            (1 << 28) - 1,  # 268435455
+            (1 << 28),  # 268435456
+        ]
+
+        for value in large_values:
+            encoded = encode_varint(value)
+            decoded, _ = decode_varint(encoded, 0)
+            self.assertEqual(value, decoded, f"Failed for large value {value}")
+
+
+if __name__ == "__main__":
+    unittest.main()