Browse Source

Fixed #30190 -- Added JSONL serializer.

Ali Vakilzade 4 years ago
parent
commit
e29637681b

+ 1 - 0
AUTHORS

@@ -52,6 +52,7 @@ answer newbie questions, and generally made Django that much better:
     Alex Robbins <alexander.j.robbins@gmail.com>
     Alexey Boriskin <alex@boriskin.me>
     Alexey Tsivunin <most-208@yandex.ru>
+    Ali Vakilzade <ali@vakilzade.com>
     Aljosa Mohorovic <aljosa.mohorovic@gmail.com>
     Amit Chakradeo <https://amit.chakradeo.net/>
     Amit Ramon <amit.ramon@gmail.com>

+ 1 - 0
django/core/serializers/__init__.py

@@ -28,6 +28,7 @@ BUILTIN_SERIALIZERS = {
     "python": "django.core.serializers.python",
     "json": "django.core.serializers.json",
     "yaml": "django.core.serializers.pyyaml",
+    "jsonl": "django.core.serializers.jsonl",
 }
 
 _serializers = {}

+ 57 - 0
django/core/serializers/jsonl.py

@@ -0,0 +1,57 @@
+"""
+Serialize data to/from JSON Lines
+"""
+
+import json
+
+from django.core.serializers.base import DeserializationError
+from django.core.serializers.json import DjangoJSONEncoder
+from django.core.serializers.python import (
+    Deserializer as PythonDeserializer, Serializer as PythonSerializer,
+)
+
+
+class Serializer(PythonSerializer):
+    """Convert a queryset to JSON Lines."""
+    internal_use_only = False
+
+    def _init_options(self):
+        self._current = None
+        self.json_kwargs = self.options.copy()
+        self.json_kwargs.pop('stream', None)
+        self.json_kwargs.pop('fields', None)
+        self.json_kwargs.pop('indent', None)
+        self.json_kwargs['separators'] = (',', ': ')
+        self.json_kwargs.setdefault('cls', DjangoJSONEncoder)
+        self.json_kwargs.setdefault('ensure_ascii', False)
+
+    def start_serialization(self):
+        self._init_options()
+
+    def end_object(self, obj):
+        # self._current has the field data
+        json.dump(self.get_dump_object(obj), self.stream, **self.json_kwargs)
+        self.stream.write("\n")
+        self._current = None
+
+    def getvalue(self):
+        # Grandparent super
+        return super(PythonSerializer, self).getvalue()
+
+
+def Deserializer(stream_or_string, **options):
+    """Deserialize a stream or string of JSON data."""
+    if isinstance(stream_or_string, bytes):
+        stream_or_string = stream_or_string.decode()
+    if isinstance(stream_or_string, (bytes, str)):
+        stream_or_string = stream_or_string.split("\n")
+
+    for line in stream_or_string:
+        if not line.strip():
+            continue
+        try:
+            yield list(PythonDeserializer([json.loads(line), ], **options))[0]
+        except (GeneratorExit, DeserializationError):
+            raise
+        except Exception as exc:
+            raise DeserializationError() from exc

+ 4 - 1
docs/releases/3.2.txt

@@ -215,7 +215,10 @@ Security
 Serialization
 ~~~~~~~~~~~~~
 
-* ...
+* The new :ref:`JSONL <serialization-formats-jsonl>` serializer allows using
+  the JSON Lines format with :djadmin:`dumpdata` and :djadmin:`loaddata`. This
+  can be useful for populating large databases because data is loaded line by
+  line into memory, rather than being loaded all at once.
 
 Signals
 ~~~~~~~

+ 21 - 0
docs/topics/serialization.txt

@@ -160,11 +160,14 @@ Identifier  Information
 
 ``json``    Serializes to and from JSON_.
 
+``jsonl``   Serializes to and from JSONL_.
+
 ``yaml``    Serializes to YAML (YAML Ain't a Markup Language). This
             serializer is only available if PyYAML_ is installed.
 ==========  ==============================================================
 
 .. _json: https://json.org/
+.. _jsonl: https://jsonlines.org/
 .. _PyYAML: https://pyyaml.org/
 
 XML
@@ -307,6 +310,24 @@ The JSON serializer uses ``DjangoJSONEncoder`` for encoding. A subclass of
 
 .. _ecma-262: https://www.ecma-international.org/ecma-262/5.1/#sec-15.9.1.15
 
+.. _serialization-formats-jsonl:
+
+JSONL
+-----
+
+.. versionadded:: 3.2
+
+*JSONL* stands for *JSON Lines*. With this format, objects are separated by
+newlines, and each line contains a valid JSON object. JSONL-serialized data
+looks like this::
+
+    { "pk": "4b678b301dfd8a4e0dad910de3ae245b", "model": "sessions.session", "fields": { ... }}
+    { "pk": "88bea72c02274f3c9bf1cb2bb8cee4fc", "model": "sessions.session", "fields": { ... }}
+    { "pk": "9cf0e26691b64147a67e2a9f06ad7a53", "model": "sessions.session", "fields": { ... }}
+
+JSONL can be useful for populating large databases, since the data can be
+processed line by line, rather than being loaded into memory all at once.
+
 YAML
 ----
 

+ 312 - 0
tests/serializers/test_jsonl.py

@@ -0,0 +1,312 @@
+import decimal
+import json
+import re
+
+from django.core import serializers
+from django.core.serializers.base import DeserializationError
+from django.db import models
+from django.test import TestCase, TransactionTestCase
+from django.test.utils import isolate_apps
+
+from .models import Score
+from .tests import SerializersTestBase, SerializersTransactionTestBase
+
+
+class JsonlSerializerTestCase(SerializersTestBase, TestCase):
+    serializer_name = "jsonl"
+    pkless_str = [
+        """{
+            "pk": null,
+            "model": "serializers.category",
+            "fields": {"name": "Reference"}
+        }""",
+        """{
+            "model": "serializers.category",
+            "fields": {"name": "Non-fiction"}
+        }"""
+    ]
+    pkless_str = "\n".join([s.replace("\n", "") for s in pkless_str])
+
+    mapping_ordering_str = """{
+"model": "serializers.article",
+"pk": %(article_pk)s,
+"fields": {
+"author": %(author_pk)s,
+"headline": "Poker has no place on ESPN",
+"pub_date": "2006-06-16T11:00:00",
+"categories": [
+%(first_category_pk)s,
+%(second_category_pk)s
+],
+"meta_data": []
+}
+}""".replace("\n", "") + "\n"
+
+    @staticmethod
+    def _validate_output(serial_str):
+        try:
+            for line in serial_str.split("\n"):
+                if line:
+                    json.loads(line)
+        except Exception:
+            return False
+        else:
+            return True
+
+    @staticmethod
+    def _get_pk_values(serial_str):
+        serial_list = [json.loads(line) for line in serial_str.split("\n") if line]
+        return [obj_dict['pk'] for obj_dict in serial_list]
+
+    @staticmethod
+    def _get_field_values(serial_str, field_name):
+        serial_list = [json.loads(line) for line in serial_str.split("\n") if line]
+        return [obj_dict['fields'][field_name] for obj_dict in serial_list if field_name in obj_dict['fields']]
+
+    def test_no_indentation(self):
+        s = serializers.jsonl.Serializer()
+        json_data = s.serialize([Score(score=5.0), Score(score=6.0)], indent=2)
+        for line in json_data.splitlines():
+            self.assertIsNone(re.search(r'.+,\s*$', line))
+
+    @isolate_apps('serializers')
+    def test_custom_encoder(self):
+        class ScoreDecimal(models.Model):
+            score = models.DecimalField()
+
+        class CustomJSONEncoder(json.JSONEncoder):
+            def default(self, o):
+                if isinstance(o, decimal.Decimal):
+                    return str(o)
+                return super().default(o)
+
+        s = serializers.jsonl.Serializer()
+        json_data = s.serialize(
+            [ScoreDecimal(score=decimal.Decimal(1.0))], cls=CustomJSONEncoder
+        )
+        self.assertIn('"fields": {"score": "1"}', json_data)
+
+    def test_json_deserializer_exception(self):
+        with self.assertRaises(DeserializationError):
+            for obj in serializers.deserialize("jsonl", """[{"pk":1}"""):
+                pass
+
+    def test_helpful_error_message_invalid_pk(self):
+        """
+        If there is an invalid primary key, the error message should contain
+        the model associated with it.
+        """
+        test_string = """{
+            "pk": "badpk",
+            "model": "serializers.player",
+            "fields": {
+                "name": "Bob",
+                "rank": 1,
+                "team": "Team"
+            }
+        }""".replace("\n", "")
+        with self.assertRaisesMessage(DeserializationError, "(serializers.player:pk=badpk)"):
+            list(serializers.deserialize('jsonl', test_string))
+
+    def test_helpful_error_message_invalid_field(self):
+        """
+        If there is an invalid field value, the error message should contain
+        the model associated with it.
+        """
+        test_string = """{
+            "pk": "1",
+            "model": "serializers.player",
+            "fields": {
+                "name": "Bob",
+                "rank": "invalidint",
+                "team": "Team"
+            }
+        }""".replace("\n", "")
+        expected = "(serializers.player:pk=1) field_value was 'invalidint'"
+        with self.assertRaisesMessage(DeserializationError, expected):
+            list(serializers.deserialize('jsonl', test_string))
+
+    def test_helpful_error_message_for_foreign_keys(self):
+        """
+        Invalid foreign keys with a natural key should throw a helpful error
+        message, such as what the failing key is.
+        """
+        test_string = """{
+            "pk": 1,
+            "model": "serializers.category",
+            "fields": {
+                "name": "Unknown foreign key",
+                "meta_data": [
+                    "doesnotexist",
+                    "metadata"
+                ]
+            }
+        }""".replace("\n", "")
+        key = ["doesnotexist", "metadata"]
+        expected = "(serializers.category:pk=1) field_value was '%r'" % key
+        with self.assertRaisesMessage(DeserializationError, expected):
+            list(serializers.deserialize('jsonl', test_string))
+
+    def test_helpful_error_message_for_many2many_non_natural(self):
+        """
+        Invalid many-to-many keys should throw a helpful error message.
+        """
+        test_strings = [
+            """{
+                "pk": 1,
+                "model": "serializers.article",
+                "fields": {
+                    "author": 1,
+                    "headline": "Unknown many to many",
+                    "pub_date": "2014-09-15T10:35:00",
+                    "categories": [1, "doesnotexist"]
+                }
+            }""",
+            """{
+                "pk": 1,
+                "model": "serializers.author",
+                "fields": {
+                    "name": "Agnes"
+                }
+            }""",
+            """{
+                "pk": 1,
+                "model": "serializers.category",
+                "fields": {
+                    "name": "Reference"
+                }
+            }"""
+        ]
+        test_string = "\n".join([s.replace("\n", "") for s in test_strings])
+        expected = "(serializers.article:pk=1) field_value was 'doesnotexist'"
+        with self.assertRaisesMessage(DeserializationError, expected):
+            list(serializers.deserialize('jsonl', test_string))
+
+    def test_helpful_error_message_for_many2many_natural1(self):
+        """
+        Invalid many-to-many keys should throw a helpful error message.
+        This tests the code path where one of a list of natural keys is invalid.
+        """
+        test_strings = [
+            """{
+                "pk": 1,
+                "model": "serializers.categorymetadata",
+                "fields": {
+                    "kind": "author",
+                    "name": "meta1",
+                    "value": "Agnes"
+                }
+            }""",
+            """{
+                "pk": 1,
+                "model": "serializers.article",
+                "fields": {
+                    "author": 1,
+                    "headline": "Unknown many to many",
+                    "pub_date": "2014-09-15T10:35:00",
+                    "meta_data": [
+                        ["author", "meta1"],
+                        ["doesnotexist", "meta1"],
+                        ["author", "meta1"]
+                    ]
+                }
+            }""",
+            """{
+                "pk": 1,
+                "model": "serializers.author",
+                "fields": {
+                    "name": "Agnes"
+                }
+            }"""
+        ]
+        test_string = "\n".join([s.replace("\n", "") for s in test_strings])
+        key = ["doesnotexist", "meta1"]
+        expected = "(serializers.article:pk=1) field_value was '%r'" % key
+        with self.assertRaisesMessage(DeserializationError, expected):
+            for obj in serializers.deserialize('jsonl', test_string):
+                obj.save()
+
+    def test_helpful_error_message_for_many2many_natural2(self):
+        """
+        Invalid many-to-many keys should throw a helpful error message. This
+        tests the code path where a natural many-to-many key has only a single
+        value.
+        """
+        test_strings = [
+            """{
+                "pk": 1,
+                "model": "serializers.article",
+                "fields": {
+                    "author": 1,
+                    "headline": "Unknown many to many",
+                    "pub_date": "2014-09-15T10:35:00",
+                    "meta_data": [1, "doesnotexist"]
+                }
+            }""",
+            """{
+                "pk": 1,
+                "model": "serializers.categorymetadata",
+                "fields": {
+                    "kind": "author",
+                    "name": "meta1",
+                    "value": "Agnes"
+                }
+            }""",
+            """{
+                "pk": 1,
+                "model": "serializers.author",
+                "fields": {
+                    "name": "Agnes"
+                }
+            }"""
+        ]
+        test_string = "\n".join([s.replace("\n", "") for s in test_strings])
+        expected = "(serializers.article:pk=1) field_value was 'doesnotexist'"
+        with self.assertRaisesMessage(DeserializationError, expected):
+            for obj in serializers.deserialize('jsonl', test_string, ignore=False):
+                obj.save()
+
+    def test_helpful_error_message_for_many2many_not_iterable(self):
+        """
+        Not iterable many-to-many field value throws a helpful error message.
+        """
+        test_string = """{
+            "pk": 1,
+            "model": "serializers.m2mdata",
+            "fields": {"data": null}
+        }""".replace("\n", "")
+
+        expected = "(serializers.m2mdata:pk=1) field_value was 'None'"
+        with self.assertRaisesMessage(DeserializationError, expected):
+            next(serializers.deserialize('jsonl', test_string, ignore=False))
+
+
+class JsonSerializerTransactionTestCase(SerializersTransactionTestBase, TransactionTestCase):
+    serializer_name = "jsonl"
+    fwd_ref_str = [
+        """{
+            "pk": 1,
+            "model": "serializers.article",
+            "fields": {
+                "headline": "Forward references pose no problem",
+                "pub_date": "2006-06-16T15:00:00",
+                "categories": [1],
+                "author": 1
+            }
+        }""",
+        """{
+            "pk": 1,
+            "model": "serializers.category",
+            "fields": {
+                "name": "Reference"
+            }
+        }""",
+        """{
+            "pk": 1,
+            "model": "serializers.author",
+            "fields": {
+                "name": "Agnes"
+            }
+        }"""
+    ]
+    fwd_ref_str = "\n".join([s.replace("\n", "") for s in fwd_ref_str])