2
0

xml_serializer.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500
  1. """
  2. XML serializer.
  3. """
  4. import json
  5. from xml.dom import pulldom
  6. from xml.sax import handler
  7. from xml.sax.expatreader import ExpatParser as _ExpatParser
  8. from django.apps import apps
  9. from django.conf import settings
  10. from django.core.exceptions import ObjectDoesNotExist
  11. from django.core.serializers import base
  12. from django.db import DEFAULT_DB_ALIAS, models
  13. from django.utils.xmlutils import SimplerXMLGenerator, UnserializableContentError
  14. class Serializer(base.Serializer):
  15. """Serialize a QuerySet to XML."""
  16. def indent(self, level):
  17. if self.options.get("indent") is not None:
  18. self.xml.ignorableWhitespace(
  19. "\n" + " " * self.options.get("indent") * level
  20. )
  21. def start_serialization(self):
  22. """
  23. Start serialization -- open the XML document and the root element.
  24. """
  25. self.xml = SimplerXMLGenerator(
  26. self.stream, self.options.get("encoding", settings.DEFAULT_CHARSET)
  27. )
  28. self.xml.startDocument()
  29. self.xml.startElement("django-objects", {"version": "1.0"})
  30. def end_serialization(self):
  31. """
  32. End serialization -- end the document.
  33. """
  34. self.indent(0)
  35. self.xml.endElement("django-objects")
  36. self.xml.endDocument()
  37. def start_object(self, obj):
  38. """
  39. Called as each object is handled.
  40. """
  41. if not hasattr(obj, "_meta"):
  42. raise base.SerializationError(
  43. "Non-model object (%s) encountered during serialization" % type(obj)
  44. )
  45. self.indent(1)
  46. attrs = {"model": str(obj._meta)}
  47. if not self.use_natural_primary_keys or not hasattr(obj, "natural_key"):
  48. obj_pk = obj.pk
  49. if obj_pk is not None:
  50. attrs["pk"] = str(obj_pk)
  51. self.xml.startElement("object", attrs)
  52. def end_object(self, obj):
  53. """
  54. Called after handling all fields for an object.
  55. """
  56. self.indent(1)
  57. self.xml.endElement("object")
  58. def handle_field(self, obj, field):
  59. """
  60. Handle each field on an object (except for ForeignKeys and
  61. ManyToManyFields).
  62. """
  63. self.indent(2)
  64. self.xml.startElement(
  65. "field",
  66. {
  67. "name": field.name,
  68. "type": field.get_internal_type(),
  69. },
  70. )
  71. # Get a "string version" of the object's data.
  72. if getattr(obj, field.name) is not None:
  73. value = field.value_to_string(obj)
  74. if field.get_internal_type() == "JSONField":
  75. # Dump value since JSONField.value_to_string() doesn't output
  76. # strings.
  77. value = json.dumps(value, cls=field.encoder)
  78. try:
  79. self.xml.characters(value)
  80. except UnserializableContentError:
  81. raise ValueError(
  82. "%s.%s (pk:%s) contains unserializable characters"
  83. % (obj.__class__.__name__, field.name, obj.pk)
  84. )
  85. else:
  86. self.xml.addQuickElement("None")
  87. self.xml.endElement("field")
  88. def handle_fk_field(self, obj, field):
  89. """
  90. Handle a ForeignKey (they need to be treated slightly
  91. differently from regular fields).
  92. """
  93. self._start_relational_field(field)
  94. related_att = getattr(obj, field.get_attname())
  95. if related_att is not None:
  96. if self.use_natural_foreign_keys and hasattr(
  97. field.remote_field.model, "natural_key"
  98. ):
  99. related = getattr(obj, field.name)
  100. # If related object has a natural key, use it
  101. related = related.natural_key()
  102. # Iterable natural keys are rolled out as subelements
  103. for key_value in related:
  104. self.xml.startElement("natural", {})
  105. self.xml.characters(str(key_value))
  106. self.xml.endElement("natural")
  107. else:
  108. self.xml.characters(str(related_att))
  109. else:
  110. self.xml.addQuickElement("None")
  111. self.xml.endElement("field")
  112. def handle_m2m_field(self, obj, field):
  113. """
  114. Handle a ManyToManyField. Related objects are only serialized as
  115. references to the object's PK (i.e. the related *data* is not dumped,
  116. just the relation).
  117. """
  118. if field.remote_field.through._meta.auto_created:
  119. self._start_relational_field(field)
  120. if self.use_natural_foreign_keys and hasattr(
  121. field.remote_field.model, "natural_key"
  122. ):
  123. # If the objects in the m2m have a natural key, use it
  124. def handle_m2m(value):
  125. natural = value.natural_key()
  126. # Iterable natural keys are rolled out as subelements
  127. self.xml.startElement("object", {})
  128. for key_value in natural:
  129. self.xml.startElement("natural", {})
  130. self.xml.characters(str(key_value))
  131. self.xml.endElement("natural")
  132. self.xml.endElement("object")
  133. def queryset_iterator(obj, field):
  134. return getattr(obj, field.name).iterator()
  135. else:
  136. def handle_m2m(value):
  137. self.xml.addQuickElement("object", attrs={"pk": str(value.pk)})
  138. def queryset_iterator(obj, field):
  139. return (
  140. getattr(obj, field.name)
  141. .select_related(None)
  142. .only("pk")
  143. .iterator()
  144. )
  145. m2m_iter = getattr(obj, "_prefetched_objects_cache", {}).get(
  146. field.name,
  147. queryset_iterator(obj, field),
  148. )
  149. for relobj in m2m_iter:
  150. handle_m2m(relobj)
  151. self.xml.endElement("field")
  152. def _start_relational_field(self, field):
  153. """Output the <field> element for relational fields."""
  154. self.indent(2)
  155. self.xml.startElement(
  156. "field",
  157. {
  158. "name": field.name,
  159. "rel": field.remote_field.__class__.__name__,
  160. "to": str(field.remote_field.model._meta),
  161. },
  162. )
  163. class Deserializer(base.Deserializer):
  164. """Deserialize XML."""
  165. def __init__(
  166. self,
  167. stream_or_string,
  168. *,
  169. using=DEFAULT_DB_ALIAS,
  170. ignorenonexistent=False,
  171. **options,
  172. ):
  173. super().__init__(stream_or_string, **options)
  174. self.handle_forward_references = options.pop("handle_forward_references", False)
  175. self.event_stream = pulldom.parse(self.stream, self._make_parser())
  176. self.db = using
  177. self.ignore = ignorenonexistent
  178. def _make_parser(self):
  179. """Create a hardened XML parser (no custom/external entities)."""
  180. return DefusedExpatParser()
  181. def __next__(self):
  182. for event, node in self.event_stream:
  183. if event == "START_ELEMENT" and node.nodeName == "object":
  184. self.event_stream.expandNode(node)
  185. return self._handle_object(node)
  186. raise StopIteration
  187. def _handle_object(self, node):
  188. """Convert an <object> node to a DeserializedObject."""
  189. # Look up the model using the model loading mechanism. If this fails,
  190. # bail.
  191. Model = self._get_model_from_node(node, "model")
  192. # Start building a data dictionary from the object.
  193. data = {}
  194. if node.hasAttribute("pk"):
  195. data[Model._meta.pk.attname] = Model._meta.pk.to_python(
  196. node.getAttribute("pk")
  197. )
  198. # Also start building a dict of m2m data (this is saved as
  199. # {m2m_accessor_attribute : [list_of_related_objects]})
  200. m2m_data = {}
  201. deferred_fields = {}
  202. field_names = {f.name for f in Model._meta.get_fields()}
  203. # Deserialize each field.
  204. for field_node in node.getElementsByTagName("field"):
  205. # If the field is missing the name attribute, bail (are you
  206. # sensing a pattern here?)
  207. field_name = field_node.getAttribute("name")
  208. if not field_name:
  209. raise base.DeserializationError(
  210. "<field> node is missing the 'name' attribute"
  211. )
  212. # Get the field from the Model. This will raise a
  213. # FieldDoesNotExist if, well, the field doesn't exist, which will
  214. # be propagated correctly unless ignorenonexistent=True is used.
  215. if self.ignore and field_name not in field_names:
  216. continue
  217. field = Model._meta.get_field(field_name)
  218. # As is usually the case, relation fields get the special treatment.
  219. if field.remote_field and isinstance(
  220. field.remote_field, models.ManyToManyRel
  221. ):
  222. value = self._handle_m2m_field_node(field_node, field)
  223. if value == base.DEFER_FIELD:
  224. deferred_fields[field] = [
  225. [
  226. getInnerText(nat_node).strip()
  227. for nat_node in obj_node.getElementsByTagName("natural")
  228. ]
  229. for obj_node in field_node.getElementsByTagName("object")
  230. ]
  231. else:
  232. m2m_data[field.name] = value
  233. elif field.remote_field and isinstance(
  234. field.remote_field, models.ManyToOneRel
  235. ):
  236. value = self._handle_fk_field_node(field_node, field)
  237. if value == base.DEFER_FIELD:
  238. deferred_fields[field] = [
  239. getInnerText(k).strip()
  240. for k in field_node.getElementsByTagName("natural")
  241. ]
  242. else:
  243. data[field.attname] = value
  244. else:
  245. if field_node.getElementsByTagName("None"):
  246. value = None
  247. else:
  248. value = field.to_python(getInnerText(field_node).strip())
  249. # Load value since JSONField.to_python() outputs strings.
  250. if field.get_internal_type() == "JSONField":
  251. value = json.loads(value, cls=field.decoder)
  252. data[field.name] = value
  253. obj = base.build_instance(Model, data, self.db)
  254. # Return a DeserializedObject so that the m2m data has a place to live.
  255. return base.DeserializedObject(obj, m2m_data, deferred_fields)
  256. def _handle_fk_field_node(self, node, field):
  257. """
  258. Handle a <field> node for a ForeignKey
  259. """
  260. # Check if there is a child node named 'None', returning None if so.
  261. if node.getElementsByTagName("None"):
  262. return None
  263. else:
  264. model = field.remote_field.model
  265. if hasattr(model._default_manager, "get_by_natural_key"):
  266. keys = node.getElementsByTagName("natural")
  267. if keys:
  268. # If there are 'natural' subelements, it must be a natural key
  269. field_value = [getInnerText(k).strip() for k in keys]
  270. try:
  271. obj = model._default_manager.db_manager(
  272. self.db
  273. ).get_by_natural_key(*field_value)
  274. except ObjectDoesNotExist:
  275. if self.handle_forward_references:
  276. return base.DEFER_FIELD
  277. else:
  278. raise
  279. obj_pk = getattr(obj, field.remote_field.field_name)
  280. # If this is a natural foreign key to an object that
  281. # has a FK/O2O as the foreign key, use the FK value
  282. if field.remote_field.model._meta.pk.remote_field:
  283. obj_pk = obj_pk.pk
  284. else:
  285. # Otherwise, treat like a normal PK
  286. field_value = getInnerText(node).strip()
  287. obj_pk = model._meta.get_field(
  288. field.remote_field.field_name
  289. ).to_python(field_value)
  290. return obj_pk
  291. else:
  292. field_value = getInnerText(node).strip()
  293. return model._meta.get_field(field.remote_field.field_name).to_python(
  294. field_value
  295. )
  296. def _handle_m2m_field_node(self, node, field):
  297. """
  298. Handle a <field> node for a ManyToManyField.
  299. """
  300. model = field.remote_field.model
  301. default_manager = model._default_manager
  302. if hasattr(default_manager, "get_by_natural_key"):
  303. def m2m_convert(n):
  304. keys = n.getElementsByTagName("natural")
  305. if keys:
  306. # If there are 'natural' subelements, it must be a natural key
  307. field_value = [getInnerText(k).strip() for k in keys]
  308. obj_pk = (
  309. default_manager.db_manager(self.db)
  310. .get_by_natural_key(*field_value)
  311. .pk
  312. )
  313. else:
  314. # Otherwise, treat like a normal PK value.
  315. obj_pk = model._meta.pk.to_python(n.getAttribute("pk"))
  316. return obj_pk
  317. else:
  318. def m2m_convert(n):
  319. return model._meta.pk.to_python(n.getAttribute("pk"))
  320. values = []
  321. try:
  322. for c in node.getElementsByTagName("object"):
  323. values.append(m2m_convert(c))
  324. except Exception as e:
  325. if isinstance(e, ObjectDoesNotExist) and self.handle_forward_references:
  326. return base.DEFER_FIELD
  327. else:
  328. raise base.M2MDeserializationError(e, c)
  329. else:
  330. return values
  331. def _get_model_from_node(self, node, attr):
  332. """
  333. Look up a model from a <object model=...> or a <field rel=... to=...>
  334. node.
  335. """
  336. model_identifier = node.getAttribute(attr)
  337. if not model_identifier:
  338. raise base.DeserializationError(
  339. "<%s> node is missing the required '%s' attribute"
  340. % (node.nodeName, attr)
  341. )
  342. try:
  343. return apps.get_model(model_identifier)
  344. except (LookupError, TypeError):
  345. raise base.DeserializationError(
  346. "<%s> node has invalid model identifier: '%s'"
  347. % (node.nodeName, model_identifier)
  348. )
  349. def getInnerText(node):
  350. """Get all the inner text of a DOM node (recursively)."""
  351. # inspired by https://mail.python.org/pipermail/xml-sig/2005-March/011022.html
  352. inner_text = []
  353. for child in node.childNodes:
  354. if (
  355. child.nodeType == child.TEXT_NODE
  356. or child.nodeType == child.CDATA_SECTION_NODE
  357. ):
  358. inner_text.append(child.data)
  359. elif child.nodeType == child.ELEMENT_NODE:
  360. inner_text.extend(getInnerText(child))
  361. else:
  362. pass
  363. return "".join(inner_text)
  364. # Below code based on Christian Heimes' defusedxml
  365. class DefusedExpatParser(_ExpatParser):
  366. """
  367. An expat parser hardened against XML bomb attacks.
  368. Forbid DTDs, external entity references
  369. """
  370. def __init__(self, *args, **kwargs):
  371. super().__init__(*args, **kwargs)
  372. self.setFeature(handler.feature_external_ges, False)
  373. self.setFeature(handler.feature_external_pes, False)
  374. def start_doctype_decl(self, name, sysid, pubid, has_internal_subset):
  375. raise DTDForbidden(name, sysid, pubid)
  376. def entity_decl(
  377. self, name, is_parameter_entity, value, base, sysid, pubid, notation_name
  378. ):
  379. raise EntitiesForbidden(name, value, base, sysid, pubid, notation_name)
  380. def unparsed_entity_decl(self, name, base, sysid, pubid, notation_name):
  381. # expat 1.2
  382. raise EntitiesForbidden(name, None, base, sysid, pubid, notation_name)
  383. def external_entity_ref_handler(self, context, base, sysid, pubid):
  384. raise ExternalReferenceForbidden(context, base, sysid, pubid)
  385. def reset(self):
  386. _ExpatParser.reset(self)
  387. parser = self._parser
  388. parser.StartDoctypeDeclHandler = self.start_doctype_decl
  389. parser.EntityDeclHandler = self.entity_decl
  390. parser.UnparsedEntityDeclHandler = self.unparsed_entity_decl
  391. parser.ExternalEntityRefHandler = self.external_entity_ref_handler
  392. class DefusedXmlException(ValueError):
  393. """Base exception."""
  394. def __repr__(self):
  395. return str(self)
  396. class DTDForbidden(DefusedXmlException):
  397. """Document type definition is forbidden."""
  398. def __init__(self, name, sysid, pubid):
  399. super().__init__()
  400. self.name = name
  401. self.sysid = sysid
  402. self.pubid = pubid
  403. def __str__(self):
  404. tpl = "DTDForbidden(name='{}', system_id={!r}, public_id={!r})"
  405. return tpl.format(self.name, self.sysid, self.pubid)
  406. class EntitiesForbidden(DefusedXmlException):
  407. """Entity definition is forbidden."""
  408. def __init__(self, name, value, base, sysid, pubid, notation_name):
  409. super().__init__()
  410. self.name = name
  411. self.value = value
  412. self.base = base
  413. self.sysid = sysid
  414. self.pubid = pubid
  415. self.notation_name = notation_name
  416. def __str__(self):
  417. tpl = "EntitiesForbidden(name='{}', system_id={!r}, public_id={!r})"
  418. return tpl.format(self.name, self.sysid, self.pubid)
  419. class ExternalReferenceForbidden(DefusedXmlException):
  420. """Resolving an external reference is forbidden."""
  421. def __init__(self, context, base, sysid, pubid):
  422. super().__init__()
  423. self.context = context
  424. self.base = base
  425. self.sysid = sysid
  426. self.pubid = pubid
  427. def __str__(self):
  428. tpl = "ExternalReferenceForbidden(system_id='{}', public_id={})"
  429. return tpl.format(self.sysid, self.pubid)