objects.py 21 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708
  1. # objects.py -- Access to base git objects
  2. # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
  3. # Copyright (C) 2008-2009 Jelmer Vernooij <jelmer@samba.org>
  4. #
  5. # This program is free software; you can redistribute it and/or
  6. # modify it under the terms of the GNU General Public License
  7. # as published by the Free Software Foundation; version 2
  8. # of the License or (at your option) a later version of the License.
  9. #
  10. # This program is distributed in the hope that it will be useful,
  11. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. # GNU General Public License for more details.
  14. #
  15. # You should have received a copy of the GNU General Public License
  16. # along with this program; if not, write to the Free Software
  17. # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  18. # MA 02110-1301, USA.
  19. """Access to base git objects."""
  20. import binascii
  21. from cStringIO import (
  22. StringIO,
  23. )
  24. import mmap
  25. import os
  26. import stat
  27. import time
  28. import zlib
  29. from dulwich.errors import (
  30. NotBlobError,
  31. NotCommitError,
  32. NotTreeError,
  33. )
  34. from dulwich.file import GitFile
  35. from dulwich.misc import (
  36. make_sha,
  37. )
# Type names of the git object kinds; used as the ``_type`` of the object
# classes and as the keys of ``type_map``.
BLOB_ID = "blob"
TAG_ID = "tag"
TREE_ID = "tree"
COMMIT_ID = "commit"
# Header field names read and written by the commit and tag
# (de)serializers.  TAG_ID above doubles as the "tag" header field.
PARENT_ID = "parent"
AUTHOR_ID = "author"
COMMITTER_ID = "committer"
OBJECT_ID = "object"
TYPE_ID = "type"
TAGGER_ID = "tagger"
ENCODING_ID = "encoding"
# File-type bits that S_ISGITLINK compares against stat.S_IFMT(mode).
S_IFGITLINK = 0160000
  50. def S_ISGITLINK(m):
  51. return (stat.S_IFMT(m) == S_IFGITLINK)
  52. def _decompress(string):
  53. dcomp = zlib.decompressobj()
  54. dcomped = dcomp.decompress(string)
  55. dcomped += dcomp.flush()
  56. return dcomped
  57. def sha_to_hex(sha):
  58. """Takes a string and returns the hex of the sha within"""
  59. hexsha = binascii.hexlify(sha)
  60. assert len(hexsha) == 40, "Incorrect length of sha1 string: %d" % hexsha
  61. return hexsha
  62. def hex_to_sha(hex):
  63. """Takes a hex sha and returns a binary sha"""
  64. assert len(hex) == 40, "Incorrent length of hexsha: %s" % hex
  65. return binascii.unhexlify(hex)
  66. def serializable_property(name, docstring=None):
  67. def set(obj, value):
  68. obj._ensure_parsed()
  69. setattr(obj, "_"+name, value)
  70. obj._needs_serialization = True
  71. def get(obj):
  72. obj._ensure_parsed()
  73. return getattr(obj, "_"+name)
  74. return property(get, set, doc=docstring)
  75. class ShaFile(object):
  76. """A git SHA file."""
  77. @classmethod
  78. def _parse_legacy_object(cls, map):
  79. """Parse a legacy object, creating it and setting object._text"""
  80. text = _decompress(map)
  81. object = None
  82. for posstype in type_map.keys():
  83. if text.startswith(posstype):
  84. object = type_map[posstype]()
  85. text = text[len(posstype):]
  86. break
  87. assert object is not None, "%s is not a known object type" % text[:9]
  88. assert text[0] == ' ', "%s is not a space" % text[0]
  89. text = text[1:]
  90. size = 0
  91. i = 0
  92. while text[0] >= '0' and text[0] <= '9':
  93. if i > 0 and size == 0:
  94. raise AssertionError("Size is not in canonical format")
  95. size = (size * 10) + int(text[0])
  96. text = text[1:]
  97. i += 1
  98. object._size = size
  99. assert text[0] == "\0", "Size not followed by null"
  100. text = text[1:]
  101. object.set_raw_string(text)
  102. return object
  103. def as_legacy_object(self):
  104. text = self.as_raw_string()
  105. return zlib.compress("%s %d\0%s" % (self._type, len(text), text))
  106. def as_raw_chunks(self):
  107. if self._needs_serialization:
  108. self._chunked_text = self._serialize()
  109. self._needs_serialization = False
  110. return self._chunked_text
  111. def as_raw_string(self):
  112. return "".join(self.as_raw_chunks())
  113. def __str__(self):
  114. return self.as_raw_string()
  115. def __hash__(self):
  116. return hash(self.id)
  117. def as_pretty_string(self):
  118. return self.as_raw_string()
  119. def _ensure_parsed(self):
  120. if self._needs_parsing:
  121. self._deserialize(self._chunked_text)
  122. self._needs_parsing = False
  123. def set_raw_string(self, text):
  124. if type(text) != str:
  125. raise TypeError(text)
  126. self.set_raw_chunks([text])
  127. def set_raw_chunks(self, chunks):
  128. self._chunked_text = chunks
  129. self._sha = None
  130. self._needs_parsing = True
  131. self._needs_serialization = False
  132. @classmethod
  133. def _parse_object(cls, map):
  134. """Parse a new style object , creating it and setting object._text"""
  135. used = 0
  136. byte = ord(map[used])
  137. used += 1
  138. num_type = (byte >> 4) & 7
  139. try:
  140. object = num_type_map[num_type]()
  141. except KeyError:
  142. raise AssertionError("Not a known type: %d" % num_type)
  143. while (byte & 0x80) != 0:
  144. byte = ord(map[used])
  145. used += 1
  146. raw = map[used:]
  147. object.set_raw_string(_decompress(raw))
  148. return object
  149. @classmethod
  150. def _parse_file(cls, map):
  151. word = (ord(map[0]) << 8) + ord(map[1])
  152. if ord(map[0]) == 0x78 and (word % 31) == 0:
  153. return cls._parse_legacy_object(map)
  154. else:
  155. return cls._parse_object(map)
  156. def __init__(self):
  157. """Don't call this directly"""
  158. self._sha = None
  159. def _deserialize(self, chunks):
  160. raise NotImplementedError(self._deserialize)
  161. def _serialize(self):
  162. raise NotImplementedError(self._serialize)
  163. @classmethod
  164. def from_file(cls, filename):
  165. """Get the contents of a SHA file on disk"""
  166. size = os.path.getsize(filename)
  167. f = GitFile(filename, 'rb')
  168. try:
  169. map = mmap.mmap(f.fileno(), size, access=mmap.ACCESS_READ)
  170. shafile = cls._parse_file(map)
  171. return shafile
  172. finally:
  173. f.close()
  174. @classmethod
  175. def from_raw_string(cls, type, string):
  176. """Creates an object of the indicated type from the raw string given.
  177. Type is the numeric type of an object. String is the raw uncompressed
  178. contents.
  179. """
  180. real_class = num_type_map[type]
  181. obj = real_class()
  182. obj.type = type
  183. obj.set_raw_string(string)
  184. return obj
  185. @classmethod
  186. def from_raw_chunks(cls, type, chunks):
  187. """Creates an object of the indicated type from the raw chunks given.
  188. Type is the numeric type of an object. Chunks is a sequence of the raw
  189. uncompressed contents.
  190. """
  191. real_class = num_type_map[type]
  192. obj = real_class()
  193. obj.type = type
  194. obj.set_raw_chunks(chunks)
  195. return obj
  196. @classmethod
  197. def from_string(cls, string):
  198. """Create a blob from a string."""
  199. shafile = cls()
  200. shafile.set_raw_string(string)
  201. return shafile
  202. def _header(self):
  203. return "%s %lu\0" % (self._type, self.raw_length())
  204. def raw_length(self):
  205. """Returns the length of the raw string of this object."""
  206. ret = 0
  207. for chunk in self.as_raw_chunks():
  208. ret += len(chunk)
  209. return ret
  210. def _make_sha(self):
  211. ret = make_sha()
  212. ret.update(self._header())
  213. for chunk in self.as_raw_chunks():
  214. ret.update(chunk)
  215. return ret
  216. def sha(self):
  217. """The SHA1 object that is the name of this object."""
  218. if self._needs_serialization or self._sha is None:
  219. self._sha = self._make_sha()
  220. return self._sha
  221. @property
  222. def id(self):
  223. return self.sha().hexdigest()
  224. def get_type(self):
  225. return self._num_type
  226. def set_type(self, type):
  227. self._num_type = type
  228. type = property(get_type, set_type)
  229. def __repr__(self):
  230. return "<%s %s>" % (self.__class__.__name__, self.id)
  231. def __ne__(self, other):
  232. return self.id != other.id
  233. def __eq__(self, other):
  234. """Return true if the sha of the two objects match.
  235. The __le__ etc methods aren't overriden as they make no sense,
  236. certainly at this level.
  237. """
  238. return self.id == other.id
  239. class Blob(ShaFile):
  240. """A Git Blob object."""
  241. _type = BLOB_ID
  242. _num_type = 3
  243. def __init__(self):
  244. super(Blob, self).__init__()
  245. self._chunked_text = []
  246. self._needs_parsing = False
  247. self._needs_serialization = False
  248. def _get_data(self):
  249. return self.as_raw_string()
  250. def _set_data(self, data):
  251. self.set_raw_string(data)
  252. data = property(_get_data, _set_data,
  253. "The text contained within the blob object.")
  254. def _get_chunked(self):
  255. return self._chunked_text
  256. def _set_chunked(self, chunks):
  257. self._chunked_text = chunks
  258. chunked = property(_get_chunked, _set_chunked,
  259. "The text within the blob object, as chunks (not necessarily lines).")
  260. @classmethod
  261. def from_file(cls, filename):
  262. blob = ShaFile.from_file(filename)
  263. if blob._type != cls._type:
  264. raise NotBlobError(filename)
  265. return blob
  266. class Tag(ShaFile):
  267. """A Git Tag object."""
  268. _type = TAG_ID
  269. _num_type = 4
  270. def __init__(self):
  271. super(Tag, self).__init__()
  272. self._needs_parsing = False
  273. self._needs_serialization = True
  274. @classmethod
  275. def from_file(cls, filename):
  276. blob = ShaFile.from_file(filename)
  277. if blob._type != cls._type:
  278. raise NotBlobError(filename)
  279. return blob
  280. @classmethod
  281. def from_string(cls, string):
  282. """Create a blob from a string."""
  283. shafile = cls()
  284. shafile.set_raw_string(string)
  285. return shafile
  286. def _serialize(self):
  287. chunks = []
  288. chunks.append("%s %s\n" % (OBJECT_ID, self._object_sha))
  289. chunks.append("%s %s\n" % (TYPE_ID, num_type_map[self._object_type]._type))
  290. chunks.append("%s %s\n" % (TAG_ID, self._name))
  291. if self._tagger:
  292. if self._tag_time is None:
  293. chunks.append("%s %s\n" % (TAGGER_ID, self._tagger))
  294. else:
  295. chunks.append("%s %s %d %s\n" % (TAGGER_ID, self._tagger, self._tag_time, format_timezone(self._tag_timezone)))
  296. chunks.append("\n") # To close headers
  297. chunks.append(self._message)
  298. return chunks
  299. def _deserialize(self, chunks):
  300. """Grab the metadata attached to the tag"""
  301. self._tagger = None
  302. f = StringIO("".join(chunks))
  303. for l in f:
  304. l = l.rstrip("\n")
  305. if l == "":
  306. break # empty line indicates end of headers
  307. (field, value) = l.split(" ", 1)
  308. if field == OBJECT_ID:
  309. self._object_sha = value
  310. elif field == TYPE_ID:
  311. self._object_type = type_map[value]
  312. elif field == TAG_ID:
  313. self._name = value
  314. elif field == TAGGER_ID:
  315. try:
  316. sep = value.index("> ")
  317. except ValueError:
  318. self._tagger = value
  319. self._tag_time = None
  320. self._tag_timezone = None
  321. else:
  322. self._tagger = value[0:sep+1]
  323. (timetext, timezonetext) = value[sep+2:].rsplit(" ", 1)
  324. try:
  325. self._tag_time = int(timetext)
  326. except ValueError: #Not a unix timestamp
  327. self._tag_time = time.strptime(timetext)
  328. self._tag_timezone = parse_timezone(timezonetext)
  329. else:
  330. raise AssertionError("Unknown field %s" % field)
  331. self._message = f.read()
  332. def _get_object(self):
  333. """Returns the object pointed by this tag, represented as a tuple(type, sha)"""
  334. self._ensure_parsed()
  335. return (self._object_type, self._object_sha)
  336. def _set_object(self, value):
  337. self._ensure_parsed()
  338. (self._object_type, self._object_sha) = value
  339. self._needs_serialization = True
  340. object = property(_get_object, _set_object)
  341. name = serializable_property("name", "The name of this tag")
  342. tagger = serializable_property("tagger",
  343. "Returns the name of the person who created this tag")
  344. tag_time = serializable_property("tag_time",
  345. "The creation timestamp of the tag. As the number of seconds since the epoch")
  346. tag_timezone = serializable_property("tag_timezone",
  347. "The timezone that tag_time is in.")
  348. message = serializable_property("message", "The message attached to this tag")
  349. def parse_tree(text):
  350. """Parse a tree text.
  351. :param text: Serialized text to parse
  352. :return: Dictionary with names as keys, (mode, sha) tuples as values
  353. """
  354. ret = {}
  355. count = 0
  356. l = len(text)
  357. while count < l:
  358. mode_end = text.index(' ', count)
  359. mode = int(text[count:mode_end], 8)
  360. name_end = text.index('\0', mode_end)
  361. name = text[mode_end+1:name_end]
  362. count = name_end+21
  363. sha = text[name_end+1:count]
  364. ret[name] = (mode, sha_to_hex(sha))
  365. return ret
  366. def serialize_tree(items):
  367. """Serialize the items in a tree to a text.
  368. :param items: Sorted iterable over (name, mode, sha) tuples
  369. :return: Serialized tree text as chunks
  370. """
  371. for name, mode, hexsha in items:
  372. yield "%04o %s\0%s" % (mode, name, hex_to_sha(hexsha))
  373. def sorted_tree_items(entries):
  374. """Iterate over a tree entries dictionary in the order in which
  375. the items would be serialized.
  376. :param entries: Dictionary mapping names to (mode, sha) tuples
  377. :return: Iterator over (name, mode, sha)
  378. """
  379. def cmp_entry((name1, value1), (name2, value2)):
  380. if stat.S_ISDIR(value1[0]):
  381. name1 += "/"
  382. if stat.S_ISDIR(value2[0]):
  383. name2 += "/"
  384. return cmp(name1, name2)
  385. for name, entry in sorted(entries.iteritems(), cmp=cmp_entry):
  386. yield name, entry[0], entry[1]
  387. class Tree(ShaFile):
  388. """A Git tree object"""
  389. _type = TREE_ID
  390. _num_type = 2
  391. def __init__(self):
  392. super(Tree, self).__init__()
  393. self._entries = {}
  394. self._needs_parsing = False
  395. self._needs_serialization = True
  396. @classmethod
  397. def from_file(cls, filename):
  398. tree = ShaFile.from_file(filename)
  399. if tree._type != cls._type:
  400. raise NotTreeError(filename)
  401. return tree
  402. def __contains__(self, name):
  403. self._ensure_parsed()
  404. return name in self._entries
  405. def __getitem__(self, name):
  406. self._ensure_parsed()
  407. return self._entries[name]
  408. def __setitem__(self, name, value):
  409. assert isinstance(value, tuple)
  410. assert len(value) == 2
  411. self._ensure_parsed()
  412. self._entries[name] = value
  413. self._needs_serialization = True
  414. def __delitem__(self, name):
  415. self._ensure_parsed()
  416. del self._entries[name]
  417. self._needs_serialization = True
  418. def __len__(self):
  419. self._ensure_parsed()
  420. return len(self._entries)
  421. def add(self, mode, name, hexsha):
  422. assert type(mode) == int
  423. assert type(name) == str
  424. assert type(hexsha) == str
  425. self._ensure_parsed()
  426. self._entries[name] = mode, hexsha
  427. self._needs_serialization = True
  428. def entries(self):
  429. """Return a list of tuples describing the tree entries"""
  430. self._ensure_parsed()
  431. # The order of this is different from iteritems() for historical
  432. # reasons
  433. return [
  434. (mode, name, hexsha) for (name, mode, hexsha) in self.iteritems()]
  435. def iteritems(self):
  436. """Iterate over all entries in the order in which they would be
  437. serialized.
  438. :return: Iterator over (name, mode, sha) tuples
  439. """
  440. self._ensure_parsed()
  441. return sorted_tree_items(self._entries)
  442. def _deserialize(self, chunks):
  443. """Grab the entries in the tree"""
  444. self._entries = parse_tree("".join(chunks))
  445. def _serialize(self):
  446. return list(serialize_tree(self.iteritems()))
  447. def as_pretty_string(self):
  448. text = []
  449. for name, mode, hexsha in self.iteritems():
  450. if mode & stat.S_IFDIR:
  451. kind = "tree"
  452. else:
  453. kind = "blob"
  454. text.append("%04o %s %s\t%s\n" % (mode, kind, hexsha, name))
  455. return "".join(text)
  456. def parse_timezone(text):
  457. offset = int(text)
  458. signum = (offset < 0) and -1 or 1
  459. offset = abs(offset)
  460. hours = int(offset / 100)
  461. minutes = (offset % 100)
  462. return signum * (hours * 3600 + minutes * 60)
  463. def format_timezone(offset):
  464. if offset % 60 != 0:
  465. raise ValueError("Unable to handle non-minute offset.")
  466. sign = (offset < 0) and '-' or '+'
  467. offset = abs(offset)
  468. return '%c%02d%02d' % (sign, offset / 3600, (offset / 60) % 60)
  469. class Commit(ShaFile):
  470. """A git commit object"""
  471. _type = COMMIT_ID
  472. _num_type = 1
  473. def __init__(self):
  474. super(Commit, self).__init__()
  475. self._parents = []
  476. self._encoding = None
  477. self._needs_parsing = False
  478. self._needs_serialization = True
  479. self._extra = {}
  480. @classmethod
  481. def from_file(cls, filename):
  482. commit = ShaFile.from_file(filename)
  483. if commit._type != cls._type:
  484. raise NotCommitError(filename)
  485. return commit
  486. def _deserialize(self, chunks):
  487. self._parents = []
  488. self._extra = []
  489. self._author = None
  490. f = StringIO("".join(chunks))
  491. for l in f:
  492. l = l.rstrip("\n")
  493. if l == "":
  494. # Empty line indicates end of headers
  495. break
  496. (field, value) = l.split(" ", 1)
  497. if field == TREE_ID:
  498. self._tree = value
  499. elif field == PARENT_ID:
  500. self._parents.append(value)
  501. elif field == AUTHOR_ID:
  502. self._author, timetext, timezonetext = value.rsplit(" ", 2)
  503. self._author_time = int(timetext)
  504. self._author_timezone = parse_timezone(timezonetext)
  505. elif field == COMMITTER_ID:
  506. self._committer, timetext, timezonetext = value.rsplit(" ", 2)
  507. self._commit_time = int(timetext)
  508. self._commit_timezone = parse_timezone(timezonetext)
  509. elif field == ENCODING_ID:
  510. self._encoding = value
  511. else:
  512. self._extra.append((field, value))
  513. self._message = f.read()
  514. def _serialize(self):
  515. chunks = []
  516. chunks.append("%s %s\n" % (TREE_ID, self._tree))
  517. for p in self._parents:
  518. chunks.append("%s %s\n" % (PARENT_ID, p))
  519. chunks.append("%s %s %s %s\n" % (AUTHOR_ID, self._author, str(self._author_time), format_timezone(self._author_timezone)))
  520. chunks.append("%s %s %s %s\n" % (COMMITTER_ID, self._committer, str(self._commit_time), format_timezone(self._commit_timezone)))
  521. if self.encoding:
  522. chunks.append("%s %s\n" % (ENCODING_ID, self.encoding))
  523. for k, v in self.extra:
  524. if "\n" in k or "\n" in v:
  525. raise AssertionError("newline in extra data: %r -> %r" % (k, v))
  526. chunks.append("%s %s\n" % (k, v))
  527. chunks.append("\n") # There must be a new line after the headers
  528. chunks.append(self._message)
  529. return chunks
  530. tree = serializable_property("tree", "Tree that is the state of this commit")
  531. def _get_parents(self):
  532. """Return a list of parents of this commit."""
  533. self._ensure_parsed()
  534. return self._parents
  535. def _set_parents(self, value):
  536. """Set a list of parents of this commit."""
  537. self._ensure_parsed()
  538. self._needs_serialization = True
  539. self._parents = value
  540. parents = property(_get_parents, _set_parents)
  541. def _get_extra(self):
  542. """Return extra settings of this commit."""
  543. self._ensure_parsed()
  544. return self._extra
  545. extra = property(_get_extra)
  546. author = serializable_property("author",
  547. "The name of the author of the commit")
  548. committer = serializable_property("committer",
  549. "The name of the committer of the commit")
  550. message = serializable_property("message",
  551. "The commit message")
  552. commit_time = serializable_property("commit_time",
  553. "The timestamp of the commit. As the number of seconds since the epoch.")
  554. commit_timezone = serializable_property("commit_timezone",
  555. "The zone the commit time is in")
  556. author_time = serializable_property("author_time",
  557. "The timestamp the commit was written. as the number of seconds since the epoch.")
  558. author_timezone = serializable_property("author_timezone",
  559. "Returns the zone the author time is in.")
  560. encoding = serializable_property("encoding",
  561. "Encoding of the commit message.")
# Maps the type name found in a legacy object header (and in a tag's "type"
# field) to the class implementing that object type.
type_map = {
    BLOB_ID : Blob,
    TREE_ID : Tree,
    COMMIT_ID : Commit,
    TAG_ID: Tag,
}

# Maps the numeric object type to the class implementing it.  Note that
# type 0 maps to None rather than to a class.
num_type_map = {
    0: None,
    1: Commit,
    2: Tree,
    3: Blob,
    4: Tag,
    # 5 Is reserved for further expansion
}

try:
    # Try to import C versions; when available they replace the pure-Python
    # parse_tree and sorted_tree_items defined in this module.
    from dulwich._objects import parse_tree, sorted_tree_items
except ImportError:
    # No C extension: keep the Python implementations
    pass