objects.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663
  1. # objects.py -- Access to base git objects
  2. # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
  3. # Copyright (C) 2008-2009 Jelmer Vernooij <jelmer@samba.org>
  4. #
  5. # This program is free software; you can redistribute it and/or
  6. # modify it under the terms of the GNU General Public License
  7. # as published by the Free Software Foundation; version 2
  8. # of the License or (at your option) a later version of the License.
  9. #
  10. # This program is distributed in the hope that it will be useful,
  11. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. # GNU General Public License for more details.
  14. #
  15. # You should have received a copy of the GNU General Public License
  16. # along with this program; if not, write to the Free Software
  17. # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  18. # MA 02110-1301, USA.
  19. """Access to base git objects."""
  20. import mmap
  21. import os
  22. import stat
  23. import zlib
  24. from dulwich.errors import (
  25. NotBlobError,
  26. NotCommitError,
  27. NotTreeError,
  28. )
  29. from dulwich.misc import (
  30. make_sha,
  31. )
  32. BLOB_ID = "blob"
  33. TAG_ID = "tag"
  34. TREE_ID = "tree"
  35. COMMIT_ID = "commit"
  36. PARENT_ID = "parent"
  37. AUTHOR_ID = "author"
  38. COMMITTER_ID = "committer"
  39. OBJECT_ID = "object"
  40. TYPE_ID = "type"
  41. TAGGER_ID = "tagger"
  42. def _decompress(string):
  43. dcomp = zlib.decompressobj()
  44. dcomped = dcomp.decompress(string)
  45. dcomped += dcomp.flush()
  46. return dcomped
  47. def sha_to_hex(sha):
  48. """Takes a string and returns the hex of the sha within"""
  49. hexsha = "".join(["%02x" % ord(c) for c in sha])
  50. assert len(hexsha) == 40, "Incorrect length of sha1 string: %d" % hexsha
  51. return hexsha
  52. def hex_to_sha(hex):
  53. """Takes a hex sha and returns a binary sha"""
  54. assert len(hex) == 40, "Incorrent length of hexsha: %s" % hex
  55. return ''.join([chr(int(hex[i:i+2], 16)) for i in xrange(0, len(hex), 2)])
  56. def serializable_property(name, docstring=None):
  57. def set(obj, value):
  58. obj._ensure_parsed()
  59. setattr(obj, "_"+name, value)
  60. obj._needs_serialization = True
  61. def get(obj):
  62. obj._ensure_parsed()
  63. return getattr(obj, "_"+name)
  64. return property(get, set, doc=docstring)
  65. class ShaFile(object):
  66. """A git SHA file."""
  67. @classmethod
  68. def _parse_legacy_object(cls, map):
  69. """Parse a legacy object, creating it and setting object._text"""
  70. text = _decompress(map)
  71. object = None
  72. for posstype in type_map.keys():
  73. if text.startswith(posstype):
  74. object = type_map[posstype]()
  75. text = text[len(posstype):]
  76. break
  77. assert object is not None, "%s is not a known object type" % text[:9]
  78. assert text[0] == ' ', "%s is not a space" % text[0]
  79. text = text[1:]
  80. size = 0
  81. i = 0
  82. while text[0] >= '0' and text[0] <= '9':
  83. if i > 0 and size == 0:
  84. assert False, "Size is not in canonical format"
  85. size = (size * 10) + int(text[0])
  86. text = text[1:]
  87. i += 1
  88. object._size = size
  89. assert text[0] == "\0", "Size not followed by null"
  90. text = text[1:]
  91. object.set_raw_string(text)
  92. return object
  93. def as_legacy_object(self):
  94. text = self.as_raw_string()
  95. return zlib.compress("%s %d\0%s" % (self._type, len(text), text))
  96. def as_raw_string(self):
  97. if self._needs_serialization:
  98. self.serialize()
  99. return self._text
  100. def __str__(self):
  101. return self.as_raw_string()
  102. def as_pretty_string(self):
  103. return self.as_raw_string()
  104. def _ensure_parsed(self):
  105. if self._needs_parsing:
  106. self._parse_text()
  107. def set_raw_string(self, text):
  108. if type(text) != str:
  109. raise TypeError(text)
  110. self._text = text
  111. self._sha = None
  112. self._needs_parsing = True
  113. self._needs_serialization = False
  114. @classmethod
  115. def _parse_object(cls, map):
  116. """Parse a new style object , creating it and setting object._text"""
  117. used = 0
  118. byte = ord(map[used])
  119. used += 1
  120. num_type = (byte >> 4) & 7
  121. try:
  122. object = num_type_map[num_type]()
  123. except KeyError:
  124. raise AssertionError("Not a known type: %d" % num_type)
  125. while (byte & 0x80) != 0:
  126. byte = ord(map[used])
  127. used += 1
  128. raw = map[used:]
  129. object.set_raw_string(_decompress(raw))
  130. return object
  131. @classmethod
  132. def _parse_file(cls, map):
  133. word = (ord(map[0]) << 8) + ord(map[1])
  134. if ord(map[0]) == 0x78 and (word % 31) == 0:
  135. return cls._parse_legacy_object(map)
  136. else:
  137. return cls._parse_object(map)
  138. def __init__(self):
  139. """Don't call this directly"""
  140. self._sha = None
  141. def _parse_text(self):
  142. """For subclasses to do initialisation time parsing"""
  143. @classmethod
  144. def from_file(cls, filename):
  145. """Get the contents of a SHA file on disk"""
  146. size = os.path.getsize(filename)
  147. f = open(filename, 'rb')
  148. try:
  149. map = mmap.mmap(f.fileno(), size, access=mmap.ACCESS_READ)
  150. shafile = cls._parse_file(map)
  151. return shafile
  152. finally:
  153. f.close()
  154. @classmethod
  155. def from_raw_string(cls, type, string):
  156. """Creates an object of the indicated type from the raw string given.
  157. Type is the numeric type of an object. String is the raw uncompressed
  158. contents.
  159. """
  160. real_class = num_type_map[type]
  161. obj = real_class()
  162. obj.type = type
  163. obj.set_raw_string(string)
  164. return obj
  165. def _header(self):
  166. return "%s %lu\0" % (self._type, len(self.as_raw_string()))
  167. def sha(self):
  168. """The SHA1 object that is the name of this object."""
  169. if self._needs_serialization or self._sha is None:
  170. self._sha = make_sha()
  171. self._sha.update(self._header())
  172. self._sha.update(self.as_raw_string())
  173. return self._sha
  174. @property
  175. def id(self):
  176. return self.sha().hexdigest()
  177. def get_type(self):
  178. return self._num_type
  179. def set_type(self, type):
  180. self._num_type = type
  181. type = property(get_type, set_type)
  182. def __repr__(self):
  183. return "<%s %s>" % (self.__class__.__name__, self.id)
  184. def __eq__(self, other):
  185. """Return true id the sha of the two objects match.
  186. The __le__ etc methods aren't overriden as they make no sense,
  187. certainly at this level.
  188. """
  189. return self.sha().digest() == other.sha().digest()
  190. class Blob(ShaFile):
  191. """A Git Blob object."""
  192. _type = BLOB_ID
  193. _num_type = 3
  194. _needs_serialization = False
  195. _needs_parsing = False
  196. def get_data(self):
  197. return self._text
  198. def set_data(self, data):
  199. self._text = data
  200. data = property(get_data, set_data,
  201. "The text contained within the blob object.")
  202. @classmethod
  203. def from_file(cls, filename):
  204. blob = ShaFile.from_file(filename)
  205. if blob._type != cls._type:
  206. raise NotBlobError(filename)
  207. return blob
  208. @classmethod
  209. def from_string(cls, string):
  210. """Create a blob from a string."""
  211. shafile = cls()
  212. shafile.set_raw_string(string)
  213. return shafile
  214. class Tag(ShaFile):
  215. """A Git Tag object."""
  216. _type = TAG_ID
  217. _num_type = 4
  218. @classmethod
  219. def from_file(cls, filename):
  220. blob = ShaFile.from_file(filename)
  221. if blob._type != cls._type:
  222. raise NotBlobError(filename)
  223. return blob
  224. @classmethod
  225. def from_string(cls, string):
  226. """Create a blob from a string."""
  227. shafile = cls()
  228. shafile.set_raw_string(string)
  229. return shafile
  230. def _parse_text(self):
  231. """Grab the metadata attached to the tag"""
  232. text = self._text
  233. count = 0
  234. assert text.startswith(OBJECT_ID), "Invalid tag object, " \
  235. "must start with %s" % OBJECT_ID
  236. count += len(OBJECT_ID)
  237. assert text[count] == ' ', "Invalid tag object, " \
  238. "%s must be followed by space not %s" % (OBJECT_ID, text[count])
  239. count += 1
  240. self._object_sha = text[count:count+40]
  241. count += 40
  242. assert text[count] == '\n', "Invalid tag object, " \
  243. "%s sha must be followed by newline" % OBJECT_ID
  244. count += 1
  245. assert text[count:].startswith(TYPE_ID), "Invalid tag object, " \
  246. "%s sha must be followed by %s" % (OBJECT_ID, TYPE_ID)
  247. count += len(TYPE_ID)
  248. assert text[count] == ' ', "Invalid tag object, " \
  249. "%s must be followed by space not %s" % (TAG_ID, text[count])
  250. count += 1
  251. self._object_type = ""
  252. while text[count] != '\n':
  253. self._object_type += text[count]
  254. count += 1
  255. count += 1
  256. assert self._object_type in (COMMIT_ID, BLOB_ID, TREE_ID, TAG_ID), "Invalid tag object, " \
  257. "unexpected object type %s" % self._object_type
  258. self._object_type = type_map[self._object_type]
  259. assert text[count:].startswith(TAG_ID), "Invalid tag object, " \
  260. "object type must be followed by %s" % (TAG_ID)
  261. count += len(TAG_ID)
  262. assert text[count] == ' ', "Invalid tag object, " \
  263. "%s must be followed by space not %s" % (TAG_ID, text[count])
  264. count += 1
  265. self._name = ""
  266. while text[count] != '\n':
  267. self._name += text[count]
  268. count += 1
  269. count += 1
  270. assert text[count:].startswith(TAGGER_ID), "Invalid tag object, " \
  271. "%s must be followed by %s" % (TAG_ID, TAGGER_ID)
  272. count += len(TAGGER_ID)
  273. assert text[count] == ' ', "Invalid tag object, " \
  274. "%s must be followed by space not %s" % (TAGGER_ID, text[count])
  275. count += 1
  276. self._tagger = ""
  277. while text[count] != '>':
  278. assert text[count] != '\n', "Malformed tagger information"
  279. self._tagger += text[count]
  280. count += 1
  281. self._tagger += text[count]
  282. count += 1
  283. assert text[count] == ' ', "Invalid tag object, " \
  284. "tagger information must be followed by space not %s" % text[count]
  285. count += 1
  286. self._tag_time = int(text[count:count+10])
  287. while text[count] != '\n':
  288. count += 1
  289. count += 1
  290. assert text[count] == '\n', "There must be a new line after the headers"
  291. count += 1
  292. self._message = text[count:]
  293. self._needs_parsing = False
  294. def get_object(self):
  295. """Returns the object pointed by this tag, represented as a tuple(type, sha)"""
  296. self._ensure_parsed()
  297. return (self._object_type, self._object_sha)
  298. object = property(get_object)
  299. name = serializable_property("name", "The name of this tag")
  300. tagger = serializable_property("tagger",
  301. "Returns the name of the person who created this tag")
  302. tag_time = serializable_property("tag_time",
  303. "The creation timestamp of the tag. As the number of seconds since the epoch")
  304. message = serializable_property("message", "The message attached to this tag")
  305. def parse_tree(text):
  306. ret = {}
  307. count = 0
  308. while count < len(text):
  309. mode = 0
  310. chr = text[count]
  311. while chr != ' ':
  312. assert chr >= '0' and chr <= '7', "%s is not a valid mode char" % chr
  313. mode = (mode << 3) + (ord(chr) - ord('0'))
  314. count += 1
  315. chr = text[count]
  316. count += 1
  317. chr = text[count]
  318. name = ''
  319. while chr != '\0':
  320. name += chr
  321. count += 1
  322. chr = text[count]
  323. count += 1
  324. chr = text[count]
  325. sha = text[count:count+20]
  326. hexsha = sha_to_hex(sha)
  327. ret[name] = (mode, hexsha)
  328. count = count + 20
  329. return ret
  330. class Tree(ShaFile):
  331. """A Git tree object"""
  332. _type = TREE_ID
  333. _num_type = 2
  334. def __init__(self):
  335. super(Tree, self).__init__()
  336. self._entries = {}
  337. self._needs_parsing = False
  338. self._needs_serialization = True
  339. @classmethod
  340. def from_file(cls, filename):
  341. tree = ShaFile.from_file(filename)
  342. if tree._type != cls._type:
  343. raise NotTreeError(filename)
  344. return tree
  345. def __contains__(self, name):
  346. self._ensure_parsed()
  347. return name in self._entries
  348. def __getitem__(self, name):
  349. self._ensure_parsed()
  350. return self._entries[name]
  351. def __setitem__(self, name, value):
  352. assert isinstance(value, tuple)
  353. assert len(value) == 2
  354. self._ensure_parsed()
  355. self._entries[name] = value
  356. self._needs_serialization = True
  357. def __delitem__(self, name):
  358. self._ensure_parsed()
  359. del self._entries[name]
  360. self._needs_serialization = True
  361. def add(self, mode, name, hexsha):
  362. assert type(mode) == int
  363. assert type(name) == str
  364. assert type(hexsha) == str
  365. self._ensure_parsed()
  366. self._entries[name] = mode, hexsha
  367. self._needs_serialization = True
  368. def entries(self):
  369. """Return a list of tuples describing the tree entries"""
  370. self._ensure_parsed()
  371. # The order of this is different from iteritems() for historical reasons
  372. return [(mode, name, hexsha) for (name, mode, hexsha) in self.iteritems()]
  373. def iteritems(self):
  374. def cmp_entry((name1, value1), (name2, value2)):
  375. if stat.S_ISDIR(value1[0]):
  376. name1 += "/"
  377. if stat.S_ISDIR(value2[0]):
  378. name2 += "/"
  379. return cmp(name1, name2)
  380. self._ensure_parsed()
  381. for name, entry in sorted(self._entries.iteritems(), cmp=cmp_entry):
  382. yield name, entry[0], entry[1]
  383. def _parse_text(self):
  384. """Grab the entries in the tree"""
  385. self._entries = parse_tree(self._text)
  386. self._needs_parsing = False
  387. def serialize(self):
  388. self._text = ""
  389. for name, mode, hexsha in self.iteritems():
  390. self._text += "%04o %s\0%s" % (mode, name, hex_to_sha(hexsha))
  391. self._needs_serialization = False
  392. def as_pretty_string(self):
  393. text = ""
  394. for name, mode, hexsha in self.iteritems():
  395. if mode & stat.S_IFDIR:
  396. kind = "tree"
  397. else:
  398. kind = "blob"
  399. text += "%04o %s %s\t%s\n" % (mode, kind, hexsha, name)
  400. return text
  401. def parse_timezone(text):
  402. offset = int(text)
  403. hours = int(offset / 100)
  404. minutes = (offset % 100)
  405. return (hours * 3600) + (minutes * 60)
  406. def format_timezone(offset):
  407. if offset % 60 != 0:
  408. raise ValueError("Unable to handle non-minute offset.")
  409. return '%+03d%02d' % (offset / 3600, (offset / 60) % 60)
  410. class Commit(ShaFile):
  411. """A git commit object"""
  412. _type = COMMIT_ID
  413. _num_type = 1
  414. def __init__(self):
  415. super(Commit, self).__init__()
  416. self._parents = []
  417. self._needs_parsing = False
  418. self._needs_serialization = True
  419. @classmethod
  420. def from_file(cls, filename):
  421. commit = ShaFile.from_file(filename)
  422. if commit._type != cls._type:
  423. raise NotCommitError(filename)
  424. return commit
  425. def _parse_text(self):
  426. text = self._text
  427. count = 0
  428. assert text.startswith(TREE_ID), "Invalid commit object, " \
  429. "must start with %s" % TREE_ID
  430. count += len(TREE_ID)
  431. assert text[count] == ' ', "Invalid commit object, " \
  432. "%s must be followed by space not %s" % (TREE_ID, text[count])
  433. count += 1
  434. self._tree = text[count:count+40]
  435. count = count + 40
  436. assert text[count] == "\n", "Invalid commit object, " \
  437. "tree sha must be followed by newline"
  438. count += 1
  439. self._parents = []
  440. while text[count:].startswith(PARENT_ID):
  441. count += len(PARENT_ID)
  442. assert text[count] == ' ', "Invalid commit object, " \
  443. "%s must be followed by space not %s" % (PARENT_ID, text[count])
  444. count += 1
  445. self._parents.append(text[count:count+40])
  446. count += 40
  447. assert text[count] == "\n", "Invalid commit object, " \
  448. "parent sha must be followed by newline"
  449. count += 1
  450. self._author = None
  451. if text[count:].startswith(AUTHOR_ID):
  452. count += len(AUTHOR_ID)
  453. assert text[count] == ' ', "Invalid commit object, " \
  454. "%s must be followed by space not %s" % (AUTHOR_ID, text[count])
  455. count += 1
  456. self._author = ''
  457. while text[count] != '>':
  458. assert text[count] != '\n', "Malformed author information"
  459. self._author += text[count]
  460. count += 1
  461. self._author += text[count]
  462. count += 1
  463. assert text[count] == ' ', "Invalid commit object, " \
  464. "author information must be followed by space not %s" % text[count]
  465. count += 1
  466. time_text = ''
  467. while text[count] != ' ':
  468. assert text[count] != '\n', "Malformed author information"
  469. time_text += text[count]
  470. count += 1
  471. self._author_time = int(time_text)
  472. self._author_timezone = parse_timezone(text[count:count+6])
  473. count += 1
  474. while text[count] != '\n':
  475. count += 1
  476. count += 1
  477. self._committer = None
  478. if text[count:].startswith(COMMITTER_ID):
  479. count += len(COMMITTER_ID)
  480. assert text[count] == ' ', "Invalid commit object, " \
  481. "%s must be followed by space not %s" % (COMMITTER_ID, text[count])
  482. count += 1
  483. self._committer = ''
  484. while text[count] != '>':
  485. assert text[count] != '\n', "Malformed committer information"
  486. self._committer += text[count]
  487. count += 1
  488. self._committer += text[count]
  489. count += 1
  490. assert text[count] == ' ', "Invalid commit object, " \
  491. "commiter information must be followed by space not %s" % text[count]
  492. count += 1
  493. time_text = ""
  494. while text[count] != ' ':
  495. assert text[count] != '\n', "Malformed committer information"
  496. time_text += text[count]
  497. count += 1
  498. self._commit_time = int(time_text)
  499. self._commit_timezone = parse_timezone(text[count:count+6])
  500. count += 1
  501. while text[count] != '\n':
  502. count += 1
  503. count += 1
  504. assert text[count] == '\n', "There must be a new line after the headers"
  505. count += 1
  506. # XXX: There can be an encoding field.
  507. self._message = text[count:]
  508. self._needs_parsing = False
  509. def serialize(self):
  510. self._text = ""
  511. self._text += "%s %s\n" % (TREE_ID, self._tree)
  512. for p in self._parents:
  513. self._text += "%s %s\n" % (PARENT_ID, p)
  514. self._text += "%s %s %s %s\n" % (AUTHOR_ID, self._author, str(self._author_time), format_timezone(self._author_timezone))
  515. self._text += "%s %s %s %s\n" % (COMMITTER_ID, self._committer, str(self._commit_time), format_timezone(self._commit_timezone))
  516. self._text += "\n" # There must be a new line after the headers
  517. self._text += self._message
  518. self._needs_serialization = False
  519. tree = serializable_property("tree", "Tree that is the state of this commit")
  520. def get_parents(self):
  521. """Return a list of parents of this commit."""
  522. self._ensure_parsed()
  523. return self._parents
  524. def set_parents(self, value):
  525. """Return a list of parents of this commit."""
  526. self._ensure_parsed()
  527. self._needs_serialization = True
  528. self._parents = value
  529. parents = property(get_parents, set_parents)
  530. author = serializable_property("author",
  531. "The name of the author of the commit")
  532. committer = serializable_property("committer",
  533. "The name of the committer of the commit")
  534. message = serializable_property("message",
  535. "The commit message")
  536. commit_time = serializable_property("commit_time",
  537. "The timestamp of the commit. As the number of seconds since the epoch.")
  538. commit_timezone = serializable_property("commit_timezone",
  539. "The zone the commit time is in")
  540. author_time = serializable_property("author_time",
  541. "The timestamp the commit was written. as the number of seconds since the epoch.")
  542. author_timezone = serializable_property("author_timezone",
  543. "Returns the zone the author time is in.")
  544. type_map = {
  545. BLOB_ID : Blob,
  546. TREE_ID : Tree,
  547. COMMIT_ID : Commit,
  548. TAG_ID: Tag,
  549. }
  550. num_type_map = {
  551. 0: None,
  552. 1: Commit,
  553. 2: Tree,
  554. 3: Blob,
  555. 4: Tag,
  556. # 5 Is reserved for further expansion
  557. }
  558. try:
  559. # Try to import C versions
  560. from dulwich._objects import hex_to_sha, sha_to_hex, parse_tree
  561. except ImportError:
  562. pass