2
0

objects.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627
  1. # objects.py -- Access to base git objects
  2. # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
  3. # Copyright (C) 2008-2009 Jelmer Vernooij <jelmer@samba.org>
  4. #
  5. # This program is free software; you can redistribute it and/or
  6. # modify it under the terms of the GNU General Public License
  7. # as published by the Free Software Foundation; version 2
  8. # of the License or (at your option) a later version of the License.
  9. #
  10. # This program is distributed in the hope that it will be useful,
  11. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. # GNU General Public License for more details.
  14. #
  15. # You should have received a copy of the GNU General Public License
  16. # along with this program; if not, write to the Free Software
  17. # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  18. # MA 02110-1301, USA.
  19. """Access to base git objects."""
  20. import binascii
  21. from cStringIO import (
  22. StringIO,
  23. )
  24. import mmap
  25. import os
  26. import stat
  27. import time
  28. import zlib
  29. from dulwich.errors import (
  30. NotBlobError,
  31. NotCommitError,
  32. NotTreeError,
  33. )
  34. from dulwich.misc import (
  35. make_sha,
  36. )
  37. BLOB_ID = "blob"
  38. TAG_ID = "tag"
  39. TREE_ID = "tree"
  40. COMMIT_ID = "commit"
  41. PARENT_ID = "parent"
  42. AUTHOR_ID = "author"
  43. COMMITTER_ID = "committer"
  44. OBJECT_ID = "object"
  45. TYPE_ID = "type"
  46. TAGGER_ID = "tagger"
  47. ENCODING_ID = "encoding"
  48. S_IFGITLINK = 0160000
  49. def S_ISGITLINK(m):
  50. return (stat.S_IFMT(m) == S_IFGITLINK)
  51. def _decompress(string):
  52. dcomp = zlib.decompressobj()
  53. dcomped = dcomp.decompress(string)
  54. dcomped += dcomp.flush()
  55. return dcomped
  56. def sha_to_hex(sha):
  57. """Takes a string and returns the hex of the sha within"""
  58. hexsha = binascii.hexlify(sha)
  59. assert len(hexsha) == 40, "Incorrect length of sha1 string: %d" % hexsha
  60. return hexsha
  61. def hex_to_sha(hex):
  62. """Takes a hex sha and returns a binary sha"""
  63. assert len(hex) == 40, "Incorrent length of hexsha: %s" % hex
  64. return binascii.unhexlify(hex)
  65. def serializable_property(name, docstring=None):
  66. def set(obj, value):
  67. obj._ensure_parsed()
  68. setattr(obj, "_"+name, value)
  69. obj._needs_serialization = True
  70. def get(obj):
  71. obj._ensure_parsed()
  72. return getattr(obj, "_"+name)
  73. return property(get, set, doc=docstring)
  74. class ShaFile(object):
  75. """A git SHA file."""
  76. @classmethod
  77. def _parse_legacy_object(cls, map):
  78. """Parse a legacy object, creating it and setting object._text"""
  79. text = _decompress(map)
  80. object = None
  81. for posstype in type_map.keys():
  82. if text.startswith(posstype):
  83. object = type_map[posstype]()
  84. text = text[len(posstype):]
  85. break
  86. assert object is not None, "%s is not a known object type" % text[:9]
  87. assert text[0] == ' ', "%s is not a space" % text[0]
  88. text = text[1:]
  89. size = 0
  90. i = 0
  91. while text[0] >= '0' and text[0] <= '9':
  92. if i > 0 and size == 0:
  93. raise AssertionError("Size is not in canonical format")
  94. size = (size * 10) + int(text[0])
  95. text = text[1:]
  96. i += 1
  97. object._size = size
  98. assert text[0] == "\0", "Size not followed by null"
  99. text = text[1:]
  100. object.set_raw_string(text)
  101. return object
  102. def as_legacy_object(self):
  103. text = self.as_raw_string()
  104. return zlib.compress("%s %d\0%s" % (self._type, len(text), text))
  105. def as_raw_string(self):
  106. if self._needs_serialization:
  107. self.serialize()
  108. return self._text
  109. def __str__(self):
  110. return self.as_raw_string()
  111. def __hash__(self):
  112. return hash(self.id)
  113. def as_pretty_string(self):
  114. return self.as_raw_string()
  115. def _ensure_parsed(self):
  116. if self._needs_parsing:
  117. self._parse_text()
  118. def set_raw_string(self, text):
  119. if type(text) != str:
  120. raise TypeError(text)
  121. self._text = text
  122. self._sha = None
  123. self._needs_parsing = True
  124. self._needs_serialization = False
  125. @classmethod
  126. def _parse_object(cls, map):
  127. """Parse a new style object , creating it and setting object._text"""
  128. used = 0
  129. byte = ord(map[used])
  130. used += 1
  131. num_type = (byte >> 4) & 7
  132. try:
  133. object = num_type_map[num_type]()
  134. except KeyError:
  135. raise AssertionError("Not a known type: %d" % num_type)
  136. while (byte & 0x80) != 0:
  137. byte = ord(map[used])
  138. used += 1
  139. raw = map[used:]
  140. object.set_raw_string(_decompress(raw))
  141. return object
  142. @classmethod
  143. def _parse_file(cls, map):
  144. word = (ord(map[0]) << 8) + ord(map[1])
  145. if ord(map[0]) == 0x78 and (word % 31) == 0:
  146. return cls._parse_legacy_object(map)
  147. else:
  148. return cls._parse_object(map)
  149. def __init__(self):
  150. """Don't call this directly"""
  151. self._sha = None
  152. def _parse_text(self):
  153. """For subclasses to do initialisation time parsing"""
  154. @classmethod
  155. def from_file(cls, filename):
  156. """Get the contents of a SHA file on disk"""
  157. size = os.path.getsize(filename)
  158. f = open(filename, 'rb')
  159. try:
  160. map = mmap.mmap(f.fileno(), size, access=mmap.ACCESS_READ)
  161. shafile = cls._parse_file(map)
  162. return shafile
  163. finally:
  164. f.close()
  165. @classmethod
  166. def from_raw_string(cls, type, string):
  167. """Creates an object of the indicated type from the raw string given.
  168. Type is the numeric type of an object. String is the raw uncompressed
  169. contents.
  170. """
  171. real_class = num_type_map[type]
  172. obj = real_class()
  173. obj.type = type
  174. obj.set_raw_string(string)
  175. return obj
  176. def _header(self):
  177. return "%s %lu\0" % (self._type, len(self.as_raw_string()))
  178. def sha(self):
  179. """The SHA1 object that is the name of this object."""
  180. if self._needs_serialization or self._sha is None:
  181. self._sha = make_sha()
  182. self._sha.update(self._header())
  183. self._sha.update(self.as_raw_string())
  184. return self._sha
  185. @property
  186. def id(self):
  187. return self.sha().hexdigest()
  188. def get_type(self):
  189. return self._num_type
  190. def set_type(self, type):
  191. self._num_type = type
  192. type = property(get_type, set_type)
  193. def __repr__(self):
  194. return "<%s %s>" % (self.__class__.__name__, self.id)
  195. def __ne__(self, other):
  196. return self.id != other.id
  197. def __eq__(self, other):
  198. """Return true id the sha of the two objects match.
  199. The __le__ etc methods aren't overriden as they make no sense,
  200. certainly at this level.
  201. """
  202. return self.id == other.id
  203. class Blob(ShaFile):
  204. """A Git Blob object."""
  205. _type = BLOB_ID
  206. _num_type = 3
  207. _needs_serialization = False
  208. _needs_parsing = False
  209. def get_data(self):
  210. return self._text
  211. def set_data(self, data):
  212. self._text = data
  213. data = property(get_data, set_data,
  214. "The text contained within the blob object.")
  215. @classmethod
  216. def from_file(cls, filename):
  217. blob = ShaFile.from_file(filename)
  218. if blob._type != cls._type:
  219. raise NotBlobError(filename)
  220. return blob
  221. @classmethod
  222. def from_string(cls, string):
  223. """Create a blob from a string."""
  224. shafile = cls()
  225. shafile.set_raw_string(string)
  226. return shafile
  227. class Tag(ShaFile):
  228. """A Git Tag object."""
  229. _type = TAG_ID
  230. _num_type = 4
  231. def __init__(self):
  232. super(Tag, self).__init__()
  233. self._needs_parsing = False
  234. self._needs_serialization = True
  235. @classmethod
  236. def from_file(cls, filename):
  237. blob = ShaFile.from_file(filename)
  238. if blob._type != cls._type:
  239. raise NotBlobError(filename)
  240. return blob
  241. @classmethod
  242. def from_string(cls, string):
  243. """Create a blob from a string."""
  244. shafile = cls()
  245. shafile.set_raw_string(string)
  246. return shafile
  247. def serialize(self):
  248. f = StringIO()
  249. f.write("%s %s\n" % (OBJECT_ID, self._object_sha))
  250. f.write("%s %s\n" % (TYPE_ID, num_type_map[self._object_type]._type))
  251. f.write("%s %s\n" % (TAG_ID, self._name))
  252. if self._tagger:
  253. if self._tag_time is None:
  254. f.write("%s %s\n" % (TAGGER_ID, self._tagger))
  255. else:
  256. f.write("%s %s %d %s\n" % (TAGGER_ID, self._tagger, self._tag_time, format_timezone(self._tag_timezone)))
  257. f.write("\n") # To close headers
  258. f.write(self._message)
  259. self._text = f.getvalue()
  260. self._needs_serialization = False
  261. def _parse_text(self):
  262. """Grab the metadata attached to the tag"""
  263. self._tagger = None
  264. f = StringIO(self._text)
  265. for l in f:
  266. l = l.rstrip("\n")
  267. if l == "":
  268. break # empty line indicates end of headers
  269. (field, value) = l.split(" ", 1)
  270. if field == OBJECT_ID:
  271. self._object_sha = value
  272. elif field == TYPE_ID:
  273. self._object_type = type_map[value]
  274. elif field == TAG_ID:
  275. self._name = value
  276. elif field == TAGGER_ID:
  277. try:
  278. sep = value.index("> ")
  279. except ValueError:
  280. self._tagger = value
  281. self._tag_time = None
  282. self._tag_timezone = None
  283. else:
  284. self._tagger = value[0:sep+1]
  285. (timetext, timezonetext) = value[sep+2:].rsplit(" ", 1)
  286. try:
  287. self._tag_time = int(timetext)
  288. except ValueError: #Not a unix timestamp
  289. self._tag_time = time.strptime(timetext)
  290. self._tag_timezone = parse_timezone(timezonetext)
  291. else:
  292. raise AssertionError("Unknown field %s" % field)
  293. self._message = f.read()
  294. self._needs_parsing = False
  295. def get_object(self):
  296. """Returns the object pointed by this tag, represented as a tuple(type, sha)"""
  297. self._ensure_parsed()
  298. return (self._object_type, self._object_sha)
  299. def set_object(self, value):
  300. self._ensure_parsed()
  301. (self._object_type, self._object_sha) = value
  302. self._needs_serialization = True
  303. object = property(get_object, set_object)
  304. name = serializable_property("name", "The name of this tag")
  305. tagger = serializable_property("tagger",
  306. "Returns the name of the person who created this tag")
  307. tag_time = serializable_property("tag_time",
  308. "The creation timestamp of the tag. As the number of seconds since the epoch")
  309. tag_timezone = serializable_property("tag_timezone",
  310. "The timezone that tag_time is in.")
  311. message = serializable_property("message", "The message attached to this tag")
  312. def parse_tree(text):
  313. ret = {}
  314. count = 0
  315. l = len(text)
  316. while count < l:
  317. mode_end = text.index(' ', count)
  318. mode = int(text[count:mode_end], 8)
  319. name_end = text.index('\0', mode_end)
  320. name = text[mode_end+1:name_end]
  321. count = name_end+21
  322. sha = text[name_end+1:count]
  323. ret[name] = (mode, sha_to_hex(sha))
  324. return ret
  325. class Tree(ShaFile):
  326. """A Git tree object"""
  327. _type = TREE_ID
  328. _num_type = 2
  329. def __init__(self):
  330. super(Tree, self).__init__()
  331. self._entries = {}
  332. self._needs_parsing = False
  333. self._needs_serialization = True
  334. @classmethod
  335. def from_file(cls, filename):
  336. tree = ShaFile.from_file(filename)
  337. if tree._type != cls._type:
  338. raise NotTreeError(filename)
  339. return tree
  340. def __contains__(self, name):
  341. self._ensure_parsed()
  342. return name in self._entries
  343. def __getitem__(self, name):
  344. self._ensure_parsed()
  345. return self._entries[name]
  346. def __setitem__(self, name, value):
  347. assert isinstance(value, tuple)
  348. assert len(value) == 2
  349. self._ensure_parsed()
  350. self._entries[name] = value
  351. self._needs_serialization = True
  352. def __delitem__(self, name):
  353. self._ensure_parsed()
  354. del self._entries[name]
  355. self._needs_serialization = True
  356. def __len__(self):
  357. self._ensure_parsed()
  358. return len(self._entries)
  359. def add(self, mode, name, hexsha):
  360. assert type(mode) == int
  361. assert type(name) == str
  362. assert type(hexsha) == str
  363. self._ensure_parsed()
  364. self._entries[name] = mode, hexsha
  365. self._needs_serialization = True
  366. def entries(self):
  367. """Return a list of tuples describing the tree entries"""
  368. self._ensure_parsed()
  369. # The order of this is different from iteritems() for historical reasons
  370. return [(mode, name, hexsha) for (name, mode, hexsha) in self.iteritems()]
  371. def iteritems(self):
  372. def cmp_entry((name1, value1), (name2, value2)):
  373. if stat.S_ISDIR(value1[0]):
  374. name1 += "/"
  375. if stat.S_ISDIR(value2[0]):
  376. name2 += "/"
  377. return cmp(name1, name2)
  378. self._ensure_parsed()
  379. for name, entry in sorted(self._entries.iteritems(), cmp=cmp_entry):
  380. yield name, entry[0], entry[1]
  381. def _parse_text(self):
  382. """Grab the entries in the tree"""
  383. self._entries = parse_tree(self._text)
  384. self._needs_parsing = False
  385. def serialize(self):
  386. f = StringIO()
  387. for name, mode, hexsha in self.iteritems():
  388. f.write("%04o %s\0%s" % (mode, name, hex_to_sha(hexsha)))
  389. self._text = f.getvalue()
  390. self._needs_serialization = False
  391. def as_pretty_string(self):
  392. text = ""
  393. for name, mode, hexsha in self.iteritems():
  394. if mode & stat.S_IFDIR:
  395. kind = "tree"
  396. else:
  397. kind = "blob"
  398. text += "%04o %s %s\t%s\n" % (mode, kind, hexsha, name)
  399. return text
  400. def parse_timezone(text):
  401. offset = int(text)
  402. signum = (offset < 0) and -1 or 1
  403. offset = abs(offset)
  404. hours = int(offset / 100)
  405. minutes = (offset % 100)
  406. return signum * (hours * 3600 + minutes * 60)
  407. def format_timezone(offset):
  408. if offset % 60 != 0:
  409. raise ValueError("Unable to handle non-minute offset.")
  410. sign = (offset < 0) and '-' or '+'
  411. offset = abs(offset)
  412. return '%c%02d%02d' % (sign, offset / 3600, (offset / 60) % 60)
  413. class Commit(ShaFile):
  414. """A git commit object"""
  415. _type = COMMIT_ID
  416. _num_type = 1
  417. def __init__(self):
  418. super(Commit, self).__init__()
  419. self._parents = []
  420. self._encoding = None
  421. self._needs_parsing = False
  422. self._needs_serialization = True
  423. @classmethod
  424. def from_file(cls, filename):
  425. commit = ShaFile.from_file(filename)
  426. if commit._type != cls._type:
  427. raise NotCommitError(filename)
  428. return commit
  429. def _parse_text(self):
  430. self._parents = []
  431. self._author = None
  432. f = StringIO(self._text)
  433. for l in f:
  434. l = l.rstrip("\n")
  435. if l == "":
  436. # Empty line indicates end of headers
  437. break
  438. (field, value) = l.split(" ", 1)
  439. if field == TREE_ID:
  440. self._tree = value
  441. elif field == PARENT_ID:
  442. self._parents.append(value)
  443. elif field == AUTHOR_ID:
  444. self._author, timetext, timezonetext = value.rsplit(" ", 2)
  445. self._author_time = int(timetext)
  446. self._author_timezone = parse_timezone(timezonetext)
  447. elif field == COMMITTER_ID:
  448. self._committer, timetext, timezonetext = value.rsplit(" ", 2)
  449. self._commit_time = int(timetext)
  450. self._commit_timezone = parse_timezone(timezonetext)
  451. elif field == ENCODING_ID:
  452. self._encoding = value
  453. else:
  454. raise AssertionError("Unknown field %s" % field)
  455. self._message = f.read()
  456. self._needs_parsing = False
  457. def serialize(self):
  458. f = StringIO()
  459. f.write("%s %s\n" % (TREE_ID, self._tree))
  460. for p in self._parents:
  461. f.write("%s %s\n" % (PARENT_ID, p))
  462. f.write("%s %s %s %s\n" % (AUTHOR_ID, self._author, str(self._author_time), format_timezone(self._author_timezone)))
  463. f.write("%s %s %s %s\n" % (COMMITTER_ID, self._committer, str(self._commit_time), format_timezone(self._commit_timezone)))
  464. if self.encoding:
  465. f.write("%s %s\n" % (ENCODING_ID, self.encoding))
  466. f.write("\n") # There must be a new line after the headers
  467. f.write(self._message)
  468. self._text = f.getvalue()
  469. self._needs_serialization = False
  470. tree = serializable_property("tree", "Tree that is the state of this commit")
  471. def get_parents(self):
  472. """Return a list of parents of this commit."""
  473. self._ensure_parsed()
  474. return self._parents
  475. def set_parents(self, value):
  476. """Return a list of parents of this commit."""
  477. self._ensure_parsed()
  478. self._needs_serialization = True
  479. self._parents = value
  480. parents = property(get_parents, set_parents)
  481. author = serializable_property("author",
  482. "The name of the author of the commit")
  483. committer = serializable_property("committer",
  484. "The name of the committer of the commit")
  485. message = serializable_property("message",
  486. "The commit message")
  487. commit_time = serializable_property("commit_time",
  488. "The timestamp of the commit. As the number of seconds since the epoch.")
  489. commit_timezone = serializable_property("commit_timezone",
  490. "The zone the commit time is in")
  491. author_time = serializable_property("author_time",
  492. "The timestamp the commit was written. as the number of seconds since the epoch.")
  493. author_timezone = serializable_property("author_timezone",
  494. "Returns the zone the author time is in.")
  495. encoding = serializable_property("encoding",
  496. "Encoding of the commit message.")
  497. type_map = {
  498. BLOB_ID : Blob,
  499. TREE_ID : Tree,
  500. COMMIT_ID : Commit,
  501. TAG_ID: Tag,
  502. }
  503. num_type_map = {
  504. 0: None,
  505. 1: Commit,
  506. 2: Tree,
  507. 3: Blob,
  508. 4: Tag,
  509. # 5 Is reserved for further expansion
  510. }
  511. try:
  512. # Try to import C versions
  513. from dulwich._objects import parse_tree
  514. except ImportError:
  515. pass