index.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699
  1. # index.py -- File parser/writer for the git index file
  2. # Copyright (C) 2008-2013 Jelmer Vernooij <jelmer@samba.org>
  3. #
  4. # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
  5. # General Public License as published by the Free Software Foundation; version 2.0
  6. # or (at your option) any later version. You can redistribute it and/or
  7. # modify it under the terms of either of these two licenses.
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. #
  15. # You should have received a copy of the licenses; if not, see
  16. # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  17. # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  18. # License, Version 2.0.
  19. #
  20. """Parser for the git index file format."""
  21. import collections
  22. import errno
  23. import os
  24. import stat
  25. import struct
  26. import sys
  27. from dulwich.file import GitFile
  28. from dulwich.objects import (
  29. Blob,
  30. S_IFGITLINK,
  31. S_ISGITLINK,
  32. Tree,
  33. hex_to_sha,
  34. sha_to_hex,
  35. )
  36. from dulwich.pack import (
  37. SHA1Reader,
  38. SHA1Writer,
  39. )
  40. IndexEntry = collections.namedtuple(
  41. 'IndexEntry', [
  42. 'ctime', 'mtime', 'dev', 'ino', 'mode', 'uid', 'gid', 'size', 'sha',
  43. 'flags'])
  44. def pathsplit(path):
  45. """Split a /-delimited path into a directory part and a basename.
  46. :param path: The path to split.
  47. :return: Tuple with directory name and basename
  48. """
  49. try:
  50. (dirname, basename) = path.rsplit(b"/", 1)
  51. except ValueError:
  52. return (b"", path)
  53. else:
  54. return (dirname, basename)
  55. def pathjoin(*args):
  56. """Join a /-delimited path.
  57. """
  58. return b"/".join([p for p in args if p])
  59. def read_cache_time(f):
  60. """Read a cache time.
  61. :param f: File-like object to read from
  62. :return: Tuple with seconds and nanoseconds
  63. """
  64. return struct.unpack(">LL", f.read(8))
  65. def write_cache_time(f, t):
  66. """Write a cache time.
  67. :param f: File-like object to write to
  68. :param t: Time to write (as int, float or tuple with secs and nsecs)
  69. """
  70. if isinstance(t, int):
  71. t = (t, 0)
  72. elif isinstance(t, float):
  73. (secs, nsecs) = divmod(t, 1.0)
  74. t = (int(secs), int(nsecs * 1000000000))
  75. elif not isinstance(t, tuple):
  76. raise TypeError(t)
  77. f.write(struct.pack(">LL", *t))
  78. def read_cache_entry(f):
  79. """Read an entry from a cache file.
  80. :param f: File-like object to read from
  81. :return: tuple with: device, inode, mode, uid, gid, size, sha, flags
  82. """
  83. beginoffset = f.tell()
  84. ctime = read_cache_time(f)
  85. mtime = read_cache_time(f)
  86. (dev, ino, mode, uid, gid, size, sha, flags, ) = \
  87. struct.unpack(">LLLLLL20sH", f.read(20 + 4 * 6 + 2))
  88. name = f.read((flags & 0x0fff))
  89. # Padding:
  90. real_size = ((f.tell() - beginoffset + 8) & ~7)
  91. f.read((beginoffset + real_size) - f.tell())
  92. return (name, ctime, mtime, dev, ino, mode, uid, gid, size,
  93. sha_to_hex(sha), flags & ~0x0fff)
  94. def write_cache_entry(f, entry):
  95. """Write an index entry to a file.
  96. :param f: File object
  97. :param entry: Entry to write, tuple with:
  98. (name, ctime, mtime, dev, ino, mode, uid, gid, size, sha, flags)
  99. """
  100. beginoffset = f.tell()
  101. (name, ctime, mtime, dev, ino, mode, uid, gid, size, sha, flags) = entry
  102. write_cache_time(f, ctime)
  103. write_cache_time(f, mtime)
  104. flags = len(name) | (flags & ~0x0fff)
  105. f.write(struct.pack(
  106. b'>LLLLLL20sH', dev & 0xFFFFFFFF, ino & 0xFFFFFFFF,
  107. mode, uid, gid, size, hex_to_sha(sha), flags))
  108. f.write(name)
  109. real_size = ((f.tell() - beginoffset + 8) & ~7)
  110. f.write(b'\0' * ((beginoffset + real_size) - f.tell()))
  111. def read_index(f):
  112. """Read an index file, yielding the individual entries."""
  113. header = f.read(4)
  114. if header != b'DIRC':
  115. raise AssertionError("Invalid index file header: %r" % header)
  116. (version, num_entries) = struct.unpack(b'>LL', f.read(4 * 2))
  117. assert version in (1, 2)
  118. for i in range(num_entries):
  119. yield read_cache_entry(f)
  120. def read_index_dict(f):
  121. """Read an index file and return it as a dictionary.
  122. :param f: File object to read from
  123. """
  124. ret = {}
  125. for x in read_index(f):
  126. ret[x[0]] = IndexEntry(*x[1:])
  127. return ret
  128. def write_index(f, entries):
  129. """Write an index file.
  130. :param f: File-like object to write to
  131. :param entries: Iterable over the entries to write
  132. """
  133. f.write(b'DIRC')
  134. f.write(struct.pack(b'>LL', 2, len(entries)))
  135. for x in entries:
  136. write_cache_entry(f, x)
  137. def write_index_dict(f, entries):
  138. """Write an index file based on the contents of a dictionary.
  139. """
  140. entries_list = []
  141. for name in sorted(entries):
  142. entries_list.append((name,) + tuple(entries[name]))
  143. write_index(f, entries_list)
  144. def cleanup_mode(mode):
  145. """Cleanup a mode value.
  146. This will return a mode that can be stored in a tree object.
  147. :param mode: Mode to clean up.
  148. """
  149. if stat.S_ISLNK(mode):
  150. return stat.S_IFLNK
  151. elif stat.S_ISDIR(mode):
  152. return stat.S_IFDIR
  153. elif S_ISGITLINK(mode):
  154. return S_IFGITLINK
  155. ret = stat.S_IFREG | 0o644
  156. ret |= (mode & 0o111)
  157. return ret
  158. class Index(object):
  159. """A Git Index file."""
  160. def __init__(self, filename):
  161. """Open an index file.
  162. :param filename: Path to the index file
  163. """
  164. self._filename = filename
  165. self.clear()
  166. self.read()
  167. @property
  168. def path(self):
  169. return self._filename
  170. def __repr__(self):
  171. return "%s(%r)" % (self.__class__.__name__, self._filename)
  172. def write(self):
  173. """Write current contents of index to disk."""
  174. f = GitFile(self._filename, 'wb')
  175. try:
  176. f = SHA1Writer(f)
  177. write_index_dict(f, self._byname)
  178. finally:
  179. f.close()
  180. def read(self):
  181. """Read current contents of index from disk."""
  182. if not os.path.exists(self._filename):
  183. return
  184. f = GitFile(self._filename, 'rb')
  185. try:
  186. f = SHA1Reader(f)
  187. for x in read_index(f):
  188. self[x[0]] = IndexEntry(*x[1:])
  189. # FIXME: Additional data?
  190. f.read(os.path.getsize(self._filename)-f.tell()-20)
  191. f.check_sha()
  192. finally:
  193. f.close()
  194. def __len__(self):
  195. """Number of entries in this index file."""
  196. return len(self._byname)
  197. def __getitem__(self, name):
  198. """Retrieve entry by relative path.
  199. :return: tuple with (ctime, mtime, dev, ino, mode, uid, gid, size, sha,
  200. flags)
  201. """
  202. return self._byname[name]
  203. def __iter__(self):
  204. """Iterate over the paths in this index."""
  205. return iter(self._byname)
  206. def get_sha1(self, path):
  207. """Return the (git object) SHA1 for the object at a path."""
  208. return self[path].sha
  209. def get_mode(self, path):
  210. """Return the POSIX file mode for the object at a path."""
  211. return self[path].mode
  212. def iterblobs(self):
  213. """Iterate over path, sha, mode tuples for use with commit_tree."""
  214. for path in self:
  215. entry = self[path]
  216. yield path, entry.sha, cleanup_mode(entry.mode)
  217. def clear(self):
  218. """Remove all contents from this index."""
  219. self._byname = {}
  220. def __setitem__(self, name, x):
  221. assert isinstance(name, bytes)
  222. assert len(x) == 10
  223. # Remove the old entry if any
  224. self._byname[name] = IndexEntry(*x)
  225. def __delitem__(self, name):
  226. assert isinstance(name, bytes)
  227. del self._byname[name]
  228. def iteritems(self):
  229. return self._byname.items()
  230. def update(self, entries):
  231. for name, value in entries.items():
  232. self[name] = value
  233. def changes_from_tree(self, object_store, tree, want_unchanged=False):
  234. """Find the differences between the contents of this index and a tree.
  235. :param object_store: Object store to use for retrieving tree contents
  236. :param tree: SHA1 of the root tree
  237. :param want_unchanged: Whether unchanged files should be reported
  238. :return: Iterator over tuples with (oldpath, newpath), (oldmode,
  239. newmode), (oldsha, newsha)
  240. """
  241. def lookup_entry(path):
  242. entry = self[path]
  243. return entry.sha, entry.mode
  244. for (name, mode, sha) in changes_from_tree(
  245. self._byname.keys(), lookup_entry, object_store, tree,
  246. want_unchanged=want_unchanged):
  247. yield (name, mode, sha)
  248. def commit(self, object_store):
  249. """Create a new tree from an index.
  250. :param object_store: Object store to save the tree in
  251. :return: Root tree SHA
  252. """
  253. return commit_tree(object_store, self.iterblobs())
  254. def commit_tree(object_store, blobs):
  255. """Commit a new tree.
  256. :param object_store: Object store to add trees to
  257. :param blobs: Iterable over blob path, sha, mode entries
  258. :return: SHA1 of the created tree.
  259. """
  260. trees = {b'': {}}
  261. def add_tree(path):
  262. if path in trees:
  263. return trees[path]
  264. dirname, basename = pathsplit(path)
  265. t = add_tree(dirname)
  266. assert isinstance(basename, bytes)
  267. newtree = {}
  268. t[basename] = newtree
  269. trees[path] = newtree
  270. return newtree
  271. for path, sha, mode in blobs:
  272. tree_path, basename = pathsplit(path)
  273. tree = add_tree(tree_path)
  274. tree[basename] = (mode, sha)
  275. def build_tree(path):
  276. tree = Tree()
  277. for basename, entry in trees[path].items():
  278. if isinstance(entry, dict):
  279. mode = stat.S_IFDIR
  280. sha = build_tree(pathjoin(path, basename))
  281. else:
  282. (mode, sha) = entry
  283. tree.add(basename, mode, sha)
  284. object_store.add_object(tree)
  285. return tree.id
  286. return build_tree(b'')
  287. def commit_index(object_store, index):
  288. """Create a new tree from an index.
  289. :param object_store: Object store to save the tree in
  290. :param index: Index file
  291. :note: This function is deprecated, use index.commit() instead.
  292. :return: Root tree sha.
  293. """
  294. return commit_tree(object_store, index.iterblobs())
  295. def changes_from_tree(names, lookup_entry, object_store, tree,
  296. want_unchanged=False):
  297. """Find the differences between the contents of a tree and
  298. a working copy.
  299. :param names: Iterable of names in the working copy
  300. :param lookup_entry: Function to lookup an entry in the working copy
  301. :param object_store: Object store to use for retrieving tree contents
  302. :param tree: SHA1 of the root tree, or None for an empty tree
  303. :param want_unchanged: Whether unchanged files should be reported
  304. :return: Iterator over tuples with (oldpath, newpath), (oldmode, newmode),
  305. (oldsha, newsha)
  306. """
  307. # TODO(jelmer): Support a include_trees option
  308. other_names = set(names)
  309. if tree is not None:
  310. for (name, mode, sha) in object_store.iter_tree_contents(tree):
  311. try:
  312. (other_sha, other_mode) = lookup_entry(name)
  313. except KeyError:
  314. # Was removed
  315. yield ((name, None), (mode, None), (sha, None))
  316. else:
  317. other_names.remove(name)
  318. if (want_unchanged or other_sha != sha or other_mode != mode):
  319. yield ((name, name), (mode, other_mode), (sha, other_sha))
  320. # Mention added files
  321. for name in other_names:
  322. try:
  323. (other_sha, other_mode) = lookup_entry(name)
  324. except KeyError:
  325. pass
  326. else:
  327. yield ((None, name), (None, other_mode), (None, other_sha))
  328. def index_entry_from_stat(stat_val, hex_sha, flags, mode=None):
  329. """Create a new index entry from a stat value.
  330. :param stat_val: POSIX stat_result instance
  331. :param hex_sha: Hex sha of the object
  332. :param flags: Index flags
  333. """
  334. if mode is None:
  335. mode = cleanup_mode(stat_val.st_mode)
  336. return (stat_val.st_ctime, stat_val.st_mtime, stat_val.st_dev,
  337. stat_val.st_ino, mode, stat_val.st_uid,
  338. stat_val.st_gid, stat_val.st_size, hex_sha, flags)
  339. def build_file_from_blob(blob, mode, target_path, honor_filemode=True):
  340. """Build a file or symlink on disk based on a Git object.
  341. :param obj: The git object
  342. :param mode: File mode
  343. :param target_path: Path to write to
  344. :param honor_filemode: An optional flag to honor core.filemode setting in
  345. config file, default is core.filemode=True, change executable bit
  346. :return: stat object for the file
  347. """
  348. try:
  349. oldstat = os.lstat(target_path)
  350. except OSError as e:
  351. if e.errno == errno.ENOENT:
  352. oldstat = None
  353. else:
  354. raise
  355. contents = blob.as_raw_string()
  356. if stat.S_ISLNK(mode):
  357. # FIXME: This will fail on Windows. What should we do instead?
  358. if oldstat:
  359. os.unlink(target_path)
  360. if sys.platform == 'win32' and sys.version_info[0] == 3:
  361. # os.readlink on Python3 on Windows requires a unicode string.
  362. # TODO(jelmer): Don't assume tree_encoding == fs_encoding
  363. tree_encoding = sys.getfilesystemencoding()
  364. contents = contents.decode(tree_encoding)
  365. target_path = target_path.decode(tree_encoding)
  366. os.symlink(contents, target_path)
  367. else:
  368. if oldstat is not None and oldstat.st_size == len(contents):
  369. with open(target_path, 'rb') as f:
  370. if f.read() == contents:
  371. return oldstat
  372. with open(target_path, 'wb') as f:
  373. # Write out file
  374. f.write(contents)
  375. if honor_filemode:
  376. os.chmod(target_path, mode)
  377. return os.lstat(target_path)
  378. INVALID_DOTNAMES = (b".git", b".", b"..", b"")
  379. def validate_path_element_default(element):
  380. return element.lower() not in INVALID_DOTNAMES
  381. def validate_path_element_ntfs(element):
  382. stripped = element.rstrip(b". ").lower()
  383. if stripped in INVALID_DOTNAMES:
  384. return False
  385. if stripped == b"git~1":
  386. return False
  387. return True
  388. def validate_path(path, element_validator=validate_path_element_default):
  389. """Default path validator that just checks for .git/."""
  390. parts = path.split(b"/")
  391. for p in parts:
  392. if not element_validator(p):
  393. return False
  394. else:
  395. return True
  396. def build_index_from_tree(root_path, index_path, object_store, tree_id,
  397. honor_filemode=True,
  398. validate_path_element=validate_path_element_default):
  399. """Generate and materialize index from a tree
  400. :param tree_id: Tree to materialize
  401. :param root_path: Target dir for materialized index files
  402. :param index_path: Target path for generated index
  403. :param object_store: Non-empty object store holding tree contents
  404. :param honor_filemode: An optional flag to honor core.filemode setting in
  405. config file, default is core.filemode=True, change executable bit
  406. :param validate_path_element: Function to validate path elements to check
  407. out; default just refuses .git and .. directories.
  408. :note:: existing index is wiped and contents are not merged
  409. in a working dir. Suitable only for fresh clones.
  410. """
  411. index = Index(index_path)
  412. if not isinstance(root_path, bytes):
  413. root_path = root_path.encode(sys.getfilesystemencoding())
  414. for entry in object_store.iter_tree_contents(tree_id):
  415. if not validate_path(entry.path, validate_path_element):
  416. continue
  417. full_path = _tree_to_fs_path(root_path, entry.path)
  418. if not os.path.exists(os.path.dirname(full_path)):
  419. os.makedirs(os.path.dirname(full_path))
  420. # TODO(jelmer): Merge new index into working tree
  421. if S_ISGITLINK(entry.mode):
  422. if not os.path.isdir(full_path):
  423. os.mkdir(full_path)
  424. st = os.lstat(full_path)
  425. # TODO(jelmer): record and return submodule paths
  426. else:
  427. obj = object_store[entry.sha]
  428. st = build_file_from_blob(
  429. obj, entry.mode, full_path, honor_filemode=honor_filemode)
  430. # Add file to index
  431. if not honor_filemode or S_ISGITLINK(entry.mode):
  432. # we can not use tuple slicing to build a new tuple,
  433. # because on windows that will convert the times to
  434. # longs, which causes errors further along
  435. st_tuple = (entry.mode, st.st_ino, st.st_dev, st.st_nlink,
  436. st.st_uid, st.st_gid, st.st_size, st.st_atime,
  437. st.st_mtime, st.st_ctime)
  438. st = st.__class__(st_tuple)
  439. index[entry.path] = index_entry_from_stat(st, entry.sha, 0)
  440. index.write()
  441. def blob_from_path_and_stat(fs_path, st):
  442. """Create a blob from a path and a stat object.
  443. :param fs_path: Full file system path to file
  444. :param st: A stat object
  445. :return: A `Blob` object
  446. """
  447. assert isinstance(fs_path, bytes)
  448. blob = Blob()
  449. if not stat.S_ISLNK(st.st_mode):
  450. with open(fs_path, 'rb') as f:
  451. blob.data = f.read()
  452. else:
  453. if sys.platform == 'win32' and sys.version_info[0] == 3:
  454. # os.readlink on Python3 on Windows requires a unicode string.
  455. # TODO(jelmer): Don't assume tree_encoding == fs_encoding
  456. tree_encoding = sys.getfilesystemencoding()
  457. fs_path = fs_path.decode(tree_encoding)
  458. blob.data = os.readlink(fs_path).encode(tree_encoding)
  459. else:
  460. blob.data = os.readlink(fs_path)
  461. return blob
  462. def get_unstaged_changes(index, root_path):
  463. """Walk through an index and check for differences against working tree.
  464. :param index: index to check
  465. :param root_path: path in which to find files
  466. :return: iterator over paths with unstaged changes
  467. """
  468. # For each entry in the index check the sha1 & ensure not staged
  469. if not isinstance(root_path, bytes):
  470. root_path = root_path.encode(sys.getfilesystemencoding())
  471. for tree_path, entry in index.iteritems():
  472. full_path = _tree_to_fs_path(root_path, tree_path)
  473. try:
  474. blob = blob_from_path_and_stat(full_path, os.lstat(full_path))
  475. except OSError as e:
  476. if e.errno != errno.ENOENT:
  477. raise
  478. # The file was removed, so we assume that counts as
  479. # different from whatever file used to exist.
  480. yield tree_path
  481. except IOError as e:
  482. if e.errno != errno.EISDIR:
  483. raise
  484. # This is actually a directory
  485. if os.path.exists(os.path.join(tree_path, '.git')):
  486. # Submodule
  487. from dulwich.errors import NotGitRepository
  488. from dulwich.repo import Repo
  489. try:
  490. if entry.sha != Repo(tree_path).head():
  491. yield tree_path
  492. except NotGitRepository:
  493. yield tree_path
  494. else:
  495. # The file was changed to a directory, so consider it removed.
  496. yield tree_path
  497. else:
  498. if blob.id != entry.sha:
  499. yield tree_path
  500. os_sep_bytes = os.sep.encode('ascii')
  501. def _tree_to_fs_path(root_path, tree_path):
  502. """Convert a git tree path to a file system path.
  503. :param root_path: Root filesystem path
  504. :param tree_path: Git tree path as bytes
  505. :return: File system path.
  506. """
  507. assert isinstance(tree_path, bytes)
  508. if os_sep_bytes != b'/':
  509. sep_corrected_path = tree_path.replace(b'/', os_sep_bytes)
  510. else:
  511. sep_corrected_path = tree_path
  512. return os.path.join(root_path, sep_corrected_path)
  513. def _fs_to_tree_path(fs_path, fs_encoding=None):
  514. """Convert a file system path to a git tree path.
  515. :param fs_path: File system path.
  516. :param fs_encoding: File system encoding
  517. :return: Git tree path as bytes
  518. """
  519. if fs_encoding is None:
  520. fs_encoding = sys.getfilesystemencoding()
  521. if not isinstance(fs_path, bytes):
  522. fs_path_bytes = fs_path.encode(fs_encoding)
  523. else:
  524. fs_path_bytes = fs_path
  525. if os_sep_bytes != b'/':
  526. tree_path = fs_path_bytes.replace(os_sep_bytes, b'/')
  527. else:
  528. tree_path = fs_path_bytes
  529. return tree_path
  530. def iter_fresh_entries(index, root_path):
  531. """Iterate over current versions of index entries on disk.
  532. :param index: Index file
  533. :param root_path: Root path to access from
  534. :return: Iterator over path, index_entry
  535. """
  536. for path in set(index):
  537. p = _tree_to_fs_path(root_path, path)
  538. try:
  539. st = os.lstat(p)
  540. blob = blob_from_path_and_stat(p, st)
  541. except OSError as e:
  542. if e.errno == errno.ENOENT:
  543. del index[path]
  544. else:
  545. raise
  546. except IOError as e:
  547. if e.errno == errno.EISDIR:
  548. del index[path]
  549. else:
  550. raise
  551. else:
  552. yield path, index_entry_from_stat(st, blob.id, 0)
  553. def iter_fresh_blobs(index, root_path):
  554. """Iterate over versions of blobs on disk referenced by index.
  555. :param index: Index file
  556. :param root_path: Root path to access from
  557. :return: Iterator over path, sha, mode
  558. """
  559. for path, entry in iter_fresh_entries(index, root_path):
  560. entry = IndexEntry(*entry)
  561. yield path, entry.sha, cleanup_mode(entry.mode)
  562. def refresh_index(index, root_path):
  563. """Refresh the contents of an index.
  564. This is the equivalent to running 'git commit -a'.
  565. :param index: Index to update
  566. :param root_path: Root filesystem path
  567. """
  568. for path, entry in iter_fresh_entries(index, root_path):
  569. index[path] = path