index.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647
  1. # index.py -- File parser/writer for the git index file
  2. # Copyright (C) 2008-2013 Jelmer Vernooij <jelmer@samba.org>
  3. #
  4. # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
  5. # General Public License as published by the Free Software Foundation; version 2.0
  6. # or (at your option) any later version. You can redistribute it and/or
  7. # modify it under the terms of either of these two licenses.
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. #
  15. # You should have received a copy of the licenses; if not, see
  16. # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  17. # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  18. # License, Version 2.0.
  19. #
  20. """Parser for the git index file format."""
  21. import collections
  22. import errno
  23. import os
  24. import stat
  25. import struct
  26. import sys
  27. from dulwich.file import GitFile
  28. from dulwich.objects import (
  29. Blob,
  30. S_IFGITLINK,
  31. S_ISGITLINK,
  32. Tree,
  33. hex_to_sha,
  34. sha_to_hex,
  35. )
  36. from dulwich.pack import (
  37. SHA1Reader,
  38. SHA1Writer,
  39. )
  40. IndexEntry = collections.namedtuple(
  41. 'IndexEntry', [
  42. 'ctime', 'mtime', 'dev', 'ino', 'mode', 'uid', 'gid', 'size', 'sha',
  43. 'flags'])
  44. def pathsplit(path):
  45. """Split a /-delimited path into a directory part and a basename.
  46. :param path: The path to split.
  47. :return: Tuple with directory name and basename
  48. """
  49. try:
  50. (dirname, basename) = path.rsplit(b"/", 1)
  51. except ValueError:
  52. return (b"", path)
  53. else:
  54. return (dirname, basename)
  55. def pathjoin(*args):
  56. """Join a /-delimited path.
  57. """
  58. return b"/".join([p for p in args if p])
  59. def read_cache_time(f):
  60. """Read a cache time.
  61. :param f: File-like object to read from
  62. :return: Tuple with seconds and nanoseconds
  63. """
  64. return struct.unpack(">LL", f.read(8))
  65. def write_cache_time(f, t):
  66. """Write a cache time.
  67. :param f: File-like object to write to
  68. :param t: Time to write (as int, float or tuple with secs and nsecs)
  69. """
  70. if isinstance(t, int):
  71. t = (t, 0)
  72. elif isinstance(t, float):
  73. (secs, nsecs) = divmod(t, 1.0)
  74. t = (int(secs), int(nsecs * 1000000000))
  75. elif not isinstance(t, tuple):
  76. raise TypeError(t)
  77. f.write(struct.pack(">LL", *t))
  78. def read_cache_entry(f):
  79. """Read an entry from a cache file.
  80. :param f: File-like object to read from
  81. :return: tuple with: device, inode, mode, uid, gid, size, sha, flags
  82. """
  83. beginoffset = f.tell()
  84. ctime = read_cache_time(f)
  85. mtime = read_cache_time(f)
  86. (dev, ino, mode, uid, gid, size, sha, flags, ) = \
  87. struct.unpack(">LLLLLL20sH", f.read(20 + 4 * 6 + 2))
  88. name = f.read((flags & 0x0fff))
  89. # Padding:
  90. real_size = ((f.tell() - beginoffset + 8) & ~7)
  91. f.read((beginoffset + real_size) - f.tell())
  92. return (name, ctime, mtime, dev, ino, mode, uid, gid, size,
  93. sha_to_hex(sha), flags & ~0x0fff)
  94. def write_cache_entry(f, entry):
  95. """Write an index entry to a file.
  96. :param f: File object
  97. :param entry: Entry to write, tuple with:
  98. (name, ctime, mtime, dev, ino, mode, uid, gid, size, sha, flags)
  99. """
  100. beginoffset = f.tell()
  101. (name, ctime, mtime, dev, ino, mode, uid, gid, size, sha, flags) = entry
  102. write_cache_time(f, ctime)
  103. write_cache_time(f, mtime)
  104. flags = len(name) | (flags & ~0x0fff)
  105. f.write(struct.pack(
  106. b'>LLLLLL20sH', dev & 0xFFFFFFFF, ino & 0xFFFFFFFF,
  107. mode, uid, gid, size, hex_to_sha(sha), flags))
  108. f.write(name)
  109. real_size = ((f.tell() - beginoffset + 8) & ~7)
  110. f.write(b'\0' * ((beginoffset + real_size) - f.tell()))
  111. def read_index(f):
  112. """Read an index file, yielding the individual entries."""
  113. header = f.read(4)
  114. if header != b'DIRC':
  115. raise AssertionError("Invalid index file header: %r" % header)
  116. (version, num_entries) = struct.unpack(b'>LL', f.read(4 * 2))
  117. assert version in (1, 2)
  118. for i in range(num_entries):
  119. yield read_cache_entry(f)
  120. def read_index_dict(f):
  121. """Read an index file and return it as a dictionary.
  122. :param f: File object to read from
  123. """
  124. ret = {}
  125. for x in read_index(f):
  126. ret[x[0]] = IndexEntry(*x[1:])
  127. return ret
  128. def write_index(f, entries):
  129. """Write an index file.
  130. :param f: File-like object to write to
  131. :param entries: Iterable over the entries to write
  132. """
  133. f.write(b'DIRC')
  134. f.write(struct.pack(b'>LL', 2, len(entries)))
  135. for x in entries:
  136. write_cache_entry(f, x)
  137. def write_index_dict(f, entries):
  138. """Write an index file based on the contents of a dictionary.
  139. """
  140. entries_list = []
  141. for name in sorted(entries):
  142. entries_list.append((name,) + tuple(entries[name]))
  143. write_index(f, entries_list)
  144. def cleanup_mode(mode):
  145. """Cleanup a mode value.
  146. This will return a mode that can be stored in a tree object.
  147. :param mode: Mode to clean up.
  148. """
  149. if stat.S_ISLNK(mode):
  150. return stat.S_IFLNK
  151. elif stat.S_ISDIR(mode):
  152. return stat.S_IFDIR
  153. elif S_ISGITLINK(mode):
  154. return S_IFGITLINK
  155. ret = stat.S_IFREG | 0o644
  156. ret |= (mode & 0o111)
  157. return ret
  158. class Index(object):
  159. """A Git Index file."""
  160. def __init__(self, filename):
  161. """Open an index file.
  162. :param filename: Path to the index file
  163. """
  164. self._filename = filename
  165. self.clear()
  166. self.read()
  167. @property
  168. def path(self):
  169. return self._filename
  170. def __repr__(self):
  171. return "%s(%r)" % (self.__class__.__name__, self._filename)
  172. def write(self):
  173. """Write current contents of index to disk."""
  174. f = GitFile(self._filename, 'wb')
  175. try:
  176. f = SHA1Writer(f)
  177. write_index_dict(f, self._byname)
  178. finally:
  179. f.close()
  180. def read(self):
  181. """Read current contents of index from disk."""
  182. if not os.path.exists(self._filename):
  183. return
  184. f = GitFile(self._filename, 'rb')
  185. try:
  186. f = SHA1Reader(f)
  187. for x in read_index(f):
  188. self[x[0]] = IndexEntry(*x[1:])
  189. # FIXME: Additional data?
  190. f.read(os.path.getsize(self._filename)-f.tell()-20)
  191. f.check_sha()
  192. finally:
  193. f.close()
  194. def __len__(self):
  195. """Number of entries in this index file."""
  196. return len(self._byname)
  197. def __getitem__(self, name):
  198. """Retrieve entry by relative path.
  199. :return: tuple with (ctime, mtime, dev, ino, mode, uid, gid, size, sha,
  200. flags)
  201. """
  202. return self._byname[name]
  203. def __iter__(self):
  204. """Iterate over the paths in this index."""
  205. return iter(self._byname)
  206. def get_sha1(self, path):
  207. """Return the (git object) SHA1 for the object at a path."""
  208. return self[path].sha
  209. def get_mode(self, path):
  210. """Return the POSIX file mode for the object at a path."""
  211. return self[path].mode
  212. def iterblobs(self):
  213. """Iterate over path, sha, mode tuples for use with commit_tree."""
  214. for path in self:
  215. entry = self[path]
  216. yield path, entry.sha, cleanup_mode(entry.mode)
  217. def clear(self):
  218. """Remove all contents from this index."""
  219. self._byname = {}
  220. def __setitem__(self, name, x):
  221. assert isinstance(name, bytes)
  222. assert len(x) == 10
  223. # Remove the old entry if any
  224. self._byname[name] = x
  225. def __delitem__(self, name):
  226. assert isinstance(name, bytes)
  227. del self._byname[name]
  228. def iteritems(self):
  229. return self._byname.items()
  230. def update(self, entries):
  231. for name, value in entries.items():
  232. self[name] = value
  233. def changes_from_tree(self, object_store, tree, want_unchanged=False):
  234. """Find the differences between the contents of this index and a tree.
  235. :param object_store: Object store to use for retrieving tree contents
  236. :param tree: SHA1 of the root tree
  237. :param want_unchanged: Whether unchanged files should be reported
  238. :return: Iterator over tuples with (oldpath, newpath), (oldmode,
  239. newmode), (oldsha, newsha)
  240. """
  241. def lookup_entry(path):
  242. entry = self[path]
  243. return entry.sha, entry.mode
  244. for (name, mode, sha) in changes_from_tree(
  245. self._byname.keys(), lookup_entry, object_store, tree,
  246. want_unchanged=want_unchanged):
  247. yield (name, mode, sha)
  248. def commit(self, object_store):
  249. """Create a new tree from an index.
  250. :param object_store: Object store to save the tree in
  251. :return: Root tree SHA
  252. """
  253. return commit_tree(object_store, self.iterblobs())
  254. def commit_tree(object_store, blobs):
  255. """Commit a new tree.
  256. :param object_store: Object store to add trees to
  257. :param blobs: Iterable over blob path, sha, mode entries
  258. :return: SHA1 of the created tree.
  259. """
  260. trees = {b'': {}}
  261. def add_tree(path):
  262. if path in trees:
  263. return trees[path]
  264. dirname, basename = pathsplit(path)
  265. t = add_tree(dirname)
  266. assert isinstance(basename, bytes)
  267. newtree = {}
  268. t[basename] = newtree
  269. trees[path] = newtree
  270. return newtree
  271. for path, sha, mode in blobs:
  272. tree_path, basename = pathsplit(path)
  273. tree = add_tree(tree_path)
  274. tree[basename] = (mode, sha)
  275. def build_tree(path):
  276. tree = Tree()
  277. for basename, entry in trees[path].items():
  278. if isinstance(entry, dict):
  279. mode = stat.S_IFDIR
  280. sha = build_tree(pathjoin(path, basename))
  281. else:
  282. (mode, sha) = entry
  283. tree.add(basename, mode, sha)
  284. object_store.add_object(tree)
  285. return tree.id
  286. return build_tree(b'')
  287. def commit_index(object_store, index):
  288. """Create a new tree from an index.
  289. :param object_store: Object store to save the tree in
  290. :param index: Index file
  291. :note: This function is deprecated, use index.commit() instead.
  292. :return: Root tree sha.
  293. """
  294. return commit_tree(object_store, index.iterblobs())
  295. def changes_from_tree(names, lookup_entry, object_store, tree,
  296. want_unchanged=False):
  297. """Find the differences between the contents of a tree and
  298. a working copy.
  299. :param names: Iterable of names in the working copy
  300. :param lookup_entry: Function to lookup an entry in the working copy
  301. :param object_store: Object store to use for retrieving tree contents
  302. :param tree: SHA1 of the root tree, or None for an empty tree
  303. :param want_unchanged: Whether unchanged files should be reported
  304. :return: Iterator over tuples with (oldpath, newpath), (oldmode, newmode),
  305. (oldsha, newsha)
  306. """
  307. other_names = set(names)
  308. if tree is not None:
  309. for (name, mode, sha) in object_store.iter_tree_contents(tree):
  310. try:
  311. (other_sha, other_mode) = lookup_entry(name)
  312. except KeyError:
  313. # Was removed
  314. yield ((name, None), (mode, None), (sha, None))
  315. else:
  316. other_names.remove(name)
  317. if (want_unchanged or other_sha != sha or other_mode != mode):
  318. yield ((name, name), (mode, other_mode), (sha, other_sha))
  319. # Mention added files
  320. for name in other_names:
  321. try:
  322. (other_sha, other_mode) = lookup_entry(name)
  323. except KeyError:
  324. pass
  325. else:
  326. yield ((None, name), (None, other_mode), (None, other_sha))
  327. def index_entry_from_stat(stat_val, hex_sha, flags, mode=None):
  328. """Create a new index entry from a stat value.
  329. :param stat_val: POSIX stat_result instance
  330. :param hex_sha: Hex sha of the object
  331. :param flags: Index flags
  332. """
  333. if mode is None:
  334. mode = cleanup_mode(stat_val.st_mode)
  335. return (stat_val.st_ctime, stat_val.st_mtime, stat_val.st_dev,
  336. stat_val.st_ino, mode, stat_val.st_uid,
  337. stat_val.st_gid, stat_val.st_size, hex_sha, flags)
  338. def build_file_from_blob(blob, mode, target_path, honor_filemode=True):
  339. """Build a file or symlink on disk based on a Git object.
  340. :param obj: The git object
  341. :param mode: File mode
  342. :param target_path: Path to write to
  343. :param honor_filemode: An optional flag to honor core.filemode setting in
  344. config file, default is core.filemode=True, change executable bit
  345. :return: stat object for the file
  346. """
  347. try:
  348. oldstat = os.lstat(target_path)
  349. except OSError as e:
  350. if e.errno == errno.ENOENT:
  351. oldstat = None
  352. else:
  353. raise
  354. contents = blob.as_raw_string()
  355. if stat.S_ISLNK(mode):
  356. # FIXME: This will fail on Windows. What should we do instead?
  357. if oldstat:
  358. os.unlink(target_path)
  359. if sys.platform == 'win32' and sys.version_info[0] == 3:
  360. # os.readlink on Python3 on Windows requires a unicode string.
  361. # TODO(jelmer): Don't assume tree_encoding == fs_encoding
  362. tree_encoding = sys.getfilesystemencoding()
  363. contents = contents.decode(tree_encoding)
  364. target_path = target_path.decode(tree_encoding)
  365. os.symlink(contents, target_path)
  366. else:
  367. if oldstat is not None and oldstat.st_size == len(contents):
  368. with open(target_path, 'rb') as f:
  369. if f.read() == contents:
  370. return oldstat
  371. with open(target_path, 'wb') as f:
  372. # Write out file
  373. f.write(contents)
  374. if honor_filemode:
  375. os.chmod(target_path, mode)
  376. return os.lstat(target_path)
  377. INVALID_DOTNAMES = (b".git", b".", b"..", b"")
  378. def validate_path_element_default(element):
  379. return element.lower() not in INVALID_DOTNAMES
  380. def validate_path_element_ntfs(element):
  381. stripped = element.rstrip(b". ").lower()
  382. if stripped in INVALID_DOTNAMES:
  383. return False
  384. if stripped == b"git~1":
  385. return False
  386. return True
  387. def validate_path(path, element_validator=validate_path_element_default):
  388. """Default path validator that just checks for .git/."""
  389. parts = path.split(b"/")
  390. for p in parts:
  391. if not element_validator(p):
  392. return False
  393. else:
  394. return True
  395. def build_index_from_tree(root_path, index_path, object_store, tree_id,
  396. honor_filemode=True,
  397. validate_path_element=validate_path_element_default):
  398. """Generate and materialize index from a tree
  399. :param tree_id: Tree to materialize
  400. :param root_path: Target dir for materialized index files
  401. :param index_path: Target path for generated index
  402. :param object_store: Non-empty object store holding tree contents
  403. :param honor_filemode: An optional flag to honor core.filemode setting in
  404. config file, default is core.filemode=True, change executable bit
  405. :param validate_path_element: Function to validate path elements to check
  406. out; default just refuses .git and .. directories.
  407. :note:: existing index is wiped and contents are not merged
  408. in a working dir. Suitable only for fresh clones.
  409. """
  410. index = Index(index_path)
  411. if not isinstance(root_path, bytes):
  412. root_path = root_path.encode(sys.getfilesystemencoding())
  413. for entry in object_store.iter_tree_contents(tree_id):
  414. if not validate_path(entry.path, validate_path_element):
  415. continue
  416. full_path = _tree_to_fs_path(root_path, entry.path)
  417. if not os.path.exists(os.path.dirname(full_path)):
  418. os.makedirs(os.path.dirname(full_path))
  419. # TODO(jelmer): Merge new index into working tree
  420. if S_ISGITLINK(entry.mode):
  421. if not os.path.isdir(full_path):
  422. os.mkdir(full_path)
  423. st = os.lstat(full_path)
  424. # TODO(jelmer): record and return submodule paths
  425. else:
  426. obj = object_store[entry.sha]
  427. st = build_file_from_blob(
  428. obj, entry.mode, full_path, honor_filemode=honor_filemode)
  429. # Add file to index
  430. if not honor_filemode or S_ISGITLINK(entry.mode):
  431. # we can not use tuple slicing to build a new tuple,
  432. # because on windows that will convert the times to
  433. # longs, which causes errors further along
  434. st_tuple = (entry.mode, st.st_ino, st.st_dev, st.st_nlink,
  435. st.st_uid, st.st_gid, st.st_size, st.st_atime,
  436. st.st_mtime, st.st_ctime)
  437. st = st.__class__(st_tuple)
  438. index[entry.path] = index_entry_from_stat(st, entry.sha, 0)
  439. index.write()
  440. def blob_from_path_and_stat(fs_path, st):
  441. """Create a blob from a path and a stat object.
  442. :param fs_path: Full file system path to file
  443. :param st: A stat object
  444. :return: A `Blob` object
  445. """
  446. assert isinstance(fs_path, bytes)
  447. blob = Blob()
  448. if not stat.S_ISLNK(st.st_mode):
  449. with open(fs_path, 'rb') as f:
  450. blob.data = f.read()
  451. else:
  452. if sys.platform == 'win32' and sys.version_info[0] == 3:
  453. # os.readlink on Python3 on Windows requires a unicode string.
  454. # TODO(jelmer): Don't assume tree_encoding == fs_encoding
  455. tree_encoding = sys.getfilesystemencoding()
  456. fs_path = fs_path.decode(tree_encoding)
  457. blob.data = os.readlink(fs_path).encode(tree_encoding)
  458. else:
  459. blob.data = os.readlink(fs_path)
  460. return blob
  461. def get_unstaged_changes(index, root_path):
  462. """Walk through an index and check for differences against working tree.
  463. :param index: index to check
  464. :param root_path: path in which to find files
  465. :return: iterator over paths with unstaged changes
  466. """
  467. # For each entry in the index check the sha1 & ensure not staged
  468. if not isinstance(root_path, bytes):
  469. root_path = root_path.encode(sys.getfilesystemencoding())
  470. for tree_path, entry in index.iteritems():
  471. full_path = _tree_to_fs_path(root_path, tree_path)
  472. try:
  473. blob = blob_from_path_and_stat(full_path, os.lstat(full_path))
  474. except OSError as e:
  475. if e.errno != errno.ENOENT:
  476. raise
  477. # The file was removed, so we assume that counts as
  478. # different from whatever file used to exist.
  479. yield tree_path
  480. except IOError as e:
  481. if e.errno != errno.EISDIR:
  482. raise
  483. # This is actually a directory
  484. if os.path.exists(os.path.join(tree_path, '.git')):
  485. # Submodule
  486. from dulwich.repo import NotGitRepository, Repo
  487. try:
  488. if entry.sha != Repo(tree_path).head():
  489. yield tree_path
  490. except NotGitRepository:
  491. yield tree_path
  492. else:
  493. # The file was changed to a directory, so consider it removed.
  494. yield tree_path
  495. else:
  496. if blob.id != entry.sha:
  497. yield tree_path
  498. os_sep_bytes = os.sep.encode('ascii')
  499. def _tree_to_fs_path(root_path, tree_path):
  500. """Convert a git tree path to a file system path.
  501. :param root_path: Root filesystem path
  502. :param tree_path: Git tree path as bytes
  503. :return: File system path.
  504. """
  505. assert isinstance(tree_path, bytes)
  506. if os_sep_bytes != b'/':
  507. sep_corrected_path = tree_path.replace(b'/', os_sep_bytes)
  508. else:
  509. sep_corrected_path = tree_path
  510. return os.path.join(root_path, sep_corrected_path)
  511. def _fs_to_tree_path(fs_path, fs_encoding=None):
  512. """Convert a file system path to a git tree path.
  513. :param fs_path: File system path.
  514. :param fs_encoding: File system encoding
  515. :return: Git tree path as bytes
  516. """
  517. if fs_encoding is None:
  518. fs_encoding = sys.getfilesystemencoding()
  519. if not isinstance(fs_path, bytes):
  520. fs_path_bytes = fs_path.encode(fs_encoding)
  521. else:
  522. fs_path_bytes = fs_path
  523. if os_sep_bytes != b'/':
  524. tree_path = fs_path_bytes.replace(os_sep_bytes, b'/')
  525. else:
  526. tree_path = fs_path_bytes
  527. return tree_path