Преглед изворни кода

Drop Pure- from the description.

Jelmer Vernooij пре 14 година
родитељ
комит
5ce1d366fb
56 измењених фајлова са 2993 додато и 842 уклоњено
  1. 3 2
      .testr.conf
  2. 2 0
      AUTHORS
  3. 11 14
      Makefile
  4. 36 5
      NEWS
  5. 6 13
      bin/dul-receive-pack
  6. 6 13
      bin/dul-upload-pack
  7. 3 2
      debian/changelog
  8. 1 1
      debian/control
  9. 10 9
      docs/tutorial/0-introduction.txt
  10. 0 119
      docs/tutorial/1-initial-commit.txt
  11. 28 0
      docs/tutorial/1-repo.txt
  12. 0 61
      docs/tutorial/2-change-file.txt
  13. 184 0
      docs/tutorial/2-object-store.txt
  14. 0 41
      docs/tutorial/3-add-file.txt
  15. 11 0
      docs/tutorial/3-conclusion.txt
  16. 0 30
      docs/tutorial/4-remove-file.txt
  17. 0 33
      docs/tutorial/5-rename-file.txt
  18. 0 14
      docs/tutorial/6-conclusion.txt
  19. 3 6
      docs/tutorial/index.txt
  20. 0 178
      docs/tutorial/test.py
  21. 76 1
      dulwich/_compat.py
  22. 449 0
      dulwich/_diff_tree.c
  23. 53 14
      dulwich/_objects.c
  24. 20 7
      dulwich/client.py
  25. 495 0
      dulwich/diff_tree.py
  26. 4 0
      dulwich/errors.py
  27. 36 3
      dulwich/fastexport.py
  28. 1 16
      dulwich/file.py
  29. 29 63
      dulwich/object_store.py
  30. 32 20
      dulwich/objects.py
  31. 7 9
      dulwich/pack.py
  32. 26 2
      dulwich/patch.py
  33. 1 2
      dulwich/protocol.py
  34. 34 18
      dulwich/repo.py
  35. 35 1
      dulwich/server.py
  36. 85 28
      dulwich/tests/__init__.py
  37. 37 0
      dulwich/tests/compat/__init__.py
  38. 4 0
      dulwich/tests/compat/test_client.py
  39. 2 3
      dulwich/tests/compat/test_server.py
  40. 3 2
      dulwich/tests/compat/test_utils.py
  41. 3 4
      dulwich/tests/compat/test_web.py
  42. 8 2
      dulwich/tests/compat/utils.py
  43. 67 0
      dulwich/tests/test_blackbox.py
  44. 61 0
      dulwich/tests/test_client.py
  45. 671 0
      dulwich/tests/test_diff_tree.py
  46. 69 0
      dulwich/tests/test_fastexport.py
  47. 32 8
      dulwich/tests/test_object_store.py
  48. 51 62
      dulwich/tests/test_objects.py
  49. 26 16
      dulwich/tests/test_pack.py
  50. 138 1
      dulwich/tests/test_patch.py
  51. 34 17
      dulwich/tests/test_repository.py
  52. 55 0
      dulwich/tests/test_server.py
  53. 0 1
      dulwich/tests/test_web.py
  54. 42 0
      dulwich/tests/utils.py
  55. 1 1
      dulwich/web.py
  56. 2 0
      setup.py

+ 3 - 2
.testr.conf

@@ -1,3 +1,4 @@
 [DEFAULT]
-test_command=PYTHONPATH=. python -m subunit.run $IDLIST
-test_id_list_default=dulwich.tests.test_suite
+test_command=PYTHONPATH=. python -m subunit.run $IDOPTION $LISTOPT dulwich.tests.test_suite
+test_id_option=--load-list $IDFILE
+test_list_option=--list

+ 2 - 0
AUTHORS

@@ -3,4 +3,6 @@ James Westby <jw+debian@jameswestby.net>
 John Carr <john.carr@unrouted.co.uk>
 Dave Borowitz <dborowitz@google.com>
 
+Hervé Cauwelier <herve@itaapy.com> wrote the original tutorial.
+
 See the revision history for a full list of contributors.

+ 11 - 14
Makefile

@@ -1,8 +1,12 @@
 PYTHON = python
 SETUP = $(PYTHON) setup.py
 PYDOCTOR ?= pydoctor
-TESTRUNNER = $(shell which nosetests)
-TESTFLAGS =
+ifeq ($(shell $(PYTHON) -c "import sys; print sys.version_info >= (2, 7)"),True)
+TESTRUNNER ?= unittest
+else
+TESTRUNNER ?= unittest2
+endif
+RUNTEST = PYTHONPATH=.:$(PYTHONPATH) $(PYTHON) -m $(TESTRUNNER)
 
 all: build
 
@@ -19,21 +23,14 @@ install::
 	$(SETUP) install
 
 check:: build
-	PYTHONPATH=.:$(PYTHONPATH) $(PYTHON) $(TESTRUNNER) dulwich
-	which git > /dev/null && PYTHONPATH=.:$(PYTHONPATH) $(PYTHON) $(TESTRUNNER) $(TESTFLAGS) -i compat
+	$(RUNTEST) dulwich.tests.test_suite
 
-check-noextensions:: clean
-	PYTHONPATH=.:$(PYTHONPATH) $(PYTHON) $(TESTRUNNER) $(TESTFLAGS) dulwich
+check-nocompat:: build
+	$(RUNTEST) dulwich.tests.nocompat_test_suite
 
-check-compat:: build
-	PYTHONPATH=.:$(PYTHONPATH) $(PYTHON) $(TESTRUNNER) $(TESTFLAGS) -i compat
+check-noextensions:: clean
+	$(RUNTEST) dulwich.tests.test_suite
 
 clean::
 	$(SETUP) clean --all
 	rm -f dulwich/*.so
-
-coverage:: build
-	PYTHONPATH=.:$(PYTHONPATH) $(PYTHON) $(TESTRUNNER) --cover-package=dulwich --with-coverage --cover-erase --cover-inclusive dulwich
-
-coverage-annotate: coverage
-	python-coverage -a -o /usr

+ 36 - 5
NEWS

@@ -2,17 +2,48 @@
 
  FEATURES
 
+  * New `dulwich.diff_tree` module for simple content-based rename detection.
+    (Dave Borowitz)
+
   * Add Tree.items(). (Jelmer Vernooij)
 
   * Add eof() and unread_pkt_line() methods to Protocol. (Dave Borowitz)
 
+  * Add write_tree_diff(). (Jelmer Vernooij)
+
+  * Add `serve_command` function for git server commands as executables.
+    (Jelmer Vernooij)
+
+  * dulwich.client.get_transport_and_path now supports rsync-style repository URLs.
+    (Dave Borowitz, #568493)
+
  BUG FIXES
 
   * Correct short-circuiting operation for no-op fetches in the server.
     (Dave Borowitz)
 
-  * Support parsing git mbox patches without a version tail, as generated by Mercurial. 
-    (Jelmer Vernooij)
+  * Support parsing git mbox patches without a version tail, as generated by
+    Mercurial.  (Jelmer Vernooij)
+
+  * Fix dul-receive-pack and dul-upload-pack. (Jelmer Vernooij)
+
+  * Zero-padded file modes in Tree objects no longer trigger an exception but
+    the check code warns about them. (Augie Fackler, #581064)
+
+  * Repo.init() now honors the mkdir flag. (#671159)
+
+  * The ref format is now checked when setting a ref rather than when reading it back.
+    (Dave Borowitz, #653527)
+
+  * Make sure pack files are closed correctly. (Tay Ray Chuan)
+
+ DOCUMENTATION
+
+  * Run the tutorial inside the test suite. (Jelmer Vernooij)
+
+  * Reorganized and updated the tutorial. (Jelmer Vernooij, Dave Borowitz, #610550,
+     #610540)
+
 
 0.6.2	2010-10-16
 
@@ -176,7 +207,7 @@ note: This list is most likely incomplete for 0.6.0.
   * Implement RefsContainer.__contains__. (Jelmer Vernooij)
 
   * Cope with \r in ref files on Windows. (
-	http://github.com/jelmer/dulwich/issues/#issue/13, Jelmer Vernooij)
+    http://github.com/jelmer/dulwich/issues/#issue/13, Jelmer Vernooij)
 
   * Fix GitFile breakage on Windows. (Anatoly Techtonik, #557585)
 
@@ -246,7 +277,7 @@ note: This list is most likely incomplete for 0.6.0.
     with chunks of strings rather than with full-text strings. 
     (Jelmer Vernooij)
 
-0.5.0	2010-03-03
+0.5.0	2010-03-03
 
  BUG FIXES
 
@@ -347,7 +378,7 @@ note: This list is most likely incomplete for 0.6.0.
 
   * Removed Repo.set_ref, Repo.remove_ref, Repo.tags, Repo.get_refs and 
     Repo.heads in favor of Repo.refs, a dictionary-like object for accessing
-	refs.
+    refs.
 
  BUG FIXES
 

+ 6 - 13
bin/dul-receive-pack

@@ -17,19 +17,12 @@
 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 # MA  02110-1301, USA.
 
+from dulwich.server import serve_command, ReceivePackHandler
+import os
 import sys
-from dulwich.repo import Repo
-from dulwich.server import GitBackend, ReceivePackHandler
 
-def send_fn(data):
-    sys.stdout.write(data)
-    sys.stdout.flush()
+if len(sys.argv) < 2:
+    print >>sys.stderr, "usage: %s <git-dir>" % os.path.basename(sys.argv[0])
+    sys.exit(1)
 
-if __name__ == "__main__":
-    gitdir = None
-    if len(sys.argv) > 1:
-        gitdir = sys.argv[1]
-
-    backend = GitBackend(Repo(gitdir))
-    handler = ReceivePackHandler(backend, sys.stdin.read, send_fn)
-    handler.handle()
+sys.exit(serve_command(ReceivePackHandler))

+ 6 - 13
bin/dul-upload-pack

@@ -17,19 +17,12 @@
 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 # MA  02110-1301, USA.
 
+from dulwich.server import serve_command, UploadPackHandler
+import os
 import sys
-from dulwich.repo import Repo
-from dulwich.server import GitBackend, UploadPackHandler
 
-def send_fn(data):
-    sys.stdout.write(data)
-    sys.stdout.flush()
+if len(sys.argv) < 2:
+    print >>sys.stderr, "usage: %s <git-dir>" % os.path.basename(sys.argv[0])
+    sys.exit(1)
 
-if __name__ == "__main__":
-    gitdir = None
-    if len(sys.argv) > 1:
-        gitdir = sys.argv[1]
-
-    backend = GitBackend(Repo(gitdir))
-    handler = UploadPackHandler(backend, sys.stdin.read, send_fn)
-    handler.handle()
+sys.exit(serve_command(UploadPackHandler))

+ 3 - 2
debian/changelog

@@ -1,8 +1,9 @@
-dulwich (0.6.2+bzr704-1) UNRELEASED; urgency=low
+dulwich (0.6.2+bzr788-1) UNRELEASED; urgency=low
 
   * New upstream snapshot.
+  * Drop Pure- from the description.
 
- -- Jelmer Vernooij <jelmer@debian.org>  Sat, 20 Nov 2010 15:19:03 +0100
+ -- Jelmer Vernooij <jelmer@debian.org>  Tue, 28 Dec 2010 01:50:22 +0100
 
 dulwich (0.6.2+bzr702-1) unstable; urgency=low
 

+ 1 - 1
debian/control

@@ -14,7 +14,7 @@ Provides: ${python:Provides}
 Depends: ${python:Depends}, ${misc:Depends}, ${shlibs:Depends}
 XB-Python-Version: ${python:Versions}
 Recommends: python-fastimport
-Description: Pure-python Git library
+Description: Python Git library
  Dulwich is a Python implementation of the file formats and protocols 
  used by the Git version control system. It can currently read from and write 
  to existing Git repositories and implements the protocol for pushing and 

+ 10 - 9
docs/tutorial/0-introduction.txt

@@ -45,16 +45,16 @@ tree.
 The Tree
 --------
 
-A tree is a collection of file information, the state of your working copy at
+A tree is a collection of file information, the state of a single directory at
 a given point in time.
 
 A tree file looks like this::
 
-  tree <content length><NUL><file mode> <filename><NUL><blob sha>...
+  tree <content length><NUL><file mode> <filename><NUL><item sha>...
 
 And repeats for every file in the tree.
 
-Note that for a unknown reason, the SHA-1 digest is in binary form here.
+Note that the SHA-1 digest is in binary form here.
 
 The file mode is like the octal argument you could give to the ``chmod``
 command.  Except it is in extended form to tell regular files from
@@ -88,14 +88,15 @@ accelerate operations and reduce space.
 More About Git formats
 ----------------------
 
-These three objects make 90 % of a Git repository. The rest is branch
-information and optimizations.
+These three objects make up most of the contents of a Git repository and are
+used for the history. They can either appear as simple files on disk (one file
+per object) or in a ``pack`` file, which is a container for a number of these
+objects.
 
-For instance there is an index of the current state of the working copy.
-There are also pack files to group several small objects in a single indexed
-file.
+There is also an index of the current state of the working copy in the
+repository as well as files to track the existing branches and tags.
 
-For a more detailled explanation of object formats and SHA-1 digests, see:
+For a more detailed explanation of object formats and SHA-1 digests, see:
 http://www-cs-students.stanford.edu/~blynn/gitmagic/ch08.html
 
 Just note that recent versions of Git compress object files using zlib.

+ 0 - 119
docs/tutorial/1-initial-commit.txt

@@ -1,119 +0,0 @@
-The Repository
-==============
-
-After this introduction, let's start directly with code::
-
-  >>> from dulwich.repo import Repo
-
-The access to every object is through the Repo object. You can open an
-existing repository or you can create a new one. There are two types of Git
-repositories:
-
-  Regular Repositories -- They are the ones you create using ``git init`` and
-  you daily use. They contain a ``.git`` folder.
-
-  Bare Repositories -- There is not ".git" folder. The top-level folder
-  contains itself the "branches", "hooks"... folders. These are used for
-  published repositories (mirrors).
-
-Let's create a folder and turn it into a repository, like ``git init`` would::
-
-  >>> from os import mkdir
-  >>> mkdir("myrepo")
-  >>> repo = Repo.init("myrepo")
-  >>> repo
-  <Repo at '/tmp/myrepo/'>
-
-You can already look a the structure of the "myrepo/.git" folder, though it
-is mostly empty for now.
-
-Initial commit
-==============
-
-When you use Git, you generally add or modify content. As our repository is
-empty for now, we'll start by adding a new file::
-
-  >>> from dulwich.objects import Blob
-  >>> blob = Blob.from_string("My file content\n")
-  >>> blob.id
-  'c55063a4d5d37aa1af2b2dad3a70aa34dae54dc6'
-
-Of course you could create a blob from an existing file using ``from_file``
-instead.
-
-As said in the introduction, file content is separed from file name. Let's
-give this content a name::
-
-  >>> from dulwich.objects import Tree
-  >>> tree = Tree()
-  >>> tree.add(0100644, "spam", blob.id)
-
-Note that "0100644" is the octal form for a regular file with common
-permissions. You can hardcode them or you can use the ``stat`` module.
-
-The tree state of our repository still needs to be placed in time. That's the
-job of the commit::
-
-  >>> from dulwich.objects import Commit, parse_timezone
-  >>> from time import time
-  >>> commit = Commit()
-  >>> commit.tree = tree.id
-  >>> author = "Your Name <your.email@example.com>"
-  >>> commit.author = commit.committer = author
-  >>> commit.commit_time = commit.author_time = int(time())
-  >>> tz = parse_timezone('-0200')
-  >>> commit.commit_timezone = commit.author_timezone = tz
-  >>> commit.encoding = "UTF-8"
-  >>> commit.message = "Initial commit"
-
-Note that the initial commit has no parents.
-
-At this point, the repository is still empty because all operations happen in
-memory. Let's "commit" it.
-
-  >>> object_store = repo.object_store
-  >>> object_store.add_object(blob)
-
-Now the ".git/objects" folder contains a first SHA-1 file. Let's continue
-saving the changes::
-
-  >>> object_store.add_object(tree)
-  >>> object_store.add_object(commit)
-
-Now the physical repository contains three objects but still has no branch.
-Let's create the master branch like Git would::
-
-  >>> repo.refs['refs/heads/master'] = commit.id
-
-The master branch now has a commit where to start, but Git itself would not
-known what is the current branch. That's another reference::
-
-  >>> repo.refs['HEAD'] = 'ref: refs/heads/master'
-
-Now our repository is officialy tracking a branch named "master" refering to a
-single commit.
-
-Playing again with Git
-======================
-
-At this point you can come back to the shell, go into the "myrepo" folder and
-type ``git status`` to let Git confirm that this is a regular repository on
-branch "master".
-
-Git will tell you that the file "spam" is deleted, which is normal because
-Git is comparing the repository state with the current working copy. And we
-have absolutely no working copy using Dulwich because we don't need it at
-all!
-
-You can checkout the last state using ``git checkout -f``. The force flag
-will prevent Git from complaining that there are uncommitted changes in the
-working copy.
-
-The file ``spam`` appears and with no surprise contains the same bytes as the
-blob::
-
-  $ cat spam
-  My file content
-
-.. attention:: Remember to recreate the repo object when you modify the
-               repository outside of Dulwich!

+ 28 - 0
docs/tutorial/1-repo.txt

@@ -0,0 +1,28 @@
+The Repository
+==============
+
+After this introduction, let's start directly with code::
+
+  >>> from dulwich.repo import Repo
+
+The access to a repository is through the Repo object. You can open an
+existing repository or you can create a new one. There are two types of Git
+repositories:
+
+  Regular Repositories -- They are the ones you create using ``git init`` and
+  you daily use. They contain a ``.git`` folder.
+
+  Bare Repositories -- There is no ".git" folder. The top-level folder
+  contains itself the "branches", "hooks"... folders. These are used for
+  published repositories (mirrors). They do not have a working tree.
+
+Let's create a folder and turn it into a repository, like ``git init`` would::
+
+  >>> from os import mkdir
+  >>> mkdir("myrepo")
+  >>> repo = Repo.init("myrepo")
+  >>> repo
+  <Repo at 'myrepo'>
+
+You can already look at the structure of the "myrepo/.git" folder, though it
+is mostly empty for now.

+ 0 - 61
docs/tutorial/2-change-file.txt

@@ -1,61 +0,0 @@
-Changing a File and Commit it
-=============================
-
-Now we have a first commit, the next one will show a difference.
-
-As seen in the introduction, it's about making a path in a tree point to a
-new blob. The old blob will remain to compute the diff. The tree is altered
-and the new commit'task is to point to this new version.
-
-In the following examples, we assume we still have the ``repo`` and ``tree``
-object from the previous chapter.
-
-Let's first build the blob::
-
-  >>> spam = Blob.from_string("My new file content\n")
-  >>> spam.id
-  '16ee2682887a962f854ebd25a61db16ef4efe49f'
-
-An alternative is to alter the previously constructed blob object::
-
-  >>> blob.data = "My new file content\n"
-  >>> blob.id
-  '16ee2682887a962f854ebd25a61db16ef4efe49f'
-
-In any case, update the blob id known as "spam". You also have the
-opportunity of changing its mode::
-
-  >>> tree["spam"] = (0100644, spam.id)
-
-Now let's record the change::
-
-  >>> c2 = Commit()
-  >>> c2.tree = tree.id
-  >>> c2.parents = [commit.id]
-  >>> c2.author = c2.committer = author
-  >>> c2.commit_time = c2.author_time = int(time())
-  >>> c2.commit_timezone = c2.author_timezone = tz
-  >>> c2.encoding = "UTF-8"
-  >>> c2.message = 'Changing "spam"'
-
-In this new commit we record the changed tree id, and most important, the
-previous commit as the parent. Parents are actually a list because a commit
-may happen to have several parents after merging branches.
-
-Remain to record this whole new family::
-
-  >>> object_store.add_object(spam)
-  >>> object_store.add_object(tree)
-  >>> object_store.add_object(c2)
-
-You can already ask git to introspect this commit using ``git show`` and the
-value of ``commit.id`` as an argument. You'll see the difference will the
-previous blob recorded as "spam".
-
-You won't see it using git log because the head is still the previous
-commit. It's easy to remedy::
-
-  >>> repo.refs['refs/heads/master'] = c2.id
-
-Now all git tools will work as expected. Though don't forget that Dulwich is
-still open!

+ 184 - 0
docs/tutorial/2-object-store.txt

@@ -0,0 +1,184 @@
+The object store
+================
+
+The objects are stored in the ``object store`` of the repository.
+
+  >>> from dulwich.repo import Repo
+  >>> repo = Repo.init("myrepo", mkdir=True)
+
+Initial commit
+--------------
+
+When you use Git, you generally add or modify content. As our repository is
+empty for now, we'll start by adding a new file::
+
+  >>> from dulwich.objects import Blob
+  >>> blob = Blob.from_string("My file content\n")
+  >>> blob.id
+  'c55063a4d5d37aa1af2b2dad3a70aa34dae54dc6'
+
+Of course you could create a blob from an existing file using ``from_file``
+instead.
+
+As said in the introduction, file content is separated from file name. Let's
+give this content a name::
+
+  >>> from dulwich.objects import Tree
+  >>> tree = Tree()
+  >>> tree.add(0100644, "spam", blob.id)
+
+Note that "0100644" is the octal form for a regular file with common
+permissions. You can hardcode them or you can use the ``stat`` module.
+
+The tree state of our repository still needs to be placed in time. That's the
+job of the commit::
+
+  >>> from dulwich.objects import Commit, parse_timezone
+  >>> from time import time
+  >>> commit = Commit()
+  >>> commit.tree = tree.id
+  >>> author = "Your Name <your.email@example.com>"
+  >>> commit.author = commit.committer = author
+  >>> commit.commit_time = commit.author_time = int(time())
+  >>> tz = parse_timezone('-0200')[0]
+  >>> commit.commit_timezone = commit.author_timezone = tz
+  >>> commit.encoding = "UTF-8"
+  >>> commit.message = "Initial commit"
+
+Note that the initial commit has no parents.
+
+At this point, the repository is still empty because all operations happen in
+memory. Let's "commit" it.
+
+  >>> object_store = repo.object_store
+  >>> object_store.add_object(blob)
+
+Now the ".git/objects" folder contains a first SHA-1 file. Let's continue
+saving the changes::
+
+  >>> object_store.add_object(tree)
+  >>> object_store.add_object(commit)
+
+Now the physical repository contains three objects but still has no branch.
+Let's create the master branch like Git would::
+
+  >>> repo.refs['refs/heads/master'] = commit.id
+
+The master branch now has a commit where to start. When we commit to master, we
+are also moving HEAD, which is Git's currently checked out branch:
+
+  >>> head = repo.refs['HEAD']
+  >>> head == commit.id
+  True
+  >>> head == repo.refs['refs/heads/master']
+  True
+
+How did that work? As it turns out, HEAD is a special kind of ref called a
+symbolic ref, and it points at master. Most functions on the refs container
+work transparently with symbolic refs, but we can also take a peek inside HEAD:
+
+  >>> repo.refs.read_ref('HEAD')
+  'ref: refs/heads/master'
+
+Normally, you won't need to use read_ref. If you want to change what ref HEAD
+points to, in order to check out another branch, just use set_symbolic_ref.
+
+Now our repository is officially tracking a branch named "master" referring to a
+single commit.
+
+Playing again with Git
+----------------------
+
+At this point you can come back to the shell, go into the "myrepo" folder and
+type ``git status`` to let Git confirm that this is a regular repository on
+branch "master".
+
+Git will tell you that the file "spam" is deleted, which is normal because
+Git is comparing the repository state with the current working copy. And we
+have absolutely no working copy using Dulwich because we don't need it at
+all!
+
+You can checkout the last state using ``git checkout -f``. The force flag
+will prevent Git from complaining that there are uncommitted changes in the
+working copy.
+
+The file ``spam`` appears and with no surprise contains the same bytes as the
+blob::
+
+  $ cat spam
+  My file content
+
+Changing a File and Committing it
+---------------------------------
+
+Now we have a first commit, the next one will show a difference.
+
+As seen in the introduction, it's about making a path in a tree point to a
+new blob. The old blob will remain to compute the diff. The tree is altered
+and the new commit's task is to point to this new version.
+
+Let's first build the blob::
+
+  >>> from dulwich.objects import Blob
+  >>> spam = Blob.from_string("My new file content\n")
+  >>> spam.id
+  '16ee2682887a962f854ebd25a61db16ef4efe49f'
+
+An alternative is to alter the previously constructed blob object::
+
+  >>> blob.data = "My new file content\n"
+  >>> blob.id
+  '16ee2682887a962f854ebd25a61db16ef4efe49f'
+
+In any case, update the blob id known as "spam". You also have the
+opportunity of changing its mode::
+
+  >>> tree["spam"] = (0100644, spam.id)
+
+Now let's record the change::
+
+  >>> from dulwich.objects import Commit
+  >>> from time import time
+  >>> c2 = Commit()
+  >>> c2.tree = tree.id
+  >>> c2.parents = [commit.id]
+  >>> c2.author = c2.committer = "John Doe <john@example.com>"
+  >>> c2.commit_time = c2.author_time = int(time())
+  >>> c2.commit_timezone = c2.author_timezone = 0
+  >>> c2.encoding = "UTF-8"
+  >>> c2.message = 'Changing "spam"'
+
+In this new commit we record the changed tree id, and most important, the
+previous commit as the parent. Parents are actually a list because a commit
+may happen to have several parents after merging branches.
+
+Let's put the objects in the object store::
+
+  >>> repo.object_store.add_object(spam)
+  >>> repo.object_store.add_object(tree)
+  >>> repo.object_store.add_object(c2)
+
+You can already ask git to introspect this commit using ``git show`` and the
+value of ``c2.id`` as an argument. You'll see the difference with the
+previous blob recorded as "spam".
+
+The diff between the previous head and the new one can be printed using
+write_tree_diff::
+
+  >>> from dulwich.patch import write_tree_diff
+  >>> import sys
+  >>> write_tree_diff(sys.stdout, repo.object_store, commit.tree, tree.id)
+  diff --git a/spam b/spam
+  index c55063a..16ee268 100644
+  --- a/spam
+  +++ b/spam
+  @@ -1,1 +1,1 @@
+  -My file content
+  +My new file content
+
+You won't see it using git log because the head is still the previous
+commit. It's easy to remedy::
+
+  >>> repo.refs['refs/heads/master'] = c2.id
+
+Now all git tools will work as expected.

+ 0 - 41
docs/tutorial/3-add-file.txt

@@ -1,41 +0,0 @@
-Adding a file
-=============
-
-If you followed well, the next lesson will be straightforward.
-
-We need a new blob::
-
-    >>> ham = Blob.from_string("Another\nmultiline\nfile\n")
-    >>> ham.id
-    'a3b5eda0b83eb8fb6e5dce91ecafda9e97269c70'
-
-But the same tree::
-
-    >>> tree["ham"] = (0100644, spam.id)
-
-And a new commit::
-
-  >>> c3 = Commit()
-  >>> c3.tree = tree.id
-  >>> c3.parents = [commit.id]
-  >>> c3.author = c3.committer = author
-  >>> c3.commit_time = c3.author_time = int(time())
-  >>> c3.commit_timezone = c3.author_timezone = tz
-  >>> c3.encoding = "UTF-8"
-  >>> c3.message = 'Adding "ham"'
-
-Save it all::
-
-    >>> object_store.add_object(spam)
-    >>> object_store.add_object(tree)
-    >>> object_store.add_object(c3)
-
-Update the head::
-
-    >>> repo.refs['refs/heads/master'] = commit.id
-
-A call to ``git show`` will confirm the addition of "spam".
-
-Remember you can also call ``git checkout -f`` to make it appear.
-
-Well... Adding "spam" was not such a good idea... We'll remove it.

+ 11 - 0
docs/tutorial/3-conclusion.txt

@@ -0,0 +1,11 @@
+Conclusion
+==========
+
+This tutorial currently only covers a small (but important) part of Dulwich.
+It still needs to be extended to cover packs, tags, refs, reflogs and network
+communication.
+
+Dulwich is abstracting much of the Git plumbing, so there would be more to
+see.
+
+For now, that's all folks!

+ 0 - 30
docs/tutorial/4-remove-file.txt

@@ -1,30 +0,0 @@
-Removing a file
-===============
-
-Removing a file just means removing its entry in the tree. The blob won't be
-deleted because Git tries to preserve the history of your repository.
-
-It's all pythonic::
-
-    >>> del tree["ham"]
-
-  >>> c4 = Commit()
-  >>> c4.tree = tree.id
-  >>> c4.parents = [commit.id]
-  >>> c4.author = c4.committer = author
-  >>> c4.commit_time = c4.author_time = int(time())
-  >>> c4.commit_timezone = c4.author_timezone = tz
-  >>> c4.encoding = "UTF-8"
-  >>> c4.message = 'Removing "ham"'
-
-Here we only have the new tree and the commit to save::
-
-    >>> object_store.add_object(spam)
-    >>> object_store.add_object(tree)
-    >>> object_store.add_object(c4)
-
-And of course update the head::
-
-    >>> repo.refs['refs/heads/master'] = commit.id
-
-If you don't trust me, ask ``git show``. ;-)

+ 0 - 33
docs/tutorial/5-rename-file.txt

@@ -1,33 +0,0 @@
-Renaming a file
-===============
-
-Remember you learned that the file name and content are distinct. So renaming
-a file is just about associating a blob id to a new name. We won't store more
-content, and the operation will be painless.
-
-Let's transfer the blob id from the old name to the new one::
-
-    >>> tree["eggs"] = tree["spam"]
-    >>> del tree["spam"]
-
-As usual, we need a commit to store the new tree id::
-
-  >>> c5 = Commit()
-  >>> c5.tree = tree.id
-  >>> c5.parents = [commit.id]
-  >>> c5.author = c5.committer = author
-  >>> c5.commit_time = c5.author_time = int(time())
-  >>> c5.commit_timezone = c5.author_timezone = tz
-  >>> c5.encoding = "UTF-8"
-  >>> c5.message = 'Rename "spam" to "eggs"'
-
-As for a deletion, we only have a tree and a commit to save::
-
-    >>> object_store.add_object(tree)
-    >>> object_store.add_object(c5)
-
-Remains to make the head bleeding-edge::
-
-    >>> repo.refs['refs/heads/master'] = commit.id
-
-As a last exercise, see how ``git show`` illustrates it.

+ 0 - 14
docs/tutorial/6-conclusion.txt

@@ -1,14 +0,0 @@
-Conclusion
-==========
-
-You'll find the ``test.py`` program with some tips I use to ease generating
-objects.
-
-You can also make Tag objects, but this is left as a exercise to the reader.
-
-Dulwich is abstracting  much of the Git plumbing, so there would be more to
-see.
-
-Dulwich is also able to clone and push repositories.
-
-That's all folks!

+ 3 - 6
docs/tutorial/index.txt

@@ -5,9 +5,6 @@ Dulwich Tutorial
 .. contents::
 
 .. include:: 0-introduction.txt
-.. include:: 1-initial-commit.txt
-.. include:: 2-change-file.txt
-.. include:: 3-add-file.txt
-.. include:: 4-remove-file.txt
-.. include:: 5-rename-file.txt
-.. include:: 6-conclusion.txt
+.. include:: 1-repo.txt
+.. include:: 2-object-store.txt
+.. include:: 3-conclusion.txt

+ 0 - 178
docs/tutorial/test.py

@@ -1,178 +0,0 @@
-#!/usr/bin/env python
-# -*- encoding: UTF-8 -*-
-
-# Import from the Standard Library
-from os import F_OK, access, mkdir
-from pprint import pprint
-from shutil import rmtree
-from subprocess import call
-from time import time
-
-# Import from dulwich
-from dulwich.repo import Repo
-from dulwich.objects import Blob, Tree, Commit, parse_timezone
-
-
-DIRNAME = "myrepo"
-AUTHOR = "Your Name <your.email@example.com>"
-TZ = parse_timezone('-200')
-ENCODING = "UTF-8"
-
-
-def make_commit(repo, tree_id, message):
-    """Build a commit object on the same pattern. Only changing values are
-    required as parameters.
-    """
-    commit = Commit()
-    try:
-        commit.parents = [repo.head()]
-    except KeyError:
-        # The initial commit has no parent
-        pass
-    commit.tree = tree_id
-    commit.message = message
-    commit.author = commit.committer = AUTHOR
-    commit.commit_time = commit.author_time = int(time())
-    commit.commit_timezone = commit.author_timezone = TZ
-    commit.encoding = ENCODING
-    return commit
-
-
-
-def make_tree(repo):
-    """Return the last known tree.
-    """
-    commit_id = repo.head()
-    commit = repo.commit(commit_id)
-    tree_id = commit.tree
-    return repo.tree(tree_id)
-
-
-
-def update_master(repo, commit_id):
-    repo.refs['refs/heads/master'] = commit_id
-
-
-
-def initial_commit(repo):
-    # Add file content
-    blob = Blob.from_string("My file content\n")
-    # Add file
-    tree = Tree()
-    tree.add(0100644, "spam", blob.id)
-    # Set commit
-    commit = make_commit(repo, tree.id, "Initial commit")
-    # Initial commit
-    object_store = repo.object_store
-    object_store.add_object(blob)
-    object_store.add_object(tree)
-    object_store.add_object(commit)
-    # Update master
-    update_master(repo, commit.id)
-    # Set the master branch as the default
-    repo.refs['HEAD'] = 'ref: refs/heads/master'
-
-
-
-def test_change(repo):
-    tree = make_tree(repo)
-    # Change a file
-    spam = Blob.from_string("My new file content\n")
-    tree.add(0100644, "spam", spam.id)
-    # Set commit
-    commit = make_commit(repo, tree.id, "Change spam")
-    # Second commit
-    object_store = repo.object_store
-    object_store.add_object(spam)
-    object_store.add_object(tree)
-    object_store.add_object(commit)
-    # Update master
-    update_master(repo, commit.id)
-
-
-
-def test_add(repo):
-    tree = make_tree(repo)
-    # Add another file
-    ham = Blob.from_string("Another\nmultiline\nfile\n")
-    tree.add(0100644, "ham", ham.id)
-    # Set commit
-    commit = make_commit(repo, tree.id, "Add ham")
-    # Second commit
-    object_store = repo.object_store
-    object_store.add_object(ham)
-    object_store.add_object(tree)
-    object_store.add_object(commit)
-    # Update master
-    update_master(repo, commit.id)
-
-
-
-def test_remove(repo):
-    tree = make_tree(repo)
-    # Remove a file
-    del tree["ham"]
-    # Set commit
-    commit = make_commit(repo, tree.id, 'Remove "ham"')
-    # Third commit
-    # No blob change, just tree operation
-    object_store = repo.object_store
-    object_store.add_object(tree)
-    object_store.add_object(commit)
-    # Update master
-    update_master(repo, commit.id)
-
-
-
-def test_rename(repo):
-    tree = make_tree(repo)
-    # Rename a file
-    tree["eggs"] = tree["spam"]
-    del tree["spam"]
-    # Set commit
-    commit = make_commit(repo, tree.id, 'Rename "spam" to "eggs"')
-    # Fourth commit
-    # No blob change, just tree operation
-    object_store = repo.object_store
-    object_store.add_object(tree)
-    object_store.add_object(commit)
-    # Update master
-    update_master(repo, commit.id)
-
-
-
-def test_history(repo):
-    pprint(repo.revision_history(repo.head()))
-
-
-
-def test_file(repo):
-    tree = make_tree(repo)
-    print "entries", tree.entries()
-    mode, blob_id = tree["eggs"]
-    blob = repo.get_blob(blob_id)
-    print "eggs", repr(blob.data)
-
-
-
-if __name__ == '__main__':
-    # Creating the repository
-    if access(DIRNAME, F_OK):
-        rmtree(DIRNAME)
-    mkdir(DIRNAME)
-    repo = Repo.init(DIRNAME)
-    initial_commit(repo)
-    test_change(repo)
-    test_add(repo)
-    test_remove(repo)
-    test_rename(repo)
-    last_commit_id = repo.head()
-    call(['git', 'gc'], cwd=DIRNAME)
-    # Re-load the repo
-    del repo
-    repo = Repo(DIRNAME)
-    # XXX the ref was removed and dulwich doesn't know where to read it
-    update_master(repo, last_commit_id)
-    assert last_commit_id == repo.head()
-    test_history(repo)
-    test_file(repo)

+ 76 - 1
dulwich/misc.py → dulwich/_compat.py

@@ -1,4 +1,4 @@
-# misc.py -- For dealing with python2.4 oddness
+# _compat.py -- For dealing with python2.4 oddness
 # Copyright (C) 2008 Canonical Ltd.
 #
 # This program is free software; you can redistribute it and/or
@@ -101,10 +101,45 @@ def unpack_from(fmt, buf, offset=0):
         return struct.unpack(fmt, b)
 
 
+try:
+    from itertools import permutations
+except ImportError:
+    # Implementation of permutations from Python 2.6 documentation:
+    # http://docs.python.org/2.6/library/itertools.html#itertools.permutations
+    # Copyright (c) 2001-2010 Python Software Foundation; All Rights Reserved
+    # Modified syntax slightly to run under Python 2.4.
+    def permutations(iterable, r=None):
+        """Fallback for itertools.permutations on Pythons that lack it.
+
+        :param iterable: Source of the items; it is consumed into a tuple.
+        :param r: Length of each yielded tuple, or None to use the number of
+            items in iterable.
+        :return: Generator of r-length tuples of items from iterable. Nothing
+            is yielded when r exceeds the number of items.
+        """
+        # permutations('ABCD', 2) --> AB AC AD BA BC BD CA CB CD DA DB DC
+        # permutations(range(3)) --> 012 021 102 120 201 210
+        pool = tuple(iterable)
+        n = len(pool)
+        if r is None:
+            r = n
+        if r > n:
+            return
+        # indices holds the current arrangement of positions into pool;
+        # cycles[i] counts down the choices left for position i.
+        indices = range(n)
+        cycles = range(n, n-r, -1)
+        yield tuple(pool[i] for i in indices[:r])
+        while n:
+            for i in reversed(range(r)):
+                cycles[i] -= 1
+                if cycles[i] == 0:
+                    # Position i is exhausted: rotate its index to the end
+                    # and reset its counter, then carry to position i - 1.
+                    indices[i:] = indices[i+1:] + indices[i:i+1]
+                    cycles[i] = n - i
+                else:
+                    j = cycles[i]
+                    indices[i], indices[-j] = indices[-j], indices[i]
+                    yield tuple(pool[i] for i in indices[:r])
+                    break
+            else:
+                # Every position wrapped around without a break: finished.
+                return
+
+
 try:
     from collections import namedtuple
 
     TreeEntryTuple = namedtuple('TreeEntryTuple', ['path', 'mode', 'sha'])
+    TreeChangeTuple = namedtuple('TreeChangeTuple', ['type', 'old', 'new'])
 except ImportError:
     # Provide manual implementations of namedtuples for Python <2.5.
     # If the class definitions change, be sure to keep these in sync by running
@@ -153,3 +188,43 @@ except ImportError:
             path = _property(_itemgetter(0))
             mode = _property(_itemgetter(1))
             sha = _property(_itemgetter(2))
+
+
+    # Hand-written stand-in for namedtuple('TreeChangeTuple',
+    # ['type', 'old', 'new']), used when collections.namedtuple cannot be
+    # imported. It is a plain tuple subclass with named read-only accessors.
+    class TreeChangeTuple(tuple):
+            'TreeChangeTuple(type, old, new)'
+
+            # No per-instance __dict__; all data lives in the tuple itself.
+            __slots__ = ()
+
+            _fields = ('type', 'old', 'new')
+
+            def __new__(_cls, type, old, new):
+                return _tuple.__new__(_cls, (type, old, new))
+
+            @classmethod
+            def _make(cls, iterable, new=tuple.__new__, len=len):
+                'Make a new TreeChangeTuple object from a sequence or iterable'
+                result = new(cls, iterable)
+                if len(result) != 3:
+                    raise TypeError('Expected 3 arguments, got %d' % len(result))
+                return result
+
+            def __repr__(self):
+                return 'TreeChangeTuple(type=%r, old=%r, new=%r)' % self
+
+            def _asdict(t):
+                'Return a new dict which maps field names to their values'
+                return {'type': t[0], 'old': t[1], 'new': t[2]}
+
+            def _replace(_self, **kwds):
+                'Return a new TreeChangeTuple object replacing specified fields with new values'
+                # kwds.pop(name, current) takes the override when given and
+                # falls back to the existing value otherwise.
+                result = _self._make(map(kwds.pop, ('type', 'old', 'new'), _self))
+                if kwds:
+                    raise ValueError('Got unexpected field names: %r' % kwds.keys())
+                return result
+
+            def __getnewargs__(self):
+                return tuple(self)
+
+            # Named access to the three tuple slots.
+            type = _property(_itemgetter(0))
+            old = _property(_itemgetter(1))
+            new = _property(_itemgetter(2))

+ 449 - 0
dulwich/_diff_tree.c

@@ -0,0 +1,449 @@
+/*
+ * Copyright (C) 2010 Google, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License or (at your option) a later version of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA  02110-1301, USA.
+ */
+
+#include <Python.h>
+#include <sys/stat.h>
+
+#if (PY_VERSION_HEX < 0x02050000)
+typedef int Py_ssize_t;
+#endif
+
+#if (PY_VERSION_HEX < 0x02060000)
+#define Py_SIZE(ob)             (((PyVarObject*)(ob))->ob_size)
+#endif
+
+static PyObject *tree_entry_cls = NULL, *null_entry = NULL,
+	*defaultdict_cls = NULL, *int_cls = NULL;
+static int block_size;
+
+/**
+ * Free an array of PyObject pointers, decrementing any references.
+ *
+ * :param objs: Array allocated with PyMem_New, or NULL. A NULL array is a
+ *     no-op regardless of n.
+ * :param n: Number of leading slots of objs to release; NULL slots are
+ *     skipped.
+ */
+static void free_objects(PyObject **objs, Py_ssize_t n)
+{
+	Py_ssize_t i;
+
+	/* Cleanup paths can get here with a NULL array and a nonzero count
+	 * (an array that was never allocated), so never index through NULL. */
+	if (!objs)
+		return;
+	for (i = 0; i < n; i++)
+		Py_XDECREF(objs[i]);
+	PyMem_Free(objs);
+}
+
+/**
+ * Get the entries of a tree, prepending the given path.
+ *
+ * :param path: The path to prepend, without trailing slashes.
+ * :param path_len: The length of path.
+ * :param tree: The Tree object to iterate, or None for an empty result.
+ * :param n: Set to the length of result; always 0 when NULL is returned.
+ * :return: A (C) array of PyObject pointers to TreeEntry objects for each path
+ *     in tree, owned by the caller. On failure, NULL with a Python exception
+ *     set.
+ */
+static PyObject **tree_entries(char *path, Py_ssize_t path_len, PyObject *tree,
+		Py_ssize_t *n)
+{
+	PyObject *iteritems, *items = NULL, **result = NULL;
+	PyObject *old_entry, *name, *sha;
+	Py_ssize_t i = 0, name_len, new_path_len;
+	char *new_path;
+
+	/* Keep the out-parameter consistent with a NULL return on every early
+	 * exit, so callers never pair a NULL array with a nonzero length. */
+	*n = 0;
+
+	if (tree == Py_None) {
+		result = PyMem_New(PyObject*, 0);
+		if (!result) {
+			PyErr_SetNone(PyExc_MemoryError);
+			return NULL;
+		}
+		return result;
+	}
+
+	iteritems = PyObject_GetAttrString(tree, "iteritems");
+	if (!iteritems)
+		return NULL;
+	/* Equivalent to tree.iteritems(True). */
+	items = PyObject_CallFunctionObjArgs(iteritems, Py_True, NULL);
+	Py_DECREF(iteritems);
+	if (!items) {
+		return NULL;
+	}
+	/* The C implementation of iteritems returns a list, so depend on that. */
+	if (!PyList_Check(items)) {
+		PyErr_SetString(PyExc_TypeError,
+			"Tree.iteritems() did not return a list");
+		/* Go through the shared cleanup so items is released. */
+		goto error;
+	}
+
+	*n = PyList_Size(items);
+	result = PyMem_New(PyObject*, *n);
+	if (!result) {
+		PyErr_SetNone(PyExc_MemoryError);
+		goto error;
+	}
+	for (i = 0; i < *n; i++) {
+		/* old_entry, sha and name are borrowed references. */
+		old_entry = PyList_GetItem(items, i);
+		if (!old_entry)
+			goto error;
+		/* Checked access to index 2 also validates that old_entry is a tuple
+		 * of at least three items, making the unchecked macros below safe. */
+		sha = PyTuple_GetItem(old_entry, 2);
+		if (!sha)
+			goto error;
+		name = PyTuple_GET_ITEM(old_entry, 0);
+		name_len = PyString_Size(name);
+		if (PyErr_Occurred())
+			goto error;
+
+		/* "<path>/<name>" when a prefix is given, otherwise just "<name>". */
+		new_path_len = name_len;
+		if (path_len)
+			new_path_len += path_len + 1;
+		new_path = PyMem_Malloc(new_path_len);
+		if (!new_path) {
+			PyErr_SetNone(PyExc_MemoryError);
+			goto error;
+		}
+		if (path_len) {
+			memcpy(new_path, path, path_len);
+			new_path[path_len] = '/';
+			memcpy(new_path + path_len + 1, PyString_AS_STRING(name), name_len);
+		} else {
+			memcpy(new_path, PyString_AS_STRING(name), name_len);
+		}
+
+		/* new_path is not NUL-terminated; "s#" takes an explicit length. */
+		result[i] = PyObject_CallFunction(tree_entry_cls, "s#OO", new_path,
+			new_path_len, PyTuple_GET_ITEM(old_entry, 1), sha);
+		PyMem_Free(new_path);
+		if (!result[i]) {
+			goto error;
+		}
+	}
+	Py_DECREF(items);
+	return result;
+
+error:
+	/* Only slots [0, i) were filled in; result may still be NULL here. */
+	if (result)
+		free_objects(result, i);
+	Py_DECREF(items);
+	*n = 0;
+	return NULL;
+}
+
+/**
+ * Use strcmp to compare the paths of two TreeEntry objects.
+ *
+ * :param entry1: Object with a string "path" attribute.
+ * :param entry2: Object with a string "path" attribute.
+ * :return: The strcmp() result for the two paths. On failure (missing
+ *     attribute or non-string path) 0 is returned with a Python exception
+ *     set, so callers must check PyErr_Occurred() to tell an error from
+ *     equality.
+ */
+static int entry_path_cmp(PyObject *entry1, PyObject *entry2)
+{
+	PyObject *path1 = NULL, *path2 = NULL;
+	int result = 0;
+
+	path1 = PyObject_GetAttrString(entry1, "path");
+	if (!path1)
+		goto done;
+	if (!PyString_Check(path1)) {
+		PyErr_SetString(PyExc_TypeError, "path is not a string");
+		goto done;
+	}
+
+	path2 = PyObject_GetAttrString(entry2, "path");
+	if (!path2)
+		goto done;
+	if (!PyString_Check(path2)) {
+		PyErr_SetString(PyExc_TypeError, "path is not a string");
+		goto done;
+	}
+
+	result = strcmp(PyString_AS_STRING(path1), PyString_AS_STRING(path2));
+
+done:
+	/* Both attribute lookups returned new references (or NULL). */
+	Py_XDECREF(path1);
+	Py_XDECREF(path2);
+	return result;
+}
+
+/**
+ * Python-callable _merge_entries(path, tree1, tree2).
+ *
+ * Fetches the entries of both trees via tree_entries() and merges the two
+ * arrays by path into a list of (entry1, entry2) tuples, substituting
+ * null_entry for the side that has no entry at a given path.
+ *
+ * :return: A new list, or NULL with a Python exception set.
+ */
+static PyObject *py_merge_entries(PyObject *self, PyObject *args)
+{
+	PyObject *path, *tree1, *tree2, **entries1 = NULL, **entries2 = NULL;
+	PyObject *e1, *e2, *pair, *result = NULL;
+	Py_ssize_t path_len, n1 = 0, n2 = 0, i1 = 0, i2 = 0;
+	char *path_str;
+	int cmp;
+
+	if (!PyArg_ParseTuple(args, "OOO", &path, &tree1, &tree2))
+		return NULL;
+
+	path_str = PyString_AsString(path);
+	if (!path_str) {
+		PyErr_SetString(PyExc_TypeError, "path is not a string");
+		return NULL;
+	}
+	path_len = PyString_GET_SIZE(path);
+
+	entries1 = tree_entries(path_str, path_len, tree1, &n1);
+	if (!entries1)
+		goto error;
+	entries2 = tree_entries(path_str, path_len, tree2, &n2);
+	if (!entries2)
+		goto error;
+
+	/* n1 + n2 is the upper bound: reached when no path occurs in both. */
+	result = PyList_New(n1 + n2);
+	if (!result)
+		goto error;
+	/* PyList_New sets the len of the list, not its allocated size, so we
+	 * need to trim it to the size we actually use. */
+	Py_SIZE(result) = 0;
+
+	/* Two-pointer merge over the two arrays, advancing whichever side has
+	 * the smaller path (or both when the paths are equal). */
+	while (i1 < n1 && i2 < n2) {
+		cmp = entry_path_cmp(entries1[i1], entries2[i2]);
+		/* entry_path_cmp reports errors only through the exception state. */
+		if (PyErr_Occurred())
+			goto error;
+		if (!cmp) {
+			e1 = entries1[i1++];
+			e2 = entries2[i2++];
+		} else if (cmp < 0) {
+			e1 = entries1[i1++];
+			e2 = null_entry;
+		} else {
+			e1 = null_entry;
+			e2 = entries2[i2++];
+		}
+		/* PyTuple_Pack takes its own references to e1 and e2;
+		 * PyList_SET_ITEM then steals the reference to pair. */
+		pair = PyTuple_Pack(2, e1, e2);
+		if (!pair)
+			goto error;
+		PyList_SET_ITEM(result, Py_SIZE(result)++, pair);
+	}
+
+	/* Drain whichever side still has entries. */
+	while (i1 < n1) {
+		pair = PyTuple_Pack(2, entries1[i1++], null_entry);
+		if (!pair)
+			goto error;
+		PyList_SET_ITEM(result, Py_SIZE(result)++, pair);
+	}
+	while (i2 < n2) {
+		pair = PyTuple_Pack(2, null_entry, entries2[i2++]);
+		if (!pair)
+			goto error;
+		PyList_SET_ITEM(result, Py_SIZE(result)++, pair);
+	}
+	goto done;
+
+error:
+	Py_XDECREF(result);
+	result = NULL;
+
+done:
+	/* NOTE(review): free_objects is called unconditionally here, including
+	 * with a NULL array when the matching tree_entries() call failed or was
+	 * never reached. */
+	free_objects(entries1, n1);
+	free_objects(entries2, n2);
+	return result;
+}
+
+/**
+ * Python-callable _is_tree(entry).
+ *
+ * :return: A new reference to True when entry.mode has the directory file
+ *     type (S_ISDIR), False when it does not or when entry.mode is None;
+ *     NULL with an exception set if the attribute is missing or not an int.
+ */
+static PyObject *py_is_tree(PyObject *self, PyObject *args)
+{
+	PyObject *entry, *mode, *result;
+	long lmode;
+
+	if (!PyArg_ParseTuple(args, "O", &entry))
+		return NULL;
+
+	mode = PyObject_GetAttrString(entry, "mode");
+	if (!mode)
+		return NULL;
+
+	if (mode == Py_None) {
+		/* Py_False is a borrowed singleton here; take our own reference. */
+		result = Py_False;
+		Py_INCREF(result);
+	} else {
+		lmode = PyInt_AsLong(mode);
+		if (lmode == -1 && PyErr_Occurred()) {
+			Py_DECREF(mode);
+			return NULL;
+		}
+		/* PyBool_FromLong already returns a new reference, so it must not
+		 * be incremented again. */
+		result = PyBool_FromLong(S_ISDIR((mode_t)lmode));
+	}
+	Py_DECREF(mode);
+	return result;
+}
+
+/**
+ * Add n to the running byte total stored under hash(str[0:n]).
+ *
+ * :param get: Callable used to read the current total for a hash key.
+ * :param set: Callable used to store the updated total for a hash key.
+ * :param str: Start of the block to hash; need not be NUL-terminated.
+ * :param n: Number of bytes in the block.
+ * :return: 0 on success, -1 with a Python exception set on failure.
+ */
+static int add_hash(PyObject *get, PyObject *set, char *str, int n)
+{
+	PyObject *str_obj = NULL, *hash_obj = NULL, *value = NULL,
+		*set_value = NULL;
+	long hash;
+
+	/* It would be nice to hash without copying str into a PyString, but that
+	 * isn't exposed by the API. */
+	str_obj = PyString_FromStringAndSize(str, n);
+	if (!str_obj)
+		goto error;
+	hash = PyObject_Hash(str_obj);
+	if (hash == -1)
+		goto error;
+	hash_obj = PyInt_FromLong(hash);
+	if (!hash_obj)
+		goto error;
+
+	/* value = get(hash_obj) */
+	value = PyObject_CallFunctionObjArgs(get, hash_obj, NULL);
+	if (!value)
+		goto error;
+	/* set(hash_obj, value + n); value is read with the unchecked int macro.
+	 * NOTE(review): assumes get() returns a plain int - confirm. */
+	set_value = PyObject_CallFunction(set, "(Ol)", hash_obj,
+		PyInt_AS_LONG(value) + n);
+	if (!set_value)
+		goto error;
+
+	Py_DECREF(str_obj);
+	Py_DECREF(hash_obj);
+	Py_DECREF(value);
+	Py_DECREF(set_value);
+	return 0;
+
+error:
+	Py_XDECREF(str_obj);
+	Py_XDECREF(hash_obj);
+	Py_XDECREF(value);
+	Py_XDECREF(set_value);
+	return -1;
+}
+
+/**
+ * Python-callable _count_blocks(obj).
+ *
+ * Walks the bytes of obj.as_raw_chunks() and cuts them into blocks that end
+ * at a newline or after block_size bytes, whichever comes first. Blocks may
+ * span chunk boundaries.
+ *
+ * :return: A new defaultdict(int) mapping hash(block) to the total number of
+ *     bytes seen in blocks with that hash, or NULL with an exception set.
+ */
+static PyObject *py_count_blocks(PyObject *self, PyObject *args)
+{
+	PyObject *obj, *chunks = NULL, *chunk, *counts = NULL, *get = NULL,
+		*set = NULL;
+	char *chunk_str, *block = NULL;
+	/* Indices share the width of the lengths they are compared against. */
+	Py_ssize_t num_chunks, chunk_len, i, j;
+	int n = 0;
+	char c;
+
+	if (!PyArg_ParseTuple(args, "O", &obj))
+		goto error;
+
+	counts = PyObject_CallFunctionObjArgs(defaultdict_cls, int_cls, NULL);
+	if (!counts)
+		goto error;
+	/* Bound methods are looked up once and reused for every block; a failed
+	 * lookup must stop here rather than be called through NULL later. */
+	get = PyObject_GetAttrString(counts, "__getitem__");
+	if (!get)
+		goto error;
+	set = PyObject_GetAttrString(counts, "__setitem__");
+	if (!set)
+		goto error;
+
+	chunks = PyObject_CallMethod(obj, "as_raw_chunks", NULL);
+	if (!chunks)
+		goto error;
+	if (!PyList_Check(chunks)) {
+		PyErr_SetString(PyExc_TypeError,
+			"as_raw_chunks() did not return a list");
+		goto error;
+	}
+	num_chunks = PyList_GET_SIZE(chunks);
+	block = PyMem_New(char, block_size);
+	if (!block) {
+		PyErr_SetNone(PyExc_MemoryError);
+		goto error;
+	}
+
+	for (i = 0; i < num_chunks; i++) {
+		chunk = PyList_GET_ITEM(chunks, i);
+		if (!PyString_Check(chunk)) {
+			PyErr_SetString(PyExc_TypeError, "chunk is not a string");
+			goto error;
+		}
+		if (PyString_AsStringAndSize(chunk, &chunk_str, &chunk_len) == -1)
+			goto error;
+
+		for (j = 0; j < chunk_len; j++) {
+			c = chunk_str[j];
+			block[n++] = c;
+			/* Flush on end of line or when the buffer is full. */
+			if (c == '\n' || n == block_size) {
+				if (add_hash(get, set, block, n) == -1)
+					goto error;
+				n = 0;
+			}
+		}
+	}
+	/* Account for a trailing partial block. */
+	if (n && add_hash(get, set, block, n) == -1)
+		goto error;
+
+	Py_DECREF(chunks);
+	Py_DECREF(get);
+	Py_DECREF(set);
+	PyMem_Free(block);
+	return counts;
+
+error:
+	Py_XDECREF(chunks);
+	Py_XDECREF(get);
+	Py_XDECREF(set);
+	Py_XDECREF(counts);
+	PyMem_Free(block);
+	return NULL;
+}
+
+/* Functions exported by the _diff_tree module; all take positional
+ * arguments only and carry no docstring. The table ends with a NULL
+ * sentinel entry. */
+static PyMethodDef py_diff_tree_methods[] = {
+	{ "_is_tree", (PyCFunction)py_is_tree, METH_VARARGS, NULL },
+	{ "_merge_entries", (PyCFunction)py_merge_entries, METH_VARARGS, NULL },
+	{ "_count_blocks", (PyCFunction)py_count_blocks, METH_VARARGS, NULL },
+	{ NULL, NULL, 0, NULL }
+};
+
+/**
+ * Module initialisation for _diff_tree.
+ *
+ * Registers the module functions and caches the Python-level objects the C
+ * code needs: dulwich.objects.TreeEntry, and _NULL_ENTRY, _BLOCK_SIZE and
+ * defaultdict from dulwich.diff_tree, plus the builtin int type. On failure
+ * a Python exception is left set and every cached object is released.
+ */
+PyMODINIT_FUNC
+init_diff_tree(void)
+{
+	PyObject *m, *objects_mod = NULL, *diff_tree_mod = NULL;
+	PyObject *block_size_obj = NULL;
+
+	m = Py_InitModule("_diff_tree", py_diff_tree_methods);
+	if (!m)
+		goto error;
+
+	objects_mod = PyImport_ImportModule("dulwich.objects");
+	if (!objects_mod)
+		goto error;
+
+	tree_entry_cls = PyObject_GetAttrString(objects_mod, "TreeEntry");
+	/* Release the module exactly once and forget the pointer, so neither
+	 * exit path below can drop the same reference a second time. */
+	Py_DECREF(objects_mod);
+	objects_mod = NULL;
+	if (!tree_entry_cls)
+		goto error;
+
+	diff_tree_mod = PyImport_ImportModule("dulwich.diff_tree");
+	if (!diff_tree_mod)
+		goto error;
+
+	null_entry = PyObject_GetAttrString(diff_tree_mod, "_NULL_ENTRY");
+	if (!null_entry)
+		goto error;
+
+	block_size_obj = PyObject_GetAttrString(diff_tree_mod, "_BLOCK_SIZE");
+	if (!block_size_obj)
+		goto error;
+	block_size = (int)PyInt_AsLong(block_size_obj);
+
+	if (PyErr_Occurred())
+		goto error;
+
+	defaultdict_cls = PyObject_GetAttrString(diff_tree_mod, "defaultdict");
+	if (!defaultdict_cls)
+		goto error;
+
+	/* This is kind of hacky, but I don't know of a better way to get the
+	 * PyObject* version of int. */
+	int_cls = PyDict_GetItemString(PyEval_GetBuiltins(), "int");
+	if (!int_cls) {
+		PyErr_SetString(PyExc_NameError, "int");
+		goto error;
+	}
+	/* PyDict_GetItemString returns a borrowed reference; it is kept in a
+	 * static for the life of the module, so take ownership. */
+	Py_INCREF(int_cls);
+
+	/* Only the value of _BLOCK_SIZE is kept, not the object. */
+	Py_DECREF(block_size_obj);
+	Py_DECREF(diff_tree_mod);
+	return;
+
+error:
+	Py_XDECREF(objects_mod);
+	Py_XDECREF(diff_tree_mod);
+	Py_XDECREF(block_size_obj);
+	/* Drop whatever was cached so far and reset the statics so nothing is
+	 * left pointing at a released object. */
+	Py_XDECREF(tree_entry_cls);
+	tree_entry_cls = NULL;
+	Py_XDECREF(null_entry);
+	null_entry = NULL;
+	Py_XDECREF(defaultdict_cls);
+	defaultdict_cls = NULL;
+	/* int_cls is assigned last and only owned on the success path, so it
+	 * never holds an owned reference here. */
+	int_cls = NULL;
+	return;
+}

+ 53 - 14
dulwich/_objects.c

@@ -36,6 +36,7 @@ size_t strnlen(char *text, size_t maxlen)
 #define bytehex(x) (((x)<0xa)?('0'+(x)):('a'-0xa+(x)))
 
 static PyObject *tree_entry_cls;
+static PyObject *object_format_exception_cls;
 
 static PyObject *sha_to_pyhex(const unsigned char *sha)
 {
@@ -49,17 +50,22 @@ static PyObject *sha_to_pyhex(const unsigned char *sha)
 	return PyString_FromStringAndSize(hexsha, 40);
 }
 
-static PyObject *py_parse_tree(PyObject *self, PyObject *args)
+static PyObject *py_parse_tree(PyObject *self, PyObject *args, PyObject *kw)
 {
 	char *text, *start, *end;
-	int len, namelen;
-	PyObject *ret, *item, *name;
+	int len, namelen, strict;
+	PyObject *ret, *item, *name, *py_strict = NULL;
+	static char *kwlist[] = {"text", "strict", NULL};
 
-	if (!PyArg_ParseTuple(args, "s#", &text, &len))
+	if (!PyArg_ParseTupleAndKeywords(args, kw, "s#|O", kwlist,
+	                                 &text, &len, &py_strict))
 		return NULL;
 
+
+	strict = py_strict ?  PyObject_IsTrue(py_strict) : 0;
+
 	/* TODO: currently this returns a list; if memory usage is a concern,
-	* consider rewriting as a custom iterator object */
+	 * consider rewriting as a custom iterator object */
 	ret = PyList_New(0);
 
 	if (ret == NULL) {
@@ -71,6 +77,13 @@ static PyObject *py_parse_tree(PyObject *self, PyObject *args)
 
 	while (text < end) {
 		long mode;
+		if (strict && text[0] == '0') {
+			PyErr_SetString(object_format_exception_cls,
+			                "Illegal leading zero on mode");
+			Py_DECREF(ret);
+			return NULL;
+		}
+
 		mode = strtol(text, &text, 8);
 
 		if (*text != ' ') {
@@ -97,7 +110,7 @@ static PyObject *py_parse_tree(PyObject *self, PyObject *args)
 		}
 
 		item = Py_BuildValue("(NlN)", name, mode,
-							 sha_to_pyhex((unsigned char *)text+namelen+1));
+		                     sha_to_pyhex((unsigned char *)text+namelen+1));
 		if (item == NULL) {
 			Py_DECREF(ret);
 			Py_DECREF(name);
@@ -146,18 +159,32 @@ int cmp_tree_item(const void *_a, const void *_b)
 	return strcmp(remain_a, remain_b);
 }
 
-static PyObject *py_sorted_tree_items(PyObject *self, PyObject *entries)
+int cmp_tree_item_name_order(const void *_a, const void *_b) {
+	const struct tree_item *a = _a, *b = _b;
+	return strcmp(a->name, b->name);
+}
+
+static PyObject *py_sorted_tree_items(PyObject *self, PyObject *args)
 {
 	struct tree_item *qsort_entries = NULL;
-	int num_entries, n = 0, i;
-	PyObject *ret, *key, *value, *py_mode, *py_sha;
+	int name_order, num_entries, n = 0, i;
+	PyObject *entries, *py_name_order, *ret, *key, *value, *py_mode, *py_sha;
 	Py_ssize_t pos = 0;
+	int (*cmp)(const void *, const void *);
+
+	if (!PyArg_ParseTuple(args, "OO", &entries, &py_name_order))
+		goto error;
 
 	if (!PyDict_Check(entries)) {
 		PyErr_SetString(PyExc_TypeError, "Argument not a dictionary");
 		goto error;
 	}
 
+	name_order = PyObject_IsTrue(py_name_order);
+	if (name_order == -1)
+		goto error;
+	cmp = name_order ? cmp_tree_item_name_order : cmp_tree_item;
+
 	num_entries = PyDict_Size(entries);
 	if (PyErr_Occurred())
 		goto error;
@@ -193,13 +220,13 @@ static PyObject *py_sorted_tree_items(PyObject *self, PyObject *entries)
 		qsort_entries[n].mode = PyInt_AS_LONG(py_mode);
 
 		qsort_entries[n].tuple = PyObject_CallFunctionObjArgs(
-				tree_entry_cls, key, py_mode, py_sha, NULL);
+		                tree_entry_cls, key, py_mode, py_sha, NULL);
 		if (qsort_entries[n].tuple == NULL)
 			goto error;
 		n++;
 	}
 
-	qsort(qsort_entries, num_entries, sizeof(struct tree_item), cmp_tree_item);
+	qsort(qsort_entries, num_entries, sizeof(struct tree_item), cmp);
 
 	ret = PyList_New(num_entries);
 	if (ret == NULL) {
@@ -222,20 +249,32 @@ error:
 }
 
 static PyMethodDef py_objects_methods[] = {
-	{ "parse_tree", (PyCFunction)py_parse_tree, METH_VARARGS, NULL },
-	{ "sorted_tree_items", (PyCFunction)py_sorted_tree_items, METH_O, NULL },
+	{ "parse_tree", (PyCFunction)py_parse_tree, METH_VARARGS | METH_KEYWORDS,
+	  NULL },
+	{ "sorted_tree_items", py_sorted_tree_items, METH_VARARGS, NULL },
 	{ NULL, NULL, 0, NULL }
 };
 
 PyMODINIT_FUNC
 init_objects(void)
 {
-	PyObject *m, *objects_mod;
+	PyObject *m, *objects_mod, *errors_mod;
 
 	m = Py_InitModule3("_objects", py_objects_methods, NULL);
 	if (m == NULL)
 		return;
 
+
+	errors_mod = PyImport_ImportModule("dulwich.errors");
+	if (errors_mod == NULL)
+		return;
+
+	object_format_exception_cls = PyObject_GetAttrString(
+		errors_mod, "ObjectFormatException");
+	Py_DECREF(errors_mod);
+	if (object_format_exception_cls == NULL)
+		return;
+
 	/* This is a circular import but should be safe since this module is
 	 * imported at the very bottom of objects.py. */
 	objects_mod = PyImport_ImportModule("dulwich.objects");

+ 20 - 7
dulwich/client.py

@@ -24,6 +24,7 @@ __docformat__ = 'restructuredText'
 import select
 import socket
 import subprocess
+import urlparse
 
 from dulwich.errors import (
     SendPackError,
@@ -358,11 +359,23 @@ def get_transport_and_path(uri):
     :param uri: URI or path
     :return: Tuple with client instance and relative path.
     """
-    from dulwich.client import TCPGitClient, SSHGitClient, SubprocessGitClient
-    for handler, transport in (("git://", TCPGitClient), ("git+ssh://", SSHGitClient)):
-        if uri.startswith(handler):
-            host, path = uri[len(handler):].split("/", 1)
-            return transport(host), "/"+path
-    # FIXME: Parse rsync-like git URLs (user@host:/path), bug 568493
-    # if its not git or git+ssh, try a local url..
+    parsed = urlparse.urlparse(uri)
+    if parsed.scheme == 'git':
+        return TCPGitClient(parsed.hostname, port=parsed.port), parsed.path
+    elif parsed.scheme == 'git+ssh':
+        return SSHGitClient(parsed.hostname, port=parsed.port,
+                            username=parsed.username), parsed.path
+
+    if parsed.scheme and not parsed.netloc:
+        # SSH with no user@, zero or one leading slash.
+        return SSHGitClient(parsed.scheme), parsed.path
+    elif parsed.scheme:
+        raise ValueError('Unknown git protocol scheme: %s' % parsed.scheme)
+    elif '@' in parsed.path and ':' in parsed.path:
+        # SSH with user@host:foo.
+        user_host, path = parsed.path.split(':')
+        user, host = user_host.rsplit('@')
+        return SSHGitClient(host, username=user), path
+
+    # Otherwise, assume it's a local path.
     return SubprocessGitClient(), uri

+ 495 - 0
dulwich/diff_tree.py

@@ -0,0 +1,495 @@
+# diff_tree.py -- Utilities for diffing files and trees.
+# Copyright (C) 2010 Google, Inc.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# or (at your option) a later version of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+# MA  02110-1301, USA.
+
+"""Utilities for diffing files and trees."""
+
+from cStringIO import StringIO
+import itertools
+import stat
+
+from dulwich._compat import (
+    defaultdict,
+    TreeChangeTuple,
+    )
+from dulwich.objects import (
+    S_ISGITLINK,
+    TreeEntry,
+    )
+
+# TreeChange type constants.
+CHANGE_ADD = 'add'
+CHANGE_MODIFY = 'modify'
+CHANGE_DELETE = 'delete'
+CHANGE_RENAME = 'rename'
+CHANGE_COPY = 'copy'
+CHANGE_UNCHANGED = 'unchanged'
+
+# Placeholder for "no entry on this side": path, mode and sha are all None.
+_NULL_ENTRY = TreeEntry(None, None, None)
+
+# Upper end of the similarity score scale (scores run from 0 to _MAX_SCORE).
+_MAX_SCORE = 100
+# Default similarity score for treating an add/delete pair as a rename/copy.
+_RENAME_THRESHOLD = 60
+# Default cap on the number of adds and deletes considered by rename
+# detection.
+_MAX_FILES = 200
+# Default rewrite threshold; None means modifies are never broken into a
+# delete/add pair.
+_REWRITE_THRESHOLD = None
+
+
+class TreeChange(TreeChangeTuple):
+    """Class encapsulating a single change between two trees."""
+
+    @classmethod
+    def add(cls, new):
+        """Build a CHANGE_ADD change whose old side is _NULL_ENTRY.
+
+        :param new: The entry that was added.
+        """
+        return cls(CHANGE_ADD, _NULL_ENTRY, new)
+
+    @classmethod
+    def delete(cls, old):
+        """Build a CHANGE_DELETE change whose new side is _NULL_ENTRY.
+
+        :param old: The entry that was deleted.
+        """
+        return cls(CHANGE_DELETE, old, _NULL_ENTRY)
+
+
+def _tree_entries(path, tree):
+    result = []
+    if not tree:
+        return result
+    for entry in tree.iteritems(name_order=True):
+        result.append(entry.in_path(path))
+    return result
+
+
+def _merge_entries(path, tree1, tree2):
+    """Merge the entries of two trees.
+
+    :param path: A path to prepend to all tree entry names.
+    :param tree1: The first Tree object to iterate, or None.
+    :param tree2: The second Tree object to iterate, or None.
+    :return: A list of pairs of TreeEntry objects for each pair of entries in
+        the trees. If an entry exists in one tree but not the other, the other
+        entry will have all attributes set to None. If neither entry's path is
+        None, they are guaranteed to match.
+    """
+    entries1 = _tree_entries(path, tree1)
+    entries2 = _tree_entries(path, tree2)
+    i1 = i2 = 0
+    len1 = len(entries1)
+    len2 = len(entries2)
+
+    result = []
+    while i1 < len1 and i2 < len2:
+        entry1 = entries1[i1]
+        entry2 = entries2[i2]
+        if entry1.path < entry2.path:
+            result.append((entry1, _NULL_ENTRY))
+            i1 += 1
+        elif entry1.path > entry2.path:
+            result.append((_NULL_ENTRY, entry2))
+            i2 += 1
+        else:
+            result.append((entry1, entry2))
+            i1 += 1
+            i2 += 1
+    for i in xrange(i1, len1):
+        result.append((entries1[i], _NULL_ENTRY))
+    for i in xrange(i2, len2):
+        result.append((_NULL_ENTRY, entries2[i]))
+    return result
+
+
+def _is_tree(entry):
+    mode = entry.mode
+    if mode is None:
+        return False
+    return stat.S_ISDIR(mode)
+
+
+def walk_trees(store, tree1_id, tree2_id, prune_identical=False):
+    """Recursively walk all the entries of two trees.
+
+    Iteration is depth-first pre-order, as in e.g. os.walk.
+
+    :param store: An ObjectStore for looking up objects.
+    :param tree1_id: The SHA of the first Tree object to iterate, or None.
+    :param tree2_id: The SHA of the second Tree object to iterate, or None.
+    :param prune_identical: If True, identical subtrees will not be walked.
+    :return: Iterator over Pairs of TreeEntry objects for each pair of entries
+        in the trees and their subtrees recursively. If an entry exists in one
+        tree but not the other, the other entry will have all attributes set
+        to None. If neither entry's path is None, they are guaranteed to
+        match.
+    """
+    # This could be fairly easily generalized to >2 trees if we find a use case.
+    # "x and y or None" is the pre-2.5 spelling of a conditional expression:
+    # a root gets the directory mode only when an id was supplied.
+    mode1 = tree1_id and stat.S_IFDIR or None
+    mode2 = tree2_id and stat.S_IFDIR or None
+    # Explicit stack of (entry1, entry2) pairs, seeded with the two roots at
+    # the empty path.
+    todo = [(TreeEntry('', mode1, tree1_id), TreeEntry('', mode2, tree2_id))]
+    while todo:
+        entry1, entry2 = todo.pop()
+        is_tree1 = _is_tree(entry1)
+        is_tree2 = _is_tree(entry2)
+        # A pruned pair is neither descended into nor yielded.
+        if prune_identical and is_tree1 and is_tree2 and entry1 == entry2:
+            continue
+
+        # Only tree entries are looked up in the store; anything else
+        # contributes no children.
+        tree1 = is_tree1 and store[entry1.sha] or None
+        tree2 = is_tree2 and store[entry2.sha] or None
+        path = entry1.path or entry2.path
+        # Pushed in reverse so that pop() hands the children back in the
+        # order _merge_entries produced them.
+        todo.extend(reversed(_merge_entries(path, tree1, tree2)))
+        yield entry1, entry2
+
+
+def _skip_tree(entry):
+    if entry.mode is None or stat.S_ISDIR(entry.mode):
+        return _NULL_ENTRY
+    return entry
+
+
+def tree_changes(store, tree1_id, tree2_id, want_unchanged=False):
+    """Find the differences between the contents of two trees.
+
+    :param store: An ObjectStore for looking up objects.
+    :param tree1_id: The SHA of the source tree.
+    :param tree2_id: The SHA of the target tree.
+    :param want_unchanged: If True, include TreeChanges for unmodified entries
+        as well.
+    :return: Iterator over TreeChange instances for each change between the
+        source and target tree.
+    """
+    # Identical subtrees can only be skipped when unchanged entries are not
+    # wanted.
+    entries = walk_trees(store, tree1_id, tree2_id,
+                         prune_identical=(not want_unchanged))
+    for entry1, entry2 in entries:
+        if entry1 == entry2 and not want_unchanged:
+            continue
+
+        # Treat entries for trees as missing.
+        entry1 = _skip_tree(entry1)
+        entry2 = _skip_tree(entry2)
+
+        if entry1 != _NULL_ENTRY and entry2 != _NULL_ENTRY:
+            if stat.S_IFMT(entry1.mode) != stat.S_IFMT(entry2.mode):
+                # File type changed: report as delete/add.
+                # The delete is yielded here; the add is yielded by the
+                # common yield at the bottom of the loop.
+                yield TreeChange.delete(entry1)
+                entry1 = _NULL_ENTRY
+                change_type = CHANGE_ADD
+            elif entry1 == entry2:
+                change_type = CHANGE_UNCHANGED
+            else:
+                change_type = CHANGE_MODIFY
+        elif entry1 != _NULL_ENTRY:
+            change_type = CHANGE_DELETE
+        elif entry2 != _NULL_ENTRY:
+            change_type = CHANGE_ADD
+        else:
+            # Both were None because at least one was a tree.
+            continue
+        yield TreeChange(change_type, entry1, entry2)
+
+
+_BLOCK_SIZE = 64
+
+
+def _count_blocks(obj):
+    """Count the blocks in an object.
+
+    Splits the data into blocks either on lines or <=64-byte chunks of lines.
+
+    :param obj: The object to count blocks for.
+    :return: A dict of block hashcode -> total bytes occurring.
+    """
+    block_counts = defaultdict(int)
+    # Scratch buffer holding the block currently being assembled.
+    block = StringIO()
+    # Number of characters written to the current block.
+    n = 0
+
+    # Cache attrs as locals to avoid expensive lookups in the inner loop.
+    block_write = block.write
+    block_seek = block.seek
+    block_truncate = block.truncate
+    block_getvalue = block.getvalue
+
+    # Iterate character by character across all raw chunks, so a block may
+    # span chunk boundaries.
+    for c in itertools.chain(*obj.as_raw_chunks()):
+        block_write(c)
+        n += 1
+        # A block ends at a newline or once it reaches _BLOCK_SIZE characters.
+        if c == '\n' or n == _BLOCK_SIZE:
+            value = block_getvalue()
+            block_counts[hash(value)] += len(value)
+            # Rewind and empty the buffer for the next block.
+            block_seek(0)
+            block_truncate()
+            n = 0
+    # Account for a trailing block that ended without a newline.
+    if n > 0:
+        last_block = block_getvalue()
+        block_counts[hash(last_block)] += len(last_block)
+    return block_counts
+
+
+def _common_bytes(blocks1, blocks2):
+    """Count the number of common bytes in two block count dicts.
+
+    :param blocks1: The first dict of block hashcode -> total bytes.
+    :param blocks2: The second dict of block hashcode -> total bytes.
+    :return: The number of bytes in common between blocks1 and blocks2. This is
+        only approximate due to possible hash collisions.
+    """
+    # Iterate over the smaller of the two dicts, since this is symmetrical.
+    if len(blocks1) > len(blocks2):
+        blocks1, blocks2 = blocks2, blocks1
+    score = 0
+    for block, count1 in blocks1.iteritems():
+        # .get() rather than indexing, so a missing key is simply skipped.
+        count2 = blocks2.get(block)
+        if count2:
+            # Only the bytes present on both sides count as common.
+            score += min(count1, count2)
+    return score
+
+
+def _similarity_score(obj1, obj2, block_cache=None):
+    """Compute a similarity score for two objects.
+
+    :param obj1: The first object to score.
+    :param obj2: The second object to score.
+    :param block_cache: An optional dict of SHA to block counts to cache results
+        between calls.
+    :return: The similarity score between the two objects, defined as the number
+        of bytes in common between the two objects divided by the maximum size,
+        scaled to the range 0-100.
+    """
+    if block_cache is None:
+        block_cache = {}
+    if obj1.id not in block_cache:
+        block_cache[obj1.id] = _count_blocks(obj1)
+    if obj2.id not in block_cache:
+        block_cache[obj2.id] = _count_blocks(obj2)
+
+    common_bytes = _common_bytes(block_cache[obj1.id], block_cache[obj2.id])
+    max_size = max(obj1.raw_length(), obj2.raw_length())
+    if not max_size:
+        return _MAX_SCORE
+    return int(float(common_bytes) * _MAX_SCORE / max_size)
+
+
+def _tree_change_key(entry):
+    # Sort by old path then new path. If only one exists, use it for both keys.
+    path1 = entry.old.path
+    path2 = entry.new.path
+    if path1 is None:
+        path1 = path2
+    if path2 is None:
+        path2 = path1
+    return (path1, path2)
+
+
+class RenameDetector(object):
+    """Object for handling rename detection between two trees."""
+
+    def __init__(self, store, tree1_id, tree2_id,
+                 rename_threshold=_RENAME_THRESHOLD, max_files=_MAX_FILES,
+                 rewrite_threshold=_REWRITE_THRESHOLD,
+                 find_copies_harder=False):
+        """Create a detector for renames between two trees.
+
+        :param store: An ObjectStore for looking up objects.
+        :param tree1_id: The SHA of the first Tree.
+        :param tree2_id: The SHA of the second Tree.
+        :param rename_threshold: The similarity score an add/delete pair has
+            to exceed to be reported as a rename/copy; see _similarity_score.
+        :param max_files: The maximum number of adds and deletes to consider,
+            or None for no limit. No more than max_files ** 2 add/delete pairs
+            are ever compared, because rename detection can be quadratic in
+            the project size. If the limit is exceeded, no content rename
+            detection is attempted.
+        :param rewrite_threshold: The similarity score below which a modify
+            is treated as a delete/add, or None to not break modifies; see
+            _similarity_score.
+        :param find_copies_harder: If True, consider unmodified files when
+            detecting copies.
+        """
+        # What to compare and where to look objects up.
+        self._store = store
+        self._tree1_id, self._tree2_id = tree1_id, tree2_id
+
+        # Tuning knobs.
+        self._rename_threshold = rename_threshold
+        self._rewrite_threshold = rewrite_threshold
+        self._max_files = max_files
+        self._find_copies_harder = find_copies_harder
+
+        # Working state, filled in while detecting renames.
+        self._adds = []
+        self._deletes = []
+        self._changes = []
+
+    def _should_split(self, change):
+        """Return True if a modify should be broken into a delete and an add.
+
+        Only modifies whose two sides differ are candidates, and only when a
+        rewrite threshold is set; the change is split when the similarity
+        score of its two objects is below that threshold.
+        """
+        threshold = self._rewrite_threshold
+        if threshold is None:
+            return False
+        if change.type != CHANGE_MODIFY:
+            return False
+        if change.old.sha == change.new.sha:
+            return False
+        before = self._store[change.old.sha]
+        after = self._store[change.new.sha]
+        return _similarity_score(before, after) < threshold
+
+    def _collect_changes(self):
+        """Sort the raw tree changes into pending adds, deletes and others."""
+        harder = self._find_copies_harder
+        raw_changes = tree_changes(self._store, self._tree1_id,
+                                   self._tree2_id, want_unchanged=harder)
+        for change in raw_changes:
+            kind = change.type
+            if kind == CHANGE_ADD:
+                self._adds.append(change)
+                continue
+            if kind == CHANGE_DELETE:
+                self._deletes.append(change)
+                continue
+            if self._should_split(change):
+                # A heavily rewritten file is treated as delete + add.
+                self._deletes.append(TreeChange.delete(change.old))
+                self._adds.append(TreeChange.add(change.new))
+                continue
+            if harder and kind in (CHANGE_MODIFY, CHANGE_UNCHANGED):
+                # Treat modified/unchanged as deleted rather than splitting it,
+                # to avoid spurious renames.
+                self._deletes.append(change)
+            else:
+                self._changes.append(change)
+
+    def _prune(self, add_paths, delete_paths):
+        """Drop pending adds/deletes whose paths are in the given sets.
+
+        :param add_paths: New paths to remove from the pending adds.
+        :param delete_paths: Old paths to remove from the pending deletes.
+        """
+        remaining_adds = []
+        for add in self._adds:
+            if add.new.path not in add_paths:
+                remaining_adds.append(add)
+        remaining_deletes = []
+        for delete in self._deletes:
+            if delete.old.path not in delete_paths:
+                remaining_deletes.append(delete)
+        self._adds = remaining_adds
+        self._deletes = remaining_deletes
+
+    def _find_exact_renames(self):
+        """Detect renames and copies whose two sides have the same SHA.
+
+        Pending adds and deletes that share a SHA are paired up in order. Each
+        pair with matching file types is recorded as a rename, or as a copy
+        when the "delete" is really a modified/unchanged entry kept for
+        find_copies_harder. Adds left over once every delete for that SHA has
+        been paired are recorded as copies of the first delete. All matched
+        paths are then pruned from the pending adds and deletes.
+        """
+        add_map = defaultdict(list)
+        for add in self._adds:
+            add_map[add.new.sha].append(add.new)
+        delete_map = defaultdict(list)
+        for delete in self._deletes:
+            # Keep track of whether the delete was actually marked as a delete.
+            # If not, it must have been added due to find_copies_harder, and
+            # needs to be marked as a copy.
+            is_delete = delete.type == CHANGE_DELETE
+            delete_map[delete.old.sha].append((delete.old, is_delete))
+
+        add_paths = set()
+        delete_paths = set()
+        for sha, sha_deletes in delete_map.iteritems():
+            sha_adds = add_map[sha]
+            for (old, is_delete), new in itertools.izip(sha_deletes, sha_adds):
+                if stat.S_IFMT(old.mode) != stat.S_IFMT(new.mode):
+                    continue
+                delete_paths.add(old.path)
+                add_paths.add(new.path)
+                new_type = is_delete and CHANGE_RENAME or CHANGE_COPY
+                self._changes.append(TreeChange(new_type, old, new))
+
+            # Only adds beyond those paired above are extra copies. With more
+            # deletes than adds the difference is negative, and slicing with
+            # it would report already-paired adds a second time, so require a
+            # strictly positive count.
+            num_extra_adds = len(sha_adds) - len(sha_deletes)
+            if num_extra_adds > 0:
+                # TODO(dborowitz): Less arbitrary way of dealing with extra
+                # copies.
+                old = sha_deletes[0][0]
+                for new in sha_adds[-num_extra_adds:]:
+                    add_paths.add(new.path)
+                    self._changes.append(TreeChange(CHANGE_COPY, old, new))
+        self._prune(add_paths, delete_paths)
+
+    def _find_content_renames(self):
+        """Detect renames and copies by comparing object contents.
+
+        Every pending delete (other than git links) is scored against every
+        pending add of the same file type with _similarity_score. Pairs that
+        score above the rename threshold become candidates, which are accepted
+        from the highest score down so that each added path is matched at most
+        once. A candidate whose old path was already consumed by a better
+        match is reported as a copy. Matched paths are pruned from the pending
+        adds and deletes.
+
+        Nothing is attempted when the add/delete matrix is larger than
+        max_files ** 2; a max_files of None disables that limit.
+        """
+        # TODO: Optimizations:
+        #  - Compare object sizes before counting blocks.
+        #  - Skip if delete's S_IFMT differs from all adds.
+        #  - Skip if adds or deletes is empty.
+        # Match C git's behavior of not attempting to find content renames if
+        # the matrix size exceeds the threshold. max_files may be None (no
+        # limit), so it has to be checked before it is squared.
+        max_files = self._max_files
+        if (max_files is not None and
+            len(self._adds) * len(self._deletes) > max_files ** 2):
+            return
+
+        check_paths = self._rename_threshold is not None
+        candidates = []
+        for delete in self._deletes:
+            if S_ISGITLINK(delete.old.mode):
+                continue  # Git links don't exist in this repo.
+            old_sha = delete.old.sha
+            old_obj = self._store[old_sha]
+            # Count the old object's blocks once and reuse them for every add.
+            old_blocks = _count_blocks(old_obj)
+            for add in self._adds:
+                if stat.S_IFMT(delete.old.mode) != stat.S_IFMT(add.new.mode):
+                    continue
+                new_obj = self._store[add.new.sha]
+                score = _similarity_score(old_obj, new_obj,
+                                          block_cache={old_sha: old_blocks})
+                if score > self._rename_threshold:
+                    if check_paths and delete.old.path == add.new.path:
+                        # If the paths match, this must be a split modify, so
+                        # make sure it comes out as a modify.
+                        new_type = CHANGE_MODIFY
+                    elif delete.type != CHANGE_DELETE:
+                        # If it's in deletes but not marked as a delete, it must
+                        # have been added due to find_copies_harder, and needs
+                        # to be marked as a copy.
+                        new_type = CHANGE_COPY
+                    else:
+                        new_type = CHANGE_RENAME
+                    rename = TreeChange(new_type, delete.old, add.new)
+                    # Negate the score so an ascending sort puts the best
+                    # matches first.
+                    candidates.append((-score, rename))
+
+        # Sort scores from highest to lowest, but keep names in ascending order.
+        candidates.sort()
+
+        delete_paths = set()
+        add_paths = set()
+        for _, change in candidates:
+            new_path = change.new.path
+            if new_path in add_paths:
+                # This add was already claimed by a better-scoring candidate.
+                continue
+            old_path = change.old.path
+            orig_type = change.type
+            if old_path in delete_paths:
+                # The old path was already used as a rename source, so any
+                # further match from it is a copy.
+                change = TreeChange(CHANGE_COPY, change.old, change.new)
+
+            # If the candidate was originally a copy, that means it came from a
+            # modified or unchanged path, so we don't want to prune it.
+            if orig_type != CHANGE_COPY:
+                delete_paths.add(old_path)
+            add_paths.add(new_path)
+            self._changes.append(change)
+        self._prune(add_paths, delete_paths)
+
+    def _join_modifies(self):
+        """Rejoin leftover delete/add pairs on the same path into modifies.
+
+        Does nothing unless a rewrite threshold is set. For every pending add
+        whose path also has a pending delete of the same file type, a single
+        modify is recorded and both the add and the delete are removed from
+        the pending lists.
+        """
+        if self._rewrite_threshold is None:
+            return
+
+        modifies = {}
+        delete_map = dict((d.old.path, d) for d in self._deletes)
+        for add in self._adds:
+            path = add.new.path
+            delete = delete_map.get(path)
+            if (delete is not None and
+              stat.S_IFMT(delete.old.mode) == stat.S_IFMT(add.new.mode)):
+                modifies[path] = TreeChange(CHANGE_MODIFY, delete.old, add.new)
+
+        self._adds = [a for a in self._adds if a.new.path not in modifies]
+        # Deletes are matched by their old path (the key used in delete_map);
+        # the new side of a delete has no path, so filtering on it would never
+        # remove anything and the delete would be reported next to the modify.
+        self._deletes = [d for d in self._deletes
+                         if d.old.path not in modifies]
+        self._changes += modifies.values()
+
+    def _sorted_changes(self):
+        """Return every pending add, delete and other change, sorted by path.
+
+        :return: A new list ordered by _tree_change_key.
+        """
+        combined = self._adds + self._deletes + self._changes
+        return sorted(combined, key=_tree_change_key)
+
+    def _prune_unchanged(self):
+        """Discard pending deletes whose change type is CHANGE_UNCHANGED."""
+        kept = []
+        for delete in self._deletes:
+            if delete.type != CHANGE_UNCHANGED:
+                kept.append(delete)
+        self._deletes = kept
+
+    def changes_with_renames(self):
+        """Compute the changes between the two trees, with rename detection.
+
+        :return: The sorted list built by _sorted_changes.
+        """
+        stages = (
+            self._collect_changes,
+            self._find_exact_renames,
+            self._find_content_renames,
+            self._join_modifies,
+            self._prune_unchanged,
+            )
+        for stage in stages:
+            stage()
+        return self._sorted_changes()
+
+
+# Hold on to the pure-python implementations for testing. These aliases must be
+# bound before the import below, which rebinds the original names.
+_is_tree_py = _is_tree
+_merge_entries_py = _merge_entries
+_count_blocks_py = _count_blocks
+try:
+    # Try to import C versions; on success they replace the module-level
+    # _is_tree, _merge_entries and _count_blocks.
+    from dulwich._diff_tree import _is_tree, _merge_entries, _count_blocks
+except ImportError:
+    # No C extension available: keep using the pure-python versions.
+    pass

+ 4 - 0
dulwich/errors.py

@@ -166,3 +166,7 @@ class NoIndexPresent(Exception):
 
 class CommitError(Exception):
     """An error occurred while performing a commit."""
+
+
+class RefFormatError(Exception):
+    """Indicates an invalid ref name.
+
+    Derives directly from Exception, so it is not caught by handlers written
+    for KeyError.
+    """

+ 36 - 3
dulwich/fastexport.py

@@ -117,6 +117,7 @@ class GitImportProcessor(processor.ImportProcessor):
         self.repo = repo
         self.last_commit = None
         self.markers = {}
+        self._contents = {}
 
     def import_stream(self, stream):
         p = parser.ImportParser(stream)
@@ -151,10 +152,32 @@ class GitImportProcessor(processor.ImportProcessor):
         commit.commit_time = int(commit_timestamp)
         commit.message = cmd.message
         commit.parents = []
-        contents = {}
+        if cmd.from_:
+            self._reset_base(cmd.from_)
+        for filecmd in cmd.iter_files():
+            if filecmd.name == "filemodify":
+                if filecmd.data is not None:
+                    blob = Blob.from_string(filecmd.data)
+                    self.repo.object_store.add(blob)
+                    blob_id = blob.id
+                else:
+                    assert filecmd.dataref[0] == ":", "non-marker refs not supported yet"
+                    blob_id = self.markers[filecmd.dataref[1:]]
+                self._contents[filecmd.path] = (filecmd.mode, blob_id)
+            elif filecmd.name == "filedelete":
+                del self._contents[filecmd.path]
+            elif filecmd.name == "filecopy":
+                self._contents[filecmd.dest_path] = self._contents[filecmd.src_path]
+            elif filecmd.name == "filerename":
+                self._contents[filecmd.new_path] = self._contents[filecmd.old_path]
+                del self._contents[filecmd.old_path]
+            elif filecmd.name == "filedeleteall":
+                self._contents = {}
+            else:
+                raise Exception("Command %s not supported" % filecmd.name)
         commit.tree = commit_tree(self.repo.object_store,
             ((path, hexsha, mode) for (path, (mode, hexsha)) in
-                contents.iteritems()))
+                self._contents.iteritems()))
         if self.last_commit is not None:
             commit.parents.append(self.last_commit)
         commit.parents += cmd.merges
@@ -168,9 +191,19 @@ class GitImportProcessor(processor.ImportProcessor):
         """Process a ProgressCommand."""
         pass
 
+    def _reset_base(self, commit_id):
+        """Make commit_id the base that following commits build on.
+
+        Does nothing when commit_id is already the last commit. Otherwise it
+        is recorded as the last commit and the tracked contents are replaced
+        by a path -> (mode, hexsha) mapping read from that commit's tree.
+
+        :param commit_id: Id of the commit to use as the new base.
+        """
+        if self.last_commit == commit_id:
+            return
+        self.last_commit = commit_id
+        contents = self._contents = {}
+        tree_id = self.repo[commit_id].tree
+        entries = self.repo.object_store.iter_tree_contents(tree_id)
+        for (path, mode, hexsha) in entries:
+            contents[path] = (mode, hexsha)
+
     def reset_handler(self, cmd):
         """Process a ResetCommand."""
-        self.last_commit = cmd.from_
-        self.rep.refs[cmd.from_] = cmd.id
+        self._reset_base(cmd.from_)
+        self.repo.refs[cmd.from_] = cmd.id
 
     def tag_handler(self, cmd):

+ 1 - 16
dulwich/file.py

@@ -71,22 +71,7 @@ def GitFile(filename, mode='rb', bufsize=-1):
     Only read-only and write-only (binary) modes are supported; r+, w+, and a
     are not.  To read and write from the same file, you can take advantage of
     the fact that opening a file for write does not actually open the file you
-    request:
-
-    >>> write_file = GitFile('filename', 'wb')
-    >>> read_file = GitFile('filename', 'rb')
-    >>> read_file.readlines()
-    ['contents\n', 'of\n', 'the\n', 'file\n']
-    >>> write_file.write('foo')
-    >>> read_file.close()
-    >>> write_file.close()
-    >>> new_file = GitFile('filename', 'rb')
-    'foo'
-    >>> new_file.close()
-    >>> other_file = GitFile('filename', 'wb')
-    Traceback (most recent call last):
-        ...
-    OSError: [Errno 17] File exists: 'filename.lock'
+    request.
     """
     if 'a' in mode:
         raise IOError('append mode not supported for Git files')

+ 29 - 63
dulwich/object_store.py

@@ -23,11 +23,14 @@
 import errno
 import itertools
 import os
-import posixpath
 import stat
 import tempfile
 import urllib2
 
+from dulwich.diff_tree import (
+    tree_changes,
+    walk_trees,
+    )
 from dulwich.errors import (
     NotTreeError,
     )
@@ -129,52 +132,14 @@ class BaseObjectStore(object):
         :param object_store: Object store to use for retrieving tree contents
         :param tree: SHA1 of the root tree
         :param want_unchanged: Whether unchanged files should be reported
-        :return: Iterator over tuples with (oldpath, newpath), (oldmode, newmode), (oldsha, newsha)
+        :return: Iterator over tuples with
+            (oldpath, newpath), (oldmode, newmode), (oldsha, newsha)
         """
-        todo = set([(source, target, "")])
-        while todo:
-            (sid, tid, path) = todo.pop()
-            if sid is not None:
-                stree = self[sid]
-            else:
-                stree = {}
-            if tid is not None:
-                ttree = self[tid]
-            else:
-                ttree = {}
-            for name, oldmode, oldhexsha in stree.iteritems():
-                oldchildpath = posixpath.join(path, name)
-                try:
-                    (newmode, newhexsha) = ttree[name]
-                    newchildpath = oldchildpath
-                except KeyError:
-                    newmode = None
-                    newhexsha = None
-                    newchildpath = None
-                if (want_unchanged or oldmode != newmode or
-                    oldhexsha != newhexsha):
-                    if stat.S_ISDIR(oldmode):
-                        if newmode is None or stat.S_ISDIR(newmode):
-                            todo.add((oldhexsha, newhexsha, oldchildpath))
-                        else:
-                            # entry became a file
-                            todo.add((oldhexsha, None, oldchildpath))
-                            yield ((None, newchildpath), (None, newmode), (None, newhexsha))
-                    else:
-                        if newmode is not None and stat.S_ISDIR(newmode):
-                            # entry became a dir
-                            yield ((oldchildpath, None), (oldmode, None), (oldhexsha, None))
-                            todo.add((None, newhexsha, newchildpath))
-                        else:
-                            yield ((oldchildpath, newchildpath), (oldmode, newmode), (oldhexsha, newhexsha))
-
-            for name, newmode, newhexsha in ttree.iteritems():
-                childpath = posixpath.join(path, name)
-                if not name in stree:
-                    if not stat.S_ISDIR(newmode):
-                        yield ((None, childpath), (None, newmode), (None, newhexsha))
-                    else:
-                        todo.add((None, newhexsha, childpath))
+        for change in tree_changes(self, source, target,
+                                   want_unchanged=want_unchanged):
+            yield ((change.old.path, change.new.path),
+                   (change.old.mode, change.new.mode),
+                   (change.old.sha, change.new.sha))
 
     def iter_tree_contents(self, tree_id, include_trees=False):
         """Iterate the contents of a tree and all subtrees.
@@ -183,19 +148,12 @@ class BaseObjectStore(object):
 
         :param tree_id: SHA1 of the tree.
         :param include_trees: If True, include tree objects in the iteration.
-        :return: Yields tuples of (path, mode, hexhsa) for objects in a tree.
+        :return: Iterator over TreeEntry namedtuples for all the objects in a
+            tree.
         """
-        todo = [('', stat.S_IFDIR, tree_id)]
-        while todo:
-            path, mode, hexsha = todo.pop()
-            is_subtree = stat.S_ISDIR(mode)
-            if not is_subtree or include_trees:
-                yield path, mode, hexsha
-            if is_subtree:
-                entries = reversed(list(self[hexsha].iteritems()))
-                for name, entry_mode, entry_hexsha in entries:
-                    entry_path = posixpath.join(path, name)
-                    todo.append((entry_path, entry_mode, entry_hexsha))
+        for entry, _ in walk_trees(self, tree_id, None):
+            if not stat.S_ISDIR(entry.mode) or include_trees:
+                yield entry
 
     def find_missing_objects(self, haves, wants, progress=None,
                              get_tagged=None):
@@ -338,7 +296,7 @@ class PackBasedObjectStore(BaseObjectStore):
             sha = name
             hexsha = None
         else:
-            raise AssertionError
+            raise AssertionError("Invalid object name %r" % name)
         for pack in self.packs:
             try:
                 return pack.get_raw(sha)
@@ -443,10 +401,14 @@ class DiskObjectStore(PackBasedObjectStore):
         data.create_index_v2(temppath)
         p = Pack.from_objects(data, load_pack_index(temppath))
 
-        # Write a full pack version
-        temppath = os.path.join(self.pack_dir,
-            sha_to_hex(urllib2.randombytes(20))+".temppack")
-        write_pack(temppath, ((o, None) for o in p.iterobjects()), len(p))
+        try:
+            # Write a full pack version
+            temppath = os.path.join(self.pack_dir,
+                sha_to_hex(urllib2.randombytes(20))+".temppack")
+            write_pack(temppath, ((o, None) for o in p.iterobjects()), len(p))
+        finally:
+            p.close()
+
         pack_sha = load_pack_index(temppath+".idx").objects_sha1()
         newbasename = os.path.join(self.pack_dir, "pack-%s" % pack_sha)
         os.rename(temppath+".pack", newbasename+".pack")
@@ -579,6 +541,10 @@ class MemoryObjectStore(BaseObjectStore):
     def __getitem__(self, name):
         return self._data[name]
 
+    def __delitem__(self, name):
+        """Delete an object from this store, for testing only.
+
+        :param name: Key under which the object is held in the store's
+            internal mapping.
+        """
+        del self._data[name]
+
     def add_object(self, obj):
         """Add a single object to this object store.
 

+ 32 - 20
dulwich/objects.py

@@ -38,7 +38,7 @@ from dulwich.errors import (
     ObjectFormatException,
     )
 from dulwich.file import GitFile
-from dulwich.misc import (
+from dulwich._compat import (
     make_sha,
     TreeEntryTuple,
     )
@@ -143,7 +143,7 @@ def check_identity(identity, error_msg):
     """Check if the specified identity is valid.
 
     This will raise an exception if the identity is not valid.
-    
+
     :param identity: Identity string
     :param error_msg: Error message to use in exception
     """
@@ -175,7 +175,7 @@ class FixedSha(object):
 class ShaFile(object):
     """A git SHA file."""
 
-    __slots__ = ('_needs_parsing', '_chunked_text', '_file', '_path', 
+    __slots__ = ('_needs_parsing', '_chunked_text', '_file', '_path',
                  '_sha', '_needs_serialization', '_magic')
 
     @staticmethod
@@ -564,7 +564,7 @@ class Tag(ShaFile):
     type_name = 'tag'
     type_num = 4
 
-    __slots__ = ('_tag_timezone_neg_utc', '_name', '_object_sha', 
+    __slots__ = ('_tag_timezone_neg_utc', '_name', '_object_sha',
                  '_object_class', '_tag_time', '_tag_timezone',
                  '_tagger', '_message')
 
@@ -694,18 +694,20 @@ class TreeEntry(TreeEntryTuple):
         return TreeEntry(posixpath.join(path, self.path), self.mode, self.sha)
 
 
-def parse_tree(text):
+def parse_tree(text, strict=False):
     """Parse a tree text.
 
     :param text: Serialized text to parse
     :return: iterator of tuples of (name, mode, sha)
+    :raise ObjectFormatException: if the object was malformed in some way
     """
     count = 0
     l = len(text)
     while count < l:
         mode_end = text.index(' ', count)
         mode_text = text[count:mode_end]
-        assert mode_text[0] != '0'
+        if strict and mode_text.startswith('0'):
+            raise ObjectFormatException("Invalid mode '%s'" % mode_text)
         try:
             mode = int(mode_text, 8)
         except ValueError:
@@ -730,14 +732,17 @@ def serialize_tree(items):
         yield "%04o %s\0%s" % (mode, name, hex_to_sha(hexsha))
 
 
-def sorted_tree_items(entries):
-    """Iterate over a tree entries dictionary in the order in which 
-    the items would be serialized.
+def sorted_tree_items(entries, name_order):
+    """Iterate over a tree entries dictionary.
 
+    :param name_order: If True, iterate entries in order of their name. If
+        False, iterate entries in tree order, that is, treat subtree entries as
+        having '/' appended.
     :param entries: Dictionary mapping names to (mode, sha) tuples
     :return: Iterator over (name, mode, hexsha)
     """
-    for name, entry in sorted(entries.iteritems(), cmp=cmp_entry):
+    cmp_func = name_order and cmp_entry_name_order or cmp_entry
+    for name, entry in sorted(entries.iteritems(), cmp=cmp_func):
         mode, hexsha = entry
         # Stricter type checks than normal to mirror checks in the C version.
         mode = int(mode)
@@ -747,7 +752,7 @@ def sorted_tree_items(entries):
 
 
 def cmp_entry((name1, value1), (name2, value2)):
-    """Compare two tree entries."""
+    """Compare two tree entries in tree order."""
     if stat.S_ISDIR(value1[0]):
         name1 += "/"
     if stat.S_ISDIR(value2[0]):
@@ -755,6 +760,11 @@ def cmp_entry((name1, value1), (name2, value2)):
     return cmp(name1, name2)
 
 
+def cmp_entry_name_order(entry1, entry2):
+    """Compare two tree entries in name order.
+
+    Only the first element of each entry (its name) takes part in the
+    comparison.
+    """
+    name1, name2 = entry1[0], entry2[0]
+    return cmp(name1, name2)
+
+
 class Tree(ShaFile):
     """A Git tree object"""
 
@@ -822,9 +832,9 @@ class Tree(ShaFile):
 
     def entries(self):
         """Return a list of tuples describing the tree entries.
-        
-        :note: The order of the tuples that are returned is different from that 
-            returned by the items and iteritems methods. This function will be 
+
+        :note: The order of the tuples that are returned is different from that
+            returned by the items and iteritems methods. This function will be
             deprecated in the future.
         """
         self._ensure_parsed()
@@ -833,13 +843,14 @@ class Tree(ShaFile):
         return [
             (mode, name, hexsha) for (name, mode, hexsha) in self.iteritems()]
 
-    def iteritems(self):
-        """Iterate over entries in the order in which they would be serialized.
+    def iteritems(self, name_order=False):
+        """Iterate over entries.
 
+        :param name_order: If True, iterate in name order instead of tree order.
         :return: Iterator over (name, mode, sha) tuples
         """
         self._ensure_parsed()
-        return sorted_tree_items(self._entries)
+        return sorted_tree_items(self._entries, name_order)
 
     def items(self):
         """Return the sorted entries in this tree.
@@ -869,7 +880,8 @@ class Tree(ShaFile):
                          stat.S_IFLNK, stat.S_IFDIR, S_IFGITLINK,
                          # TODO: optionally exclude as in git fsck --strict
                          stat.S_IFREG | 0664)
-        for name, mode, sha in parse_tree("".join(self._chunked_text)):
+        for name, mode, sha in parse_tree(''.join(self._chunked_text),
+                                          True):
             check_hexsha(sha, 'invalid sha %s' % sha)
             if '/' in name or name in ('', '.', '..'):
                 raise ObjectFormatException('invalid name %s' % name)
@@ -903,7 +915,7 @@ def parse_timezone(text):
     """Parse a timezone text fragment (e.g. '+0100').
 
     :param text: Text to parse.
-    :return: Tuple with timezone as seconds difference to UTC 
+    :return: Tuple with timezone as seconds difference to UTC
         and a boolean indicating whether this was a UTC timezone
         prefixed with a negative sign (-0000).
     """
@@ -968,7 +980,7 @@ class Commit(ShaFile):
         self._parents = []
         self._extra = []
         self._author = None
-        for field, value in parse_commit("".join(self._chunked_text)):
+        for field, value in parse_commit(''.join(self._chunked_text)):
             if field == _TREE_HEADER:
                 self._tree = value
             elif field == _PARENT_HEADER:

+ 7 - 9
dulwich/pack.py

@@ -33,7 +33,7 @@ a pointer in to the corresponding packfile.
 try:
     from collections import defaultdict
 except ImportError:
-    from misc import defaultdict
+    from _compat import defaultdict
 
 from cStringIO import (
     StringIO,
@@ -53,7 +53,7 @@ import struct
 try:
     from struct import unpack_from
 except ImportError:
-    from dulwich.misc import unpack_from
+    from dulwich._compat import unpack_from
 import sys
 import zlib
 
@@ -65,7 +65,7 @@ from dulwich.file import GitFile
 from dulwich.lru_cache import (
     LRUSizeCache,
     )
-from dulwich.misc import (
+from dulwich._compat import (
     make_sha,
     SEEK_END,
     )
@@ -243,7 +243,7 @@ class PackIndex(object):
 
     def __iter__(self):
         """Iterate over the SHAs in this pack."""
-        raise NotImplementedError(self.__iter__)
+        return imap(sha_to_hex, self._itersha())
 
     def iterentries(self):
         """Iterate over the entries in this pack index.
@@ -278,10 +278,6 @@ class PackIndex(object):
         """
         raise NotImplementedError(self._object_index)
 
-    def __iter__(self):
-        """Iterate over the SHAs in this pack."""
-        return imap(sha_to_hex, self._itersha())
-
     def objects_sha1(self):
         """Return the hex SHA1 over all the shas of all objects in this pack.
 
@@ -350,7 +346,7 @@ class FilePackIndex(PackIndex):
         else:
             self._file = file
         if contents is None:
-            self._contents, self._size = _load_file_contents(file, size)
+            self._contents, self._size = _load_file_contents(self._file, size)
         else:
             self._contents, self._size = (contents, size)
 
@@ -364,6 +360,8 @@ class FilePackIndex(PackIndex):
 
     def close(self):
         self._file.close()
+        if getattr(self._contents, "close", None) is not None:
+            self._contents.close()
 
     def __len__(self):
         """Return the number of entries in this pack index."""

+ 26 - 2
dulwich/patch.py

@@ -28,6 +28,7 @@ import subprocess
 import time
 
 from dulwich.objects import (
+    Blob,
     Commit,
     )
 
@@ -136,14 +137,37 @@ def write_blob_diff(f, (old_path, old_mode, old_blob),
             f.write("new mode %o\n" % new_mode)
         else:
             f.write("deleted mode %o\n" % old_mode)
-    f.write("index %s..%s %o\n" % (
-        blob_id(old_blob), blob_id(new_blob), new_mode))
+    f.write("index %s..%s" % (blob_id(old_blob), blob_id(new_blob)))
+    if new_mode is not None:
+        f.write(" %o" % new_mode)
+    f.write("\n")
     old_contents = lines(old_blob)
     new_contents = lines(new_blob)
     f.writelines(unified_diff(old_contents, new_contents,
         old_path, new_path))
 
 
+def write_tree_diff(f, store, old_tree, new_tree):
+    """Write a diff for every change between two trees.
+
+    :param f: File-like object to write to.
+    :param store: Object store used to compute the tree changes and to look
+        up the object on each side of a change.
+    :param old_tree: Old tree id
+    :param new_tree: New tree id
+    """
+    def _blob_or_empty(sha):
+        # A side with no sha (an add or a delete) is diffed as an empty blob.
+        if sha is None:
+            return Blob.from_string("")
+        return store[sha]
+
+    for paths, modes, shas in store.tree_changes(old_tree, new_tree):
+        (oldpath, newpath) = paths
+        (oldmode, newmode) = modes
+        (oldsha, newsha) = shas
+        write_blob_diff(f, (oldpath, oldmode, _blob_or_empty(oldsha)),
+                           (newpath, newmode, _blob_or_empty(newsha)))
+
+
 def git_am_patch_split(f):
     """Parse a git-am-style patch and split it up into bits.
 

+ 1 - 2
dulwich/protocol.py

@@ -20,14 +20,13 @@
 """Generic functions for talking the git smart server protocol."""
 
 from cStringIO import StringIO
-import os
 import socket
 
 from dulwich.errors import (
     HangupException,
     GitProtocolError,
     )
-from dulwich.misc import (
+from dulwich._compat import (
     SEEK_END,
     )
 

+ 34 - 18
dulwich/repo.py

@@ -35,6 +35,7 @@ from dulwich.errors import (
     NotTagError,
     PackedRefsException,
     CommitError,
+    RefFormatError,
     )
 from dulwich.file import (
     ensure_dir_exists,
@@ -213,7 +214,7 @@ class RefsContainer(object):
         if name == 'HEAD':
             return
         if not name.startswith('refs/') or not check_ref_format(name[5:]):
-            raise KeyError(name)
+            raise RefFormatError(name)
 
     def read_ref(self, refname):
         """Read a reference without following any references.
@@ -763,13 +764,13 @@ class BaseRepo(object):
         self.object_store = object_store
         self.refs = refs
 
-    def _init_files(self):
+    def _init_files(self, bare):
         """Initialize a default set of named files."""
         self._put_named_file('description', "Unnamed repository")
         self._put_named_file('config', ('[core]\n'
                                         'repositoryformatversion = 0\n'
                                         'filemode = true\n'
-                                        'bare = ' + str(self.bare).lower() + '\n'
+                                        'bare = ' + str(bare).lower() + '\n'
                                         'logallrefupdates = true\n'))
         self._put_named_file(os.path.join('info', 'exclude'), '')
 
@@ -777,7 +778,7 @@ class BaseRepo(object):
         """Get a file from the control dir with a specific name.
 
         Although the filename should be interpreted as a filename relative to
-        the control dir in a disk-baked Repo, the object returned need not be
+        the control dir in a disk-based Repo, the object returned need not be
         pointing to a file in that location.
 
         :param path: The path to the file, relative to the control dir.
@@ -990,7 +991,10 @@ class BaseRepo(object):
                 return self.object_store[name]
             except KeyError:
                 pass
-        return self.object_store[self.refs[name]]
+        try:
+            return self.object_store[self.refs[name]]
+        except RefFormatError:
+            raise KeyError(name)
 
     def __contains__(self, name):
         if len(name) in (20, 40):
@@ -1017,7 +1021,7 @@ class BaseRepo(object):
     def do_commit(self, message, committer=None,
                   author=None, commit_timestamp=None,
                   commit_timezone=None, author_timestamp=None,
-                  author_timezone=None, tree=None):
+                  author_timezone=None, tree=None, encoding=None):
         """Create a new commit.
 
         :param message: Commit message
@@ -1028,7 +1032,9 @@ class BaseRepo(object):
         :param author_timestamp: Author timestamp (defaults to commit timestamp)
         :param author_timezone: Author timestamp timezone
             (defaults to commit timestamp timezone)
-        :param tree: SHA1 of the tree root to use (if not specified the current index will be committed).
+        :param tree: SHA1 of the tree root to use (if not specified the
+            current index will be committed).
+        :param encoding: Encoding
         :return: New commit SHA1
         """
         import time
@@ -1037,6 +1043,8 @@ class BaseRepo(object):
             index = self.open_index()
             c.tree = index.commit(self.object_store)
         else:
+            if len(tree) != 40:
+                raise ValueError("tree must be a 40-byte hex sha string")
             c.tree = tree
         # TODO: Allow username to be missing, and get it from .git/config
         if committer is None:
@@ -1058,6 +1066,8 @@ class BaseRepo(object):
         if author_timezone is None:
             author_timezone = commit_timezone
         c.author_timezone = author_timezone
+        if encoding is not None:
+            c.encoding = encoding
         c.message = message
         try:
             old_head = self.refs["HEAD"]
@@ -1116,7 +1126,7 @@ class Repo(BaseRepo):
         """Get a file from the control dir with a specific name.
 
         Although the filename should be interpreted as a filename relative to
-        the control dir in a disk-baked Repo, the object returned need not be
+        the control dir in a disk-based Repo, the object returned need not be
         pointing to a file in that location.
 
         :param path: The path to the file, relative to the control dir.
@@ -1184,22 +1194,28 @@ class Repo(BaseRepo):
         return "<Repo at %r>" % self.path
 
     @classmethod
-    def init(cls, path, mkdir=True):
-        controldir = os.path.join(path, ".git")
-        os.mkdir(controldir)
-        cls.init_bare(controldir)
-        return cls(path)
-
-    @classmethod
-    def init_bare(cls, path, mkdir=True):
+    def _init_maybe_bare(cls, path, bare):
         for d in BASE_DIRECTORIES:
             os.mkdir(os.path.join(path, *d))
         DiskObjectStore.init(os.path.join(path, OBJECTDIR))
         ret = cls(path)
         ret.refs.set_symbolic_ref("HEAD", "refs/heads/master")
-        ret._init_files()
+        ret._init_files(bare)
         return ret
 
+    @classmethod
+    def init(cls, path, mkdir=False):
+        if mkdir:
+            os.mkdir(path)
+        controldir = os.path.join(path, ".git")
+        os.mkdir(controldir)
+        cls._init_maybe_bare(controldir, False)
+        return cls(path)
+
+    @classmethod
+    def init_bare(cls, path):
+        return cls._init_maybe_bare(path, True)
+
     create = init_bare
 
 
@@ -1249,5 +1265,5 @@ class MemoryRepo(BaseRepo):
             ret.object_store.add_object(obj)
         for refname, sha in refs.iteritems():
             ret.refs[refname] = sha
-        ret._init_files()
+        ret._init_files(bare=True)
         return ret

+ 35 - 1
dulwich/server.py

@@ -48,8 +48,10 @@ from dulwich.pack import (
     write_pack_data,
     )
 from dulwich.protocol import (
+    BufferedPktLineWriter,
     MULTI_ACK,
     MULTI_ACK_DETAILED,
+    Protocol,
     ProtocolFile,
     ReceivableProtocol,
     SINGLE_ACK,
@@ -58,7 +60,6 @@ from dulwich.protocol import (
     ack_type,
     extract_capabilities,
     extract_want_line_capabilities,
-    BufferedPktLineWriter,
     )
 from dulwich.repo import (
     Repo,
@@ -155,6 +156,14 @@ class DictBackend(Backend):
         return self.repos[path]
 
 
+class FileSystemBackend(Backend):
+    """Simple backend that looks up Git repositories in the local file system."""
+
+    def open_repository(self, path):
+        logger.debug('opening repository at %s', path)
+        return Repo(path)
+
+
 class Handler(object):
     """Smart protocol command handler base class."""
 
@@ -775,3 +784,28 @@ def main(argv=sys.argv):
     backend = DictBackend({'/': Repo(gitdir)})
     server = TCPGitServer(backend, 'localhost')
     server.serve_forever()
+
+
+def serve_command(handler_cls, argv=sys.argv, backend=None, inf=sys.stdin,
+                  outf=sys.stdout):
+    """Serve a single command.
+
+    This is mostly useful for the implementation of commands used by e.g. git+ssh.
+
+    :param handler_cls: `Handler` class to use for the request
+    :param argv: execv-style command-line arguments. Defaults to sys.argv.
+    :param backend: `Backend` to use
+    :param inf: File-like object to read from, defaults to standard input.
+    :param outf: File-like object to write to, defaults to standard output.
+    :return: Exit code for use with sys.exit; currently always 0 (errors raise).
+    """
+    if backend is None:
+        backend = FileSystemBackend()
+    def send_fn(data):
+        outf.write(data)
+        outf.flush()
+    proto = Protocol(inf.read, send_fn)
+    handler = handler_cls(backend, argv[1:], proto)
+    # FIXME: Catch exceptions and write a single-line summary to outf.
+    handler.handle()
+    return 0

+ 85 - 28
dulwich/tests/__init__.py

@@ -19,42 +19,69 @@
 
 """Tests for Dulwich."""
 
+import doctest
+import os
 import unittest
-
-try:
-    from testtools.testcase import TestCase
-except ImportError:
-    from unittest import TestCase
+import shutil
+import subprocess
+import sys
+import tempfile
 
 try:
     # If Python itself provides an exception, use that
     from unittest import SkipTest as TestSkipped
 except ImportError:
-    # Check if the nose exception can be used
     try:
-        import nose
+        from unittest2 import SkipTest as TestSkipped
     except ImportError:
-        try:
-            import testtools.testcase
-        except ImportError:
-            class TestSkipped(Exception):
-                def __init__(self, msg):
-                    self.msg = msg
-        else:
-            TestSkipped = testtools.testcase.TestCase.skipException
-    else:
-        TestSkipped = nose.SkipTest
-        try:
-            import testtools.testcase
-        except ImportError:
-            pass
-        else:
-            # Make testtools use the same exception class as nose
-            testtools.testcase.TestCase.skipException = TestSkipped
+        from testtools.testcase import TestSkipped
+
+try:
+    from testtools.testcase import TestCase
+except ImportError:
+    from unittest import TestCase
+else:
+    TestCase.skipException = TestSkipped
 
 
-def test_suite():
+class BlackboxTestCase(TestCase):
+    """Blackbox testing."""
+
+    bin_directory = os.path.abspath(os.path.join(os.path.dirname(__file__),
+        "..", "..", "bin"))
+
+    def bin_path(self, name):
+        """Determine the full path of a binary.
+
+        :param name: Name of the script
+        :return: Full path
+        """
+        return os.path.join(self.bin_directory, name)
+
+    def run_command(self, name, args):
+        """Run a Dulwich command.
+
+        :param name: Name of the command, as it exists in bin/
+        :param args: Arguments to the command
+        """
+        env = dict(os.environ)
+        env["PYTHONPATH"] = os.pathsep.join(sys.path)
+
+        # Since they don't have any extensions, Windows can't recognize
+        # executability of the Python files in /bin. Even then, we'd have to
+        # expect the user to set up file associations for .py files.
+        #
+        # Save us from all that headache and call python with the bin script.
+        argv = [sys.executable, self.bin_path(name)] + args
+        return subprocess.Popen(argv,
+            stdout=subprocess.PIPE,
+            stdin=subprocess.PIPE, stderr=subprocess.PIPE,
+            env=env)
+
+
+def self_test_suite():
     names = [
+        'blackbox',
         'client',
         'fastexport',
         'file',
@@ -70,8 +97,38 @@ def test_suite():
         'web',
         ]
     module_names = ['dulwich.tests.test_' + name for name in names]
-    result = unittest.TestSuite()
     loader = unittest.TestLoader()
-    suite = loader.loadTestsFromNames(module_names)
-    result.addTests(suite)
+    return loader.loadTestsFromNames(module_names)
+
+
+def tutorial_test_suite():
+    tutorial = [
+        '0-introduction',
+        '1-repo',
+        '2-object-store',
+        '3-conclusion',
+        ]
+    tutorial_files = ["../../docs/tutorial/%s.txt" % name for name in tutorial]
+    def setup(test):
+        test.__dulwich_tempdir = tempfile.mkdtemp()
+        os.chdir(test.__dulwich_tempdir)
+    def teardown(test):
+        shutil.rmtree(test.__dulwich_tempdir)
+    return doctest.DocFileSuite(setUp=setup, tearDown=teardown,
+        *tutorial_files)
+
+
+def nocompat_test_suite():
+    result = unittest.TestSuite()
+    result.addTests(self_test_suite())
+    result.addTests(tutorial_test_suite())
+    return result
+
+
+def test_suite():
+    result = unittest.TestSuite()
+    result.addTests(self_test_suite())
+    result.addTests(tutorial_test_suite())
+    from dulwich.tests.compat import test_suite as compat_test_suite
+    result.addTests(compat_test_suite())
     return result

+ 37 - 0
dulwich/tests/compat/__init__.py

@@ -0,0 +1,37 @@
+# __init__.py -- Compatibility tests for dulwich
+# Copyright (C) 2010 Jelmer Vernooij <jelmer@samba.org>
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; version 2
+# of the License or (at your option) any later version of
+# the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+# MA  02110-1301, USA.
+
+"""Compatibility tests for Dulwich."""
+
+import unittest
+
+def test_suite():
+    names = [
+        'client',
+        'pack',
+        'repository',
+        'server',
+        'utils',
+        ]
+    module_names = ['dulwich.tests.compat.test_' + name for name in names]
+    result = unittest.TestSuite()
+    loader = unittest.TestLoader()
+    suite = loader.loadTestsFromNames(module_names)
+    result.addTests(suite)
+    return result

+ 4 - 0
dulwich/tests/compat/test_client.py

@@ -43,6 +43,7 @@ from utils import (
     run_git_or_fail,
     )
 
+
 class DulwichClientTestBase(object):
     """Tests for client/server compatibility."""
 
@@ -167,6 +168,7 @@ class DulwichClientTestBase(object):
 
 
 class DulwichTCPClientTest(CompatTestCase, DulwichClientTestBase):
+
     def setUp(self):
         CompatTestCase.setUp(self)
         DulwichClientTestBase.setUp(self)
@@ -211,6 +213,7 @@ class TestSSHVendor(object):
 
 
 class DulwichMockSSHClientTest(CompatTestCase, DulwichClientTestBase):
+
     def setUp(self):
         CompatTestCase.setUp(self)
         DulwichClientTestBase.setUp(self)
@@ -230,6 +233,7 @@ class DulwichMockSSHClientTest(CompatTestCase, DulwichClientTestBase):
 
 
 class DulwichSubprocessClientTest(CompatTestCase, DulwichClientTestBase):
+
     def setUp(self):
         CompatTestCase.setUp(self)
         DulwichClientTestBase.setUp(self)

+ 2 - 3
dulwich/tests/compat/test_server.py

@@ -20,8 +20,8 @@
 """Compatibility tests between Dulwich and the cgit server.
 
 Warning: these tests should be fairly stable, but when writing/debugging new
-tests, deadlocks may freeze the test process such that it cannot be Ctrl-C'ed.
-On *nix, you can kill the tests with Ctrl-Z, "kill %".
+    tests, deadlocks may freeze the test process such that it cannot be
+    Ctrl-C'ed. On POSIX systems, you can kill the tests with Ctrl-Z, "kill %".
 """
 
 import threading
@@ -29,7 +29,6 @@ import threading
 from dulwich.server import (
     DictBackend,
     TCPGitServer,
-    ReceivePackHandler,
     )
 from server_utils import (
     ServerTests,

+ 3 - 2
dulwich/tests/compat/test_utils.py

@@ -19,9 +19,8 @@
 
 """Tests for git compatibility utilities."""
 
-from unittest import TestCase
-
 from dulwich.tests import (
+    TestCase,
     TestSkipped,
     )
 import utils
@@ -30,6 +29,7 @@ import utils
 class GitVersionTests(TestCase):
 
     def setUp(self):
+        super(GitVersionTests, self).setUp()
         self._orig_run_git = utils.run_git
         self._version_str = None  # tests can override to set stub version
 
@@ -39,6 +39,7 @@ class GitVersionTests(TestCase):
         utils.run_git = run_git
 
     def tearDown(self):
+        super(GitVersionTests, self).tearDown()
         utils.run_git = self._orig_run_git
 
     def test_git_version_none(self):

+ 3 - 4
dulwich/tests/compat/test_web.py

@@ -19,15 +19,14 @@
 
 """Compatibility tests between Dulwich and the cgit HTTP server.
 
-Warning: these tests should be fairly stable, but when writing/debugging new
-tests, deadlocks may freeze the test process such that it cannot be Ctrl-C'ed.
-On *nix, you can kill the tests with Ctrl-Z, "kill %".
+warning: these tests should be fairly stable, but when writing/debugging new
+    tests, deadlocks may freeze the test process such that it cannot be
+    Ctrl-C'ed. On POSIX systems, you can kill the tests with Ctrl-Z, "kill %".
 """
 
 import threading
 from wsgiref import simple_server
 
-import dulwich
 from dulwich.server import (
     DictBackend,
     )

+ 8 - 2
dulwich/tests/compat/utils.py

@@ -36,6 +36,8 @@ from dulwich.tests import (
 
 _DEFAULT_GIT = 'git'
 _VERSION_LEN = 4
+_REPOS_DATA_DIR = os.path.abspath(os.path.join(
+    os.path.dirname(__file__), os.pardir, 'data', 'repos'))
 
 
 def git_version(git_path=_DEFAULT_GIT):
@@ -78,6 +80,10 @@ def require_git_version(required_version, git_path=_DEFAULT_GIT):
     :raise TestSkipped: if no suitable git version was found at the given path.
     """
     found_version = git_version(git_path=git_path)
+    if found_version is None:
+        raise TestSkipped('Test requires git >= %s, but c git not found' %
+                         (required_version, ))
+
     if len(required_version) > _VERSION_LEN:
         raise ValueError('Invalid version tuple %s, expected %i parts' %
                          (required_version, _VERSION_LEN))
@@ -142,8 +148,7 @@ def import_repo_to_dir(name):
     :returns: The path to the imported repository.
     """
     temp_dir = tempfile.mkdtemp()
-    export_path = os.path.join(os.path.dirname(__file__), os.pardir, 'data',
-                               'repos', name)
+    export_path = os.path.join(_REPOS_DATA_DIR, name)
     temp_repo_dir = os.path.join(temp_dir, name)
     export_file = open(export_path, 'rb')
     run_git_or_fail(['init', '--quiet', '--bare', temp_repo_dir])
@@ -152,6 +157,7 @@ def import_repo_to_dir(name):
     export_file.close()
     return temp_repo_dir
 
+
 def import_repo(name):
     """Import a repo from a fast-export file in a temporary directory.
 

+ 67 - 0
dulwich/tests/test_blackbox.py

@@ -0,0 +1,67 @@
+# test_blackbox.py -- blackbox tests
+# Copyright (C) 2010 Jelmer Vernooij <jelmer@samba.org>
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; version 2
+# of the License or (at your option) a later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+# MA  02110-1301, USA.
+
+"""Blackbox tests for Dulwich commands."""
+
+import tempfile
+
+from dulwich.repo import (
+    Repo,
+    )
+from dulwich.tests import (
+    BlackboxTestCase,
+    )
+
+
+class GitReceivePackTests(BlackboxTestCase):
+    """Blackbox tests for dul-receive-pack."""
+
+    def setUp(self):
+        super(GitReceivePackTests, self).setUp()
+        self.path = tempfile.mkdtemp()
+        self.repo = Repo.init(self.path)
+
+    def test_basic(self):
+        process = self.run_command("dul-receive-pack", [self.path])
+        (stdout, stderr) = process.communicate("0000")
+        self.assertEquals('', stderr)
+        self.assertEquals('0000', stdout[-4:])
+        self.assertEquals(0, process.returncode)
+
+    def test_missing_arg(self):
+        process = self.run_command("dul-receive-pack", [])
+        (stdout, stderr) = process.communicate()
+        self.assertEquals('usage: dul-receive-pack <git-dir>\n', stderr)
+        self.assertEquals('', stdout)
+        self.assertEquals(1, process.returncode)
+
+
+class GitUploadPackTests(BlackboxTestCase):
+    """Blackbox tests for dul-upload-pack."""
+
+    def setUp(self):
+        super(GitUploadPackTests, self).setUp()
+        self.path = tempfile.mkdtemp()
+        self.repo = Repo.init(self.path)
+
+    def test_missing_arg(self):
+        process = self.run_command("dul-upload-pack", [])
+        (stdout, stderr) = process.communicate()
+        self.assertEquals('usage: dul-upload-pack <git-dir>\n', stderr)
+        self.assertEquals('', stdout)
+        self.assertEquals(1, process.returncode)

+ 61 - 0
dulwich/tests/test_client.py

@@ -20,12 +20,16 @@ from cStringIO import StringIO
 
 from dulwich.client import (
     GitClient,
+    TCPGitClient,
+    SubprocessGitClient,
     SSHGitClient,
+    get_transport_and_path,
     )
 from dulwich.tests import (
     TestCase,
     )
 from dulwich.protocol import (
+    TCP_GIT_PORT,
     Protocol,
     )
 
@@ -66,6 +70,63 @@ class GitClientTests(TestCase):
         self.client.fetch_pack("bla", lambda heads: [], None, None, None)
         self.assertEquals(self.rout.getvalue(), "0000")
 
+    def test_get_transport_and_path_tcp(self):
+        client, path = get_transport_and_path('git://foo.com/bar/baz')
+        self.assertTrue(isinstance(client, TCPGitClient))
+        self.assertEquals('foo.com', client._host)
+        self.assertEquals(TCP_GIT_PORT, client._port)
+        self.assertEqual('/bar/baz', path)
+
+        client, path = get_transport_and_path('git://foo.com:1234/bar/baz')
+        self.assertTrue(isinstance(client, TCPGitClient))
+        self.assertEquals('foo.com', client._host)
+        self.assertEquals(1234, client._port)
+        self.assertEqual('/bar/baz', path)
+
+    def test_get_transport_and_path_ssh_explicit(self):
+        client, path = get_transport_and_path('git+ssh://foo.com/bar/baz')
+        self.assertTrue(isinstance(client, SSHGitClient))
+        self.assertEquals('foo.com', client.host)
+        self.assertEquals(None, client.port)
+        self.assertEquals(None, client.username)
+        self.assertEqual('/bar/baz', path)
+
+        client, path = get_transport_and_path('git+ssh://foo.com:1234/bar/baz')
+        self.assertTrue(isinstance(client, SSHGitClient))
+        self.assertEquals('foo.com', client.host)
+        self.assertEquals(1234, client.port)
+        self.assertEqual('/bar/baz', path)
+
+    def test_get_transport_and_path_ssh_implicit(self):
+        client, path = get_transport_and_path('foo:/bar/baz')
+        self.assertTrue(isinstance(client, SSHGitClient))
+        self.assertEquals('foo', client.host)
+        self.assertEquals(None, client.port)
+        self.assertEquals(None, client.username)
+        self.assertEqual('/bar/baz', path)
+
+        client, path = get_transport_and_path('foo.com:/bar/baz')
+        self.assertTrue(isinstance(client, SSHGitClient))
+        self.assertEquals('foo.com', client.host)
+        self.assertEquals(None, client.port)
+        self.assertEquals(None, client.username)
+        self.assertEqual('/bar/baz', path)
+
+        client, path = get_transport_and_path('user@foo.com:/bar/baz')
+        self.assertTrue(isinstance(client, SSHGitClient))
+        self.assertEquals('foo.com', client.host)
+        self.assertEquals(None, client.port)
+        self.assertEquals('user', client.username)
+        self.assertEqual('/bar/baz', path)
+
+    def test_get_transport_and_path_subprocess(self):
+        client, path = get_transport_and_path('foo.bar/baz')
+        self.assertTrue(isinstance(client, SubprocessGitClient))
+        self.assertEquals('foo.bar/baz', path)
+
+    def test_get_transport_and_path_error(self):
+        self.assertRaises(ValueError, get_transport_and_path, 'foo://bar/baz')
+
 
 class SSHGitClientTests(TestCase):
 

+ 671 - 0
dulwich/tests/test_diff_tree.py

@@ -0,0 +1,671 @@
+# test_diff_tree.py -- Tests for file and tree diff utilities.
+# Copyright (C) 2010 Google, Inc.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# or (at your option) a later version of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+# MA  02110-1301, USA.
+
+"""Tests for file and tree diff utilities."""
+
+from dulwich.diff_tree import (
+    CHANGE_MODIFY,
+    CHANGE_RENAME,
+    CHANGE_COPY,
+    CHANGE_UNCHANGED,
+    TreeChange,
+    _merge_entries,
+    _merge_entries_py,
+    tree_changes,
+    _count_blocks,
+    _count_blocks_py,
+    _similarity_score,
+    _tree_change_key,
+    RenameDetector,
+    _is_tree,
+    _is_tree_py
+    )
+from dulwich.index import (
+    commit_tree,
+    )
+from dulwich._compat import (
+    permutations,
+    )
+from dulwich.object_store import (
+    MemoryObjectStore,
+    )
+from dulwich.objects import (
+    ShaFile,
+    Blob,
+    TreeEntry,
+    Tree,
+    )
+from dulwich.tests import (
+    TestCase,
+    )
+from dulwich.tests.utils import (
+    make_object,
+    functest_builder,
+    ext_functest_builder,
+    )
+
+# Shorthand mode for Files.
+F = 0100644
+
+
+class DiffTestCase(TestCase):
+
+    def setUp(self):
+        super(DiffTestCase, self).setUp()
+        self.store = MemoryObjectStore()
+        self.empty_tree = self.commit_tree([])
+
+    def commit_tree(self, entries):
+        commit_blobs = []
+        for entry in entries:
+            if len(entry) == 2:
+                path, obj = entry
+                mode = F
+            else:
+                path, obj, mode = entry
+            if isinstance(obj, Blob):
+                self.store.add_object(obj)
+                sha = obj.id
+            else:
+                sha = obj
+            commit_blobs.append((path, sha, mode))
+        return self.store[commit_tree(self.store, commit_blobs)]
+
+
+class TreeChangesTest(DiffTestCase):
+
+    def assertMergeFails(self, merge_entries, name, mode, sha):
+        t = Tree()
+        t[name] = (mode, sha)
+        self.assertRaises(TypeError, merge_entries, '', t, t)
+
+    def _do_test_merge_entries(self, merge_entries):
+        blob_a1 = make_object(Blob, data='a1')
+        blob_a2 = make_object(Blob, data='a2')
+        blob_b1 = make_object(Blob, data='b1')
+        blob_c2 = make_object(Blob, data='c2')
+        tree1 = self.commit_tree([('a', blob_a1, 0100644),
+                                  ('b', blob_b1, 0100755)])
+        tree2 = self.commit_tree([('a', blob_a2, 0100644),
+                                  ('c', blob_c2, 0100755)])
+
+        self.assertEqual([], merge_entries('', self.empty_tree,
+                                           self.empty_tree))
+        self.assertEqual([
+          ((None, None, None), ('a', 0100644, blob_a1.id)),
+          ((None, None, None), ('b', 0100755, blob_b1.id)),
+          ], merge_entries('', self.empty_tree, tree1))
+        self.assertEqual([
+          ((None, None, None), ('x/a', 0100644, blob_a1.id)),
+          ((None, None, None), ('x/b', 0100755, blob_b1.id)),
+          ], merge_entries('x', self.empty_tree, tree1))
+
+        self.assertEqual([
+          (('a', 0100644, blob_a2.id), (None, None, None)),
+          (('c', 0100755, blob_c2.id), (None, None, None)),
+          ], merge_entries('', tree2, self.empty_tree))
+
+        self.assertEqual([
+          (('a', 0100644, blob_a1.id), ('a', 0100644, blob_a2.id)),
+          (('b', 0100755, blob_b1.id), (None, None, None)),
+          ((None, None, None), ('c', 0100755, blob_c2.id)),
+          ], merge_entries('', tree1, tree2))
+
+        self.assertEqual([
+          (('a', 0100644, blob_a2.id), ('a', 0100644, blob_a1.id)),
+          ((None, None, None), ('b', 0100755, blob_b1.id)),
+          (('c', 0100755, blob_c2.id), (None, None, None)),
+          ], merge_entries('', tree2, tree1))
+
+        self.assertMergeFails(merge_entries, 0xdeadbeef, 0100644, '1' * 40)
+        self.assertMergeFails(merge_entries, 'a', 'deadbeef', '1' * 40)
+        self.assertMergeFails(merge_entries, 'a', 0100644, 0xdeadbeef)
+
+    test_merge_entries = functest_builder(_do_test_merge_entries,
+                                          _merge_entries_py)
+    test_merge_entries_extension = ext_functest_builder(_do_test_merge_entries,
+                                                        _merge_entries)
+
+    def _do_test_is_tree(self, is_tree):
+        self.assertFalse(is_tree(TreeEntry(None, None, None)))
+        self.assertFalse(is_tree(TreeEntry('a', 0100644, 'a' * 40)))
+        self.assertFalse(is_tree(TreeEntry('a', 0100755, 'a' * 40)))
+        self.assertFalse(is_tree(TreeEntry('a', 0120000, 'a' * 40)))
+        self.assertTrue(is_tree(TreeEntry('a', 0040000, 'a' * 40)))
+        self.assertRaises(TypeError, is_tree, TreeEntry('a', 'x', 'a' * 40))
+        self.assertRaises(AttributeError, is_tree, 1234)
+
+    test_is_tree = functest_builder(_do_test_is_tree, _is_tree_py)
+    test_is_tree_extension = ext_functest_builder(_do_test_is_tree, _is_tree)
+
+    def assertChangesEqual(self, expected, tree1, tree2, **kwargs):
+        actual = list(tree_changes(self.store, tree1.id, tree2.id, **kwargs))
+        self.assertEqual(expected, actual)
+
+    # For brevity, the following tests use tuples instead of TreeEntry objects.
+
+    def test_tree_changes_empty(self):
+        self.assertChangesEqual([], self.empty_tree, self.empty_tree)
+
+    def test_tree_changes_no_changes(self):
+        blob = make_object(Blob, data='blob')
+        tree = self.commit_tree([('a', blob), ('b/c', blob)])
+        self.assertChangesEqual([], self.empty_tree, self.empty_tree)
+        self.assertChangesEqual([], tree, tree)
+        self.assertChangesEqual(
+          [TreeChange(CHANGE_UNCHANGED, ('a', F, blob.id), ('a', F, blob.id)),
+           TreeChange(CHANGE_UNCHANGED, ('b/c', F, blob.id),
+                      ('b/c', F, blob.id))],
+          tree, tree, want_unchanged=True)
+
+    def test_tree_changes_add_delete(self):
+        blob_a = make_object(Blob, data='a')
+        blob_b = make_object(Blob, data='b')
+        tree = self.commit_tree([('a', blob_a, 0100644),
+                                 ('x/b', blob_b, 0100755)])
+        self.assertChangesEqual(
+          [TreeChange.add(('a', 0100644, blob_a.id)),
+           TreeChange.add(('x/b', 0100755, blob_b.id))],
+          self.empty_tree, tree)
+        self.assertChangesEqual(
+          [TreeChange.delete(('a', 0100644, blob_a.id)),
+           TreeChange.delete(('x/b', 0100755, blob_b.id))],
+          tree, self.empty_tree)
+
+    def test_tree_changes_modify_contents(self):
+        blob_a1 = make_object(Blob, data='a1')
+        blob_a2 = make_object(Blob, data='a2')
+        tree1 = self.commit_tree([('a', blob_a1)])
+        tree2 = self.commit_tree([('a', blob_a2)])
+        self.assertChangesEqual(
+          [TreeChange(CHANGE_MODIFY, ('a', F, blob_a1.id),
+                      ('a', F, blob_a2.id))], tree1, tree2)
+
+    def test_tree_changes_modify_mode(self):
+        blob_a = make_object(Blob, data='a')
+        tree1 = self.commit_tree([('a', blob_a, 0100644)])
+        tree2 = self.commit_tree([('a', blob_a, 0100755)])
+        self.assertChangesEqual(
+          [TreeChange(CHANGE_MODIFY, ('a', 0100644, blob_a.id),
+                      ('a', 0100755, blob_a.id))], tree1, tree2)
+
+    def test_tree_changes_change_type(self):
+        blob_a1 = make_object(Blob, data='a')
+        blob_a2 = make_object(Blob, data='/foo/bar')
+        tree1 = self.commit_tree([('a', blob_a1, 0100644)])
+        tree2 = self.commit_tree([('a', blob_a2, 0120000)])
+        self.assertChangesEqual(
+          [TreeChange.delete(('a', 0100644, blob_a1.id)),
+           TreeChange.add(('a', 0120000, blob_a2.id))],
+          tree1, tree2)
+
+    def test_tree_changes_to_tree(self):
+        blob_a = make_object(Blob, data='a')
+        blob_x = make_object(Blob, data='x')
+        tree1 = self.commit_tree([('a', blob_a)])
+        tree2 = self.commit_tree([('a/x', blob_x)])
+        self.assertChangesEqual(
+          [TreeChange.delete(('a', F, blob_a.id)),
+           TreeChange.add(('a/x', F, blob_x.id))],
+          tree1, tree2)
+
+    def test_tree_changes_complex(self):
+        blob_a_1 = make_object(Blob, data='a1_1')
+        blob_bx1_1 = make_object(Blob, data='bx1_1')
+        blob_bx2_1 = make_object(Blob, data='bx2_1')
+        blob_by1_1 = make_object(Blob, data='by1_1')
+        blob_by2_1 = make_object(Blob, data='by2_1')
+        tree1 = self.commit_tree([
+          ('a', blob_a_1),
+          ('b/x/1', blob_bx1_1),
+          ('b/x/2', blob_bx2_1),
+          ('b/y/1', blob_by1_1),
+          ('b/y/2', blob_by2_1),
+          ])
+
+        blob_a_2 = make_object(Blob, data='a1_2')
+        blob_bx1_2 = blob_bx1_1
+        blob_by_2 = make_object(Blob, data='by_2')
+        blob_c_2 = make_object(Blob, data='c_2')
+        tree2 = self.commit_tree([
+          ('a', blob_a_2),
+          ('b/x/1', blob_bx1_2),
+          ('b/y', blob_by_2),
+          ('c', blob_c_2),
+          ])
+
+        self.assertChangesEqual(
+          [TreeChange(CHANGE_MODIFY, ('a', F, blob_a_1.id),
+                      ('a', F, blob_a_2.id)),
+           TreeChange.delete(('b/x/2', F, blob_bx2_1.id)),
+           TreeChange.add(('b/y', F, blob_by_2.id)),
+           TreeChange.delete(('b/y/1', F, blob_by1_1.id)),
+           TreeChange.delete(('b/y/2', F, blob_by2_1.id)),
+           TreeChange.add(('c', F, blob_c_2.id))],
+          tree1, tree2)
+
+    def test_tree_changes_name_order(self):
+        blob = make_object(Blob, data='a')
+        tree1 = self.commit_tree([('a', blob), ('a.', blob), ('a..', blob)])
+        # Tree order is the reverse of this, so if we used tree order, 'a..'
+        # would not be merged.
+        tree2 = self.commit_tree([('a/x', blob), ('a./x', blob), ('a..', blob)])
+
+        self.assertChangesEqual(
+          [TreeChange.delete(('a', F, blob.id)),
+           TreeChange.add(('a/x', F, blob.id)),
+           TreeChange.delete(('a.', F, blob.id)),
+           TreeChange.add(('a./x', F, blob.id))],
+          tree1, tree2)
+
+    def test_tree_changes_prune(self):
+        blob_a1 = make_object(Blob, data='a1')
+        blob_a2 = make_object(Blob, data='a2')
+        blob_x = make_object(Blob, data='x')
+        tree1 = self.commit_tree([('a', blob_a1), ('b/x', blob_x)])
+        tree2 = self.commit_tree([('a', blob_a2), ('b/x', blob_x)])
+        # Remove identical items so lookups will fail unless we prune.
+        subtree = self.store[tree1['b'][1]]
+        for entry in subtree.iteritems():
+            del self.store[entry.sha]
+        del self.store[subtree.id]
+
+        self.assertChangesEqual(
+          [TreeChange(CHANGE_MODIFY, ('a', F, blob_a1.id),
+                      ('a', F, blob_a2.id))],
+          tree1, tree2)
+
+
+class RenameDetectionTest(DiffTestCase):
+
+    def _do_test_count_blocks(self, count_blocks):
+        blob = make_object(Blob, data='a\nb\na\n')
+        self.assertEqual({hash('a\n'): 4, hash('b\n'): 2}, count_blocks(blob))
+
+    test_count_blocks = functest_builder(_do_test_count_blocks,
+                                         _count_blocks_py)
+    test_count_blocks_extension = ext_functest_builder(_do_test_count_blocks,
+                                                       _count_blocks)
+
+    def _do_test_count_blocks_no_newline(self, count_blocks):
+        blob = make_object(Blob, data='a\na')  # final line lacks a newline
+        self.assertEqual({hash('a\n'): 2, hash('a'): 1}, count_blocks(blob))
+
+    test_count_blocks_no_newline = functest_builder(
+      _do_test_count_blocks_no_newline, _count_blocks_py)
+    test_count_blocks_no_newline_extension = ext_functest_builder(
+       _do_test_count_blocks_no_newline, _count_blocks)
+
+    def _do_test_count_blocks_chunks(self, count_blocks):  # impl under test
+        blob = ShaFile.from_raw_chunks(Blob.type_num, ['a\nb', '\na\n'])
+        self.assertEqual({hash('a\n'): 4, hash('b\n'): 2}, count_blocks(blob))
+
+    test_count_blocks_chunks = functest_builder(_do_test_count_blocks_chunks,
+                                                _count_blocks_py)
+    test_count_blocks_chunks_extension = ext_functest_builder(
+      _do_test_count_blocks_chunks, _count_blocks)
+
+    def _do_test_count_blocks_long_lines(self, count_blocks):
+        a = 'a' * 64  # expected below: counted as its own 64-byte block
+        data = a + 'xxx\ny\n' + a + 'zzz\n'
+        blob = make_object(Blob, data=data)
+        self.assertEqual({hash('a' * 64): 128, hash('xxx\n'): 4, hash('y\n'): 2,
+                          hash('zzz\n'): 4},
+                         count_blocks(blob))
+
+    test_count_blocks_long_lines = functest_builder(
+      _do_test_count_blocks_long_lines, _count_blocks_py)
+    test_count_blocks_long_lines_extension = ext_functest_builder(
+      _do_test_count_blocks_long_lines, _count_blocks)
+
+    def assertSimilar(self, expected_score, blob1, blob2):
+        self.assertEqual(expected_score, _similarity_score(blob1, blob2))
+        self.assertEqual(expected_score, _similarity_score(blob2, blob1))
+
+    def test_similarity_score(self):
+        blob0 = make_object(Blob, data='')
+        blob1 = make_object(Blob, data='ab\ncd\ncd\n')
+        blob2 = make_object(Blob, data='ab\n')
+        blob3 = make_object(Blob, data='cd\n')
+        blob4 = make_object(Blob, data='cd\ncd\n')
+
+        self.assertSimilar(100, blob0, blob0)
+        self.assertSimilar(0, blob0, blob1)
+        self.assertSimilar(33, blob1, blob2)
+        self.assertSimilar(33, blob1, blob3)
+        self.assertSimilar(66, blob1, blob4)
+        self.assertSimilar(0, blob2, blob3)
+        self.assertSimilar(50, blob3, blob4)
+
+    def test_similarity_score_cache(self):
+        blob1 = make_object(Blob, data='ab\ncd\n')
+        blob2 = make_object(Blob, data='ab\n')
+
+        block_cache = {}
+        self.assertEqual(
+          50, _similarity_score(blob1, blob2, block_cache=block_cache))
+        self.assertEqual(set([blob1.id, blob2.id]), set(block_cache))
+
+        def fail_chunks():
+            self.fail('Unexpected call to as_raw_chunks()')
+
+        blob1.as_raw_chunks = blob2.as_raw_chunks = fail_chunks
+        blob1.raw_length = lambda: 6
+        blob2.raw_length = lambda: 3
+        self.assertEqual(
+          50, _similarity_score(blob1, blob2, block_cache=block_cache))
+
+    def test_tree_entry_sort(self):
+        sha = 'abcd' * 10
+        expected_entries = [
+          TreeChange.add(TreeEntry('aaa', F, sha)),
+          TreeChange(CHANGE_COPY, TreeEntry('bbb', F, sha),
+                     TreeEntry('aab', F, sha)),
+          TreeChange(CHANGE_MODIFY, TreeEntry('bbb', F, sha),
+                     TreeEntry('bbb', F, 'dabc' * 10)),
+          TreeChange(CHANGE_RENAME, TreeEntry('bbc', F, sha),
+                     TreeEntry('ddd', F, sha)),
+          TreeChange.delete(TreeEntry('ccc', F, sha)),
+          ]
+
+        for perm in permutations(expected_entries):
+            self.assertEqual(expected_entries,
+                             sorted(perm, key=_tree_change_key))
+
+    def detect_renames(self, tree1, tree2, **kwargs):
+        detector = RenameDetector(self.store, tree1.id, tree2.id, **kwargs)
+        return detector.changes_with_renames()
+
+    def test_no_renames(self):
+        blob1 = make_object(Blob, data='a\nb\nc\nd\n')
+        blob2 = make_object(Blob, data='a\nb\ne\nf\n')
+        blob3 = make_object(Blob, data='a\nb\ng\nh\n')
+        tree1 = self.commit_tree([('a', blob1), ('b', blob2)])
+        tree2 = self.commit_tree([('a', blob1), ('b', blob3)])
+        self.assertEqual(
+          [TreeChange(CHANGE_MODIFY, ('b', F, blob2.id), ('b', F, blob3.id))],
+          self.detect_renames(tree1, tree2))
+
+    def test_exact_rename_one_to_one(self):
+        blob1 = make_object(Blob, data='1')
+        blob2 = make_object(Blob, data='2')
+        tree1 = self.commit_tree([('a', blob1), ('b', blob2)])
+        tree2 = self.commit_tree([('c', blob1), ('d', blob2)])
+        self.assertEqual(
+          [TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('c', F, blob1.id)),
+           TreeChange(CHANGE_RENAME, ('b', F, blob2.id), ('d', F, blob2.id))],
+          self.detect_renames(tree1, tree2))
+
+    def test_exact_rename_split_different_type(self):
+        blob = make_object(Blob, data='/foo')
+        tree1 = self.commit_tree([('a', blob, 0100644)])
+        tree2 = self.commit_tree([('a', blob, 0120000)])
+        self.assertEqual(
+          [TreeChange.add(('a', 0120000, blob.id)),
+           TreeChange.delete(('a', 0100644, blob.id))],
+          self.detect_renames(tree1, tree2))
+
+    def test_exact_rename_and_different_type(self):
+        blob1 = make_object(Blob, data='1')
+        blob2 = make_object(Blob, data='2')
+        tree1 = self.commit_tree([('a', blob1)])
+        tree2 = self.commit_tree([('a', blob2, 0120000), ('b', blob1)])
+        self.assertEqual(
+          [TreeChange.add(('a', 0120000, blob2.id)),
+           TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('b', F, blob1.id))],
+          self.detect_renames(tree1, tree2))
+
+    def test_exact_rename_one_to_many(self):
+        blob = make_object(Blob, data='1')
+        tree1 = self.commit_tree([('a', blob)])
+        tree2 = self.commit_tree([('b', blob), ('c', blob)])
+        self.assertEqual(
+          [TreeChange(CHANGE_RENAME, ('a', F, blob.id), ('b', F, blob.id)),
+           TreeChange(CHANGE_COPY, ('a', F, blob.id), ('c', F, blob.id))],
+          self.detect_renames(tree1, tree2))
+
+    def test_exact_rename_many_to_one(self):
+        blob = make_object(Blob, data='1')
+        tree1 = self.commit_tree([('a', blob), ('b', blob)])
+        tree2 = self.commit_tree([('c', blob)])
+        self.assertEqual(
+          [TreeChange(CHANGE_RENAME, ('a', F, blob.id), ('c', F, blob.id)),
+           TreeChange.delete(('b', F, blob.id))],
+          self.detect_renames(tree1, tree2))
+
+    def test_exact_rename_many_to_many(self):
+        blob = make_object(Blob, data='1')
+        tree1 = self.commit_tree([('a', blob), ('b', blob)])
+        tree2 = self.commit_tree([('c', blob), ('d', blob), ('e', blob)])
+        self.assertEqual(
+          [TreeChange(CHANGE_RENAME, ('a', F, blob.id), ('c', F, blob.id)),
+           TreeChange(CHANGE_COPY, ('a', F, blob.id), ('e', F, blob.id)),
+           TreeChange(CHANGE_RENAME, ('b', F, blob.id), ('d', F, blob.id))],
+          self.detect_renames(tree1, tree2))
+
+    def test_rename_threshold(self):
+        blob1 = make_object(Blob, data='a\nb\nc\n')
+        blob2 = make_object(Blob, data='a\nb\nd\n')
+        tree1 = self.commit_tree([('a', blob1)])
+        tree2 = self.commit_tree([('b', blob2)])
+        self.assertEqual(
+          [TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('b', F, blob2.id))],
+          self.detect_renames(tree1, tree2, rename_threshold=50))
+        self.assertEqual(
+          [TreeChange.delete(('a', F, blob1.id)),
+           TreeChange.add(('b', F, blob2.id))],
+          self.detect_renames(tree1, tree2, rename_threshold=75))
+
+    def test_content_rename_max_files(self):
+        blob1 = make_object(Blob, data='a\nb\nc\nd')
+        blob4 = make_object(Blob, data='a\nb\nc\ne\n')
+        blob2 = make_object(Blob, data='e\nf\ng\nh\n')
+        blob3 = make_object(Blob, data='e\nf\ng\ni\n')
+        tree1 = self.commit_tree([('a', blob1), ('b', blob2)])
+        tree2 = self.commit_tree([('c', blob3), ('d', blob4)])
+        self.assertEqual(
+          [TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('d', F, blob4.id)),
+           TreeChange(CHANGE_RENAME, ('b', F, blob2.id), ('c', F, blob3.id))],
+          self.detect_renames(tree1, tree2))
+        self.assertEqual(
+          [TreeChange.delete(('a', F, blob1.id)),
+           TreeChange.delete(('b', F, blob2.id)),
+           TreeChange.add(('c', F, blob3.id)),
+           TreeChange.add(('d', F, blob4.id))],
+          self.detect_renames(tree1, tree2, max_files=1))
+
+    def test_content_rename_one_to_one(self):
+        b11 = make_object(Blob, data='a\nb\nc\nd\n')
+        b12 = make_object(Blob, data='a\nb\nc\ne\n')
+        b21 = make_object(Blob, data='e\nf\ng\n\\h')  # ends in backslash + 'h'
+        b22 = make_object(Blob, data='e\nf\ng\n\\i')  # ends in backslash + 'i'
+        tree1 = self.commit_tree([('a', b11), ('b', b21)])
+        tree2 = self.commit_tree([('c', b12), ('d', b22)])
+        self.assertEqual(
+          [TreeChange(CHANGE_RENAME, ('a', F, b11.id), ('c', F, b12.id)),
+           TreeChange(CHANGE_RENAME, ('b', F, b21.id), ('d', F, b22.id))],
+          self.detect_renames(tree1, tree2))
+
+    def test_content_rename_one_to_one_ordering(self):
+        blob1 = make_object(Blob, data='a\nb\nc\nd\ne\nf\n')
+        blob2 = make_object(Blob, data='a\nb\nc\nd\ng\nh\n')
+        # blob3 shares 8/12 bytes with blob1 and 10/12 bytes with blob2
+        blob3 = make_object(Blob, data='a\nb\nc\nd\ng\ni\n')
+        tree1 = self.commit_tree([('a', blob1), ('b', blob2)])
+        tree2 = self.commit_tree([('c', blob3)])
+        self.assertEqual(
+          [TreeChange.delete(('a', F, blob1.id)),
+           TreeChange(CHANGE_RENAME, ('b', F, blob2.id), ('c', F, blob3.id))],
+          self.detect_renames(tree1, tree2))
+
+        tree3 = self.commit_tree([('a', blob2), ('b', blob1)])
+        tree4 = self.commit_tree([('c', blob3)])
+        self.assertEqual(
+          [TreeChange(CHANGE_RENAME, ('a', F, blob2.id), ('c', F, blob3.id)),
+           TreeChange.delete(('b', F, blob1.id))],
+          self.detect_renames(tree3, tree4))
+
+    def test_content_rename_one_to_many(self):
+        blob1 = make_object(Blob, data='aa\nb\nc\nd\ne\n')
+        blob2 = make_object(Blob, data='ab\nb\nc\nd\ne\n')  # 8/11 match
+        blob3 = make_object(Blob, data='aa\nb\nc\nd\nf\n')  # 9/11 match
+        tree1 = self.commit_tree([('a', blob1)])
+        tree2 = self.commit_tree([('b', blob2), ('c', blob3)])
+        self.assertEqual(
+          [TreeChange(CHANGE_COPY, ('a', F, blob1.id), ('b', F, blob2.id)),
+           TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('c', F, blob3.id))],
+          self.detect_renames(tree1, tree2))
+
+    def test_content_rename_many_to_one(self):
+        blob1 = make_object(Blob, data='a\nb\nc\nd\n')
+        blob2 = make_object(Blob, data='a\nb\nc\ne\n')
+        blob3 = make_object(Blob, data='a\nb\nc\nf\n')
+        tree1 = self.commit_tree([('a', blob1), ('b', blob2)])
+        tree2 = self.commit_tree([('c', blob3)])
+        self.assertEqual(
+          [TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('c', F, blob3.id)),
+           TreeChange.delete(('b', F, blob2.id))],
+          self.detect_renames(tree1, tree2))
+
+    def test_content_rename_many_to_many(self):
+        blob1 = make_object(Blob, data='a\nb\nc\nd\n')
+        blob2 = make_object(Blob, data='a\nb\nc\ne\n')
+        blob3 = make_object(Blob, data='a\nb\nc\nf\n')
+        blob4 = make_object(Blob, data='a\nb\nc\ng\n')
+        tree1 = self.commit_tree([('a', blob1), ('b', blob2)])
+        tree2 = self.commit_tree([('c', blob3), ('d', blob4)])
+        # TODO(dborowitz): Distribute renames rather than greedily choosing
+        # copies.
+        self.assertEqual(
+          [TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('c', F, blob3.id)),
+           TreeChange(CHANGE_COPY, ('a', F, blob1.id), ('d', F, blob4.id)),
+           TreeChange.delete(('b', F, blob2.id))],
+          self.detect_renames(tree1, tree2))
+
+    def test_content_rename_gitlink(self):
+        blob1 = make_object(Blob, data='blob1')
+        blob2 = make_object(Blob, data='blob2')
+        link1 = '1' * 40
+        link2 = '2' * 40
+        tree1 = self.commit_tree([('a', blob1), ('b', link1, 0160000)])
+        tree2 = self.commit_tree([('c', blob2), ('d', link2, 0160000)])
+        self.assertEqual(
+          [TreeChange.delete(('a', 0100644, blob1.id)),
+           TreeChange.delete(('b', 0160000, link1)),
+           TreeChange.add(('c', 0100644, blob2.id)),
+           TreeChange.add(('d', 0160000, link2))],
+          self.detect_renames(tree1, tree2))
+
+    def test_exact_rename_swap(self):
+        blob1 = make_object(Blob, data='1')
+        blob2 = make_object(Blob, data='2')
+        tree1 = self.commit_tree([('a', blob1), ('b', blob2)])
+        tree2 = self.commit_tree([('a', blob2), ('b', blob1)])
+        self.assertEqual(
+          [TreeChange(CHANGE_MODIFY, ('a', F, blob1.id), ('a', F, blob2.id)),
+           TreeChange(CHANGE_MODIFY, ('b', F, blob2.id), ('b', F, blob1.id))],
+          self.detect_renames(tree1, tree2))
+        self.assertEqual(
+          [TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('b', F, blob1.id)),
+           TreeChange(CHANGE_RENAME, ('b', F, blob2.id), ('a', F, blob2.id))],
+          self.detect_renames(tree1, tree2, rewrite_threshold=50))
+
+    def test_content_rename_swap(self):
+        blob1 = make_object(Blob, data='a\nb\nc\nd\n')
+        blob2 = make_object(Blob, data='e\nf\ng\nh\n')
+        blob3 = make_object(Blob, data='a\nb\nc\ne\n')
+        blob4 = make_object(Blob, data='e\nf\ng\ni\n')
+        tree1 = self.commit_tree([('a', blob1), ('b', blob2)])
+        tree2 = self.commit_tree([('a', blob4), ('b', blob3)])
+        self.assertEqual(
+          [TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('b', F, blob3.id)),
+           TreeChange(CHANGE_RENAME, ('b', F, blob2.id), ('a', F, blob4.id))],
+          self.detect_renames(tree1, tree2, rewrite_threshold=60))
+
+    def test_rewrite_threshold(self):
+        blob1 = make_object(Blob, data='a\nb\nc\nd\n')
+        blob2 = make_object(Blob, data='a\nb\nc\ne\n')
+        blob3 = make_object(Blob, data='a\nb\nf\ng\n')
+
+        tree1 = self.commit_tree([('a', blob1)])
+        tree2 = self.commit_tree([('a', blob3), ('b', blob2)])
+
+        no_renames = [
+          TreeChange(CHANGE_MODIFY, ('a', F, blob1.id), ('a', F, blob3.id)),
+          TreeChange.add(('b', F, blob2.id))]
+        self.assertEqual(
+          no_renames, self.detect_renames(tree1, tree2))
+        self.assertEqual(
+          no_renames, self.detect_renames(tree1, tree2, rewrite_threshold=40))
+        self.assertEqual(
+          [TreeChange.add(('a', F, blob3.id)),
+           TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('b', F, blob2.id))],
+          self.detect_renames(tree1, tree2, rewrite_threshold=80))
+
+    def test_find_copies_harder_exact(self):
+        blob = make_object(Blob, data='blob')
+        tree1 = self.commit_tree([('a', blob)])
+        tree2 = self.commit_tree([('a', blob), ('b', blob)])
+        self.assertEqual([TreeChange.add(('b', F, blob.id))],
+                         self.detect_renames(tree1, tree2))
+        self.assertEqual(
+          [TreeChange(CHANGE_COPY, ('a', F, blob.id), ('b', F, blob.id))],
+          self.detect_renames(tree1, tree2, find_copies_harder=True))
+
+    def test_find_copies_harder_content(self):
+        blob1 = make_object(Blob, data='a\nb\nc\nd\n')
+        blob2 = make_object(Blob, data='a\nb\nc\ne\n')
+        tree1 = self.commit_tree([('a', blob1)])
+        tree2 = self.commit_tree([('a', blob1), ('b', blob2)])
+        self.assertEqual([TreeChange.add(('b', F, blob2.id))],
+                         self.detect_renames(tree1, tree2))
+        self.assertEqual(
+          [TreeChange(CHANGE_COPY, ('a', F, blob1.id), ('b', F, blob2.id))],
+          self.detect_renames(tree1, tree2, find_copies_harder=True))
+
+    def test_find_copies_harder_modify(self):
+        blob1 = make_object(Blob, data='a\nb\nc\nd\n')
+        blob2 = make_object(Blob, data='a\nb\nc\ne\n')
+        tree1 = self.commit_tree([('a', blob1)])
+        tree2 = self.commit_tree([('a', blob2), ('b', blob2)])
+        self.assertEqual(
+          [TreeChange(CHANGE_MODIFY, ('a', F, blob1.id), ('a', F, blob2.id)),
+           TreeChange.add(('b', F, blob2.id))],
+          self.detect_renames(tree1, tree2))
+        self.assertEqual(
+          [TreeChange(CHANGE_MODIFY, ('a', F, blob1.id), ('a', F, blob2.id)),
+           TreeChange(CHANGE_COPY, ('a', F, blob1.id), ('b', F, blob2.id))],
+          self.detect_renames(tree1, tree2, find_copies_harder=True))
+
+    def test_find_copies_harder_with_rewrites(self):
+        blob_a1 = make_object(Blob, data='a\nb\nc\nd\n')
+        blob_a2 = make_object(Blob, data='f\ng\nh\ni\n')
+        blob_b2 = make_object(Blob, data='a\nb\nc\ne\n')
+        tree1 = self.commit_tree([('a', blob_a1)])
+        tree2 = self.commit_tree([('a', blob_a2), ('b', blob_b2)])
+        self.assertEqual(
+          [TreeChange(CHANGE_MODIFY, ('a', F, blob_a1.id),
+                      ('a', F, blob_a2.id)),
+           TreeChange(CHANGE_COPY, ('a', F, blob_a1.id), ('b', F, blob_b2.id))],
+          self.detect_renames(tree1, tree2, find_copies_harder=True))
+        self.assertEqual(
+          [TreeChange.add(('a', F, blob_a2.id)),
+           TreeChange(CHANGE_RENAME, ('a', F, blob_a1.id),
+                      ('b', F, blob_b2.id))],
+          self.detect_renames(tree1, tree2, rewrite_threshold=50,
+                              find_copies_harder=True))

+ 69 - 0
dulwich/tests/test_fastexport.py

@@ -131,3 +131,72 @@ M 100644 :1 a
         self.assertEquals(2, len(markers))
         self.assertTrue(isinstance(self.repo[markers["1"]], Blob))
         self.assertTrue(isinstance(self.repo[markers["2"]], Commit))
+
+    def test_file_add(self):
+        from fastimport import commands
+        cmd = commands.BlobCommand("23", "data")
+        self.processor.blob_handler(cmd)
+        cmd = commands.CommitCommand("refs/heads/foo", "mrkr",
+            ("Jelmer", "jelmer@samba.org", 432432432.0, 3600),
+            ("Jelmer", "jelmer@samba.org", 432432432.0, 3600),
+            "FOO", None, [], [commands.FileModifyCommand("path", 0100644, ":23", None)])
+        self.processor.commit_handler(cmd)
+        commit = self.repo[self.processor.last_commit]
+        self.assertEquals([
+            ('path', 0100644, '6320cd248dd8aeaab759d5871f8781b5c0505172')],
+            self.repo[commit.tree].items())
+
+    def simple_commit(self):
+        from fastimport import commands
+        cmd = commands.BlobCommand("23", "data")
+        self.processor.blob_handler(cmd)
+        cmd = commands.CommitCommand("refs/heads/foo", "mrkr",
+            ("Jelmer", "jelmer@samba.org", 432432432.0, 3600),
+            ("Jelmer", "jelmer@samba.org", 432432432.0, 3600),
+            "FOO", None, [], [commands.FileModifyCommand("path", 0100644, ":23", None)])
+        self.processor.commit_handler(cmd)
+        commit = self.repo[self.processor.last_commit]
+        return commit
+
+    def make_file_commit(self, file_cmds):
+        """Create a trivial commit with the specified file commands.
+
+        :param file_cmds: File commands to run.
+        :return: The created commit object
+        """
+        from fastimport import commands
+        cmd = commands.CommitCommand("refs/heads/foo", "mrkr",
+            ("Jelmer", "jelmer@samba.org", 432432432.0, 3600),
+            ("Jelmer", "jelmer@samba.org", 432432432.0, 3600),
+            "FOO", None, [], file_cmds)
+        self.processor.commit_handler(cmd)
+        return self.repo[self.processor.last_commit]
+
+    def test_file_copy(self):
+        from fastimport import commands
+        self.simple_commit()
+        commit = self.make_file_commit([commands.FileCopyCommand("path", "new_path")])
+        self.assertEquals([
+            ('new_path', 0100644, '6320cd248dd8aeaab759d5871f8781b5c0505172'),
+            ('path', 0100644, '6320cd248dd8aeaab759d5871f8781b5c0505172'),
+            ], self.repo[commit.tree].items())
+
+    def test_file_move(self):
+        from fastimport import commands
+        self.simple_commit()
+        commit = self.make_file_commit([commands.FileRenameCommand("path", "new_path")])
+        self.assertEquals([
+            ('new_path', 0100644, '6320cd248dd8aeaab759d5871f8781b5c0505172'),
+            ], self.repo[commit.tree].items())
+
+    def test_file_delete(self):
+        from fastimport import commands
+        self.simple_commit()
+        commit = self.make_file_commit([commands.FileDeleteCommand("path")])
+        self.assertEquals([], self.repo[commit.tree].items())
+
+    def test_file_deleteall(self):
+        from fastimport import commands
+        self.simple_commit()
+        commit = self.make_file_commit([commands.FileDeleteAllCommand()])
+        self.assertEquals([], self.repo[commit.tree].items())

+ 32 - 8
dulwich/tests/test_object_store.py

@@ -32,9 +32,9 @@ from dulwich.errors import (
 from dulwich.objects import (
     object_class,
     Blob,
-    ShaFile,
     Tag,
     Tree,
+    TreeEntry,
     )
 from dulwich.object_store import (
     DiskObjectStore,
@@ -89,6 +89,25 @@ class ObjectStoreTests(object):
         r = self.store[testobject.id]
         self.assertEquals(r, testobject)
 
+    def test_tree_changes(self):
+        blob_a1 = make_object(Blob, data='a1')
+        blob_a2 = make_object(Blob, data='a2')
+        blob_b = make_object(Blob, data='b')
+        for blob in [blob_a1, blob_a2, blob_b]:
+            self.store.add_object(blob)
+
+        blobs_1 = [('a', blob_a1.id, 0100644), ('b', blob_b.id, 0100644)]
+        tree1_id = commit_tree(self.store, blobs_1)
+        blobs_2 = [('a', blob_a2.id, 0100644), ('b', blob_b.id, 0100644)]
+        tree2_id = commit_tree(self.store, blobs_2)
+        change_a = (('a', 'a'), (0100644, 0100644), (blob_a1.id, blob_a2.id))
+        self.assertEquals([change_a],
+                          list(self.store.tree_changes(tree1_id, tree2_id)))
+        self.assertEquals(
+          [change_a, (('b', 'b'), (0100644, 0100644), (blob_b.id, blob_b.id))],
+          list(self.store.tree_changes(tree1_id, tree2_id,
+                                       want_unchanged=True)))
+
     def test_iter_tree_contents(self):
         blob_a = make_object(Blob, data='a')
         blob_b = make_object(Blob, data='b')
@@ -104,7 +123,7 @@ class ObjectStoreTests(object):
           ('c', blob_c.id, 0100644),
           ]
         tree_id = commit_tree(self.store, blobs)
-        self.assertEquals([(p, m, h) for (p, h, m) in blobs],
+        self.assertEquals([TreeEntry(p, m, h) for (p, h, m) in blobs],
                           list(self.store.iter_tree_contents(tree_id)))
 
     def test_iter_tree_contents_include_trees(self):
@@ -125,12 +144,12 @@ class ObjectStoreTests(object):
         tree_bd = self.store[tree_ad['bd'][1]]
 
         expected = [
-          ('', 0040000, tree_id),
-          ('a', 0100644, blob_a.id),
-          ('ad', 0040000, tree_ad.id),
-          ('ad/b', 0100644, blob_b.id),
-          ('ad/bd', 0040000, tree_bd.id),
-          ('ad/bd/c', 0100755, blob_c.id),
+          TreeEntry('', 0040000, tree_id),
+          TreeEntry('a', 0100644, blob_a.id),
+          TreeEntry('ad', 0040000, tree_ad.id),
+          TreeEntry('ad/b', 0100644, blob_b.id),
+          TreeEntry('ad/bd', 0040000, tree_bd.id),
+          TreeEntry('ad/bd/c', 0100755, blob_c.id),
           ]
         actual = self.store.iter_tree_contents(tree_id, include_trees=True)
         self.assertEquals(expected, list(actual))
@@ -161,6 +180,10 @@ class MemoryObjectStoreTests(ObjectStoreTests, TestCase):
 
 class PackBasedObjectStoreTests(ObjectStoreTests):
 
+    def tearDown(self):
+        for pack in self.store.packs:
+            pack.close()
+
     def test_empty_packs(self):
         self.assertEquals([], self.store.packs)
 
@@ -184,6 +207,7 @@ class DiskObjectStoreTests(PackBasedObjectStoreTests, TestCase):
 
     def tearDown(self):
         TestCase.tearDown(self)
+        PackBasedObjectStoreTests.tearDown(self)
         shutil.rmtree(self.store_dir)
 
     def test_pack_dir(self):

+ 51 - 62
dulwich/tests/test_objects.py

@@ -30,6 +30,9 @@ import stat
 from dulwich.errors import (
     ObjectFormatException,
     )
+from dulwich._compat import (
+    permutations,
+    )
 from dulwich.objects import (
     Blob,
     Tree,
@@ -50,11 +53,12 @@ from dulwich.objects import (
     )
 from dulwich.tests import (
     TestCase,
-    TestSkipped,
     )
 from utils import (
     make_commit,
     make_object,
+    functest_builder,
+    ext_functest_builder,
     )
 
 a_sha = '6f670c0fb53f9463760b7295fbb814e965fb20c8'
@@ -64,40 +68,6 @@ tree_sha = '70c190eb48fa8bbb50ddc692a17b44cb781af7f6'
 tag_sha = '71033db03a03c6a36721efcf1968dd8f8e0cf023'
 
 
-try:
-    from itertools import permutations
-except ImportError:
-    # Implementation of permutations from Python 2.6 documentation:
-    # http://docs.python.org/2.6/library/itertools.html#itertools.permutations
-    # Copyright (c) 2001-2010 Python Software Foundation; All Rights Reserved
-    # Modified syntax slightly to run under Python 2.4.
-    def permutations(iterable, r=None):
-        # permutations('ABCD', 2) --> AB AC AD BA BC BD CA CB CD DA DB DC
-        # permutations(range(3)) --> 012 021 102 120 201 210
-        pool = tuple(iterable)
-        n = len(pool)
-        if r is None:
-            r = n
-        if r > n:
-            return
-        indices = range(n)
-        cycles = range(n, n-r, -1)
-        yield tuple(pool[i] for i in indices[:r])
-        while n:
-            for i in reversed(range(r)):
-                cycles[i] -= 1
-                if cycles[i] == 0:
-                    indices[i:] = indices[i+1:] + indices[i:i+1]
-                    cycles[i] = n - i
-                else:
-                    j = cycles[i]
-                    indices[i], indices[-j] = indices[-j], indices[i]
-                    yield tuple(pool[i] for i in indices[:r])
-                    break
-            else:
-                return
-
-
 class TestHexToSha(TestCase):
 
     def test_simple(self):
@@ -117,21 +87,21 @@ class BlobReadTests(TestCase):
     def get_blob(self, sha):
         """Return the blob named sha from the test data dir"""
         return self.get_sha_file(Blob, 'blobs', sha)
-  
+
     def get_tree(self, sha):
         return self.get_sha_file(Tree, 'trees', sha)
-  
+
     def get_tag(self, sha):
         return self.get_sha_file(Tag, 'tags', sha)
-  
+
     def commit(self, sha):
         return self.get_sha_file(Commit, 'commits', sha)
-  
+
     def test_decompress_simple_blob(self):
         b = self.get_blob(a_sha)
         self.assertEqual(b.data, 'test 1\n')
         self.assertEqual(b.sha().hexdigest(), a_sha)
-  
+
     def test_hash(self):
         b = self.get_blob(a_sha)
         self.assertEqual(hash(b.id), hash(b))
@@ -142,7 +112,7 @@ class BlobReadTests(TestCase):
         self.assertEqual(b.data, '')
         self.assertEqual(b.id, sha)
         self.assertEqual(b.sha().hexdigest(), sha)
-  
+
     def test_create_blob_from_string(self):
         string = 'test 2\n'
         b = Blob.from_string(string)
@@ -166,23 +136,23 @@ class BlobReadTests(TestCase):
         self.assertEqual('test 5\n', b.data)
         b.chunked = ['te', 'st', ' 6\n']
         self.assertEqual('test 6\n', b.as_raw_string())
-  
+
     def test_parse_legacy_blob(self):
         string = 'test 3\n'
         b = self.get_blob(c_sha)
         self.assertEqual(b.data, string)
         self.assertEqual(b.sha().hexdigest(), c_sha)
-  
+
     def test_eq(self):
         blob1 = self.get_blob(a_sha)
         blob2 = self.get_blob(a_sha)
         self.assertEqual(blob1, blob2)
-  
+
     def test_read_tree_from_file(self):
         t = self.get_tree(tree_sha)
         self.assertEqual(t.entries()[0], (33188, 'a', a_sha))
         self.assertEqual(t.entries()[1], (33188, 'b', b_sha))
-  
+
     def test_read_tag_from_file(self):
         t = self.get_tag(tag_sha)
         self.assertEqual(t.object, (Commit, '51b668fd5bf7061b7d6fa525f88803e6cfadaa51'))
@@ -190,7 +160,7 @@ class BlobReadTests(TestCase):
         self.assertEqual(t.tagger,'Ali Sabil <ali.sabil@gmail.com>')
         self.assertEqual(t.tag_time, 1231203091)
         self.assertEqual(t.message, 'This is a signed tag\n-----BEGIN PGP SIGNATURE-----\nVersion: GnuPG v1.4.9 (GNU/Linux)\n\niEYEABECAAYFAkliqx8ACgkQqSMmLy9u/kcx5ACfakZ9NnPl02tOyYP6pkBoEkU1\n5EcAn0UFgokaSvS371Ym/4W9iJj6vh3h\n=ql7y\n-----END PGP SIGNATURE-----\n')
-  
+
     def test_read_commit_from_file(self):
         sha = '60dacdc733de308bb77bb76ce0fb0f9b44c9769e'
         c = self.commit(sha)
@@ -205,7 +175,7 @@ class BlobReadTests(TestCase):
         self.assertEqual(c.commit_timezone, 0)
         self.assertEqual(c.author_timezone, 0)
         self.assertEqual(c.message, 'Test commit\n')
-  
+
     def test_read_commit_no_parents(self):
         sha = '0d89f20333fbb1d2f3a94da77f4981373d8f4310'
         c = self.commit(sha)
@@ -219,7 +189,7 @@ class BlobReadTests(TestCase):
         self.assertEqual(c.commit_timezone, 0)
         self.assertEqual(c.author_timezone, 0)
         self.assertEqual(c.message, 'Test commit\n')
-  
+
     def test_read_commit_two_parents(self):
         sha = '5dac377bdded4c9aeb8dff595f0faeebcc8498cc'
         c = self.commit(sha)
@@ -465,18 +435,24 @@ class TreeTests(ShaFileCheckTests):
         o = Tree.from_path(hex_to_filename(dir, tree_sha))
         self.assertEquals([('a', 0100644, a_sha), ('b', 0100644, b_sha)],
                           list(parse_tree(o.as_raw_string())))
+        # test a broken tree that has a leading 0 on the file mode
+        broken_tree = '0100644 foo\0' + hex_to_sha(a_sha)
 
-    def test_parse_tree(self):
-        self._do_test_parse_tree(_parse_tree_py)
+        def eval_parse_tree(*args, **kwargs):
+            return list(parse_tree(*args, **kwargs))
 
-    def test_parse_tree_extension(self):
-        if parse_tree is _parse_tree_py:
-            raise TestSkipped('parse_tree extension not found')
-        self._do_test_parse_tree(parse_tree)
+        self.assertEquals([('foo', 0100644, a_sha)],
+                          eval_parse_tree(broken_tree))
+        self.assertRaises(ObjectFormatException,
+                          eval_parse_tree, broken_tree, strict=True)
+
+    test_parse_tree = functest_builder(_do_test_parse_tree, _parse_tree_py)
+    test_parse_tree_extension = ext_functest_builder(_do_test_parse_tree,
+                                                     parse_tree)
 
     def _do_test_sorted_tree_items(self, sorted_tree_items):
         def do_sort(entries):
-            return list(sorted_tree_items(entries))
+            return list(sorted_tree_items(entries, False))
 
         actual = do_sort(_TREE_ITEMS)
         self.assertEqual(_SORTED_TREE_ITEMS, actual)
@@ -494,13 +470,24 @@ class TreeTests(ShaFileCheckTests):
         self.assertRaises(errors, do_sort, {'foo': ('xxx', myhexsha)})
         self.assertRaises(errors, do_sort, {'foo': (0100755, 12345)})
 
-    def test_sorted_tree_items(self):
-        self._do_test_sorted_tree_items(_sorted_tree_items_py)
-
-    def test_sorted_tree_items_extension(self):
-        if sorted_tree_items is _sorted_tree_items_py:
-            raise TestSkipped('sorted_tree_items extension not found')
-        self._do_test_sorted_tree_items(sorted_tree_items)
+    test_sorted_tree_items = functest_builder(_do_test_sorted_tree_items,
+                                              _sorted_tree_items_py)
+    test_sorted_tree_items_extension = ext_functest_builder(
+      _do_test_sorted_tree_items, sorted_tree_items)
+
+    def _do_test_sorted_tree_items_name_order(self, sorted_tree_items):
+        self.assertEqual([
+          TreeEntry('a', stat.S_IFDIR,
+                    'd80c186a03f423a81b39df39dc87fd269736ca86'),
+          TreeEntry('a.c', 0100755, 'd80c186a03f423a81b39df39dc87fd269736ca86'),
+          TreeEntry('a/c', stat.S_IFDIR,
+                    'd80c186a03f423a81b39df39dc87fd269736ca86'),
+          ], list(sorted_tree_items(_TREE_ITEMS, True)))
+
+    test_sorted_tree_items_name_order = functest_builder(
+      _do_test_sorted_tree_items_name_order, _sorted_tree_items_py)
+    test_sorted_tree_items_name_order_extension = ext_functest_builder(
+      _do_test_sorted_tree_items_name_order, sorted_tree_items)
 
     def test_check(self):
         t = Tree
@@ -520,6 +507,8 @@ class TreeTests(ShaFileCheckTests):
         # TODO more whitelisted modes
         self.assertCheckFails(t, '123456 a\0%s' % sha)
         self.assertCheckFails(t, '123abc a\0%s' % sha)
+        # should fail check, but parses ok
+        self.assertCheckFails(t, '0100644 foo\0' + sha)
 
         # shas
         self.assertCheckFails(t, '100644 a\0%s' % ('x' * 5))

+ 26 - 16
dulwich/tests/test_pack.py

@@ -73,7 +73,8 @@ class PackTests(TestCase):
         shutil.rmtree(self.tempdir)
         super(PackTests, self).tearDown()
 
-    datadir = os.path.join(os.path.dirname(__file__), 'data/packs')
+    datadir = os.path.abspath(os.path.join(os.path.dirname(__file__),
+        'data/packs'))
 
     def get_pack_index(self, sha):
         """Returns a PackIndex from the datadir with the given sha"""
@@ -271,21 +272,30 @@ class TestPack(PackTests):
 
     def test_copy(self):
         origpack = self.get_pack(pack1_sha)
-        self.assertSucceeds(origpack.index.check)
-        basename = os.path.join(self.tempdir, 'Elch')
-        write_pack(basename, [(x, '') for x in origpack.iterobjects()],
-                   len(origpack))
-        newpack = Pack(basename)
-        self.assertEquals(origpack, newpack)
-        self.assertSucceeds(newpack.index.check)
-        self.assertEquals(origpack.name(), newpack.name())
-        self.assertEquals(origpack.index.get_pack_checksum(),
-                          newpack.index.get_pack_checksum())
-
-        wrong_version = origpack.index.version != newpack.index.version
-        orig_checksum = origpack.index.get_stored_checksum()
-        new_checksum = newpack.index.get_stored_checksum()
-        self.assertTrue(wrong_version or orig_checksum == new_checksum)
+
+        try:
+            self.assertSucceeds(origpack.index.check)
+            basename = os.path.join(self.tempdir, 'Elch')
+            write_pack(basename, [(x, '') for x in origpack.iterobjects()],
+                       len(origpack))
+            newpack = Pack(basename)
+
+            try:
+                self.assertEquals(origpack, newpack)
+                self.assertSucceeds(newpack.index.check)
+                self.assertEquals(origpack.name(), newpack.name())
+                self.assertEquals(origpack.index.get_pack_checksum(),
+                                  newpack.index.get_pack_checksum())
+
+                wrong_version = origpack.index.version != newpack.index.version
+                orig_checksum = origpack.index.get_stored_checksum()
+                new_checksum = newpack.index.get_stored_checksum()
+                self.assertTrue(wrong_version or orig_checksum == new_checksum)
+            finally:
+                newpack.close()
+        finally:
+            origpack.close()
+
 
     def test_commit_obj(self):
         p = self.get_pack(pack1_sha)

+ 138 - 1
dulwich/tests/test_patch.py

@@ -21,15 +21,22 @@
 from cStringIO import StringIO
 
 from dulwich.objects import (
+    Blob,
     Commit,
     Tree,
     )
+from dulwich.object_store import (
+    MemoryObjectStore,
+    )
 from dulwich.patch import (
     git_am_patch_split,
+    write_blob_diff,
     write_commit_patch,
+    write_tree_diff,
     )
 from dulwich.tests import (
     TestCase,
+    TestSkipped,
     )
 
 
@@ -152,4 +159,134 @@ From: Jelmer Vernooy <jelmer@debian.org>
 
 """
         c, diff, version = git_am_patch_split(StringIO(text))
-        self.assertIs(None, version)
+        self.assertEquals(None, version)
+
+    def test_extract_mercurial(self):
+        raise TestSkipped("git_am_patch_split doesn't handle Mercurial patches properly yet")
+        expected_diff = """diff --git a/dulwich/tests/test_patch.py b/dulwich/tests/test_patch.py
+--- a/dulwich/tests/test_patch.py
++++ b/dulwich/tests/test_patch.py
+@@ -158,7 +158,7 @@
+ 
+ '''
+         c, diff, version = git_am_patch_split(StringIO(text))
+-        self.assertIs(None, version)
++        self.assertEquals(None, version)
+ 
+ 
+ class DiffTests(TestCase):
+"""
+        text = """From dulwich-users-bounces+jelmer=samba.org@lists.launchpad.net Mon Nov 29 00:58:18 2010
+Date: Sun, 28 Nov 2010 17:57:27 -0600
+From: Augie Fackler <durin42@gmail.com>
+To: dulwich-users <dulwich-users@lists.launchpad.net>
+Subject: [Dulwich-users] [PATCH] test_patch: fix tests on Python 2.6
+Content-Transfer-Encoding: 8bit
+
+Change-Id: I5e51313d4ae3a65c3f00c665002a7489121bb0d6
+
+%s
+
+_______________________________________________
+Mailing list: https://launchpad.net/~dulwich-users
+Post to     : dulwich-users@lists.launchpad.net
+Unsubscribe : https://launchpad.net/~dulwich-users
+More help   : https://help.launchpad.net/ListHelp
+
+""" % expected_diff
+        c, diff, version = git_am_patch_split(StringIO(text))
+        self.assertEquals(expected_diff, diff)
+        self.assertEquals(None, version)
+
+
+class DiffTests(TestCase):
+    """Tests for write_blob_diff and write_tree_diff."""
+
+    def test_blob_diff(self):
+        f = StringIO()
+        write_blob_diff(f, ("foo.txt", 0644, Blob.from_string("old\nsame\n")),
+                           ("bar.txt", 0644, Blob.from_string("new\nsame\n")))
+        self.assertEquals([
+            "diff --git a/foo.txt b/bar.txt",
+            "index 3b0f961..a116b51 644",
+            "--- a/foo.txt",
+            "+++ b/bar.txt",
+            "@@ -1,2 +1,2 @@",
+            "-old",
+            "+new",
+            " same"
+            ], f.getvalue().splitlines())
+
+    def test_blob_add(self):
+        f = StringIO()
+        write_blob_diff(f, (None, None, None),
+                           ("bar.txt", 0644, Blob.from_string("new\nsame\n")))
+        self.assertEquals([
+            'diff --git /dev/null b/bar.txt',
+             'new mode 644',
+             'index 0000000..a116b51 644',
+             '--- /dev/null',
+             '+++ b/bar.txt',
+             '@@ -1,0 +1,2 @@',
+             '+new',
+             '+same'
+            ], f.getvalue().splitlines())
+
+    def test_blob_remove(self):
+        f = StringIO()
+        write_blob_diff(f, ("bar.txt", 0644, Blob.from_string("new\nsame\n")),
+                           (None, None, None))
+        self.assertEquals([
+            'diff --git a/bar.txt /dev/null',
+            'deleted mode 644',
+            'index a116b51..0000000',
+            '--- a/bar.txt',
+            '+++ /dev/null',
+            '@@ -1,2 +1,0 @@',
+            '-new',
+            '-same'
+            ], f.getvalue().splitlines())
+
+    def test_tree_diff(self):
+        f = StringIO()
+        store = MemoryObjectStore()
+        added = Blob.from_string("add\n")
+        removed = Blob.from_string("removed\n")
+        changed1 = Blob.from_string("unchanged\nremoved\n")
+        changed2 = Blob.from_string("unchanged\nadded\n")
+        unchanged = Blob.from_string("unchanged\n")
+        tree1 = Tree()
+        tree1.add(0644, "removed.txt", removed.id)
+        tree1.add(0644, "changed.txt", changed1.id)
+        tree1.add(0644, "unchanged.txt", changed1.id)
+        tree2 = Tree()
+        tree2.add(0644, "added.txt", added.id)
+        tree2.add(0644, "changed.txt", changed2.id)
+        tree2.add(0644, "unchanged.txt", changed1.id)
+        store.add_objects([(o, None) for o in [
+            tree1, tree2, added, removed, changed1, changed2, unchanged]])
+        write_tree_diff(f, store, tree1.id, tree2.id)
+        self.assertEquals([
+            'diff --git /dev/null b/added.txt',
+            'new mode 644',
+            'index e69de29..76d4bb8 644',
+            '--- /dev/null',
+            '+++ b/added.txt',
+            '@@ -1,0 +1,1 @@',
+            '+add',
+            'diff --git a/changed.txt b/changed.txt',
+            'index bf84e48..1be2436 644',
+            '--- a/changed.txt',
+            '+++ b/changed.txt',
+            '@@ -1,2 +1,2 @@',
+            ' unchanged',
+            '-removed',
+            '+added',
+            'diff --git a/removed.txt /dev/null',
+            'deleted mode 644',
+            'index 2c3f0b3..e69de29',
+            '--- a/removed.txt',
+            '+++ /dev/null',
+            '@@ -1,1 +1,0 @@',
+            '-removed',
+            ], f.getvalue().splitlines())

+ 34 - 17
dulwich/tests/test_repository.py

@@ -66,24 +66,35 @@ class CreateRepositoryTests(TestCase):
             finally:
                 f.close()
 
-    def _check_repo_contents(self, repo):
-        self.assertTrue(repo.bare)
+    def _check_repo_contents(self, repo, expect_bare):
+        self.assertEquals(expect_bare, repo.bare)
         self.assertFileContentsEqual('Unnamed repository', repo, 'description')
         self.assertFileContentsEqual('', repo, os.path.join('info', 'exclude'))
         self.assertFileContentsEqual(None, repo, 'nonexistent file')
+        barestr = 'bare = %s' % str(expect_bare).lower()
+        self.assertTrue(barestr in repo.get_named_file('config').read())
 
-    def test_create_disk(self):
+    def test_create_disk_bare(self):
         tmp_dir = tempfile.mkdtemp()
         try:
             repo = Repo.init_bare(tmp_dir)
             self.assertEquals(tmp_dir, repo._controldir)
-            self._check_repo_contents(repo)
+            self._check_repo_contents(repo, True)
+        finally:
+            shutil.rmtree(tmp_dir)
+
+    def test_create_disk_non_bare(self):
+        tmp_dir = tempfile.mkdtemp()
+        try:
+            repo = Repo.init(tmp_dir)
+            self.assertEquals(os.path.join(tmp_dir, '.git'), repo._controldir)
+            self._check_repo_contents(repo, False)
         finally:
             shutil.rmtree(tmp_dir)
 
     def test_create_memory(self):
         repo = MemoryRepo.init_bare([], {})
-        self._check_repo_contents(repo)
+        self._check_repo_contents(repo, True)
 
 
 class RepositoryTests(TestCase):
@@ -409,6 +420,16 @@ class BuildRepoTests(TestCase):
         tree = r[r[commit_sha].tree]
         self.assertEqual([], list(tree.iteritems()))
 
+    def test_commit_encoding(self):
+        r = self._repo
+        commit_sha = r.do_commit('commit with strange character \xee',
+             committer='Test Committer <test@nodomain.com>',
+             author='Test Author <test@nodomain.com>',
+             commit_timestamp=12395, commit_timezone=0,
+             author_timestamp=12395, author_timezone=0,
+             encoding="iso8859-1")
+        self.assertEquals("iso8859-1", r[commit_sha].encoding)
+
     def test_commit_fail_ref(self):
         r = self._repo
 
@@ -596,18 +617,13 @@ class RefsContainerTests(object):
                          self._refs['refs/heads/symbolic'])
 
     def test_check_refname(self):
-        try:
-            self._refs._check_refname('HEAD')
-        except KeyError:
-            self.fail()
-
-        try:
-            self._refs._check_refname('refs/heads/foo')
-        except KeyError:
-            self.fail()
+        self._refs._check_refname('HEAD')
+        self._refs._check_refname('refs/heads/foo')
 
-        self.assertRaises(KeyError, self._refs._check_refname, 'refs')
-        self.assertRaises(KeyError, self._refs._check_refname, 'notrefs/foo')
+        self.assertRaises(errors.RefFormatError, self._refs._check_refname,
+                          'refs')
+        self.assertRaises(errors.RefFormatError, self._refs._check_refname,
+                          'notrefs/foo')
 
     def test_contains(self):
         self.assertTrue('refs/heads/master' in self._refs)
@@ -732,7 +748,8 @@ class DiskRefsContainerTests(RefsContainerTests, TestCase):
         self.assertEquals(
           ('refs/heads/master', '42d06bd4b77fed026b154d16493e5deab78f02ec'),
           self._refs._follow('refs/heads/master'))
-        self.assertRaises(KeyError, self._refs._follow, 'notrefs/foo')
+        self.assertRaises(errors.RefFormatError, self._refs._follow,
+                          'notrefs/foo')
         self.assertRaises(KeyError, self._refs._follow, 'refs/heads/loop')
 
     def test_delitem(self):

+ 55 - 0
dulwich/tests/test_server.py

@@ -18,22 +18,30 @@
 
 """Tests for the smart protocol server."""
 
+from cStringIO import StringIO
+import os
+import tempfile
 
 from dulwich.errors import (
     GitProtocolError,
+    NotGitRepository,
     UnexpectedCommandError,
     )
 from dulwich.repo import (
     MemoryRepo,
+    Repo,
     )
 from dulwich.server import (
     Backend,
     DictBackend,
+    FileSystemBackend,
     Handler,
     MultiAckGraphWalkerImpl,
     MultiAckDetailedGraphWalkerImpl,
     _split_proto_line,
+    serve_command,
     ProtocolGraphWalker,
+    ReceivePackHandler,
     SingleAckGraphWalkerImpl,
     UploadPackHandler,
     )
@@ -638,3 +646,50 @@ class MultiAckDetailedGraphWalkerImplTestCase(AckGraphWalkerImplTestCase):
 
         self.assertNextEquals(None)
         self.assertNak()
+
+
+class FileSystemBackendTests(TestCase):
+    """Tests for FileSystemBackend."""
+
+    def setUp(self):
+        super(FileSystemBackendTests, self).setUp()
+        self.path = tempfile.mkdtemp()
+        self.repo = Repo.init(self.path)
+        self.backend = FileSystemBackend()
+
+    def test_nonexistant(self):
+        self.assertRaises(NotGitRepository,
+            self.backend.open_repository, "/does/not/exist/unless/foo")
+
+    def test_absolute(self):
+        repo = self.backend.open_repository(self.path)
+        self.assertEquals(repo.path, self.repo.path)
+
+    def test_child(self):
+        self.assertRaises(NotGitRepository,
+            self.backend.open_repository, os.path.join(self.path, "foo"))
+
+
+class ServeCommandTests(TestCase):
+    """Tests for serve_command."""
+
+    def setUp(self):
+        super(ServeCommandTests, self).setUp()
+        self.backend = DictBackend({})
+
+    def serve_command(self, handler_cls, args, inf, outf):
+        return serve_command(handler_cls, ["test"] + args, backend=self.backend,
+            inf=inf, outf=outf)
+
+    def test_receive_pack(self):
+        commit = make_commit(id=ONE, parents=[], commit_time=111)
+        self.backend.repos["/"] = MemoryRepo.init_bare(
+            [commit], {"refs/heads/master": commit.id})
+        outf = StringIO()
+        exitcode = self.serve_command(ReceivePackHandler, ["/"], StringIO("0000"), outf)
+        outlines = outf.getvalue().splitlines()
+        self.assertEquals(2, len(outlines))
+        self.assertEquals("1111111111111111111111111111111111111111 refs/heads/master",
+            outlines[0][4:].split("\x00")[0])
+        self.assertEquals("0000", outlines[-1])
+        self.assertEquals(0, exitcode)

+ 0 - 1
dulwich/tests/test_web.py

@@ -30,7 +30,6 @@ from dulwich.objects import (
     )
 from dulwich.repo import (
     BaseRepo,
-    DictRefsContainer,
     MemoryRepo,
     )
 from dulwich.server import (

+ 42 - 0
dulwich/tests/utils.py

@@ -25,12 +25,16 @@ import os
 import shutil
 import tempfile
 import time
+import types
 
 from dulwich.objects import (
     FixedSha,
     Commit,
     )
 from dulwich.repo import Repo
+from dulwich.tests import (
+    TestSkipped,
+    )
 
 
 def open_repo(name):
@@ -105,3 +109,41 @@ def make_commit(**attrs):
                  'tree': '0' * 40}
     all_attrs.update(attrs)
     return make_object(Commit, **all_attrs)
+
+
+def functest_builder(method, func):
+    """Generate a test method that tests the given function."""
+
+    def do_test(self):
+        method(self, func)
+
+    return do_test
+
+
+def ext_functest_builder(method, func):
+    """Generate a test method that tests the given extension function.
+
+    This is intended to generate test methods that test both a pure-Python
+    version and an extension version using common test code. The extension test
+    will raise TestSkipped if the extension is not found.
+
+    Sample usage:
+
+    class MyTest(TestCase):
+        def _do_some_test(self, func_impl):
+            self.assertEqual('foo', func_impl())
+
+        test_foo = functest_builder(_do_some_test, foo_py)
+        test_foo_extension = ext_functest_builder(_do_some_test, _foo_c)
+
+    :param method: The method to run. It must take two parameters, self and the
+        function implementation to test.
+    :param func: The function implementation to pass to method.
+    """
+
+    def do_test(self):
+        if not isinstance(func, types.BuiltinFunctionType):
+            raise TestSkipped("%s extension not found" % func.func_name)
+        method(self, func)
+
+    return do_test

+ 1 - 1
dulwich/web.py

@@ -27,7 +27,7 @@ import time
 try:
     from urlparse import parse_qs
 except ImportError:
-    from dulwich.misc import parse_qs
+    from dulwich._compat import parse_qs
 from dulwich import log_utils
 from dulwich.protocol import (
     ReceivableProtocol,

+ 2 - 0
setup.py

@@ -54,6 +54,8 @@ setup(name='dulwich',
                     include_dirs=include_dirs),
           Extension('dulwich._pack', ['dulwich/_pack.c'],
               include_dirs=include_dirs),
+          Extension('dulwich._diff_tree', ['dulwich/_diff_tree.c'],
+              include_dirs=include_dirs),
           ],
       distclass=DulwichDistribution,
       )