Browse Source

Import upstream version 0.6.2+bzr788

Jelmer Vernooij 14 years ago
parent
commit
df0a8161e9
55 changed files with 2989 additions and 846 deletions
  1. 0 7
      .bzrignore
  2. 3 2
      .testr.conf
  3. 2 0
      AUTHORS
  4. 11 14
      Makefile
  5. 36 5
      NEWS
  6. 6 13
      bin/dul-receive-pack
  7. 6 13
      bin/dul-upload-pack
  8. 10 9
      docs/tutorial/0-introduction.txt
  9. 0 119
      docs/tutorial/1-initial-commit.txt
  10. 28 0
      docs/tutorial/1-repo.txt
  11. 0 61
      docs/tutorial/2-change-file.txt
  12. 184 0
      docs/tutorial/2-object-store.txt
  13. 0 41
      docs/tutorial/3-add-file.txt
  14. 11 0
      docs/tutorial/3-conclusion.txt
  15. 0 30
      docs/tutorial/4-remove-file.txt
  16. 0 33
      docs/tutorial/5-rename-file.txt
  17. 0 14
      docs/tutorial/6-conclusion.txt
  18. 3 6
      docs/tutorial/index.txt
  19. 0 178
      docs/tutorial/test.py
  20. 76 1
      dulwich/_compat.py
  21. 449 0
      dulwich/_diff_tree.c
  22. 53 14
      dulwich/_objects.c
  23. 20 7
      dulwich/client.py
  24. 495 0
      dulwich/diff_tree.py
  25. 4 0
      dulwich/errors.py
  26. 36 3
      dulwich/fastexport.py
  27. 1 16
      dulwich/file.py
  28. 29 63
      dulwich/object_store.py
  29. 32 20
      dulwich/objects.py
  30. 7 9
      dulwich/pack.py
  31. 26 2
      dulwich/patch.py
  32. 1 2
      dulwich/protocol.py
  33. 34 18
      dulwich/repo.py
  34. 35 1
      dulwich/server.py
  35. 85 28
      dulwich/tests/__init__.py
  36. 37 0
      dulwich/tests/compat/__init__.py
  37. 4 0
      dulwich/tests/compat/test_client.py
  38. 2 3
      dulwich/tests/compat/test_server.py
  39. 3 2
      dulwich/tests/compat/test_utils.py
  40. 3 4
      dulwich/tests/compat/test_web.py
  41. 8 2
      dulwich/tests/compat/utils.py
  42. 67 0
      dulwich/tests/test_blackbox.py
  43. 61 0
      dulwich/tests/test_client.py
  44. 671 0
      dulwich/tests/test_diff_tree.py
  45. 69 0
      dulwich/tests/test_fastexport.py
  46. 32 8
      dulwich/tests/test_object_store.py
  47. 51 62
      dulwich/tests/test_objects.py
  48. 26 16
      dulwich/tests/test_pack.py
  49. 138 1
      dulwich/tests/test_patch.py
  50. 34 17
      dulwich/tests/test_repository.py
  51. 55 0
      dulwich/tests/test_server.py
  52. 0 1
      dulwich/tests/test_web.py
  53. 42 0
      dulwich/tests/utils.py
  54. 1 1
      dulwich/web.py
  55. 2 0
      setup.py

+ 0 - 7
.bzrignore

@@ -1,7 +0,0 @@
-_trial_temp
-build
-MANIFEST
-dist
-apidocs
-*,cover
-.testrepository

+ 3 - 2
.testr.conf

@@ -1,3 +1,4 @@
 [DEFAULT]
-test_command=PYTHONPATH=. python -m subunit.run $IDLIST
-test_id_list_default=dulwich.tests.test_suite
+test_command=PYTHONPATH=. python -m subunit.run $IDOPTION $LISTOPT dulwich.tests.test_suite
+test_id_option=--load-list $IDFILE
+test_list_option=--list

+ 2 - 0
AUTHORS

@@ -3,4 +3,6 @@ James Westby <jw+debian@jameswestby.net>
 John Carr <john.carr@unrouted.co.uk>
 Dave Borowitz <dborowitz@google.com>
 
+Hervé Cauwelier <herve@itaapy.com> wrote the original tutorial.
+
 See the revision history for a full list of contributors.

+ 11 - 14
Makefile

@@ -1,8 +1,12 @@
 PYTHON = python
 SETUP = $(PYTHON) setup.py
 PYDOCTOR ?= pydoctor
-TESTRUNNER = $(shell which nosetests)
-TESTFLAGS =
+ifeq ($(shell $(PYTHON) -c "import sys; print sys.version_info >= (2, 7)"),True)
+TESTRUNNER ?= unittest
+else
+TESTRUNNER ?= unittest2
+endif
+RUNTEST = PYTHONPATH=.:$(PYTHONPATH) $(PYTHON) -m $(TESTRUNNER)
 
 all: build
 
@@ -19,21 +23,14 @@ install::
 	$(SETUP) install
 
 check:: build
-	PYTHONPATH=.:$(PYTHONPATH) $(PYTHON) $(TESTRUNNER) dulwich
-	which git > /dev/null && PYTHONPATH=.:$(PYTHONPATH) $(PYTHON) $(TESTRUNNER) $(TESTFLAGS) -i compat
+	$(RUNTEST) dulwich.tests.test_suite
 
-check-noextensions:: clean
-	PYTHONPATH=.:$(PYTHONPATH) $(PYTHON) $(TESTRUNNER) $(TESTFLAGS) dulwich
+check-nocompat:: build
+	$(RUNTEST) dulwich.tests.nocompat_test_suite
 
-check-compat:: build
-	PYTHONPATH=.:$(PYTHONPATH) $(PYTHON) $(TESTRUNNER) $(TESTFLAGS) -i compat
+check-noextensions:: clean
+	$(RUNTEST) dulwich.tests.test_suite
 
 clean::
 	$(SETUP) clean --all
 	rm -f dulwich/*.so
-
-coverage:: build
-	PYTHONPATH=.:$(PYTHONPATH) $(PYTHON) $(TESTRUNNER) --cover-package=dulwich --with-coverage --cover-erase --cover-inclusive dulwich
-
-coverage-annotate: coverage
-	python-coverage -a -o /usr

+ 36 - 5
NEWS

@@ -2,17 +2,48 @@
 
  FEATURES
 
+  * New `dulwich.diff_tree` module for simple content-based rename detection.
+    (Dave Borowitz)
+
   * Add Tree.items(). (Jelmer Vernooij)
 
   * Add eof() and unread_pkt_line() methods to Protocol. (Dave Borowitz)
 
+  * Add write_tree_diff(). (Jelmer Vernooij)
+
+  * Add `serve_command` function for git server commands as executables.
+    (Jelmer Vernooij)
+
+  * dulwich.client.get_transport_and_path now supports rsync-style repository URLs.
+    (Dave Borowitz, #568493)
+
  BUG FIXES
 
   * Correct short-circuiting operation for no-op fetches in the server.
     (Dave Borowitz)
 
-  * Support parsing git mbox patches without a version tail, as generated by Mercurial. 
-    (Jelmer Vernooij)
+  * Support parsing git mbox patches without a version tail, as generated by
+    Mercurial.  (Jelmer Vernooij)
+
+  * Fix dul-receive-pack and dul-upload-pack. (Jelmer Vernooij)
+
+  * Zero-padded file modes in Tree objects no longer trigger an exception but
+    the check code warns about them. (Augie Fackler, #581064)
+
+  * Repo.init() now honors the mkdir flag. (#671159)
+
+  * The ref format is now checked when setting a ref rather than when reading it back.
+    (Dave Borowitz, #653527)
+
+  * Make sure pack files are closed correctly. (Tay Ray Chuan)
+
+ DOCUMENTATION
+
+  * Run the tutorial inside the test suite. (Jelmer Vernooij)
+
+  * Reorganized and updated the tutorial. (Jelmer Vernooij, Dave Borowitz, #610550,
+     #610540)
+
 
 0.6.2	2010-10-16
 
@@ -176,7 +207,7 @@ note: This list is most likely incomplete for 0.6.0.
   * Implement RefsContainer.__contains__. (Jelmer Vernooij)
 
   * Cope with \r in ref files on Windows. (
-	http://github.com/jelmer/dulwich/issues/#issue/13, Jelmer Vernooij)
+    http://github.com/jelmer/dulwich/issues/#issue/13, Jelmer Vernooij)
 
   * Fix GitFile breakage on Windows. (Anatoly Techtonik, #557585)
 
@@ -246,7 +277,7 @@ note: This list is most likely incomplete for 0.6.0.
     with chunks of strings rather than with full-text strings. 
     (Jelmer Vernooij)
 
-0.5.0	2010-03-03
+0.5.0	2010-03-03
 
  BUG FIXES
 
@@ -347,7 +378,7 @@ note: This list is most likely incomplete for 0.6.0.
 
   * Removed Repo.set_ref, Repo.remove_ref, Repo.tags, Repo.get_refs and 
     Repo.heads in favor of Repo.refs, a dictionary-like object for accessing
-	refs.
+    refs.
 
  BUG FIXES
 

+ 6 - 13
bin/dul-receive-pack

@@ -17,19 +17,12 @@
 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 # MA  02110-1301, USA.
 
+from dulwich.server import serve_command, ReceivePackHandler
+import os
 import sys
-from dulwich.repo import Repo
-from dulwich.server import GitBackend, ReceivePackHandler
 
-def send_fn(data):
-    sys.stdout.write(data)
-    sys.stdout.flush()
+if len(sys.argv) < 2:
+    print >>sys.stderr, "usage: %s <git-dir>" % os.path.basename(sys.argv[0])
+    sys.exit(1)
 
-if __name__ == "__main__":
-    gitdir = None
-    if len(sys.argv) > 1:
-        gitdir = sys.argv[1]
-
-    backend = GitBackend(Repo(gitdir))
-    handler = ReceivePackHandler(backend, sys.stdin.read, send_fn)
-    handler.handle()
+sys.exit(serve_command(ReceivePackHandler))

+ 6 - 13
bin/dul-upload-pack

@@ -17,19 +17,12 @@
 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 # MA  02110-1301, USA.
 
+from dulwich.server import serve_command, UploadPackHandler
+import os
 import sys
-from dulwich.repo import Repo
-from dulwich.server import GitBackend, UploadPackHandler
 
-def send_fn(data):
-    sys.stdout.write(data)
-    sys.stdout.flush()
+if len(sys.argv) < 2:
+    print >>sys.stderr, "usage: %s <git-dir>" % os.path.basename(sys.argv[0])
+    sys.exit(1)
 
-if __name__ == "__main__":
-    gitdir = None
-    if len(sys.argv) > 1:
-        gitdir = sys.argv[1]
-
-    backend = GitBackend(Repo(gitdir))
-    handler = UploadPackHandler(backend, sys.stdin.read, send_fn)
-    handler.handle()
+sys.exit(serve_command(UploadPackHandler))

+ 10 - 9
docs/tutorial/0-introduction.txt

@@ -45,16 +45,16 @@ tree.
 The Tree
 --------
 
-A tree is a collection of file information, the state of your working copy at
+A tree is a collection of file information, the state of a single directory at
 a given point in time.
 
 A tree file looks like this::
 
-  tree <content length><NUL><file mode> <filename><NUL><blob sha>...
+  tree <content length><NUL><file mode> <filename><NUL><item sha>...
 
 And repeats for every file in the tree.
 
-Note that for a unknown reason, the SHA-1 digest is in binary form here.
+Note that the SHA-1 digest is in binary form here.
 
 The file mode is like the octal argument you could give to the ``chmod``
 command.  Except it is in extended form to tell regular files from
@@ -88,14 +88,15 @@ accelerate operations and reduce space.
 More About Git formats
 ----------------------
 
-These three objects make 90 % of a Git repository. The rest is branch
-information and optimizations.
+These three objects make up most of the contents of a Git repository and are
+used for the history. They can either appear as simple files on disk (one file
+per object) or in a ``pack`` file, which is a container for a number of these
+objects.
 
-For instance there is an index of the current state of the working copy.
-There are also pack files to group several small objects in a single indexed
-file.
+There is also an index of the current state of the working copy in the
+repository as well as files to track the existing branches and tags.
 
-For a more detailled explanation of object formats and SHA-1 digests, see:
+For a more detailed explanation of object formats and SHA-1 digests, see:
 http://www-cs-students.stanford.edu/~blynn/gitmagic/ch08.html
 
 Just note that recent versions of Git compress object files using zlib.

+ 0 - 119
docs/tutorial/1-initial-commit.txt

@@ -1,119 +0,0 @@
-The Repository
-==============
-
-After this introduction, let's start directly with code::
-
-  >>> from dulwich.repo import Repo
-
-The access to every object is through the Repo object. You can open an
-existing repository or you can create a new one. There are two types of Git
-repositories:
-
-  Regular Repositories -- They are the ones you create using ``git init`` and
-  you daily use. They contain a ``.git`` folder.
-
-  Bare Repositories -- There is not ".git" folder. The top-level folder
-  contains itself the "branches", "hooks"... folders. These are used for
-  published repositories (mirrors).
-
-Let's create a folder and turn it into a repository, like ``git init`` would::
-
-  >>> from os import mkdir
-  >>> mkdir("myrepo")
-  >>> repo = Repo.init("myrepo")
-  >>> repo
-  <Repo at '/tmp/myrepo/'>
-
-You can already look a the structure of the "myrepo/.git" folder, though it
-is mostly empty for now.
-
-Initial commit
-==============
-
-When you use Git, you generally add or modify content. As our repository is
-empty for now, we'll start by adding a new file::
-
-  >>> from dulwich.objects import Blob
-  >>> blob = Blob.from_string("My file content\n")
-  >>> blob.id
-  'c55063a4d5d37aa1af2b2dad3a70aa34dae54dc6'
-
-Of course you could create a blob from an existing file using ``from_file``
-instead.
-
-As said in the introduction, file content is separed from file name. Let's
-give this content a name::
-
-  >>> from dulwich.objects import Tree
-  >>> tree = Tree()
-  >>> tree.add(0100644, "spam", blob.id)
-
-Note that "0100644" is the octal form for a regular file with common
-permissions. You can hardcode them or you can use the ``stat`` module.
-
-The tree state of our repository still needs to be placed in time. That's the
-job of the commit::
-
-  >>> from dulwich.objects import Commit, parse_timezone
-  >>> from time import time
-  >>> commit = Commit()
-  >>> commit.tree = tree.id
-  >>> author = "Your Name <your.email@example.com>"
-  >>> commit.author = commit.committer = author
-  >>> commit.commit_time = commit.author_time = int(time())
-  >>> tz = parse_timezone('-0200')
-  >>> commit.commit_timezone = commit.author_timezone = tz
-  >>> commit.encoding = "UTF-8"
-  >>> commit.message = "Initial commit"
-
-Note that the initial commit has no parents.
-
-At this point, the repository is still empty because all operations happen in
-memory. Let's "commit" it.
-
-  >>> object_store = repo.object_store
-  >>> object_store.add_object(blob)
-
-Now the ".git/objects" folder contains a first SHA-1 file. Let's continue
-saving the changes::
-
-  >>> object_store.add_object(tree)
-  >>> object_store.add_object(commit)
-
-Now the physical repository contains three objects but still has no branch.
-Let's create the master branch like Git would::
-
-  >>> repo.refs['refs/heads/master'] = commit.id
-
-The master branch now has a commit where to start, but Git itself would not
-known what is the current branch. That's another reference::
-
-  >>> repo.refs['HEAD'] = 'ref: refs/heads/master'
-
-Now our repository is officialy tracking a branch named "master" refering to a
-single commit.
-
-Playing again with Git
-======================
-
-At this point you can come back to the shell, go into the "myrepo" folder and
-type ``git status`` to let Git confirm that this is a regular repository on
-branch "master".
-
-Git will tell you that the file "spam" is deleted, which is normal because
-Git is comparing the repository state with the current working copy. And we
-have absolutely no working copy using Dulwich because we don't need it at
-all!
-
-You can checkout the last state using ``git checkout -f``. The force flag
-will prevent Git from complaining that there are uncommitted changes in the
-working copy.
-
-The file ``spam`` appears and with no surprise contains the same bytes as the
-blob::
-
-  $ cat spam
-  My file content
-
-.. attention:: Remember to recreate the repo object when you modify the
-               repository outside of Dulwich!

+ 28 - 0
docs/tutorial/1-repo.txt

@@ -0,0 +1,28 @@
+The Repository
+==============
+
+After this introduction, let's start directly with code::
+
+  >>> from dulwich.repo import Repo
+
+The access to a repository is through the Repo object. You can open an
+existing repository or you can create a new one. There are two types of Git
+repositories:
+
+  Regular Repositories -- They are the ones you create using ``git init`` and
+  use daily. They contain a ``.git`` folder.
+
+  Bare Repositories -- There is no ".git" folder. The top-level folder
+  itself contains the "branches", "hooks"... folders. These are used for
+  published repositories (mirrors). They do not have a working tree.
+
+Let's create a folder and turn it into a repository, like ``git init`` would::
+
+  >>> from os import mkdir
+  >>> mkdir("myrepo")
+  >>> repo = Repo.init("myrepo")
+  >>> repo
+  <Repo at 'myrepo'>
+
+You can already look at the structure of the "myrepo/.git" folder, though it
+is mostly empty for now.

+ 0 - 61
docs/tutorial/2-change-file.txt

@@ -1,61 +0,0 @@
-Changing a File and Commit it
-=============================
-
-Now we have a first commit, the next one will show a difference.
-
-As seen in the introduction, it's about making a path in a tree point to a
-new blob. The old blob will remain to compute the diff. The tree is altered
-and the new commit'task is to point to this new version.
-
-In the following examples, we assume we still have the ``repo`` and ``tree``
-object from the previous chapter.
-
-Let's first build the blob::
-
-  >>> spam = Blob.from_string("My new file content\n")
-  >>> spam.id
-  '16ee2682887a962f854ebd25a61db16ef4efe49f'
-
-An alternative is to alter the previously constructed blob object::
-
-  >>> blob.data = "My new file content\n"
-  >>> blob.id
-  '16ee2682887a962f854ebd25a61db16ef4efe49f'
-
-In any case, update the blob id known as "spam". You also have the
-opportunity of changing its mode::
-
-  >>> tree["spam"] = (0100644, spam.id)
-
-Now let's record the change::
-
-  >>> c2 = Commit()
-  >>> c2.tree = tree.id
-  >>> c2.parents = [commit.id]
-  >>> c2.author = c2.committer = author
-  >>> c2.commit_time = c2.author_time = int(time())
-  >>> c2.commit_timezone = c2.author_timezone = tz
-  >>> c2.encoding = "UTF-8"
-  >>> c2.message = 'Changing "spam"'
-
-In this new commit we record the changed tree id, and most important, the
-previous commit as the parent. Parents are actually a list because a commit
-may happen to have several parents after merging branches.
-
-Remain to record this whole new family::
-
-  >>> object_store.add_object(spam)
-  >>> object_store.add_object(tree)
-  >>> object_store.add_object(c2)
-
-You can already ask git to introspect this commit using ``git show`` and the
-value of ``commit.id`` as an argument. You'll see the difference will the
-previous blob recorded as "spam".
-
-You won't see it using git log because the head is still the previous
-commit. It's easy to remedy::
-
-  >>> repo.refs['refs/heads/master'] = c2.id
-
-Now all git tools will work as expected. Though don't forget that Dulwich is
-still open!

+ 184 - 0
docs/tutorial/2-object-store.txt

@@ -0,0 +1,184 @@
+The object store
+================
+
+The objects are stored in the ``object store`` of the repository.
+
+  >>> from dulwich.repo import Repo
+  >>> repo = Repo.init("myrepo", mkdir=True)
+
+Initial commit
+--------------
+
+When you use Git, you generally add or modify content. As our repository is
+empty for now, we'll start by adding a new file::
+
+  >>> from dulwich.objects import Blob
+  >>> blob = Blob.from_string("My file content\n")
+  >>> blob.id
+  'c55063a4d5d37aa1af2b2dad3a70aa34dae54dc6'
+
+Of course you could create a blob from an existing file using ``from_file``
+instead.
+
+As said in the introduction, file content is separated from file name. Let's
+give this content a name::
+
+  >>> from dulwich.objects import Tree
+  >>> tree = Tree()
+  >>> tree.add(0100644, "spam", blob.id)
+
+Note that "0100644" is the octal form for a regular file with common
+permissions. You can hardcode them or you can use the ``stat`` module.
+
+The tree state of our repository still needs to be placed in time. That's the
+job of the commit::
+
+  >>> from dulwich.objects import Commit, parse_timezone
+  >>> from time import time
+  >>> commit = Commit()
+  >>> commit.tree = tree.id
+  >>> author = "Your Name <your.email@example.com>"
+  >>> commit.author = commit.committer = author
+  >>> commit.commit_time = commit.author_time = int(time())
+  >>> tz = parse_timezone('-0200')[0]
+  >>> commit.commit_timezone = commit.author_timezone = tz
+  >>> commit.encoding = "UTF-8"
+  >>> commit.message = "Initial commit"
+
+Note that the initial commit has no parents.
+
+At this point, the repository is still empty because all operations happen in
+memory. Let's "commit" it.
+
+  >>> object_store = repo.object_store
+  >>> object_store.add_object(blob)
+
+Now the ".git/objects" folder contains a first SHA-1 file. Let's continue
+saving the changes::
+
+  >>> object_store.add_object(tree)
+  >>> object_store.add_object(commit)
+
+Now the physical repository contains three objects but still has no branch.
+Let's create the master branch like Git would::
+
+  >>> repo.refs['refs/heads/master'] = commit.id
+
+The master branch now has a commit to start from. When we commit to master, we
+are also moving HEAD, which is Git's currently checked out branch:
+
+  >>> head = repo.refs['HEAD']
+  >>> head == commit.id
+  True
+  >>> head == repo.refs['refs/heads/master']
+  True
+
+How did that work? As it turns out, HEAD is a special kind of ref called a
+symbolic ref, and it points at master. Most functions on the refs container
+work transparently with symbolic refs, but we can also take a peek inside HEAD:
+
+  >>> repo.refs.read_ref('HEAD')
+  'ref: refs/heads/master'
+
+Normally, you won't need to use read_ref. If you want to change what ref HEAD
+points to, in order to check out another branch, just use set_symbolic_ref.
+
+Now our repository is officially tracking a branch named "master" referring to a
+single commit.
+
+Playing again with Git
+----------------------
+
+At this point you can come back to the shell, go into the "myrepo" folder and
+type ``git status`` to let Git confirm that this is a regular repository on
+branch "master".
+
+Git will tell you that the file "spam" is deleted, which is normal because
+Git is comparing the repository state with the current working copy. And we
+have absolutely no working copy using Dulwich because we don't need it at
+all!
+
+You can checkout the last state using ``git checkout -f``. The force flag
+will prevent Git from complaining that there are uncommitted changes in the
+working copy.
+
+The file ``spam`` appears and with no surprise contains the same bytes as the
+blob::
+
+  $ cat spam
+  My file content
+
+Changing a File and Committing it
+---------------------------------
+
+Now we have a first commit, the next one will show a difference.
+
+As seen in the introduction, it's about making a path in a tree point to a
+new blob. The old blob will remain to compute the diff. The tree is altered
+and the new commit's task is to point to this new version.
+
+Let's first build the blob::
+
+  >>> from dulwich.objects import Blob
+  >>> spam = Blob.from_string("My new file content\n")
+  >>> spam.id
+  '16ee2682887a962f854ebd25a61db16ef4efe49f'
+
+An alternative is to alter the previously constructed blob object::
+
+  >>> blob.data = "My new file content\n"
+  >>> blob.id
+  '16ee2682887a962f854ebd25a61db16ef4efe49f'
+
+In any case, update the blob id known as "spam". You also have the
+opportunity of changing its mode::
+
+  >>> tree["spam"] = (0100644, spam.id)
+
+Now let's record the change::
+
+  >>> from dulwich.objects import Commit
+  >>> from time import time
+  >>> c2 = Commit()
+  >>> c2.tree = tree.id
+  >>> c2.parents = [commit.id]
+  >>> c2.author = c2.committer = "John Doe <john@example.com>"
+  >>> c2.commit_time = c2.author_time = int(time())
+  >>> c2.commit_timezone = c2.author_timezone = 0
+  >>> c2.encoding = "UTF-8"
+  >>> c2.message = 'Changing "spam"'
+
+In this new commit we record the changed tree id, and most important, the
+previous commit as the parent. Parents are actually a list because a commit
+may happen to have several parents after merging branches.
+
+Let's put the objects in the object store::
+
+  >>> repo.object_store.add_object(spam)
+  >>> repo.object_store.add_object(tree)
+  >>> repo.object_store.add_object(c2)
+
+You can already ask git to introspect this commit using ``git show`` and the
+value of ``c2.id`` as an argument. You'll see the difference with the
+previous blob recorded as "spam".
+
+The diff between the previous head and the new one can be printed using
+write_tree_diff::
+
+  >>> from dulwich.patch import write_tree_diff
+  >>> import sys
+  >>> write_tree_diff(sys.stdout, repo.object_store, commit.tree, tree.id)
+  diff --git a/spam b/spam
+  index c55063a..16ee268 100644
+  --- a/spam
+  +++ b/spam
+  @@ -1,1 +1,1 @@
+  -My file content
+  +My new file content
+
+You won't see it using git log because the head is still the previous
+commit. It's easy to remedy::
+
+  >>> repo.refs['refs/heads/master'] = c2.id
+
+Now all git tools will work as expected.

+ 0 - 41
docs/tutorial/3-add-file.txt

@@ -1,41 +0,0 @@
-Adding a file
-=============
-
-If you followed well, the next lesson will be straightforward.
-
-We need a new blob::
-
-    >>> ham = Blob.from_string("Another\nmultiline\nfile\n")
-    >>> ham.id
-    'a3b5eda0b83eb8fb6e5dce91ecafda9e97269c70'
-
-But the same tree::
-
-    >>> tree["ham"] = (0100644, spam.id)
-
-And a new commit::
-
-  >>> c3 = Commit()
-  >>> c3.tree = tree.id
-  >>> c3.parents = [commit.id]
-  >>> c3.author = c3.committer = author
-  >>> c3.commit_time = c3.author_time = int(time())
-  >>> c3.commit_timezone = c3.author_timezone = tz
-  >>> c3.encoding = "UTF-8"
-  >>> c3.message = 'Adding "ham"'
-
-Save it all::
-
-    >>> object_store.add_object(spam)
-    >>> object_store.add_object(tree)
-    >>> object_store.add_object(c3)
-
-Update the head::
-
-    >>> repo.refs['refs/heads/master'] = commit.id
-
-A call to ``git show`` will confirm the addition of "spam".
-
-Remember you can also call ``git checkout -f`` to make it appear.
-
-Well... Adding "spam" was not such a good idea... We'll remove it.

+ 11 - 0
docs/tutorial/3-conclusion.txt

@@ -0,0 +1,11 @@
+Conclusion
+==========
+
+This tutorial currently only covers a small (but important) part of Dulwich.
+It still needs to be extended to cover packs, tags, refs, reflogs and network
+communication.
+
+Dulwich is abstracting much of the Git plumbing, so there would be more to
+see.
+
+For now, that's all folks!

+ 0 - 30
docs/tutorial/4-remove-file.txt

@@ -1,30 +0,0 @@
-Removing a file
-===============
-
-Removing a file just means removing its entry in the tree. The blob won't be
-deleted because Git tries to preserve the history of your repository.
-
-It's all pythonic::
-
-    >>> del tree["ham"]
-
-  >>> c4 = Commit()
-  >>> c4.tree = tree.id
-  >>> c4.parents = [commit.id]
-  >>> c4.author = c4.committer = author
-  >>> c4.commit_time = c4.author_time = int(time())
-  >>> c4.commit_timezone = c4.author_timezone = tz
-  >>> c4.encoding = "UTF-8"
-  >>> c4.message = 'Removing "ham"'
-
-Here we only have the new tree and the commit to save::
-
-    >>> object_store.add_object(spam)
-    >>> object_store.add_object(tree)
-    >>> object_store.add_object(c4)
-
-And of course update the head::
-
-    >>> repo.refs['refs/heads/master'] = commit.id
-
-If you don't trust me, ask ``git show``. ;-)

+ 0 - 33
docs/tutorial/5-rename-file.txt

@@ -1,33 +0,0 @@
-Renaming a file
-===============
-
-Remember you learned that the file name and content are distinct. So renaming
-a file is just about associating a blob id to a new name. We won't store more
-content, and the operation will be painless.
-
-Let's transfer the blob id from the old name to the new one::
-
-    >>> tree["eggs"] = tree["spam"]
-    >>> del tree["spam"]
-
-As usual, we need a commit to store the new tree id::
-
-  >>> c5 = Commit()
-  >>> c5.tree = tree.id
-  >>> c5.parents = [commit.id]
-  >>> c5.author = c5.committer = author
-  >>> c5.commit_time = c5.author_time = int(time())
-  >>> c5.commit_timezone = c5.author_timezone = tz
-  >>> c5.encoding = "UTF-8"
-  >>> c5.message = 'Rename "spam" to "eggs"'
-
-As for a deletion, we only have a tree and a commit to save::
-
-    >>> object_store.add_object(tree)
-    >>> object_store.add_object(c5)
-
-Remains to make the head bleeding-edge::
-
-    >>> repo.refs['refs/heads/master'] = commit.id
-
-As a last exercise, see how ``git show`` illustrates it.

+ 0 - 14
docs/tutorial/6-conclusion.txt

@@ -1,14 +0,0 @@
-Conclusion
-==========
-
-You'll find the ``test.py`` program with some tips I use to ease generating
-objects.
-
-You can also make Tag objects, but this is left as a exercise to the reader.
-
-Dulwich is abstracting  much of the Git plumbing, so there would be more to
-see.
-
-Dulwich is also able to clone and push repositories.
-
-That's all folks!

+ 3 - 6
docs/tutorial/index.txt

@@ -5,9 +5,6 @@ Dulwich Tutorial
 .. contents::
 
 .. include:: 0-introduction.txt
-.. include:: 1-initial-commit.txt
-.. include:: 2-change-file.txt
-.. include:: 3-add-file.txt
-.. include:: 4-remove-file.txt
-.. include:: 5-rename-file.txt
-.. include:: 6-conclusion.txt
+.. include:: 1-repo.txt
+.. include:: 2-object-store.txt
+.. include:: 3-conclusion.txt

+ 0 - 178
docs/tutorial/test.py

@@ -1,178 +0,0 @@
-#!/usr/bin/env python
-# -*- encoding: UTF-8 -*-
-
-# Import from the Standard Library
-from os import F_OK, access, mkdir
-from pprint import pprint
-from shutil import rmtree
-from subprocess import call
-from time import time
-
-# Import from dulwich
-from dulwich.repo import Repo
-from dulwich.objects import Blob, Tree, Commit, parse_timezone
-
-
-DIRNAME = "myrepo"
-AUTHOR = "Your Name <your.email@example.com>"
-TZ = parse_timezone('-200')
-ENCODING = "UTF-8"
-
-
-def make_commit(repo, tree_id, message):
-    """Build a commit object on the same pattern. Only changing values are
-    required as parameters.
-    """
-    commit = Commit()
-    try:
-        commit.parents = [repo.head()]
-    except KeyError:
-        # The initial commit has no parent
-        pass
-    commit.tree = tree_id
-    commit.message = message
-    commit.author = commit.committer = AUTHOR
-    commit.commit_time = commit.author_time = int(time())
-    commit.commit_timezone = commit.author_timezone = TZ
-    commit.encoding = ENCODING
-    return commit
-
-
-
-def make_tree(repo):
-    """Return the last known tree.
-    """
-    commit_id = repo.head()
-    commit = repo.commit(commit_id)
-    tree_id = commit.tree
-    return repo.tree(tree_id)
-
-
-
-def update_master(repo, commit_id):
-    repo.refs['refs/heads/master'] = commit_id
-
-
-
-def initial_commit(repo):
-    # Add file content
-    blob = Blob.from_string("My file content\n")
-    # Add file
-    tree = Tree()
-    tree.add(0100644, "spam", blob.id)
-    # Set commit
-    commit = make_commit(repo, tree.id, "Initial commit")
-    # Initial commit
-    object_store = repo.object_store
-    object_store.add_object(blob)
-    object_store.add_object(tree)
-    object_store.add_object(commit)
-    # Update master
-    update_master(repo, commit.id)
-    # Set the master branch as the default
-    repo.refs['HEAD'] = 'ref: refs/heads/master'
-
-
-
-def test_change(repo):
-    tree = make_tree(repo)
-    # Change a file
-    spam = Blob.from_string("My new file content\n")
-    tree.add(0100644, "spam", spam.id)
-    # Set commit
-    commit = make_commit(repo, tree.id, "Change spam")
-    # Second commit
-    object_store = repo.object_store
-    object_store.add_object(spam)
-    object_store.add_object(tree)
-    object_store.add_object(commit)
-    # Update master
-    update_master(repo, commit.id)
-
-
-
-def test_add(repo):
-    tree = make_tree(repo)
-    # Add another file
-    ham = Blob.from_string("Another\nmultiline\nfile\n")
-    tree.add(0100644, "ham", ham.id)
-    # Set commit
-    commit = make_commit(repo, tree.id, "Add ham")
-    # Second commit
-    object_store = repo.object_store
-    object_store.add_object(ham)
-    object_store.add_object(tree)
-    object_store.add_object(commit)
-    # Update master
-    update_master(repo, commit.id)
-
-
-
-def test_remove(repo):
-    tree = make_tree(repo)
-    # Remove a file
-    del tree["ham"]
-    # Set commit
-    commit = make_commit(repo, tree.id, 'Remove "ham"')
-    # Third commit
-    # No blob change, just tree operation
-    object_store = repo.object_store
-    object_store.add_object(tree)
-    object_store.add_object(commit)
-    # Update master
-    update_master(repo, commit.id)
-
-
-
-def test_rename(repo):
-    tree = make_tree(repo)
-    # Rename a file
-    tree["eggs"] = tree["spam"]
-    del tree["spam"]
-    # Set commit
-    commit = make_commit(repo, tree.id, 'Rename "spam" to "eggs"')
-    # Fourth commit
-    # No blob change, just tree operation
-    object_store = repo.object_store
-    object_store.add_object(tree)
-    object_store.add_object(commit)
-    # Update master
-    update_master(repo, commit.id)
-
-
-
-def test_history(repo):
-    pprint(repo.revision_history(repo.head()))
-
-
-
-def test_file(repo):
-    tree = make_tree(repo)
-    print "entries", tree.entries()
-    mode, blob_id = tree["eggs"]
-    blob = repo.get_blob(blob_id)
-    print "eggs", repr(blob.data)
-
-
-
-if __name__ == '__main__':
-    # Creating the repository
-    if access(DIRNAME, F_OK):
-        rmtree(DIRNAME)
-    mkdir(DIRNAME)
-    repo = Repo.init(DIRNAME)
-    initial_commit(repo)
-    test_change(repo)
-    test_add(repo)
-    test_remove(repo)
-    test_rename(repo)
-    last_commit_id = repo.head()
-    call(['git', 'gc'], cwd=DIRNAME)
-    # Re-load the repo
-    del repo
-    repo = Repo(DIRNAME)
-    # XXX the ref was removed and dulwich doesn't know where to read it
-    update_master(repo, last_commit_id)
-    assert last_commit_id == repo.head()
-    test_history(repo)
-    test_file(repo)

+ 76 - 1
dulwich/misc.py → dulwich/_compat.py

@@ -1,4 +1,4 @@
-# misc.py -- For dealing with python2.4 oddness
+# _compat.py -- For dealing with python2.4 oddness
 # Copyright (C) 2008 Canonical Ltd.
 #
 # This program is free software; you can redistribute it and/or
@@ -101,10 +101,45 @@ def unpack_from(fmt, buf, offset=0):
         return struct.unpack(fmt, b)
 
 
+try:
+    from itertools import permutations
+except ImportError:
+    # Implementation of permutations from Python 2.6 documentation:
+    # http://docs.python.org/2.6/library/itertools.html#itertools.permutations
+    # Copyright (c) 2001-2010 Python Software Foundation; All Rights Reserved
+    # Modified syntax slightly to run under Python 2.4.
+    def permutations(iterable, r=None):
+        """Yield successive r-length permutations of the items in iterable.
+
+        :param iterable: Source of items; copied into a tuple up front.
+        :param r: Length of each permutation; defaults to the number of items.
+        :return: Generator of tuples. Nothing is yielded when r exceeds the
+            number of items.
+        """
+        # permutations('ABCD', 2) --> AB AC AD BA BC BD CA CB CD DA DB DC
+        # permutations(range(3)) --> 012 021 102 120 201 210
+        pool = tuple(iterable)
+        n = len(pool)
+        if r is None:
+            r = n
+        if r > n:
+            return
+        indices = range(n)
+        cycles = range(n, n-r, -1)
+        yield tuple(pool[i] for i in indices[:r])
+        while n:
+            for i in reversed(range(r)):
+                cycles[i] -= 1
+                if cycles[i] == 0:
+                    # Position i has used every remaining item: rotate it to
+                    # the end and reset its countdown.
+                    indices[i:] = indices[i+1:] + indices[i:i+1]
+                    cycles[i] = n - i
+                else:
+                    j = cycles[i]
+                    indices[i], indices[-j] = indices[-j], indices[i]
+                    yield tuple(pool[i] for i in indices[:r])
+                    break
+            else:
+                # Every position was reset without yielding: all done.
+                return
+
+
 try:
     from collections import namedtuple
 
     TreeEntryTuple = namedtuple('TreeEntryTuple', ['path', 'mode', 'sha'])
+    TreeChangeTuple = namedtuple('TreeChangeTuple', ['type', 'old', 'new'])
 except ImportError:
     # Provide manual implementations of namedtuples for Python <2.5.
     # If the class definitions change, be sure to keep these in sync by running
@@ -153,3 +188,43 @@ except ImportError:
             path = _property(_itemgetter(0))
             mode = _property(_itemgetter(1))
             sha = _property(_itemgetter(2))
+
+
+    # Manual stand-in for namedtuple('TreeChangeTuple', ['type', 'old', 'new'])
+    # on interpreters where collections.namedtuple cannot be imported.
+    class TreeChangeTuple(tuple):
+            'TreeChangeTuple(type, old, new)'
+
+            __slots__ = ()
+
+            _fields = ('type', 'old', 'new')
+
+            def __new__(_cls, type, old, new):
+                return _tuple.__new__(_cls, (type, old, new))
+
+            @classmethod
+            def _make(cls, iterable, new=tuple.__new__, len=len):
+                'Make a new TreeChangeTuple object from a sequence or iterable'
+                result = new(cls, iterable)
+                if len(result) != 3:
+                    raise TypeError('Expected 3 arguments, got %d' % len(result))
+                return result
+
+            def __repr__(self):
+                return 'TreeChangeTuple(type=%r, old=%r, new=%r)' % self
+
+            def _asdict(t):
+                'Return a new dict which maps field names to their values'
+                return {'type': t[0], 'old': t[1], 'new': t[2]}
+
+            def _replace(_self, **kwds):
+                'Return a new TreeChangeTuple object replacing specified fields with new values'
+                # kwds.pop(name, current) takes the override when one was
+                # passed and falls back to the existing value otherwise.
+                result = _self._make(map(kwds.pop, ('type', 'old', 'new'), _self))
+                if kwds:
+                    raise ValueError('Got unexpected field names: %r' % kwds.keys())
+                return result
+
+            def __getnewargs__(self):
+                return tuple(self)
+
+            # Read-only named accessors for the three tuple slots.
+            type = _property(_itemgetter(0))
+            old = _property(_itemgetter(1))
+            new = _property(_itemgetter(2))

+ 449 - 0
dulwich/_diff_tree.c

@@ -0,0 +1,449 @@
+/*
+ * Copyright (C) 2010 Google, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License or (at your option) a later version of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA  02110-1301, USA.
+ */
+
+#include <Python.h>
+#include <sys/stat.h>
+
+#if (PY_VERSION_HEX < 0x02050000)
+typedef int Py_ssize_t;
+#endif
+
+#if (PY_VERSION_HEX < 0x02060000)
+#define Py_SIZE(ob)             (((PyVarObject*)(ob))->ob_size)
+#endif
+
+static PyObject *tree_entry_cls = NULL, *null_entry = NULL,
+	*defaultdict_cls = NULL, *int_cls = NULL;
+static int block_size;
+
+/**
+ * Free an array of PyObject pointers, decrementing any references.
+ *
+ * :param objs: The array to free, as allocated with PyMem_New. May be NULL,
+ *     in which case nothing is done regardless of n.
+ * :param n: The number of leading slots in objs to release; NULL slots are
+ *     skipped.
+ */
+static void free_objects(PyObject **objs, Py_ssize_t n)
+{
+	Py_ssize_t i;
+
+	/* Cleanup paths may hand us an array that was never allocated together
+	 * with a non-zero count; tolerate that instead of dereferencing NULL. */
+	if (!objs)
+		return;
+	for (i = 0; i < n; i++)
+		Py_XDECREF(objs[i]);
+	PyMem_Free(objs);
+}
+
+/**
+ * Get the entries of a tree, prepending the given path.
+ *
+ * :param path: The path to prepend, without trailing slashes.
+ * :param path_len: The length of path.
+ * :param tree: The Tree object to iterate, or None for no entries.
+ * :param n: Set to the length of result. Always 0 when NULL is returned, so
+ *     it never describes an array that was not handed back.
+ * :return: A (C) array of PyObject pointers to TreeEntry objects for each path
+ *     in tree, or NULL with a Python exception set on failure.
+ */
+static PyObject **tree_entries(char *path, Py_ssize_t path_len, PyObject *tree,
+		Py_ssize_t *n)
+{
+	PyObject *iteritems, *items, **result = NULL;
+	PyObject *old_entry, *name, *sha;
+	Py_ssize_t i = 0, count, name_len, new_path_len;
+	char *new_path;
+
+	*n = 0;
+	if (tree == Py_None) {
+		result = PyMem_New(PyObject*, 0);
+		if (!result) {
+			PyErr_SetNone(PyExc_MemoryError);
+			return NULL;
+		}
+		return result;
+	}
+
+	iteritems = PyObject_GetAttrString(tree, "iteritems");
+	if (!iteritems)
+		return NULL;
+	/* Py_True is passed as the first positional argument of iteritems(). */
+	items = PyObject_CallFunctionObjArgs(iteritems, Py_True, NULL);
+	Py_DECREF(iteritems);
+	if (!items) {
+		return NULL;
+	}
+	/* The C implementation of iteritems returns a list, so depend on that. */
+	if (!PyList_Check(items)) {
+		PyErr_SetString(PyExc_TypeError,
+			"Tree.iteritems() did not return a list");
+		/* Go through the common exit so that items is released. */
+		goto error;
+	}
+
+	count = PyList_Size(items);
+	result = PyMem_New(PyObject*, count);
+	if (!result) {
+		PyErr_SetNone(PyExc_MemoryError);
+		goto error;
+	}
+	for (i = 0; i < count; i++) {
+		old_entry = PyList_GetItem(items, i);
+		if (!old_entry)
+			goto error;
+		/* The checked accessor validates that old_entry is a tuple with at
+		 * least three items, which makes the unchecked macros below safe. */
+		sha = PyTuple_GetItem(old_entry, 2);
+		if (!sha)
+			goto error;
+		name = PyTuple_GET_ITEM(old_entry, 0);
+		name_len = PyString_Size(name);
+		if (PyErr_Occurred())
+			goto error;
+
+		/* Joined path is "<path>/<name>", or just "<name>" at the root. */
+		new_path_len = name_len;
+		if (path_len)
+			new_path_len += path_len + 1;
+		new_path = PyMem_Malloc(new_path_len);
+		if (!new_path) {
+			PyErr_SetNone(PyExc_MemoryError);
+			goto error;
+		}
+		if (path_len) {
+			memcpy(new_path, path, path_len);
+			new_path[path_len] = '/';
+			memcpy(new_path + path_len + 1, PyString_AS_STRING(name), name_len);
+		} else {
+			memcpy(new_path, PyString_AS_STRING(name), name_len);
+		}
+
+		result[i] = PyObject_CallFunction(tree_entry_cls, "s#OO", new_path,
+			new_path_len, PyTuple_GET_ITEM(old_entry, 1), sha);
+		PyMem_Free(new_path);
+		if (!result[i]) {
+			goto error;
+		}
+	}
+	Py_DECREF(items);
+	*n = count;
+	return result;
+
+error:
+	/* result is still NULL if we failed before allocating it; i counts only
+	 * the slots that were filled in successfully. */
+	if (result)
+		free_objects(result, i);
+	Py_DECREF(items);
+	return NULL;
+}
+
+/**
+ * Order two TreeEntry objects by their "path" attribute using strcmp.
+ *
+ * :return: The strcmp result for the two paths. Returns 0 with a Python
+ *     exception set if either path cannot be fetched or is not a string, so
+ *     callers must check PyErr_Occurred() to tell failure from equality.
+ */
+static int entry_path_cmp(PyObject *entry1, PyObject *entry2)
+{
+	PyObject *lhs, *rhs;
+	int order = 0;
+
+	lhs = PyObject_GetAttrString(entry1, "path");
+	if (!lhs)
+		return 0;
+	if (!PyString_Check(lhs)) {
+		PyErr_SetString(PyExc_TypeError, "path is not a string");
+		Py_DECREF(lhs);
+		return 0;
+	}
+
+	rhs = PyObject_GetAttrString(entry2, "path");
+	if (!rhs) {
+		Py_DECREF(lhs);
+		return 0;
+	}
+	if (PyString_Check(rhs))
+		order = strcmp(PyString_AS_STRING(lhs), PyString_AS_STRING(rhs));
+	else
+		PyErr_SetString(PyExc_TypeError, "path is not a string");
+
+	Py_DECREF(lhs);
+	Py_DECREF(rhs);
+	return order;
+}
+
+/**
+ * Merge the entries of two trees into a list of (entry1, entry2) pairs.
+ *
+ * Python signature: _merge_entries(path, tree1, tree2). Entries are paired up
+ * by path; where a path is present on one side only, the other half of the
+ * pair is null_entry.
+ *
+ * :return: A new list of 2-tuples, or NULL with a Python exception set.
+ */
+static PyObject *py_merge_entries(PyObject *self, PyObject *args)
+{
+	PyObject *path, *tree1, *tree2, **entries1 = NULL, **entries2 = NULL;
+	PyObject *e1, *e2, *pair, *result = NULL;
+	Py_ssize_t path_len, n1 = 0, n2 = 0, i1 = 0, i2 = 0;
+	char *path_str;
+	int cmp;
+
+	if (!PyArg_ParseTuple(args, "OOO", &path, &tree1, &tree2))
+		return NULL;
+
+	path_str = PyString_AsString(path);
+	if (!path_str) {
+		PyErr_SetString(PyExc_TypeError, "path is not a string");
+		return NULL;
+	}
+	path_len = PyString_GET_SIZE(path);
+
+	entries1 = tree_entries(path_str, path_len, tree1, &n1);
+	if (!entries1)
+		goto error;
+	entries2 = tree_entries(path_str, path_len, tree2, &n2);
+	if (!entries2)
+		goto error;
+
+	result = PyList_New(n1 + n2);
+	if (!result)
+		goto error;
+	/* PyList_New sets the len of the list, not its allocated size, so we
+	 * need to trim it to the size we actually use. */
+	Py_SIZE(result) = 0;
+
+	while (i1 < n1 && i2 < n2) {
+		cmp = entry_path_cmp(entries1[i1], entries2[i2]);
+		/* 0 is also a valid comparison result, so failure is signalled
+		 * through the error indicator only. */
+		if (PyErr_Occurred())
+			goto error;
+		if (!cmp) {
+			e1 = entries1[i1++];
+			e2 = entries2[i2++];
+		} else if (cmp < 0) {
+			e1 = entries1[i1++];
+			e2 = null_entry;
+		} else {
+			e1 = null_entry;
+			e2 = entries2[i2++];
+		}
+		pair = PyTuple_Pack(2, e1, e2);
+		if (!pair)
+			goto error;
+		PyList_SET_ITEM(result, Py_SIZE(result)++, pair);
+	}
+
+	/* Drain whichever side still has entries left. */
+	while (i1 < n1) {
+		pair = PyTuple_Pack(2, entries1[i1++], null_entry);
+		if (!pair)
+			goto error;
+		PyList_SET_ITEM(result, Py_SIZE(result)++, pair);
+	}
+	while (i2 < n2) {
+		pair = PyTuple_Pack(2, null_entry, entries2[i2++]);
+		if (!pair)
+			goto error;
+		PyList_SET_ITEM(result, Py_SIZE(result)++, pair);
+	}
+	goto done;
+
+error:
+	Py_XDECREF(result);
+	result = NULL;
+
+done:
+	/* A failed tree_entries() call leaves its array NULL while the matching
+	 * count may already be non-zero, so only release arrays we really got. */
+	if (entries1)
+		free_objects(entries1, n1);
+	if (entries2)
+		free_objects(entries2, n2);
+	return result;
+}
+
+/**
+ * Report whether a tree entry refers to a tree (directory).
+ *
+ * Python signature: _is_tree(entry).
+ *
+ * :return: False if entry.mode is None, otherwise the result of S_ISDIR on
+ *     the mode, as a new reference to a Python bool. NULL with an exception
+ *     set if the mode is missing or cannot be converted to an integer.
+ */
+static PyObject *py_is_tree(PyObject *self, PyObject *args)
+{
+	PyObject *entry, *mode, *result;
+	long lmode;
+
+	if (!PyArg_ParseTuple(args, "O", &entry))
+		return NULL;
+
+	mode = PyObject_GetAttrString(entry, "mode");
+	if (!mode)
+		return NULL;
+
+	if (mode == Py_None) {
+		/* Py_False is a borrowed singleton; take our own reference. */
+		result = Py_False;
+		Py_INCREF(result);
+	} else {
+		lmode = PyInt_AsLong(mode);
+		if (lmode == -1 && PyErr_Occurred()) {
+			Py_DECREF(mode);
+			return NULL;
+		}
+		/* PyBool_FromLong already returns a new reference, so it must not
+		 * be incremented a second time. */
+		result = PyBool_FromLong(S_ISDIR((mode_t)lmode));
+	}
+	Py_DECREF(mode);
+	return result;
+}
+
+/**
+ * Add n to the running byte count stored under the hash of one block.
+ *
+ * :param get: Callable invoked as get(hash) to read the current count.
+ * :param set: Callable invoked as set(hash, new_count) to store the total.
+ * :param str: The block's bytes.
+ * :param n: The number of bytes in str.
+ * :return: 0 on success, -1 with a Python exception set on failure.
+ */
+static int add_hash(PyObject *get, PyObject *set, char *str, int n)
+{
+	PyObject *str_obj = NULL, *hash_obj = NULL, *value = NULL,
+		*set_value = NULL;
+	long hash;
+
+	/* It would be nice to hash without copying str into a PyString, but that
+	 * isn't exposed by the API. */
+	str_obj = PyString_FromStringAndSize(str, n);
+	if (!str_obj)
+		goto error;
+	hash = PyObject_Hash(str_obj);
+	if (hash == -1)
+		goto error;
+	hash_obj = PyInt_FromLong(hash);
+	if (!hash_obj)
+		goto error;
+
+	value = PyObject_CallFunctionObjArgs(get, hash_obj, NULL);
+	if (!value)
+		goto error;
+	/* NOTE(review): PyInt_AS_LONG does no type checking, so this assumes
+	 * get() always returns a plain int -- confirm against the caller. */
+	set_value = PyObject_CallFunction(set, "(Ol)", hash_obj,
+		PyInt_AS_LONG(value) + n);
+	if (!set_value)
+		goto error;
+
+	Py_DECREF(str_obj);
+	Py_DECREF(hash_obj);
+	Py_DECREF(value);
+	Py_DECREF(set_value);
+	return 0;
+
+error:
+	Py_XDECREF(str_obj);
+	Py_XDECREF(hash_obj);
+	Py_XDECREF(value);
+	Py_XDECREF(set_value);
+	return -1;
+}
+
+/**
+ * Count the blocks in an object.
+ *
+ * Python signature: _count_blocks(obj). The chunks returned by
+ * obj.as_raw_chunks() are split into blocks that end at each newline or once
+ * block_size bytes have accumulated, whichever comes first; blocks may span
+ * chunk boundaries.
+ *
+ * :return: A new defaultdict(int) mapping block hash -> total bytes seen in
+ *     blocks with that hash, or NULL with a Python exception set.
+ */
+static PyObject *py_count_blocks(PyObject *self, PyObject *args)
+{
+	PyObject *obj, *chunks = NULL, *chunk, *counts = NULL, *get = NULL,
+		*set = NULL;
+	char *chunk_str, *block = NULL;
+	/* Indices share the type of the lengths they are compared against. */
+	Py_ssize_t num_chunks, chunk_len, i, j;
+	int n = 0;
+	char c;
+
+	if (!PyArg_ParseTuple(args, "O", &obj))
+		goto error;
+
+	counts = PyObject_CallFunctionObjArgs(defaultdict_cls, int_cls, NULL);
+	if (!counts)
+		goto error;
+	/* Both bound methods are handed to add_hash(), which calls them without
+	 * further checks, so a failed lookup has to be caught here. */
+	get = PyObject_GetAttrString(counts, "__getitem__");
+	if (!get)
+		goto error;
+	set = PyObject_GetAttrString(counts, "__setitem__");
+	if (!set)
+		goto error;
+
+	chunks = PyObject_CallMethod(obj, "as_raw_chunks", NULL);
+	if (!chunks)
+		goto error;
+	if (!PyList_Check(chunks)) {
+		PyErr_SetString(PyExc_TypeError,
+			"as_raw_chunks() did not return a list");
+		goto error;
+	}
+	num_chunks = PyList_GET_SIZE(chunks);
+	block = PyMem_New(char, block_size);
+	if (!block) {
+		PyErr_SetNone(PyExc_MemoryError);
+		goto error;
+	}
+
+	for (i = 0; i < num_chunks; i++) {
+		chunk = PyList_GET_ITEM(chunks, i);
+		if (!PyString_Check(chunk)) {
+			PyErr_SetString(PyExc_TypeError, "chunk is not a string");
+			goto error;
+		}
+		if (PyString_AsStringAndSize(chunk, &chunk_str, &chunk_len) == -1)
+			goto error;
+
+		for (j = 0; j < chunk_len; j++) {
+			c = chunk_str[j];
+			block[n++] = c;
+			if (c == '\n' || n == block_size) {
+				if (add_hash(get, set, block, n) == -1)
+					goto error;
+				n = 0;
+			}
+		}
+	}
+	/* Flush a trailing partial block that had no terminating newline. */
+	if (n && add_hash(get, set, block, n) == -1)
+		goto error;
+
+	Py_DECREF(chunks);
+	Py_DECREF(get);
+	Py_DECREF(set);
+	PyMem_Free(block);
+	return counts;
+
+error:
+	Py_XDECREF(chunks);
+	Py_XDECREF(get);
+	Py_XDECREF(set);
+	Py_XDECREF(counts);
+	PyMem_Free(block);
+	return NULL;
+}
+
+/* Method table for the _diff_tree module; every function takes positional
+ * arguments only. The all-NULL entry terminates the table. */
+static PyMethodDef py_diff_tree_methods[] = {
+	{ "_is_tree", (PyCFunction)py_is_tree, METH_VARARGS, NULL },
+	{ "_merge_entries", (PyCFunction)py_merge_entries, METH_VARARGS, NULL },
+	{ "_count_blocks", (PyCFunction)py_count_blocks, METH_VARARGS, NULL },
+	{ NULL, NULL, 0, NULL }
+};
+
+/**
+ * Module initializer for _diff_tree.
+ *
+ * Registers the method table and caches the Python-level objects used by the
+ * C functions: dulwich.objects.TreeEntry, and _NULL_ENTRY, _BLOCK_SIZE and
+ * defaultdict from dulwich.diff_tree, plus the builtin int type. On failure
+ * a Python exception is left set and the cached globals are reset to NULL.
+ */
+PyMODINIT_FUNC
+init_diff_tree(void)
+{
+	PyObject *m, *objects_mod = NULL, *diff_tree_mod = NULL;
+	PyObject *block_size_obj = NULL;
+
+	m = Py_InitModule("_diff_tree", py_diff_tree_methods);
+	if (!m)
+		goto error;
+
+	objects_mod = PyImport_ImportModule("dulwich.objects");
+	if (!objects_mod)
+		goto error;
+
+	tree_entry_cls = PyObject_GetAttrString(objects_mod, "TreeEntry");
+	/* The module is only needed for the lookup above; release it exactly
+	 * once, here, and never again further down. */
+	Py_DECREF(objects_mod);
+	objects_mod = NULL;
+	if (!tree_entry_cls)
+		goto error;
+
+	diff_tree_mod = PyImport_ImportModule("dulwich.diff_tree");
+	if (!diff_tree_mod)
+		goto error;
+
+	null_entry = PyObject_GetAttrString(diff_tree_mod, "_NULL_ENTRY");
+	if (!null_entry)
+		goto error;
+
+	block_size_obj = PyObject_GetAttrString(diff_tree_mod, "_BLOCK_SIZE");
+	if (!block_size_obj)
+		goto error;
+	block_size = (int)PyInt_AsLong(block_size_obj);
+
+	if (PyErr_Occurred())
+		goto error;
+	/* Only the converted C value is kept, so drop the Python object now. */
+	Py_DECREF(block_size_obj);
+	block_size_obj = NULL;
+
+	defaultdict_cls = PyObject_GetAttrString(diff_tree_mod, "defaultdict");
+	if (!defaultdict_cls)
+		goto error;
+
+	/* This is kind of hacky, but I don't know of a better way to get the
+	 * PyObject* version of int. */
+	int_cls = PyDict_GetItemString(PyEval_GetBuiltins(), "int");
+	if (!int_cls) {
+		PyErr_SetString(PyExc_NameError, "int");
+		goto error;
+	}
+
+	Py_DECREF(diff_tree_mod);
+	return;
+
+error:
+	Py_XDECREF(diff_tree_mod);
+	Py_XDECREF(block_size_obj);
+	Py_XDECREF(tree_entry_cls);
+	tree_entry_cls = NULL;
+	Py_XDECREF(null_entry);
+	null_entry = NULL;
+	Py_XDECREF(defaultdict_cls);
+	defaultdict_cls = NULL;
+	/* int_cls is a borrowed reference from the builtins dict, so it must
+	 * not be released here. */
+	int_cls = NULL;
+	return;
+}

+ 53 - 14
dulwich/_objects.c

@@ -36,6 +36,7 @@ size_t strnlen(char *text, size_t maxlen)
 #define bytehex(x) (((x)<0xa)?('0'+(x)):('a'-0xa+(x)))
 
 static PyObject *tree_entry_cls;
+static PyObject *object_format_exception_cls;
 
 static PyObject *sha_to_pyhex(const unsigned char *sha)
 {
@@ -49,17 +50,22 @@ static PyObject *sha_to_pyhex(const unsigned char *sha)
 	return PyString_FromStringAndSize(hexsha, 40);
 }
 
-static PyObject *py_parse_tree(PyObject *self, PyObject *args)
+static PyObject *py_parse_tree(PyObject *self, PyObject *args, PyObject *kw)
 {
 	char *text, *start, *end;
-	int len, namelen;
-	PyObject *ret, *item, *name;
+	int len, namelen, strict;
+	PyObject *ret, *item, *name, *py_strict = NULL;
+	static char *kwlist[] = {"text", "strict", NULL};
 
-	if (!PyArg_ParseTuple(args, "s#", &text, &len))
+	if (!PyArg_ParseTupleAndKeywords(args, kw, "s#|O", kwlist,
+	                                 &text, &len, &py_strict))
 		return NULL;
 
+
+	strict = py_strict ?  PyObject_IsTrue(py_strict) : 0;
+
 	/* TODO: currently this returns a list; if memory usage is a concern,
-	* consider rewriting as a custom iterator object */
+	 * consider rewriting as a custom iterator object */
 	ret = PyList_New(0);
 
 	if (ret == NULL) {
@@ -71,6 +77,13 @@ static PyObject *py_parse_tree(PyObject *self, PyObject *args)
 
 	while (text < end) {
 		long mode;
+		if (strict && text[0] == '0') {
+			PyErr_SetString(object_format_exception_cls,
+			                "Illegal leading zero on mode");
+			Py_DECREF(ret);
+			return NULL;
+		}
+
 		mode = strtol(text, &text, 8);
 
 		if (*text != ' ') {
@@ -97,7 +110,7 @@ static PyObject *py_parse_tree(PyObject *self, PyObject *args)
 		}
 
 		item = Py_BuildValue("(NlN)", name, mode,
-							 sha_to_pyhex((unsigned char *)text+namelen+1));
+		                     sha_to_pyhex((unsigned char *)text+namelen+1));
 		if (item == NULL) {
 			Py_DECREF(ret);
 			Py_DECREF(name);
@@ -146,18 +159,32 @@ int cmp_tree_item(const void *_a, const void *_b)
 	return strcmp(remain_a, remain_b);
 }
 
-static PyObject *py_sorted_tree_items(PyObject *self, PyObject *entries)
+int cmp_tree_item_name_order(const void *_a, const void *_b) {
+	const struct tree_item *a = _a, *b = _b;
+	return strcmp(a->name, b->name);
+}
+
+static PyObject *py_sorted_tree_items(PyObject *self, PyObject *args)
 {
 	struct tree_item *qsort_entries = NULL;
-	int num_entries, n = 0, i;
-	PyObject *ret, *key, *value, *py_mode, *py_sha;
+	int name_order, num_entries, n = 0, i;
+	PyObject *entries, *py_name_order, *ret, *key, *value, *py_mode, *py_sha;
 	Py_ssize_t pos = 0;
+	int (*cmp)(const void *, const void *);
+
+	if (!PyArg_ParseTuple(args, "OO", &entries, &py_name_order))
+		goto error;
 
 	if (!PyDict_Check(entries)) {
 		PyErr_SetString(PyExc_TypeError, "Argument not a dictionary");
 		goto error;
 	}
 
+	name_order = PyObject_IsTrue(py_name_order);
+	if (name_order == -1)
+		goto error;
+	cmp = name_order ? cmp_tree_item_name_order : cmp_tree_item;
+
 	num_entries = PyDict_Size(entries);
 	if (PyErr_Occurred())
 		goto error;
@@ -193,13 +220,13 @@ static PyObject *py_sorted_tree_items(PyObject *self, PyObject *entries)
 		qsort_entries[n].mode = PyInt_AS_LONG(py_mode);
 
 		qsort_entries[n].tuple = PyObject_CallFunctionObjArgs(
-				tree_entry_cls, key, py_mode, py_sha, NULL);
+		                tree_entry_cls, key, py_mode, py_sha, NULL);
 		if (qsort_entries[n].tuple == NULL)
 			goto error;
 		n++;
 	}
 
-	qsort(qsort_entries, num_entries, sizeof(struct tree_item), cmp_tree_item);
+	qsort(qsort_entries, num_entries, sizeof(struct tree_item), cmp);
 
 	ret = PyList_New(num_entries);
 	if (ret == NULL) {
@@ -222,20 +249,32 @@ error:
 }
 
 static PyMethodDef py_objects_methods[] = {
-	{ "parse_tree", (PyCFunction)py_parse_tree, METH_VARARGS, NULL },
-	{ "sorted_tree_items", (PyCFunction)py_sorted_tree_items, METH_O, NULL },
+	{ "parse_tree", (PyCFunction)py_parse_tree, METH_VARARGS | METH_KEYWORDS,
+	  NULL },
+	{ "sorted_tree_items", py_sorted_tree_items, METH_VARARGS, NULL },
 	{ NULL, NULL, 0, NULL }
 };
 
 PyMODINIT_FUNC
 init_objects(void)
 {
-	PyObject *m, *objects_mod;
+	PyObject *m, *objects_mod, *errors_mod;
 
 	m = Py_InitModule3("_objects", py_objects_methods, NULL);
 	if (m == NULL)
 		return;
 
+
+	errors_mod = PyImport_ImportModule("dulwich.errors");
+	if (errors_mod == NULL)
+		return;
+
+	object_format_exception_cls = PyObject_GetAttrString(
+		errors_mod, "ObjectFormatException");
+	Py_DECREF(errors_mod);
+	if (object_format_exception_cls == NULL)
+		return;
+
 	/* This is a circular import but should be safe since this module is
 	 * imported at the very bottom of objects.py. */
 	objects_mod = PyImport_ImportModule("dulwich.objects");

+ 20 - 7
dulwich/client.py

@@ -24,6 +24,7 @@ __docformat__ = 'restructuredText'
 import select
 import socket
 import subprocess
+import urlparse
 
 from dulwich.errors import (
     SendPackError,
@@ -358,11 +359,23 @@ def get_transport_and_path(uri):
     :param uri: URI or path
     :return: Tuple with client instance and relative path.
     """
-    from dulwich.client import TCPGitClient, SSHGitClient, SubprocessGitClient
-    for handler, transport in (("git://", TCPGitClient), ("git+ssh://", SSHGitClient)):
-        if uri.startswith(handler):
-            host, path = uri[len(handler):].split("/", 1)
-            return transport(host), "/"+path
-    # FIXME: Parse rsync-like git URLs (user@host:/path), bug 568493
-    # if its not git or git+ssh, try a local url..
+    parsed = urlparse.urlparse(uri)
+    if parsed.scheme == 'git':
+        return TCPGitClient(parsed.hostname, port=parsed.port), parsed.path
+    elif parsed.scheme == 'git+ssh':
+        return SSHGitClient(parsed.hostname, port=parsed.port,
+                            username=parsed.username), parsed.path
+
+    if parsed.scheme and not parsed.netloc:
+        # SSH with no user@, zero or one leading slash.
+        return SSHGitClient(parsed.scheme), parsed.path
+    elif parsed.scheme:
+        raise ValueError('Unknown git protocol scheme: %s' % parsed.scheme)
+    elif '@' in parsed.path and ':' in parsed.path:
+        # SSH with user@host:foo.
+        user_host, path = parsed.path.split(':')
+        user, host = user_host.rsplit('@')
+        return SSHGitClient(host, username=user), path
+
+    # Otherwise, assume it's a local path.
     return SubprocessGitClient(), uri

+ 495 - 0
dulwich/diff_tree.py

@@ -0,0 +1,495 @@
+# diff_tree.py -- Utilities for diffing files and trees.
+# Copyright (C) 2010 Google, Inc.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# or (at your option) a later version of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+# MA  02110-1301, USA.
+
+"""Utilities for diffing files and trees."""
+
+from cStringIO import StringIO
+import itertools
+import stat
+
+from dulwich._compat import (
+    defaultdict,
+    TreeChangeTuple,
+    )
+from dulwich.objects import (
+    S_ISGITLINK,
+    TreeEntry,
+    )
+
+# TreeChange type constants.
+CHANGE_ADD = 'add'
+CHANGE_MODIFY = 'modify'
+CHANGE_DELETE = 'delete'
+CHANGE_RENAME = 'rename'
+CHANGE_COPY = 'copy'
+CHANGE_UNCHANGED = 'unchanged'
+
+_NULL_ENTRY = TreeEntry(None, None, None)
+
+_MAX_SCORE = 100
+_RENAME_THRESHOLD = 60
+_MAX_FILES = 200
+_REWRITE_THRESHOLD = None
+
+
+class TreeChange(TreeChangeTuple):
+    """A single change between two trees, as a (type, old, new) tuple."""
+
+    @classmethod
+    def add(cls, new):
+        """Build a CHANGE_ADD change whose old side is the null entry."""
+        return cls(type=CHANGE_ADD, old=_NULL_ENTRY, new=new)
+
+    @classmethod
+    def delete(cls, old):
+        """Build a CHANGE_DELETE change whose new side is the null entry."""
+        return cls(type=CHANGE_DELETE, old=old, new=_NULL_ENTRY)
+
+
+def _tree_entries(path, tree):
+    """List the entries of tree in name order, each re-rooted under path.
+
+    :param path: The path handed to each entry's in_path().
+    :param tree: The Tree to iterate; any falsy value yields an empty list.
+    :return: A new list of entries.
+    """
+    if not tree:
+        return []
+    return [entry.in_path(path) for entry in tree.iteritems(name_order=True)]
+
+
+def _merge_entries(path, tree1, tree2):
+    """Pair up the entries of two trees by path.
+
+    :param path: A path to prepend to all tree entry names.
+    :param tree1: The first Tree object to iterate, or None.
+    :param tree2: The second Tree object to iterate, or None.
+    :return: A list of (entry1, entry2) pairs of TreeEntry objects. Where a
+        path exists in only one tree, the other half of the pair is
+        _NULL_ENTRY, whose attributes are all None. When neither path is
+        None, the two paths are equal.
+    """
+    left = _tree_entries(path, tree1)
+    right = _tree_entries(path, tree2)
+    left_len, right_len = len(left), len(right)
+    li = ri = 0
+
+    merged = []
+    # Classic two-pointer merge over the two name-ordered entry lists.
+    while li < left_len and ri < right_len:
+        a, b = left[li], right[ri]
+        if a.path < b.path:
+            pair = (a, _NULL_ENTRY)
+            li += 1
+        elif a.path > b.path:
+            pair = (_NULL_ENTRY, b)
+            ri += 1
+        else:
+            pair = (a, b)
+            li += 1
+            ri += 1
+        merged.append(pair)
+    # At most one side still has entries; they have no counterpart.
+    merged.extend([(rest, _NULL_ENTRY) for rest in left[li:]])
+    merged.extend([(_NULL_ENTRY, rest) for rest in right[ri:]])
+    return merged
+
+
+def _is_tree(entry):
+    """Return True if entry has a mode and that mode is a directory mode."""
+    mode = entry.mode
+    return mode is not None and stat.S_ISDIR(mode)
+
+
+def walk_trees(store, tree1_id, tree2_id, prune_identical=False):
+    """Recursively walk all the entries of two trees.
+
+    Iteration is depth-first pre-order, as in e.g. os.walk.
+
+    :param store: An ObjectStore for looking up objects.
+    :param tree1_id: The SHA of the first Tree object to iterate, or None.
+    :param tree2_id: The SHA of the second Tree object to iterate, or None.
+    :param prune_identical: If True, identical subtrees will not be walked.
+    :return: Iterator over Pairs of TreeEntry objects for each pair of entries
+        in the trees and their subtrees recursively. If an entry exists in one
+        tree but not the other, the other entry will have all attributes set
+        to None. If neither entry's path is None, they are guaranteed to
+        match.
+    """
+    # This could be fairly easily generalized to >2 trees if we find a use case.
+    # A falsy tree id gets a None mode, so _is_tree() reports that root as
+    # not being a tree.
+    mode1 = tree1_id and stat.S_IFDIR or None
+    mode2 = tree2_id and stat.S_IFDIR or None
+    todo = [(TreeEntry('', mode1, tree1_id), TreeEntry('', mode2, tree2_id))]
+    while todo:
+        entry1, entry2 = todo.pop()
+        is_tree1 = _is_tree(entry1)
+        is_tree2 = _is_tree(entry2)
+        # Identical tree pairs are skipped entirely: neither yielded nor
+        # descended into.
+        if prune_identical and is_tree1 and is_tree2 and entry1 == entry2:
+            continue
+
+        tree1 = is_tree1 and store[entry1.sha] or None
+        tree2 = is_tree2 and store[entry2.sha] or None
+        path = entry1.path or entry2.path
+        # todo is a stack, so push children reversed to pop them in order.
+        todo.extend(reversed(_merge_entries(path, tree1, tree2)))
+        yield entry1, entry2
+
+
+def _skip_tree(entry):
+    """Return entry, or _NULL_ENTRY if it has no mode or is a directory."""
+    mode = entry.mode
+    if mode is not None and not stat.S_ISDIR(mode):
+        return entry
+    return _NULL_ENTRY
+
+
+def tree_changes(store, tree1_id, tree2_id, want_unchanged=False):
+    """Find the differences between the contents of two trees.
+
+    :param store: An ObjectStore for looking up objects.
+    :param tree1_id: The SHA of the source tree.
+    :param tree2_id: The SHA of the target tree.
+    :param want_unchanged: If True, include TreeChanges for unmodified entries
+        as well.
+    :return: Iterator over TreeChange instances for each change between the
+        source and target tree.
+    """
+    # Identical subtrees can only be pruned when the caller does not want to
+    # see the unchanged entries inside them.
+    entries = walk_trees(store, tree1_id, tree2_id,
+                         prune_identical=(not want_unchanged))
+    for entry1, entry2 in entries:
+        if entry1 == entry2 and not want_unchanged:
+            continue
+
+        # Treat entries for trees as missing.
+        entry1 = _skip_tree(entry1)
+        entry2 = _skip_tree(entry2)
+
+        if entry1 != _NULL_ENTRY and entry2 != _NULL_ENTRY:
+            if stat.S_IFMT(entry1.mode) != stat.S_IFMT(entry2.mode):
+                # File type changed: report as delete/add.
+                # The delete is yielded here; the add is yielded by the
+                # shared yield at the bottom of the loop.
+                yield TreeChange.delete(entry1)
+                entry1 = _NULL_ENTRY
+                change_type = CHANGE_ADD
+            elif entry1 == entry2:
+                change_type = CHANGE_UNCHANGED
+            else:
+                change_type = CHANGE_MODIFY
+        elif entry1 != _NULL_ENTRY:
+            change_type = CHANGE_DELETE
+        elif entry2 != _NULL_ENTRY:
+            change_type = CHANGE_ADD
+        else:
+            # Both were None because at least one was a tree.
+            continue
+        yield TreeChange(change_type, entry1, entry2)
+
+
+_BLOCK_SIZE = 64
+
+
+def _count_blocks(obj):
+    """Count the blocks in an object.
+
+    Splits the data into blocks either on lines or <=64-byte chunks of lines.
+
+    :param obj: The object to count blocks for.
+    :return: A dict of block hashcode -> total bytes occurring.
+    """
+    block_counts = defaultdict(int)
+    block = StringIO()
+    n = 0
+
+    # Cache attrs as locals to avoid expensive lookups in the inner loop.
+    block_write = block.write
+    block_seek = block.seek
+    block_truncate = block.truncate
+    block_getvalue = block.getvalue
+
+    # Chaining the chunks iterates them one character at a time, so a block
+    # can span chunk boundaries.
+    for c in itertools.chain(*obj.as_raw_chunks()):
+        block_write(c)
+        n += 1
+        if c == '\n' or n == _BLOCK_SIZE:
+            value = block_getvalue()
+            block_counts[hash(value)] += len(value)
+            # Rewind and empty the buffer so it can be reused for the next
+            # block.
+            block_seek(0)
+            block_truncate()
+            n = 0
+    # Account for a trailing partial block with no terminating newline.
+    if n > 0:
+        last_block = block_getvalue()
+        block_counts[hash(last_block)] += len(last_block)
+    return block_counts
+
+
+def _common_bytes(blocks1, blocks2):
+    """Count the bytes that two block count dicts have in common.
+
+    :param blocks1: The first dict of block hashcode -> total bytes.
+    :param blocks2: The second dict of block hashcode -> total bytes.
+    :return: The number of bytes in common between blocks1 and blocks2. This is
+        only approximate due to possible hash collisions.
+    """
+    # The measure is symmetrical, so walk whichever dict is smaller.
+    smaller, larger = blocks1, blocks2
+    if len(smaller) > len(larger):
+        smaller, larger = larger, smaller
+    lookup = larger.get
+    total = 0
+    for block_hash, small_count in smaller.iteritems():
+        large_count = lookup(block_hash)
+        if not large_count:
+            continue
+        total += min(small_count, large_count)
+    return total
+
+
+def _similarity_score(obj1, obj2, block_cache=None):
+    """Score how similar two objects are.
+
+    :param obj1: The first object to score.
+    :param obj2: The second object to score.
+    :param block_cache: An optional dict of SHA to block counts, used to
+        reuse block counts between calls; missing entries are filled in.
+    :return: The number of bytes the two objects have in common divided by
+        the larger of their raw lengths, scaled to the range 0-100 and
+        truncated to an int. Two zero-length objects score _MAX_SCORE.
+    """
+    if block_cache is None:
+        block_cache = {}
+    for obj in (obj1, obj2):
+        if obj.id not in block_cache:
+            block_cache[obj.id] = _count_blocks(obj)
+
+    shared = _common_bytes(block_cache[obj1.id], block_cache[obj2.id])
+    largest = max(obj1.raw_length(), obj2.raw_length())
+    if largest:
+        return int(float(shared) * _MAX_SCORE / largest)
+    return _MAX_SCORE
+
+
+def _tree_change_key(entry):
+    """Return the (old path, new path) sort key for a change.
+
+    When only one of the two paths exists, it is used for both positions.
+    """
+    old_path, new_path = entry.old.path, entry.new.path
+    if old_path is None:
+        return (new_path, new_path)
+    if new_path is None:
+        return (old_path, old_path)
+    return (old_path, new_path)
+
+
+class RenameDetector(object):
+    """Object for handling rename detection between two trees."""
+
+    def __init__(self, store, tree1_id, tree2_id,
+                 rename_threshold=_RENAME_THRESHOLD, max_files=_MAX_FILES,
+                 rewrite_threshold=_REWRITE_THRESHOLD,
+                 find_copies_harder=False):
+        """Initialize the rename detector.
+
+        :param store: An ObjectStore for looking up objects.
+        :param tree1_id: The SHA of the first Tree.
+        :param tree2_id: The SHA of the second Tree.
+        :param rename_threshold: The threshold similarity score for considering
+            an add/delete pair to be a rename/copy; see _similarity_score.
+        :param max_files: The maximum number of adds and deletes to consider, or
+            None for no limit. The detector is guaranteed to compare no more
+            than max_files ** 2 add/delete pairs. This limit is provided because
+            rename detection can be quadratic in the project size. If the limit
+            is exceeded, no content rename detection is attempted.
+        :param rewrite_threshold: The threshold similarity score below which a
+            modify should be considered a delete/add, or None to not break
+            modifies; see _similarity_score.
+        :param find_copies_harder: If True, consider unmodified files when
+            detecting copies.
+        """
+        self._tree1_id = tree1_id
+        self._tree2_id = tree2_id
+        self._store = store
+        self._rename_threshold = rename_threshold
+        self._rewrite_threshold = rewrite_threshold
+        self._max_files = max_files
+        self._find_copies_harder = find_copies_harder
+
+        self._adds = []
+        self._deletes = []
+        self._changes = []
+
+    def _should_split(self, change):
+        if (self._rewrite_threshold is None or change.type != CHANGE_MODIFY or
+            change.old.sha == change.new.sha):
+            return False
+        old_obj = self._store[change.old.sha]
+        new_obj = self._store[change.new.sha]
+        return _similarity_score(old_obj, new_obj) < self._rewrite_threshold
+
+    def _collect_changes(self):
+        for change in tree_changes(self._store, self._tree1_id, self._tree2_id,
+                                   want_unchanged=self._find_copies_harder):
+            if change.type == CHANGE_ADD:
+                self._adds.append(change)
+            elif change.type == CHANGE_DELETE:
+                self._deletes.append(change)
+            elif self._should_split(change):
+                self._deletes.append(TreeChange.delete(change.old))
+                self._adds.append(TreeChange.add(change.new))
+            elif (self._find_copies_harder and (
+              change.type == CHANGE_MODIFY or change.type == CHANGE_UNCHANGED)):
+                # Treat modified/unchanged as deleted rather than splitting it,
+                # to avoid spurious renames.
+                self._deletes.append(change)
+            else:
+                self._changes.append(change)
+
+    def _prune(self, add_paths, delete_paths):
+        self._adds = [a for a in self._adds if a.new.path not in add_paths]
+        self._deletes = [d for d in self._deletes
+                         if d.old.path not in delete_paths]
+
+    def _find_exact_renames(self):
+        add_map = defaultdict(list)
+        for add in self._adds:
+            add_map[add.new.sha].append(add.new)
+        delete_map = defaultdict(list)
+        for delete in self._deletes:
+            # Keep track of whether the delete was actually marked as a delete.
+            # If not, it must have been added due to find_copies_harder, and
+            # needs to be marked as a copy.
+            is_delete = delete.type == CHANGE_DELETE
+            delete_map[delete.old.sha].append((delete.old, is_delete))
+
+        add_paths = set()
+        delete_paths = set()
+        for sha, sha_deletes in delete_map.iteritems():
+            sha_adds = add_map[sha]
+            for (old, is_delete), new in itertools.izip(sha_deletes, sha_adds):
+                if stat.S_IFMT(old.mode) != stat.S_IFMT(new.mode):
+                    continue
+                delete_paths.add(old.path)
+                add_paths.add(new.path)
+                new_type = is_delete and CHANGE_RENAME or CHANGE_COPY
+                self._changes.append(TreeChange(new_type, old, new))
+
+            num_extra_adds = len(sha_adds) - len(sha_deletes)
+            # TODO(dborowitz): Less arbitrary way of dealing with extra copies.
+            old = sha_deletes[0][0]
+            if num_extra_adds:
+                for new in sha_adds[-num_extra_adds:]:
+                    add_paths.add(new.path)
+                    self._changes.append(TreeChange(CHANGE_COPY, old, new))
+        self._prune(add_paths, delete_paths)
+
+    def _find_content_renames(self):
+        # TODO: Optimizations:
+        #  - Compare object sizes before counting blocks.
+        #  - Skip if delete's S_IFMT differs from all adds.
+        #  - Skip if adds or deletes is empty.
+        # Match C git's behavior of not attempting to find content renames if
+        # the matrix size exceeds the threshold.
+        if len(self._adds) * len(self._deletes) > self._max_files ** 2:
+            return
+
+        check_paths = self._rename_threshold is not None
+        candidates = []
+        for delete in self._deletes:
+            if S_ISGITLINK(delete.old.mode):
+                continue  # Git links don't exist in this repo.
+            old_sha = delete.old.sha
+            old_obj = self._store[old_sha]
+            old_blocks = _count_blocks(old_obj)
+            for add in self._adds:
+                if stat.S_IFMT(delete.old.mode) != stat.S_IFMT(add.new.mode):
+                    continue
+                new_obj = self._store[add.new.sha]
+                score = _similarity_score(old_obj, new_obj,
+                                          block_cache={old_sha: old_blocks})
+                if score > self._rename_threshold:
+                    if check_paths and delete.old.path == add.new.path:
+                        # If the paths match, this must be a split modify, so
+                        # make sure it comes out as a modify.
+                        new_type = CHANGE_MODIFY
+                    elif delete.type != CHANGE_DELETE:
+                        # If it's in deletes but not marked as a delete, it must
+                        # have been added due to find_copies_harder, and needs
+                        # to be marked as a copy.
+                        new_type = CHANGE_COPY
+                    else:
+                        new_type = CHANGE_RENAME
+                    rename = TreeChange(new_type, delete.old, add.new)
+                    candidates.append((-score, rename))
+
+        # Sort scores from highest to lowest, but keep names in ascending order.
+        candidates.sort()
+
+        delete_paths = set()
+        add_paths = set()
+        for _, change in candidates:
+            new_path = change.new.path
+            if new_path in add_paths:
+                continue
+            old_path = change.old.path
+            orig_type = change.type
+            if old_path in delete_paths:
+                change = TreeChange(CHANGE_COPY, change.old, change.new)
+
+            # If the candidate was originally a copy, that means it came from a
+            # modified or unchanged path, so we don't want to prune it.
+            if orig_type != CHANGE_COPY:
+                delete_paths.add(old_path)
+            add_paths.add(new_path)
+            self._changes.append(change)
+        self._prune(add_paths, delete_paths)
+
+    def _join_modifies(self):
+        if self._rewrite_threshold is None:
+            return
+
+        modifies = {}
+        delete_map = dict((d.old.path, d) for d in self._deletes)
+        for add in self._adds:
+            path = add.new.path
+            delete = delete_map.get(path)
+            if (delete is not None and
+              stat.S_IFMT(delete.old.mode) == stat.S_IFMT(add.new.mode)):
+                modifies[path] = TreeChange(CHANGE_MODIFY, delete.old, add.new)
+
+        self._adds = [a for a in self._adds if a.new.path not in modifies]
+        self._deletes = [a for a in self._deletes if a.new.path not in modifies]
+        self._changes += modifies.values()
+
+    def _sorted_changes(self):
+        result = []
+        result.extend(self._adds)
+        result.extend(self._deletes)
+        result.extend(self._changes)
+        result.sort(key=_tree_change_key)
+        return result
+
+    def _prune_unchanged(self):
+        self._deletes = [d for d in self._deletes if d.type != CHANGE_UNCHANGED]
+
+    def changes_with_renames(self):
+        """Iterate TreeChanges between the two trees, with rename detection."""
+        self._collect_changes()
+        self._find_exact_renames()
+        self._find_content_renames()
+        self._join_modifies()
+        self._prune_unchanged()
+        return self._sorted_changes()
+
+
+# Hold on to the pure-python implementations for testing, so they stay
+# reachable under the *_py names even if the import below rebinds the
+# original names.
+_is_tree_py = _is_tree
+_merge_entries_py = _merge_entries
+_count_blocks_py = _count_blocks
+try:
+    # Try to import C versions; on success they rebind the three module-level
+    # names saved above.
+    from dulwich._diff_tree import _is_tree, _merge_entries, _count_blocks
+except ImportError:
+    # The C module could not be imported: keep the pure-python versions.
+    pass

+ 4 - 0
dulwich/errors.py

@@ -166,3 +166,7 @@ class NoIndexPresent(Exception):
 
 class CommitError(Exception):
     """An error occurred while performing a commit."""
+
+
+class RefFormatError(Exception):
+    """Indicates an invalid ref name."""

+ 36 - 3
dulwich/fastexport.py

@@ -117,6 +117,7 @@ class GitImportProcessor(processor.ImportProcessor):
         self.repo = repo
         self.last_commit = None
         self.markers = {}
+        self._contents = {}
 
     def import_stream(self, stream):
         p = parser.ImportParser(stream)
@@ -151,10 +152,32 @@ class GitImportProcessor(processor.ImportProcessor):
         commit.commit_time = int(commit_timestamp)
         commit.message = cmd.message
         commit.parents = []
-        contents = {}
+        if cmd.from_:
+            self._reset_base(cmd.from_)
+        for filecmd in cmd.iter_files():
+            if filecmd.name == "filemodify":
+                if filecmd.data is not None:
+                    blob = Blob.from_string(filecmd.data)
+                    self.repo.object_store.add(blob)
+                    blob_id = blob.id
+                else:
+                    assert filecmd.dataref[0] == ":", "non-marker refs not supported yet"
+                    blob_id = self.markers[filecmd.dataref[1:]]
+                self._contents[filecmd.path] = (filecmd.mode, blob_id)
+            elif filecmd.name == "filedelete":
+                del self._contents[filecmd.path]
+            elif filecmd.name == "filecopy":
+                self._contents[filecmd.dest_path] = self._contents[filecmd.src_path]
+            elif filecmd.name == "filerename":
+                self._contents[filecmd.new_path] = self._contents[filecmd.old_path]
+                del self._contents[filecmd.old_path]
+            elif filecmd.name == "filedeleteall":
+                self._contents = {}
+            else:
+                raise Exception("Command %s not supported" % filecmd.name)
         commit.tree = commit_tree(self.repo.object_store,
             ((path, hexsha, mode) for (path, (mode, hexsha)) in
-                contents.iteritems()))
+                self._contents.iteritems()))
         if self.last_commit is not None:
             commit.parents.append(self.last_commit)
         commit.parents += cmd.merges
@@ -168,9 +191,19 @@ class GitImportProcessor(processor.ImportProcessor):
         """Process a ProgressCommand."""
         pass
 
+    def _reset_base(self, commit_id):
+        if self.last_commit == commit_id:
+            return
+        self.last_commit = commit_id
+        self._contents = {}
+        tree_id = self.repo[commit_id].tree
+        for (path, mode, hexsha) in (
+                self.repo.object_store.iter_tree_contents(tree_id)):
+            self._contents[path] = (mode, hexsha)
+
     def reset_handler(self, cmd):
         """Process a ResetCommand."""
-        self.last_commit = cmd.from_
+        self._reset_base(cmd.from_)
         self.rep.refs[cmd.from_] = cmd.id
 
     def tag_handler(self, cmd):

+ 1 - 16
dulwich/file.py

@@ -71,22 +71,7 @@ def GitFile(filename, mode='rb', bufsize=-1):
     Only read-only and write-only (binary) modes are supported; r+, w+, and a
     are not.  To read and write from the same file, you can take advantage of
     the fact that opening a file for write does not actually open the file you
-    request:
-
-    >>> write_file = GitFile('filename', 'wb')
-    >>> read_file = GitFile('filename', 'rb')
-    >>> read_file.readlines()
-    ['contents\n', 'of\n', 'the\n', 'file\n']
-    >>> write_file.write('foo')
-    >>> read_file.close()
-    >>> write_file.close()
-    >>> new_file = GitFile('filename', 'rb')
-    'foo'
-    >>> new_file.close()
-    >>> other_file = GitFile('filename', 'wb')
-    Traceback (most recent call last):
-        ...
-    OSError: [Errno 17] File exists: 'filename.lock'
+    request.
     """
     if 'a' in mode:
         raise IOError('append mode not supported for Git files')

+ 29 - 63
dulwich/object_store.py

@@ -23,11 +23,14 @@
 import errno
 import itertools
 import os
-import posixpath
 import stat
 import tempfile
 import urllib2
 
+from dulwich.diff_tree import (
+    tree_changes,
+    walk_trees,
+    )
 from dulwich.errors import (
     NotTreeError,
     )
@@ -129,52 +132,14 @@ class BaseObjectStore(object):
         :param object_store: Object store to use for retrieving tree contents
         :param tree: SHA1 of the root tree
         :param want_unchanged: Whether unchanged files should be reported
-        :return: Iterator over tuples with (oldpath, newpath), (oldmode, newmode), (oldsha, newsha)
+        :return: Iterator over tuples with
+            (oldpath, newpath), (oldmode, newmode), (oldsha, newsha)
         """
-        todo = set([(source, target, "")])
-        while todo:
-            (sid, tid, path) = todo.pop()
-            if sid is not None:
-                stree = self[sid]
-            else:
-                stree = {}
-            if tid is not None:
-                ttree = self[tid]
-            else:
-                ttree = {}
-            for name, oldmode, oldhexsha in stree.iteritems():
-                oldchildpath = posixpath.join(path, name)
-                try:
-                    (newmode, newhexsha) = ttree[name]
-                    newchildpath = oldchildpath
-                except KeyError:
-                    newmode = None
-                    newhexsha = None
-                    newchildpath = None
-                if (want_unchanged or oldmode != newmode or
-                    oldhexsha != newhexsha):
-                    if stat.S_ISDIR(oldmode):
-                        if newmode is None or stat.S_ISDIR(newmode):
-                            todo.add((oldhexsha, newhexsha, oldchildpath))
-                        else:
-                            # entry became a file
-                            todo.add((oldhexsha, None, oldchildpath))
-                            yield ((None, newchildpath), (None, newmode), (None, newhexsha))
-                    else:
-                        if newmode is not None and stat.S_ISDIR(newmode):
-                            # entry became a dir
-                            yield ((oldchildpath, None), (oldmode, None), (oldhexsha, None))
-                            todo.add((None, newhexsha, newchildpath))
-                        else:
-                            yield ((oldchildpath, newchildpath), (oldmode, newmode), (oldhexsha, newhexsha))
-
-            for name, newmode, newhexsha in ttree.iteritems():
-                childpath = posixpath.join(path, name)
-                if not name in stree:
-                    if not stat.S_ISDIR(newmode):
-                        yield ((None, childpath), (None, newmode), (None, newhexsha))
-                    else:
-                        todo.add((None, newhexsha, childpath))
+        for change in tree_changes(self, source, target,
+                                   want_unchanged=want_unchanged):
+            yield ((change.old.path, change.new.path),
+                   (change.old.mode, change.new.mode),
+                   (change.old.sha, change.new.sha))
 
     def iter_tree_contents(self, tree_id, include_trees=False):
         """Iterate the contents of a tree and all subtrees.
@@ -183,19 +148,12 @@ class BaseObjectStore(object):
 
         :param tree_id: SHA1 of the tree.
         :param include_trees: If True, include tree objects in the iteration.
-        :return: Yields tuples of (path, mode, hexhsa) for objects in a tree.
+        :return: Iterator over TreeEntry namedtuples for all the objects in a
+            tree.
         """
-        todo = [('', stat.S_IFDIR, tree_id)]
-        while todo:
-            path, mode, hexsha = todo.pop()
-            is_subtree = stat.S_ISDIR(mode)
-            if not is_subtree or include_trees:
-                yield path, mode, hexsha
-            if is_subtree:
-                entries = reversed(list(self[hexsha].iteritems()))
-                for name, entry_mode, entry_hexsha in entries:
-                    entry_path = posixpath.join(path, name)
-                    todo.append((entry_path, entry_mode, entry_hexsha))
+        for entry, _ in walk_trees(self, tree_id, None):
+            if not stat.S_ISDIR(entry.mode) or include_trees:
+                yield entry
 
     def find_missing_objects(self, haves, wants, progress=None,
                              get_tagged=None):
@@ -338,7 +296,7 @@ class PackBasedObjectStore(BaseObjectStore):
             sha = name
             hexsha = None
         else:
-            raise AssertionError
+            raise AssertionError("Invalid object name %r" % name)
         for pack in self.packs:
             try:
                 return pack.get_raw(sha)
@@ -443,10 +401,14 @@ class DiskObjectStore(PackBasedObjectStore):
         data.create_index_v2(temppath)
         p = Pack.from_objects(data, load_pack_index(temppath))
 
-        # Write a full pack version
-        temppath = os.path.join(self.pack_dir,
-            sha_to_hex(urllib2.randombytes(20))+".temppack")
-        write_pack(temppath, ((o, None) for o in p.iterobjects()), len(p))
+        try:
+            # Write a full pack version
+            temppath = os.path.join(self.pack_dir,
+                sha_to_hex(urllib2.randombytes(20))+".temppack")
+            write_pack(temppath, ((o, None) for o in p.iterobjects()), len(p))
+        finally:
+            p.close()
+
         pack_sha = load_pack_index(temppath+".idx").objects_sha1()
         newbasename = os.path.join(self.pack_dir, "pack-%s" % pack_sha)
         os.rename(temppath+".pack", newbasename+".pack")
@@ -579,6 +541,10 @@ class MemoryObjectStore(BaseObjectStore):
     def __getitem__(self, name):
         return self._data[name]
 
+    def __delitem__(self, name):
+        """Delete an object from this store, for testing only."""
+        del self._data[name]
+
     def add_object(self, obj):
         """Add a single object to this object store.
 

+ 32 - 20
dulwich/objects.py

@@ -38,7 +38,7 @@ from dulwich.errors import (
     ObjectFormatException,
     )
 from dulwich.file import GitFile
-from dulwich.misc import (
+from dulwich._compat import (
     make_sha,
     TreeEntryTuple,
     )
@@ -143,7 +143,7 @@ def check_identity(identity, error_msg):
     """Check if the specified identity is valid.
 
     This will raise an exception if the identity is not valid.
-    
+
     :param identity: Identity string
     :param error_msg: Error message to use in exception
     """
@@ -175,7 +175,7 @@ class FixedSha(object):
 class ShaFile(object):
     """A git SHA file."""
 
-    __slots__ = ('_needs_parsing', '_chunked_text', '_file', '_path', 
+    __slots__ = ('_needs_parsing', '_chunked_text', '_file', '_path',
                  '_sha', '_needs_serialization', '_magic')
 
     @staticmethod
@@ -564,7 +564,7 @@ class Tag(ShaFile):
     type_name = 'tag'
     type_num = 4
 
-    __slots__ = ('_tag_timezone_neg_utc', '_name', '_object_sha', 
+    __slots__ = ('_tag_timezone_neg_utc', '_name', '_object_sha',
                  '_object_class', '_tag_time', '_tag_timezone',
                  '_tagger', '_message')
 
@@ -694,18 +694,20 @@ class TreeEntry(TreeEntryTuple):
         return TreeEntry(posixpath.join(path, self.path), self.mode, self.sha)
 
 
-def parse_tree(text):
+def parse_tree(text, strict=False):
     """Parse a tree text.
 
     :param text: Serialized text to parse
     :return: iterator of tuples of (name, mode, sha)
+    :raise ObjectFormatException: if the object was malformed in some way
     """
     count = 0
     l = len(text)
     while count < l:
         mode_end = text.index(' ', count)
         mode_text = text[count:mode_end]
-        assert mode_text[0] != '0'
+        if strict and mode_text.startswith('0'):
+            raise ObjectFormatException("Invalid mode '%s'" % mode_text)
         try:
             mode = int(mode_text, 8)
         except ValueError:
@@ -730,14 +732,17 @@ def serialize_tree(items):
         yield "%04o %s\0%s" % (mode, name, hex_to_sha(hexsha))
 
 
-def sorted_tree_items(entries):
-    """Iterate over a tree entries dictionary in the order in which 
-    the items would be serialized.
+def sorted_tree_items(entries, name_order):
+    """Iterate over a tree entries dictionary.
 
+    :param name_order: If True, iterate entries in order of their name. If
+        False, iterate entries in tree order, that is, treat subtree entries as
+        having '/' appended.
     :param entries: Dictionary mapping names to (mode, sha) tuples
     :return: Iterator over (name, mode, hexsha)
     """
-    for name, entry in sorted(entries.iteritems(), cmp=cmp_entry):
+    cmp_func = name_order and cmp_entry_name_order or cmp_entry
+    for name, entry in sorted(entries.iteritems(), cmp=cmp_func):
         mode, hexsha = entry
         # Stricter type checks than normal to mirror checks in the C version.
         mode = int(mode)
@@ -747,7 +752,7 @@ def sorted_tree_items(entries):
 
 
 def cmp_entry((name1, value1), (name2, value2)):
-    """Compare two tree entries."""
+    """Compare two tree entries in tree order."""
     if stat.S_ISDIR(value1[0]):
         name1 += "/"
     if stat.S_ISDIR(value2[0]):
@@ -755,6 +760,11 @@ def cmp_entry((name1, value1), (name2, value2)):
     return cmp(name1, name2)
 
 
+def cmp_entry_name_order(entry1, entry2):
+    """Compare two tree entries in name order."""
+    return cmp(entry1[0], entry2[0])
+
+
 class Tree(ShaFile):
     """A Git tree object"""
 
@@ -822,9 +832,9 @@ class Tree(ShaFile):
 
     def entries(self):
         """Return a list of tuples describing the tree entries.
-        
-        :note: The order of the tuples that are returned is different from that 
-            returned by the items and iteritems methods. This function will be 
+
+        :note: The order of the tuples that are returned is different from that
+            returned by the items and iteritems methods. This function will be
             deprecated in the future.
         """
         self._ensure_parsed()
@@ -833,13 +843,14 @@ class Tree(ShaFile):
         return [
             (mode, name, hexsha) for (name, mode, hexsha) in self.iteritems()]
 
-    def iteritems(self):
-        """Iterate over entries in the order in which they would be serialized.
+    def iteritems(self, name_order=False):
+        """Iterate over entries.
 
+        :param name_order: If True, iterate in name order instead of tree order.
         :return: Iterator over (name, mode, sha) tuples
         """
         self._ensure_parsed()
-        return sorted_tree_items(self._entries)
+        return sorted_tree_items(self._entries, name_order)
 
     def items(self):
         """Return the sorted entries in this tree.
@@ -869,7 +880,8 @@ class Tree(ShaFile):
                          stat.S_IFLNK, stat.S_IFDIR, S_IFGITLINK,
                          # TODO: optionally exclude as in git fsck --strict
                          stat.S_IFREG | 0664)
-        for name, mode, sha in parse_tree("".join(self._chunked_text)):
+        for name, mode, sha in parse_tree(''.join(self._chunked_text),
+                                          True):
             check_hexsha(sha, 'invalid sha %s' % sha)
             if '/' in name or name in ('', '.', '..'):
                 raise ObjectFormatException('invalid name %s' % name)
@@ -903,7 +915,7 @@ def parse_timezone(text):
     """Parse a timezone text fragment (e.g. '+0100').
 
     :param text: Text to parse.
-    :return: Tuple with timezone as seconds difference to UTC 
+    :return: Tuple with timezone as seconds difference to UTC
         and a boolean indicating whether this was a UTC timezone
         prefixed with a negative sign (-0000).
     """
@@ -968,7 +980,7 @@ class Commit(ShaFile):
         self._parents = []
         self._extra = []
         self._author = None
-        for field, value in parse_commit("".join(self._chunked_text)):
+        for field, value in parse_commit(''.join(self._chunked_text)):
             if field == _TREE_HEADER:
                 self._tree = value
             elif field == _PARENT_HEADER:

+ 7 - 9
dulwich/pack.py

@@ -33,7 +33,7 @@ a pointer in to the corresponding packfile.
 try:
     from collections import defaultdict
 except ImportError:
-    from misc import defaultdict
+    from _compat import defaultdict
 
 from cStringIO import (
     StringIO,
@@ -53,7 +53,7 @@ import struct
 try:
     from struct import unpack_from
 except ImportError:
-    from dulwich.misc import unpack_from
+    from dulwich._compat import unpack_from
 import sys
 import zlib
 
@@ -65,7 +65,7 @@ from dulwich.file import GitFile
 from dulwich.lru_cache import (
     LRUSizeCache,
     )
-from dulwich.misc import (
+from dulwich._compat import (
     make_sha,
     SEEK_END,
     )
@@ -243,7 +243,7 @@ class PackIndex(object):
 
     def __iter__(self):
         """Iterate over the SHAs in this pack."""
-        raise NotImplementedError(self.__iter__)
+        return imap(sha_to_hex, self._itersha())
 
     def iterentries(self):
         """Iterate over the entries in this pack index.
@@ -278,10 +278,6 @@ class PackIndex(object):
         """
         raise NotImplementedError(self._object_index)
 
-    def __iter__(self):
-        """Iterate over the SHAs in this pack."""
-        return imap(sha_to_hex, self._itersha())
-
     def objects_sha1(self):
         """Return the hex SHA1 over all the shas of all objects in this pack.
 
@@ -350,7 +346,7 @@ class FilePackIndex(PackIndex):
         else:
             self._file = file
         if contents is None:
-            self._contents, self._size = _load_file_contents(file, size)
+            self._contents, self._size = _load_file_contents(self._file, size)
         else:
             self._contents, self._size = (contents, size)
 
@@ -364,6 +360,8 @@ class FilePackIndex(PackIndex):
 
     def close(self):
         self._file.close()
+        if getattr(self._contents, "close", None) is not None:
+            self._contents.close()
 
     def __len__(self):
         """Return the number of entries in this pack index."""

+ 26 - 2
dulwich/patch.py

@@ -28,6 +28,7 @@ import subprocess
 import time
 
 from dulwich.objects import (
+    Blob,
     Commit,
     )
 
@@ -136,14 +137,37 @@ def write_blob_diff(f, (old_path, old_mode, old_blob),
             f.write("new mode %o\n" % new_mode)
         else:
             f.write("deleted mode %o\n" % old_mode)
-    f.write("index %s..%s %o\n" % (
-        blob_id(old_blob), blob_id(new_blob), new_mode))
+    f.write("index %s..%s" % (blob_id(old_blob), blob_id(new_blob)))
+    if new_mode is not None:
+        f.write(" %o" % new_mode)
+    f.write("\n")
     old_contents = lines(old_blob)
     new_contents = lines(new_blob)
     f.writelines(unified_diff(old_contents, new_contents,
         old_path, new_path))
 
 
+def write_tree_diff(f, store, old_tree, new_tree):
+    """Write tree diff.
+
+    :param f: File-like object to write to.
+    :param old_tree: Old tree id
+    :param new_tree: New tree id
+    """
+    changes = store.tree_changes(old_tree, new_tree)
+    for (oldpath, newpath), (oldmode, newmode), (oldsha, newsha) in changes:
+        if oldsha is None:
+            old_blob = Blob.from_string("")
+        else:
+            old_blob = store[oldsha]
+        if newsha is None:
+            new_blob = Blob.from_string("")
+        else:
+            new_blob = store[newsha]
+        write_blob_diff(f, (oldpath, oldmode, old_blob),
+                           (newpath, newmode, new_blob))
+
+
 def git_am_patch_split(f):
     """Parse a git-am-style patch and split it up into bits.
 

+ 1 - 2
dulwich/protocol.py

@@ -20,14 +20,13 @@
 """Generic functions for talking the git smart server protocol."""
 
 from cStringIO import StringIO
-import os
 import socket
 
 from dulwich.errors import (
     HangupException,
     GitProtocolError,
     )
-from dulwich.misc import (
+from dulwich._compat import (
     SEEK_END,
     )
 

+ 34 - 18
dulwich/repo.py

@@ -35,6 +35,7 @@ from dulwich.errors import (
     NotTagError,
     PackedRefsException,
     CommitError,
+    RefFormatError,
     )
 from dulwich.file import (
     ensure_dir_exists,
@@ -213,7 +214,7 @@ class RefsContainer(object):
         if name == 'HEAD':
             return
         if not name.startswith('refs/') or not check_ref_format(name[5:]):
-            raise KeyError(name)
+            raise RefFormatError(name)
 
     def read_ref(self, refname):
         """Read a reference without following any references.
@@ -763,13 +764,13 @@ class BaseRepo(object):
         self.object_store = object_store
         self.refs = refs
 
-    def _init_files(self):
+    def _init_files(self, bare):
         """Initialize a default set of named files."""
         self._put_named_file('description', "Unnamed repository")
         self._put_named_file('config', ('[core]\n'
                                         'repositoryformatversion = 0\n'
                                         'filemode = true\n'
-                                        'bare = ' + str(self.bare).lower() + '\n'
+                                        'bare = ' + str(bare).lower() + '\n'
                                         'logallrefupdates = true\n'))
         self._put_named_file(os.path.join('info', 'exclude'), '')
 
@@ -777,7 +778,7 @@ class BaseRepo(object):
         """Get a file from the control dir with a specific name.
 
         Although the filename should be interpreted as a filename relative to
-        the control dir in a disk-baked Repo, the object returned need not be
+        the control dir in a disk-based Repo, the object returned need not be
         pointing to a file in that location.
 
         :param path: The path to the file, relative to the control dir.
@@ -990,7 +991,10 @@ class BaseRepo(object):
                 return self.object_store[name]
             except KeyError:
                 pass
-        return self.object_store[self.refs[name]]
+        try:
+            return self.object_store[self.refs[name]]
+        except RefFormatError:
+            raise KeyError(name)
 
     def __contains__(self, name):
         if len(name) in (20, 40):
@@ -1017,7 +1021,7 @@ class BaseRepo(object):
     def do_commit(self, message, committer=None,
                   author=None, commit_timestamp=None,
                   commit_timezone=None, author_timestamp=None,
-                  author_timezone=None, tree=None):
+                  author_timezone=None, tree=None, encoding=None):
         """Create a new commit.
 
         :param message: Commit message
@@ -1028,7 +1032,9 @@ class BaseRepo(object):
         :param author_timestamp: Author timestamp (defaults to commit timestamp)
         :param author_timezone: Author timestamp timezone
             (defaults to commit timestamp timezone)
-        :param tree: SHA1 of the tree root to use (if not specified the current index will be committed).
+        :param tree: SHA1 of the tree root to use (if not specified the
+            current index will be committed).
+        :param encoding: Encoding
         :return: New commit SHA1
         """
         import time
@@ -1037,6 +1043,8 @@ class BaseRepo(object):
             index = self.open_index()
             c.tree = index.commit(self.object_store)
         else:
+            if len(tree) != 40:
+                raise ValueError("tree must be a 40-byte hex sha string")
             c.tree = tree
         # TODO: Allow username to be missing, and get it from .git/config
         if committer is None:
@@ -1058,6 +1066,8 @@ class BaseRepo(object):
         if author_timezone is None:
             author_timezone = commit_timezone
         c.author_timezone = author_timezone
+        if encoding is not None:
+            c.encoding = encoding
         c.message = message
         try:
             old_head = self.refs["HEAD"]
@@ -1116,7 +1126,7 @@ class Repo(BaseRepo):
         """Get a file from the control dir with a specific name.
 
         Although the filename should be interpreted as a filename relative to
-        the control dir in a disk-baked Repo, the object returned need not be
+        the control dir in a disk-based Repo, the object returned need not be
         pointing to a file in that location.
 
         :param path: The path to the file, relative to the control dir.
@@ -1184,22 +1194,28 @@ class Repo(BaseRepo):
         return "<Repo at %r>" % self.path
 
     @classmethod
-    def init(cls, path, mkdir=True):
-        controldir = os.path.join(path, ".git")
-        os.mkdir(controldir)
-        cls.init_bare(controldir)
-        return cls(path)
-
-    @classmethod
-    def init_bare(cls, path, mkdir=True):
+    def _init_maybe_bare(cls, path, bare):
         for d in BASE_DIRECTORIES:
             os.mkdir(os.path.join(path, *d))
         DiskObjectStore.init(os.path.join(path, OBJECTDIR))
         ret = cls(path)
         ret.refs.set_symbolic_ref("HEAD", "refs/heads/master")
-        ret._init_files()
+        ret._init_files(bare)
         return ret
 
+    @classmethod
+    def init(cls, path, mkdir=False):
+        if mkdir:
+            os.mkdir(path)
+        controldir = os.path.join(path, ".git")
+        os.mkdir(controldir)
+        cls._init_maybe_bare(controldir, False)
+        return cls(path)
+
+    @classmethod
+    def init_bare(cls, path):
+        return cls._init_maybe_bare(path, True)
+
     create = init_bare
 
 
@@ -1249,5 +1265,5 @@ class MemoryRepo(BaseRepo):
             ret.object_store.add_object(obj)
         for refname, sha in refs.iteritems():
             ret.refs[refname] = sha
-        ret._init_files()
+        ret._init_files(bare=True)
         return ret

+ 35 - 1
dulwich/server.py

@@ -48,8 +48,10 @@ from dulwich.pack import (
     write_pack_data,
     )
 from dulwich.protocol import (
+    BufferedPktLineWriter,
     MULTI_ACK,
     MULTI_ACK_DETAILED,
+    Protocol,
     ProtocolFile,
     ReceivableProtocol,
     SINGLE_ACK,
@@ -58,7 +60,6 @@ from dulwich.protocol import (
     ack_type,
     extract_capabilities,
     extract_want_line_capabilities,
-    BufferedPktLineWriter,
     )
 from dulwich.repo import (
     Repo,
@@ -155,6 +156,14 @@ class DictBackend(Backend):
         return self.repos[path]
 
 
+class FileSystemBackend(Backend):
+    """Simple backend that looks up Git repositories in the local file system."""
+
+    def open_repository(self, path):
+        """Open the repository at the given path.
+
+        :param path: Path handed unchanged to the `Repo` constructor
+        :return: The `Repo` object constructed for that path
+        """
+        logger.debug('opening repository at %s', path)
+        return Repo(path)
+
+
 class Handler(object):
     """Smart protocol command handler base class."""
 
@@ -775,3 +784,28 @@ def main(argv=sys.argv):
     backend = DictBackend({'/': Repo(gitdir)})
     server = TCPGitServer(backend, 'localhost')
     server.serve_forever()
+
+
+def serve_command(handler_cls, argv=sys.argv, backend=None, inf=sys.stdin,
+                  outf=sys.stdout):
+    """Serve a single command.
+
+    This is mostly useful for the implementation of commands used by e.g. git+ssh.
+
+    :param handler_cls: `Handler` class to use for the request
+    :param argv: execv-style command-line arguments. Defaults to sys.argv.
+        Everything after the first element is passed to the handler.
+    :param backend: `Backend` to use; a new `FileSystemBackend` is created
+        when None.
+    :param inf: File-like object to read from, defaults to standard input.
+    :param outf: File-like object to write to, defaults to standard output.
+    :return: Exit code for use with sys.exit. Currently this is always 0;
+        exceptions raised by the handler are not caught here and propagate
+        to the caller (see the FIXME below).
+    """
+    if backend is None:
+        backend = FileSystemBackend()
+    def send_fn(data):
+        # Flush after every write so data is not held back in outf's buffer.
+        outf.write(data)
+        outf.flush()
+    proto = Protocol(inf.read, send_fn)
+    handler = handler_cls(backend, argv[1:], proto)
+    # FIXME: Catch exceptions and write a single-line summary to outf.
+    handler.handle()
+    return 0

+ 85 - 28
dulwich/tests/__init__.py

@@ -19,42 +19,69 @@
 
 """Tests for Dulwich."""
 
+import doctest
+import os
 import unittest
-
-try:
-    from testtools.testcase import TestCase
-except ImportError:
-    from unittest import TestCase
+import shutil
+import subprocess
+import sys
+import tempfile
 
 try:
     # If Python itself provides an exception, use that
     from unittest import SkipTest as TestSkipped
 except ImportError:
-    # Check if the nose exception can be used
     try:
-        import nose
+        from unittest2 import SkipTest as TestSkipped
     except ImportError:
-        try:
-            import testtools.testcase
-        except ImportError:
-            class TestSkipped(Exception):
-                def __init__(self, msg):
-                    self.msg = msg
-        else:
-            TestSkipped = testtools.testcase.TestCase.skipException
-    else:
-        TestSkipped = nose.SkipTest
-        try:
-            import testtools.testcase
-        except ImportError:
-            pass
-        else:
-            # Make testtools use the same exception class as nose
-            testtools.testcase.TestCase.skipException = TestSkipped
+        from testtools.testcase import TestSkipped
+
+try:
+    from testtools.testcase import TestCase
+except ImportError:
+    from unittest import TestCase
+else:
+    TestCase.skipException = TestSkipped
 
 
-def test_suite():
+class BlackboxTestCase(TestCase):
+    """Blackbox testing."""
+
+    # Absolute path of the "bin" directory two levels above this package.
+    bin_directory = os.path.abspath(os.path.join(os.path.dirname(__file__),
+        "..", "..", "bin"))
+
+    def bin_path(self, name):
+        """Determine the full path of a binary.
+
+        :param name: Name of the script
+        :return: Full path
+        """
+        return os.path.join(self.bin_directory, name)
+
+    def run_command(self, name, args):
+        """Run a Dulwich command.
+
+        :param name: Name of the command, as it exists in bin/
+        :param args: Arguments to the command
+        :return: The started `subprocess.Popen` object, with stdin, stdout
+            and stderr all connected to pipes
+        """
+        # Hand the current sys.path to the child through PYTHONPATH.
+        env = dict(os.environ)
+        env["PYTHONPATH"] = os.pathsep.join(sys.path)
+
+        # Since they don't have any extensions, Windows can't recognize
+        # executability of the Python files in /bin. Even then, we'd have to
+        # expect the user to set up file associations for .py files.
+        #
+        # Save us from all that headache and call python with the bin script.
+        argv = [sys.executable, self.bin_path(name)] + args
+        return subprocess.Popen(argv,
+            stdout=subprocess.PIPE,
+            stdin=subprocess.PIPE, stderr=subprocess.PIPE,
+            env=env)
+
+
+def self_test_suite():
     names = [
+        'blackbox',
         'client',
         'fastexport',
         'file',
@@ -70,8 +97,38 @@ def test_suite():
         'web',
         ]
     module_names = ['dulwich.tests.test_' + name for name in names]
-    result = unittest.TestSuite()
     loader = unittest.TestLoader()
-    suite = loader.loadTestsFromNames(module_names)
-    result.addTests(suite)
+    return loader.loadTestsFromNames(module_names)
+
+
+def tutorial_test_suite():
+    """Build a doctest suite from the tutorial documents.
+
+    Every tutorial file is run with a fresh temporary directory as the
+    current working directory. Afterwards the previous working directory
+    is restored and the temporary directory is removed.
+
+    :return: The suite created by `doctest.DocFileSuite`
+    """
+    tutorial = [
+        '0-introduction',
+        '1-repo',
+        '2-object-store',
+        '3-conclusion',
+        ]
+    tutorial_files = ["../../docs/tutorial/%s.txt" % name for name in tutorial]
+    def setup(test):
+        # Remember where we were so teardown can step out of the temporary
+        # directory before deleting it.
+        test.__old_cwd = os.getcwd()
+        test.__dulwich_tempdir = tempfile.mkdtemp()
+        os.chdir(test.__dulwich_tempdir)
+    def teardown(test):
+        # Leave the directory first: removing the current working directory
+        # would leave the process without a valid cwd for later tests.
+        os.chdir(test.__old_cwd)
+        shutil.rmtree(test.__dulwich_tempdir)
+    return doctest.DocFileSuite(setUp=setup, tearDown=teardown,
+        *tutorial_files)
+
+
+def nocompat_test_suite():
+    """Return the self tests plus the tutorial doctests, without compat tests."""
+    combined = unittest.TestSuite()
+    for build_suite in (self_test_suite, tutorial_test_suite):
+        combined.addTests(build_suite())
+    return combined
+
+
+def test_suite():
+    """Return the self tests, the tutorial doctests and the compat tests."""
+    combined = unittest.TestSuite()
+    for build_suite in (self_test_suite, tutorial_test_suite):
+        combined.addTests(build_suite())
+    # The compat package is only imported once the other suites are built.
+    from dulwich.tests.compat import test_suite as compat_test_suite
+    combined.addTests(compat_test_suite())
+    return combined

+ 37 - 0
dulwich/tests/compat/__init__.py

@@ -0,0 +1,37 @@
+# __init__.py -- Compatibility tests for dulwich
+# Copyright (C) 2010 Jelmer Vernooij <jelmer@samba.org>
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; version 2
+# of the License or (at your option) any later version of
+# the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+# MA  02110-1301, USA.
+
+"""Compatibility tests for Dulwich."""
+
+import unittest
+
+def test_suite():
+    """Load the compat test modules into a single suite.
+
+    :return: `unittest.TestSuite` with the tests of the listed modules
+    """
+    module_names = []
+    for short_name in ('client', 'pack', 'repository', 'server', 'utils'):
+        module_names.append('dulwich.tests.compat.test_' + short_name)
+    compat_suite = unittest.TestSuite()
+    compat_suite.addTests(
+        unittest.TestLoader().loadTestsFromNames(module_names))
+    return compat_suite

+ 4 - 0
dulwich/tests/compat/test_client.py

@@ -43,6 +43,7 @@ from utils import (
     run_git_or_fail,
     )
 
+
 class DulwichClientTestBase(object):
     """Tests for client/server compatibility."""
 
@@ -167,6 +168,7 @@ class DulwichClientTestBase(object):
 
 
 class DulwichTCPClientTest(CompatTestCase, DulwichClientTestBase):
+
     def setUp(self):
         CompatTestCase.setUp(self)
         DulwichClientTestBase.setUp(self)
@@ -211,6 +213,7 @@ class TestSSHVendor(object):
 
 
 class DulwichMockSSHClientTest(CompatTestCase, DulwichClientTestBase):
+
     def setUp(self):
         CompatTestCase.setUp(self)
         DulwichClientTestBase.setUp(self)
@@ -230,6 +233,7 @@ class DulwichMockSSHClientTest(CompatTestCase, DulwichClientTestBase):
 
 
 class DulwichSubprocessClientTest(CompatTestCase, DulwichClientTestBase):
+
     def setUp(self):
         CompatTestCase.setUp(self)
         DulwichClientTestBase.setUp(self)

+ 2 - 3
dulwich/tests/compat/test_server.py

@@ -20,8 +20,8 @@
 """Compatibility tests between Dulwich and the cgit server.
 
 Warning: these tests should be fairly stable, but when writing/debugging new
-tests, deadlocks may freeze the test process such that it cannot be Ctrl-C'ed.
-On *nix, you can kill the tests with Ctrl-Z, "kill %".
+    tests, deadlocks may freeze the test process such that it cannot be
+    Ctrl-C'ed. On POSIX systems, you can kill the tests with Ctrl-Z, "kill %".
 """
 
 import threading
@@ -29,7 +29,6 @@ import threading
 from dulwich.server import (
     DictBackend,
     TCPGitServer,
-    ReceivePackHandler,
     )
 from server_utils import (
     ServerTests,

+ 3 - 2
dulwich/tests/compat/test_utils.py

@@ -19,9 +19,8 @@
 
 """Tests for git compatibility utilities."""
 
-from unittest import TestCase
-
 from dulwich.tests import (
+    TestCase,
     TestSkipped,
     )
 import utils
@@ -30,6 +29,7 @@ import utils
 class GitVersionTests(TestCase):
 
     def setUp(self):
+        super(GitVersionTests, self).setUp()
         self._orig_run_git = utils.run_git
         self._version_str = None  # tests can override to set stub version
 
@@ -39,6 +39,7 @@ class GitVersionTests(TestCase):
         utils.run_git = run_git
 
     def tearDown(self):
+        super(GitVersionTests, self).tearDown()
         utils.run_git = self._orig_run_git
 
     def test_git_version_none(self):

+ 3 - 4
dulwich/tests/compat/test_web.py

@@ -19,15 +19,14 @@
 
 """Compatibility tests between Dulwich and the cgit HTTP server.
 
-Warning: these tests should be fairly stable, but when writing/debugging new
-tests, deadlocks may freeze the test process such that it cannot be Ctrl-C'ed.
-On *nix, you can kill the tests with Ctrl-Z, "kill %".
+Warning: these tests should be fairly stable, but when writing/debugging new
+    tests, deadlocks may freeze the test process such that it cannot be
+    Ctrl-C'ed. On POSIX systems, you can kill the tests with Ctrl-Z, "kill %".
 """
 
 import threading
 from wsgiref import simple_server
 
-import dulwich
 from dulwich.server import (
     DictBackend,
     )

+ 8 - 2
dulwich/tests/compat/utils.py

@@ -36,6 +36,8 @@ from dulwich.tests import (
 
 _DEFAULT_GIT = 'git'
 _VERSION_LEN = 4
+_REPOS_DATA_DIR = os.path.abspath(os.path.join(
+    os.path.dirname(__file__), os.pardir, 'data', 'repos'))
 
 
 def git_version(git_path=_DEFAULT_GIT):
@@ -78,6 +80,10 @@ def require_git_version(required_version, git_path=_DEFAULT_GIT):
     :raise TestSkipped: if no suitable git version was found at the given path.
     """
     found_version = git_version(git_path=git_path)
+    if found_version is None:
+        raise TestSkipped('Test requires git >= %s, but c git not found' %
+                         (required_version, ))
+
     if len(required_version) > _VERSION_LEN:
         raise ValueError('Invalid version tuple %s, expected %i parts' %
                          (required_version, _VERSION_LEN))
@@ -142,8 +148,7 @@ def import_repo_to_dir(name):
     :returns: The path to the imported repository.
     """
     temp_dir = tempfile.mkdtemp()
-    export_path = os.path.join(os.path.dirname(__file__), os.pardir, 'data',
-                               'repos', name)
+    export_path = os.path.join(_REPOS_DATA_DIR, name)
     temp_repo_dir = os.path.join(temp_dir, name)
     export_file = open(export_path, 'rb')
     run_git_or_fail(['init', '--quiet', '--bare', temp_repo_dir])
@@ -152,6 +157,7 @@ def import_repo_to_dir(name):
     export_file.close()
     return temp_repo_dir
 
+
 def import_repo(name):
     """Import a repo from a fast-export file in a temporary directory.
 

+ 67 - 0
dulwich/tests/test_blackbox.py

@@ -0,0 +1,67 @@
+# test_blackbox.py -- blackbox tests
+# Copyright (C) 2010 Jelmer Vernooij <jelmer@samba.org>
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; version 2
+# of the License or (at your option) a later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+# MA  02110-1301, USA.
+
+"""Blackbox tests for Dulwich commands."""
+
+import tempfile
+
+from dulwich.repo import (
+    Repo,
+    )
+from dulwich.tests import (
+    BlackboxTestCase,
+    )
+
+
+class GitReceivePackTests(BlackboxTestCase):
+    """Blackbox tests for dul-receive-pack."""
+
+    def setUp(self):
+        super(GitReceivePackTests, self).setUp()
+        self.path = tempfile.mkdtemp()
+        self.repo = Repo.init(self.path)
+
+    def tearDown(self):
+        # Delete the scratch repository created in setUp so repeated test
+        # runs do not leave directories behind in the system temp dir.
+        import shutil
+        shutil.rmtree(self.path)
+        super(GitReceivePackTests, self).tearDown()
+
+    def test_basic(self):
+        # Feed just "0000" on stdin: stderr must stay empty, stdout must end
+        # in "0000" and the command must exit with status 0.
+        process = self.run_command("dul-receive-pack", [self.path])
+        (stdout, stderr) = process.communicate("0000")
+        self.assertEquals('', stderr)
+        self.assertEquals('0000', stdout[-4:])
+        self.assertEquals(0, process.returncode)
+
+    def test_missing_arg(self):
+        # Without a git-dir argument the usage line goes to stderr, nothing
+        # is written to stdout and the exit status is 1.
+        process = self.run_command("dul-receive-pack", [])
+        (stdout, stderr) = process.communicate()
+        self.assertEquals('usage: dul-receive-pack <git-dir>\n', stderr)
+        self.assertEquals('', stdout)
+        self.assertEquals(1, process.returncode)
+
+
+class GitUploadPackTests(BlackboxTestCase):
+    """Blackbox tests for dul-upload-pack."""
+
+    def setUp(self):
+        super(GitUploadPackTests, self).setUp()
+        self.path = tempfile.mkdtemp()
+        self.repo = Repo.init(self.path)
+
+    def tearDown(self):
+        # Delete the scratch repository created in setUp so repeated test
+        # runs do not leave directories behind in the system temp dir.
+        import shutil
+        shutil.rmtree(self.path)
+        super(GitUploadPackTests, self).tearDown()
+
+    def test_missing_arg(self):
+        # Without a git-dir argument the usage line goes to stderr, nothing
+        # is written to stdout and the exit status is 1.
+        process = self.run_command("dul-upload-pack", [])
+        (stdout, stderr) = process.communicate()
+        self.assertEquals('usage: dul-upload-pack <git-dir>\n', stderr)
+        self.assertEquals('', stdout)
+        self.assertEquals(1, process.returncode)

+ 61 - 0
dulwich/tests/test_client.py

@@ -20,12 +20,16 @@ from cStringIO import StringIO
 
 from dulwich.client import (
     GitClient,
+    TCPGitClient,
+    SubprocessGitClient,
     SSHGitClient,
+    get_transport_and_path,
     )
 from dulwich.tests import (
     TestCase,
     )
 from dulwich.protocol import (
+    TCP_GIT_PORT,
     Protocol,
     )
 
@@ -66,6 +70,63 @@ class GitClientTests(TestCase):
         self.client.fetch_pack("bla", lambda heads: [], None, None, None)
         self.assertEquals(self.rout.getvalue(), "0000")
 
+    def test_get_transport_and_path_tcp(self):
+        # git:// URLs, first without a port (TCP_GIT_PORT is expected), then
+        # with an explicit one.
+        cases = [
+            ('git://foo.com/bar/baz', TCP_GIT_PORT),
+            ('git://foo.com:1234/bar/baz', 1234),
+            ]
+        for url, expected_port in cases:
+            client, path = get_transport_and_path(url)
+            self.assertTrue(isinstance(client, TCPGitClient))
+            self.assertEqual('foo.com', client._host)
+            self.assertEqual(expected_port, client._port)
+            self.assertEqual('/bar/baz', path)
+
+    def test_get_transport_and_path_ssh_explicit(self):
+        # git+ssh:// URLs without and with an explicit port. The username is
+        # only checked for the URL that has no port.
+        cases = [
+            ('git+ssh://foo.com/bar/baz', None, True),
+            ('git+ssh://foo.com:1234/bar/baz', 1234, False),
+            ]
+        for url, expected_port, check_username in cases:
+            client, path = get_transport_and_path(url)
+            self.assertTrue(isinstance(client, SSHGitClient))
+            self.assertEqual('foo.com', client.host)
+            self.assertEqual(expected_port, client.port)
+            if check_username:
+                self.assertEqual(None, client.username)
+            self.assertEqual('/bar/baz', path)
+
+    def test_get_transport_and_path_ssh_implicit(self):
+        # "host:path" style locations: (location, expected host, expected
+        # username). No port is expected for any of them.
+        cases = [
+            ('foo:/bar/baz', 'foo', None),
+            ('foo.com:/bar/baz', 'foo.com', None),
+            ('user@foo.com:/bar/baz', 'foo.com', 'user'),
+            ]
+        for location, expected_host, expected_user in cases:
+            client, path = get_transport_and_path(location)
+            self.assertTrue(isinstance(client, SSHGitClient))
+            self.assertEqual(expected_host, client.host)
+            self.assertEqual(None, client.port)
+            self.assertEqual(expected_user, client.username)
+            self.assertEqual('/bar/baz', path)
+
+    def test_get_transport_and_path_subprocess(self):
+        # A plain relative path is expected to come back unchanged, paired
+        # with a SubprocessGitClient.
+        location = 'foo.bar/baz'
+        client, path = get_transport_and_path(location)
+        self.assertTrue(isinstance(client, SubprocessGitClient))
+        self.assertEqual('foo.bar/baz', path)
+
+    def test_get_transport_and_path_error(self):
+        # The 'foo://' URL is expected to be rejected with ValueError.
+        self.assertRaises(ValueError, get_transport_and_path, 'foo://bar/baz')
+
 
 class SSHGitClientTests(TestCase):
 

+ 671 - 0
dulwich/tests/test_diff_tree.py

@@ -0,0 +1,671 @@
+# test_diff_tree.py -- Tests for file and tree diff utilities.
+# Copyright (C) 2010 Google, Inc.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# or (at your option) a later version of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+# MA  02110-1301, USA.
+
+"""Tests for file and tree diff utilities."""
+
+from dulwich.diff_tree import (
+    CHANGE_MODIFY,
+    CHANGE_RENAME,
+    CHANGE_COPY,
+    CHANGE_UNCHANGED,
+    TreeChange,
+    _merge_entries,
+    _merge_entries_py,
+    tree_changes,
+    _count_blocks,
+    _count_blocks_py,
+    _similarity_score,
+    _tree_change_key,
+    RenameDetector,
+    _is_tree,
+    _is_tree_py
+    )
+from dulwich.index import (
+    commit_tree,
+    )
+from dulwich._compat import (
+    permutations,
+    )
+from dulwich.object_store import (
+    MemoryObjectStore,
+    )
+from dulwich.objects import (
+    ShaFile,
+    Blob,
+    TreeEntry,
+    Tree,
+    )
+from dulwich.tests import (
+    TestCase,
+    )
+from dulwich.tests.utils import (
+    make_object,
+    functest_builder,
+    ext_functest_builder,
+    )
+
+# Shorthand mode for Files.
+F = 0100644
+
+
+class DiffTestCase(TestCase):
+    """Base class giving tests an in-memory store and a tree-building helper."""
+
+    def setUp(self):
+        super(DiffTestCase, self).setUp()
+        self.store = MemoryObjectStore()
+        self.empty_tree = self.commit_tree([])
+
+    def commit_tree(self, entries):
+        """Store the given entries and return the resulting tree object.
+
+        :param entries: Iterable of (path, obj) or (path, obj, mode) tuples.
+            The mode defaults to F. obj is either a Blob, which is added to
+            the store, or a value used directly as the entry's SHA.
+        :return: The object the store holds for the committed tree
+        """
+        blobs = []
+        for entry in entries:
+            mode = F
+            if len(entry) == 2:
+                path, obj = entry
+            else:
+                path, obj, mode = entry
+            sha = obj
+            if isinstance(obj, Blob):
+                self.store.add_object(obj)
+                sha = obj.id
+            blobs.append((path, sha, mode))
+        tree_id = commit_tree(self.store, blobs)
+        return self.store[tree_id]
+
+
+class TreeChangesTest(DiffTestCase):
+    """Tests for the entry merging helpers and for tree_changes."""
+
+    def assertMergeFails(self, merge_entries, name, mode, sha):
+        """Assert that merging a tree holding one bad entry raises TypeError.
+
+        :param merge_entries: The merge implementation under test
+        :param name: Entry name to store in the tree
+        :param mode: Entry mode to store in the tree
+        :param sha: Entry SHA to store in the tree
+        """
+        t = Tree()
+        t[name] = (mode, sha)
+        self.assertRaises(TypeError, merge_entries, '', t, t)
+
+    def _do_test_merge_entries(self, merge_entries):
+        """Shared body for the merge-entries tests.
+
+        :param merge_entries: The merge implementation under test
+        """
+        blob_a1 = make_object(Blob, data='a1')
+        blob_a2 = make_object(Blob, data='a2')
+        blob_b1 = make_object(Blob, data='b1')
+        blob_c2 = make_object(Blob, data='c2')
+        tree1 = self.commit_tree([('a', blob_a1, 0100644),
+                                  ('b', blob_b1, 0100755)])
+        tree2 = self.commit_tree([('a', blob_a2, 0100644),
+                                  ('c', blob_c2, 0100755)])
+
+        self.assertEqual([], merge_entries('', self.empty_tree,
+                                           self.empty_tree))
+        self.assertEqual([
+          ((None, None, None), ('a', 0100644, blob_a1.id)),
+          ((None, None, None), ('b', 0100755, blob_b1.id)),
+          ], merge_entries('', self.empty_tree, tree1))
+        # A non-empty first argument shows up as a path prefix in the result.
+        self.assertEqual([
+          ((None, None, None), ('x/a', 0100644, blob_a1.id)),
+          ((None, None, None), ('x/b', 0100755, blob_b1.id)),
+          ], merge_entries('x', self.empty_tree, tree1))
+
+        self.assertEqual([
+          (('a', 0100644, blob_a2.id), (None, None, None)),
+          (('c', 0100755, blob_c2.id), (None, None, None)),
+          ], merge_entries('', tree2, self.empty_tree))
+
+        self.assertEqual([
+          (('a', 0100644, blob_a1.id), ('a', 0100644, blob_a2.id)),
+          (('b', 0100755, blob_b1.id), (None, None, None)),
+          ((None, None, None), ('c', 0100755, blob_c2.id)),
+          ], merge_entries('', tree1, tree2))
+
+        self.assertEqual([
+          (('a', 0100644, blob_a2.id), ('a', 0100644, blob_a1.id)),
+          ((None, None, None), ('b', 0100755, blob_b1.id)),
+          (('c', 0100755, blob_c2.id), (None, None, None)),
+          ], merge_entries('', tree2, tree1))
+
+        # Entries with a wrongly typed name, mode or SHA must raise TypeError.
+        self.assertMergeFails(merge_entries, 0xdeadbeef, 0100644, '1' * 40)
+        self.assertMergeFails(merge_entries, 'a', 'deadbeef', '1' * 40)
+        self.assertMergeFails(merge_entries, 'a', 0100644, 0xdeadbeef)
+
+    test_merge_entries = functest_builder(_do_test_merge_entries,
+                                          _merge_entries_py)
+    test_merge_entries_extension = ext_functest_builder(_do_test_merge_entries,
+                                                        _merge_entries)
+
+    def _do_test_is_tree(self, is_tree):
+        """Shared body for the is-tree tests.
+
+        :param is_tree: The implementation under test
+        """
+        self.assertFalse(is_tree(TreeEntry(None, None, None)))
+        self.assertFalse(is_tree(TreeEntry('a', 0100644, 'a' * 40)))
+        self.assertFalse(is_tree(TreeEntry('a', 0100755, 'a' * 40)))
+        self.assertFalse(is_tree(TreeEntry('a', 0120000, 'a' * 40)))
+        self.assertTrue(is_tree(TreeEntry('a', 0040000, 'a' * 40)))
+        self.assertRaises(TypeError, is_tree, TreeEntry('a', 'x', 'a' * 40))
+        self.assertRaises(AttributeError, is_tree, 1234)
+
+    test_is_tree = functest_builder(_do_test_is_tree, _is_tree_py)
+    test_is_tree_extension = ext_functest_builder(_do_test_is_tree, _is_tree)
+
+    def assertChangesEqual(self, expected, tree1, tree2, **kwargs):
+        """Assert that tree_changes between two trees yields ``expected``.
+
+        :param expected: Expected list of changes
+        :param tree1: First tree object; its id is passed to tree_changes
+        :param tree2: Second tree object; its id is passed to tree_changes
+        :param kwargs: Extra keyword arguments forwarded to tree_changes
+        """
+        actual = list(tree_changes(self.store, tree1.id, tree2.id, **kwargs))
+        self.assertEqual(expected, actual)
+
+    # For brevity, the following tests use tuples instead of TreeEntry objects.
+
+    def test_tree_changes_empty(self):
+        self.assertChangesEqual([], self.empty_tree, self.empty_tree)
+
+    def test_tree_changes_no_changes(self):
+        blob = make_object(Blob, data='blob')
+        tree = self.commit_tree([('a', blob), ('b/c', blob)])
+        self.assertChangesEqual([], self.empty_tree, self.empty_tree)
+        self.assertChangesEqual([], tree, tree)
+        # With want_unchanged=True identical entries are reported as well.
+        self.assertChangesEqual(
+          [TreeChange(CHANGE_UNCHANGED, ('a', F, blob.id), ('a', F, blob.id)),
+           TreeChange(CHANGE_UNCHANGED, ('b/c', F, blob.id),
+                      ('b/c', F, blob.id))],
+          tree, tree, want_unchanged=True)
+
+    def test_tree_changes_add_delete(self):
+        blob_a = make_object(Blob, data='a')
+        blob_b = make_object(Blob, data='b')
+        tree = self.commit_tree([('a', blob_a, 0100644),
+                                 ('x/b', blob_b, 0100755)])
+        self.assertChangesEqual(
+          [TreeChange.add(('a', 0100644, blob_a.id)),
+           TreeChange.add(('x/b', 0100755, blob_b.id))],
+          self.empty_tree, tree)
+        self.assertChangesEqual(
+          [TreeChange.delete(('a', 0100644, blob_a.id)),
+           TreeChange.delete(('x/b', 0100755, blob_b.id))],
+          tree, self.empty_tree)
+
+    def test_tree_changes_modify_contents(self):
+        blob_a1 = make_object(Blob, data='a1')
+        blob_a2 = make_object(Blob, data='a2')
+        tree1 = self.commit_tree([('a', blob_a1)])
+        tree2 = self.commit_tree([('a', blob_a2)])
+        self.assertChangesEqual(
+          [TreeChange(CHANGE_MODIFY, ('a', F, blob_a1.id),
+                      ('a', F, blob_a2.id))], tree1, tree2)
+
+    def test_tree_changes_modify_mode(self):
+        blob_a = make_object(Blob, data='a')
+        tree1 = self.commit_tree([('a', blob_a, 0100644)])
+        tree2 = self.commit_tree([('a', blob_a, 0100755)])
+        self.assertChangesEqual(
+          [TreeChange(CHANGE_MODIFY, ('a', 0100644, blob_a.id),
+                      ('a', 0100755, blob_a.id))], tree1, tree2)
+
+    def test_tree_changes_change_type(self):
+        # Going from mode 0100644 to mode 0120000 is expected to be reported
+        # as a delete followed by an add rather than as a modification.
+        blob_a1 = make_object(Blob, data='a')
+        blob_a2 = make_object(Blob, data='/foo/bar')
+        tree1 = self.commit_tree([('a', blob_a1, 0100644)])
+        tree2 = self.commit_tree([('a', blob_a2, 0120000)])
+        self.assertChangesEqual(
+          [TreeChange.delete(('a', 0100644, blob_a1.id)),
+           TreeChange.add(('a', 0120000, blob_a2.id))],
+          tree1, tree2)
+
+    def test_tree_changes_to_tree(self):
+        # The path 'a' is a blob in tree1 and a directory holding 'x' in tree2.
+        blob_a = make_object(Blob, data='a')
+        blob_x = make_object(Blob, data='x')
+        tree1 = self.commit_tree([('a', blob_a)])
+        tree2 = self.commit_tree([('a/x', blob_x)])
+        self.assertChangesEqual(
+          [TreeChange.delete(('a', F, blob_a.id)),
+           TreeChange.add(('a/x', F, blob_x.id))],
+          tree1, tree2)
+
+    def test_tree_changes_complex(self):
+        blob_a_1 = make_object(Blob, data='a1_1')
+        blob_bx1_1 = make_object(Blob, data='bx1_1')
+        blob_bx2_1 = make_object(Blob, data='bx2_1')
+        blob_by1_1 = make_object(Blob, data='by1_1')
+        blob_by2_1 = make_object(Blob, data='by2_1')
+        tree1 = self.commit_tree([
+          ('a', blob_a_1),
+          ('b/x/1', blob_bx1_1),
+          ('b/x/2', blob_bx2_1),
+          ('b/y/1', blob_by1_1),
+          ('b/y/2', blob_by2_1),
+          ])
+
+        blob_a_2 = make_object(Blob, data='a1_2')
+        blob_bx1_2 = blob_bx1_1
+        blob_by_2 = make_object(Blob, data='by_2')
+        blob_c_2 = make_object(Blob, data='c_2')
+        tree2 = self.commit_tree([
+          ('a', blob_a_2),
+          ('b/x/1', blob_bx1_2),
+          ('b/y', blob_by_2),
+          ('c', blob_c_2),
+          ])
+
+        self.assertChangesEqual(
+          [TreeChange(CHANGE_MODIFY, ('a', F, blob_a_1.id),
+                      ('a', F, blob_a_2.id)),
+           TreeChange.delete(('b/x/2', F, blob_bx2_1.id)),
+           TreeChange.add(('b/y', F, blob_by_2.id)),
+           TreeChange.delete(('b/y/1', F, blob_by1_1.id)),
+           TreeChange.delete(('b/y/2', F, blob_by2_1.id)),
+           TreeChange.add(('c', F, blob_c_2.id))],
+          tree1, tree2)
+
+    def test_tree_changes_name_order(self):
+        blob = make_object(Blob, data='a')
+        tree1 = self.commit_tree([('a', blob), ('a.', blob), ('a..', blob)])
+        # Tree order is the reverse of this, so if we used tree order, 'a..'
+        # would not be merged.
+        tree2 = self.commit_tree([('a/x', blob), ('a./x', blob), ('a..', blob)])
+
+        self.assertChangesEqual(
+          [TreeChange.delete(('a', F, blob.id)),
+           TreeChange.add(('a/x', F, blob.id)),
+           TreeChange.delete(('a.', F, blob.id)),
+           TreeChange.add(('a./x', F, blob.id))],
+          tree1, tree2)
+
+    def test_tree_changes_prune(self):
+        blob_a1 = make_object(Blob, data='a1')
+        blob_a2 = make_object(Blob, data='a2')
+        blob_x = make_object(Blob, data='x')
+        tree1 = self.commit_tree([('a', blob_a1), ('b/x', blob_x)])
+        tree2 = self.commit_tree([('a', blob_a2), ('b/x', blob_x)])
+        # Remove identical items so lookups will fail unless we prune.
+        subtree = self.store[tree1['b'][1]]
+        for entry in subtree.iteritems():
+            del self.store[entry.sha]
+        del self.store[subtree.id]
+
+        self.assertChangesEqual(
+          [TreeChange(CHANGE_MODIFY, ('a', F, blob_a1.id),
+                      ('a', F, blob_a2.id))],
+          tree1, tree2)
+
+
+class RenameDetectionTest(DiffTestCase):
+
+    def _do_test_count_blocks(self, count_blocks):
+        """Shared body for the basic block-count tests.
+
+        :param count_blocks: The block-counting implementation under test
+        """
+        blob = make_object(Blob, data='a\nb\na\n')
+        self.assertEqual({hash('a\n'): 4, hash('b\n'): 2}, count_blocks(blob))
+
+    test_count_blocks = functest_builder(_do_test_count_blocks,
+                                         _count_blocks_py)
+    test_count_blocks_extension = ext_functest_builder(_do_test_count_blocks,
+                                                       _count_blocks)
+
+    def _do_test_count_blocks_no_newline(self, count_blocks):
+        blob = make_object(Blob, data='a\na')
+        self.assertEqual({hash('a\n'): 2, hash('a'): 1}, _count_blocks(blob))
+
+    test_count_blocks_no_newline = functest_builder(
+      _do_test_count_blocks_no_newline, _count_blocks_py)
+    test_count_blocks_no_newline_extension = ext_functest_builder(
+       _do_test_count_blocks_no_newline, _count_blocks)
+
+    def _do_test_count_blocks_chunks(self, count_blocks):
+        blob = ShaFile.from_raw_chunks(Blob.type_num, ['a\nb', '\na\n'])
+        self.assertEqual({hash('a\n'): 4, hash('b\n'): 2}, _count_blocks(blob))
+
+    test_count_blocks_chunks = functest_builder(_do_test_count_blocks_chunks,
+                                                _count_blocks_py)
+    test_count_blocks_chunks_extension = ext_functest_builder(
+      _do_test_count_blocks_chunks, _count_blocks)
+
+    def _do_test_count_blocks_long_lines(self, count_blocks):
+        a = 'a' * 64
+        data = a + 'xxx\ny\n' + a + 'zzz\n'
+        blob = make_object(Blob, data=data)
+        self.assertEqual({hash('a' * 64): 128, hash('xxx\n'): 4, hash('y\n'): 2,
+                          hash('zzz\n'): 4},
+                         _count_blocks(blob))
+
+    test_count_blocks_long_lines = functest_builder(
+      _do_test_count_blocks_long_lines, _count_blocks_py)
+    test_count_blocks_long_lines_extension = ext_functest_builder(
+      _do_test_count_blocks_long_lines, _count_blocks)
+
+    def assertSimilar(self, expected_score, blob1, blob2):
+        self.assertEqual(expected_score, _similarity_score(blob1, blob2))
+        self.assertEqual(expected_score, _similarity_score(blob2, blob1))
+
+    def test_similarity_score(self):
+        blob0 = make_object(Blob, data='')
+        blob1 = make_object(Blob, data='ab\ncd\ncd\n')
+        blob2 = make_object(Blob, data='ab\n')
+        blob3 = make_object(Blob, data='cd\n')
+        blob4 = make_object(Blob, data='cd\ncd\n')
+
+        self.assertSimilar(100, blob0, blob0)
+        self.assertSimilar(0, blob0, blob1)
+        self.assertSimilar(33, blob1, blob2)
+        self.assertSimilar(33, blob1, blob3)
+        self.assertSimilar(66, blob1, blob4)
+        self.assertSimilar(0, blob2, blob3)
+        self.assertSimilar(50, blob3, blob4)
+
+    def test_similarity_score_cache(self):
+        blob1 = make_object(Blob, data='ab\ncd\n')
+        blob2 = make_object(Blob, data='ab\n')
+
+        block_cache = {}
+        self.assertEqual(
+          50, _similarity_score(blob1, blob2, block_cache=block_cache))
+        self.assertEqual(set([blob1.id, blob2.id]), set(block_cache))
+
+        def fail_chunks():
+            self.fail('Unexpected call to as_raw_chunks()')
+
+        blob1.as_raw_chunks = blob2.as_raw_chunks = fail_chunks
+        blob1.raw_length = lambda: 6
+        blob2.raw_length = lambda: 3
+        self.assertEqual(
+          50, _similarity_score(blob1, blob2, block_cache=block_cache))
+
+    def test_tree_entry_sort(self):
+        sha = 'abcd' * 10
+        expected_entries = [
+          TreeChange.add(TreeEntry('aaa', F, sha)),
+          TreeChange(CHANGE_COPY, TreeEntry('bbb', F, sha),
+                     TreeEntry('aab', F, sha)),
+          TreeChange(CHANGE_MODIFY, TreeEntry('bbb', F, sha),
+                     TreeEntry('bbb', F, 'dabc' * 10)),
+          TreeChange(CHANGE_RENAME, TreeEntry('bbc', F, sha),
+                     TreeEntry('ddd', F, sha)),
+          TreeChange.delete(TreeEntry('ccc', F, sha)),
+          ]
+
+        for perm in permutations(expected_entries):
+            self.assertEqual(expected_entries,
+                             sorted(perm, key=_tree_change_key))
+
+    def detect_renames(self, tree1, tree2, **kwargs):
+        detector = RenameDetector(self.store, tree1.id, tree2.id, **kwargs)
+        return detector.changes_with_renames()
+
+    def test_no_renames(self):
+        blob1 = make_object(Blob, data='a\nb\nc\nd\n')
+        blob2 = make_object(Blob, data='a\nb\ne\nf\n')
+        blob3 = make_object(Blob, data='a\nb\ng\nh\n')
+        tree1 = self.commit_tree([('a', blob1), ('b', blob2)])
+        tree2 = self.commit_tree([('a', blob1), ('b', blob3)])
+        self.assertEqual(
+          [TreeChange(CHANGE_MODIFY, ('b', F, blob2.id), ('b', F, blob3.id))],
+          self.detect_renames(tree1, tree2))
+
+    def test_exact_rename_one_to_one(self):
+        blob1 = make_object(Blob, data='1')
+        blob2 = make_object(Blob, data='2')
+        tree1 = self.commit_tree([('a', blob1), ('b', blob2)])
+        tree2 = self.commit_tree([('c', blob1), ('d', blob2)])
+        self.assertEqual(
+          [TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('c', F, blob1.id)),
+           TreeChange(CHANGE_RENAME, ('b', F, blob2.id), ('d', F, blob2.id))],
+          self.detect_renames(tree1, tree2))
+
+    def test_exact_rename_split_different_type(self):
+        blob = make_object(Blob, data='/foo')
+        tree1 = self.commit_tree([('a', blob, 0100644)])
+        tree2 = self.commit_tree([('a', blob, 0120000)])
+        self.assertEqual(
+          [TreeChange.add(('a', 0120000, blob.id)),
+           TreeChange.delete(('a', 0100644, blob.id))],
+          self.detect_renames(tree1, tree2))
+
+    def test_exact_rename_and_different_type(self):
+        blob1 = make_object(Blob, data='1')
+        blob2 = make_object(Blob, data='2')
+        tree1 = self.commit_tree([('a', blob1)])
+        tree2 = self.commit_tree([('a', blob2, 0120000), ('b', blob1)])
+        self.assertEqual(
+          [TreeChange.add(('a', 0120000, blob2.id)),
+           TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('b', F, blob1.id))],
+          self.detect_renames(tree1, tree2))
+
+    def test_exact_rename_one_to_many(self):
+        blob = make_object(Blob, data='1')
+        tree1 = self.commit_tree([('a', blob)])
+        tree2 = self.commit_tree([('b', blob), ('c', blob)])
+        self.assertEqual(
+          [TreeChange(CHANGE_RENAME, ('a', F, blob.id), ('b', F, blob.id)),
+           TreeChange(CHANGE_COPY, ('a', F, blob.id), ('c', F, blob.id))],
+          self.detect_renames(tree1, tree2))
+
+    def test_exact_rename_many_to_one(self):
+        blob = make_object(Blob, data='1')
+        tree1 = self.commit_tree([('a', blob), ('b', blob)])
+        tree2 = self.commit_tree([('c', blob)])
+        self.assertEqual(
+          [TreeChange(CHANGE_RENAME, ('a', F, blob.id), ('c', F, blob.id)),
+           TreeChange.delete(('b', F, blob.id))],
+          self.detect_renames(tree1, tree2))
+
+    def test_exact_rename_many_to_many(self):
+        blob = make_object(Blob, data='1')
+        tree1 = self.commit_tree([('a', blob), ('b', blob)])
+        tree2 = self.commit_tree([('c', blob), ('d', blob), ('e', blob)])
+        self.assertEqual(
+          [TreeChange(CHANGE_RENAME, ('a', F, blob.id), ('c', F, blob.id)),
+           TreeChange(CHANGE_COPY, ('a', F, blob.id), ('e', F, blob.id)),
+           TreeChange(CHANGE_RENAME, ('b', F, blob.id), ('d', F, blob.id))],
+          self.detect_renames(tree1, tree2))
+
+    def test_rename_threshold(self):
+        blob1 = make_object(Blob, data='a\nb\nc\n')
+        blob2 = make_object(Blob, data='a\nb\nd\n')
+        tree1 = self.commit_tree([('a', blob1)])
+        tree2 = self.commit_tree([('b', blob2)])
+        self.assertEqual(
+          [TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('b', F, blob2.id))],
+          self.detect_renames(tree1, tree2, rename_threshold=50))
+        self.assertEqual(
+          [TreeChange.delete(('a', F, blob1.id)),
+           TreeChange.add(('b', F, blob2.id))],
+          self.detect_renames(tree1, tree2, rename_threshold=75))
+
+    def test_content_rename_max_files(self):
+        blob1 = make_object(Blob, data='a\nb\nc\nd')
+        blob4 = make_object(Blob, data='a\nb\nc\ne\n')
+        blob2 = make_object(Blob, data='e\nf\ng\nh\n')
+        blob3 = make_object(Blob, data='e\nf\ng\ni\n')
+        tree1 = self.commit_tree([('a', blob1), ('b', blob2)])
+        tree2 = self.commit_tree([('c', blob3), ('d', blob4)])
+        self.assertEqual(
+          [TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('d', F, blob4.id)),
+           TreeChange(CHANGE_RENAME, ('b', F, blob2.id), ('c', F, blob3.id))],
+          self.detect_renames(tree1, tree2))
+        self.assertEqual(
+          [TreeChange.delete(('a', F, blob1.id)),
+           TreeChange.delete(('b', F, blob2.id)),
+           TreeChange.add(('c', F, blob3.id)),
+           TreeChange.add(('d', F, blob4.id))],
+          self.detect_renames(tree1, tree2, max_files=1))
+
+    def test_content_rename_one_to_one(self):
+        b11 = make_object(Blob, data='a\nb\nc\nd\n')
+        b12 = make_object(Blob, data='a\nb\nc\ne\n')
+        b21 = make_object(Blob, data='e\nf\ng\n\h')
+        b22 = make_object(Blob, data='e\nf\ng\n\i')
+        tree1 = self.commit_tree([('a', b11), ('b', b21)])
+        tree2 = self.commit_tree([('c', b12), ('d', b22)])
+        self.assertEqual(
+          [TreeChange(CHANGE_RENAME, ('a', F, b11.id), ('c', F, b12.id)),
+           TreeChange(CHANGE_RENAME, ('b', F, b21.id), ('d', F, b22.id))],
+          self.detect_renames(tree1, tree2))
+
+    def test_content_rename_one_to_one_ordering(self):
+        blob1 = make_object(Blob, data='a\nb\nc\nd\ne\nf\n')
+        blob2 = make_object(Blob, data='a\nb\nc\nd\ng\nh\n')
+        # 6/10 match to blob1, 8/10 match to blob2
+        blob3 = make_object(Blob, data='a\nb\nc\nd\ng\ni\n')
+        tree1 = self.commit_tree([('a', blob1), ('b', blob2)])
+        tree2 = self.commit_tree([('c', blob3)])
+        self.assertEqual(
+          [TreeChange.delete(('a', F, blob1.id)),
+           TreeChange(CHANGE_RENAME, ('b', F, blob2.id), ('c', F, blob3.id))],
+          self.detect_renames(tree1, tree2))
+
+        tree3 = self.commit_tree([('a', blob2), ('b', blob1)])
+        tree4 = self.commit_tree([('c', blob3)])
+        self.assertEqual(
+          [TreeChange(CHANGE_RENAME, ('a', F, blob2.id), ('c', F, blob3.id)),
+           TreeChange.delete(('b', F, blob1.id))],
+          self.detect_renames(tree3, tree4))
+
+    def test_content_rename_one_to_many(self):
+        blob1 = make_object(Blob, data='aa\nb\nc\nd\ne\n')
+        blob2 = make_object(Blob, data='ab\nb\nc\nd\ne\n')  # 8/11 match
+        blob3 = make_object(Blob, data='aa\nb\nc\nd\nf\n')  # 9/11 match
+        tree1 = self.commit_tree([('a', blob1)])
+        tree2 = self.commit_tree([('b', blob2), ('c', blob3)])
+        self.assertEqual(
+          [TreeChange(CHANGE_COPY, ('a', F, blob1.id), ('b', F, blob2.id)),
+           TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('c', F, blob3.id))],
+          self.detect_renames(tree1, tree2))
+
+    def test_content_rename_many_to_one(self):
+        blob1 = make_object(Blob, data='a\nb\nc\nd\n')
+        blob2 = make_object(Blob, data='a\nb\nc\ne\n')
+        blob3 = make_object(Blob, data='a\nb\nc\nf\n')
+        tree1 = self.commit_tree([('a', blob1), ('b', blob2)])
+        tree2 = self.commit_tree([('c', blob3)])
+        self.assertEqual(
+          [TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('c', F, blob3.id)),
+           TreeChange.delete(('b', F, blob2.id))],
+          self.detect_renames(tree1, tree2))
+
+    def test_content_rename_many_to_many(self):
+        blob1 = make_object(Blob, data='a\nb\nc\nd\n')
+        blob2 = make_object(Blob, data='a\nb\nc\ne\n')
+        blob3 = make_object(Blob, data='a\nb\nc\nf\n')
+        blob4 = make_object(Blob, data='a\nb\nc\ng\n')
+        tree1 = self.commit_tree([('a', blob1), ('b', blob2)])
+        tree2 = self.commit_tree([('c', blob3), ('d', blob4)])
+        # TODO(dborowitz): Distribute renames rather than greedily choosing
+        # copies.
+        self.assertEqual(
+          [TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('c', F, blob3.id)),
+           TreeChange(CHANGE_COPY, ('a', F, blob1.id), ('d', F, blob4.id)),
+           TreeChange.delete(('b', F, blob2.id))],
+          self.detect_renames(tree1, tree2))
+
+    def test_content_rename_gitlink(self):
+        blob1 = make_object(Blob, data='blob1')
+        blob2 = make_object(Blob, data='blob2')
+        link1 = '1' * 40
+        link2 = '2' * 40
+        tree1 = self.commit_tree([('a', blob1), ('b', link1, 0160000)])
+        tree2 = self.commit_tree([('c', blob2), ('d', link2, 0160000)])
+        self.assertEqual(
+          [TreeChange.delete(('a', 0100644, blob1.id)),
+           TreeChange.delete(('b', 0160000, link1)),
+           TreeChange.add(('c', 0100644, blob2.id)),
+           TreeChange.add(('d', 0160000, link2))],
+          self.detect_renames(tree1, tree2))
+
+    def test_exact_rename_swap(self):
+        blob1 = make_object(Blob, data='1')
+        blob2 = make_object(Blob, data='2')
+        tree1 = self.commit_tree([('a', blob1), ('b', blob2)])
+        tree2 = self.commit_tree([('a', blob2), ('b', blob1)])
+        self.assertEqual(
+          [TreeChange(CHANGE_MODIFY, ('a', F, blob1.id), ('a', F, blob2.id)),
+           TreeChange(CHANGE_MODIFY, ('b', F, blob2.id), ('b', F, blob1.id))],
+          self.detect_renames(tree1, tree2))
+        self.assertEqual(
+          [TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('b', F, blob1.id)),
+           TreeChange(CHANGE_RENAME, ('b', F, blob2.id), ('a', F, blob2.id))],
+          self.detect_renames(tree1, tree2, rewrite_threshold=50))
+
+    def test_content_rename_swap(self):
+        blob1 = make_object(Blob, data='a\nb\nc\nd\n')
+        blob2 = make_object(Blob, data='e\nf\ng\nh\n')
+        blob3 = make_object(Blob, data='a\nb\nc\ne\n')
+        blob4 = make_object(Blob, data='e\nf\ng\ni\n')
+        tree1 = self.commit_tree([('a', blob1), ('b', blob2)])
+        tree2 = self.commit_tree([('a', blob4), ('b', blob3)])
+        self.assertEqual(
+          [TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('b', F, blob3.id)),
+           TreeChange(CHANGE_RENAME, ('b', F, blob2.id), ('a', F, blob4.id))],
+          self.detect_renames(tree1, tree2, rewrite_threshold=60))
+
+    def test_rewrite_threshold(self):
+        blob1 = make_object(Blob, data='a\nb\nc\nd\n')
+        blob2 = make_object(Blob, data='a\nb\nc\ne\n')
+        blob3 = make_object(Blob, data='a\nb\nf\ng\n')
+
+        tree1 = self.commit_tree([('a', blob1)])
+        tree2 = self.commit_tree([('a', blob3), ('b', blob2)])
+
+        no_renames = [
+          TreeChange(CHANGE_MODIFY, ('a', F, blob1.id), ('a', F, blob3.id)),
+          TreeChange.add(('b', F, blob2.id))]
+        self.assertEqual(
+          no_renames, self.detect_renames(tree1, tree2))
+        self.assertEqual(
+          no_renames, self.detect_renames(tree1, tree2, rewrite_threshold=40))
+        self.assertEqual(
+          [TreeChange.add(('a', F, blob3.id)),
+           TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('b', F, blob2.id))],
+          self.detect_renames(tree1, tree2, rewrite_threshold=80))
+
+    def test_find_copies_harder_exact(self):
+        blob = make_object(Blob, data='blob')
+        tree1 = self.commit_tree([('a', blob)])
+        tree2 = self.commit_tree([('a', blob), ('b', blob)])
+        self.assertEqual([TreeChange.add(('b', F, blob.id))],
+                         self.detect_renames(tree1, tree2))
+        self.assertEqual(
+          [TreeChange(CHANGE_COPY, ('a', F, blob.id), ('b', F, blob.id))],
+          self.detect_renames(tree1, tree2, find_copies_harder=True))
+
+    def test_find_copies_harder_content(self):
+        blob1 = make_object(Blob, data='a\nb\nc\nd\n')
+        blob2 = make_object(Blob, data='a\nb\nc\ne\n')
+        tree1 = self.commit_tree([('a', blob1)])
+        tree2 = self.commit_tree([('a', blob1), ('b', blob2)])
+        self.assertEqual([TreeChange.add(('b', F, blob2.id))],
+                         self.detect_renames(tree1, tree2))
+        self.assertEqual(
+          [TreeChange(CHANGE_COPY, ('a', F, blob1.id), ('b', F, blob2.id))],
+          self.detect_renames(tree1, tree2, find_copies_harder=True))
+
+    def test_find_copies_harder_modify(self):
+        blob1 = make_object(Blob, data='a\nb\nc\nd\n')
+        blob2 = make_object(Blob, data='a\nb\nc\ne\n')
+        tree1 = self.commit_tree([('a', blob1)])
+        tree2 = self.commit_tree([('a', blob2), ('b', blob2)])
+        self.assertEqual(
+          [TreeChange(CHANGE_MODIFY, ('a', F, blob1.id), ('a', F, blob2.id)),
+           TreeChange.add(('b', F, blob2.id))],
+          self.detect_renames(tree1, tree2))
+        self.assertEqual(
+          [TreeChange(CHANGE_MODIFY, ('a', F, blob1.id), ('a', F, blob2.id)),
+           TreeChange(CHANGE_COPY, ('a', F, blob1.id), ('b', F, blob2.id))],
+          self.detect_renames(tree1, tree2, find_copies_harder=True))
+
+    def test_find_copies_harder_with_rewrites(self):
+        blob_a1 = make_object(Blob, data='a\nb\nc\nd\n')
+        blob_a2 = make_object(Blob, data='f\ng\nh\ni\n')
+        blob_b2 = make_object(Blob, data='a\nb\nc\ne\n')
+        tree1 = self.commit_tree([('a', blob_a1)])
+        tree2 = self.commit_tree([('a', blob_a2), ('b', blob_b2)])
+        self.assertEqual(
+          [TreeChange(CHANGE_MODIFY, ('a', F, blob_a1.id),
+                      ('a', F, blob_a2.id)),
+           TreeChange(CHANGE_COPY, ('a', F, blob_a1.id), ('b', F, blob_b2.id))],
+          self.detect_renames(tree1, tree2, find_copies_harder=True))
+        self.assertEqual(
+          [TreeChange.add(('a', F, blob_a2.id)),
+           TreeChange(CHANGE_RENAME, ('a', F, blob_a1.id),
+                      ('b', F, blob_b2.id))],
+          self.detect_renames(tree1, tree2, rewrite_threshold=50,
+                              find_copies_harder=True))

+ 69 - 0
dulwich/tests/test_fastexport.py

@@ -131,3 +131,72 @@ M 100644 :1 a
         self.assertEquals(2, len(markers))
         self.assertTrue(isinstance(self.repo[markers["1"]], Blob))
         self.assertTrue(isinstance(self.repo[markers["2"]], Commit))
+
+    def test_file_add(self):
+        from fastimport import commands
+        cmd = commands.BlobCommand("23", "data")
+        self.processor.blob_handler(cmd)
+        cmd = commands.CommitCommand("refs/heads/foo", "mrkr",
+            ("Jelmer", "jelmer@samba.org", 432432432.0, 3600),
+            ("Jelmer", "jelmer@samba.org", 432432432.0, 3600),
+            "FOO", None, [], [commands.FileModifyCommand("path", 0100644, ":23", None)])
+        self.processor.commit_handler(cmd)
+        commit = self.repo[self.processor.last_commit]
+        self.assertEquals([
+            ('path', 0100644, '6320cd248dd8aeaab759d5871f8781b5c0505172')],
+            self.repo[commit.tree].items())
+
+    def simple_commit(self):
+        from fastimport import commands
+        cmd = commands.BlobCommand("23", "data")
+        self.processor.blob_handler(cmd)
+        cmd = commands.CommitCommand("refs/heads/foo", "mrkr",
+            ("Jelmer", "jelmer@samba.org", 432432432.0, 3600),
+            ("Jelmer", "jelmer@samba.org", 432432432.0, 3600),
+            "FOO", None, [], [commands.FileModifyCommand("path", 0100644, ":23", None)])
+        self.processor.commit_handler(cmd)
+        commit = self.repo[self.processor.last_commit]
+        return commit
+
+    def make_file_commit(self, file_cmds):
+        """Create a trivial commit with the specified file commands.
+
+        :param file_cmds: File commands to run.
+        :return: The created commit object
+        """
+        from fastimport import commands
+        cmd = commands.CommitCommand("refs/heads/foo", "mrkr",
+            ("Jelmer", "jelmer@samba.org", 432432432.0, 3600),
+            ("Jelmer", "jelmer@samba.org", 432432432.0, 3600),
+            "FOO", None, [], file_cmds)
+        self.processor.commit_handler(cmd)
+        return self.repo[self.processor.last_commit]
+
+    def test_file_copy(self):
+        from fastimport import commands
+        self.simple_commit()
+        commit = self.make_file_commit([commands.FileCopyCommand("path", "new_path")])
+        self.assertEquals([
+            ('new_path', 0100644, '6320cd248dd8aeaab759d5871f8781b5c0505172'),
+            ('path', 0100644, '6320cd248dd8aeaab759d5871f8781b5c0505172'),
+            ], self.repo[commit.tree].items())
+
+    def test_file_move(self):
+        from fastimport import commands
+        self.simple_commit()
+        commit = self.make_file_commit([commands.FileRenameCommand("path", "new_path")])
+        self.assertEquals([
+            ('new_path', 0100644, '6320cd248dd8aeaab759d5871f8781b5c0505172'),
+            ], self.repo[commit.tree].items())
+
+    def test_file_delete(self):
+        from fastimport import commands
+        self.simple_commit()
+        commit = self.make_file_commit([commands.FileDeleteCommand("path")])
+        self.assertEquals([], self.repo[commit.tree].items())
+
+    def test_file_deleteall(self):
+        from fastimport import commands
+        self.simple_commit()
+        commit = self.make_file_commit([commands.FileDeleteAllCommand()])
+        self.assertEquals([], self.repo[commit.tree].items())

+ 32 - 8
dulwich/tests/test_object_store.py

@@ -32,9 +32,9 @@ from dulwich.errors import (
 from dulwich.objects import (
     object_class,
     Blob,
-    ShaFile,
     Tag,
     Tree,
+    TreeEntry,
     )
 from dulwich.object_store import (
     DiskObjectStore,
@@ -89,6 +89,25 @@ class ObjectStoreTests(object):
         r = self.store[testobject.id]
         self.assertEquals(r, testobject)
 
+    def test_tree_changes(self):
+        blob_a1 = make_object(Blob, data='a1')
+        blob_a2 = make_object(Blob, data='a2')
+        blob_b = make_object(Blob, data='b')
+        for blob in [blob_a1, blob_a2, blob_b]:
+            self.store.add_object(blob)
+
+        blobs_1 = [('a', blob_a1.id, 0100644), ('b', blob_b.id, 0100644)]
+        tree1_id = commit_tree(self.store, blobs_1)
+        blobs_2 = [('a', blob_a2.id, 0100644), ('b', blob_b.id, 0100644)]
+        tree2_id = commit_tree(self.store, blobs_2)
+        change_a = (('a', 'a'), (0100644, 0100644), (blob_a1.id, blob_a2.id))
+        self.assertEquals([change_a],
+                          list(self.store.tree_changes(tree1_id, tree2_id)))
+        self.assertEquals(
+          [change_a, (('b', 'b'), (0100644, 0100644), (blob_b.id, blob_b.id))],
+          list(self.store.tree_changes(tree1_id, tree2_id,
+                                       want_unchanged=True)))
+
     def test_iter_tree_contents(self):
         blob_a = make_object(Blob, data='a')
         blob_b = make_object(Blob, data='b')
@@ -104,7 +123,7 @@ class ObjectStoreTests(object):
           ('c', blob_c.id, 0100644),
           ]
         tree_id = commit_tree(self.store, blobs)
-        self.assertEquals([(p, m, h) for (p, h, m) in blobs],
+        self.assertEquals([TreeEntry(p, m, h) for (p, h, m) in blobs],
                           list(self.store.iter_tree_contents(tree_id)))
 
     def test_iter_tree_contents_include_trees(self):
@@ -125,12 +144,12 @@ class ObjectStoreTests(object):
         tree_bd = self.store[tree_ad['bd'][1]]
 
         expected = [
-          ('', 0040000, tree_id),
-          ('a', 0100644, blob_a.id),
-          ('ad', 0040000, tree_ad.id),
-          ('ad/b', 0100644, blob_b.id),
-          ('ad/bd', 0040000, tree_bd.id),
-          ('ad/bd/c', 0100755, blob_c.id),
+          TreeEntry('', 0040000, tree_id),
+          TreeEntry('a', 0100644, blob_a.id),
+          TreeEntry('ad', 0040000, tree_ad.id),
+          TreeEntry('ad/b', 0100644, blob_b.id),
+          TreeEntry('ad/bd', 0040000, tree_bd.id),
+          TreeEntry('ad/bd/c', 0100755, blob_c.id),
           ]
         actual = self.store.iter_tree_contents(tree_id, include_trees=True)
         self.assertEquals(expected, list(actual))
@@ -161,6 +180,10 @@ class MemoryObjectStoreTests(ObjectStoreTests, TestCase):
 
 class PackBasedObjectStoreTests(ObjectStoreTests):
 
+    def tearDown(self):
+        """Close every pack listed in self.store.packs."""
+        for pack in self.store.packs:
+            pack.close()
+
     def test_empty_packs(self):
         self.assertEquals([], self.store.packs)
 
@@ -184,6 +207,7 @@ class DiskObjectStoreTests(PackBasedObjectStoreTests, TestCase):
 
     def tearDown(self):
         TestCase.tearDown(self)
+        PackBasedObjectStoreTests.tearDown(self)
         shutil.rmtree(self.store_dir)
 
     def test_pack_dir(self):

+ 51 - 62
dulwich/tests/test_objects.py

@@ -30,6 +30,9 @@ import stat
 from dulwich.errors import (
     ObjectFormatException,
     )
+from dulwich._compat import (
+    permutations,
+    )
 from dulwich.objects import (
     Blob,
     Tree,
@@ -50,11 +53,12 @@ from dulwich.objects import (
     )
 from dulwich.tests import (
     TestCase,
-    TestSkipped,
     )
 from utils import (
     make_commit,
     make_object,
+    functest_builder,
+    ext_functest_builder,
     )
 
 a_sha = '6f670c0fb53f9463760b7295fbb814e965fb20c8'
@@ -64,40 +68,6 @@ tree_sha = '70c190eb48fa8bbb50ddc692a17b44cb781af7f6'
 tag_sha = '71033db03a03c6a36721efcf1968dd8f8e0cf023'
 
 
-try:
-    from itertools import permutations
-except ImportError:
-    # Implementation of permutations from Python 2.6 documentation:
-    # http://docs.python.org/2.6/library/itertools.html#itertools.permutations
-    # Copyright (c) 2001-2010 Python Software Foundation; All Rights Reserved
-    # Modified syntax slightly to run under Python 2.4.
-    def permutations(iterable, r=None):
-        # permutations('ABCD', 2) --> AB AC AD BA BC BD CA CB CD DA DB DC
-        # permutations(range(3)) --> 012 021 102 120 201 210
-        pool = tuple(iterable)
-        n = len(pool)
-        if r is None:
-            r = n
-        if r > n:
-            return
-        indices = range(n)
-        cycles = range(n, n-r, -1)
-        yield tuple(pool[i] for i in indices[:r])
-        while n:
-            for i in reversed(range(r)):
-                cycles[i] -= 1
-                if cycles[i] == 0:
-                    indices[i:] = indices[i+1:] + indices[i:i+1]
-                    cycles[i] = n - i
-                else:
-                    j = cycles[i]
-                    indices[i], indices[-j] = indices[-j], indices[i]
-                    yield tuple(pool[i] for i in indices[:r])
-                    break
-            else:
-                return
-
-
 class TestHexToSha(TestCase):
 
     def test_simple(self):
@@ -117,21 +87,21 @@ class BlobReadTests(TestCase):
     def get_blob(self, sha):
         """Return the blob named sha from the test data dir"""
         return self.get_sha_file(Blob, 'blobs', sha)
-  
+
     def get_tree(self, sha):
         return self.get_sha_file(Tree, 'trees', sha)
-  
+
     def get_tag(self, sha):
         return self.get_sha_file(Tag, 'tags', sha)
-  
+
     def commit(self, sha):
         return self.get_sha_file(Commit, 'commits', sha)
-  
+
     def test_decompress_simple_blob(self):
         b = self.get_blob(a_sha)
         self.assertEqual(b.data, 'test 1\n')
         self.assertEqual(b.sha().hexdigest(), a_sha)
-  
+
     def test_hash(self):
         b = self.get_blob(a_sha)
         self.assertEqual(hash(b.id), hash(b))
@@ -142,7 +112,7 @@ class BlobReadTests(TestCase):
         self.assertEqual(b.data, '')
         self.assertEqual(b.id, sha)
         self.assertEqual(b.sha().hexdigest(), sha)
-  
+
     def test_create_blob_from_string(self):
         string = 'test 2\n'
         b = Blob.from_string(string)
@@ -166,23 +136,23 @@ class BlobReadTests(TestCase):
         self.assertEqual('test 5\n', b.data)
         b.chunked = ['te', 'st', ' 6\n']
         self.assertEqual('test 6\n', b.as_raw_string())
-  
+
     def test_parse_legacy_blob(self):
         string = 'test 3\n'
         b = self.get_blob(c_sha)
         self.assertEqual(b.data, string)
         self.assertEqual(b.sha().hexdigest(), c_sha)
-  
+
     def test_eq(self):
         blob1 = self.get_blob(a_sha)
         blob2 = self.get_blob(a_sha)
         self.assertEqual(blob1, blob2)
-  
+
     def test_read_tree_from_file(self):
         t = self.get_tree(tree_sha)
         self.assertEqual(t.entries()[0], (33188, 'a', a_sha))
         self.assertEqual(t.entries()[1], (33188, 'b', b_sha))
-  
+
     def test_read_tag_from_file(self):
         t = self.get_tag(tag_sha)
         self.assertEqual(t.object, (Commit, '51b668fd5bf7061b7d6fa525f88803e6cfadaa51'))
@@ -190,7 +160,7 @@ class BlobReadTests(TestCase):
         self.assertEqual(t.tagger,'Ali Sabil <ali.sabil@gmail.com>')
         self.assertEqual(t.tag_time, 1231203091)
         self.assertEqual(t.message, 'This is a signed tag\n-----BEGIN PGP SIGNATURE-----\nVersion: GnuPG v1.4.9 (GNU/Linux)\n\niEYEABECAAYFAkliqx8ACgkQqSMmLy9u/kcx5ACfakZ9NnPl02tOyYP6pkBoEkU1\n5EcAn0UFgokaSvS371Ym/4W9iJj6vh3h\n=ql7y\n-----END PGP SIGNATURE-----\n')
-  
+
     def test_read_commit_from_file(self):
         sha = '60dacdc733de308bb77bb76ce0fb0f9b44c9769e'
         c = self.commit(sha)
@@ -205,7 +175,7 @@ class BlobReadTests(TestCase):
         self.assertEqual(c.commit_timezone, 0)
         self.assertEqual(c.author_timezone, 0)
         self.assertEqual(c.message, 'Test commit\n')
-  
+
     def test_read_commit_no_parents(self):
         sha = '0d89f20333fbb1d2f3a94da77f4981373d8f4310'
         c = self.commit(sha)
@@ -219,7 +189,7 @@ class BlobReadTests(TestCase):
         self.assertEqual(c.commit_timezone, 0)
         self.assertEqual(c.author_timezone, 0)
         self.assertEqual(c.message, 'Test commit\n')
-  
+
     def test_read_commit_two_parents(self):
         sha = '5dac377bdded4c9aeb8dff595f0faeebcc8498cc'
         c = self.commit(sha)
@@ -465,18 +435,24 @@ class TreeTests(ShaFileCheckTests):
         o = Tree.from_path(hex_to_filename(dir, tree_sha))
         self.assertEquals([('a', 0100644, a_sha), ('b', 0100644, b_sha)],
                           list(parse_tree(o.as_raw_string())))
+        # test a broken tree that has a leading 0 on the file mode
+        broken_tree = '0100644 foo\0' + hex_to_sha(a_sha)
 
-    def test_parse_tree(self):
-        self._do_test_parse_tree(_parse_tree_py)
+        def eval_parse_tree(*args, **kwargs):
+            return list(parse_tree(*args, **kwargs))
 
-    def test_parse_tree_extension(self):
-        if parse_tree is _parse_tree_py:
-            raise TestSkipped('parse_tree extension not found')
-        self._do_test_parse_tree(parse_tree)
+        self.assertEquals([('foo', 0100644, a_sha)],
+                          eval_parse_tree(broken_tree))
+        self.assertRaises(ObjectFormatException,
+                          eval_parse_tree, broken_tree, strict=True)
+
+    test_parse_tree = functest_builder(_do_test_parse_tree, _parse_tree_py)
+    test_parse_tree_extension = ext_functest_builder(_do_test_parse_tree,
+                                                     parse_tree)
 
     def _do_test_sorted_tree_items(self, sorted_tree_items):
         def do_sort(entries):
-            return list(sorted_tree_items(entries))
+            return list(sorted_tree_items(entries, False))
 
         actual = do_sort(_TREE_ITEMS)
         self.assertEqual(_SORTED_TREE_ITEMS, actual)
@@ -494,13 +470,24 @@ class TreeTests(ShaFileCheckTests):
         self.assertRaises(errors, do_sort, {'foo': ('xxx', myhexsha)})
         self.assertRaises(errors, do_sort, {'foo': (0100755, 12345)})
 
-    def test_sorted_tree_items(self):
-        self._do_test_sorted_tree_items(_sorted_tree_items_py)
-
-    def test_sorted_tree_items_extension(self):
-        if sorted_tree_items is _sorted_tree_items_py:
-            raise TestSkipped('sorted_tree_items extension not found')
-        self._do_test_sorted_tree_items(sorted_tree_items)
+    test_sorted_tree_items = functest_builder(_do_test_sorted_tree_items,
+                                              _sorted_tree_items_py)
+    test_sorted_tree_items_extension = ext_functest_builder(
+      _do_test_sorted_tree_items, sorted_tree_items)
+
+    def _do_test_sorted_tree_items_name_order(self, sorted_tree_items):
+        self.assertEqual([
+          TreeEntry('a', stat.S_IFDIR,
+                    'd80c186a03f423a81b39df39dc87fd269736ca86'),
+          TreeEntry('a.c', 0100755, 'd80c186a03f423a81b39df39dc87fd269736ca86'),
+          TreeEntry('a/c', stat.S_IFDIR,
+                    'd80c186a03f423a81b39df39dc87fd269736ca86'),
+          ], list(sorted_tree_items(_TREE_ITEMS, True)))
+
+    test_sorted_tree_items_name_order = functest_builder(
+      _do_test_sorted_tree_items_name_order, _sorted_tree_items_py)
+    test_sorted_tree_items_name_order_extension = ext_functest_builder(
+      _do_test_sorted_tree_items_name_order, sorted_tree_items)
 
     def test_check(self):
         t = Tree
@@ -520,6 +507,8 @@ class TreeTests(ShaFileCheckTests):
         # TODO more whitelisted modes
         self.assertCheckFails(t, '123456 a\0%s' % sha)
         self.assertCheckFails(t, '123abc a\0%s' % sha)
+        # should fail check, but parses ok
+        self.assertCheckFails(t, '0100644 foo\0' + sha)
 
         # shas
         self.assertCheckFails(t, '100644 a\0%s' % ('x' * 5))

+ 26 - 16
dulwich/tests/test_pack.py

@@ -73,7 +73,8 @@ class PackTests(TestCase):
         shutil.rmtree(self.tempdir)
         super(PackTests, self).tearDown()
 
-    datadir = os.path.join(os.path.dirname(__file__), 'data/packs')
+    datadir = os.path.abspath(os.path.join(os.path.dirname(__file__),
+        'data/packs'))
 
     def get_pack_index(self, sha):
         """Returns a PackIndex from the datadir with the given sha"""
@@ -271,21 +272,30 @@ class TestPack(PackTests):
 
     def test_copy(self):
         origpack = self.get_pack(pack1_sha)
-        self.assertSucceeds(origpack.index.check)
-        basename = os.path.join(self.tempdir, 'Elch')
-        write_pack(basename, [(x, '') for x in origpack.iterobjects()],
-                   len(origpack))
-        newpack = Pack(basename)
-        self.assertEquals(origpack, newpack)
-        self.assertSucceeds(newpack.index.check)
-        self.assertEquals(origpack.name(), newpack.name())
-        self.assertEquals(origpack.index.get_pack_checksum(),
-                          newpack.index.get_pack_checksum())
-
-        wrong_version = origpack.index.version != newpack.index.version
-        orig_checksum = origpack.index.get_stored_checksum()
-        new_checksum = newpack.index.get_stored_checksum()
-        self.assertTrue(wrong_version or orig_checksum == new_checksum)
+
+        try:
+            self.assertSucceeds(origpack.index.check)
+            basename = os.path.join(self.tempdir, 'Elch')
+            write_pack(basename, [(x, '') for x in origpack.iterobjects()],
+                       len(origpack))
+            newpack = Pack(basename)
+
+            try:
+                self.assertEquals(origpack, newpack)
+                self.assertSucceeds(newpack.index.check)
+                self.assertEquals(origpack.name(), newpack.name())
+                self.assertEquals(origpack.index.get_pack_checksum(),
+                                  newpack.index.get_pack_checksum())
+
+                wrong_version = origpack.index.version != newpack.index.version
+                orig_checksum = origpack.index.get_stored_checksum()
+                new_checksum = newpack.index.get_stored_checksum()
+                self.assertTrue(wrong_version or orig_checksum == new_checksum)
+            finally:
+                newpack.close()
+        finally:
+            origpack.close()
+
 
     def test_commit_obj(self):
         p = self.get_pack(pack1_sha)

+ 138 - 1
dulwich/tests/test_patch.py

@@ -21,15 +21,22 @@
 from cStringIO import StringIO
 
 from dulwich.objects import (
+    Blob,
     Commit,
     Tree,
     )
+from dulwich.object_store import (
+    MemoryObjectStore,
+    )
 from dulwich.patch import (
     git_am_patch_split,
+    write_blob_diff,
     write_commit_patch,
+    write_tree_diff,
     )
 from dulwich.tests import (
     TestCase,
+    TestSkipped,
     )
 
 
@@ -152,4 +159,134 @@ From: Jelmer Vernooy <jelmer@debian.org>
 
 """
         c, diff, version = git_am_patch_split(StringIO(text))
-        self.assertIs(None, version)
+        self.assertEquals(None, version)
+
+    def test_extract_mercurial(self):
+        raise TestSkipped("git_am_patch_split doesn't handle Mercurial patches properly yet")
+        expected_diff = """diff --git a/dulwich/tests/test_patch.py b/dulwich/tests/test_patch.py
+--- a/dulwich/tests/test_patch.py
++++ b/dulwich/tests/test_patch.py
+@@ -158,7 +158,7 @@
+ 
+ '''
+         c, diff, version = git_am_patch_split(StringIO(text))
+-        self.assertIs(None, version)
++        self.assertEquals(None, version)
+ 
+ 
+ class DiffTests(TestCase):
+"""
+        text = """From dulwich-users-bounces+jelmer=samba.org@lists.launchpad.net Mon Nov 29 00:58:18 2010
+Date: Sun, 28 Nov 2010 17:57:27 -0600
+From: Augie Fackler <durin42@gmail.com>
+To: dulwich-users <dulwich-users@lists.launchpad.net>
+Subject: [Dulwich-users] [PATCH] test_patch: fix tests on Python 2.6
+Content-Transfer-Encoding: 8bit
+
+Change-Id: I5e51313d4ae3a65c3f00c665002a7489121bb0d6
+
+%s
+
+_______________________________________________
+Mailing list: https://launchpad.net/~dulwich-users
+Post to     : dulwich-users@lists.launchpad.net
+Unsubscribe : https://launchpad.net/~dulwich-users
+More help   : https://help.launchpad.net/ListHelp
+
+""" % expected_diff
+        c, diff, version = git_am_patch_split(StringIO(text))
+        self.assertEquals(expected_diff, diff)
+        self.assertEquals(None, version)
+
+
+class DiffTests(TestCase):
+    """Tests for write_blob_diff and write_tree_diff."""
+
+    def test_blob_diff(self):
+        f = StringIO()
+        write_blob_diff(f, ("foo.txt", 0644, Blob.from_string("old\nsame\n")),
+                           ("bar.txt", 0644, Blob.from_string("new\nsame\n")))
+        self.assertEquals([
+            "diff --git a/foo.txt b/bar.txt",
+            "index 3b0f961..a116b51 644",
+            "--- a/foo.txt",
+            "+++ b/bar.txt",
+            "@@ -1,2 +1,2 @@",
+            "-old",
+            "+new",
+            " same"
+            ], f.getvalue().splitlines())
+
+    def test_blob_add(self):
+        f = StringIO()
+        write_blob_diff(f, (None, None, None),
+                           ("bar.txt", 0644, Blob.from_string("new\nsame\n")))
+        self.assertEquals([
+            'diff --git /dev/null b/bar.txt',
+             'new mode 644',
+             'index 0000000..a116b51 644',
+             '--- /dev/null',
+             '+++ b/bar.txt',
+             '@@ -1,0 +1,2 @@',
+             '+new',
+             '+same'
+            ], f.getvalue().splitlines())
+
+    def test_blob_remove(self):
+        f = StringIO()
+        write_blob_diff(f, ("bar.txt", 0644, Blob.from_string("new\nsame\n")),
+                           (None, None, None))
+        self.assertEquals([
+            'diff --git a/bar.txt /dev/null',
+            'deleted mode 644',
+            'index a116b51..0000000',
+            '--- a/bar.txt',
+            '+++ /dev/null',
+            '@@ -1,2 +1,0 @@',
+            '-new',
+            '-same'
+            ], f.getvalue().splitlines())
+
+    def test_tree_diff(self):
+        f = StringIO()
+        store = MemoryObjectStore()
+        added = Blob.from_string("add\n")
+        removed = Blob.from_string("removed\n")
+        changed1 = Blob.from_string("unchanged\nremoved\n")
+        changed2 = Blob.from_string("unchanged\nadded\n")
+        unchanged = Blob.from_string("unchanged\n")
+        tree1 = Tree()
+        tree1.add(0644, "removed.txt", removed.id)
+        tree1.add(0644, "changed.txt", changed1.id)
+        tree1.add(0644, "unchanged.txt", unchanged.id)
+        tree2 = Tree()
+        tree2.add(0644, "added.txt", added.id)
+        tree2.add(0644, "changed.txt", changed2.id)
+        tree2.add(0644, "unchanged.txt", unchanged.id)
+        store.add_objects([(o, None) for o in [
+            tree1, tree2, added, removed, changed1, changed2, unchanged]])
+        write_tree_diff(f, store, tree1.id, tree2.id)
+        self.assertEquals([
+            'diff --git /dev/null b/added.txt',
+            'new mode 644',
+            'index e69de29..76d4bb8 644',
+            '--- /dev/null',
+            '+++ b/added.txt',
+            '@@ -1,0 +1,1 @@',
+            '+add',
+            'diff --git a/changed.txt b/changed.txt',
+            'index bf84e48..1be2436 644',
+            '--- a/changed.txt',
+            '+++ b/changed.txt',
+            '@@ -1,2 +1,2 @@',
+            ' unchanged',
+            '-removed',
+            '+added',
+            'diff --git a/removed.txt /dev/null',
+            'deleted mode 644',
+            'index 2c3f0b3..e69de29',
+            '--- a/removed.txt',
+            '+++ /dev/null',
+            '@@ -1,1 +1,0 @@',
+            '-removed',
+            ], f.getvalue().splitlines())

+ 34 - 17
dulwich/tests/test_repository.py

@@ -66,24 +66,35 @@ class CreateRepositoryTests(TestCase):
             finally:
                 f.close()
 
-    def _check_repo_contents(self, repo):
-        self.assertTrue(repo.bare)
+    def _check_repo_contents(self, repo, expect_bare):
+        self.assertEquals(expect_bare, repo.bare)
         self.assertFileContentsEqual('Unnamed repository', repo, 'description')
         self.assertFileContentsEqual('', repo, os.path.join('info', 'exclude'))
         self.assertFileContentsEqual(None, repo, 'nonexistent file')
+        barestr = 'bare = %s' % str(expect_bare).lower()
+        self.assertTrue(barestr in repo.get_named_file('config').read())
 
-    def test_create_disk(self):
+    def test_create_disk_bare(self):
         tmp_dir = tempfile.mkdtemp()
         try:
             repo = Repo.init_bare(tmp_dir)
             self.assertEquals(tmp_dir, repo._controldir)
-            self._check_repo_contents(repo)
+            self._check_repo_contents(repo, True)
+        finally:
+            shutil.rmtree(tmp_dir)
+
+    def test_create_disk_non_bare(self):
+        tmp_dir = tempfile.mkdtemp()
+        try:
+            repo = Repo.init(tmp_dir)
+            self.assertEquals(os.path.join(tmp_dir, '.git'), repo._controldir)
+            self._check_repo_contents(repo, False)
         finally:
             shutil.rmtree(tmp_dir)
 
     def test_create_memory(self):
         repo = MemoryRepo.init_bare([], {})
-        self._check_repo_contents(repo)
+        self._check_repo_contents(repo, True)
 
 
 class RepositoryTests(TestCase):
@@ -409,6 +420,16 @@ class BuildRepoTests(TestCase):
         tree = r[r[commit_sha].tree]
         self.assertEqual([], list(tree.iteritems()))
 
+    def test_commit_encoding(self):
+        r = self._repo
+        commit_sha = r.do_commit('commit with strange character \xee',
+             committer='Test Committer <test@nodomain.com>',
+             author='Test Author <test@nodomain.com>',
+             commit_timestamp=12395, commit_timezone=0,
+             author_timestamp=12395, author_timezone=0,
+             encoding="iso8859-1")
+        self.assertEquals("iso8859-1", r[commit_sha].encoding)
+
     def test_commit_fail_ref(self):
         r = self._repo
 
@@ -596,18 +617,13 @@ class RefsContainerTests(object):
                          self._refs['refs/heads/symbolic'])
 
     def test_check_refname(self):
-        try:
-            self._refs._check_refname('HEAD')
-        except KeyError:
-            self.fail()
-
-        try:
-            self._refs._check_refname('refs/heads/foo')
-        except KeyError:
-            self.fail()
+        self._refs._check_refname('HEAD')
+        self._refs._check_refname('refs/heads/foo')
 
-        self.assertRaises(KeyError, self._refs._check_refname, 'refs')
-        self.assertRaises(KeyError, self._refs._check_refname, 'notrefs/foo')
+        self.assertRaises(errors.RefFormatError, self._refs._check_refname,
+                          'refs')
+        self.assertRaises(errors.RefFormatError, self._refs._check_refname,
+                          'notrefs/foo')
 
     def test_contains(self):
         self.assertTrue('refs/heads/master' in self._refs)
@@ -732,7 +748,8 @@ class DiskRefsContainerTests(RefsContainerTests, TestCase):
         self.assertEquals(
           ('refs/heads/master', '42d06bd4b77fed026b154d16493e5deab78f02ec'),
           self._refs._follow('refs/heads/master'))
-        self.assertRaises(KeyError, self._refs._follow, 'notrefs/foo')
+        self.assertRaises(errors.RefFormatError, self._refs._follow,
+                          'notrefs/foo')
         self.assertRaises(KeyError, self._refs._follow, 'refs/heads/loop')
 
     def test_delitem(self):

+ 55 - 0
dulwich/tests/test_server.py

@@ -18,22 +18,30 @@
 
 """Tests for the smart protocol server."""
 
+from cStringIO import StringIO
+import os
+import tempfile
 
 from dulwich.errors import (
     GitProtocolError,
+    NotGitRepository,
     UnexpectedCommandError,
     )
 from dulwich.repo import (
     MemoryRepo,
+    Repo,
     )
 from dulwich.server import (
     Backend,
     DictBackend,
+    FileSystemBackend,
     Handler,
     MultiAckGraphWalkerImpl,
     MultiAckDetailedGraphWalkerImpl,
     _split_proto_line,
+    serve_command,
     ProtocolGraphWalker,
+    ReceivePackHandler,
     SingleAckGraphWalkerImpl,
     UploadPackHandler,
     )
@@ -638,3 +646,50 @@ class MultiAckDetailedGraphWalkerImplTestCase(AckGraphWalkerImplTestCase):
 
         self.assertNextEquals(None)
         self.assertNak()
+
+
+class FileSystemBackendTests(TestCase):
+    """Tests for FileSystemBackend."""
+
+    def setUp(self):
+        super(FileSystemBackendTests, self).setUp()
+        self.path = tempfile.mkdtemp()
+        self.repo = Repo.init(self.path)
+        self.backend = FileSystemBackend()
+
+    def test_nonexistant(self):
+        self.assertRaises(NotGitRepository,
+            self.backend.open_repository, "/does/not/exist/unless/foo")
+
+    def test_absolute(self):
+        repo = self.backend.open_repository(self.path)
+        self.assertEquals(repo.path, self.repo.path)
+
+    def test_child(self):
+        self.assertRaises(NotGitRepository,
+            self.backend.open_repository, os.path.join(self.path, "foo"))
+
+
+class ServeCommandTests(TestCase):
+    """Tests for serve_command."""
+
+    def setUp(self):
+        super(ServeCommandTests, self).setUp()
+        self.backend = DictBackend({})
+
+    def serve_command(self, handler_cls, args, inf, outf):
+        return serve_command(handler_cls, ["test"] + args, backend=self.backend,
+            inf=inf, outf=outf)
+
+    def test_receive_pack(self):
+        commit = make_commit(id=ONE, parents=[], commit_time=111)
+        self.backend.repos["/"] = MemoryRepo.init_bare(
+            [commit], {"refs/heads/master": commit.id})
+        outf = StringIO()
+        exitcode = self.serve_command(ReceivePackHandler, ["/"], StringIO("0000"), outf)
+        outlines = outf.getvalue().splitlines()
+        self.assertEquals(2, len(outlines))
+        self.assertEquals("1111111111111111111111111111111111111111 refs/heads/master",
+            outlines[0][4:].split("\x00")[0])
+        self.assertEquals("0000", outlines[-1])
+        self.assertEquals(0, exitcode)

+ 0 - 1
dulwich/tests/test_web.py

@@ -30,7 +30,6 @@ from dulwich.objects import (
     )
 from dulwich.repo import (
     BaseRepo,
-    DictRefsContainer,
     MemoryRepo,
     )
 from dulwich.server import (

+ 42 - 0
dulwich/tests/utils.py

@@ -25,12 +25,16 @@ import os
 import shutil
 import tempfile
 import time
+import types
 
 from dulwich.objects import (
     FixedSha,
     Commit,
     )
 from dulwich.repo import Repo
+from dulwich.tests import (
+    TestSkipped,
+    )
 
 
 def open_repo(name):
@@ -105,3 +109,41 @@ def make_commit(**attrs):
                  'tree': '0' * 40}
     all_attrs.update(attrs)
     return make_object(Commit, **all_attrs)
+
+
+def functest_builder(method, func):
+    """Generate a test method that tests the given function."""
+
+    def do_test(self):
+        method(self, func)
+
+    return do_test
+
+
+def ext_functest_builder(method, func):
+    """Generate a test method that tests the given extension function.
+
+    This is intended to generate test methods that test both a pure-Python
+    version and an extension version using common test code. The extension test
+    will raise TestSkipped if the extension is not found.
+
+    Sample usage:
+
+    class MyTest(TestCase):
+        def _do_some_test(self, func_impl):
+            self.assertEqual('foo', func_impl())
+
+        test_foo = functest_builder(_do_some_test, foo_py)
+        test_foo_extension = ext_functest_builder(_do_some_test, _foo_c)
+
+    :param method: The method to run. It must take two parameters, self and the
+        function implementation to test.
+    :param func: The function implementation to pass to method.
+    """
+
+    def do_test(self):
+        if not isinstance(func, types.BuiltinFunctionType):
+            raise TestSkipped("%s extension not found" % func.func_name)
+        method(self, func)
+
+    return do_test

+ 1 - 1
dulwich/web.py

@@ -27,7 +27,7 @@ import time
 try:
     from urlparse import parse_qs
 except ImportError:
-    from dulwich.misc import parse_qs
+    from dulwich._compat import parse_qs
 from dulwich import log_utils
 from dulwich.protocol import (
     ReceivableProtocol,

+ 2 - 0
setup.py

@@ -54,6 +54,8 @@ setup(name='dulwich',
                     include_dirs=include_dirs),
           Extension('dulwich._pack', ['dulwich/_pack.c'],
               include_dirs=include_dirs),
+          Extension('dulwich._diff_tree', ['dulwich/_diff_tree.c'],
+              include_dirs=include_dirs),
           ],
       distclass=DulwichDistribution,
       )