diff --git a/puer.c b/puer.c
index c204237..47e1d0b 100644
--- a/puer.c
+++ b/puer.c
@@ -11,8 +11,10 @@
 #include <unistd.h>
 
 // Adjusting this will render the file format incompatible
-// The minimum possible buffer size is 64
-unsigned char workbuf[8 * 1024 * 1024];
+// KDF_WORKFACTOR must be a power of two between 1 and 2^32
+#define KDF_BLOCKSIZE 1024
+#define KDF_WORKFACTOR (64 * 1024)
+unsigned char workbuf[KDF_WORKFACTOR * KDF_BLOCKSIZE];
 
 void xxtea128(uint32_t const key[4], uint32_t block[4]) {
 	// Encryption half of the XXTEA algorithm, with block size limited
@@ -229,25 +231,36 @@ void finalize_hash(struct hashstate *state, unsigned char hash[32]) {
 	explicit_bzero(state, sizeof(struct hashstate));
 }
 
-void hmac(unsigned char output[32], unsigned char key[], size_t keylen, unsigned char message[], size_t messagelen) {
-	// The blocksize of the underlying has function is 128 bits (16B)
+void pbkdf2_1_block(unsigned char output[32], unsigned char passphrase[], size_t passphraselen, unsigned char salt[], size_t saltlen, uint32_t blockindex) {
+	// NOTE: This implementation is hardcoded to one round, as required
+	// by the MFcrypt algorithm (see "Stronger Key Derivation via
+	// Sequential Memory-Hard Functions" by Colin Percival). It is not
+	// suitable as a general-purpose password-based KDF.
+
+	// This is equivalent to
+	//     F(Password, Salt, 1, i)
+	//   = U_1
+	//   = PRF(Password, Salt || INT_32_BE(i))
+	// We use HMAC-MDC2-XXTEA128 as our PRF
+
+	// The blocksize of the underlying hash function is 128 bits (16B)
 	// but HMAC is specified assuming that the hash function output (in
 	// our case 256 bits or 32B) fits in one block. As far as I can
 	// tell extending the key to be two blocks long is not a problem.
 
 	unsigned char padded_key[32];
-	if (keylen > 16) {
+	if (passphraselen > 16) {
 		// We hash it even if it is shorter than our extended key
 		// length to avoid giving attacker any funny surfaces to
 		// play with at the interface of two blocks
 		struct hashstate state;
 		initialize_hash(&state);
-		feed_hash(&state, key, keylen);
+		feed_hash(&state, passphrase, passphraselen);
 		finalize_hash(&state, padded_key);
 	} else {
 		// Copy the key and zero-pad if necessary
 		memset(padded_key, 0, 32);
-		memcpy(padded_key, key, keylen);
+		memcpy(padded_key, passphrase, passphraselen);
 	}
 
 	// Outer and inner key derivation
@@ -262,7 +275,14 @@ void hmac(unsigned char output[32], unsigned char key[], size_t keylen, unsigned
 	struct hashstate state;
 	initialize_hash(&state);
 	feed_hash(&state, inner_key, 32);
-	feed_hash(&state, message, messagelen);
+	// Our message is salt plus big endian encoding of blockindex
+	feed_hash(&state, salt, saltlen);
+	unsigned char be_blockindex[4];
+	be_blockindex[0] = blockindex >> 24;
+	be_blockindex[1] = blockindex >> 16;
+	be_blockindex[2] = blockindex >> 8;
+	be_blockindex[3] = blockindex;
+	feed_hash(&state, be_blockindex, 4);
 	finalize_hash(&state, inner_hash);
 
 	// Outer hash
@@ -272,45 +292,87 @@ void hmac(unsigned char output[32], unsigned char key[], size_t keylen, unsigned
 	finalize_hash(&state, output);
 }
 
-#define KDF_ROUNDS (sizeof(workbuf) / 32)
+void mfcrypt_hash(unsigned char chunk[16]) {
+	uint32_t state[4], selfkey[4]; // chunk is encrypted under itself
+	block2words(state, chunk);
+	block2words(selfkey, chunk);
+	xxtea128(selfkey, state);
+	words2block(chunk, state);
+}
 
-void kdf(unsigned char key[16], unsigned char salt[32], unsigned char passphrase[], size_t passphraselen) {
-	// This is based on the design of PBKDF2 but aims to be memory hard
-	// This is achieved by storing all the hashes in a buffer and the
-	// in the end hashing them together in reverse order, instead of
-	// just xoring together.
-	//
-	// The memory-hardness of this scheme rests of the assumption that
-	// it is not feasible to compute the final hash backwards, that is,
-	// starting with the first hash and working towards the final hash.
-	// While I cannot prove this to be the case, the fact that our hash
-	// is made out of a one-way compression function makes me
-	// relatively confident in it.
+void blockmix(unsigned char block[KDF_BLOCKSIZE]) {
+	// r = KDF_BLOCKSIZE / 32, since block is 2r times the width of our
+	// hash function (xxtea128)
+	const size_t r = KDF_BLOCKSIZE / 32;
 
-	// Place the hash of the salt at the top of the buffer. We do not
-	// include the counter i from PBKDF2 since we will ever only
-	// produce one block of output
-	size_t index = KDF_ROUNDS*32 - 32;
-	hmac(&workbuf[index], passphrase, passphraselen, salt, 32);
-	index -= 32;
+	// accumulator (X) starts off as chunk 2r-1. Chunk k is at memory
+	// location 16*k and is 16 bytes long. Substituting we get:
+	//   start = 16*(2*(KDF_BLOCKSIZE / 32) - 1)
+	//   start = 16*(KDF_BLOCKSIZE / 16 - 1)
+	//   start = KDF_BLOCKSIZE - 16
+	unsigned char accumulator[16];
+	memcpy(accumulator, &block[16 * (2*r - 1)], 16);
 
-	// Walk back along the buffer, at each step hashing the previous
-	// hashes
-	while (index > 0) {
-		hmac(&workbuf[index], passphrase, passphraselen, &workbuf[index+32], 32);
-		index -= 32;
+	// Chunk i is at memory location 16*i. We go through chunks < 2r
+	unsigned char hashedchunks[KDF_BLOCKSIZE];
+	for (size_t i = 0; i < 2*r; i++) {
+		// X = H(X xor B_i)
+		for (size_t index = 0; index < 16; index++) {
+			accumulator[index] ^= block[16 * i + index];
+		}
+		mfcrypt_hash(accumulator);
+		// Y_i = X
+		memcpy(&hashedchunks[16 * i], accumulator, 16);
 	}
-	hmac(workbuf, passphrase, passphraselen, &workbuf[32], 32);
 
-	// Perform the final hash
-	unsigned char final_hash[32];
-	hmac(final_hash, passphrase, passphraselen, workbuf, KDF_ROUNDS * 32);
+	// Shuffle the hashed chunks back into the block. Even-numbered Y
+	// chunks fill B's chunks < r, which are the 16-byte slots below
+	// offset 16*(KDF_BLOCKSIZE / 32) = KDF_BLOCKSIZE / 2
+	size_t i = 0;
+	for (; i < r; i++) {
+		// B_i = Y_{2*i}
+		memcpy(&block[16*i], &hashedchunks[16*2*i], 16);
+	}
+	// Odd-numbered Y chunks fill B's chunks >= r and < 2r
+	for (; i < 2*r; i++) {
+		// B_i = Y_{2*(i - r) + 1}
+		memcpy(&block[16*i], &hashedchunks[16*(2*(i - r) + 1)], 16);
+	}
+}
 
-	// Use first 128 bits of final hash as the key
-	memcpy(key, final_hash, 16);
+void romix(unsigned char block[KDF_BLOCKSIZE]) {
+	// Block i starts at location KDF_BLOCKSIZE * i
+	for (size_t i = 0; i < KDF_WORKFACTOR; i++) {
+		// V_i = X
+		memcpy(&workbuf[KDF_BLOCKSIZE * i], block, KDF_BLOCKSIZE);
+		// X = H(X)
+		blockmix(block);
+	}
 
-	// Empty the buffer
-	explicit_bzero(workbuf, sizeof(workbuf));
+	for (size_t i = 0; i < sizeof(workbuf) / KDF_BLOCKSIZE; i++) {
+		// j = Integrify(X) mod N
+		// N is a power of two
+		uint32_t j = bytes2word(&block[KDF_BLOCKSIZE - 4]) & (KDF_WORKFACTOR - 1);
+		// X = H(X xor V_j)
+		for (size_t index = 0; index < KDF_BLOCKSIZE; index++) {
+			block[index] ^= workbuf[KDF_BLOCKSIZE * j + index];
+		}
+		blockmix(block);
+	}
+}
+
+void kdf(unsigned char key[16], unsigned char passphrase[], size_t passphraselen, unsigned char salt[32]) {
+	unsigned char block[KDF_BLOCKSIZE], result[32];
+	for (size_t i = 0; i < KDF_BLOCKSIZE / 32; i++) {
+		pbkdf2_1_block(&block[i * 32], passphrase, passphraselen, salt, 32, i);
+	}
+	romix(block);
+	pbkdf2_1_block(result, passphrase, passphraselen, block, KDF_BLOCKSIZE, 0);
+	// Key is the first 128 bits; wipe the passphrase-derived leftovers
+	memcpy(key, result, 16);
+	explicit_bzero(workbuf, sizeof(workbuf));
+	explicit_bzero(block, sizeof(block));
+	explicit_bzero(result, sizeof(result));
 }
 
 // 16 bit authentication tag
@@ -728,7 +790,7 @@ int main(int argc, char *argv[]) {
 
 	// Derive key
 	unsigned char key[16];
-	kdf(key, salt, passphrase, passphrase_len);
+	kdf(key, passphrase, passphrase_len, salt);
 	explicit_bzero(passphrase, sizeof(passphrase));
 
 	uint64_t messageindex = 0;