#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

void xxtea128(uint32_t const key[4], uint32_t block[4]) {
	// XXTEA encryption of a single 128-bit block (4 words), in place,
	// under a 128-bit key.
	//
	// The round count follows the XXTEA rule 6 + 52/n with n = 4 words,
	// i.e. 6 + 13 = 19. The weaknesses Wikipedia lists both rely on
	// only 6 rounds being run per block, which is not the case here.
	//
	// Every round:
	//
	// 1. The running sum advances by 0x9e3779b9, so in round r
	//    (counting from 1) it equals r * 0x9e3779b9 mod 2^32.
	//
	// 2. Bits 3…2 of the sum form a small selector (e). It is xored
	//    with the word position to pick a key word, so the key access
	//    pattern changes from round to round even though the key has
	//    only 4 words.
	//
	// 3. Each word in turn is incremented by a mix of its neighbours:
	//    z is the previous word and y the next one, both wrapping
	//    around the block ends. Words are updated in order, so z has
	//    already been updated this round (except for word 0, whose z
	//    is word 3 from the previous round) and for the last word y
	//    has as well.
	//
	// The mix is (shift/xor combination of z and y) xor
	// ((key word ^ z) + (sum ^ y)). Operands are ordered so that the
	// z-dependent part is always on the left and the y-dependent part
	// on the right.

	uint32_t sum = 0;
	for (unsigned remaining = 19; remaining > 0; remaining--) {
		sum += 0x9e3779b9;
		uint32_t const e = (sum >> 2) & 3;

		for (unsigned p = 0; p < 4; p++) {
			uint32_t const z = block[(p + 3) & 3];
			uint32_t const y = block[(p + 1) & 3];

			uint32_t const neighbours = (z>>5 ^ y<<2) + (z<<4 ^ y>>3);
			uint32_t const keyed = (key[e ^ p] ^ z) + (sum ^ y);

			block[p] += neighbours ^ keyed;
		}
	}
}

uint32_t bytes2word(unsigned char const bytes[4]) {
	// Assemble a 32-bit word from four bytes, least significant byte
	// first (bytes[0] ends up in bits 7…0, bytes[3] in bits 31…24).
	//
	// Each byte is widened to uint32_t before it is shifted. Without
	// the cast an unsigned char promotes to (signed) int, and shifting
	// a value of 0x80 or more left by 24 overflows int, which is
	// undefined behaviour.
	return (uint32_t)bytes[0]
	     | (uint32_t)bytes[1] << 8
	     | (uint32_t)bytes[2] << 16
	     | (uint32_t)bytes[3] << 24;
}

void word2bytes(unsigned char *bytes, uint32_t word) {
	// Store a 32-bit word into bytes[0..3], least significant byte
	// first. The narrowing cast keeps only the low 8 bits of each
	// shifted value.
	for (unsigned i = 0; i < 4; i++) {
		bytes[i] = (unsigned char)(word >> (8 * i));
	}
}

// Running state of the MDC-2 style hash built on xxtea128. It is set up
// by initialize_hash, advanced by feed_hash (which calls compress_hash
// for every full block) and consumed and wiped by finalize_hash.
struct hashstate {
	// A_n and B_n of the MDC-2 algorithm. compress_hash uses each of
	// them as an xxtea128 key and then replaces them with the next
	// chaining values.
	uint32_t a[4];
	uint32_t b[4];
	// Buffer to hold data until next full block (16 bytes)
	unsigned char buffer[16];
	// Number of bytes currently held in buffer; reset to 0 by
	// compress_hash once a block has been consumed
	size_t length;
	// Counter that keeps track of how much data we've hashed, in bits
	// (feed_hash adds 8 per input byte)
	uint64_t totalbits;
};

void initialize_hash(struct hashstate *state) {
	// Reset the state to the start of a new hash computation.
	//
	// The hash is MDC-2 built on xxtea128, giving a 256 bit digest.
	// The initial chaining values are taken from the binary expansion
	// of the square root of two: A1 is the first 128 bits and B1 the
	// following 128 bits.
	//
	// Written as 128 bit numbers, most significant word first:
	//
	// A1 = 6a09e667 f3bcc908 b2fb1366 ea957d3e
	// B1 = 3adec175 12775099 da2f590b 0667322a
	//
	// The words are stored least significant word first, so index 0
	// holds the rightmost word of each line above.
	static uint32_t const initial_a[4] = {
		0xea957d3eUL, 0xb2fb1366UL, 0xf3bcc908UL, 0x6a09e667UL,
	};
	static uint32_t const initial_b[4] = {
		0x0667322aUL, 0xda2f590bUL, 0x12775099UL, 0x3adec175UL,
	};

	memcpy(state->a, initial_a, sizeof(state->a));
	memcpy(state->b, initial_b, sizeof(state->b));

	// Nothing buffered and nothing hashed yet
	state->totalbits = 0;
	state->length = 0;
	memset(state->buffer, 0, sizeof(state->buffer));
}

void compress_hash(struct hashstate *state) {
	// MDC-2 compression step: absorb the full 16-byte block held in
	// state->buffer into the chaining values A and B, then mark the
	// buffer as empty. Must only be called with a full buffer.
	assert(state->length == 16);

	// M_i: the buffered block as four little-endian words
	uint32_t message[4];
	for (size_t i = 0; i < 4; i++) {
		message[i] = bytes2word(&state->buffer[4 * i]);
	}

	// V_i = M_i ^ E(M_i, A_i)
	// W_i = M_i ^ E(M_i, B_i)
	// Note: A_i and B_i are the cipher *keys* here; the message block
	// is the plaintext.
	uint32_t v[4];
	uint32_t w[4];
	memcpy(v, message, sizeof(v));
	memcpy(w, message, sizeof(w));
	xxtea128(state->a, v);
	xxtea128(state->b, w);
	for (size_t i = 0; i < 4; i++) {
		v[i] ^= message[i];
		w[i] ^= message[i];
	}

	// Swap halves between the two results:
	// A_{i+1} = V_i^L || W_i^R
	// B_{i+1} = W_i^L || V_i^R
	for (size_t i = 0; i < 2; i++) {
		state->a[i] = v[i];
		state->b[i] = w[i];
	}
	for (size_t i = 2; i < 4; i++) {
		state->a[i] = w[i];
		state->b[i] = v[i];
	}

	// The buffer has been consumed
	state->length = 0;
}

void feed_hash(struct hashstate *state, unsigned char const input[], size_t length) {
	// Absorb `length` bytes from `input` into the hash state. May be
	// called any number of times between initialize_hash and
	// finalize_hash. The input is only read, hence the const
	// qualifier; callers may pass read-only data.
	//
	// Invariant: The buffer will be filled somewhere between 0 and 15
	// when we enter this loop. This is because once it reaches 16, the
	// hash compression function is executed, which empties it again.
	for (size_t i = 0; i < length; i++) {
		// Must not overflow the internal bit counter. In practice we
		// will not hit this.
		assert(state->totalbits <= UINT64_MAX - 8);

		state->buffer[state->length++] = input[i];
		state->totalbits += 8;

		if (state->length == 16) {
			compress_hash(state);
		}
	}
}

void finalize_hash(struct hashstate *state, unsigned char hash[32]) {
	// Pad the message, run the remaining compressions, write the 32
	// byte digest to `hash` and wipe the state. The state must be
	// re-initialized before it is used again.
	//
	// Padding is a single one-bit (the byte 0x80), then zero bits,
	// then the message length in bits as a big-endian uint64 filling
	// the last 8 bytes of a block. This is the same padding as in
	// SHA-2.

	// On entry the buffer holds between 0 and 15 bytes, so there is
	// always room for the 0x80 byte
	state->buffer[state->length++] = 0x80;

	// Zero-fill until exactly 8 bytes of the current block are used.
	// If the fill is already past 8, the block is padded to the end
	// and compressed, and the filling continues in a fresh block.
	for (;;) {
		if (state->length == 8) {
			break;
		}
		if (state->length == 16) {
			compress_hash(state);
		}
		state->buffer[state->length++] = 0;
	}

	// Append the bit count, most significant byte first, which
	// completes the block, and do one last compression
	uint64_t const bits = state->totalbits;
	for (unsigned i = 0; i < 8; i++) {
		state->buffer[8 + i] = (unsigned char)(bits >> (56 - 8 * i));
	}
	state->length = 16;
	compress_hash(state);

	// The digest is A followed by B, each word stored least
	// significant byte first
	for (size_t i = 0; i < 4; i++) {
		word2bytes(&hash[4 * i], state->a[i]);
		word2bytes(&hash[4 * i + 16], state->b[i]);
	}

	// Clear all of the hash state, in case there was something
	// important there
	explicit_bzero(state, sizeof(*state));
}

void hmac(unsigned char output[32], unsigned char key[], size_t keylen, unsigned char message[], size_t messagelen) {
	// HMAC over the 256 bit hash defined in this file.
	//
	// output:     receives the 32 byte authentication tag
	// key:        keylen bytes of key material; may be NULL when
	//             keylen is 0
	// message:    messagelen bytes of data to authenticate
	//
	// The blocksize of the underlying hash function is 128 bits (16B)
	// but HMAC is specified assuming that the hash function output (in
	// our case 256 bits or 32B) fits in one block. As far as I can
	// tell extending the key to be two blocks long is not a problem.
	//
	// NOTE(review): padded_key, the derived inner/outer keys and the
	// inner hash stay on the stack after returning; consider wiping
	// them the way finalize_hash wipes the hash state.

	unsigned char padded_key[32];
	if (keylen > 16) {
		// We hash it even if it is shorter than our extended key
		// length to avoid giving attacker any funny surfaces to
		// play with at the interface of two blocks
		struct hashstate state;
		initialize_hash(&state);
		feed_hash(&state, key, keylen);
		finalize_hash(&state, padded_key);
	} else {
		// Copy the key and zero-pad if necessary. memcpy requires
		// valid pointers even for a zero-length copy, so an empty
		// key (which may be passed as NULL) skips the copy.
		memset(padded_key, 0, sizeof(padded_key));
		if (keylen > 0) {
			memcpy(padded_key, key, keylen);
		}
	}

	// Outer and inner key derivation
	unsigned char outer_key[32], inner_key[32];
	for (size_t i = 0; i < 32; i++) {
		outer_key[i] = padded_key[i] ^ 0x5c;
		inner_key[i] = padded_key[i] ^ 0x36;
	}

	// Inner hash: H(inner_key || message)
	unsigned char inner_hash[32];
	struct hashstate state;
	initialize_hash(&state);
	feed_hash(&state, inner_key, 32);
	feed_hash(&state, message, messagelen);
	finalize_hash(&state, inner_hash);

	// Outer hash: H(outer_key || inner_hash)
	initialize_hash(&state);
	feed_hash(&state, outer_key, 32);
	feed_hash(&state, inner_hash, 32);
	finalize_hash(&state, output);
}

// Number of chained HMAC rounds, and of 32 byte slots in kdf_buf.
// KDF_ROUNDS must be at least 1 (checked at compile time).
#define KDF_ROUNDS 100000
#if KDF_ROUNDS < 1
#error "KDF_ROUNDS must be at least 1"
#endif
unsigned char kdf_buf[KDF_ROUNDS * 32];
void kdf(unsigned char key[16], unsigned char salt[32], unsigned char passphrase[], size_t passphraselen) {
	// Derive a 16 byte key from a passphrase and a 32 byte salt.
	//
	// This is based on the design of PBKDF2 but aims to be memory
	// hard. This is achieved by storing all the hashes in a buffer
	// and then in the end hashing them together in reverse order,
	// instead of just xoring them together.
	//
	// The memory-hardness of this scheme rests on the assumption that
	// it is not feasible to compute the final hash backwards, that is,
	// starting with the first hash and working towards the final hash.
	// While I cannot prove this to be the case, the fact that our hash
	// is made out of a one-way compression function makes me
	// relatively confident in it.
	//
	// Note that the work buffer kdf_buf is a global, so this function
	// is not reentrant, and the intermediate hashes remain in it after
	// the call returns.

	// Place the hash of the salt at the top of the buffer. We do not
	// include the counter i from PBKDF2 since we will only ever
	// produce one block of output
	size_t index = KDF_ROUNDS*32 - 32;
	hmac(&kdf_buf[index], passphrase, passphraselen, salt, 32);

	// Walk back along the buffer, at each step hashing the previous
	// hash into the slot below it. The index is decremented only after
	// checking that it is non-zero, so it can never wrap around, even
	// when the buffer has a single slot.
	while (index > 0) {
		index -= 32;
		hmac(&kdf_buf[index], passphrase, passphraselen, &kdf_buf[index+32], 32);
	}

	// Perform the final hash over the whole buffer
	unsigned char final_hash[32];
	hmac(final_hash, passphrase, passphraselen, kdf_buf, KDF_ROUNDS * 32);

	// Use first 128 bits of final hash as the key
	memcpy(key, final_hash, 16);
}

int main(void) {
	// Demo: derive a key from a fixed passphrase and salt and print it
	// as 16 space-separated hex bytes followed by a newline.
	unsigned char passphrase[] = "a quick brown fox jumps over the lazy dog";
	unsigned char salt[32] = "seasaltrocksalt seasaltrocksalt";
	unsigned char key[16] = {0};

	// The terminating NUL is not part of the passphrase
	kdf(key, salt, passphrase, sizeof(passphrase) - 1);

	size_t i = 0;
	while (i < sizeof(key)) {
		printf("%02hhx ", key[i]);
		i++;
	}
	printf("\n");

	return 0;
}