commit 92041f9fbf9a2f3383d1270af83f52a5c475cd0c
Author: Juhani Krekelä <juhani@krekelä.fi>
Date:   Sun Apr 4 23:05:31 2021 +0300

    First commit

diff --git a/puer.c b/puer.c
new file mode 100644
index 0000000..6667953
--- /dev/null
+++ b/puer.c
@@ -0,0 +1,214 @@
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+
+void xxtea128(uint32_t const key[4], uint32_t block[4]) {
+	// Encrypts block in place using the encryption half of XXTEA,
+	// specialised to a fixed block of 4 words (128 bits). key is only
+	// read, never written.
+	//
+	// XXTEA prescribes 6 + 52/n rounds for an n-word block; with n = 4
+	// and integer division that is 6 + 13 = 19 rounds. Per the original
+	// author's notes, the weaknesses listed on Wikipedia depend on the
+	// 6-round case, which a 4-word block never reaches.
+	//
+	// Naming follows the usual XXTEA description:
+	//   sum - running round constant, grows by 0x9e3779b9 every round
+	//   e   - bits 3..2 of sum; xored into the key index so that the
+	//         pattern of key word accesses changes between rounds
+	//   z   - the word before the one being updated (wrapping around)
+	//   y   - the word after the one being updated (wrapping around)
+	//
+	// Words are updated in index order within a round, so for words
+	// 1..3 the previous word has already been updated in this round,
+	// and for word 3 the next word (word 0) has been as well.
+
+	uint32_t sum = 0;
+	unsigned remaining = 19;
+
+	while (remaining-- > 0) {
+		sum += 0x9e3779b9;
+		unsigned e = (sum >> 2) & 3;
+
+		for (unsigned p = 0; p < 4; p++) {
+			uint32_t z = block[(p + 3) & 3];
+			uint32_t y = block[(p + 1) & 3];
+
+			// First sum: shifted copies of the neighbours mixed
+			// with each other. In both sums the part that depends
+			// on z is kept on the left and the part that depends
+			// on y on the right.
+			uint32_t neighbours = ((z >> 5) ^ (y << 2)) + ((z << 4) ^ (y >> 3));
+
+			// Second sum: this is where the key and the round
+			// constant enter.
+			uint32_t keyed = (key[e ^ p] ^ z) + (sum ^ y);
+
+			block[p] += neighbours ^ keyed;
+		}
+	}
+}
+
+uint32_t bytes2word(unsigned char const bytes[4]) {
+	// Assembles four bytes into a 32-bit word, least significant byte
+	// first (little-endian).
+	//
+	// Each byte is widened to uint32_t before shifting. Without that
+	// the operands are promoted to (signed) int, and shifting a byte
+	// >= 0x80 left by 24 would shift into the sign bit, which is
+	// undefined behaviour.
+	return (uint32_t)bytes[0]
+	     | (uint32_t)bytes[1] << 8
+	     | (uint32_t)bytes[2] << 16
+	     | (uint32_t)bytes[3] << 24;
+}
+
+void word2bytes(unsigned char *bytes, uint32_t word) {
+	// Stores word into bytes[0..3], least significant byte first
+	// (little-endian). This is the inverse of bytes2word.
+	for (unsigned i = 0; i < 4; i++) {
+		bytes[i] = (unsigned char)(word & 0xff);
+		word >>= 8;
+	}
+}
+
+void derive_subkey(uint32_t key[4], uint32_t nonce[6], uint32_t subkey[4]) {
+	// Derives a per-nonce subkey: the first four words (128 bits) of
+	// the 192-bit nonce are copied into subkey and then encrypted in
+	// place with xxtea128 under key. The last two nonce words are not
+	// read here; the design intends them to be used together with the
+	// subkey for the actual encryption.
+	//
+	// Design note from the original author: this extended-nonce scheme
+	// is, as far as they can tell, not a standard construction. It is
+	// modelled on xchacha20, on the understanding that it does not
+	// matter whether the derivation function is reversible, since an
+	// attacker does not know the original key and that key is used for
+	// nothing but this derivation.
+
+	for (unsigned i = 0; i < 4; i++) {
+		subkey[i] = nonce[i];
+	}
+
+	xxtea128(key, subkey);
+}
+
+// Running state of the MDC-2 based hash. Set up by initialize_hash,
+// advanced by feed_hash and compress_hash, consumed by finalize_hash.
+struct hashstate {
+	// A_n and B_n of the MDC-2 algorithm (the two 128-bit chaining
+	// values, four 32-bit words each)
+	uint32_t a[4];
+	uint32_t b[4];
+	// Buffer to hold data until the next full 16-byte block
+	unsigned char buffer[16];
+	// Number of bytes currently held in buffer
+	size_t length;
+	// Counter that keeps track of how much data we've hashed, in bits
+	uint64_t totalbits;
+};
+
+void initialize_hash(struct hashstate *state) {
+	// Resets state so that a new message can be hashed.
+	//
+	// The hash is MDC-2 built on xxtea128, which yields a 256 bit
+	// digest. The initial chaining values come from the binary
+	// expansion of the square root of two: A1 is the first 128 bits
+	// and B1 the following 128 bits.
+	//
+	//   A1 = 6a09e667 f3bcc908 b2fb1366 ea957d3e
+	//   B1 = 3adec175 12775099 da2f590b 0667322a
+	//
+	// Each value is stored as a little-endian sequence of words, i.e.
+	// index 0 holds the least significant 32 bits and index 3 the
+	// most significant 32 bits, hence the reversed order below.
+	static const uint32_t initial_a[4] = {
+		0xea957d3eUL, 0xb2fb1366UL, 0xf3bcc908UL, 0x6a09e667UL
+	};
+	static const uint32_t initial_b[4] = {
+		0x0667322aUL, 0xda2f590bUL, 0x12775099UL, 0x3adec175UL
+	};
+
+	for (size_t i = 0; i < 4; i++) {
+		state->a[i] = initial_a[i];
+		state->b[i] = initial_b[i];
+	}
+
+	// Nothing buffered and nothing hashed yet
+	state->totalbits = 0;
+	state->length = 0;
+	memset(state->buffer, 0, sizeof(state->buffer));
+}
+
+void compress_hash(struct hashstate *state) {
+	// Runs one MDC-2 compression step over the 16 buffered bytes,
+	// replacing the chaining values state->a and state->b, and then
+	// marks the buffer as empty. The buffer must be exactly full.
+	//
+	// NOTE(review): the message block is what gets passed as the
+	// xxtea128 key, and the chaining value is the data that gets
+	// encrypted. Confirm this is the intended orientation for MDC-2.
+	assert(state->length == 16);
+
+	// M_i: the buffered block as four little-endian words
+	uint32_t m[4];
+	for (size_t i = 0; i < 4; i++) {
+		m[i] = bytes2word(&state->buffer[4 * i]);
+	}
+
+	// Work on copies of A_i and B_i; they are encrypted in place
+	uint32_t v[4], w[4];
+	for (size_t i = 0; i < 4; i++) {
+		v[i] = state->a[i];
+		w[i] = state->b[i];
+	}
+
+	// V_i = M_i ^ E(M_i, A_i)
+	// W_i = M_i ^ E(M_i, B_i)
+	xxtea128(m, v);
+	xxtea128(m, w);
+	for (size_t i = 0; i < 4; i++) {
+		v[i] ^= m[i];
+		w[i] ^= m[i];
+	}
+
+	// Swap the right halves (words 2 and 3) between the two results:
+	// A_{i+1} = V_i^L || W_i^R
+	// B_{i+1} = W_i^L || V_i^R
+	for (size_t i = 0; i < 4; i++) {
+		int left = i < 2;
+		state->a[i] = left ? v[i] : w[i];
+		state->b[i] = left ? w[i] : v[i];
+	}
+
+	// Mark that we have consumed the buffer
+	state->length = 0;
+}
+
+void feed_hash(struct hashstate *state, unsigned char input[], size_t length) {
+	// Absorbs length bytes from input into the hash. Bytes are
+	// collected in state->buffer and every time 16 of them have
+	// accumulated the compression function is run. May be called any
+	// number of times between initialize_hash and finalize_hash.
+	//
+	// Invariant: The buffer will be filled somewhere between 0 and 15
+	// when we enter this loop. This is because once it reaches 16, the
+	// hash compression function is executed.
+	//
+	// The index has to start at 0 explicitly: an uninitialized
+	// automatic variable has an indeterminate value, so without the
+	// initializer the loop could skip input or read out of bounds.
+	for (size_t i = 0; i < length; i++) {
+		// Must not overflow the internal counter. In practice we will
+		// not hit this.
+		assert(state->totalbits <= UINT64_MAX - 8);
+
+		state->buffer[state->length++] = input[i];
+		state->totalbits += 8;
+
+		if (state->length == 16) {
+			compress_hash(state);
+		}
+	}
+}
+
+void finalize_hash(struct hashstate *state, unsigned char hash[32]) {
+	// Pads the message, runs the final compression(s) and writes the
+	// 256 bit digest to hash. The state is consumed by this; call
+	// initialize_hash before hashing another message.
+	//
+	// The padding consists of a one-bit, followed by zero-bits,
+	// followed by the number of bits in the message as a big-endian
+	// uint64, such that the length field ends exactly on a 16-byte
+	// block boundary. This is the same padding scheme as in SHA-2.
+
+	// We can assume that this works due to the invariant that buffer
+	// fill when entering this function is between 0 and 15
+	state->buffer[state->length++] = 0x80;
+
+	// Zero-fill until exactly 8 bytes are left in the block for the
+	// length field. If the 0x80 byte landed past offset 7 there is no
+	// room in this block: it is filled up with zeros, compressed, and
+	// the zero-fill continues in a fresh block.
+	while (state->length != 8) {
+		if (state->length == 16) {
+			compress_hash(state);
+		}
+		state->buffer[state->length++] = 0;
+	}
+
+	// Add the number of bits, most significant byte first, into the
+	// last 8 bytes of the block (offsets 8..15), and do one last
+	// compression. The offsets are spelled out explicitly so that no
+	// write can land outside the 16-byte buffer.
+	for (size_t i = 0; i < 8; i++) {
+		state->buffer[8 + i] = (unsigned char)((state->totalbits >> (56 - 8 * i)) & 0xff);
+	}
+	state->length = 16;
+	compress_hash(state);
+
+	// Extract the hash state: the four words of A followed by the four
+	// words of B, each word stored least significant byte first
+	for (size_t i = 0; i < 4; i++) {
+		word2bytes(&hash[i*4], state->a[i]);
+		word2bytes(&hash[i*4 + 16], state->b[i]);
+	}
+}