/*
 * puer.c
 *
 * Provenance: introduced in commit 92041f9fbf9a2f3383d1270af83f52a5c475cd0c
 * ("First commit") by Juhani Krekelä, Sun Apr 4 23:05:31 2021 +0300.
 *
 * Contents: the encryption half of XXTEA restricted to 128-bit blocks, a
 * subkey derivation built on it, and an MDC-2 style 256-bit hash that uses
 * it as the block cipher.
 */
#include <assert.h>
#include <stdint.h>
#include <string.h>

/*
 * One XXTEA mixing step: the amount to add to a word, computed from its
 * previous neighbour (z in the reference code), its next neighbour (y), the
 * selected key word and the round constant (sum).
 *
 * The function is a xor of two sums of xors. The first sum combines shifted
 * versions of the previous and next word; the second sum combines the
 * previous word with the key and the next word with the round constant.
 * Operands are ordered so that the part depending on the previous word is
 * always on the left and the part depending on the next word on the right.
 */
static uint32_t xxtea_mix(uint32_t prev, uint32_t next, uint32_t keyword, uint32_t roundconstant) {
	return (((prev >> 5) ^ (next << 2)) + ((prev << 4) ^ (next >> 3)))
	     ^ ((keyword ^ prev) + (roundconstant ^ next));
}

/*
 * Encrypt `block` in place with `key`.
 *
 * Encryption half of the XXTEA algorithm, with block size limited to 128
 * bits or 4 words. This avoids all the weaknesses that Wikipedia knows of,
 * since both depend on only running 6 rounds per block, and we will run
 * 6 + 52/4 = 6 + 13 = 19.
 */
void xxtea128(uint32_t const key[4], uint32_t block[4]) {
	uint32_t roundconstant = 0;
	for (unsigned round = 0; round < 19; round++) {
		// The algorithm is as follows:
		//
		// 1. Set the round constant (sum) to (round + 1) * 0x9e3779b9
		//    (implemented by addition, wrapping modulo 2^32)
		//
		// 2. Create a reduced version of the round constant (e),
		//    which is bits 3..2 of the round constant. The reduced
		//    version is needed for changing the pattern of key
		//    accesses, since the key is only 4 words long
		//
		// 3. Go through each word in the block and derive its new
		//    value based on the next (y) and previous word (z),
		//    wrapping around as needed, as well as the round
		//    constant(s) and the key. Words are updated in order, so
		//    each step sees the already-updated previous word.
		roundconstant += 0x9e3779b9;
		uint32_t reduced = (roundconstant >> 2) & 3;

		block[0] += xxtea_mix(block[3], block[1], key[reduced ^ 0], roundconstant);
		block[1] += xxtea_mix(block[0], block[2], key[reduced ^ 1], roundconstant);
		block[2] += xxtea_mix(block[1], block[3], key[reduced ^ 2], roundconstant);
		block[3] += xxtea_mix(block[2], block[0], key[reduced ^ 3], roundconstant);
	}
}

/*
 * Assemble a 32-bit word from 4 bytes, least significant byte first.
 *
 * Each byte is widened to uint32_t before shifting: shifting the promoted
 * (signed) int left by 24 would be undefined for bytes >= 0x80.
 */
uint32_t bytes2word(unsigned char const bytes[4]) {
	return (uint32_t)bytes[0]
	     | (uint32_t)bytes[1] << 8
	     | (uint32_t)bytes[2] << 16
	     | (uint32_t)bytes[3] << 24;
}

/*
 * Store a 32-bit word into 4 bytes, least significant byte first
 * (the inverse of bytes2word).
 */
void word2bytes(unsigned char *bytes, uint32_t word) {
	bytes[0] = (unsigned char)(word & 0xff);
	bytes[1] = (unsigned char)(word >> 8 & 0xff);
	bytes[2] = (unsigned char)(word >> 16 & 0xff);
	bytes[3] = (unsigned char)(word >> 24);
}

/*
 * Derive a subkey from `key` and the first four words of `nonce`.
 *
 * We are using an extended nonce construction with 192 bit nonces. The
 * first 128 bits of nonce are encrypted using xxtea128 with the provided
 * key, in order to derive a subkey that is then used alongside the
 * remaining nonce to do the actual encryption. Only nonce[0..3] are read
 * here.
 *
 * This is, as far as the original author could tell, not a standard
 * construction. It is based on xchacha20, with the understanding that it
 * should not matter if the function used to derive the subkey is reversible
 * or not, since an attacker doesn't know the original key and the original
 * key is used only in this derivation.
 */
void derive_subkey(uint32_t key[4], uint32_t nonce[6], uint32_t subkey[4]) {
	subkey[0] = nonce[0];
	subkey[1] = nonce[1];
	subkey[2] = nonce[2];
	subkey[3] = nonce[3];

	xxtea128(key, subkey);
}

struct hashstate {
	// A_n and B_n of the MDC-2 algorithm
	uint32_t a[4];
	uint32_t b[4];
	// Buffer to hold data until next full block
	unsigned char buffer[16];
	// Number of bytes of `buffer` currently filled
	size_t length;
	// Counter that keeps track of how much data we've hashed
	uint64_t totalbits;
};

/*
 * Reset `state` to the initial chaining values with an empty buffer.
 *
 * Hash function is MDC-2 with xxtea128, which is nice since it gives us a
 * 256 bit hash. The constants are based on binary expansion of the square
 * root of two, A1 being the first 128 bits and B1 the next 128.
 *
 * If we treat A1 and B1 as 128bit little endian integers, they have the
 * values:
 *
 *   A1 = 6a09e667 f3bcc908 b2fb1366 ea957d3e
 *   B1 = 3adec175 12775099 da2f590b 0667322a
 */
void initialize_hash(struct hashstate *state) {
	state->a[0] = 0xea957d3eUL;
	state->a[1] = 0xb2fb1366UL;
	state->a[2] = 0xf3bcc908UL;
	state->a[3] = 0x6a09e667UL;

	state->b[0] = 0x0667322aUL;
	state->b[1] = 0xda2f590bUL;
	state->b[2] = 0x12775099UL;
	state->b[3] = 0x3adec175UL;

	memset(state->buffer, 0, sizeof(state->buffer));
	state->length = 0;
	state->totalbits = 0;
}

/*
 * Absorb the full 16-byte buffer into the chaining values and mark the
 * buffer as empty. Must only be called when the buffer is full.
 */
void compress_hash(struct hashstate *state) {
	assert(state->length == 16);

	// M_i
	uint32_t message[4];
	message[0] = bytes2word(&state->buffer[0]);
	message[1] = bytes2word(&state->buffer[4]);
	message[2] = bytes2word(&state->buffer[8]);
	message[3] = bytes2word(&state->buffer[12]);

	// A_i, B_i
	uint32_t a[4], b[4];
	memcpy(a, state->a, sizeof(a));
	memcpy(b, state->b, sizeof(b));

	// V_i = M_i ^ E(M_i, A_i)
	// (the message block is the xxtea128 key, A_i is the data block)
	xxtea128(message, a);
	a[0] ^= message[0];
	a[1] ^= message[1];
	a[2] ^= message[2];
	a[3] ^= message[3];

	// W_i = M_i ^ E(M_i, B_i)
	xxtea128(message, b);
	b[0] ^= message[0];
	b[1] ^= message[1];
	b[2] ^= message[2];
	b[3] ^= message[3];

	// A_{i+1} = V_i^L || W_i^R
	state->a[0] = a[0];
	state->a[1] = a[1];
	state->a[2] = b[2];
	state->a[3] = b[3];

	// B_{i+1} = W_i^L || V_i^R
	state->b[0] = b[0];
	state->b[1] = b[1];
	state->b[2] = a[2];
	state->b[3] = a[3];

	// Mark that we have consumed the buffer
	state->length = 0;
}

/*
 * Feed `length` bytes from `input` into the hash.
 *
 * May be called any number of times between initialize_hash and
 * finalize_hash; splitting the input across calls does not change the
 * result.
 */
void feed_hash(struct hashstate *state, unsigned char input[], size_t length) {
	// Invariant: The buffer will be filled somewhere between 0 and 15
	// when we enter this loop. This is because once it reaches 16, the
	// hash compression function is executed.
	for (size_t i = 0; i < length; i++) {
		// Must not overflow the internal counter. In practice we will
		// not hit this.
		assert(state->totalbits <= UINT64_MAX - 8);

		state->buffer[state->length++] = input[i];
		state->totalbits += 8;

		if (state->length == 16) {
			compress_hash(state);
		}
	}
}

/*
 * Pad the message, run the final compression(s) and write the 32-byte
 * digest to `hash` (A words first, then B words, each via word2bytes).
 *
 * The padding consists of a one-bit, followed by zero-bits up to byte
 * offset 8 of a block, followed by the 64-bit count of message bits in the
 * last 8 bytes of that block. The count is stored high 32-bit word first,
 * each word serialized with word2bytes (least significant byte first).
 *
 * NOTE(review): the layout follows the SHA-2 padding scheme, but SHA-2
 * stores the length fully big-endian; here the bytes inside each 32-bit
 * word are little-endian. Confirm which byte order is intended.
 */
void finalize_hash(struct hashstate *state, unsigned char hash[32]) {
	// We can assume that this works due to the invariant that buffer
	// fill when entering this function is between 0 and 15
	state->buffer[state->length++] = 0x80;

	// If the length field no longer fits in this block, zero-fill the
	// rest of it, compress, and continue padding in a fresh block.
	if (state->length > 8) {
		memset(&state->buffer[state->length], 0, 16 - state->length);
		state->length = 16;
		compress_hash(state);
	}

	// Zero-fill up to the start of the length field
	memset(&state->buffer[state->length], 0, 8 - state->length);

	// Add the number of bits at offsets 8..15, and do one last
	// compression. The offsets are spelled out explicitly so that no
	// write can land outside the 16-byte buffer.
	word2bytes(&state->buffer[8], (uint32_t)(state->totalbits >> 32));
	word2bytes(&state->buffer[12], (uint32_t)(state->totalbits & 0xffffffffUL));
	state->length = 16;
	compress_hash(state);

	// Extract the hash state
	for (size_t i = 0; i < 4; i++) {
		word2bytes(&hash[i*4], state->a[i]);
	}
	for (size_t i = 0; i < 4; i++) {
		word2bytes(&hash[i*4 + 16], state->b[i]);
	}
}