#include <stdlib.h>
#include <stdio.h>
#include <memory.h>
#include <string.h>

// Multiple-precision modular arithmetic
// package to support SRP-6 server and clients

// Author: Karl Malbrain, [email protected]

// configure package for 1024 bit operations

#define SIZE (1024 / 32)

#ifdef unix
#define __int64 long long
#else
typedef unsigned int uint;
typedef unsigned long ulong;
#endif

typedef unsigned char uchar;
typedef unsigned __int64 uint64;

//  1024 bit modulus taken from IETF
//  draft-ietf-tls-srp-10

unsigned long DHmod[SIZE] = {
0xEEAF0AB9, 0xADB38DD6, 0x9C33F80A, 0xFA8FC5E8,
0x60726187, 0x75FF3C0B, 0x9EA2314C, 0x9C256576,
0xD674DF74, 0x96EA81D3, 0x383B4813, 0xD692C6E0,
0xE0D5D8E2, 0x50B98BE4, 0x8E495C1D, 0x6089DAD1,
0x5DC7D7B4, 0x6154D6B6, 0xCE8EF4AD, 0x69B15D49,
0x82559B29, 0x7BCF1885, 0xC529F566, 0x660E57EC,
0x68EDBC3C, 0x05726CC0, 0x2FD4CBF4, 0x976EAA9A,
0xFD5138FE, 0x8376435B, 0x9FC61D2F, 0xC0EB06E3
};

// return 1 if multiple precision number is zero
// used to verify A & B are non-zero

int mpzero (ulong *a)
{
int idx = SIZE;

	while( idx-- )
	  if( *a++ )
		return 0;

	return 1;
}

// return remainder of double sized product u/DHmod in u

void mpmod (ulong *u)
{
int i, j, n = SIZE, m = SIZE * 2;
uint64 quot, rem, nxt, prod;
__int64 cry;

  while( m )
	if( !u[0] )
		m--, u++;
	else
		break;

  nxt = 0;

  for( i = 0; i <= m - n; i++ ) {
	nxt <<= 32;
	nxt |= u[i];

	rem = nxt % DHmod[0];
	quot = nxt / DHmod[0];

	while( quot == 0x100000000 || rem < 0x100000000 && (quot * DHmod[1]) > (rem << 32 | u[i + 1]) )
		quot -= 1, rem += DHmod[0];

	prod = cry = 0;

	if( quot ) for( j = n; j--; ) {
		prod += quot * DHmod[j];
		cry += (uint64)u[i + j] - (ulong)prod;
		u[i + j] = (ulong)cry;
		prod >>= 32;
		cry >>= 32;
	}

	nxt = u[i];

	if( !i )
	  if( cry - prod )
		quot++;
	  else
		continue;
	else if( u[i - 1] += cry - prod )
		quot++;
	else
		continue;

	cry = 0;

	for( j = n; j--; ) {
		cry += (uint64)u[i + j] + DHmod[j];
		u[i + j] = (ulong)cry;
		cry >>= 32;
	}

	if( i )
		u[i - 1] += (ulong)cry;

	nxt = u[i];
  }
}

//  multiply two SIZE numbers into double SIZED result

void mpmult (ulong *dest, ulong *what, ulong *by)
{
int m, n = SIZE;
uint64 fact;
uint64 cry;

	memset (dest, 0, SIZE * 2 * sizeof(ulong));

	while( n-- )
	{
		cry = 0;

		if( fact = what[n] ) 
		  for( m = SIZE; m--; ) {
			cry += fact * by[m] + dest[n + m + 1];
			dest[n + m + 1] = (ulong)cry;
			cry >>= 32;
		  }

		dest[n] = (ulong)cry;
	}
}

//	exponentiate SIZE base by SIZE exponent
//  to SIZE result, modulo DHmod

void mpexp (ulong *result, ulong *base, ulong *exponent)
{
ulong prod[SIZE * 2], term[SIZE];
int idx = SIZE * 32;

	memset (result, 0, SIZE * sizeof(ulong));
	memcpy (term, base, SIZE * sizeof(ulong));
	result[SIZE - 1] = 1;

	while( idx )
	  if( *exponent )
		break;
	  else
		exponent++, idx -= 32;

	while( idx-- ) {
	  if( exponent[idx / 32] & (1 << (31 - (idx & 0x1f))) ) {
		mpmult (prod, result, term);
		mpmod(prod);
		memcpy (result, prod + SIZE, SIZE * sizeof(ulong));
	  }
	  mpmult (prod, term, term);
	  mpmod(prod);
	  memcpy (term, prod + SIZE, SIZE * sizeof(ulong));
	}
}

// multiply a by b (single precision multiplier)
// and add to dest

void mpmpyadd (ulong *dest, ulong *a, ulong b)
{
ulong result[SIZE * 2];
__int64 cry = 0;
int idx = SIZE;

	while( idx-- ) {
		cry += dest[idx];
		cry += a[idx] * (uint64)b;
		result[idx + SIZE] = cry;
		result[idx] = 0;
		cry >>= 32;
	}

	//  normalize result

	result[SIZE - 1] = cry;
	mpmod (result);
	memcpy (dest, result + SIZE, SIZE * sizeof(ulong));
}

// multiply a by b (single precision multiplier)
// and subtract from dest

void mpmpysub (ulong *dest, ulong *a, ulong b)
{
ulong result[SIZE * 2];
__int64 cry = 0;
int idx = SIZE;

	// multiply a by b

	while( idx-- ) {
		cry += a[idx] * (uint64)b;
		result[idx + SIZE] = cry;
		result[idx] = 0;
		cry >>= 32;
	}

	//  normalize result

	result[SIZE - 1] = cry;
	mpmod (result);

	//  subtract result from dest


	for( cry = 0, idx = SIZE; idx--; ) {
		cry += dest[idx];
		cry -= result[idx + SIZE];
		dest[idx] = cry;
		cry >>= 32;
	}

	//  normalize result

	if( cry < 0 )
	  for( cry = 0, idx = SIZE; idx--; ) {
		cry += dest[idx];
		cry += DHmod[idx];
		dest[idx] = cry;
		cry >>= 32;
	}
}

//  SHA 256 routines
//  taken from Wikipedia pseudo code

//2^32 times the cube root of the first 64 primes 2..311

static ulong k[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1,
0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786,
0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147,
0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b,
0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a,
0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 };

//	store 64 bit integer

void putlonglong (uint64 what, uchar *where)
{
	*where++ = what >> 56;
	*where++ = what >> 48;
	*where++ = what >> 40;
	*where++ = what >> 32;
	*where++ = what >> 24;
	*where++ = what >> 16;
	*where++ = what >> 8;
	*where++ = what;
}

//	store 32 bit integer

void putlong (ulong what, uchar *where)
{
	*where++ = what >> 24;
	*where++ = what >> 16;
	*where++ = what >> 8;
	*where++ = what;
}

//	retrieve 32 bit integer

ulong getlong (uchar *where)
{
ulong ans;

	ans = *where++ << 24;
	ans |= *where++ << 16;
	ans |= *where++ << 8;
	ans |= *where++;
	return ans;
}

//	right rotate bits

ulong rotate (ulong what, int bits)
{
	return (what >> bits) | (what << (32 - bits));
}

//	right shift bits

ulong shift (ulong what, int bits)
{
	return what >> bits;
}

//	private structure for SHA

typedef struct {
	uchar buff[512/8];	// buffer, digest when full
	ulong h[256/32];	// state variable of digest
	uint64 length;		// number of bytes in digest
	int next;			// next buffer available
} SHA256;

//	start new SHA run

void sha256_begin (SHA256 *sha)
{
	sha->length = 0;
	sha->next = 0;

// 2^32 times the square root of the first 8 primes 2..19
	sha->h[0] = 0x6a09e667;
	sha->h[1] = 0xbb67ae85;
	sha->h[2] = 0x3c6ef372;
	sha->h[3] = 0xa54ff53a;
	sha->h[4] = 0x510e527f;
	sha->h[5] = 0x9b05688c;
	sha->h[6] = 0x1f83d9ab;
	sha->h[7] = 0x5be0cd19;
}

//	digest SHA buffer contents
//	to state variable

void sha256_digest (SHA256 *sha)
{
ulong nxt, s0, s1, maj, t0, t1, ch;
ulong a,b,c,d,e,f,g,h;
ulong w[64];
int i;

	sha->next = 0;

	for( i = 0; i < 16; i++ )
		w[i] = getlong (sha->buff + i * sizeof(ulong));

	for( i = 16; i < 64; i++ ) {
	  s0 = rotate(w[i-15], 7) ^ rotate(w[i-15], 18) ^ shift(w[i-15], 3);
	  s1 = rotate(w[i-2], 17) ^ rotate(w[i-2], 19) ^ shift (w[i-2], 10);
	  w[i] = w[i-16] + s0 + w[i-7] + s1;
	}

	a = sha->h[0];
	b = sha->h[1];
	c = sha->h[2];
	d = sha->h[3];
	e = sha->h[4];
	f = sha->h[5];
	g = sha->h[6];
	h = sha->h[7];

	for( i = 0; i < 64; i++ ) {
		s0 = rotate (a, 2) ^ rotate (a, 13) ^ rotate (a, 22);
		maj = (a & b) ^ (b & c) ^ (c & a);
		t0 = s0 + maj;
		s1 = rotate (e, 6) ^ rotate (e, 11) ^ rotate (e, 25);
		ch = (e & f) ^ (~e & g);
		t1 = h + s1 + ch + k[i] + w[i];

		h = g;
		g = f;
		f = e;
		e = d + t1;
		d = c;
		c = b;
		b = a;
		a = t0 + t1;
	}

	sha->h[0] += a;
	sha->h[1] += b;
	sha->h[2] += c;
	sha->h[3] += d;
	sha->h[4] += e;
	sha->h[5] += f;
	sha->h[6] += g;
	sha->h[7] += h;
}

//	add to current SHA buffer
//	digest when full

void sha256_next (SHA256 *sha, uchar *what, int len)
{
	while( len-- ) {
		sha->length++;
		sha->buff[sha->next] = *what++;
		if( ++sha->next == 512/8 )
			sha256_digest (sha);
	}
}

//	finish SHA run, output 256 bit result

void sha256_finish (SHA256 *sha, uchar *out)
{
int idx;

	// trailing bit pad

	sha->buff[sha->next] = 0x80;

	if( ++sha->next == 512/8 )
		sha256_digest (sha);

	// pad with zeroes until almost full
	// leaving room for length, below

	while( sha->next != 448/8 ) {
		sha->buff[sha->next] = 0;
		if( ++sha->next == 512/8 )
			sha256_digest (sha);
	}

	// n.b. length doesn't include padding from above

	putlonglong (sha->length * 8, sha->buff + 448/8);
	sha->next += sizeof(uint64);	// must be full now

	sha256_digest (sha);

	// output the result, big endian

	for( idx = 0; idx < 256/32; idx++ )
		putlong (sha->h[idx], out + idx * sizeof(ulong));
}

#ifdef STANDALONE

//	proof-of-concept implementation of SHA-6 
//	calculate shared secret S three ways using
//  variables known to client, server, both

//  AES key can be taken from a HASH of S

prt (ulong *x)
{
char buff[SIZE*8 + 1];
int idx = 0;

	while( idx < SIZE )
		sprintf (buff + idx * 8,"%.8x", x[idx]), idx++;

	for( idx = 0; idx < SIZE*8; idx++ )
		if( buff[idx] > '0' )
			break;

 	printf ("%s\n", buff + idx);
}

ulong lrand ()
{
static ulong seed = 0xdeadbeef;

	return seed = (seed + 23 ) * 65537;
}

#define HASH (256/8)

main (int argc, char **argv)
{
ulong S2[SIZE], s1[SIZE], s2[SIZE], s3[SIZE];
ulong x[SIZE], g[SIZE], v[SIZE], u[SIZE];
ulong S1[SIZE], t[SIZE];
ulong A[SIZE], a[SIZE];
ulong B[SIZE], b[SIZE];
ulong prod[SIZE * 2];

uchar M1[HASH], M2[HASH];
SHA256 sha[1];
ulong salt[1];

char *user = "my user_id";
char *pwd = "my password";
int i;

	// select random numbers a, b
	// n.b. use a cryptographically secure
	// source of random numbers in any
	// actual implementation of the SRP-6
	// protocol

	for( i = 0; i < SIZE; i++ ) {
		a[i] = lrand();
		b[i] = lrand();
	}

// step zero client
	*salt = lrand();

// step two client
	sha256_begin( sha );
	sha256_next( sha, (uchar *)salt, sizeof(ulong));
	sha256_next( sha, user, strlen(user) );
	sha256_next( sha, pwd, strlen(pwd) );

	memset (x, 0, sizeof(x));
	sha256_finish( sha, (uchar *)(x + SIZE) - HASH);

	memset (g, 0, sizeof(g));
	g[SIZE - 1] = 2;
	mpexp (v, g, x);

// step three client
	mpexp (A, g, a);

// step three server
	mpexp (B, g, b);
	mpmpyadd (B, v, 3);

	memset (u, 0, sizeof(u));
	sha256_begin( sha );
	sha256_next( sha, (uchar *)A, SIZE * sizeof(ulong));
	sha256_next( sha, (uchar *)B, SIZE * sizeof(ulong));
	sha256_finish( sha, (uchar *)(u + SIZE) - HASH);

// step four/five client -- first result
	sha256_begin( sha );
	sha256_next( sha, (uchar *)A, SIZE * sizeof(ulong));
	sha256_next( sha, (uchar *)B, SIZE * sizeof(ulong));

	mpmpysub (B, v, 3); // re-use v as g^x
	mpexp (s1, B, a);
	mpexp (s2, B, u);
	mpexp (s3, s2, x);
	mpmult (prod, s1, s3);
	mpmod(prod); 		// normalize S version one
	prt (prod + SIZE);

	sha256_next( sha, (uchar *)(prod + SIZE), SIZE * sizeof(ulong));
	sha256_finish( sha, M1);

// step five server -- second result
	mpexp (t, v, u);
	mpmult(prod, A, t);
	mpmod(prod);

	mpexp (S2, prod + SIZE, b);	// compute S version two
	prt (S2);

	sha256_begin( sha );
	sha256_next( sha, (uchar *)A, SIZE * sizeof(ulong));
	sha256_next( sha, (uchar *)B, SIZE * sizeof(ulong));
	sha256_next( sha, (uchar *)S2, SIZE * sizeof(ulong));
	sha256_finish( sha, M2);

// generic calc of S -- third result
	mpexp (s1, g, a);
	mpexp (t, s1, b);

	mpexp (s1, g, x);
	mpexp (s2, s1, u);
	mpexp (s3, s2, b);

	mpmult (prod, t, s3);
	mpmod(prod);
	prt (prod + SIZE);
}
#endif
1