/* Block Matrix Multiplication.  This FT-Linda example was adapted
 * from that in Robert Bjornson's (1993 Yale CS) dissertation.
 * It is a systolic algorithm developed with a hypercube-like
 * algorithm.  
 *
 * Making this algorithm fault-tolerant is interesting because:
 *
 *      1) Each worker has a specific subtask to do.  It is not sufficient
 *         to just clean up any in-progress work a failed worker left
 *         undone---a new worker must be created to take over exactly
 *         where the failed worker left off.  That is because we do
 *	   not have identical workers that perform any available subtask.
 *
 *      2) Not only does the worker have a specific subtask to perform,
 *         it has to perform various iterations of that subtask in close
 *         coordination with other workers.
 *
 * For now, we hardcode the size of the problem and the number of
 * workers.  This is because we can't dynamically allocate memory
 * and then use it with FT-Linda, because it hasn't yet been modified
 * to work with things that sizeof(object) doesn't work with, i.e.
 * dynamically allocated memory.
 *
 * See file block_mmult.notes for an overview of the FT-Linda solution.
 * I have tried to follow Bjornson's code as closely as possible, since
 * it is really hard to understand (at least its finer points).
 */

#include <stdio.h>
#include <ftlinda.h>

#include "block_mmult.h"

#define ABS(x)		( (x)>0 ? (x) : -(x) )

#define F(i,j)		( (i)*blocking + (j))
#define EPSILON		.00001
#define ABASE		5.00
#define BBASE		1.05

#define DIM		8	/* Matrix is DIM by DIM */
#define N		2	/* Divisions per DIM. */
#define WORKERS		(N*N)	/* Number of workers */
#define BLOCK_DIM	(DIM/N)	/* Each worker gets BLOCK_DIM by BLOCK_DIM */
#define BLOCK_DIM_SQ	(BLOCK_DIM * BLOCK_DIM)

#define ID_TO_HOST(id)	(id % ftl_num_hosts() )

#define PACK(array, yb, xb) {\
    int ii=0, xx, yy; \
    for (yy=0; yy<BLOCK_DIM; yy++) { \
	for (xx=0; xx<BLOCK_DIM; xx++) { \
	    mess[ii++] = array[yy][xx]; \
	} \
    } \
}

#define UNPACK(array, yb, xb) {\
    int ii=0, xx, yy; \
    for (yy=0; yy<BLOCK_DIM; yy++) { \
	for (xx=0; xx<BLOCK_DIM; xx++) { \
	    array[yy][xx] = mess[ii++]; \
	} \
    } \
} 

/* newtypes for the tuple logical names */
newtype void REGISTRY;			/* One registry tuple per worker */
newtype void GLOBAL_DATA;		/* A few parameters */
newtype void A_BLOCK;			/* A submatrix for a worker */
newtype void B_BLOCK;			/* B submatrix for a worker */
newtype void A_BLOCK_INPROGRESS;	/* A submatrix place holder */
newtype void B_BLOCK_INPROGRESS;	/* B submatrix place holder */
newtype void RESULT;			/* C submatrix for a worker */

newtype float ELEM;
newtype ELEM[DIM][DIM] 	ARRAY;		/* The whole array */
newtype ELEM[BLOCK_DIM_SQ] CHUNK	/* Linearized (1D) submatrix passed
					 * to and from a worker */

ARRAY a, b, c;

LindaMain(argc, argv)
int argc;
char *argv[];
{
    CHUNK my_chunk;
    int x, y, xb, yb, i, rlen, host;
    ELEM *mess = &my_chunk;
    ELEM correct, correct2;
    int id, lpid, host, failure_id, num_hosts = ftl_num_hosts();

    void worker(int id);
    void monitor(int f_id);	

    /* N must divide DIM */
    assert( (DIM % N) == 0);

    /* initialize arrays */
    for (y=0; y<DIM; y++)
	for (x=0; x<DIM; x++) {
	    a[y][x] = x * ABASE;
	    b[y][x] = y * BBASE;
	}

    correct = 0.0;
    for (i=0; i<DIM; i++)
	correct += ABASE * i * BBASE * i;
    correct2 = 0.0;
    for (i=0; i<DIM; i++)
	correct2 += correct*BBASE*i;
    correct = correct2;

    /* create a monitor process on each thread */
    for (i=0; i<num_hosts; i++) {
	lpid = new_lpid();
	failure_id = new_failure_id();
	ftl_create_user_thread(monitor, "monitor", i, lpid, failure_id, 0,0,0);
    }

    /* create workers */
    for (i=0; i<NUM_WORKERS; i++) {
	lpid = new_lpid();
	host = ID_TO_HOST(i);

	/* make registry tuple with host, id, and last iteration completed */
	< true => out(TSmain, REGISTRY, host, i, lpid, 0); >

	/* create worker(i) on host host with LPID lpid */
	ftl_create_user_thread(worker, "worker", host, lpid, i, 0, 0, 0);
    }

    /* output arrays */
    for (yb=0; yb<N; yb++) {
	for (xb=0; xb<N; xb++) {
	    PACK(a, yb, xb);
	    id = F(yb, xb);
	    < true => out(TSmain, A_BLOCK, id, 0, my_chunk); >
	}
    }

    for (yb=0; yb<N; yb++) {
	for (xb=0; xb<N; xb++) {
	    PACK(b, yb, xb);
	    id = F(yb, xb);
	    < true => out(TSmain, B_BLOCK, id, 0, my_chunk); >
	}
    }

    /* get results */
    for (yb=0; yb<N; yb++) {
	for (xb=0; xb<N; xb++) {
	    id = F(yb,xb);
	    < in(TSmain, RESULT, id, ?my_chunk) => skip >
	    UNPACK(c, yb, xb);	
	}
    }

    /* check results */
    for (y=0; y<DIM; y++) {
	for (x=0; x<DIM; x++) {
	    if (ABS(c[y][x]-correct) > EPSILON)
		printf("c[%d][%d] = %f, correct = %f\n", y, x, 
			c[y][x], correct);
	}
    }

}


void
worker(id)
int id;
{
    register int i;
    int iter, x, y, my_x, my_y, act_my_y, act_my_x, divisor, my_host;
    register ELEM dot;
    CHUNK my_a, my_b, my_c;
    register ELEM *ap, *bp;
    ELEM *alinear, *blinear, *clinear, *cp;
    int right, down, a_init, b_init, a_from, b_from;

    my_x = my_id / BLOCK_DIM;
    my_x = my_id % BLOCK_DIM;
    
    alinear = &my_a; blinear = &my_b; clinear = &my_c;

    right =  F(my_y, (my_x+1) % N);
    down =   F( (my_y+1)%N, my_x);
    a_init = F(my_y, (my_x + my_y)%N);
    b_init = F( (my_x+my_y)%N, my_x);

    my_host = ID_TO_HOST(id);

    /* Get current iteration (remember, we could be a reincarnation...) */
    < rd(TSmain, REGISTRY, my_host, id, ?lpid, ?iter) => skip >

    /* Perform iterations iter through BLOCK_DIM-1 */
    for ( ; iter < BLOCK_DIM; iter++) {

	/* The first time through Bjornson got his A and B from
	 * some other ID, for some inscrutable reason... */
	if (iter == 0) {
	    a_from = a_init; 
	    b_from = b_init;
	}
	else {
	    a_from = my_id; 
	    b_from = my_id;
	}

	/* Get A and B submatrices and leave in_progress tuples */
	< in(TSmain, A_BLOCK, a_from, iter, ?my_a) =>
	    out(TSmain, A_BLOCK_INPROGRESS, my_host, a_from, iter, my_a); 
	>
	< in(TSmain, B_BLOCK, b_from, iter, ?my_b) =>
	    out(TSmain, B_BLOCK_INPROGRESS, my_host, b_from, iter, my_b); 
	>

	/* Do my multiply for this iteration */	
	for (y=0; y<DIM; y++) {
	   for (x=0; x<DIM; x++) {
		cp = clinear + (y*DIM + x);
		dot = 0.0;

		for (	i=DIM, ap=alinear+y*DIM, bp=blinear+x;
			i;
			--i, ++ap, bp += DIM) {	

		    dot += *ap * *bp;	/* c[y][x] = a[y][i] * b[i][x] */

		}

		*cp = dot;

	   } /* for x */

	} /* for y */

	/* remove in_progress; pass A to the right and B down; update iter */
	< true =>
	    in(TSmain, A_BLOCK_INPROGRESS, my_host, a_from, iter, ?CHUNK);
	    in(TSmain, B_BLOCK_INPROGRESS, my_host, b_from, iter, ?CHUNK);
	    out(TSmain, A_BLOCK, right, INC(iter), my_a);
	    out(TSmain, B_BLOCK, down,  INC(iter), my_b);
	    in(TSmain, REGISTRY, my_host, id, ?lpid, iter);
	    out(TSmain, REGISTRY, my_host, id, lpid, INC(iter) );
	>

	iter++;

	/* copy c to a */
	for (i=0; i<BLOCK_DIM_SQ; i++)
	    alinear[i] = clinear[i];

    }  /* for all iterations */

    /* Output result.  We withdraw the registry tuple atomically with
     * this, because if my_host fails there is now no need to reincarnate
     * this worker since its result has been posted */

    < in(TSmain, REGISTRY, my_host, id, iter) => out(TSmain, result, id, my_c);>

}

monitor(f_id)
int f_id;	/* my failure ID */
{
    int failed_host, iter, lpid, my_host = ftl_my_host();
    CHUNK chunk;

    for (;;) {

	/* wait for a host to fail */
	< in(TSmain, FAILURE, f_id, ?failed_host) => skip >

	/* regenerate all IN_PROGRESS tuples found on that host */
	while (	< inp(TSmain, A_BLOCK_INPROGRESS, failed_host, 
				?from, ?iter, ?chunk) =>
			out(TSmain, A_BLOCK, from, iter, chunk)
		or
		  inp(TSmain, B_BLOCK_INPROGRESS, failed_host, 
				?from, ?iter, ?chunk) =>
			out(TSmain, B_BLOCK, from, iter, chunk)
		> )

		;	/* nothing -- just clean up the in_progress stuff */

	/* now try to reincarnate all failed workers */
	while ( <inp(TSmain, REGISTRY, failed_host, ?id, ?lpid, ?iter) =>
			out(TSmain, REGISTRY, my_host, id, lpid, iter); > ) {
		
		/* create worker(id) on my_host */
		ftl_create_user_thread(worker, "worker", my_host, lpid, 
					id, 0, 0, 0);

		/* sleep some to aid load balancing --- let other 
		 * monitors on other hosts have a chance to reincarnate
		 * other workers from failed_host */
		nap(some);

    }
}
