Lab03_mpi

1. Getting started with "Hello World”
2. Sending in a ring (broadcast by ring)
3. Finding PI using MPI collective operations 
4. A simple Jacobi iteration 

*****************************************************************************
*****************************************************************************
1. Exercise - Getting Started

Objective: Learn how to login, write, compile, and run a simple MPI program. 

-Run the ``Hello world'' programs. 
-Try two or more different processes.
-Try two or more different parallel computers. 
-What does the output look like? 
-Try to improve your program to give you the information on 
 the outputting process (rank and processor name).

1.
Code:

#include "mpi.h"
#include <stdio.h>

int main( int argc, char *argv[] )
{
    char name[100];
    int len;
    MPI_Init( &argc, &argv );
//    MPI_Get_processor_name(name, &len);
//    printf( "Hello, world from %s!\n", name );
    printf( "Hello, world!\n" );
    MPI_Finalize();
    return 0;
}

2.
Compiling: mpicc.mpich -o hello hello.c

3.
Running: mpirun.mpich -np 2 hello

File /usr/lib/mpich/share/machines.LINUX on agutis should look like:
$trobec@agutis(bash): more /usr/lib/mpich/share/machines.LINUX
agutis
alpaka
apella
assapan
adjag
admiral
afghane
aktinie
arakanga
arapaima
aratinga
ararauna
argali
airedale

4.
Running from agutis: ./hello -p4pg processors
File "processors" on your directory on agutis should look like:
agutis 0
alpaka 1 /home/scicomp/your_username/mpi/hello
... 

*****************************************************************************
2. Excercise - Measuring bandwidth

Write a program to measure the time it takes to send 1, 2, 4, ..., 1M C doubles from one processor to another using MPI_Send and MPI_Recv. 

Use more trials to average out variations and overhead in MPI_Wtime. 

Print the size, time, and rate in MB/sec for each test. 

Make sure that both sender and reciever are ready when you begin the test. The sample solution uses MPI_Sendrecv, but other choices are possible.

Code:
/* compile with: mpicc.mpich -o bw bw.c */

#include <stdio.h>
#include <stdlib.h>
#include "mpi.h"

#define NUMBER_OF_TESTS 5  /* Number of tests for more reliable average.*/

int main( argc, argv )
int argc;
char **argv;
{
    double       *buf;
    int          rank, numprocs;
    int          n;
    double       t1, t2, tmin;
    int          j, k, nloop;
    MPI_Status   status;

    MPI_Init( &argc, &argv );

    MPI_Comm_size(MPI_COMM_WORLD,&numprocs);
    if (numprocs != 2) {
        printf( "The number of processes has to be exactly two!\n" );
	return(0);
    }
    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
    if (rank == 0) 
        printf( "Kind\t\tn\ttime (sec)\tRate (Mb/sec)\n" );

    for (n=1; n<110000; n*=2) { /* Message lengths doubles each time */
        if (n == 0) nloop = 100;
        else        nloop  = 100/n;
        if (nloop < 1) nloop = 1;

        buf = (double *) malloc( n * sizeof(double) );
        if (!buf) {
            fprintf( stderr, 
                     "Could not allocate send/recv buffer of size %d\n", n );
            MPI_Abort( MPI_COMM_WORLD, 1 );
        }
        tmin = 1000;
        for (k=0; k<NUMBER_OF_TESTS; k++) {
            if (rank == 0) {
                /* Make sure both processes are ready */
                MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, 1, 14,
                              MPI_BOTTOM, 0, MPI_INT, 1, 14, MPI_COMM_WORLD, 
                              &status );
                t1 = MPI_Wtime();
                for (j=0; j<nloop; j++) { /* Send a message nloop times. */
//                    MPI_Ssend( buf, n, MPI_DOUBLE, 1, k, MPI_COMM_WORLD );
                    MPI_Send( buf, n, MPI_DOUBLE, 1, k, MPI_COMM_WORLD );
                    MPI_Recv( buf, n, MPI_DOUBLE, 1, k, MPI_COMM_WORLD, 
                              &status );
                }
                t2 = (MPI_Wtime() - t1) / nloop;
                if (t2 < tmin) tmin = t2;
            }
            else if (rank == 1) {
                /* Make sure both processes are ready */
                MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, 0, 14,
                              MPI_BOTTOM, 0, MPI_INT, 0, 14, MPI_COMM_WORLD, 
                              &status );
                for (j=0; j<nloop; j++) {
                    MPI_Recv( buf, n, MPI_DOUBLE, 0, k, MPI_COMM_WORLD, 
                              &status );
//                    MPI_Ssend( buf, n, MPI_DOUBLE, 0, k, MPI_COMM_WORLD );
                    MPI_Send( buf, n, MPI_DOUBLE, 0, k, MPI_COMM_WORLD );
                }
            }
        }
        /* Convert to half the round-trip time */
        tmin = tmin / 2.0;
        if (rank == 0) {
            double rate;
            if (tmin > 0) rate = n * sizeof(double) * 1.0e-6 * 8/tmin; /* in Mb/sec */
            else          rate = 0.0;
            printf( "Send/Recv\t%d\t%f\t%f\n", n, tmin, rate );
        }
        free( buf );
    }

    MPI_Finalize( );
    return 0;
}
 

*****************************************************************************
3. Exercise - Calculating PI - Collective communication

Objective: Experiment with send/receive and Bcast/Reduce

-Study and Run the program for PI.
-Explanation on how the algorithm works:
This exercise presents a simple program to determine the value of pi. 
The algorithm suggested here is chosen for its simplicity. The method 
evaluates the integral of 4/(1+x*x) between -1/2 and 1/2. The method is simple: 
the integral is approximated by a sum of n intervals; the approximation to the
integral in each interval is (1/n)*4/(1+x*x). The master process (rank 0) asks 
the user for the number of intervals; the master should then broadcast this number 
to all of the other processes. Each process then adds up every n'th interval 
(x = -1/2+rank/n, -1/2+rank/n+size/n,...). Finally, the sums computed
by each process are added together using a reduction. 
 
-Write new versions that replace the calls to MPI_Bcast and
 MPI_Reduce with MPI_Send and MPI_Recv. Test this program.
-The MPI broadcast and reduce operations use at most log(p) send and receive 
 operations on each process where p is the size of MPI_COMM_WORLD. How many 
 operations do your versions use? 
  
Code:
/* compile with: mpicc.mpich -o pi pi.c -lm */

#include "mpi.h" 
#include <math.h> 
 
int main(argc,argv) 
int argc; 
char *argv[]; 
{ 
  int done = 0, n, myid, numprocs, i, rc; 
  double PI25DT = 3.141592653589793238462643; 
  double mypi, pi, h, sum, x, a; 
 
MPI_Init(&argc,&argv); 
  MPI_Comm_size(MPI_COMM_WORLD,&numprocs); 
  MPI_Comm_rank(MPI_COMM_WORLD,&myid); 
while (!done) 
  { 
    if (myid == 0) { 
        printf("Enter the number of intervals: (0 quits) "); 
        scanf("%d",&n); 
    } 
    MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD); 
    if (n == 0) break; 
  
 
h   = 1.0 / (double) n; 
    sum = 0.0; 
    for (i = myid + 1; i <= n; i += numprocs) { 
        x = h * ((double)i - 0.5); 
        sum += 4.0 / (1.0 + x*x); 
    } 
    mypi = h * sum; 
    
 
MPI_Reduce(&mypi, &pi, 1, MPI_DOUBLE, MPI_SUM, 0, 
                MPI_COMM_WORLD); 
    
 
if (myid == 0) 
        printf("pi is approximately %.16f, Error is %.16f\n", 
               pi, fabs(pi - PI25DT)); 
  } 
  MPI_Finalize(); 
} 


*****************************************************************************
4. Exercise - Jacobi iteration 

see: http://www-unix.mcs.anl.gov/mpi/tutorial/mpiexmpl/
  and start the excercise.

Jacobi iteration is a method for approximating the solution to a linear system of equations. We solve the Laplace equation in two dimensions with finite differences. 
The basic principle is a simple computation of an average of neighboring values.

while (not converged) {
  for (i,j)
    xnew[i][j] = (x[i+1][j] + x[i-1][j] + x[i][j+1] + x[i][j-1])/4;
  for (i,j)
    x[i][j] = xnew[i][j];
  }

The replacement of xnew with the average of the values around it is applied only in the interior; the boundary values are left fixed. In practice, this means that if the mesh is n by n, then the values 

x[0][j]
x[n-1][j]
x[i][0]
x[i][n-1]   are left unchanged. 

We wish to compute this approximation in parallel. For convergence testing, compute RMS:
diffnorm = 0;
for (i,j)
    diffnorm += (xnew[i][j] - x[i][j]) * (xnew[i][j] - x[i][j]);
diffnorm = sqrt(diffnorm);

You'll need to use MPI_Allreduce for this. (Why not use MPI_Reduce?) Have process zero write out the value of diffnorm and the iteration count at each iteration. When diffnorm is less that 1.0e-2, consider the iteration converged. Also, if you reach 100 iterations, exit the loop. 

For simplicity, consider a 12 x 12 mesh on 4 processors, 3 columns on each processors. Some points have to be communicated because there are in different processors! The example solution uses this boundary values: -1 on the top and bottom, and the rank of the process on the side. The initial data (the values of x that are being relaxed) are also the same; the interior points have the same value as the rank of the process. This is shown below: 

-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 
 3  3  3  3  3  3  3  3  3  3  3  3
 3  3  3  3  3  3  3  3  3  3  3  3
 2  2  2  2  2  2  2  2  2  2  2  2
 2  2  2  2  2  2  2  2  2  2  2  2
 2  2  2  2  2  2  2  2  2  2  2  2
 1  1  1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1  1  1
 0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0
-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 

Code:
#include <stdio.h>
#include <math.h>
#include "mpi.h"
/* compile with: mpicc.mpich -o jacobi jacobi.c -lm */

/* This example handles a 12 x 12 mesh, on 4 processors only. */
#define maxn 12

int main( argc, argv )
int argc;
char **argv;
{
    int        rank, value, size, errcnt, toterr, i, j, itcnt;
    int        i_first, i_last;
    MPI_Status status;
    double     diffnorm, gdiffnorm;
    double     xlocal[(12/4)+2][12];
    double     xnew[(12/3)+2][12];

    MPI_Init( &argc, &argv );

    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
    MPI_Comm_size( MPI_COMM_WORLD, &size );

    if (size != 4) MPI_Abort( MPI_COMM_WORLD, 1 );

    /* xlocal[][0] is lower ghostpoints, xlocal[][maxn+2] is upper */

    /* Note that top and bottom processes have one less row of interior
       points */
    i_first = 1;
    i_last  = maxn/size;
    if (rank == 0)        i_first++;
    if (rank == size - 1) i_last--;

    /* Fill the data as specified */
    for (i=1; i<=maxn/size; i++) 
        for (j=0; j<maxn; j++) 
            xlocal[i][j] = rank;
    for (j=0; j<maxn; j++) {
        xlocal[i_first-1][j] = -1;
        xlocal[i_last+1][j] = -1;
    }

    itcnt = 0;
    do {
        /* Send up unless I'm at the top, then receive from below */
        /* Note the use of xlocal[i] for &xlocal[i][0] */
        if (rank < size - 1) 
            MPI_Send( xlocal[maxn/size], maxn, MPI_DOUBLE, rank + 1, 0, 
                      MPI_COMM_WORLD );
        if (rank > 0)
            MPI_Recv( xlocal[0], maxn, MPI_DOUBLE, rank - 1, 0, 
                      MPI_COMM_WORLD, &status );
        /* Send down unless I'm at the bottom */
        if (rank > 0) 
            MPI_Send( xlocal[1], maxn, MPI_DOUBLE, rank - 1, 1, 
                      MPI_COMM_WORLD );
        if (rank < size - 1) 
            MPI_Recv( xlocal[maxn/size+1], maxn, MPI_DOUBLE, rank + 1, 1, 
                      MPI_COMM_WORLD, &status );
        
        /* Compute new values (but not on boundary) */
        itcnt ++;
        diffnorm = 0.0;
        for (i=i_first; i<=i_last; i++) 
            for (j=1; j<maxn-1; j++) {
                xnew[i][j] = (xlocal[i][j+1] + xlocal[i][j-1] +
                              xlocal[i+1][j] + xlocal[i-1][j]) / 4.0;
                diffnorm += (xnew[i][j] - xlocal[i][j]) * 
                            (xnew[i][j] - xlocal[i][j]);
            }
        /* Only transfer the interior points */
        for (i=i_first; i<=i_last; i++) 
            for (j=1; j<maxn-1; j++) 
                xlocal[i][j] = xnew[i][j];

        MPI_Allreduce( &diffnorm, &gdiffnorm, 1, MPI_DOUBLE, MPI_SUM,
                       MPI_COMM_WORLD );
        gdiffnorm = sqrt( gdiffnorm );
        if (rank == 0) printf( "At iteration %d, diff is %e\n", itcnt, 
                               gdiffnorm );
    } while (gdiffnorm > 1.0e-2 && itcnt < 100);

    MPI_Finalize( );
    return 0;
}