Course: AMS 530 Parallel Computing

Deepak Naidu

Instructor: Dr. Yuefan Deng

Project IV

Problem 4.1

Objective:

 Write a parallel program to charge N particles with charges randomly distributed in [-1, -1], place them randomly in a 101010 cube and compute the distribution of speed (magnitude of velocity) of the particles after 100 steps.

 Collect timing result for individual processors for P=2, 4, 8.

 Compute load imbalance ratio.

Implementation details:

The root process, that is, the process with rank 0 generates N particles randomly as described above. Each process computes the new coordinates, force acting on it, and new velocity of the particles assigned to it. This is repeated for 100 steps. The root process then gathers the results, including the elapsed times, produced by the other processes.

Timing results:

N=1000 P=2

Process 0: Elapsed time = 162.378036 seconds

Process 1: Elapsed time = 150.882721 seconds

Maximum: 162.378036 seconds

Average: 156.630379 seconds

N=1000 P=4

Process 0: Elapsed time = 41.783180 seconds

Process 1: Elapsed time = 49.318260 seconds

Process 2: Elapsed time = 46.533562 seconds

Process 3: Elapsed time = 43.120197 seconds

Maximum: 49.318260 seconds

Average: 45.188800 seconds

N=1000 P=8

Process 0: Elapsed time = 16.538200 seconds

Process 1: Elapsed time = 13.271678 seconds

Process 2: Elapsed time = 19.039444 seconds

Process 3: Elapsed time = 14.035685 seconds

Process 4: Elapsed time = 19.273207 seconds

Process 5: Elapsed time = 26.638098 seconds

Process 6: Elapsed time = 29.388863 seconds

Process 7: Elapsed time = 26.171789 seconds

Maximum: 29.388863 seconds

Average: 20.544621 seconds

Load imbalance ratio:

We use the following definition of load imbalance ratio, L = (tmax – tavg) / tavg where tmax = max{running time of process pi} and tavg = avg{running time of process pi}.

 For N=1000, P=2: L = (162.3 - 156.6)/ 156.6 = 0.036.

 For N=1000, P=4: L = (49.3 - 45.1)/ 45.1 = 0.093.

 For N=1000, P=8: L = (29.3 - 20.5)/ 20.5 = 0.429.

Source Code

/*********************************************************************

* File project4.c *

* Project IV *

* Problem 4.1 *

* Compile gcc project4.c -o project4 *

* -I/usr/local/mpich-1.2.0/std/include *

* -L/usr/local/mpich-1.2.0/std/lib -lmpich *

* Run mpirun -pbs -np <number_of_processors> project4 *

*********************************************************************/

#include<stdio.h>

#include<stdlib.h>

#include<math.h>

#include<time.h>

#include<mpi.h>

#define N 1000

main(int argc, char** argv)

{

/* Global - values of these variables are the same for all

processes */

int p; /* Number of processes */

int work; /* Number of particles each process has to work on */

/* Local - values of these variables may not be same for

different processes */

int my_rank; /* My rank */

float start, finish; /* Timer */

float elapsed_time;

int i, j, k, step; /* counters */

float* coordinates;

float* velocity;

float* charge;

float* total_force; /* Total force acting on each particle */

float origin[3] = {0.0, 0.0, 0.0};

float force;

/* Root - these variables are accessed only by the root

process. In this implementation, root process is

process with rank 0 */

float sum_time;

float max_time;

MPI_Status status;

/* Declare function */

float dist(float* x, float* y, int n);

/* Start MPI */

MPI_Init(&argc, &argv);

/* Start timer */

start = MPI_Wtime();

/* Get my processor rank */

MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

/* How many processors are available ? */

MPI_Comm_size(MPI_COMM_WORLD, &p);

/* Number of particles each process will work on */

work = N/p;

/* Allocate memory */

coordinates = malloc(3*(N+1)*sizeof(float));

velocity = malloc(3*N*sizeof(float));

charge = malloc((N+1)*sizeof(float));

total_force = malloc(3*work*sizeof(float));

/* Exit if cannot obtain memory */

if(coordinates == NULL || velocity == NULL || charge == NULL ||

total_force == NULL)

{

printf("\nFailed to allocate memory\n");

exit(0);

}

/* Charge particles with charges uniformly distributed in [-1,1]

and place them at random coordinates in a 10x10x10 cube. */

if(my_rank == 0)

{

srand48((long)time(NULL));

for(i=0; i<N; i++)

{

*(charge+i) = drand48()-drand48();

*(coordinates+3*i+0) = 10*drand48();

*(coordinates+3*i+1) = 10*drand48();

*(coordinates+3*i+2) = 10*drand48();

/* Set initial velocity of particles */

*(velocity+3*i+0) = 1.0;

*(velocity+3*i+1) = 0.0;

*(velocity+3*i+2) = 0.0;

}

/* Place particle with charge 7.0 at (10,10,10) */

*(charge+i) = 7.0;

*(coordinates+3*N+0) = 10.0;

*(coordinates+3*N+1) = 10.0;

*(coordinates+3*N+2) = 10.0;

}

/* Communication is necessary only if more than one processor

is being used */

if(p>1)

{

MPI_Bcast(coordinates, 3*(N+1), MPI_FLOAT, 0, MPI_COMM_WORLD);

MPI_Bcast(velocity, 3*N, MPI_FLOAT, 0, MPI_COMM_WORLD);

MPI_Bcast(charge, N+1, MPI_FLOAT, 0, MPI_COMM_WORLD);

}

for(step=0; step<100; step++)

{

/* Compute position and velocity of the particles after one step */

for(i=0; i<work; i++)

{

*(total_force+3*i+0) = 0.0;

*(total_force+3*i+1) = 0.0;

*(total_force+3*i+2) = 0.0;

for(j=0; j<N; j++)

{

/* Ignore particle interaction if distance between the two

particles is greater than 0.6 units */

if(work*my_rank+i != j &

dist((coordinates+3*work*my_rank+3*i),

(coordinates+3*j), 3) < 0.6)

{

force = 0.0;

/* Compute force according to the law:fij=qi*qj/(rij)^5 */

force = *(charge+work*my_rank+i) * *(charge+j)

/pow(dist((coordinates+3*work*my_rank+3*i),

(coordinates+3*j), 3), 5);

for(k=0; k<3; k++)

{

*(total_force+3*i+k) = *(total_force+3*i+k) +

force * (*(coordinates+3*i+k) –

*(coordinates+3*j+k));

*(velocity+3*work*my_rank+3*i+k) =

*(velocity+3*work*my_rank+3*i+k) + 10E-6 *

*(total_force+3*i+k);

*(coordinates+3*work*my_rank+3*i+k) =

*(coordinates+3*work*my_rank+3*i+k) + 10E-6 *

*(velocity+3*work*my_rank+3*i+k);

}

/* Take into account the interaction of current particle and the

particle at the coordinates (10,10,10) irrespective of

the distance between them */

force = *(charge+work*my_rank+i) * *(charge+N)

/pow(dist((coordinates+3*work*my_rank+3*i),

(coordinates+3*N), 3), 5);

for(k=0; k<3; k++)

{

*(total_force+3*i+k) = *(total_force+3*i+k) +

force * (*(coordinates+3*i+k) - *(coordinates+3*N+k));

*(velocity+3*work*my_rank+3*i+k) =

*(velocity+3*work*my_rank+3*i+k) + 10E-6 *

*(total_force+3*i+k);

*(coordinates+3*work*my_rank+3*i+k) =

*(coordinates+3*work*my_rank+3*i+k) + 10E-6 *

*(velocity+3*work*my_rank+3*i+k);

}

if(p>1)

{

MPI_Bcast(coordinates+3*work*my_rank, 3*work, MPI_FLOAT,

my_rank, MPI_COMM_WORLD);

MPI_Bcast(velocity+3*work*my_rank, 3*work, MPI_FLOAT,

my_rank, MPI_COMM_WORLD);

}

finish = MPI_Wtime();

elapsed_time = (finish-start);

max_time = elapsed_time;

sum_time = elapsed_time;

/* Print the final timing result */

if(my_rank != 0)

{

MPI_Send(&elapsed_time, 1, MPI_FLOAT, 0, 1, MPI_COMM_WORLD);

}

else

{

printf("\nN=%d P=%d", N, p);

printf("\nProcess %d: Elapsed time = %f seconds", my_rank,

elapsed_time);

for(i=1; i<p; i++)

{

MPI_Recv(&elapsed_time, 1, MPI_FLOAT, i, 1, MPI_COMM_WORLD,

&status);

sum_time += elapsed_time;

if(max_time < elapsed_time)

{

max_time = elapsed_time;

}

printf("\nProcess %d: Elapsed time = %f seconds", i,

elapsed_time);

}

printf("\nMaximum: %f seconds", max_time);

printf("\nAverage: %f seconds\n\n", sum_time/p);

}

/* The following block of code should be uncommented if final

result, i.e, distribution of the speed of the N particles

is desired */

/*if(my_rank==0)

{

for(i=0; i<N; i++)

{

printf("\nSpeed: %f", dist(velocity+3*i, origin, 3));

}

}*/

/* Shut down MPI */

MPI_Finalize();

}

/*********************************************************************

* Function dist *

* Purpose Computes euclidean distance between two vectors. *

* Input 1. float* x: pointer to first vector. *

* 2. float* y: pointer to second vector. *

* 3. int n: # of components of the vector. *

* Output Returns euclidean distance of two vectors. *

*********************************************************************/

float dist(float* x, float* y, int n)

{

int i;

float d = 0.0;

for(i=0; i<n; i++)

{

d = d + pow((*(x+i) - *(y+i)), 2);

}

return sqrt(d);

}