The MPI program is now working. The bug was not merely a stray pointer but a race between lower and higher levels of the recursion when gathering results: the unexpected order of the messages allowed stack memory to be reclaimed before the corresponding return value arrived, which caused MPI_Recv to scribble over the stack. Surprisingly, the problem was fixed by a thirteen-character change in the sync_bigmul3 routine to explicitly specify the ranks from which to receive messages. As the saying goes, if contractors built houses the way programmers write programs, then not one of the three little pigs would have been saved.
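The fix replaces the source-rank wildcard in sync_bigmul3 with the explicit child rank. A minimal sketch of the change, assuming the broken version used MPI_ANY_SOURCE the same way workn still does:

Code:

/* before: accept a reply from any rank, so a late message from a
   different level of the recursion can match the wrong request */
MPI_Recv(msg,msgsize,MPI_BYTE,
    MPI_ANY_SOURCE,0,MPI_COMM_WORLD,&err);

/* after: accept only the reply from the specific child rank that
   was spawned for this subproblem */
MPI_Recv(msg,msgsize,MPI_BYTE,
    2*mpi_rank+l+i,0,MPI_COMM_WORLD,&err);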
The parallelism is relatively coarse grained. The last step of the recursion uses 3^n ranks while fibowork runs with 2*3^n ranks. Both ways of dividing up the computation must be distributed efficiently among the nodes of a cluster for good performance. In theory this can be achieved by over-provisioning 3^n nodes with 2*3^n ranks. Alternatively, TUNE3 can be set to an impossibly large number so the linear recurrence in fibowork is not parallelized; in that case only the Karatsuba multiplication in bigmul3 is parallelized and 3^n ranks are all that is needed. For example, with two levels of recursive spawning the multiplications alone occupy 3^2 = 9 ranks, while also overlapping the two multiplications in fibowork requires 2*3^2 = 18 ranks.
Sometimes over-provisioning is not practical because the MPI library employs busy waiting (almost all do by default) or takes a relatively long time to start each individual rank. In such situations setting TUNE3 large and using exactly 3^n ranks can result in better performance. When the number of nodes available is not a power of three, however, over-provisioning will almost always increase performance.
Since the super-cheap cluster has 5 single-processor nodes, one option would be to run the MPI program using 18 over-provisioned ranks. Alternatively, one could set TUNE3 very large and run the program using 9 ranks. In the case of the super-cheap cluster, the latter results in slightly better performance: the simple serial code runs in about 24.5 seconds while the MPI code with 9 ranks finishes in about 9 seconds.
The distributed-memory MPI parallel code therefore runs 2.718 times faster than the simple serial code on the super-cheap cluster. A closer comparison of the parallel and serial timings suggests that at least 2.5 seconds of the 9-second runtime is spent on initialization. If these initialization costs are neglected, then 24.5 divided by 6.5 estimates the actual parallel speedup to be over 3.7. Following this line of reasoning, it is reasonable to conjecture that a 4-fold speedup might be achieved when calculating larger Fibonacci numbers.
The MPI code is about 55 lines longer than the original parallel.c OpenMP code. For reference, fibompi.c follows:
Code:
/* fibompi.c -- Compute the nth Fibonacci Number
   Written February 4, 2019 by Eric Olson
   This program demonstrates the expressiveness of C with MPICH
   as measured by explicitly coding a distributed-memory parallel
   version of the Karatsuba multiplication algorithm and then
   using the doubling formulas
       F(2k)   = F(k)[2F(k+1)-F(k)]
       F(2k+1) = F(k+1)[2F(k+1)-F(k)]+(-1)^(k+1)
       F(2k+1) = F(k)[2F(k)+F(k+1)]+(-1)^(k)
       F(2k+2) = F(k+1)[2F(k)+F(k+1)]
   to compute the nth Fibonacci number. Note that n is specified
   in the first line of the main routine.
   For best performance use 3^n nodes over provisioned with 2*3^n
   ranks or set TUNE3 to be large and then use only 3^n ranks. */
#include <stdio.h>
#include <math.h>
#include <string.h>
#include <strings.h>
#include <stdlib.h>
#include <alloca.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <mpi.h>
int mpi_rank, mpi_size;
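/* Tuning parameters: multiplications with fewer than TUNE1 limbs
   use the O(n^2) schoolbook routine, Karatsuba steps smaller than
   TUNE2 limbs stay on the local rank, and fibowork only runs the
   two multiplications of each doubling step concurrently when n
   is at least TUNE3. */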
#define TUNE1 44
#define TUNE2 1024
//#define TUNE3 1048576
#define TUNE3 134217728
typedef unsigned long long llu;
typedef int limb;
typedef unsigned long long limb2;
typedef struct {
    int n; limb *d;
} bignum;
static int rexp,cmult,digits;
static limb rbase,rcarry;
static limb2 cadd,cmax;
static void bigprint(bignum x){
    char fmt[10];
    sprintf(fmt,"%%0%dllu",rexp);
    if(!x.n){
        printf("0\n");
        return;
    }
    for(int i=x.n-1;i>=0;i--){
        if(i==x.n-1) printf("%llu",(llu)x.d[i]);
        else printf(fmt,(llu)x.d[i]);
    }
    printf("\n");
}
static bignum bigtrim(bignum x){
    for(int k=x.n-1;k>=0;k--){
        if(x.d[k]!=0){
            x.n=k+1;
            return x;
        }
    }
    x.n=0;
    return x;
}
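/* Normalize so that every limb lies in [0,rbase): propagate
   deferred carries upward and borrows downward, extending the
   number by one limb if the top limb overflows. */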
static bignum bigcarry(bignum x){
    int imax=x.n-1;
    for(int i=0;i<imax;i++){
        if(x.d[i]>=rbase){
            do {
                x.d[i]-=rbase; x.d[i+1]++;
            } while(x.d[i]>=rbase);
        } else if(x.d[i]<0){
            do {
                x.d[i]+=rbase; x.d[i+1]--;
            } while(x.d[i]<0);
        }
    }
    if(x.d[imax]<rbase) return bigtrim(x);
    x.n++;
    x.d[imax]-=rbase; x.d[imax+1]=1;
    while(x.d[imax]>=rbase){
        x.d[imax]-=rbase; x.d[imax+1]++;
    }
    return x;
}
static bignum bigcopy(bignum z,bignum x){
    memcpy(z.d,x.d,sizeof(limb)*x.n);
    z.n=x.n;
    return z;
}
static bignum bigadd3(limb zd[],bignum x,bignum y){
    bignum z; z.d=zd;
    if(x.n<y.n){
        for(int i=0;i<x.n;i++) z.d[i]=x.d[i]+y.d[i];
        for(int i=x.n;i<y.n;i++) z.d[i]=y.d[i];
        z.n=y.n;
    } else {
        for(int i=0;i<y.n;i++) z.d[i]=x.d[i]+y.d[i];
        for(int i=y.n;i<x.n;i++) z.d[i]=x.d[i];
        z.n=x.n;
    }
    return z;
}
#ifdef NOTUSED
static bignum bigsub3(limb zd[],bignum x,bignum y){
    bignum z; z.d=zd;
    for(int i=0;i<y.n;i++) z.d[i]=x.d[i]-y.d[i];
    for(int i=y.n;i<x.n;i++) z.d[i]=x.d[i];
    z.n=x.n;
    return z;
}
#endif
static bignum bigsub2(bignum x,bignum y){
    for(int i=0;i<y.n;i++) x.d[i]-=y.d[i];
    return x;
}
static bignum bigadd2(bignum x,bignum y){
    if(x.n<y.n){
        if(x.n>0) {
            x.d[0]--;
            for(int i=0;i<x.n;i++) x.d[i]+=y.d[i]-rcarry;
            x.d[x.n]=y.d[x.n]+1;
            for(int i=x.n+1;i<y.n;i++) x.d[i]=y.d[i];
            x.n=y.n;
        } else {
            x=bigcopy(x,y);
        }
    } else {
        x.d[0]--;
        for(int i=0;i<y.n;i++) x.d[i]+=y.d[i]-rcarry;
        if(y.n==x.n) {
            x.n++;
            x.d[y.n]=1;
        } else {
            x.d[y.n]++;
        }
    }
    return x;
}
static bignum bigdec1(bignum x){
    int imax=x.n-1;
    x.d[0]--;
    for(int i=0;i<imax;i++){
        if(x.d[i]>=0) return x;
        x.d[i]+=rbase; x.d[i+1]--;
    }
    return x;
}
static bignum biginc1(bignum x){
    int imax=x.n-1;
    if(imax>=0) {
        x.d[0]++;
        for(int i=0;i<imax;i++){
            if(x.d[i]<rbase) return x;
            x.d[i]-=rbase; x.d[i+1]++;
        }
        if(x.d[imax]<rbase) return x;
        x.d[imax]-=rbase;
    }
    x.n++;
    x.d[imax+1]=1;
    return x;
}
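/* Schoolbook multiplication using 64-bit column sums. To avoid
   overflow the partial sums are reduced every cmult rows: any
   column that reaches cmax has cmax subtracted and cadd added to
   the next column, which preserves the value since
   cmax=cadd*rbase. */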
static bignum bigmul3w(limb zd[],bignum x,bignum y){
    bignum z; z.d=zd;
    int imax=x.n+y.n;
    limb2 s[imax];
    bzero(s,sizeof(limb2)*imax);
    imax--;
    for(int i=0;i<x.n;i++){
        for(int j=0;j<y.n;j++){
            s[i+j]+=(limb2)x.d[i]*y.d[j];
        }
        if((i+1)%cmult==0){
            for(int k=0;k<imax;k++){
                if(s[k]<cmax) continue;
                s[k]-=cmax; s[k+1]+=cadd;
            }
        }
    }
    for(int k=0;k<imax;k++){
        s[k+1]+=s[k]/rbase; z.d[k]=s[k]%rbase;
    }
    z.d[imax]=s[imax];
    z.n=imax+1;
    return bigtrim(z);
}
static bignum biglow(bignum x,int n){
    if(x.n>n) x.n=n;
    return x;
}
static bignum bighigh(bignum x,int n){
    if(x.n<n) x.n=0;
    else {
        x.n-=n;
        x.d=&x.d[n];
    }
    return x;
}
static bignum bigmul3s(limb zd[],bignum x,bignum y){
    if(x.n<TUNE1||y.n<TUNE1) return bigmul3w(zd,x,y);
    int M=x.n<y.n?y.n:x.n; int n=M/2;
    bignum z; z.d=zd;
    bignum x0=biglow(x,n),x1=bighigh(x,n);
    bignum y0=biglow(y,n),y1=bighigh(y,n);
    limb z0d[M+1],z1d[M+3],z1yd[n+1],z2d[M+3];
    bignum z0,z2,z1x,z1y,z1;
    z0=bigmul3s(z0d,x0,y0);
    z2=bigmul3s(z2d,x1,y1);
    z1x=bigcarry(bigadd3(z1d,x0,x1));
    z1y=bigcarry(bigadd3(z1yd,y0,y1));
    z1=bigmul3s(z1x.d,z1x,z1y);
    z1=bigcarry(bigsub2(bigsub2(z1,z0),z2));
    z=bigcopy(z,z0);
    z=bigadd3(&z.d[n],(bignum){z.n-n,&z.d[n]},z1);
    z=bigadd2((bignum){z.n-n,&z.d[n]},z2);
    z.n=z.n+2*n; z.d=zd;
    return bigcarry(z);
}
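/* Distributed Karatsuba: the rank tree is implicit in the rank
   numbers. A parent at rank r packs a subproblem into a
   bigmul3args message and sends it to rank 2*r+c with
   spawn_bigmul3, computes the middle product itself, then waits
   in sync_bigmul3 for the bigmul3ret replies from its specific
   children. The result pointer zp travels with the message and
   is echoed back so the parent knows where to copy the product.
   Worker ranks loop in workn until rank 0 sends a message whose
   zp field is null. */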
static bignum bigmul3(int l,limb zd[],bignum x,bignum y);
typedef struct { bignum *zp; int l,nx,ny; limb d[]; } bigmul3args;
static void spawn_bigmul3(int c,bignum *zp,int l,limb zd[],bignum x,bignum y){
    unsigned int msgsize=sizeof(bigmul3args)+(x.n+y.n)*sizeof(limb);
    bigmul3args *msg=alloca(msgsize);
    msg->zp=zp; msg->l=l; msg->nx=x.n; msg->ny=y.n;
    memcpy(&msg->d[0],x.d,x.n*sizeof(limb));
    memcpy(&msg->d[x.n],y.d,y.n*sizeof(limb));
    zp->d=zd;
    MPI_Send(msg,msgsize,MPI_BYTE,2*mpi_rank+c,0,MPI_COMM_WORLD);
    return;
}
typedef struct { bignum *zp; int n; limb d[]; } bigmul3ret;
static int workn(){
    unsigned int insize=sizeof(bigmul3args)+digits*sizeof(limb);
    bigmul3args *in=alloca(insize);
    for(;;){
        MPI_Status err;
        MPI_Recv(in,insize,MPI_BYTE,
            MPI_ANY_SOURCE,0,MPI_COMM_WORLD,&err);
        int rankret=err.MPI_SOURCE;
        if(in->zp==0) return 0;
        bignum x,y,z;
        x.n=in->nx; x.d=&in->d[0];
        y.n=in->ny; y.d=&in->d[x.n];
        bigmul3ret *out=(bigmul3ret *)in;
        z=bigmul3(in->l,out->d,x,y);
        out->n=z.n;
        unsigned int outsize=sizeof(bigmul3ret)+z.n*sizeof(limb);
        MPI_Send(out,outsize,MPI_BYTE,rankret,0,MPI_COMM_WORLD);
    }
}
static void sync_bigmul3(int l,int p){
    unsigned int msgsize=sizeof(bigmul3ret)+digits*sizeof(limb);
    bigmul3ret *msg=alloca(msgsize);
    for(int i=0;i<p;i++){
        MPI_Status err;
        MPI_Recv(msg,msgsize,MPI_BYTE,
            2*mpi_rank+l+i,0,MPI_COMM_WORLD,&err);
        msg->zp->n=msg->n;
        memcpy(msg->zp->d,msg->d,msg->n*sizeof(limb));
    }
    return;
}
static bignum bigmul3(int l,limb zd[],bignum x,bignum y){
    int l3=l*3;
    if(x.n<TUNE1||y.n<TUNE1) return bigmul3w(zd,x,y);
    int M=x.n<y.n?y.n:x.n; int n=M/2;
    if(M<TUNE2||l3>mpi_size) return bigmul3s(zd,x,y);
    bignum z; z.d=zd;
    bignum x0=biglow(x,n),x1=bighigh(x,n);
    bignum y0=biglow(y,n),y1=bighigh(y,n);
    limb z0d[M+1],z1d[M+3],z1yd[n+1],z2d[M+3];
    bignum z0,z2,z1x,z1y,z1;
    spawn_bigmul3(l,&z0,l3,z0d,x0,y0);
    spawn_bigmul3(l+1,&z2,l3,z2d,x1,y1);
    z1x=bigcarry(bigadd3(z1d,x0,x1));
    z1y=bigcarry(bigadd3(z1yd,y0,y1));
    z1=bigmul3(l3,z1x.d,z1x,z1y);
    sync_bigmul3(l,2);
    z1=bigcarry(bigsub2(bigsub2(z1,z0),z2));
    z=bigcopy(z,z0);
    z=bigadd3(&z.d[n],(bignum){z.n-n,&z.d[n]},z1);
    z=bigadd2((bignum){z.n-n,&z.d[n]},z2);
    z.n=z.n+2*n; z.d=zd;
    return bigcarry(z);
}
static bignum t1,a,b;
static void fiboworks(int n){
    if(n==0){
        a.n=0; b.n=1; b.d[0]=1;
        return;
    }
    fiboworks(n/2);
    if(n%2==0){
        // [a,b]=[a*(2*b-a),b*(2*b-a)-(-1)^k]
        t1=bigcarry(bigsub2(bigadd3(t1.d,b,b),a));
        a=bigmul3(1,a.d,a,t1);
        b=bigmul3(1,b.d,b,t1);
        if(n%4==0) b=bigdec1(b); else b=biginc1(b);
    } else {
        // [a,b]=[a*(2*a+b)+(-1)^k,b*(2*a+b)]
        t1=bigcarry(bigadd2(bigadd3(t1.d,a,a),b));
        b=bigmul3(1,b.d,b,t1);
        a=bigmul3(1,a.d,a,t1);
        if(n%4==1) a=biginc1(a); else a=bigdec1(a);
    }
    return;
}
static void fibowork(int n){
    if(n==0){
        a.n=0; b.n=1; b.d[0]=1;
        return;
    }
    if(n<TUNE3||2>mpi_size) {
        fiboworks(n);
        return;
    }
    fibowork(n/2);
    if(n%2==0){
        // [a,b]=[a*(2*b-a),b*(2*b-a)-(-1)^k]
        t1=bigcarry(bigsub2(bigadd3(t1.d,b,b),a));
        spawn_bigmul3(1,&a,2,a.d,a,t1);
        b=bigmul3(2,b.d,b,t1);
        sync_bigmul3(1,1);
        if(n%4==0) b=bigdec1(b); else b=biginc1(b);
    } else {
        // [a,b]=[a*(2*a+b)+(-1)^k,b*(2*a+b)]
        t1=bigcarry(bigadd2(bigadd3(t1.d,a,a),b));
        spawn_bigmul3(1,&b,2,b.d,b,t1);
        a=bigmul3(2,a.d,a,t1);
        sync_bigmul3(1,1);
        if(n%4==1) a=biginc1(a); else a=bigdec1(a);
    }
    return;
}
static bignum fibo(int n, limb xd[]){
    b.d=xd;
    if(n<2){
        b.n=1; b.d[0]=n;
        return b;
    }
    limb ad[digits]; a.d=ad;
    fibowork((n-1)/2);
    if(n%2==0){
        // b=b*(2*a+b)
        t1=bigcarry(bigadd2(bigadd3(t1.d,a,a),b));
        b=bigmul3(1,b.d,b,t1);
    } else {
        // b=b*(2*b-a)-(-1)^k
        t1=bigcarry(bigsub2(bigadd3(t1.d,b,b),a));
        b=bigmul3(1,b.d,b,t1);
        if(n%4==1) b=bigdec1(b); else b=biginc1(b);
    }
    return b;
}
static int work0(int n){
    limb t1d[digits]; t1.d=t1d;
    limb xd[digits];
    bignum x=fibo(n,xd);
    bigprint(x);
    return 0;
}
int main(int argc, char* argv[]){
    int n=4784969;
    // int n=7499;
    setrlimit(RLIMIT_STACK,
        &(const struct rlimit)
        {RLIM_INFINITY,RLIM_INFINITY});
    if(sizeof(limb)*2!=sizeof(limb2)){
        fprintf(stderr,
            "sizeof(limb)=%llu must be half of sizeof(limb2)=%llu!\n",
            (llu)sizeof(limb),(llu)sizeof(limb2));
        return 1;
    }
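    /* Work in a power-of-ten radix so no base conversion is
       needed for output: rexp is the largest exponent such that
       10^rexp fits in a limb and rbase=10^rexp. The number of
       limbs is estimated from Binet's formula using log10 of the
       golden ratio. */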
    limb limbmax=((limb2)1<<(8*sizeof(limb)-1))-1;
    limb2 limb2max=((limb2)1<<(8*sizeof(limb)))-1;
    limb2max=(limb2max<<(8*sizeof(limb)))+limb2max;
    rexp=log(limbmax)/log(10);
    rbase=pow(10,rexp)+0.5; rcarry=rbase-1;
    cmult=(limb2max-2*rbase)/rbase/rbase/2;
    cadd=(limb2)rbase*cmult; cmax=cadd*rbase;
    digits=n*log10(0.5*(1+sqrt(5.0)))/rexp+4;
    MPI_Init(&argc,&argv);
    MPI_Comm_rank(MPI_COMM_WORLD,&mpi_rank);
    MPI_Comm_size(MPI_COMM_WORLD,&mpi_size);
    if(mpi_rank==0){
        work0(n);
        bignum *zp=0;
        for(int i=1;i<mpi_size;i++){
            MPI_Send(&zp,sizeof(bignum *),MPI_BYTE,i,0,MPI_COMM_WORLD);
        }
    } else {
        workn();
    }
    MPI_Finalize();
    return 0;
}
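To try it, assuming MPICH or a compatible MPI implementation is installed, something like mpicc -O3 -o fibompi fibompi.c -lm should build the program (the -lm is needed for the log, pow and sqrt calls), after which mpiexec -n 9 ./fibompi runs it in the 9-rank configuration described above.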