/*********************************************/ /* O(N), Multi-Channel Perturbation Method */ /* to Calclate the Local Density of States */ /* [Phys. Rev. B 56, pp. 3524 (1997)]. */ /* */ /* Ver 2.1 June 22, 1998 */ /* Developed by Ju Li (MIT) */ /*********************************************/ /* Version Notes: 1. TTY interface. 2. Woon's pair potential. 3. Dynamical matrix for arbitrary configuration and pair potential. 4. User-defined supercell k sampling or random sampling. 5. High-precision (order-12) integration of equation of motion. 6. Spin-wave amplitude encoding to minimize the point spread. 7. Shift the spectrum for better convergence at low frequencies. 8. Parallel shared memory using UNIX fork(), shmop(), semop(). 9. Flop rate benchmark. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mcp.h" int main() { int i,j,k,l,m,n,ip,ipm,ibm; double cc,dd,ee,tmp[3][3],sumweight; double sxji,syji,szji,rx,ry,rz; char *p, sentence[MAX_STRING_LENGTH]; /* Read instructions from input, such as "mon" file: */ printf ("\nReading instructions...\n"); fscanf (stdin, "Name of configuration (->*, default=perfect_crystal.lj):\n%s\n\n", config_name); if (!strcasecmp(config_name,"default")) sprintf(config_name, "perfect_crystal.lj"); fscanf (stdin, "Pair potential to be used (LJ6_12 or WOON):\n%s\n\n", potential_function); LJ6_12 = !strcmp(potential_function,"LJ6_12"); WOON = !strcmp(potential_function,"WOON"); if ((!LJ6_12) && (!WOON)) { fprintf (stdout, "\n **Error: Choose between LJ6_12 or WOON pair potential\n"); exit(1); } fscanf (stdin, "Load configuration from file (default=*):\n%s\n\n", fn_config_read); if (!strcasecmp(fn_config_read,"default")) sprintf(fn_config_read, "%s", config_name); fscanf(stdin, "Atom index and Cartesian direction for LDOS:\n%d %lf %lf %lf\n\n", &iatom, &Q[0][0], &Q[0][1], &Q[0][2]); fscanf (stdin, "Save LDOS on file (default=*.ldos):\n%s\n\n", fn_ldos); if (!strcasecmp(fn_ldos, "default")) sprintf(fn_ldos, "%s.ldos", config_name); fscanf (stdin, "Supercell k-point sampling file (if numeric then\n"); fscanf (stdin, "apply random sampling in BZ, default=Kpts/sckpt.8):\n%s\n\n", fn_kpt); if (random_k_sampling=((*fn_kpt>='0')&&(*fn_kpt<='9'))) { /* user inputs a integer, do random k sampling */ for (p=fn_kpt;(*p>='0')&&(*p<='9');p++); *p = 0; sscanf(fn_kpt, "%d", &num_kpts); } else if (!strcasecmp(fn_kpt, "default")) sprintf(fn_kpt, "Kpts/sckpt.8"); fscanf(stdin, "Ignore dynamical matrix components smaller than (THz^2):\n%lf\n\n", &tiny); fscanf(stdin, "Minimum (base) frequency (THz):\n%lf\n\n", &min_v); fscanf(stdin, "Maximum frequency (THz):\n%lf\n\n", &max_v); fscanf(stdin, "Shift the spectrum by (THz):\n%lf\n\n", &shift_v); fscanf(stdin, "Twist period of spin-wave amplitude encoding:\n%d\n\n", &ktwist); fscanf(stdin, "Number of steps of a period for the maximum frequency:\n%d\n\n", &mincycle); fscanf(stdin, "To run on how many processors:\n%d\n\n", &num_processors); if (num_processors > MAX_PROCESSORS) { printf ("MAX_PROCESSORS < %d, revise source code.\n", num_processors); exit(1); } fscanf(stdin, "Work load of the master process\n"); fscanf(stdin, "(-1 -> {1., 0.98, 0.98, 0.98, 0.98, 0.98, 0.98, 0.98}):\n%lf\n\n", &master_load); if (master_load < 0) master_load = default_master_load[num_processors-1]; if (num_processors==1) master_load = 1.0; fscanf(stdin, "Frequency of getting reported (in Gflop)\n"); fscanf(stdin, "(NULL=don't, default=0.5 Gflop):\n%s\n\n", sentence); if (print_floprate=strcasecmp(sentence, "NULL")) if (!strcasecmp(sentence, "default")) work_in_Gflop = 0.5; else sscanf(sentence, "%lf", &work_in_Gflop); fscanf (stdin, "Random number generator seed (long integer):\n%ld\n\n", &iseed); srandom (iseed); printf("Instructions read complete.\n\n"); /* Instructions read complete */ /* print out mission statement */ printf ("LDOS will be calculated for atom %d in \"%s\"\n", iatom, config_name); printf ("along (%.2f,%.2f,%.2f) direction using %s potential\n", Q[0][0], Q[0][1], Q[0][2], potential_function); printf ("with Rcut = %f = %f Angstrom,\n", CUTOFF, CUTOFF*ULENGTH_IN_ANGSTROM); /* first channel, after-transformation freq. fixed: */ min_channel = (int) ceil(shift_v/min_v); dd = min_channel*min_v; /* second channel, after-transformation freq. fixed: */ ee = dd + min_v; /* make the 2nd real freq. (cc x the 1st real freq.) */ cc = 2.5; shift_v = sqrt((cc*cc*dd*dd-ee*ee)/(cc*cc-1.)); /* maximum after-transformation freq. */ max_v = sqrt(max_v*max_v+shift_v*shift_v); max_channel = floor(max_v/min_v); printf ("for %d frequency channels between [%f, %f] THz, or\n", max_channel-min_channel+1, sqrt(dd*dd-shift_v*shift_v), sqrt(max_channel*max_channel*min_v*min_v-shift_v*shift_v)); printf ("[%f, %f] THz for the shifted dynamical matrix.\n\n", dd, max_channel*min_v); /* allocating memory for frequency channels */ printf ("Allocating memory for %d frequency channels.\n\n", max_channel-min_channel+1); /* frequencies in THz */ v1 = (double *) malloc((max_channel-min_channel+1)*sizeof(double)); v2 = (double *) malloc((max_channel-min_channel+1)*sizeof(double)); v4 = (double *) malloc((max_channel-min_channel+1)*sizeof(double)); v6 = (double *) malloc((max_channel-min_channel+1)*sizeof(double)); v8 = (double *) malloc((max_channel-min_channel+1)*sizeof(double)); v10 = (double *) malloc((max_channel-min_channel+1)*sizeof(double)); /* real and imaginary amplitude series */ ar = (double *) malloc((max_channel-min_channel+1)*sizeof(double)); ai = (double *) malloc((max_channel-min_channel+1)*sizeof(double)); /* spin wave phase encoding */ phi = (double *) malloc((max_channel-min_channel+1)*sizeof(double)); /* LDOS result */ fup = (double *) malloc((max_channel-min_channel+1)*sizeof(double)); fvp = (double *) malloc((max_channel-min_channel+1)*sizeof(double)); ldos = (double *) malloc((max_channel-min_channel+1)*sizeof(double)); for (m=0; m<=max_channel-min_channel; m++) { v1[m] = (m+min_channel)*min_v; v2[m] = pow(v1[m],2.); v4[m] = pow(v1[m],4.); v6[m] = pow(v1[m],6.); v8[m] = pow(v1[m],8.); v10[m]= pow(v1[m],10.); ar[m] = 0.; ai[m] = 0.; phi[m] = 0.; ldos[m] = 0.; } /* total number of integration steps: */ mint = max_channel*mincycle; /* 2 x Pi x (timestep size in ps): */ delta = 2.*PI/min_v/mint; printf ("Run track is %d steps long, equivalent step size = %f ps.\n", mint, delta/2./PI); del2 = pow(delta,2.)/2.; del3 = pow(delta,3.)/6.; del4 = pow(delta,4.)/24.; del5 = pow(delta,5.)/120.; del6 = pow(delta,6.)/720.; del7 = pow(delta,7.)/5040.; del8 = pow(delta,8.)/40320.; del9 = pow(delta,9.)/362880.; del10 = pow(delta,10.)/3628800.; del11 = pow(delta,11.)/39916800.; printf ("Allocating memory for %d phases.\n\n", mint); /* prepare the phases */ dsin = (double *) malloc(mint*sizeof(double)); dcos = (double *) malloc(mint*sizeof(double)); for (ip=1; ip<=mint; ip++) { dsin[ip-1] = sin(2.*PI/mint*ip); dcos[ip-1] = cos(2.*PI/mint*ip); } printf("The configuration is being loaded from \"%s\".\n", fn_config_read); if ((filehandle=fopen(fn_config_read, "r"))==NULL) { fprintf(stdout, "\n **Error: cannot locate file \"%s\"\n.", fn_config_read); exit(1); } fscanf (filehandle, "Number of particles = %d\n\n", &np); printf ("Allocate property arrays for %d particles.\n\n",np); /* reduced coordinates */ mass = (double *) malloc(np*sizeof(double)); s = (double *) malloc(3*np*sizeof(double)); uold = (double *) malloc(6*np*sizeof(double)); /* read in supercell geometry from configuration file */ fscanf (filehandle, "H(1,1) = %lf Angstrom\n", &H[0][0]); fscanf (filehandle, "H(1,2) = %lf Angstrom\n", &H[0][1]); fscanf (filehandle, "H(1,3) = %lf Angstrom\n\n", &H[0][2]); fscanf (filehandle, "H(2,1) = %lf Angstrom\n", &H[1][0]); fscanf (filehandle, "H(2,2) = %lf Angstrom\n", &H[1][1]); fscanf (filehandle, "H(2,3) = %lf Angstrom\n\n", &H[1][2]); fscanf (filehandle, "H(3,1) = %lf Angstrom\n", &H[2][0]); fscanf (filehandle, "H(3,2) = %lf Angstrom\n", &H[2][1]); fscanf (filehandle, "H(3,3) = %lf Angstrom\n\n", &H[2][2]); for (i=0;i<3;i++) for (j=0;j<3;j++) H[i][j] /= ULENGTH_IN_ANGSTROM; volume = fabs(matinv(H, HI)); printf ("The supercell geometry is\n"); printf (" | %.5f %.5f %.5f |\n", H[0][0]*ULENGTH_IN_ANGSTROM, H[0][1]*ULENGTH_IN_ANGSTROM, H[0][2]*ULENGTH_IN_ANGSTROM); printf (" H[A] = | %.5f %.5f %.5f |\n", H[1][0]*ULENGTH_IN_ANGSTROM, H[1][1]*ULENGTH_IN_ANGSTROM, H[1][2]*ULENGTH_IN_ANGSTROM); printf (" | %.5f %.5f %.5f |\n", H[2][0]*ULENGTH_IN_ANGSTROM, H[2][1]*ULENGTH_IN_ANGSTROM, H[2][2]*ULENGTH_IN_ANGSTROM); printf ("volume = %f = %f A^3.\n\n", volume, volume*ULENGTH_IN_ANGSTROM*ULENGTH_IN_ANGSTROM*ULENGTH_IN_ANGSTROM); if (np/volume > 0.85*MAXDENSITY) { printf("Current density %f is above 85%% of MAXDENSITY %f in \"mcp_constant.h\".\n"); printf("Overflow feared. Please increase MAXDENSITY in \"mcp_constant.h\".\n"); exit(1); } /* To simplify the programming, we require the CUTOFF sphere to be contained in the parallelepiped formed by H(1,:), H(2,:), H(3,:), which requires the thickness opposite to any of the parallelepiped surfaces to be greater than 2*CUTOFF, but that thickness is just 1/|G_i|, where G^T * H = I, the corresponding reciprocal lattice vector. */ if ((CUTOFF>0.5/sqrt(HI[0][0]*HI[0][0]+HI[1][0]*HI[1][0]+HI[2][0]*HI[2][0])) || (CUTOFF>0.5/sqrt(HI[0][1]*HI[0][1]+HI[1][1]*HI[1][1]+HI[2][1]*HI[2][1])) || (CUTOFF>0.5/sqrt(HI[0][2]*HI[0][2]+HI[1][2]*HI[1][2]+HI[2][2]*HI[2][2]))) { printf ("For code efficiency we require a larger system,\n"); printf ("so there is no self-interaction.\n"); exit(1); } /* read in reduced coordinates from configuration file */ fscanf(filehandle, "Mass(amu) sx sy sz sx1(1/ps) sy1(1/ps) sz1(1/ps)\n"); for (i=0;i<3*np;i+=3) { fscanf (filehandle, "%lf %lf %lf %lf %lf %lf %lf\n", &mass[i/3], &s[i], &s[i+1], &s[i+2], &cc, &cc, &cc); /* convert from AMU to reduced mass unit */ mass[i/3] = mass[i/3]/AVO*1E-3/UMASS_IN_KG; } fclose (filehandle); printf ("\"%s\" generated successfully on the computer.\n\n", config_name); printf ("Obtain an orthogonal transformation\n"); cc = sqrt(Q[0][0]*Q[0][0]+Q[0][1]*Q[0][1]+Q[0][2]*Q[0][2]); Q[0][0] /= cc; Q[0][1] /= cc; Q[0][2] /= cc; /* get a random vector v in space: */ cc = Q[0][2]; dd = Q[0][0]; ee = -Q[0][1]; /* take a cross-product v*c1 */ Q[1][0] = dd*Q[0][2]-ee*Q[0][1]; Q[1][1] = ee*Q[0][0]-cc*Q[0][2]; Q[1][2] = cc*Q[0][1]-dd*Q[0][0]; cc = sqrt(Q[1][0]*Q[1][0]+Q[1][1]*Q[1][1]+Q[1][2]*Q[1][2]); Q[1][0] /= cc; Q[1][1] /= cc; Q[1][2] /= cc; /* take the cross-product c1*c2 */ Q[2][0] = Q[0][1]*Q[1][2]-Q[0][2]*Q[1][1]; Q[2][1] = Q[0][2]*Q[1][0]-Q[0][0]*Q[1][2]; Q[2][2] = Q[0][0]*Q[1][1]-Q[0][1]*Q[1][0]; /* get the transpose */ for (i=0; i<3; i++) for (j=0; j<3; j++) QT[i][j] = Q[j][i]; printf (" | %7.4f %7.4f %7.4f |\n",Q[0][0],Q[0][1],Q[0][2]); printf (" Q = | %7.4f %7.4f %7.4f |\n",Q[1][0],Q[1][1],Q[1][2]); printf (" | %7.4f %7.4f %7.4f |\n",Q[2][0],Q[2][1],Q[2][2]); printf ("making (%.2f,%.2f,%.2f) a normal coordinate.\n\n", Q[0][0],Q[0][1],Q[0][2]); /* count the total number of interactions */ printf ("Counting all possible interactions within Rcut = %f A.\n", CUTOFF*ULENGTH_IN_ANGSTROM); for (num_dm=0,i=0; i0.5) sxji--; else if (sxji<-0.5) sxji++; if(syji>0.5) syji--; else if (syji<-0.5) syji++; if (szji>0.5) szji--; else if (szji<-0.5) szji++; rx = H[0][0]*sxji + H[1][0]*syji + H[2][0]*szji; ry = H[0][1]*sxji + H[1][1]*syji + H[2][1]*szji; rz = H[0][2]*sxji + H[1][2]*syji + H[2][2]*szji; if (rx*rx+ry*ry+rz*rz0.5) sxji--; else if (sxji<-0.5) sxji++; if(syji>0.5) syji--; else if (syji<-0.5) syji++; if (szji>0.5) szji--; else if (szji<-0.5) szji++; rx = H[0][0]*sxji + H[1][0]*syji + H[2][0]*szji; ry = H[0][1]*sxji + H[1][1]*syji + H[2][1]*szji; rz = H[0][2]*sxji + H[1][2]*syji + H[2][2]*szji; if (rx*rx+ry*ry+rz*rztiny) { num[3*j+l+1]+=2; num[3*i+k+1]+=2; } ip++; } } /* convert into accumulative index: the dynamical matrix is compressed */ /* into 1D arrays row[]+value[], where the ith column of D(k) occupies */ /* slots [num[i], num[i+1]) on those 1D arrays. */ for (i=0; i<3*np; i++) num[i+1] += num[i]; printf("After getting rid of off-diagonal components < %f THz^2,\n",tiny); printf("we will have on average %d significant elements per column on D(k).\n", (num[3*np]>>1)/3/np); printf("Allocating memory for D(k) (sparse).\n\n"); /* currently filled slots [num[i], idx[i]) for the ith column */ idx = (int *) malloc(3*np*sizeof(int)); /* row index for each D(k) element */ row = (int *) malloc((num[3*np]>>1)*sizeof(int)); /* value of D(k) element (complex) */ value = (double *) malloc(num[3*np]*sizeof(double)); if ((idx==NULL)||(row==NULL)||(value==NULL)) { printf("Not enough memory (%d bytes).\n", (3*np+num[3*np]>>1) *sizeof(int) + num[3*np]*sizeof(double)); exit(1); } /* get sampling k-points in BZ */ if (random_k_sampling) { printf("Randomly generating %d k-points in supercell BZ.\n\n", num_kpts); kpts = (double *) malloc(4*num_kpts*sizeof(double)); for (i=0; i\n"); /* set data layout */ n = 3*np/num_processors; domain[0] = 0; domain[1] = ceil(master_load*n); for (i=1; i>1) /(ORDER_OF_INTEGRATION-2)*2); if (print_floprate) printf ("Flop rate will be shown for every %.1f Mflop chunk of work.\n", work_in_Gflop*1000); printf ("Final LDOS result will be saved on \"%s\".\n\n", fn_ldos); /* go over all supercell k-points */ for (sumweight=0.,k=0; k>1] = 2*((i/3)*3); row[(num[i]>>1)+1] = 2*((i/3)*3+1); row[(num[i]>>1)+2] = 2*((i/3)*3+2); } for (ip=0; iptiny) { /* (ix, jx) */ row[idx[3*j]>>1] = 2*(3*i); value[idx[3*j]] -= dm[9*ip+3]*cc; value[idx[3*j]+1] -= dm[9*ip+3]*dd; idx[3*j]+=2; /* (jx, ix) */ row[idx[3*i]>>1] = 2*(3*j); value[idx[3*i]] -= dm[9*ip+3]*cc; value[idx[3*i]+1] += dm[9*ip+3]*dd; idx[3*i]+=2; } if (fabs(dm[9*ip+6])>tiny) { /* (iy, jy) */ row[idx[3*j+1]>>1] = 2*(3*i+1); value[idx[3*j+1]] -= dm[9*ip+6]*cc; value[idx[3*j+1]+1] -= dm[9*ip+6]*dd; idx[3*j+1]+=2; /* (jy, iy) */ row[idx[3*i+1]>>1] = 2*(3*j+1); value[idx[3*i+1]] -= dm[9*ip+6]*cc; value[idx[3*i+1]+1] += dm[9*ip+6]*dd; idx[3*i+1]+=2; } if (fabs(dm[9*ip+8])>tiny) { /* (iz, jz) */ row[idx[3*j+2]>>1] = 2*(3*i+2); value[idx[3*j+2]] -= dm[9*ip+8]*cc; value[idx[3*j+2]+1] -= dm[9*ip+8]*dd; idx[3*j+2]+=2; /* (jz, iz) */ row[idx[3*i+2]>>1] = 2*(3*j+2); value[idx[3*i+2]] -= dm[9*ip+8]*cc; value[idx[3*i+2]+1] += dm[9*ip+8]*dd; idx[3*i+2]+=2; } if (fabs(dm[9*ip+4])>tiny) { /* (ix, jy) */ row[idx[3*j+1]>>1] = 2*(3*i); value[idx[3*j+1]] -= dm[9*ip+4]*cc; value[idx[3*j+1]+1] -= dm[9*ip+4]*dd; idx[3*j+1]+=2; /* (jy, ix) */ row[idx[3*i]>>1] = 2*(3*j+1); value[idx[3*i]] -= dm[9*ip+4]*cc; value[idx[3*i]+1] += dm[9*ip+4]*dd; idx[3*i]+=2; /* (iy, jx) */ row[idx[3*j]>>1] = 2*(3*i+1); value[idx[3*j]] -= dm[9*ip+4]*cc; value[idx[3*j]+1] -= dm[9*ip+4]*dd; idx[3*j]+=2; /* (jx, iy) */ row[idx[3*i+1]>>1] = 2*(3*j); value[idx[3*i+1]] -= dm[9*ip+4]*cc; value[idx[3*i+1]+1] += dm[9*ip+4]*dd; idx[3*i+1]+=2; } if (fabs(dm[9*ip+5])>tiny) { /* (ix, jz) */ row[idx[3*j+2]>>1] = 2*(3*i); value[idx[3*j+2]] -= dm[9*ip+5]*cc; value[idx[3*j+2]+1] -= dm[9*ip+5]*dd; idx[3*j+2]+=2; /* (jz, ix) */ row[idx[3*i]>>1] = 2*(3*j+2); value[idx[3*i]] -= dm[9*ip+5]*cc; value[idx[3*i]+1] += dm[9*ip+5]*dd; idx[3*i]+=2; /* (iz, jx) */ row[idx[3*j]>>1] = 2*(3*i+2); value[idx[3*j]] -= dm[9*ip+5]*cc; value[idx[3*j]+1] -= dm[9*ip+5]*dd; idx[3*j]+=2; /* (jx, iz) */ row[idx[3*i+2]>>1] = 2*(3*j); value[idx[3*i+2]] -= dm[9*ip+5]*cc; value[idx[3*i+2]+1] += dm[9*ip+5]*dd; idx[3*i+2]+=2; } if (fabs(dm[9*ip+7])>tiny) { /* (iy, jz) */ row[idx[3*j+2]>>1] = 2*(3*i+1); value[idx[3*j+2]] -= dm[9*ip+7]*cc; value[idx[3*j+2]+1] -= dm[9*ip+7]*dd; idx[3*j+2]+=2; /* (jz, iy) */ row[idx[3*i+1]>>1] = 2*(3*j+2); value[idx[3*i+1]] -= dm[9*ip+7]*cc; value[idx[3*i+1]+1] += dm[9*ip+7]*dd; idx[3*i+1]+=2; /* (iz, jy) */ row[idx[3*j+1]>>1] = 2*(3*i+2); value[idx[3*j+1]] -= dm[9*ip+7]*cc; value[idx[3*j+1]+1] -= dm[9*ip+7]*dd; idx[3*j+1]+=2; /* (jy, iz) */ row[idx[3*i+2]>>1] = 2*(3*j+1); value[idx[3*i+2]] -= dm[9*ip+7]*cc; value[idx[3*i+2]+1] += dm[9*ip+7]*dd; idx[3*i+2]+=2; } } /* checksum */ for (i=0; i<3*np; i++) if (idx[i] != num[i+1]) { printf ("Internal data corruption.\n"); free_shm(); free_semaphore(); exit(1); } /*************************************/ /* Multi-Channel Perturbation Method */ /*************************************/ /* reset the error handlers to system defaults */ reset_error_handler(); /* fork the program here: resources such as row[] */ /* (read only) will be automatically "cloned". */ master_pid = getpid(); my_pid = master_pid; my_idx = 0; for (i=1; i0.5) ar[1] = -ar[1]; for (m=2; m<=max_channel-min_channel; m++) ar[m] = -ar[m-2]; /* +--++--++--++--+ */ n = 1; if (frandom()>0.5) n = -n ; for (m=0; m<=max_channel-min_channel; m++) { ai[m] = n*ar[m]; n = -n; } /* apply a random "spin-wave" encoding to avoid */ /* numerical problems during time integration */ phi[0] = 0.; cc = 2*PI/ktwist; if (frandom()>0.5) cc = -cc; for (m=0; m<=max_channel-min_channel; m++) { cmplx_mul (ar[m], ai[m], cos(phi[m]), sin(phi[m]), &ar[m], &ai[m]); if (m!=0) phi[m] = phi[m-1] + cc*frandom(); fup[m] = 0.; fvp[m] = 0.; } /* Precondition the integration: only odd-power derivatives exist at t=0. To save space, u2 stands for u3, u4 stands for u5, etc. */ for (i=0; i<6*np; i++) { u[i] = 0.; u2[i] = 0.; u4[i] = 0.; u6[i] = 0.; u8[i] = 0.; u10[i] = 0.; } for (m=0; m<=max_channel-min_channel; m++) { u2[6*iatom] += v1[m]*ar[m]; u4[6*iatom] -= v1[m]*v2[m]*ar[m]; u6[6*iatom] += v1[m]*v4[m]*ar[m]; u8[6*iatom] -= v1[m]*v6[m]*ar[m]; u10[6*iatom] += v1[m]*v8[m]*ar[m]; u2[6*iatom+1] += v1[m]*ai[m]; u4[6*iatom+1] -= v1[m]*v2[m]*ai[m]; u6[6*iatom+1] += v1[m]*v4[m]*ai[m]; u8[6*iatom+1] -= v1[m]*v6[m]*ai[m]; u10[6*iatom+1] += v1[m]*v8[m]*ai[m]; } for (i=0; i<6*np; i+=2) for (j=num[i>>1]; j>1)+1]; j+=2) { u4[row[j>>1]] -= value[j]*u2[i]-value[j+1]*u2[i+1]; u4[row[j>>1]+1] -= value[j]*u2[i+1]+value[j+1]*u2[i]; } for (i=0; i<6*np; i+=2) for (j=num[i>>1]; j>1)+1]; j+=2) { u6[row[j>>1]] -= value[j]*u4[i]-value[j+1]*u4[i+1]; u6[row[j>>1]+1] -= value[j]*u4[i+1]+value[j+1]*u4[i]; } for (i=0; i<6*np; i+=2) for (j=num[i>>1]; j>1)+1]; j+=2) { u8[row[j>>1]] -= value[j]*u6[i]-value[j+1]*u6[i+1]; u8[row[j>>1]+1] -= value[j]*u6[i+1]+value[j+1]*u6[i]; } for (i=0; i<6*np; i+=2) for (j=num[i>>1]; j>1)+1]; j+=2) { u10[row[j>>1]] -= value[j]*u8[i]-value[j+1]*u8[i+1]; u10[row[j>>1]+1] -= value[j]*u8[i+1]+value[j+1]*u8[i]; } for (i=0; i<6*np; i++) { u[i] = u2[i]*del3+u4[i]*del5+u6[i]*del7+u8[i]*del9+u10[i]*del11; uold[i] = 0.; } /* real part */ u1i = 0.; u3i = u2[6*iatom]; u5i = u4[6*iatom]; u7i = u6[6*iatom]; u9i = u8[6*iatom]; u11i = u10[6*iatom]; ui = 0.; u2i = 0.; u4i = 0.; u6i = 0.; u8i = 0.; u10i = 0.; /* imaginary part */ v1i = 0.; v3i = u2[6*iatom+1]; v5i = u4[6*iatom+1]; v7i = u6[6*iatom+1]; v9i = u8[6*iatom+1]; v11i = u10[6*iatom+1]; vi = 0.; v2i = 0.; v4i = 0.; v6i = 0.; v8i = 0.; v10i = 0.; if (print_floprate) startwatch(); /* main loop starts */ for (ip=1; ip<=mint; ip++) { /* calculate the perturbation on normal coordinate */ u2_ibb = 0.; u4_ibb = 0.; u6_ibb = 0.; u8_ibb = 0.; u10_ibb = 0.; v2_ibb = 0.; v4_ibb = 0.; v6_ibb = 0.; v8_ibb = 0.; v10_ibb = 0.; for (m=0; m<=max_channel-min_channel; m++) { ipm = ((m+min_channel)*ip-1)%mint; u2_ibb += dsin[ipm]*ar[m]; u4_ibb -= v2[m]*dsin[ipm]*ar[m]; u6_ibb += v4[m]*dsin[ipm]*ar[m]; u8_ibb -= v6[m]*dsin[ipm]*ar[m]; u10_ibb += v8[m]*dsin[ipm]*ar[m]; v2_ibb += dsin[ipm]*ai[m]; v4_ibb -= v2[m]*dsin[ipm]*ai[m]; v6_ibb += v4[m]*dsin[ipm]*ai[m]; v8_ibb -= v6[m]*dsin[ipm]*ai[m]; v10_ibb += v8[m]*dsin[ipm]*ai[m]; } /* Let the work be done in parallel: */ parallel_multiply_all (6*iatom, u2_ibb, u4_ibb, u6_ibb, u8_ibb, u10_ibb, v2_ibb, v4_ibb, v6_ibb, v8_ibb, v10_ibb); if (ip!=1) { u11i = (u10[6*iatom]-uo10i)/2./delta; u9i = ((u8[6*iatom]-uo8i)/2.-del3*u11i)/delta; u7i = ((u6[6*iatom]-uo6i)/2.-del3*u9i-del5*u11i)/delta; u5i = ((u4[6*iatom]-uo4i)/2.-del3*u7i-del5*u9i-del7*u11i)/delta; u3i = ((u2[6*iatom]-uo2i)/2.-del3*u5i-del5*u7i-del7*u9i -del9*u11i)/delta; u1i = ((u[6*iatom]-uoi)/2.-del3*u3i-del5*u5i-del7*u7i -del9*u9i-del11*u11i)/delta; v11i = (u10[6*iatom+1]-vo10i)/2./delta; v9i = ((u8[6*iatom+1]-vo8i)/2.-del3*v11i)/delta; v7i = ((u6[6*iatom+1]-vo6i)/2.-del3*v9i-del5*v11i)/delta; v5i = ((u4[6*iatom+1]-vo4i)/2.-del3*v7i-del5*v9i-del7*v11i)/delta; v3i = ((u2[6*iatom+1]-vo2i)/2.-del3*v5i-del5*v7i-del7*v9i -del9*v11i)/delta; v1i = ((u[6*iatom+1]-voi)/2.-del3*v3i-del5*v5i-del7*v7i -del9*v9i-del11*v11i)/delta; } /* calculate the integral */ uni = ui+delta*u1i+del2*u2i+del3*u3i+del4*u4i+del5*u5i +del6*u6i+del7*u7i+del8*u8i+del9*u9i+del10*u10i+del11*u11i; un1i = u1i+delta*u2i+del2*u3i+del3*u4i+del4*u5i+del5*u6i +del6*u7i+del7*u8i+del8*u9i+del9*u10i+del10*u11i; un2i = u2i+delta*u3i+del2*u4i+del3*u5i+del4*u6i+del5*u7i +del6*u8i+del7*u9i+del8*u10i+del9*u11i; un3i = u3i+delta*u4i+del2*u5i+del3*u6i+del4*u7i+del5*u8i +del6*u9i+del7*u10i+del8*u11i; un4i = u4i+delta*u5i+del2*u6i+del3*u7i+del4*u8i+del5*u9i +del6*u10i+del7*u11i; un5i = u5i+delta*u6i+del2*u7i+del3*u8i+del4*u9i+del5*u10i +del6*u11i; un6i = u6i+delta*u7i+del2*u8i+del3*u9i+del4*u10i+del5*u11i; un7i = u7i+delta*u8i+del2*u9i+del3*u10i+del4*u11i; un8i = u8i+delta*u9i+del2*u10i+del3*u11i; un9i = u9i+delta*u10i+del2*u11i; un10i = u10i+delta*u11i; un11i = u11i; vni = vi+delta*v1i+del2*v2i+del3*v3i+del4*v4i+del5*v5i +del6*v6i+del7*v7i+del8*v8i+del9*v9i+del10*v10i+del11*v11i; vn1i = v1i+delta*v2i+del2*v3i+del3*v4i+del4*v5i+del5*v6i +del6*v7i+del7*v8i+del8*v9i+del9*v10i+del10*v11i; vn2i = v2i+delta*v3i+del2*v4i+del3*v5i+del4*v6i+del5*v7i +del6*v8i+del7*v9i+del8*v10i+del9*v11i; vn3i = v3i+delta*v4i+del2*v5i+del3*v6i+del4*v7i+del5*v8i +del6*v9i+del7*v10i+del8*v11i; vn4i = v4i+delta*v5i+del2*v6i+del3*v7i+del4*v8i+del5*v9i +del6*v10i+del7*v11i; vn5i = v5i+delta*v6i+del2*v7i+del3*v8i+del4*v9i+del5*v10i +del6*v11i; vn6i = v6i+delta*v7i+del2*v8i+del3*v9i+del4*v10i+del5*v11i; vn7i = v7i+delta*v8i+del2*v9i+del3*v10i+del4*v11i; vn8i = v8i+delta*v9i+del2*v10i+del3*v11i; vn9i = v9i+delta*v10i+del2*v11i; vn10i = v10i+delta*v11i; vn11i = v11i; for (m=0; m<=max_channel-min_channel; m++) { ipm = ((m+min_channel)*ip-1)%mint; ibm = ((m+min_channel)*(ip+mint-1)-1)%mint; fup[m] += (uni-un2i/v2[m]+un4i/v4[m]-un6i/v6[m]+un8i/v8[m] -un10i/v10[m])/v1[m]*dsin[ipm] -(ui-u2i/v2[m]+u4i/v4[m]-u6i/v6[m]+u8i/v8[m] -u10i/v10[m])/v1[m]*dsin[ibm] +(un1i-un3i/v2[m]+un5i/v4[m]-un7i/v6[m]+un9i/v8[m] -un11i/v10[m])/v2[m]*dcos[ipm] -(u1i-u3i/v2[m]+u5i/v4[m]-u7i/v6[m]+u9i/v8[m] -u11i/v10[m])/v2[m]*dcos[ibm]; fvp[m] += (vni-vn2i/v2[m]+vn4i/v4[m]-vn6i/v6[m]+vn8i/v8[m] -vn10i/v10[m])/v1[m]*dsin[ipm] -(vi-v2i/v2[m]+v4i/v4[m]-v6i/v6[m]+v8i/v8[m] -v10i/v10[m])/v1[m]*dsin[ibm] +(vn1i-vn3i/v2[m]+vn5i/v4[m]-vn7i/v6[m]+vn9i/v8[m] -vn11i/v10[m])/v2[m]*dcos[ipm] -(v1i-v3i/v2[m]+v5i/v4[m]-v7i/v6[m]+v9i/v8[m] -v11i/v10[m])/v2[m]*dcos[ibm]; } /* a chunk of work done */ if (print_floprate && (ip%steps_to_inform==0)) { /* cc = seconds spent */ checkwatch (&cc); dd = ip*(ORDER_OF_INTEGRATION-2)/2.*8.*(num[3*np]>>1); printf ("step = %d, %f%% of a base cycle completed.\n", ip, 100.*ip/mint); ee = dd*1e-6/cc; printf ("Mflops/s up to now = %f,\n", ee); printf ("Mflops/s per processor = %f.\n\n", ee/num_processors); } uo10i = u10i; uo8i = u8i; uo6i = u6i; uo4i = u4i; uo2i = u2i; uoi = ui; vo10i = v10i; vo8i = v8i; vo6i = v6i; vo4i = v4i; vo2i = v2i; voi = vi; u10i = u10[6*iatom]; u8i = u8[6*iatom]; u6i = u6[6*iatom]; u4i = u4[6*iatom]; u2i = u2[6*iatom]; ui = u[6*iatom]; v10i = u10[6*iatom+1]; v8i = u8[6*iatom+1]; v6i = u6[6*iatom+1]; v4i = u4[6*iatom+1]; v2i = u2[6*iatom+1]; vi = u[6*iatom+1]; /* Verlet leap frog integration */ for (i=0; i<6*np; i++) { cc = uold[i]; uold[i] = u[i]; u[i] = 2.*(u[i]+del2*u2[i]+del4*u4[i]+del6*u6[i]+ del8*u8[i]+del10*u10[i])-cc; } } /* main loop ends */ if (print_floprate) { checkwatch (&cc); dd = mint*(ORDER_OF_INTEGRATION-2)/2.*8.*(num[3*np]>>1); ee = dd*1e-6/cc; printf ("Mflops/s during the cycle = %f\n", ee); printf ("Mflops/s per processor = %f\n\n", ee/num_processors); } /* calculate the LDOS */ for (m=0; m<=max_channel-min_channel; m++) { /* decode the spin-wave */ cmplx_mul (ar[m], ai[m], cos(phi[m]), -sin(phi[m]), &ar[m], &ai[m]); cmplx_mul (fup[m], fvp[m], cos(phi[m]), -sin(phi[m]), &fup[m], &fvp[m]); /* plug into the formula */ /* cancel out all odd-numbered noise functions */ /* scale back to real freq. DOS */ dd = -2.*(m+min_channel)*min_v*min_v/PI/PI *(fup[m]/ar[m]+fvp[m]/ai[m])/2. *sqrt(v1[m]*v1[m]-shift_v*shift_v)/v1[m]; ldos[m] += kpts[4*k+3]*dd; } sumweight += kpts[4*k+3]; /* save to .ldos file */ filehandle = fopen(fn_ldos, "w+"); for (m=0; m<=max_channel-min_channel; m++) fprintf (filehandle, "%9.6f %9.6f\n", sqrt(v1[m]*v1[m]-shift_v*shift_v), ldos[m]/sumweight); fclose(filehandle); /* Kill the slave processes here */ /* because D(k) is going to be changed. */ master_set_command (SLAVE_DIE); master_sleep(); } /* sample over supercell k's */ /* free all allocated memories */ printf ("Freeing all allocated memories.\n"); free (kpts); free (value); free (row); free (idx); free (num); free (dm); free (dm_index); free (uold); free (s); free (mass); free (ldos); free (fvp); free (fup); free (phi); free (ai); free (ar); free (v10); free (v8); free (v6); free (v4); free (v2); free (v1); free (dcos); free (dsin); /* free IPC resources */ free_shm(); free_semaphore(); printf("Mission terminates successfully.\n\n"); return(0); } /* end main() */ /* Evaluate the force constant matrix of a pair interaction. */ void force_constants (double rx, double ry, double rz, double force_constant[3][3]) { double r1, r2, a, b; double ir2, ir4, ir6, ir12; double first, second; /* V'/r and V''/r */ r2 = rx*rx+ry*ry+rz*rz; if (LJ6_12) { ir2 = 1./r2; ir4 = ir2 * ir2; ir6 = ir4 * ir2; ir12 = ir6 * ir6; first = (-48.*ir12+24.*ir6)*ir2; second = (624.*ir12-168.*ir6)*ir4; } else if(WOON) { r1 = sqrt(r2); a = WOON_EPSILON * WOON_Q * exp(-WOON_K1 * (r1-WOON_RE)); b = WOON_EPSILON * (WOON_Q+1) * exp(-WOON_K2 * (r1-WOON_RE)); first = (-WOON_K1*a+WOON_K2*b)/r1; second = (WOON_K1*WOON_K1*a-WOON_K2*WOON_K2*b)/r2; } force_constant[0][0] = (second-first/r2)*rx*rx+first; force_constant[0][1] = (second-first/r2)*rx*ry; force_constant[0][2] = (second-first/r2)*rx*rz; force_constant[1][1] = (second-first/r2)*ry*ry+first; force_constant[1][2] = (second-first/r2)*ry*rz; force_constant[2][2] = (second-first/r2)*rz*rz+first; force_constant[1][0] = force_constant[0][1]; force_constant[2][0] = force_constant[0][2]; force_constant[2][1] = force_constant[1][2]; return; } /* end force_constants() */ double matinv (double A[3][3], double B[3][3]) { double D11,D22,D33,D12,D21,D13,D31,D32,D23,volume; D11=A[1][1]*A[2][2]-A[1][2]*A[2][1]; D22=A[2][2]*A[0][0]-A[2][0]*A[0][2]; D33=A[0][0]*A[1][1]-A[0][1]*A[1][0]; D12=A[1][2]*A[2][0]-A[1][0]*A[2][2]; D23=A[2][0]*A[0][1]-A[2][1]*A[0][0]; D31=A[0][1]*A[1][2]-A[0][2]*A[1][1]; D13=A[1][0]*A[2][1]-A[2][0]*A[1][1]; D21=A[2][1]*A[0][2]-A[0][1]*A[2][2]; D32=A[0][2]*A[1][0]-A[1][2]*A[0][0]; volume=A[0][0]*D11+A[0][1]*D12+A[0][2]*D13; B[0][0]=D11/volume; B[1][1]=D22/volume; B[2][2]=D33/volume; B[0][1]=D21/volume; B[1][2]=D32/volume; B[2][0]=D13/volume; B[1][0]=D12/volume; B[2][1]=D23/volume; B[0][2]=D31/volume; return (volume); } /* end matinv() */ void matmul (double A[3][3], double B[3][3], double D[3][3]) { volatile double C[3][3]; C[0][0] = A[0][0]*B[0][0] + A[0][1]*B[1][0] + A[0][2]*B[2][0]; C[0][1] = A[0][0]*B[0][1] + A[0][1]*B[1][1] + A[0][2]*B[2][1]; C[0][2] = A[0][0]*B[0][2] + A[0][1]*B[1][2] + A[0][2]*B[2][2]; C[1][0] = A[1][0]*B[0][0] + A[1][1]*B[1][0] + A[1][2]*B[2][0]; C[1][1] = A[1][0]*B[0][1] + A[1][1]*B[1][1] + A[1][2]*B[2][1]; C[1][2] = A[1][0]*B[0][2] + A[1][1]*B[1][2] + A[1][2]*B[2][2]; C[2][0] = A[2][0]*B[0][0] + A[2][1]*B[1][0] + A[2][2]*B[2][0]; C[2][1] = A[2][0]*B[0][1] + A[2][1]*B[1][1] + A[2][2]*B[2][1]; C[2][2] = A[2][0]*B[0][2] + A[2][1]*B[1][2] + A[2][2]*B[2][2]; D[0][0] = C[0][0]; D[0][1] = C[0][1]; D[0][2] = C[0][2]; D[1][0] = C[1][0]; D[1][1] = C[1][1]; D[1][2] = C[1][2]; D[2][0] = C[2][0]; D[2][1] = C[2][1]; D[2][2] = C[2][2]; return; } /* end matmul() */ void cmplx_mul (double ar, double ai, double br, double bi, double *dr, double *di) { volatile double cr = ar*br-ai*bi; *di = ar*bi+ai*br; *dr = cr; return; } /* cmplx_mul() */ /* returns pseudo-random number on (0,1) */ double frandom() { return ((random()+0.5)/pow((double)2, (double)31)); } /* end frandom() */ /* manage an internal list of shared memories */ void *apply_for_shm (int size) { static int last_KEY = 0; if (num_shm_seg==MAX_SHM_SEG) { printf ("Error in apply_for_shm(): maximum of %d\n", MAX_SHM_SEG); printf ("shared memory segments exceeded.\n"); return (NULL); } shm[num_shm_seg].size = size; for (shm[num_shm_seg].KEY = last_KEY+1; ((shm[num_shm_seg].ID=shmget(shm[num_shm_seg].KEY,size,IPC_CREAT|0600)) ==-1)&&(shm[num_shm_seg].KEY<65535); shm[num_shm_seg].KEY++); if(shm[num_shm_seg].ID!=-1) { printf ("shared memory segment %d: KEY = %d ID = %d\n", num_shm_seg+1,shm[num_shm_seg].KEY,shm[num_shm_seg].ID); /* The system will find a good logical address to attach the shared memory segment to. Should be rounded to 16 bytes. */ shm[num_shm_seg].addr = shmat(shm[num_shm_seg].ID,(char *)0,0600|SHM_RND); if (shm[num_shm_seg].addr==(void *)-1) { printf ("Error in apply_for_shm(): fail to attach shm segment %d\n", num_shm_seg+1); return(NULL); } else { printf("Succeed in getting %d bytes of shared memory.\n", size); /* clear the allocated region */ memset (shm[num_shm_seg].addr, 0, size); last_KEY = shm[num_shm_seg].KEY; return (shm[num_shm_seg++].addr); } } else { printf("Error in apply_for_shm(): fail to get %d bytes of shared memory.\n", size); return(NULL); } } /* apply_for_shm() */ /* allocate shared arrays that are WRITABLE */ int init_shm_arrays (int n6) { int i, not_null=1; int totalsize = sizeof(int)*(num_processors+1) /* memeory for domain[] */ + sizeof(char)*num_processors /* memeory for commands[] */ + sizeof(double)*n6*(6+num_processors); /* for u,u2,u4,u6,u8,u10,copy[] */ num_shm_seg = 0; size_of_vector_in_double = n6; size_of_vector_in_bytes = size_of_vector_in_double*sizeof(double); /* first see if we can allocate everything in one piece */ if ((shm_base=(double *)apply_for_shm(totalsize))!=NULL) { /* displacement variables used in multichannel method */ u = shm_base; u2 = u + n6; u4 = u2 + n6; u6 = u4 + n6; u8 = u6 + n6; u10 = u8 + n6; for (i=0; i\n"); printf ("Let us try segmented allocation.\n"); /* pack into maximally contiguous blocks to increase cache hit */ shm_base = (double *) apply_for_shm(size_of_vector_in_bytes*3+(num_processors+1)*sizeof(int)); u = shm_base; u2 = u + n6; u4 = u2 + n6; domain = (int *) (u4 + n6); not_null = not_null && (u!=NULL); u6 = (double *) apply_for_shm (size_of_vector_in_bytes*3+sizeof(char)*num_processors); u8 = u6 + n6; u10 = u8 + n6; commands = (char *) (u10+n6); not_null = not_null && (u6!=NULL); for (i=0; i=0; num_shm_seg--) if (shmctl(shm[num_shm_seg].ID, IPC_RMID, NULL) == -1) printf("Unable to remove shared memory ID=%d.\n", shm[num_shm_seg].ID); } /* free_shm() */ /* semaphore set for multiprocess scheduling */ int init_semaphore (int num_processors) { int i; for (sem_KEY=1;((sem_ID=semget(sem_KEY,num_processors,IPC_CREAT|0600)) ==-1)&&(sem_KEY<65535);sem_KEY++); printf ("semaphore KEY = %d ID = %d\n", sem_KEY, sem_ID); if(sem_ID!=-1) { /* define standard semaphore operations */ /* increase one element */ semop_add.sem_op = +1; semop_add.sem_flg = 0600; /* decrease one element */ semop_minus.sem_op = -1; semop_minus.sem_flg = 0600; /* sleep till one element is set to zero */ semop_zero.sem_op = 0; semop_zero.sem_flg = 0600; /* hypnotize the slave processes now by setting them all to 1 */ for (i=1; i>1], num[(i>>1)+1]); */ for (j=num[i>>1]; j>1)+1]; j+=2) { out[row[j>>1]] -= value[j]*in[i]-value[j+1]*in[i+1]; out[row[j>>1]+1] -= value[j]*in[i+1]+value[j+1]*in[i]; } } break; /* second phase: add copies to next array */ case COPY_TO_U2: out=u2; goto further; case COPY_TO_U4: out=u4; goto further; case COPY_TO_U6: out=u6; goto further; case COPY_TO_U8: out=u8; goto further; case COPY_TO_U10: out=u10; further: for (j=0; j