/********************************************/ /* O(N), Multi-Channel Perturbation Method */ /* to Calculate the Local Density of States */ /* [Phys. Rev. B 56, pp. 3524 (1997)]. */ /* */ /* Ver 2.2 Mar 21, 1998 */ /* Developed by Ju Li (MIT) */ /********************************************/ /* Version Notes: TTY interface; Woon's pair potential; Dynamical matrix for arbitrary configuration and pair potentials; User-defined supercell k sampling or random sampling; High-precision (order-12) integration of equation of motion; Spin-wave amplitude encoding to minimize the point spread; Shift the spectrum for better convergence at low frequencies; Parallel shared memory using Unix fork(), shmop(), semop(); Flop rate benchmark; */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include /** universal constants **/ #ifndef PI #define PI 3.14159265358979323846 #endif #define AVO 6.0221367E+23 #define BOLZ 1.380658E-23 /* in J/K */ #define HBAR 1.054589E-34 /* in J.s */ #define EV_IN_J 1.60217733E-19 #define HARTREE_IN_J 4.3597482E-18 #define BOHR_RADIUS_IN_A 0.529177249 /** argon LJ6-12 constants **/ #define SIGMA_IN_M (3.405E-10) #define EPSILON_IN_J (119.8*BOLZ) #define ARGON_MASS_IN_KG (39.948*1E-3/AVO) /** fundamental reduced units **/ #define ULENGTH_IN_M SIGMA_IN_M #define UENERGY_IN_J EPSILON_IN_J #define UMASS_IN_KG ARGON_MASS_IN_KG /** derived reduced units **/ #define ULENGTH_IN_A (ULENGTH_IN_M*1E10) #define UTIME_IN_SECOND (ULENGTH_IN_M*sqrt(UMASS_IN_KG/UENERGY_IN_J)) #define UTIME_IN_PS (UTIME_IN_SECOND*1E12) #define UFREQ_IN_HZ (1./UTIME_IN_SECOND/2./PI) #define UFREQ_IN_THZ (1E-12*UFREQ_IN_HZ) #define UFREQ_IN_MEV (HBAR*2*PI*UFREQ_IN_HZ/EV_IN_J*1000.) /* interaction cutoff */ #define RCUT (2.5*SIGMA_IN_M/ULENGTH_IN_M) /* maximum possible reduced density of the solid */ #define MAXDENSITY 2.0 #define MAXNEIGHBOR ((int)(4./3.*PI*RCUT*RCUT*RCUT*MAXDENSITY)) #define MAX_STRLEN 150 #define MAX_SHM_SEG 6 /* on Xolas, otherwise EMFILE error */ #define MAX_PROCESSORS 8 /* on Xolas, otherwise EMFILE error */ #define ORDER_OF_INTEGRATION 12 #define DEFAULT_ATOM_TP "Ar" #define TP_STRLEN 2 /* commands for collective action */ #define U_TO_COPY 1 #define U2_TO_COPY 2 #define U4_TO_COPY 3 #define U6_TO_COPY 4 #define U8_TO_COPY 5 #define COPY_TO_U2 11 #define COPY_TO_U4 12 #define COPY_TO_U6 13 #define COPY_TO_U8 14 #define COPY_TO_U10 15 #define ZERO_U2_U10 21 #define SLAVE_DIE 99 static int np, LJ6_12, WOON, random_k_sampling, print_floprate, steps_to_inform, iseed, num_kpts, num_dm, iatom, ktwist, mincycle, min_channel, max_channel, mint, num_shm_seg, size_of_vector_in_bytes, size_of_vector_in_double, num_processors, master_pid, sem_ID, my_pid, my_idx, *dm_index, *row, *num, *idx, nc[3]; static double volume, tiny, min_v, max_v, shift_v, work_in_Gflop, sumweight, master_load, delta, del2, del3, del4, del5, del6, del7, del8, del9, del10, del11, ui, u1i, u2i, u3i, u4i, u5i, u6i, u7i, u8i, u9i, u10i, u11i, vi, v1i, v2i, v3i, v4i, v5i, v6i, v7i, v8i, v9i, v10i, v11i, u2_ibb, u4_ibb, u6_ibb, u8_ibb, u10_ibb, v2_ibb, v4_ibb, v6_ibb, v8_ibb, v10_ibb, uni, un1i, un2i, un3i, un4i, un5i, un6i, un7i, un8i, un9i, un10i, un11i, vni, vn1i, vn2i, vn3i, vn4i, vn5i, vn6i, vn7i, vn8i, vn9i, vn10i, vn11i, uoi, uo2i, uo4i, uo6i, uo8i, uo10i, voi, vo2i, vo4i, vo6i, vo8i, vo10i, *s, *mass, *dm, *value, *v1, *v2, *v4, *v6, *v8, *v10, *phi, *dsin, *dcos, *ar, *ai, *fup, *fvp, *ldos, *shm_base, *u, *u2, *u4, *u6, *u8, *u10, *uold, *copy[MAX_PROCESSORS], *kpts, d_master_load[MAX_PROCESSORS]={1.,0.98,0.98,0.98,0.98,0.98,0.98,0.98}, H[3][3], HI[3][3],Q[3][3], QT[3][3], force_constant[3][3]; static char potential_function[MAX_STRLEN], fn_kpt[MAX_STRLEN], config_name[MAX_STRLEN], fn_config_read[MAX_STRLEN], fn_ldos[MAX_STRLEN], **tp; static FILE *filehandle; /* Must be volatile because there is no explicit data change on the thread and the optimization may choose to read in them only once */ volatile int *domain; volatile char *commands; /* semaphore set for multiprocess scheduling */ key_t sem_KEY; struct sembuf semop_add, semop_minus, semop_zero; union semun sem_set; time_t real_t_start, real_t_end; struct shm_list {key_t KEY; int ID; void *addr; int size;} shm[MAX_SHM_SEG]; static void force_constants (double rx, double ry, double rz, double force_constant[3][3]); static double matinv(double A[3][3], double B[3][3]); static void matmul (double A[3][3], double B[3][3], double D[3][3]); static void cmplx_mul (double ar, double ai, double br, double bi, double *dr, double *di); static double frandom(); static void free_shm(); static void *apply_for_shm (int size); static int init_shm_arrays(int n6); static int init_semaphore (int num_processors); static void free_semaphore(); static void slave_work_till_die(); static void parallel_multiply_all (int ibb, double u2_ibb, double u4_ibb, double u6_ibb, double u8_ibb, double u10_ibb, double v2_ibb, double v4_ibb, double v6_ibb, double v8_ibb, double v10_ibb); static void master_set_command (char what); static void do_my_job(); static void master_sleep(); static void slave_sleep(); static void slave_touch_base(); static void my_error_handler(int signal_number); static void set_my_own_error_handler(); static void reset_error_handler(); static void startwatch(); static void checkwatch(double *time_passed_in_seconds); static void fireman(); int main() { int i, j, k, l, m, n, ip, ipm, ibm; double cc, dd, ee, tmp[3][3], sumweight; double sxji, syji, szji, rx, ry, rz, start_time; char *p, sentence[MAX_STRLEN]; struct timeval tp; for (sentence[0]=(char)0,i=0; i*, default=" "Data/perfect.woon):\n%s\n\n", config_name); if (!strcasecmp(config_name,"default")) sprintf(config_name, "Data/perfect.woon"); fscanf (stdin, "Pair potential to be used (LJ6_12 or WOON):\n%s\n\n", potential_function); LJ6_12 = !strcmp(potential_function,"LJ6_12"); WOON = !strcmp(potential_function,"WOON"); if ((!LJ6_12) && (!WOON)) { fprintf (stdout, "Error: choose between LJ6_12 and WOON " "pair potential\n"); exit(1); } fscanf (stdin, "Load configuration from file (default=*):\n%s\n\n", fn_config_read); if (!strcasecmp(fn_config_read,"default")) sprintf(fn_config_read, "%s", config_name); fscanf(stdin, "Atom index and Cartesian direction for LDOS:" "\n%d %lf %lf %lf\n\n", &iatom, &Q[0][0], &Q[0][1], &Q[0][2]); fscanf (stdin, "Save LDOS on file (default=*.ldos):\n%s\n\n", fn_ldos); if (!strcasecmp(fn_ldos, "default")) sprintf(fn_ldos, "%s.ldos", config_name); fscanf (stdin, "Supercell k-point sampling file (if numeric then\n"); fscanf (stdin, "apply random sampling in BZ, " "default=Kpts/sckpt.8):\n%s\n\n", fn_kpt); if (random_k_sampling=((*fn_kpt>='0')&&(*fn_kpt<='9'))) { /* user inputs a integer, do random k sampling */ for (p=fn_kpt;(*p>='0')&&(*p<='9');p++); *p = 0; sscanf(fn_kpt, "%d", &num_kpts); } else if (!strcasecmp(fn_kpt, "default")) sprintf(fn_kpt, "Kpts/sckpt.8"); fscanf(stdin, "Ignore dynamical matrix components smaller " "than (THz^2):\n%lf\n\n", &tiny); fscanf(stdin, "Minimum (base) frequency (THz):\n%lf\n\n", &min_v); fscanf(stdin, "Maximum frequency (THz):\n%lf\n\n", &max_v); fscanf(stdin, "Shift the spectrum by (THz):\n%lf\n\n", &shift_v); fscanf(stdin, "Twist period of spin-wave amplitude " "encoding:\n%d\n\n", &ktwist); fscanf(stdin, "Number of steps of a period for the maximum " "frequency:\n%d\n\n", &mincycle); fscanf(stdin, "To run on how many processors:\n%d\n\n", &num_processors); if (num_processors > MAX_PROCESSORS) { printf ("MAX_PROCESSORS < %d, revise source code.\n", num_processors); exit(1); } fscanf(stdin, "Work load of the master process\n"); fscanf(stdin, "(default = {1., 0.98, 0.98, 0.98, 0.98, 0.98, " "0.98, 0.98}):\n%s\n\n", sentence); if (!strcasecmp(sentence, "default")) master_load = d_master_load[num_processors-1]; else sscanf(sentence, "%lf", &master_load); if (num_processors==1) master_load = 1.0; fscanf(stdin, "Frequency of getting reported (in Gflop)\n"); fscanf(stdin, "(NULL=don't, default=0.5 Gflop):\n%s\n\n", sentence); if (print_floprate=strcasecmp(sentence, "NULL")) if (!strcasecmp(sentence, "default")) work_in_Gflop = 0.5; else sscanf(sentence, "%lf", &work_in_Gflop); fscanf (stdin, "Random number generator seed (0->time seed):\n"); fscanf (stdin, "%d\n", &iseed); if (iseed == 0) iseed = time(NULL); srandom ((unsigned)iseed); printf("Instructions read complete.\n\n"); /* Instructions read complete */ /* print out mission statement */ printf ("LDOS will be calculated for atom %d in \"%s\"\n", iatom, config_name); printf ("along (%.2f,%.2f,%.2f) direction using %s potential\n", Q[0][0], Q[0][1], Q[0][2], potential_function); printf ("with Rcut = %f = %f Angstrom,\n", RCUT, RCUT*ULENGTH_IN_A); /* first channel, after-transformation freq. fixed: */ min_channel = (int) ceil(shift_v/min_v); dd = min_channel*min_v; /* second channel, after-transformation freq. fixed: */ ee = dd + min_v; /* make the 2nd real freq. (cc x the 1st real freq.) */ cc = 2.5; shift_v = sqrt((cc*cc*dd*dd-ee*ee)/(cc*cc-1.)); /* maximum after-transformation freq. */ max_v = sqrt(max_v*max_v+shift_v*shift_v); max_channel = floor(max_v/min_v); printf ("for %d frequency channels between [%f, %f] THz, or\n", max_channel-min_channel+1, sqrt(dd*dd-shift_v*shift_v), sqrt(max_channel*max_channel*min_v*min_v-shift_v*shift_v)); printf ("[%f, %f] THz for the shifted dynamical matrix.\n\n", dd, max_channel*min_v); /* allocating memory for frequency channels */ printf ("Allocating memory for %d frequency channels.\n\n", max_channel-min_channel+1); /* frequencies in THz */ v1 = (double *) malloc((max_channel-min_channel+1)*sizeof(double)); v2 = (double *) malloc((max_channel-min_channel+1)*sizeof(double)); v4 = (double *) malloc((max_channel-min_channel+1)*sizeof(double)); v6 = (double *) malloc((max_channel-min_channel+1)*sizeof(double)); v8 = (double *) malloc((max_channel-min_channel+1)*sizeof(double)); v10 = (double *) malloc((max_channel-min_channel+1)*sizeof(double)); /* real and imaginary amplitude series */ ar = (double *) malloc((max_channel-min_channel+1)*sizeof(double)); ai = (double *) malloc((max_channel-min_channel+1)*sizeof(double)); /* spin wave phase encoding */ phi = (double *) malloc((max_channel-min_channel+1)*sizeof(double)); /* LDOS result */ fup = (double *) malloc((max_channel-min_channel+1)*sizeof(double)); fvp = (double *) malloc((max_channel-min_channel+1)*sizeof(double)); ldos = (double *) malloc((max_channel-min_channel+1)*sizeof(double)); for (m=0; m<=max_channel-min_channel; m++) { v1[m] = (m+min_channel)*min_v; v2[m] = pow(v1[m],2.); v4[m] = pow(v1[m],4.); v6[m] = pow(v1[m],6.); v8[m] = pow(v1[m],8.); v10[m]= pow(v1[m],10.); ar[m] = 0.; ai[m] = 0.; phi[m] = 0.; ldos[m] = 0.; } /* total number of integration steps: */ mint = max_channel*mincycle; /* 2 x Pi x (timestep size in ps): */ delta = 2.*PI/min_v/mint; printf ("Run track is %d steps long, equivalent step size = %f ps.\n", mint, delta/2./PI); del2 = pow(delta,2.)/2.; del3 = pow(delta,3.)/6.; del4 = pow(delta,4.)/24.; del5 = pow(delta,5.)/120.; del6 = pow(delta,6.)/720.; del7 = pow(delta,7.)/5040.; del8 = pow(delta,8.)/40320.; del9 = pow(delta,9.)/362880.; del10 = pow(delta,10.)/3628800.; del11 = pow(delta,11.)/39916800.; printf ("Allocating memory for %d phases.\n\n", mint); /* prepare the phases */ dsin = (double *) malloc(mint*sizeof(double)); dcos = (double *) malloc(mint*sizeof(double)); for (ip=1; ip<=mint; ip++) { dsin[ip-1] = sin(2.*PI/mint*ip); dcos[ip-1] = cos(2.*PI/mint*ip); } printf("The configuration is being loaded from \"%s\".\n", fn_config_read); if ((filehandle=fopen(fn_config_read, "r"))==NULL) { fprintf(stdout, "Error: cannot open file \"%s\"\n.", fn_config_read); exit(1); } fscanf (filehandle, "Number of particles = %d\n\n", &np); printf ("Allocate property arrays for %d particles.\n\n",np); /* reduced coordinates */ mass = (double *) malloc(np*sizeof(double)); s = (double *) malloc(3*np*sizeof(double)); uold = (double *) malloc(6*np*sizeof(double)); /* name space for atom type */ tp = (char **) malloc (np*sizeof(char *)); tp[0] = (char *) malloc (np*(TP_STRLEN+1)*sizeof(char)); for (i=1; i 0.85*MAXDENSITY) { printf("Current density %f is above 85%% of MAXDENSITY = %f.\n", np/volume, MAXDENSITY); printf("Overflow possible. Please increase MAXDENSITY.\n"); exit(1); } /* To simplify the programming, we require the RCUT sphere to be contained in the parallelepiped formed by H(1,:), H(2,:), H(3,:), which requires the thickness opposite to any of the parallelepiped surfaces to be greater than 2*RCUT, but that thickness is just 1/|G_i|, where G^T * H = I, the corresponding reciprocal lattice vector. */ if ( (RCUT > 0.5/sqrt(HI[0][0]*HI[0][0]+HI[1][0]*HI[1][0]+ HI[2][0]*HI[2][0])) || (RCUT > 0.5/sqrt(HI[0][1]*HI[0][1]+HI[1][1]*HI[1][1]+ HI[2][1]*HI[2][1])) || (RCUT > 0.5/sqrt(HI[0][2]*HI[0][2]+HI[1][2]*HI[1][2]+ HI[2][2]*HI[2][2])) ) { printf ("For code efficiency we require a larger system,\n"); printf ("so there is no self-interaction.\n"); exit(1); } /* read in reduced coordinates from configuration file */ fscanf (filehandle, "TP Mass(amu) sx sy sz" " sx1(1/ns) sy1(1/ns) sz1(1/ns)\n"); for (i=0; i<3*np; i+=3) { fscanf (filehandle, "%2s %lf %lf %lf %lf %lf %lf %lf\n", tp[i/3], &mass[i/3], &s[i], &s[i+1], &s[i+2], &cc, &cc, &cc); /* convert from AMU to reduced mass unit */ mass[i/3] = mass[i/3]/AVO*1E-3/UMASS_IN_KG; } fclose (filehandle); printf ("\"%s\" generated successfully on the computer.\n\n", config_name); printf ("Obtain an orthogonal transformation\n"); cc = sqrt(Q[0][0]*Q[0][0]+Q[0][1]*Q[0][1]+Q[0][2]*Q[0][2]); Q[0][0] /= cc; Q[0][1] /= cc; Q[0][2] /= cc; /* get a random vector v in space: */ cc = Q[0][2]; dd = Q[0][0]; ee = -Q[0][1]; /* take a cross-product v*c1 */ Q[1][0] = dd*Q[0][2]-ee*Q[0][1]; Q[1][1] = ee*Q[0][0]-cc*Q[0][2]; Q[1][2] = cc*Q[0][1]-dd*Q[0][0]; cc = sqrt(Q[1][0]*Q[1][0]+Q[1][1]*Q[1][1]+Q[1][2]*Q[1][2]); Q[1][0] /= cc; Q[1][1] /= cc; Q[1][2] /= cc; /* take the cross-product c1*c2 */ Q[2][0] = Q[0][1]*Q[1][2]-Q[0][2]*Q[1][1]; Q[2][1] = Q[0][2]*Q[1][0]-Q[0][0]*Q[1][2]; Q[2][2] = Q[0][0]*Q[1][1]-Q[0][1]*Q[1][0]; /* get the transpose */ for (i=0; i<3; i++) for (j=0; j<3; j++) QT[i][j] = Q[j][i]; printf (" | %7.4f %7.4f %7.4f |\n",Q[0][0],Q[0][1],Q[0][2]); printf (" Q = | %7.4f %7.4f %7.4f |\n",Q[1][0],Q[1][1],Q[1][2]); printf (" | %7.4f %7.4f %7.4f |\n",Q[2][0],Q[2][1],Q[2][2]); printf ("making (%.2f,%.2f,%.2f) a normal coordinate.\n\n", Q[0][0],Q[0][1],Q[0][2]); /* count the total number of interactions */ printf ("Counting all possible interactions within Rcut = %f A.\n", RCUT*ULENGTH_IN_A); for (num_dm=0,i=0; i0.5) sxji--; else if (sxji<-0.5) sxji++; if(syji>0.5) syji--; else if (syji<-0.5) syji++; if (szji>0.5) szji--; else if (szji<-0.5) szji++; rx = H[0][0]*sxji + H[1][0]*syji + H[2][0]*szji; ry = H[0][1]*sxji + H[1][1]*syji + H[2][1]*szji; rz = H[0][2]*sxji + H[1][2]*syji + H[2][2]*szji; if (rx*rx+ry*ry+rz*rz0.5) sxji--; else if (sxji<-0.5) sxji++; if(syji>0.5) syji--; else if (syji<-0.5) syji++; if (szji>0.5) szji--; else if (szji<-0.5) szji++; rx = H[0][0]*sxji + H[1][0]*syji + H[2][0]*szji; ry = H[0][1]*sxji + H[1][1]*syji + H[2][1]*szji; rz = H[0][2]*sxji + H[1][2]*syji + H[2][2]*szji; if (rx*rx+ry*ry+rz*rztiny) { num[3*j+l+1]+=2; num[3*i+k+1]+=2; } ip++; } } /* convert into accumulative index: the dynamical matrix is compressed */ /* into 1D arrays row[]+value[], where the ith column of D(k) occupies */ /* slots [num[i], num[i+1]) on those 1D arrays. */ for (i=0; i<3*np; i++) num[i+1] += num[i]; printf("After getting rid of off-diagonal components < %f THz^2,\n", tiny); printf("we will have on average %d significant elements per " "column on D(k).\n", (num[3*np]>>1)/3/np); printf("Allocating memory for D(k) (sparse).\n\n"); /* currently filled slots [num[i], idx[i]) for the ith column */ idx = (int *) malloc(3*np*sizeof(int)); /* row index for each D(k) element */ row = (int *) malloc((num[3*np]>>1)*sizeof(int)); /* value of D(k) element (complex) */ value = (double *) malloc(num[3*np]*sizeof(double)); if ((idx==NULL)||(row==NULL)||(value==NULL)) { printf("Not enough memory (%d bytes).\n", (3*np+num[3*np]>>1) *(int)sizeof(int)+num[3*np]*(int)sizeof(double)); exit(1); } /* get sampling k-points in BZ */ if (random_k_sampling) { printf("Randomly generating %d k-points in supercell BZ.\n\n", num_kpts); kpts = (double *) malloc(4*num_kpts*sizeof(double)); for (i=0; i\n"); /* set data layout */ n = 3*np/num_processors; domain[0] = 0; domain[1] = ceil(master_load*n); for (i=1; i>1) /(ORDER_OF_INTEGRATION-2)*2); if (print_floprate) printf ("Flop rate will be shown for every %.1f " "Mflop chunk of work.\n", work_in_Gflop*1000); printf ("Final LDOS result will be saved on \"%s\".\n\n", fn_ldos); /* go over all supercell k-points */ for (sumweight=0.,k=0; k>1] = 2*((i/3)*3); row[(num[i]>>1)+1] = 2*((i/3)*3+1); row[(num[i]>>1)+2] = 2*((i/3)*3+2); } for (ip=0; iptiny) { /* (ix, jx) */ row[idx[3*j]>>1] = 2*(3*i); value[idx[3*j]] -= dm[9*ip+3]*cc; value[idx[3*j]+1] -= dm[9*ip+3]*dd; idx[3*j]+=2; /* (jx, ix) */ row[idx[3*i]>>1] = 2*(3*j); value[idx[3*i]] -= dm[9*ip+3]*cc; value[idx[3*i]+1] += dm[9*ip+3]*dd; idx[3*i]+=2; } if (fabs(dm[9*ip+6])>tiny) { /* (iy, jy) */ row[idx[3*j+1]>>1] = 2*(3*i+1); value[idx[3*j+1]] -= dm[9*ip+6]*cc; value[idx[3*j+1]+1] -= dm[9*ip+6]*dd; idx[3*j+1]+=2; /* (jy, iy) */ row[idx[3*i+1]>>1] = 2*(3*j+1); value[idx[3*i+1]] -= dm[9*ip+6]*cc; value[idx[3*i+1]+1] += dm[9*ip+6]*dd; idx[3*i+1]+=2; } if (fabs(dm[9*ip+8])>tiny) { /* (iz, jz) */ row[idx[3*j+2]>>1] = 2*(3*i+2); value[idx[3*j+2]] -= dm[9*ip+8]*cc; value[idx[3*j+2]+1] -= dm[9*ip+8]*dd; idx[3*j+2]+=2; /* (jz, iz) */ row[idx[3*i+2]>>1] = 2*(3*j+2); value[idx[3*i+2]] -= dm[9*ip+8]*cc; value[idx[3*i+2]+1] += dm[9*ip+8]*dd; idx[3*i+2]+=2; } if (fabs(dm[9*ip+4])>tiny) { /* (ix, jy) */ row[idx[3*j+1]>>1] = 2*(3*i); value[idx[3*j+1]] -= dm[9*ip+4]*cc; value[idx[3*j+1]+1] -= dm[9*ip+4]*dd; idx[3*j+1]+=2; /* (jy, ix) */ row[idx[3*i]>>1] = 2*(3*j+1); value[idx[3*i]] -= dm[9*ip+4]*cc; value[idx[3*i]+1] += dm[9*ip+4]*dd; idx[3*i]+=2; /* (iy, jx) */ row[idx[3*j]>>1] = 2*(3*i+1); value[idx[3*j]] -= dm[9*ip+4]*cc; value[idx[3*j]+1] -= dm[9*ip+4]*dd; idx[3*j]+=2; /* (jx, iy) */ row[idx[3*i+1]>>1] = 2*(3*j); value[idx[3*i+1]] -= dm[9*ip+4]*cc; value[idx[3*i+1]+1] += dm[9*ip+4]*dd; idx[3*i+1]+=2; } if (fabs(dm[9*ip+5])>tiny) { /* (ix, jz) */ row[idx[3*j+2]>>1] = 2*(3*i); value[idx[3*j+2]] -= dm[9*ip+5]*cc; value[idx[3*j+2]+1] -= dm[9*ip+5]*dd; idx[3*j+2]+=2; /* (jz, ix) */ row[idx[3*i]>>1] = 2*(3*j+2); value[idx[3*i]] -= dm[9*ip+5]*cc; value[idx[3*i]+1] += dm[9*ip+5]*dd; idx[3*i]+=2; /* (iz, jx) */ row[idx[3*j]>>1] = 2*(3*i+2); value[idx[3*j]] -= dm[9*ip+5]*cc; value[idx[3*j]+1] -= dm[9*ip+5]*dd; idx[3*j]+=2; /* (jx, iz) */ row[idx[3*i+2]>>1] = 2*(3*j); value[idx[3*i+2]] -= dm[9*ip+5]*cc; value[idx[3*i+2]+1] += dm[9*ip+5]*dd; idx[3*i+2]+=2; } if (fabs(dm[9*ip+7])>tiny) { /* (iy, jz) */ row[idx[3*j+2]>>1] = 2*(3*i+1); value[idx[3*j+2]] -= dm[9*ip+7]*cc; value[idx[3*j+2]+1] -= dm[9*ip+7]*dd; idx[3*j+2]+=2; /* (jz, iy) */ row[idx[3*i+1]>>1] = 2*(3*j+2); value[idx[3*i+1]] -= dm[9*ip+7]*cc; value[idx[3*i+1]+1] += dm[9*ip+7]*dd; idx[3*i+1]+=2; /* (iz, jy) */ row[idx[3*j+1]>>1] = 2*(3*i+2); value[idx[3*j+1]] -= dm[9*ip+7]*cc; value[idx[3*j+1]+1] -= dm[9*ip+7]*dd; idx[3*j+1]+=2; /* (jy, iz) */ row[idx[3*i+2]>>1] = 2*(3*j+1); value[idx[3*i+2]] -= dm[9*ip+7]*cc; value[idx[3*i+2]+1] += dm[9*ip+7]*dd; idx[3*i+2]+=2; } } /* checksum */ for (i=0; i<3*np; i++) if (idx[i] != num[i+1]) { printf ("Internal data corruption.\n"); free_shm(); free_semaphore(); exit(1); } /*************************************/ /* Multi-Channel Perturbation Method */ /*************************************/ /* reset the error handlers to system defaults */ reset_error_handler(); /* this eliminates zombies */ signal(SIGCHLD, fireman); /* fork the program here: resources such as row[] */ /* (read only) will be automatically "cloned". */ master_pid = getpid(); my_pid = master_pid; my_idx = 0; for (i=1; i0.5) ar[1] = -ar[1]; for (m=2; m<=max_channel-min_channel; m++) ar[m] = -ar[m-2]; /* +--++--++--++--+ */ n = 1; if (frandom()>0.5) n = -n ; for (m=0; m<=max_channel-min_channel; m++) { ai[m] = n*ar[m]; n = -n; } /* apply a random "spin-wave" encoding to avoid */ /* numerical problems during time integration */ phi[0] = 0.; cc = 2*PI/ktwist; if (frandom()>0.5) cc = -cc; for (m=0; m<=max_channel-min_channel; m++) { cmplx_mul (ar[m], ai[m], cos(phi[m]), sin(phi[m]), &ar[m], &ai[m]); if (m!=0) phi[m] = phi[m-1] + cc*frandom(); fup[m] = 0.; fvp[m] = 0.; } /* Precondition the integration: only odd-power derivatives exist at t=0. To save space, u2 stands for u3, u4 stands for u5, etc. */ for (i=0; i<6*np; i++) { u[i] = 0.; u2[i] = 0.; u4[i] = 0.; u6[i] = 0.; u8[i] = 0.; u10[i] = 0.; } for (m=0; m<=max_channel-min_channel; m++) { u2[6*iatom] += v1[m]*ar[m]; u4[6*iatom] -= v1[m]*v2[m]*ar[m]; u6[6*iatom] += v1[m]*v4[m]*ar[m]; u8[6*iatom] -= v1[m]*v6[m]*ar[m]; u10[6*iatom] += v1[m]*v8[m]*ar[m]; u2[6*iatom+1] += v1[m]*ai[m]; u4[6*iatom+1] -= v1[m]*v2[m]*ai[m]; u6[6*iatom+1] += v1[m]*v4[m]*ai[m]; u8[6*iatom+1] -= v1[m]*v6[m]*ai[m]; u10[6*iatom+1] += v1[m]*v8[m]*ai[m]; } for (i=0; i<6*np; i+=2) for (j=num[i>>1]; j>1)+1]; j+=2) { u4[row[j>>1]] -= value[j]*u2[i]-value[j+1]*u2[i+1]; u4[row[j>>1]+1] -= value[j]*u2[i+1]+value[j+1]*u2[i]; } for (i=0; i<6*np; i+=2) for (j=num[i>>1]; j>1)+1]; j+=2) { u6[row[j>>1]] -= value[j]*u4[i]-value[j+1]*u4[i+1]; u6[row[j>>1]+1] -= value[j]*u4[i+1]+value[j+1]*u4[i]; } for (i=0; i<6*np; i+=2) for (j=num[i>>1]; j>1)+1]; j+=2) { u8[row[j>>1]] -= value[j]*u6[i]-value[j+1]*u6[i+1]; u8[row[j>>1]+1] -= value[j]*u6[i+1]+value[j+1]*u6[i]; } for (i=0; i<6*np; i+=2) for (j=num[i>>1]; j>1)+1]; j+=2) { u10[row[j>>1]] -= value[j]*u8[i]-value[j+1]*u8[i+1]; u10[row[j>>1]+1] -= value[j]*u8[i+1]+value[j+1]*u8[i]; } for (i=0; i<6*np; i++) { u[i] = u2[i]*del3+u4[i]*del5+u6[i]*del7+u8[i]*del9+u10[i]*del11; uold[i] = 0.; } /* real part */ u1i = 0.; u3i = u2[6*iatom]; u5i = u4[6*iatom]; u7i = u6[6*iatom]; u9i = u8[6*iatom]; u11i = u10[6*iatom]; ui = 0.; u2i = 0.; u4i = 0.; u6i = 0.; u8i = 0.; u10i = 0.; /* imaginary part */ v1i = 0.; v3i = u2[6*iatom+1]; v5i = u4[6*iatom+1]; v7i = u6[6*iatom+1]; v9i = u8[6*iatom+1]; v11i = u10[6*iatom+1]; vi = 0.; v2i = 0.; v4i = 0.; v6i = 0.; v8i = 0.; v10i = 0.; if (print_floprate) startwatch(); /* main loop starts */ for (ip=1; ip<=mint; ip++) { /* calculate the perturbation on normal coordinate */ u2_ibb = 0.; u4_ibb = 0.; u6_ibb = 0.; u8_ibb = 0.; u10_ibb = 0.; v2_ibb = 0.; v4_ibb = 0.; v6_ibb = 0.; v8_ibb = 0.; v10_ibb = 0.; for (m=0; m<=max_channel-min_channel; m++) { ipm = ((m+min_channel)*ip-1)%mint; u2_ibb += dsin[ipm]*ar[m]; u4_ibb -= v2[m]*dsin[ipm]*ar[m]; u6_ibb += v4[m]*dsin[ipm]*ar[m]; u8_ibb -= v6[m]*dsin[ipm]*ar[m]; u10_ibb += v8[m]*dsin[ipm]*ar[m]; v2_ibb += dsin[ipm]*ai[m]; v4_ibb -= v2[m]*dsin[ipm]*ai[m]; v6_ibb += v4[m]*dsin[ipm]*ai[m]; v8_ibb -= v6[m]*dsin[ipm]*ai[m]; v10_ibb += v8[m]*dsin[ipm]*ai[m]; } /* Let the work be done in parallel: */ parallel_multiply_all (6*iatom, u2_ibb, u4_ibb, u6_ibb, u8_ibb, u10_ibb, v2_ibb, v4_ibb, v6_ibb, v8_ibb, v10_ibb); if (ip!=1) { u11i = (u10[6*iatom]-uo10i)/2./delta; u9i = ((u8[6*iatom]-uo8i)/2.-del3*u11i)/delta; u7i = ((u6[6*iatom]-uo6i)/2.-del3*u9i-del5*u11i)/delta; u5i = ((u4[6*iatom]-uo4i)/2.-del3*u7i-del5*u9i-del7*u11i) /delta; u3i = ((u2[6*iatom]-uo2i)/2.-del3*u5i-del5*u7i-del7*u9i -del9*u11i)/delta; u1i = ((u[6*iatom]-uoi)/2.-del3*u3i-del5*u5i-del7*u7i -del9*u9i-del11*u11i)/delta; v11i = (u10[6*iatom+1]-vo10i)/2./delta; v9i = ((u8[6*iatom+1]-vo8i)/2.-del3*v11i)/delta; v7i = ((u6[6*iatom+1]-vo6i)/2.-del3*v9i-del5*v11i)/delta; v5i = ((u4[6*iatom+1]-vo4i)/2.-del3*v7i-del5*v9i-del7*v11i) /delta; v3i = ((u2[6*iatom+1]-vo2i)/2.-del3*v5i-del5*v7i-del7*v9i -del9*v11i)/delta; v1i = ((u[6*iatom+1]-voi)/2.-del3*v3i-del5*v5i-del7*v7i -del9*v9i-del11*v11i)/delta; } /* calculate the integral */ uni = ui+delta*u1i+del2*u2i+del3*u3i+del4*u4i+del5*u5i +del6*u6i+del7*u7i+del8*u8i+del9*u9i+del10*u10i+del11*u11i; un1i = u1i+delta*u2i+del2*u3i+del3*u4i+del4*u5i+del5*u6i +del6*u7i+del7*u8i+del8*u9i+del9*u10i+del10*u11i; un2i = u2i+delta*u3i+del2*u4i+del3*u5i+del4*u6i+del5*u7i +del6*u8i+del7*u9i+del8*u10i+del9*u11i; un3i = u3i+delta*u4i+del2*u5i+del3*u6i+del4*u7i+del5*u8i +del6*u9i+del7*u10i+del8*u11i; un4i = u4i+delta*u5i+del2*u6i+del3*u7i+del4*u8i+del5*u9i +del6*u10i+del7*u11i; un5i = u5i+delta*u6i+del2*u7i+del3*u8i+del4*u9i+del5*u10i +del6*u11i; un6i = u6i+delta*u7i+del2*u8i+del3*u9i+del4*u10i+del5*u11i; un7i = u7i+delta*u8i+del2*u9i+del3*u10i+del4*u11i; un8i = u8i+delta*u9i+del2*u10i+del3*u11i; un9i = u9i+delta*u10i+del2*u11i; un10i = u10i+delta*u11i; un11i = u11i; vni = vi+delta*v1i+del2*v2i+del3*v3i+del4*v4i+del5*v5i +del6*v6i+del7*v7i+del8*v8i+del9*v9i+del10*v10i+del11*v11i; vn1i = v1i+delta*v2i+del2*v3i+del3*v4i+del4*v5i+del5*v6i +del6*v7i+del7*v8i+del8*v9i+del9*v10i+del10*v11i; vn2i = v2i+delta*v3i+del2*v4i+del3*v5i+del4*v6i+del5*v7i +del6*v8i+del7*v9i+del8*v10i+del9*v11i; vn3i = v3i+delta*v4i+del2*v5i+del3*v6i+del4*v7i+del5*v8i +del6*v9i+del7*v10i+del8*v11i; vn4i = v4i+delta*v5i+del2*v6i+del3*v7i+del4*v8i+del5*v9i +del6*v10i+del7*v11i; vn5i = v5i+delta*v6i+del2*v7i+del3*v8i+del4*v9i+del5*v10i +del6*v11i; vn6i = v6i+delta*v7i+del2*v8i+del3*v9i+del4*v10i+del5*v11i; vn7i = v7i+delta*v8i+del2*v9i+del3*v10i+del4*v11i; vn8i = v8i+delta*v9i+del2*v10i+del3*v11i; vn9i = v9i+delta*v10i+del2*v11i; vn10i = v10i+delta*v11i; vn11i = v11i; for (m=0; m<=max_channel-min_channel; m++) { ipm = ((m+min_channel)*ip-1)%mint; ibm = ((m+min_channel)*(ip+mint-1)-1)%mint; fup[m] += (uni-un2i/v2[m]+un4i/v4[m]-un6i/v6[m]+un8i/v8[m] -un10i/v10[m])/v1[m]*dsin[ipm] -(ui-u2i/v2[m]+u4i/v4[m]-u6i/v6[m]+u8i/v8[m] -u10i/v10[m])/v1[m]*dsin[ibm] +(un1i-un3i/v2[m]+un5i/v4[m]-un7i/v6[m]+un9i/v8[m] -un11i/v10[m])/v2[m]*dcos[ipm] -(u1i-u3i/v2[m]+u5i/v4[m]-u7i/v6[m]+u9i/v8[m] -u11i/v10[m])/v2[m]*dcos[ibm]; fvp[m] += (vni-vn2i/v2[m]+vn4i/v4[m]-vn6i/v6[m]+vn8i/v8[m] -vn10i/v10[m])/v1[m]*dsin[ipm] -(vi-v2i/v2[m]+v4i/v4[m]-v6i/v6[m]+v8i/v8[m] -v10i/v10[m])/v1[m]*dsin[ibm] +(vn1i-vn3i/v2[m]+vn5i/v4[m]-vn7i/v6[m]+vn9i/v8[m] -vn11i/v10[m])/v2[m]*dcos[ipm] -(v1i-v3i/v2[m]+v5i/v4[m]-v7i/v6[m]+v9i/v8[m] -v11i/v10[m])/v2[m]*dcos[ibm]; } /* a chunk of work done */ if (print_floprate && (ip%steps_to_inform==0)) { /* cc = seconds spent */ checkwatch (&cc); dd = ip*(ORDER_OF_INTEGRATION-2)/2.*8.*(num[3*np]>>1); printf ("step = %d, %f%% of a base cycle completed.\n", ip, 100.*ip/mint); ee = dd*1e-6/cc; printf ("Mflops/s up to now = %f,\n", ee); printf ("Mflops/s per processor = %f.\n\n", ee/num_processors); } uo10i = u10i; uo8i = u8i; uo6i = u6i; uo4i = u4i; uo2i = u2i; uoi = ui; vo10i = v10i; vo8i = v8i; vo6i = v6i; vo4i = v4i; vo2i = v2i; voi = vi; u10i = u10[6*iatom]; u8i = u8[6*iatom]; u6i = u6[6*iatom]; u4i = u4[6*iatom]; u2i = u2[6*iatom]; ui = u[6*iatom]; v10i = u10[6*iatom+1]; v8i = u8[6*iatom+1]; v6i = u6[6*iatom+1]; v4i = u4[6*iatom+1]; v2i = u2[6*iatom+1]; vi = u[6*iatom+1]; /* Verlet leap frog integration */ for (i=0; i<6*np; i++) { cc = uold[i]; uold[i] = u[i]; u[i] = 2.*(u[i]+del2*u2[i]+del4*u4[i]+del6*u6[i]+ del8*u8[i]+del10*u10[i])-cc; } } /* main loop ends */ if (print_floprate) { checkwatch (&cc); dd = mint*(ORDER_OF_INTEGRATION-2)/2.*8.*(num[3*np]>>1); ee = dd*1e-6/cc; printf ("Mflops/s during the cycle = %f\n", ee); printf ("Mflops/s per processor = %f\n\n", ee/num_processors); } /* calculate the LDOS */ for (m=0; m<=max_channel-min_channel; m++) { /* decode the spin-wave */ cmplx_mul (ar[m], ai[m], cos(phi[m]), -sin(phi[m]), &ar[m], &ai[m]); cmplx_mul (fup[m], fvp[m], cos(phi[m]), -sin(phi[m]), &fup[m], &fvp[m]); /* plug into the formula */ /* cancel out all odd-numbered noise functions */ /* scale back to real freq. DOS */ dd = -2.*(m+min_channel)*min_v*min_v/PI/PI *(fup[m]/ar[m]+fvp[m]/ai[m])/2. *sqrt(v1[m]*v1[m]-shift_v*shift_v)/v1[m]; ldos[m] += kpts[4*k+3]*dd; } sumweight += kpts[4*k+3]; /* save to .ldos file */ filehandle = fopen(fn_ldos, "w+"); for (m=0; m<=max_channel-min_channel; m++) fprintf (filehandle, "%9.6f %9.6f\n", sqrt(v1[m]*v1[m]-shift_v*shift_v), ldos[m]/sumweight); fclose (filehandle); /* Kill the slave processes here */ /* because D(k) is going to be changed. */ master_set_command (SLAVE_DIE); master_sleep(); } /* sample over supercell k's */ /* free all allocated memories */ printf ("Freeing all allocated memories.\n"); free (kpts); free (value); free (row); free (idx); free (num); free (dm); free (dm_index); free (uold); free (s); free (mass); free (ldos); free (fvp); free (fup); free (phi); free (ai); free (ar); free (v10); free (v8); free (v6); free (v4); free (v2); free (v1); free (dcos); free (dsin); /* free IPC resources */ free_shm(); free_semaphore(); printf("Mission terminates successfully:\n\n"); gettimeofday (&tp, NULL); printf("the entire mission takes %s.\n", earth_time(tp.tv_sec+0.000001*tp.tv_usec-start_time)); return(0); } /* end main() */ /* Woon's pair potential, Chem. Phys. Lett. 204, 29 (1993) */ #define WOON_Q 0.6060 #define WOON_EPSILON (0.4228*HARTREE_IN_J/1000./UENERGY_IN_J) #define WOON_RE (7.1714*BOHR_RADIUS_IN_A/ULENGTH_IN_A) #define WOON_KE (0.6627*HARTREE_IN_J/1000./UENERGY_IN_J/ \ pow(BOHR_RADIUS_IN_A/ULENGTH_IN_A, 2.)) #define WOON_K1 sqrt((WOON_Q+1)*WOON_KE/WOON_Q/WOON_EPSILON) #define WOON_K2 sqrt(WOON_Q*WOON_KE/(WOON_Q+1)/WOON_EPSILON) #define WOON_RESIDUE (WOON_EPSILON*(WOON_Q*exp(-WOON_K1*(RCUT-WOON_RE))- \ (WOON_Q+1)*exp(-WOON_K2*(RCUT-WOON_RE)))) #define WOON_RESIDUE_FORCE \ (WOON_EPSILON*(WOON_Q*WOON_K1*exp(-WOON_K1*(RCUT-WOON_RE))- \ (WOON_Q+1)*WOON_K2*exp(-WOON_K2*(RCUT-WOON_RE)))) #define LJ_RESIDUE (4*(pow(RCUT,-12.)-pow(RCUT,-6.))) #define LJ_RESIDUE_FORCE ((48*pow(RCUT,-12.)-24*pow(RCUT,-6.))/RCUT) /* Evaluate the force constant matrix of a pair interaction. */ void force_constants (double rx, double ry, double rz, double force_constant[3][3]) { double r1, r2, a, b; double ir2, ir4, ir6, ir12; double first, second; /* V'/r and V''/r */ double res = LJ6_12?LJ_RESIDUE:WOON_RESIDUE; double ref = LJ6_12?LJ_RESIDUE_FORCE:WOON_RESIDUE_FORCE; r2 = rx*rx + ry*ry + rz*rz; r1 = sqrt(r2); if (LJ6_12) { ir2 = 1./r2; ir4 = ir2 * ir2; ir6 = ir4 * ir2; ir12 = ir6 * ir6; first = (-48.*ir12+24.*ir6)*ir2 + ref/r1; second = (624.*ir12-168.*ir6)*ir4; } else if(WOON) { a = WOON_EPSILON * WOON_Q * exp(-WOON_K1 * (r1-WOON_RE)); b = WOON_EPSILON * (WOON_Q+1) * exp(-WOON_K2 * (r1-WOON_RE)); first = (-WOON_K1*a+WOON_K2*b+ ref)/r1; second = (WOON_K1*WOON_K1*a-WOON_K2*WOON_K2*b)/r2; } force_constant[0][0] = (second-first/r2)*rx*rx+first; force_constant[0][1] = (second-first/r2)*rx*ry; force_constant[0][2] = (second-first/r2)*rx*rz; force_constant[1][1] = (second-first/r2)*ry*ry+first; force_constant[1][2] = (second-first/r2)*ry*rz; force_constant[2][2] = (second-first/r2)*rz*rz+first; force_constant[1][0] = force_constant[0][1]; force_constant[2][0] = force_constant[0][2]; force_constant[2][1] = force_constant[1][2]; return; } /* end force_constants() */ double matinv (double A[3][3], double B[3][3]) { double D11,D22,D33,D12,D21,D13,D31,D32,D23,volume; D11=A[1][1]*A[2][2]-A[1][2]*A[2][1]; D22=A[2][2]*A[0][0]-A[2][0]*A[0][2]; D33=A[0][0]*A[1][1]-A[0][1]*A[1][0]; D12=A[1][2]*A[2][0]-A[1][0]*A[2][2]; D23=A[2][0]*A[0][1]-A[2][1]*A[0][0]; D31=A[0][1]*A[1][2]-A[0][2]*A[1][1]; D13=A[1][0]*A[2][1]-A[2][0]*A[1][1]; D21=A[2][1]*A[0][2]-A[0][1]*A[2][2]; D32=A[0][2]*A[1][0]-A[1][2]*A[0][0]; volume=A[0][0]*D11+A[0][1]*D12+A[0][2]*D13; B[0][0]=D11/volume; B[1][1]=D22/volume; B[2][2]=D33/volume; B[0][1]=D21/volume; B[1][2]=D32/volume; B[2][0]=D13/volume; B[1][0]=D12/volume; B[2][1]=D23/volume; B[0][2]=D31/volume; return (volume); } /* end matinv() */ void matmul (double A[3][3], double B[3][3], double D[3][3]) { volatile double C[3][3]; C[0][0] = A[0][0]*B[0][0] + A[0][1]*B[1][0] + A[0][2]*B[2][0]; C[0][1] = A[0][0]*B[0][1] + A[0][1]*B[1][1] + A[0][2]*B[2][1]; C[0][2] = A[0][0]*B[0][2] + A[0][1]*B[1][2] + A[0][2]*B[2][2]; C[1][0] = A[1][0]*B[0][0] + A[1][1]*B[1][0] + A[1][2]*B[2][0]; C[1][1] = A[1][0]*B[0][1] + A[1][1]*B[1][1] + A[1][2]*B[2][1]; C[1][2] = A[1][0]*B[0][2] + A[1][1]*B[1][2] + A[1][2]*B[2][2]; C[2][0] = A[2][0]*B[0][0] + A[2][1]*B[1][0] + A[2][2]*B[2][0]; C[2][1] = A[2][0]*B[0][1] + A[2][1]*B[1][1] + A[2][2]*B[2][1]; C[2][2] = A[2][0]*B[0][2] + A[2][1]*B[1][2] + A[2][2]*B[2][2]; D[0][0] = C[0][0]; D[0][1] = C[0][1]; D[0][2] = C[0][2]; D[1][0] = C[1][0]; D[1][1] = C[1][1]; D[1][2] = C[1][2]; D[2][0] = C[2][0]; D[2][1] = C[2][1]; D[2][2] = C[2][2]; return; } /* end matmul() */ void cmplx_mul (double ar, double ai, double br, double bi, double *dr, double *di) { volatile double cr = ar*br-ai*bi; *di = ar*bi+ai*br; *dr = cr; return; } /* cmplx_mul() */ /* returns pseudo-random number on (0,1) */ double frandom() { return ((random()+0.5)/pow((double)2, (double)31)); } /* end frandom() */ /* manage an internal list of shared memories */ void *apply_for_shm (int size) { static int last_KEY = 0; if (num_shm_seg==MAX_SHM_SEG) { printf ("Error in apply_for_shm(): maximum of %d\n", MAX_SHM_SEG); printf ("shared memory segments exceeded.\n"); return (NULL); } shm[num_shm_seg].size = size; for (shm[num_shm_seg].KEY = last_KEY+1; ((shm[num_shm_seg].ID= shmget(shm[num_shm_seg].KEY,size,IPC_CREAT|0600)) ==-1)&&(shm[num_shm_seg].KEY<65535); shm[num_shm_seg].KEY++); if(shm[num_shm_seg].ID!=-1) { printf ("shared memory segment %d: KEY = %d ID = %d\n", num_shm_seg+1,shm[num_shm_seg].KEY,shm[num_shm_seg].ID); /* The system will find a good logical address to attach the shared memory segment to. Should be rounded to 16 bytes. */ shm[num_shm_seg].addr = shmat(shm[num_shm_seg].ID,(char *)0,0600|SHM_RND); if (shm[num_shm_seg].addr==(void *)-1) { printf ("apply_for_shm(): fail to attach shm segment %d\n", num_shm_seg+1); return(NULL); } else { printf("Succeed in getting %d bytes of shared memory.\n", size); /* clear the allocated region */ memset (shm[num_shm_seg].addr, 0, size); last_KEY = shm[num_shm_seg].KEY; return (shm[num_shm_seg++].addr); } } else { printf("apply_for_shm(): fail to get %d bytes of shared memory.\n", size); return(NULL); } } /* apply_for_shm() */ /* allocate shared arrays that are WRITABLE */ int init_shm_arrays (int n6) { int i, not_null=1; int totalsize = sizeof(int)*(num_processors+1) /* memeory for domain[] */ + sizeof(char)*num_processors /* memeory for commands[] */ + sizeof(double)*n6*(6+num_processors); /* for u,u2,u4,u6,u8,u10,copy[] */ num_shm_seg = 0; size_of_vector_in_double = n6; size_of_vector_in_bytes = size_of_vector_in_double*sizeof(double); /* first see if we can allocate everything in one piece */ if ((shm_base=(double *)apply_for_shm(totalsize))!=NULL) { /* displacement variables used in multichannel method */ u = shm_base; u2 = u + n6; u4 = u2 + n6; u6 = u4 + n6; u8 = u6 + n6; u10 = u8 + n6; for (i=0; i\n"); printf ("Let us try segmented allocation.\n"); /* pack into maximally contiguous blocks to increase cache hit */ shm_base = (double *) apply_for_shm(size_of_vector_in_bytes*3+ (num_processors+1)*sizeof(int)); u = shm_base; u2 = u + n6; u4 = u2 + n6; domain = (int *) (u4 + n6); not_null = not_null && (u!=NULL); u6 = (double *) apply_for_shm (size_of_vector_in_bytes*3+ sizeof(char)*num_processors); u8 = u6 + n6; u10 = u8 + n6; commands = (char *) (u10+n6); not_null = not_null && (u6!=NULL); for (i=0; i=0; num_shm_seg--) if (shmctl(shm[num_shm_seg].ID, IPC_RMID, NULL) == -1) printf("Unable to remove shared memory ID=%d.\n", shm[num_shm_seg].ID); } /* end free_shm() */ /* semaphore set for multiprocess scheduling */ int init_semaphore (int num_processors) { int i; for (sem_KEY=1;((sem_ID=semget(sem_KEY,num_processors,IPC_CREAT|0600)) ==-1)&&(sem_KEY<65535);sem_KEY++); printf ("semaphore KEY = %d ID = %d\n", sem_KEY, sem_ID); if(sem_ID!=-1) { /* define standard semaphore operations */ /* increase one element */ semop_add.sem_op = +1; semop_add.sem_flg = 0600; /* decrease one element */ semop_minus.sem_op = -1; semop_minus.sem_flg = 0600; /* sleep till one element is set to zero */ semop_zero.sem_op = 0; semop_zero.sem_flg = 0600; /* hypnotize the slave processes now by setting them all to 1 */ for (i=1; i>1], num[(i>>1)+1]); */ for (j=num[i>>1]; j>1)+1]; j+=2) { out[row[j>>1]] -= value[j]*in[i]-value[j+1]*in[i+1]; out[row[j>>1]+1] -= value[j]*in[i+1]+value[j+1]*in[i]; } } break; /* second phase: add copies to next array */ case COPY_TO_U2: out=u2; goto further; case COPY_TO_U4: out=u4; goto further; case COPY_TO_U6: out=u6; goto further; case COPY_TO_U8: out=u8; goto further; case COPY_TO_U10: out=u10; further: for (j=0; j 0); signal (SIGCHLD, fireman); } char *earth_time (double seconds) { const struct time_unit { char *name; double scale; } earthman[] = { {"s", 1}, {"min", 60}, {"hr", 60}, {"day", 24}, {"month", 30}, {"year", 365} }; static char time[128], *p; int i, max=sizeof(earthman)/sizeof(struct time_unit); double scale; int denom; seconds /= earthman[0].scale; for (scale=1,i=1; i0; i--) { denom = floor(seconds/scale); if (denom>0) { sprintf(p, "%d %s, ", denom, earthman[i].name); while((*p)!=(char)0) p++; } seconds -= denom*scale; scale /= earthman[i].scale; } sprintf(p, "%.3f %s", seconds, earthman[0].name); return(time); } /* end earth_time() */