/*********************************************/
/*  O(N), Multi-Channel Perturbation Method  */
/*  to Calclate the Local Density of States  */
/*  [Phys. Rev. B 56, pp. 3524 (1997)].      */
/*                                           */
/*                   Ver 2.1  June 22, 1998  */
/*                 Developed by Ju Li (MIT)  */
/*********************************************/

/*
  Version Notes:
  1. TTY interface.
  2. Woon's pair potential.
  3. Dynamical matrix for arbitrary
     configuration and pair potential.
  4. User-defined supercell k
     sampling or random sampling.
  5. High-precision (order-12) 
     integration of equation of motion.
  6. Spin-wave amplitude encoding
     to minimize the point spread.
  7. Shift the spectrum for better 
     convergence at low frequencies.
  8. Parallel shared memory using
     UNIX fork(), shmop(), semop().
  9. Flop rate benchmark.
*/

#include <stdio.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <signal.h>
#include <time.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <sys/sem.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <math.h>
#include "mcp.h"

int main()
{
    int i,j,k,l,m,n,ip,ipm,ibm;
    double cc,dd,ee,tmp[3][3],sumweight;
    double sxji,syji,szji,rx,ry,rz;
    char *p, sentence[MAX_STRING_LENGTH];
    
    /* Read instructions from input, such as "mon" file: */
    printf ("\nReading instructions...\n");
    
    fscanf (stdin,
	    "Name of configuration (->*, default=perfect_crystal.lj):\n%s\n\n",
	    config_name);
    if (!strcasecmp(config_name,"default"))
	sprintf(config_name, "perfect_crystal.lj");
    
    fscanf (stdin,
            "Pair potential to be used (LJ6_12 or WOON):\n%s\n\n",
            potential_function);
    LJ6_12 = !strcmp(potential_function,"LJ6_12");
    WOON = !strcmp(potential_function,"WOON");
    if ((!LJ6_12) && (!WOON))
    {
        fprintf (stdout, "\n **Error: Choose between LJ6_12 or WOON pair potential\n");
        exit(1);
    }

    fscanf (stdin,
	    "Load configuration from file (default=*):\n%s\n\n",
	    fn_config_read);
    if (!strcasecmp(fn_config_read,"default"))
	sprintf(fn_config_read, "%s", config_name);
    
    fscanf(stdin,
	   "Atom index and Cartesian direction for LDOS:\n%d %lf %lf %lf\n\n",
	   &iatom, &Q[0][0], &Q[0][1], &Q[0][2]);
    
    fscanf (stdin,
	    "Save LDOS on file (default=*.ldos):\n%s\n\n",
	    fn_ldos);
    if (!strcasecmp(fn_ldos, "default"))
	sprintf(fn_ldos, "%s.ldos", config_name);

    fscanf (stdin, "Supercell k-point sampling file (if numeric then\n");
    fscanf (stdin,
	    "apply random sampling in BZ, default=Kpts/sckpt.8):\n%s\n\n",
	    fn_kpt);
    if (random_k_sampling=((*fn_kpt>='0')&&(*fn_kpt<='9')))
    {	
	/* user inputs a integer, do random k sampling */
	for (p=fn_kpt;(*p>='0')&&(*p<='9');p++); *p = 0;
	sscanf(fn_kpt, "%d", &num_kpts);
    }
    else if (!strcasecmp(fn_kpt, "default"))
	sprintf(fn_kpt, "Kpts/sckpt.8");

    fscanf(stdin,
	   "Ignore dynamical matrix components smaller than (THz^2):\n%lf\n\n",
	   &tiny);
    
    fscanf(stdin, "Minimum (base) frequency (THz):\n%lf\n\n", &min_v);
    fscanf(stdin, "Maximum frequency (THz):\n%lf\n\n", &max_v);
    fscanf(stdin, "Shift the spectrum by (THz):\n%lf\n\n", &shift_v);
    
    fscanf(stdin,
	   "Twist period of spin-wave amplitude encoding:\n%d\n\n", 
	   &ktwist);

    fscanf(stdin,
	   "Number of steps of a period for the maximum frequency:\n%d\n\n", 
	   &mincycle);
    
    fscanf(stdin, "To run on how many processors:\n%d\n\n", &num_processors);
    if (num_processors > MAX_PROCESSORS)
    {
	printf ("MAX_PROCESSORS < %d, revise source code.\n",
		num_processors);
	exit(1);
    }

    fscanf(stdin, "Work load of the master process\n");
    fscanf(stdin,
	   "(-1 -> {1., 0.98, 0.98, 0.98, 0.98, 0.98, 0.98, 0.98}):\n%lf\n\n",
	   &master_load);
    if (master_load < 0)
	master_load = default_master_load[num_processors-1];
    if (num_processors==1) master_load = 1.0;
    
    fscanf(stdin, "Frequency of getting reported (in Gflop)\n");
    fscanf(stdin, "(NULL=don't, default=0.5 Gflop):\n%s\n\n", sentence);
    if (print_floprate=strcasecmp(sentence, "NULL"))
	if (!strcasecmp(sentence, "default"))
	    work_in_Gflop = 0.5;
	else sscanf(sentence, "%lf", &work_in_Gflop);

    fscanf (stdin,
	    "Random number generator seed (long integer):\n%ld\n\n",
	    &iseed);
    srandom (iseed);

    printf("Instructions read complete.\n\n");
    /* Instructions read complete */
    
    /* print out mission statement */
    printf ("LDOS will be calculated for atom %d in \"%s\"\n",
	    iatom, config_name);
    printf ("along (%.2f,%.2f,%.2f) direction using %s potential\n", 
	    Q[0][0], Q[0][1], Q[0][2], potential_function); 
    printf ("with Rcut = %f = %f Angstrom,\n",
	    CUTOFF, CUTOFF*ULENGTH_IN_ANGSTROM);

    /* first channel, after-transformation freq. fixed: */
    min_channel = (int) ceil(shift_v/min_v);
    dd = min_channel*min_v; 
    /* second channel, after-transformation freq. fixed: */
    ee = dd + min_v;                
    /* make the 2nd real freq. (cc x the 1st real freq.) */
    cc = 2.5;
    shift_v = sqrt((cc*cc*dd*dd-ee*ee)/(cc*cc-1.));
    /* maximum after-transformation freq. */
    max_v = sqrt(max_v*max_v+shift_v*shift_v);
    max_channel = floor(max_v/min_v); 
    printf ("for %d frequency channels between [%f, %f] THz, or\n",
	    max_channel-min_channel+1, sqrt(dd*dd-shift_v*shift_v),
	    sqrt(max_channel*max_channel*min_v*min_v-shift_v*shift_v));
    printf ("[%f, %f] THz for the shifted dynamical matrix.\n\n",
	    dd, max_channel*min_v); 
	
    /* allocating memory for frequency channels */
    printf ("Allocating memory for %d frequency channels.\n\n",
	    max_channel-min_channel+1);
    /* frequencies in THz */
    v1 = (double *) malloc((max_channel-min_channel+1)*sizeof(double)); 
    v2 = (double *) malloc((max_channel-min_channel+1)*sizeof(double)); 
    v4 = (double *) malloc((max_channel-min_channel+1)*sizeof(double)); 
    v6 = (double *) malloc((max_channel-min_channel+1)*sizeof(double)); 
    v8 = (double *) malloc((max_channel-min_channel+1)*sizeof(double)); 
    v10 = (double *) malloc((max_channel-min_channel+1)*sizeof(double));
    /* real and imaginary amplitude series */
    ar = (double *) malloc((max_channel-min_channel+1)*sizeof(double)); 
    ai = (double *) malloc((max_channel-min_channel+1)*sizeof(double));
    /* spin wave phase encoding */
    phi = (double *) malloc((max_channel-min_channel+1)*sizeof(double));
    /* LDOS result */
    fup = (double *) malloc((max_channel-min_channel+1)*sizeof(double)); 
    fvp = (double *) malloc((max_channel-min_channel+1)*sizeof(double)); 
    ldos = (double *) malloc((max_channel-min_channel+1)*sizeof(double)); 
    for (m=0; m<=max_channel-min_channel; m++)
    {
	v1[m] = (m+min_channel)*min_v;
	v2[m] = pow(v1[m],2.); 
	v4[m] = pow(v1[m],4.);
	v6[m] = pow(v1[m],6.);
	v8[m] = pow(v1[m],8.);
	v10[m]= pow(v1[m],10.);
	ar[m] = 0.;
	ai[m] = 0.;
	phi[m] = 0.;
	ldos[m] = 0.;
    }

    /* total number of integration steps: */
    mint = max_channel*mincycle;
    /* 2 x Pi x (timestep size in ps): */
    delta = 2.*PI/min_v/mint;
    printf ("Run track is %d steps long, equivalent step size = %f ps.\n",
	    mint, delta/2./PI);
    
    del2 = pow(delta,2.)/2.;
    del3 = pow(delta,3.)/6.;
    del4 = pow(delta,4.)/24.;
    del5 = pow(delta,5.)/120.;
    del6 = pow(delta,6.)/720.;
    del7 = pow(delta,7.)/5040.;
    del8 = pow(delta,8.)/40320.;
    del9 = pow(delta,9.)/362880.;
    del10 = pow(delta,10.)/3628800.;
    del11 = pow(delta,11.)/39916800.;
    
    printf ("Allocating memory for %d phases.\n\n", mint);
    /* prepare the phases */
    dsin = (double *) malloc(mint*sizeof(double));
    dcos = (double *) malloc(mint*sizeof(double));
    for (ip=1; ip<=mint; ip++)
    {
	dsin[ip-1] = sin(2.*PI/mint*ip);
	dcos[ip-1] = cos(2.*PI/mint*ip);
    }

    printf("The configuration is being loaded from \"%s\".\n",
	    fn_config_read); 
    if ((filehandle=fopen(fn_config_read, "r"))==NULL)
    {
	fprintf(stdout, "\n **Error: cannot locate file \"%s\"\n.",
		fn_config_read);
	exit(1);
    }
    fscanf (filehandle, "Number of particles = %d\n\n", &np);

    printf ("Allocate property arrays for %d particles.\n\n",np);
    /* reduced coordinates */
    mass = (double *) malloc(np*sizeof(double));
    s = (double *) malloc(3*np*sizeof(double));
    uold = (double *) malloc(6*np*sizeof(double));
	
    /* read in supercell geometry from configuration file */ 
    fscanf (filehandle, "H(1,1) = %lf Angstrom\n",   &H[0][0]);
    fscanf (filehandle, "H(1,2) = %lf Angstrom\n",   &H[0][1]);
    fscanf (filehandle, "H(1,3) = %lf Angstrom\n\n", &H[0][2]);
    fscanf (filehandle, "H(2,1) = %lf Angstrom\n",   &H[1][0]);
    fscanf (filehandle, "H(2,2) = %lf Angstrom\n",   &H[1][1]);
    fscanf (filehandle, "H(2,3) = %lf Angstrom\n\n", &H[1][2]);
    fscanf (filehandle, "H(3,1) = %lf Angstrom\n",   &H[2][0]);
    fscanf (filehandle, "H(3,2) = %lf Angstrom\n",   &H[2][1]);
    fscanf (filehandle, "H(3,3) = %lf Angstrom\n\n", &H[2][2]);
    for (i=0;i<3;i++)
	for (j=0;j<3;j++)
	    H[i][j] /= ULENGTH_IN_ANGSTROM;
    volume = fabs(matinv(H, HI));

    printf ("The supercell geometry is\n");
    printf ("        | %.5f %.5f %.5f |\n",
	  H[0][0]*ULENGTH_IN_ANGSTROM,
	  H[0][1]*ULENGTH_IN_ANGSTROM,
	  H[0][2]*ULENGTH_IN_ANGSTROM);	  
    printf (" H[A] = | %.5f %.5f %.5f |\n",
	  H[1][0]*ULENGTH_IN_ANGSTROM,
	  H[1][1]*ULENGTH_IN_ANGSTROM,
	  H[1][2]*ULENGTH_IN_ANGSTROM);
    printf ("        | %.5f %.5f %.5f |\n",
	  H[2][0]*ULENGTH_IN_ANGSTROM,
	  H[2][1]*ULENGTH_IN_ANGSTROM,
	  H[2][2]*ULENGTH_IN_ANGSTROM);
    
    printf ("volume = %f = %f A^3.\n\n", volume,
	    volume*ULENGTH_IN_ANGSTROM*ULENGTH_IN_ANGSTROM*ULENGTH_IN_ANGSTROM);
    if (np/volume > 0.85*MAXDENSITY)
    {
	printf("Current density %f is above 85%% of MAXDENSITY %f in \"mcp_constant.h\".\n");
	printf("Overflow feared. Please increase  MAXDENSITY in \"mcp_constant.h\".\n");
	exit(1);
    }

    /* To simplify the programming, we require the CUTOFF sphere to be
       contained in the parallelepiped formed by H(1,:), H(2,:),
       H(3,:), which requires the thickness opposite to any of the
       parallelepiped surfaces to be greater than 2*CUTOFF, but that
       thickness is just 1/|G_i|, where G^T * H = I, the corresponding
       reciprocal lattice vector. */
    if ((CUTOFF>0.5/sqrt(HI[0][0]*HI[0][0]+HI[1][0]*HI[1][0]+HI[2][0]*HI[2][0])) ||
        (CUTOFF>0.5/sqrt(HI[0][1]*HI[0][1]+HI[1][1]*HI[1][1]+HI[2][1]*HI[2][1])) ||
	(CUTOFF>0.5/sqrt(HI[0][2]*HI[0][2]+HI[1][2]*HI[1][2]+HI[2][2]*HI[2][2])))
    {
	printf ("For code efficiency we require a larger system,\n");
	printf ("so there is no self-interaction.\n");
	exit(1);
    }
    
    /* read in reduced coordinates from configuration file */ 
    fscanf(filehandle,
	   "Mass(amu)     sx        sy        sz     sx1(1/ps)  sy1(1/ps)  sz1(1/ps)\n");
    for (i=0;i<3*np;i+=3)
    {
	fscanf (filehandle, "%lf  %lf  %lf  %lf  %lf  %lf  %lf\n",
		&mass[i/3], &s[i], &s[i+1], &s[i+2], &cc,  &cc, &cc);
	/* convert from AMU to reduced mass unit */
	mass[i/3] = mass[i/3]/AVO*1E-3/UMASS_IN_KG;
    }
    fclose (filehandle);
    
    printf ("\"%s\" generated successfully on the computer.\n\n",
	    config_name);

    printf ("Obtain an orthogonal transformation\n");
    cc = sqrt(Q[0][0]*Q[0][0]+Q[0][1]*Q[0][1]+Q[0][2]*Q[0][2]);
    Q[0][0] /= cc;
    Q[0][1] /= cc;
    Q[0][2] /= cc;
    /* get a random vector v in space: */
    cc = Q[0][2];
    dd = Q[0][0];
    ee = -Q[0][1];
    /* take a cross-product v*c1 */
    Q[1][0] = dd*Q[0][2]-ee*Q[0][1];
    Q[1][1] = ee*Q[0][0]-cc*Q[0][2];
    Q[1][2] = cc*Q[0][1]-dd*Q[0][0];
    cc = sqrt(Q[1][0]*Q[1][0]+Q[1][1]*Q[1][1]+Q[1][2]*Q[1][2]);
    Q[1][0] /= cc;
    Q[1][1] /= cc;
    Q[1][2] /= cc;
    /* take the cross-product c1*c2 */
    Q[2][0] = Q[0][1]*Q[1][2]-Q[0][2]*Q[1][1];
    Q[2][1] = Q[0][2]*Q[1][0]-Q[0][0]*Q[1][2];
    Q[2][2] = Q[0][0]*Q[1][1]-Q[0][1]*Q[1][0];
    /* get the transpose */
    for (i=0; i<3; i++)
	for (j=0; j<3; j++)
	    QT[i][j] = Q[j][i];
    printf ("     | %7.4f %7.4f %7.4f |\n",Q[0][0],Q[0][1],Q[0][2]);	  
    printf (" Q = | %7.4f %7.4f %7.4f |\n",Q[1][0],Q[1][1],Q[1][2]);
    printf ("     | %7.4f %7.4f %7.4f |\n",Q[2][0],Q[2][1],Q[2][2]);
    printf ("making (%.2f,%.2f,%.2f) a normal coordinate.\n\n",
	    Q[0][0],Q[0][1],Q[0][2]);
    
    /* count the total number of interactions */
    printf ("Counting all possible interactions within Rcut = %f A.\n",
	    CUTOFF*ULENGTH_IN_ANGSTROM);
    for (num_dm=0,i=0; i<np; i++)
	for (j=i+1; j<np; j++)
	{
	    sxji = s[3*j]   - s[3*i];
	    syji = s[3*j+1] - s[3*i+1];
	    szji = s[3*j+2] - s[3*i+2];
	    if (sxji>0.5)  sxji--;
	    else if (sxji<-0.5) sxji++;
	    if(syji>0.5)  syji--;
	    else if (syji<-0.5) syji++;
	    if (szji>0.5)  szji--;
	    else if (szji<-0.5) szji++;
	    rx = H[0][0]*sxji + H[1][0]*syji + H[2][0]*szji;
	    ry = H[0][1]*sxji + H[1][1]*syji + H[2][1]*szji;
	    rz = H[0][2]*sxji + H[1][2]*syji + H[2][2]*szji;
	    if (rx*rx+ry*ry+rz*rz<CUTOFF*CUTOFF) num_dm++;
	}
    
    printf ("There are %d unique interactions in the system,\n",num_dm);
    printf ("and so on the average %d neighbors per atom.\n\n",2*num_dm/np);
    
    printf ("Allocating for %d real-space dynamical matrices.\n\n", num_dm);
    /* save the particle index j and i*/
    dm_index = (int *) malloc(2*num_dm*sizeof(int));
    dm = (double *) malloc ((3+6)*num_dm*sizeof(double));
    /* the first 3 are delta_s between j and i; the next 9 are 
       (xx, xy, xz, yx, yy, yz, zx, zy, zz) dynamical matrix
       components. After transformation on coordinates of IATOM,
       it can no longer be assumed symmetrical */
    if ((dm==NULL)||(dm_index==NULL))
    {
	printf("Not enough memory (%d bytes).\n",
	       (3+6)*num_dm*sizeof(double)+2*num_dm*sizeof(int));
	exit(1);
    }

    /* To count how many significant components there are on a column: */
    num = (int *) malloc((3*np+1)*sizeof(int));
    /* take for granted that all diagonal sub-matrices are full: */
    for (num[0]=0, i=1; i<=3*np; i++) num[i] = 6;
    
    /* evaluate the dynamical matrices */
    printf("Calculating all real-space dynamical matrices.\n\n");
    for (ip=0,i=0; i<np; i++)
	for (j=i+1; j<np; j++)
	{
	    sxji = s[3*j]   - s[3*i];
	    syji = s[3*j+1] - s[3*i+1];
	    szji = s[3*j+2] - s[3*i+2];
	    if (sxji>0.5)  sxji--;
	    else if (sxji<-0.5) sxji++;
	    if(syji>0.5)  syji--;
	    else if (syji<-0.5) syji++;
	    if (szji>0.5)  szji--;
	    else if (szji<-0.5) szji++;
	    rx = H[0][0]*sxji + H[1][0]*syji + H[2][0]*szji;
	    ry = H[0][1]*sxji + H[1][1]*syji + H[2][1]*szji;
	    rz = H[0][2]*sxji + H[1][2]*syji + H[2][2]*szji;
	    if (rx*rx+ry*ry+rz*rz<CUTOFF*CUTOFF)
	    {
		/* save the particle indices */
		dm_index[2*ip] = j;
		dm_index[2*ip+1] = i;
		/* save the relative delta s */
		dm[9*ip]   = sxji;
		dm[9*ip+1] = syji;
		dm[9*ip+2] = szji;
		/* evaluate the force constant matrix: */
		force_constants (rx, ry, rz, force_constant);
		/* do orthogonal transformations on force constant matrices */
		/* so our defined degree of freedom is a normal coordinate. */
		matmul (Q, force_constant, force_constant); 
		matmul (force_constant, QT, force_constant); 
		cc = UFREQ_IN_THZ*UFREQ_IN_THZ/sqrt(mass[i]*mass[j]); 
		/* convert dynamical matrices to THz^2: */
		dm[9*ip+3] = force_constant[0][0]*cc;
		dm[9*ip+4] = force_constant[0][1]*cc;
		dm[9*ip+5] = force_constant[0][2]*cc;
		dm[9*ip+6] = force_constant[1][1]*cc;
		dm[9*ip+7] = force_constant[1][2]*cc;
		dm[9*ip+8] = force_constant[2][2]*cc;
		/* count how many off-diagonal components are significant: */
		for (k=0; k<3; k++)
		    for (l=0; l<3; l++)
			if (fabs(force_constant[k][l]*cc)>tiny)
			{
			    num[3*j+l+1]+=2;
			    num[3*i+k+1]+=2;
			}
		ip++;
	    }
	}
    
    /* convert into accumulative index: the dynamical matrix is compressed */
    /* into 1D arrays row[]+value[], where the ith column of D(k) occupies */
    /* slots [num[i], num[i+1]) on those 1D arrays. */
    for (i=0; i<3*np; i++)
	num[i+1] += num[i];
    printf("After getting rid of off-diagonal components < %f THz^2,\n",tiny);
    printf("we will have on average %d significant elements per column on D(k).\n",
	   (num[3*np]>>1)/3/np);
    
    printf("Allocating memory for D(k) (sparse).\n\n");
    /* currently filled slots [num[i], idx[i]) for the ith column */
    idx = (int *) malloc(3*np*sizeof(int));
    /* row index for each D(k) element */
    row = (int *) malloc((num[3*np]>>1)*sizeof(int));
    /* value of D(k) element (complex) */
    value = (double *) malloc(num[3*np]*sizeof(double));
    if ((idx==NULL)||(row==NULL)||(value==NULL))
    {
	printf("Not enough memory (%d bytes).\n", (3*np+num[3*np]>>1)
	       *sizeof(int) + num[3*np]*sizeof(double));
	exit(1);
    }
    
    /* get sampling k-points in BZ */
    if (random_k_sampling)
    {
	printf("Randomly generating %d k-points in supercell BZ.\n\n",
	       num_kpts);
	kpts = (double *) malloc(4*num_kpts*sizeof(double));
	for (i=0; i<num_kpts; i++)
	{
	    /* coefficients of reciprocal lattice vectors */ 
	    kpts[4*i]   = frandom();
	    kpts[4*i+1] = frandom();
	    kpts[4*i+2] = frandom();
	    /* sampling weight */
	    kpts[4*i+3] = 1./num_kpts;
	}
    }
    else
    {
	printf("Reading supercell sampling k-points from \"%s\":\n", fn_kpt);
	if ((filehandle=fopen(fn_kpt, "r"))==NULL)
	{
	    printf ("sampling k-point file \"%s\" does not exist.\n",  fn_kpt);
	    exit(1);
	}
	fscanf(filehandle, "%d\n", &num_kpts);
	kpts = (double *) malloc(4*num_kpts*sizeof(double));
	printf("there are %d k-points stored in \"%s\"\n", num_kpts, fn_kpt);
	printf("(in reciprocal lattice vector coefficients).\n\n");
	for (i=0; i<num_kpts; i++)
	    fscanf (filehandle, "%lf %lf %lf %lf\n",
		    kpts+4*i, kpts+4*i+1, kpts+4*i+2, kpts+4*i+3);
	fclose(filehandle);
    }

    printf ("Applying for shared memory arrays for parallel computation.\n");
    /* allocate shared memory */
    if (!init_shm_arrays(6*np))                                
    {
	printf("Cannot obtain enough shared memory. Check if there is\n");
	printf("any undeleted shared memory blocks from previous runs.\n");
	exit(1);
    }
    printf ("\n");

    printf ("Applying for semaphores for multi-thread scheduling.\n");
    /* apply for semaphore set of num_processors components: */
    if (!init_semaphore(num_processors))
    {
	printf("Cannot obtain semaphore set of %d components. Check\n",
	       num_processors);
	printf("if there are any zombies from previous runs.\n");
	exit(1);
    }
    printf ("\n");

    printf ("Data layout (row indices):\n");
    printf ("Master  slave1  slave2 ->\n");
    /* set data layout */
    n = 3*np/num_processors;
    domain[0] = 0;
    domain[1] = ceil(master_load*n);
    for (i=1; i<num_processors; i++)
    {
	n = (3*np-domain[1])/(num_processors-1);
	domain[i+1] = domain[i]+n;
	printf("%d-%d ", domain[i-1], domain[i]-1);
    }
    domain[num_processors] = 3*np;
    printf("%d-%d\n\n", domain[i-1], domain[i]-1);
    
    printf ("Calculation starts.\n\n");
    /* Total flops = 8 x (# of non-zero matrix elements)  */
    /*                 x (# of time steps) x (order-2)/2. */
    steps_to_inform = ceil(work_in_Gflop*1.e9/8./(num[3*np]>>1)
			   /(ORDER_OF_INTEGRATION-2)*2);
    if (print_floprate)
	printf ("Flop rate will be shown for every %.1f Mflop chunk of work.\n",
		work_in_Gflop*1000);
    printf ("Final LDOS result will be saved on \"%s\".\n\n", fn_ldos);
        
    /* go over all supercell k-points */
    for (sumweight=0.,k=0; k<num_kpts; k++)
    {
	/* assemble the k-space dynamical matrix */
	printf ("k-point %d: assembling the sparse D(k) matrix.\n\n", k);
	
	/* clear all elements */
	for (i=0; i<num[3*np]; i++) value[i]=0.;

	/* slots for the diagonal D(k) sub-matrix elements */
	for (i=0; i<3*np; i++)
	{
	    idx[i] = num[i] + 6;
	    row[num[i]>>1]   = 2*((i/3)*3);
	    row[(num[i]>>1)+1] = 2*((i/3)*3+1);
	    row[(num[i]>>1)+2] = 2*((i/3)*3+2);
	}
	
	for (ip=0; ip<num_dm; ip++)
	{
	    /* j is always greater than i */
	    j = dm_index[2*ip];
	    i = dm_index[2*ip+1];
	    
	    /* (ix, ix) */
	    value[num[3*i]] += dm[9*ip+3];
	    /* (iy, ix) */
	    value[num[3*i]+2] += dm[9*ip+4];
	    /* (iz, ix) */
	    value[num[3*i]+4] += dm[9*ip+5];
	    
	    /* (ix, iy) */
	    value[num[3*i+1]] += dm[9*ip+4];
	    /* (iy, iy) */
	    value[num[3*i+1]+2] += dm[9*ip+6];
	    /* (iz, iy) */
	    value[num[3*i+1]+4] += dm[9*ip+7];
	    
	    /* (ix, iz) */
	    value[num[3*i+2]] += dm[9*ip+5];
	    /* (iy, iz) */
	    value[num[3*i+2]+2] += dm[9*ip+7];
	    /* (iz, iz) */
	    value[num[3*i+2]+4] += dm[9*ip+8];
	    
	    /* (jx, jx) */
	    value[num[3*j]] += dm[9*ip+3];
	    /* (jy, jx) */
	    value[num[3*j]+2] += dm[9*ip+4];
	    /* (jz, jx) */
	    value[num[3*j]+4] += dm[9*ip+5];
	    
	    /* (jx, jy) */
	    value[num[3*j+1]] += dm[9*ip+4];
	    /* (jy, jy) */
	    value[num[3*j+1]+2] += dm[9*ip+6];
	    /* (jz, jy) */
	    value[num[3*j+1]+4] += dm[9*ip+7];
	    
	    /* (jx, jz) */
	    value[num[3*j+2]] += dm[9*ip+5];
	    /* (jy, jz) */
	    value[num[3*j+2]+2] += dm[9*ip+7];
	    /* (jz, jz) */
	    value[num[3*j+2]+4] += dm[9*ip+8];
	    
	    /* phase */
	    ee = 2*PI*(kpts[4*k]*dm[9*ip]+
		       kpts[4*k+1]*dm[9*ip+1]+
		       kpts[4*k+2]*dm[9*ip+2]);
	    cc = cos(ee);
	    dd = sin(ee);
	    
	    if (fabs(dm[9*ip+3])>tiny)
	    {
		/* (ix, jx) */
		row[idx[3*j]>>1] = 2*(3*i);
		value[idx[3*j]] -= dm[9*ip+3]*cc;
		value[idx[3*j]+1] -= dm[9*ip+3]*dd;
		idx[3*j]+=2;
		/* (jx, ix) */
		row[idx[3*i]>>1] = 2*(3*j);
		value[idx[3*i]] -= dm[9*ip+3]*cc;
		value[idx[3*i]+1] += dm[9*ip+3]*dd;
		idx[3*i]+=2;		
	    }

	    if (fabs(dm[9*ip+6])>tiny)
	    {
		/* (iy, jy) */
		row[idx[3*j+1]>>1] = 2*(3*i+1);
		value[idx[3*j+1]] -= dm[9*ip+6]*cc;
		value[idx[3*j+1]+1] -= dm[9*ip+6]*dd;
		idx[3*j+1]+=2;
		/* (jy, iy) */
		row[idx[3*i+1]>>1] = 2*(3*j+1);
		value[idx[3*i+1]] -= dm[9*ip+6]*cc;
		value[idx[3*i+1]+1] += dm[9*ip+6]*dd;
		idx[3*i+1]+=2;		
	    }

	    if (fabs(dm[9*ip+8])>tiny)
	    {
		/* (iz, jz) */
		row[idx[3*j+2]>>1] = 2*(3*i+2);
		value[idx[3*j+2]] -= dm[9*ip+8]*cc;
		value[idx[3*j+2]+1] -= dm[9*ip+8]*dd;
		idx[3*j+2]+=2;
		/* (jz, iz) */
		row[idx[3*i+2]>>1] = 2*(3*j+2);
		value[idx[3*i+2]] -= dm[9*ip+8]*cc;
		value[idx[3*i+2]+1] += dm[9*ip+8]*dd;
		idx[3*i+2]+=2;		
	    }
	    
	    if (fabs(dm[9*ip+4])>tiny)
	    {
		/* (ix, jy) */
		row[idx[3*j+1]>>1] = 2*(3*i);
		value[idx[3*j+1]] -= dm[9*ip+4]*cc;
		value[idx[3*j+1]+1] -= dm[9*ip+4]*dd;
		idx[3*j+1]+=2;
		/* (jy, ix) */
		row[idx[3*i]>>1] = 2*(3*j+1);
		value[idx[3*i]] -= dm[9*ip+4]*cc;
		value[idx[3*i]+1] += dm[9*ip+4]*dd;
		idx[3*i]+=2;
		/* (iy, jx) */
		row[idx[3*j]>>1] = 2*(3*i+1);
		value[idx[3*j]] -= dm[9*ip+4]*cc;
		value[idx[3*j]+1] -= dm[9*ip+4]*dd;
		idx[3*j]+=2;
		/* (jx, iy) */
		row[idx[3*i+1]>>1] = 2*(3*j);
		value[idx[3*i+1]] -= dm[9*ip+4]*cc;
		value[idx[3*i+1]+1] += dm[9*ip+4]*dd;
		idx[3*i+1]+=2;		
	    }

	    if (fabs(dm[9*ip+5])>tiny)
	    {
		/* (ix, jz) */
		row[idx[3*j+2]>>1] = 2*(3*i);
		value[idx[3*j+2]] -= dm[9*ip+5]*cc;
		value[idx[3*j+2]+1] -= dm[9*ip+5]*dd;
		idx[3*j+2]+=2;
		/* (jz, ix) */
		row[idx[3*i]>>1] = 2*(3*j+2);
		value[idx[3*i]] -= dm[9*ip+5]*cc;
		value[idx[3*i]+1] += dm[9*ip+5]*dd;
		idx[3*i]+=2;
		/* (iz, jx) */
		row[idx[3*j]>>1] = 2*(3*i+2);
		value[idx[3*j]] -= dm[9*ip+5]*cc;
		value[idx[3*j]+1] -= dm[9*ip+5]*dd;
		idx[3*j]+=2;
		/* (jx, iz) */
		row[idx[3*i+2]>>1] = 2*(3*j);
		value[idx[3*i+2]] -= dm[9*ip+5]*cc;
		value[idx[3*i+2]+1] += dm[9*ip+5]*dd;
		idx[3*i+2]+=2;		
	    }
	    
	    if (fabs(dm[9*ip+7])>tiny)
	    {
		/* (iy, jz) */
		row[idx[3*j+2]>>1] = 2*(3*i+1);
		value[idx[3*j+2]] -= dm[9*ip+7]*cc;
		value[idx[3*j+2]+1] -= dm[9*ip+7]*dd;
		idx[3*j+2]+=2;
		/* (jz, iy) */
		row[idx[3*i+1]>>1] = 2*(3*j+2);
		value[idx[3*i+1]] -= dm[9*ip+7]*cc;
		value[idx[3*i+1]+1] += dm[9*ip+7]*dd;
		idx[3*i+1]+=2;
		/* (iz, jy) */
		row[idx[3*j+1]>>1] = 2*(3*i+2);
		value[idx[3*j+1]] -= dm[9*ip+7]*cc;
		value[idx[3*j+1]+1] -= dm[9*ip+7]*dd;
		idx[3*j+1]+=2;
		/* (jy, iz) */
		row[idx[3*i+2]>>1] = 2*(3*j+1);
		value[idx[3*i+2]] -= dm[9*ip+7]*cc;
		value[idx[3*i+2]+1] += dm[9*ip+7]*dd;
		idx[3*i+2]+=2;		
	    }
	}

	/* checksum */
	for (i=0; i<3*np; i++)
	    if (idx[i] != num[i+1])
	    {
		printf ("Internal data corruption.\n");
		free_shm();
		free_semaphore();		
		exit(1);
	    }

	/*************************************/
	/* Multi-Channel Perturbation Method */
	/*************************************/

	/* reset the error handlers to system defaults */
	reset_error_handler();
	/* fork the program here: resources such as row[] */
	/* (read only) will be automatically "cloned".    */
	master_pid = getpid();
	my_pid = master_pid;
	my_idx = 0;
	for (i=1; i<num_processors; i++)
	{
	    fork();
	    /* slave processes: wait here */
	    if ((my_pid=getpid())!=master_pid)
	    {
		my_idx = i;
		slave_work_till_die();
		exit(0);
	    }
	}
    
	/* set my own exceptions handler (only the master */
	/* process), slave processes use the default.     */
	set_my_own_error_handler();

	/* To generate an error-canceling amplitude series. */
	/* ++--++--++--++-- */
	ar[0] = 1.;
	ar[1] = 1.;
	if (frandom()>0.5) ar[1] = -ar[1];
	for (m=2; m<=max_channel-min_channel; m++)
	    ar[m] = -ar[m-2];

	/* +--++--++--++--+ */
	n = 1;
	if (frandom()>0.5) n = -n ;
	for (m=0; m<=max_channel-min_channel; m++)
	{
	    ai[m] = n*ar[m];
	    n = -n;
	}

	/* apply a random "spin-wave" encoding to avoid */
	/* numerical problems during time integration   */
	phi[0] = 0.;
	cc = 2*PI/ktwist;
	if (frandom()>0.5) cc = -cc;
	for (m=0; m<=max_channel-min_channel; m++)
	{
	    cmplx_mul (ar[m], ai[m], cos(phi[m]), sin(phi[m]),
		       &ar[m], &ai[m]);
	    if (m!=0) phi[m] = phi[m-1] + cc*frandom();
	    fup[m] = 0.;
	    fvp[m] = 0.;
	}

	/* Precondition the integration: only odd-power derivatives
	   exist at t=0. To save space, u2 stands for u3, u4 stands
	   for u5, etc. */
	
	for (i=0; i<6*np; i++)
	{
	    u[i] = 0.;
	    u2[i] = 0.;
	    u4[i] = 0.;
	    u6[i] = 0.;
	    u8[i] = 0.;
	    u10[i] = 0.;
	}
	
	for (m=0; m<=max_channel-min_channel; m++)
	{
	    u2[6*iatom] += v1[m]*ar[m];
	    u4[6*iatom] -= v1[m]*v2[m]*ar[m];
	    u6[6*iatom] += v1[m]*v4[m]*ar[m];
	    u8[6*iatom] -= v1[m]*v6[m]*ar[m];
	    u10[6*iatom] += v1[m]*v8[m]*ar[m];
	    
	    u2[6*iatom+1] += v1[m]*ai[m];
	    u4[6*iatom+1] -= v1[m]*v2[m]*ai[m];
	    u6[6*iatom+1] += v1[m]*v4[m]*ai[m];
	    u8[6*iatom+1] -= v1[m]*v6[m]*ai[m];
	    u10[6*iatom+1] += v1[m]*v8[m]*ai[m];
	}

	for (i=0; i<6*np; i+=2)
	    for (j=num[i>>1]; j<num[(i>>1)+1]; j+=2)
	    {
		u4[row[j>>1]]   -= value[j]*u2[i]-value[j+1]*u2[i+1];
		u4[row[j>>1]+1] -= value[j]*u2[i+1]+value[j+1]*u2[i];
	    }

	for (i=0; i<6*np; i+=2)
	    for (j=num[i>>1]; j<num[(i>>1)+1]; j+=2)
	    {
		u6[row[j>>1]]   -= value[j]*u4[i]-value[j+1]*u4[i+1];
		u6[row[j>>1]+1] -= value[j]*u4[i+1]+value[j+1]*u4[i];
	    }

	for (i=0; i<6*np; i+=2)
	    for (j=num[i>>1]; j<num[(i>>1)+1]; j+=2)
	    {
		u8[row[j>>1]]   -= value[j]*u6[i]-value[j+1]*u6[i+1];
		u8[row[j>>1]+1] -= value[j]*u6[i+1]+value[j+1]*u6[i];
	    }
	
	for (i=0; i<6*np; i+=2)
	    for (j=num[i>>1]; j<num[(i>>1)+1]; j+=2)
	    {
		u10[row[j>>1]]   -= value[j]*u8[i]-value[j+1]*u8[i+1];
		u10[row[j>>1]+1] -= value[j]*u8[i+1]+value[j+1]*u8[i];
	    }
	
	for (i=0; i<6*np; i++)
	{
	    u[i] = u2[i]*del3+u4[i]*del5+u6[i]*del7+u8[i]*del9+u10[i]*del11;
	    uold[i] = 0.;
	}

	/* real part */
	u1i = 0.;
	u3i = u2[6*iatom];
	u5i = u4[6*iatom];
	u7i = u6[6*iatom];
	u9i = u8[6*iatom];
	u11i = u10[6*iatom];
	ui   = 0.;
	u2i  = 0.;
	u4i  = 0.;
	u6i  = 0.;
	u8i  = 0.;
	u10i = 0.;

	/* imaginary part */
	v1i = 0.;
	v3i = u2[6*iatom+1];
	v5i = u4[6*iatom+1];
	v7i = u6[6*iatom+1];
	v9i = u8[6*iatom+1];
	v11i = u10[6*iatom+1];
	vi   = 0.;
	v2i  = 0.;
	v4i  = 0.;
	v6i  = 0.;
	v8i  = 0.;
	v10i = 0.;
      
	if (print_floprate) startwatch();

	/* main loop starts */
	for (ip=1; ip<=mint; ip++)
	{
	    /* calculate the perturbation on normal coordinate */
	    u2_ibb = 0.; 
	    u4_ibb = 0.;
	    u6_ibb = 0.;
	    u8_ibb = 0.;
	    u10_ibb = 0.;
	    
	    v2_ibb = 0.; 
	    v4_ibb = 0.;
	    v6_ibb = 0.;
	    v8_ibb = 0.;
	    v10_ibb = 0.;
	
	    for (m=0; m<=max_channel-min_channel; m++)
	    {
		ipm = ((m+min_channel)*ip-1)%mint;
		u2_ibb  += dsin[ipm]*ar[m];
		u4_ibb  -= v2[m]*dsin[ipm]*ar[m];
		u6_ibb  += v4[m]*dsin[ipm]*ar[m];
		u8_ibb  -= v6[m]*dsin[ipm]*ar[m];
		u10_ibb += v8[m]*dsin[ipm]*ar[m];
		v2_ibb  += dsin[ipm]*ai[m];
		v4_ibb  -= v2[m]*dsin[ipm]*ai[m];
		v6_ibb  += v4[m]*dsin[ipm]*ai[m];
		v8_ibb  -= v6[m]*dsin[ipm]*ai[m];
		v10_ibb += v8[m]*dsin[ipm]*ai[m];
	    }

	    /* Let the work be done in parallel: */
	    parallel_multiply_all
		(6*iatom, u2_ibb, u4_ibb, u6_ibb, u8_ibb, u10_ibb,
		 v2_ibb, v4_ibb, v6_ibb, v8_ibb, v10_ibb);
	    
	    if (ip!=1)
	    {
		u11i = (u10[6*iatom]-uo10i)/2./delta;
		u9i = ((u8[6*iatom]-uo8i)/2.-del3*u11i)/delta;
		u7i = ((u6[6*iatom]-uo6i)/2.-del3*u9i-del5*u11i)/delta;
		u5i = ((u4[6*iatom]-uo4i)/2.-del3*u7i-del5*u9i-del7*u11i)/delta;
		u3i = ((u2[6*iatom]-uo2i)/2.-del3*u5i-del5*u7i-del7*u9i
		       -del9*u11i)/delta;
		u1i = ((u[6*iatom]-uoi)/2.-del3*u3i-del5*u5i-del7*u7i
		       -del9*u9i-del11*u11i)/delta;
		
		v11i = (u10[6*iatom+1]-vo10i)/2./delta;
		v9i = ((u8[6*iatom+1]-vo8i)/2.-del3*v11i)/delta;
		v7i = ((u6[6*iatom+1]-vo6i)/2.-del3*v9i-del5*v11i)/delta;
		v5i = ((u4[6*iatom+1]-vo4i)/2.-del3*v7i-del5*v9i-del7*v11i)/delta;
		v3i = ((u2[6*iatom+1]-vo2i)/2.-del3*v5i-del5*v7i-del7*v9i
		       -del9*v11i)/delta;
		v1i = ((u[6*iatom+1]-voi)/2.-del3*v3i-del5*v5i-del7*v7i
		       -del9*v9i-del11*v11i)/delta;
	    }

	    /* calculate the integral */
	    uni = ui+delta*u1i+del2*u2i+del3*u3i+del4*u4i+del5*u5i
		+del6*u6i+del7*u7i+del8*u8i+del9*u9i+del10*u10i+del11*u11i;

	    un1i = u1i+delta*u2i+del2*u3i+del3*u4i+del4*u5i+del5*u6i
		+del6*u7i+del7*u8i+del8*u9i+del9*u10i+del10*u11i;

	    un2i = u2i+delta*u3i+del2*u4i+del3*u5i+del4*u6i+del5*u7i
		+del6*u8i+del7*u9i+del8*u10i+del9*u11i;

	    un3i = u3i+delta*u4i+del2*u5i+del3*u6i+del4*u7i+del5*u8i
		+del6*u9i+del7*u10i+del8*u11i;

	    un4i = u4i+delta*u5i+del2*u6i+del3*u7i+del4*u8i+del5*u9i
		+del6*u10i+del7*u11i;

	    un5i = u5i+delta*u6i+del2*u7i+del3*u8i+del4*u9i+del5*u10i
		+del6*u11i;

	    un6i = u6i+delta*u7i+del2*u8i+del3*u9i+del4*u10i+del5*u11i;
	    un7i = u7i+delta*u8i+del2*u9i+del3*u10i+del4*u11i;
	    un8i = u8i+delta*u9i+del2*u10i+del3*u11i;
	    un9i = u9i+delta*u10i+del2*u11i;
	    un10i = u10i+delta*u11i;
	    un11i = u11i;

	    vni = vi+delta*v1i+del2*v2i+del3*v3i+del4*v4i+del5*v5i
		+del6*v6i+del7*v7i+del8*v8i+del9*v9i+del10*v10i+del11*v11i;

	    vn1i = v1i+delta*v2i+del2*v3i+del3*v4i+del4*v5i+del5*v6i
		+del6*v7i+del7*v8i+del8*v9i+del9*v10i+del10*v11i;

	    vn2i = v2i+delta*v3i+del2*v4i+del3*v5i+del4*v6i+del5*v7i
		+del6*v8i+del7*v9i+del8*v10i+del9*v11i;

	    vn3i = v3i+delta*v4i+del2*v5i+del3*v6i+del4*v7i+del5*v8i
		+del6*v9i+del7*v10i+del8*v11i;

	    vn4i = v4i+delta*v5i+del2*v6i+del3*v7i+del4*v8i+del5*v9i
		+del6*v10i+del7*v11i;

	    vn5i = v5i+delta*v6i+del2*v7i+del3*v8i+del4*v9i+del5*v10i
		+del6*v11i;

	    vn6i = v6i+delta*v7i+del2*v8i+del3*v9i+del4*v10i+del5*v11i;
	    vn7i = v7i+delta*v8i+del2*v9i+del3*v10i+del4*v11i;
	    vn8i = v8i+delta*v9i+del2*v10i+del3*v11i;
	    vn9i = v9i+delta*v10i+del2*v11i;
	    vn10i = v10i+delta*v11i;
	    vn11i = v11i;

	    for (m=0; m<=max_channel-min_channel; m++)
	    {
		ipm = ((m+min_channel)*ip-1)%mint;
		ibm = ((m+min_channel)*(ip+mint-1)-1)%mint;
		fup[m] +=
		    (uni-un2i/v2[m]+un4i/v4[m]-un6i/v6[m]+un8i/v8[m]
		     -un10i/v10[m])/v1[m]*dsin[ipm]
		    -(ui-u2i/v2[m]+u4i/v4[m]-u6i/v6[m]+u8i/v8[m]
		      -u10i/v10[m])/v1[m]*dsin[ibm]
		    +(un1i-un3i/v2[m]+un5i/v4[m]-un7i/v6[m]+un9i/v8[m]
		      -un11i/v10[m])/v2[m]*dcos[ipm] 
		    -(u1i-u3i/v2[m]+u5i/v4[m]-u7i/v6[m]+u9i/v8[m]
		      -u11i/v10[m])/v2[m]*dcos[ibm];
		fvp[m] +=
		    (vni-vn2i/v2[m]+vn4i/v4[m]-vn6i/v6[m]+vn8i/v8[m]
		     -vn10i/v10[m])/v1[m]*dsin[ipm]
		    -(vi-v2i/v2[m]+v4i/v4[m]-v6i/v6[m]+v8i/v8[m]
		      -v10i/v10[m])/v1[m]*dsin[ibm]
		    +(vn1i-vn3i/v2[m]+vn5i/v4[m]-vn7i/v6[m]+vn9i/v8[m]
		      -vn11i/v10[m])/v2[m]*dcos[ipm] 
		    -(v1i-v3i/v2[m]+v5i/v4[m]-v7i/v6[m]+v9i/v8[m]
		      -v11i/v10[m])/v2[m]*dcos[ibm];
	    }

	    /* a chunk of work done */
	    if (print_floprate && (ip%steps_to_inform==0))
	    {
		/* cc = seconds spent */
		checkwatch (&cc); 
		dd = ip*(ORDER_OF_INTEGRATION-2)/2.*8.*(num[3*np]>>1);
		printf ("step = %d, %f%% of a base cycle completed.\n",
			ip, 100.*ip/mint);
		ee = dd*1e-6/cc;
		printf ("Mflops/s up to now = %f,\n", ee); 
		printf ("Mflops/s per processor = %f.\n\n", ee/num_processors);
	    }

	    uo10i = u10i;
	    uo8i  = u8i;
	    uo6i  = u6i;
	    uo4i  = u4i;
	    uo2i  = u2i;
	    uoi   = ui;

	    vo10i = v10i;
	    vo8i  = v8i;
	    vo6i  = v6i;
	    vo4i  = v4i;
	    vo2i  = v2i;
	    voi   = vi;

	    u10i = u10[6*iatom];
	    u8i  = u8[6*iatom];
	    u6i  = u6[6*iatom];
	    u4i  = u4[6*iatom];
	    u2i  = u2[6*iatom];
	    ui   = u[6*iatom];

	    v10i = u10[6*iatom+1];
	    v8i  = u8[6*iatom+1];
	    v6i  = u6[6*iatom+1];
	    v4i  = u4[6*iatom+1];
	    v2i  = u2[6*iatom+1];
	    vi   = u[6*iatom+1];

	    /* Verlet leap frog integration */
	    for (i=0; i<6*np; i++)
	    {
		cc = uold[i];
		uold[i] = u[i];
		u[i] = 2.*(u[i]+del2*u2[i]+del4*u4[i]+del6*u6[i]+
			   del8*u8[i]+del10*u10[i])-cc;
	    }
	} /* main loop ends */

	if (print_floprate)
	{
	    checkwatch (&cc); 
	    dd = mint*(ORDER_OF_INTEGRATION-2)/2.*8.*(num[3*np]>>1);
	    ee = dd*1e-6/cc;
	    printf ("Mflops/s during the cycle = %f\n", ee); 
	    printf ("Mflops/s per processor = %f\n\n", ee/num_processors);
	}

	/* calculate the LDOS */
	for (m=0; m<=max_channel-min_channel; m++)
	{
	    /* decode the spin-wave */
	    cmplx_mul (ar[m], ai[m], cos(phi[m]), -sin(phi[m]),
		       &ar[m], &ai[m]);
	    cmplx_mul (fup[m], fvp[m], cos(phi[m]), -sin(phi[m]),
		       &fup[m], &fvp[m]);
	    /* plug into the formula */
	    /* cancel out all odd-numbered noise functions */
	    /* scale back to real freq. DOS */
	    dd = -2.*(m+min_channel)*min_v*min_v/PI/PI
		*(fup[m]/ar[m]+fvp[m]/ai[m])/2.
		*sqrt(v1[m]*v1[m]-shift_v*shift_v)/v1[m];
	    ldos[m] += kpts[4*k+3]*dd;
	}
	
	sumweight += kpts[4*k+3]; 
	    
	/* save to .ldos file */
	filehandle = fopen(fn_ldos, "w+");
	for (m=0; m<=max_channel-min_channel; m++)
	    fprintf (filehandle, "%9.6f %9.6f\n",
		     sqrt(v1[m]*v1[m]-shift_v*shift_v),
		     ldos[m]/sumweight);
	fclose(filehandle);
	
	/* Kill the slave processes here   */
	/* because D(k) is going to be changed. */
	master_set_command (SLAVE_DIE);
	master_sleep();
	
    } /* sample over supercell k's */

    /* free all allocated memories */
    printf ("Freeing all allocated memories.\n");
    free (kpts);
    free (value);
    free (row);
    free (idx);
    free (num);
    free (dm);
    free (dm_index);
    free (uold);
    free (s);
    free (mass);
    free (ldos);
    free (fvp);
    free (fup);
    free (phi);
    free (ai);
    free (ar);
    free (v10);
    free (v8);
    free (v6);
    free (v4);
    free (v2);
    free (v1);
    free (dcos);
    free (dsin);
     
    /* free IPC resources */
    free_shm();
    free_semaphore();
    
    printf("Mission terminates successfully.\n\n");
    return(0);
} /* end main() */

      
/* Evaluate the force constant matrix of a pair interaction. */
void force_constants
(double rx, double ry, double rz, double force_constant[3][3])
{
    double r1, r2, a, b; 
    double ir2, ir4, ir6, ir12;
    double first, second; /* V'/r and V''/r */
    r2 = rx*rx+ry*ry+rz*rz;
    if (LJ6_12)
    {
	ir2 = 1./r2;
	ir4 = ir2 * ir2;
	ir6 = ir4 * ir2;
	ir12 = ir6 * ir6;
	first = (-48.*ir12+24.*ir6)*ir2;
	second = (624.*ir12-168.*ir6)*ir4;
    }
    else if(WOON)
    {
	r1 = sqrt(r2);
	a = WOON_EPSILON * WOON_Q * exp(-WOON_K1 * (r1-WOON_RE));
	b = WOON_EPSILON * (WOON_Q+1) * exp(-WOON_K2 * (r1-WOON_RE)); 
	first = (-WOON_K1*a+WOON_K2*b)/r1;
	second = (WOON_K1*WOON_K1*a-WOON_K2*WOON_K2*b)/r2;
    }
    force_constant[0][0] = (second-first/r2)*rx*rx+first;
    force_constant[0][1] = (second-first/r2)*rx*ry;
    force_constant[0][2] = (second-first/r2)*rx*rz;
    force_constant[1][1] = (second-first/r2)*ry*ry+first;
    force_constant[1][2] = (second-first/r2)*ry*rz;
    force_constant[2][2] = (second-first/r2)*rz*rz+first;
    force_constant[1][0] = force_constant[0][1];
    force_constant[2][0] = force_constant[0][2];
    force_constant[2][1] = force_constant[1][2]; 
    return;
} /* end force_constants() */

	 
double matinv (double A[3][3], double B[3][3])
{
    double D11,D22,D33,D12,D21,D13,D31,D32,D23,volume;
    D11=A[1][1]*A[2][2]-A[1][2]*A[2][1];
    D22=A[2][2]*A[0][0]-A[2][0]*A[0][2];
    D33=A[0][0]*A[1][1]-A[0][1]*A[1][0];
    D12=A[1][2]*A[2][0]-A[1][0]*A[2][2];
    D23=A[2][0]*A[0][1]-A[2][1]*A[0][0];
    D31=A[0][1]*A[1][2]-A[0][2]*A[1][1];
    D13=A[1][0]*A[2][1]-A[2][0]*A[1][1];
    D21=A[2][1]*A[0][2]-A[0][1]*A[2][2];
    D32=A[0][2]*A[1][0]-A[1][2]*A[0][0];
    volume=A[0][0]*D11+A[0][1]*D12+A[0][2]*D13;
    B[0][0]=D11/volume;
    B[1][1]=D22/volume;
    B[2][2]=D33/volume;
    B[0][1]=D21/volume;
    B[1][2]=D32/volume;
    B[2][0]=D13/volume;
    B[1][0]=D12/volume;
    B[2][1]=D23/volume;
    B[0][2]=D31/volume;
    return (volume);
} /* end matinv() */


void matmul (double A[3][3], double B[3][3], double D[3][3])
{
    volatile double C[3][3];
    C[0][0] = A[0][0]*B[0][0] + A[0][1]*B[1][0] + A[0][2]*B[2][0];
    C[0][1] = A[0][0]*B[0][1] + A[0][1]*B[1][1] + A[0][2]*B[2][1];
    C[0][2] = A[0][0]*B[0][2] + A[0][1]*B[1][2] + A[0][2]*B[2][2];
    C[1][0] = A[1][0]*B[0][0] + A[1][1]*B[1][0] + A[1][2]*B[2][0];
    C[1][1] = A[1][0]*B[0][1] + A[1][1]*B[1][1] + A[1][2]*B[2][1];
    C[1][2] = A[1][0]*B[0][2] + A[1][1]*B[1][2] + A[1][2]*B[2][2];
    C[2][0] = A[2][0]*B[0][0] + A[2][1]*B[1][0] + A[2][2]*B[2][0];
    C[2][1] = A[2][0]*B[0][1] + A[2][1]*B[1][1] + A[2][2]*B[2][1];
    C[2][2] = A[2][0]*B[0][2] + A[2][1]*B[1][2] + A[2][2]*B[2][2];
    D[0][0] = C[0][0]; D[0][1] = C[0][1]; D[0][2] = C[0][2];
    D[1][0] = C[1][0]; D[1][1] = C[1][1]; D[1][2] = C[1][2];
    D[2][0] = C[2][0]; D[2][1] = C[2][1]; D[2][2] = C[2][2];
    return;
} /* end matmul() */


void cmplx_mul
(double ar, double ai, double br, double bi, double *dr, double *di)
{
    volatile double cr = ar*br-ai*bi;
    *di = ar*bi+ai*br;
    *dr = cr;
    return;
} /* cmplx_mul() */

	
/* returns pseudo-random number on (0,1) */
double frandom()
{
    return ((random()+0.5)/pow((double)2, (double)31));
} /* end frandom() */


/* manage an internal list of shared memories */
void *apply_for_shm (int size)
{
    static int last_KEY = 0;
    if (num_shm_seg==MAX_SHM_SEG)
    {
	printf ("Error in apply_for_shm(): maximum of %d\n", MAX_SHM_SEG);
	printf ("shared memory segments exceeded.\n");
	return (NULL);
    }
    shm[num_shm_seg].size = size;
    for (shm[num_shm_seg].KEY = last_KEY+1;
	 ((shm[num_shm_seg].ID=shmget(shm[num_shm_seg].KEY,size,IPC_CREAT|0600))
	  ==-1)&&(shm[num_shm_seg].KEY<65535);
	 shm[num_shm_seg].KEY++);
    if(shm[num_shm_seg].ID!=-1)
    {
	printf ("shared memory segment %d: KEY = %d  ID = %d\n",
		num_shm_seg+1,shm[num_shm_seg].KEY,shm[num_shm_seg].ID);
	/* The system will find a good logical address to attach the
	   shared memory segment to. Should be rounded to 16 bytes. */
	shm[num_shm_seg].addr =
	    shmat(shm[num_shm_seg].ID,(char *)0,0600|SHM_RND);
        if (shm[num_shm_seg].addr==(void *)-1)
	{
            printf ("Error in apply_for_shm(): fail to attach shm segment %d\n",
		    num_shm_seg+1);
	    return(NULL);
	}
	else
	{
	    printf("Succeed in getting %d bytes of shared memory.\n", size);
	    /* clear the allocated region */
	    memset (shm[num_shm_seg].addr, 0, size); 
	    last_KEY = shm[num_shm_seg].KEY;
	    return (shm[num_shm_seg++].addr);
	}
    }
    else
    {
	printf("Error in apply_for_shm(): fail to get %d bytes of shared memory.\n",
	       size);
	return(NULL);
    }
} /* apply_for_shm() */


/* allocate shared arrays that are WRITABLE */
int init_shm_arrays (int n6)
{
    int i, not_null=1;
    int totalsize = sizeof(int)*(num_processors+1) /* memeory for domain[] */ 
	+ sizeof(char)*num_processors       /* memeory for commands[] */ 
	+ sizeof(double)*n6*(6+num_processors); /* for u,u2,u4,u6,u8,u10,copy[] */
				 
    num_shm_seg = 0;
    size_of_vector_in_double = n6;
    size_of_vector_in_bytes  = size_of_vector_in_double*sizeof(double);
    
    /* first see if we can allocate everything in one piece */
    if ((shm_base=(double *)apply_for_shm(totalsize))!=NULL)
    {
	/* displacement variables used in multichannel method */
	u   = shm_base;
	u2  = u  + n6; 
	u4  = u2 + n6; 
	u6  = u4 + n6; 
	u8  = u6 + n6; 
	u10 = u8 + n6;
	for (i=0; i<num_processors; i++)
	    copy[i] = u10 + n6 + i*n6;
	domain = (int *) (copy[0]+n6*num_processors);
	commands = (char *) (domain+num_processors+1);
	return(1);
    }

    printf ("Unable to allocate shared memory in one block ->\n"); 
    printf ("Let us try segmented allocation.\n");
    
    /* pack into maximally contiguous blocks to increase cache hit */
    shm_base = (double *)
	apply_for_shm(size_of_vector_in_bytes*3+(num_processors+1)*sizeof(int));
    u = shm_base;
    u2 = u + n6; 
    u4 = u2 + n6;
    domain = (int *) (u4 + n6);
    not_null = not_null && (u!=NULL);

    u6 = (double *)
	apply_for_shm (size_of_vector_in_bytes*3+sizeof(char)*num_processors);
    u8 =  u6 + n6;
    u10 = u8 + n6;
    commands = (char *) (u10+n6);
    not_null = not_null && (u6!=NULL);

    for (i=0; i<num_processors; i++)
    {
	if (i%2==0)
	{
	    copy[i] = apply_for_shm (2*size_of_vector_in_bytes);
	    not_null = not_null && (copy[i]!=NULL);
	}
	else
	    copy[i] = copy[i-1] + size_of_vector_in_double;
    }
    return(not_null);
} /* end init_shm_arrays() */


/* free all allocated shared memory */
void free_shm()
{
    for (num_shm_seg--; num_shm_seg>=0; num_shm_seg--)
	if (shmctl(shm[num_shm_seg].ID, IPC_RMID, NULL) == -1)
	    printf("Unable to remove shared memory ID=%d.\n",
		   shm[num_shm_seg].ID);
} /* free_shm() */


/* semaphore set for multiprocess scheduling */
int init_semaphore (int num_processors)
{
    int i;
    for (sem_KEY=1;((sem_ID=semget(sem_KEY,num_processors,IPC_CREAT|0600))
		    ==-1)&&(sem_KEY<65535);sem_KEY++);
    printf ("semaphore KEY = %d   ID = %d\n", sem_KEY, sem_ID);
    if(sem_ID!=-1)
    {
	/* define standard semaphore operations */
	/* increase one element */
	semop_add.sem_op = +1;   
	semop_add.sem_flg = 0600;
	/* decrease one element */
	semop_minus.sem_op = -1;    
	semop_minus.sem_flg = 0600;
	/* sleep till one element is set to zero */
	semop_zero.sem_op = 0;        
	semop_zero.sem_flg = 0600;
	/* hypnotize the slave processes now by setting them all to 1 */
	for (i=1; i<num_processors; i++)
	{
	    sem_set.val = 1;
	    semctl(sem_ID, i, SETVAL, sem_set);
	}
	return(1);
    }
    else
	return(0);
} /* init_semaphore() */


/* free allocated semaphore set */
void free_semaphore()
{
    if(semctl(sem_ID, 0, IPC_RMID, NULL)==-1)
        printf("Unable to remove semaphore ID=%d.\n",sem_ID);
    return;
} /* free_semaphore() */


void slave_work_till_die()
{
    while (1)
    {
	slave_sleep();
        if (commands[my_idx] == SLAVE_DIE)
	{
	    fprintf(stdout, "Slave %d exits.\n", my_idx);
	    fflush(stdout);
	    slave_touch_base();
	    exit(0);
	}
	else
	{
	    do_my_job();
	    slave_touch_base();
	}
    }
} /* slave_work_till_die() */

/*
  A work session goes as:
  0) Master puts what to do in commands[].
  1) Master set his semaphore to be num_processors-1,
     set slaves' semaphores to be zero.
  2) A slave wakes up when his semaphore is zero,
     does his job defined in commands[my_idx],
     decrease master's semaphore by one,
     then sleep.
  3) Master finishes his share,
     then sleep till his semaphore is zero.
*/

void parallel_multiply_all
(int ibb, double u2_ibb, double u4_ibb, double u6_ibb,
 double u8_ibb, double u10_ibb, double v2_ibb, double v4_ibb,
 double v6_ibb, double v8_ibb, double v10_ibb)
{
    /* zero higher order u's in parallel */
    master_set_command (ZERO_U2_U10);
    do_my_job();
    master_sleep();
    
    /* put in perturbation */
    u2[ibb] = u2_ibb; 
    u4[ibb] = u4_ibb; 
    u6[ibb] = u6_ibb; 
    u8[ibb] = u8_ibb; 
    u10[ibb] = u10_ibb;
    
    u2[ibb+1] = v2_ibb; 
    u4[ibb+1] = v4_ibb; 
    u6[ibb+1] = v6_ibb; 
    u8[ibb+1] = v8_ibb; 
    u10[ibb+1] = v10_ibb; 
    
    /* do the multiplications in parallel */
    master_set_command (U_TO_COPY);  do_my_job(); master_sleep();
    master_set_command (COPY_TO_U2); do_my_job(); master_sleep();
    master_set_command (U2_TO_COPY); do_my_job(); master_sleep();
    master_set_command (COPY_TO_U4); do_my_job(); master_sleep();
    master_set_command (U4_TO_COPY); do_my_job(); master_sleep();
    master_set_command (COPY_TO_U6); do_my_job(); master_sleep();
    master_set_command (U6_TO_COPY); do_my_job(); master_sleep();
    master_set_command (COPY_TO_U8); do_my_job(); master_sleep();
    master_set_command (U8_TO_COPY); do_my_job(); master_sleep();
    master_set_command (COPY_TO_U10); do_my_job(); master_sleep();
    
    return;
} /* parallel_multiply_all() */


void master_set_command (char what)
{
    int i;
    /* put what in commands[] */
    memset ((void *)commands, what, num_processors*sizeof(char));
    /* set the master's ledger */
    sem_set.val = num_processors-1;
    semctl (sem_ID, 0, SETVAL, sem_set);
    for (i=1;i<num_processors;i++)
    {
	semop_minus.sem_num = i;
	/* wake the slaves up */
	semop(sem_ID, &semop_minus, 1);
    } 
} /* master_set_command() */


/* A process does a job given in commands[] */
void do_my_job()
{
    int i, j;
    double *in, *out;

    switch (commands[my_idx])
    {
	/* first phase: matrix multiplication, into my COPY[] */
    case  U_TO_COPY:   in=u;  goto there;
    case U2_TO_COPY:  in=u2;  goto there;
    case U4_TO_COPY:  in=u4;  goto there;
    case U6_TO_COPY:  in=u6;  goto there;
    case U8_TO_COPY:  in=u8;
	
    there:
    out = copy[my_idx];
    /* clear COPY[] */
    memset (out, 0, size_of_vector_in_bytes);
    /* sparse matrix multiplication */
    for (i=2*domain[my_idx]; i<2*domain[my_idx+1]; i+=2)
    {
	/* printf ("%d %d %d\n", i, num[i>>1], num[(i>>1)+1]); */
	for (j=num[i>>1]; j<num[(i>>1)+1]; j+=2)
	{
	    out[row[j>>1]]   -= value[j]*in[i]-value[j+1]*in[i+1];
	    out[row[j>>1]+1] -= value[j]*in[i+1]+value[j+1]*in[i];
	}
    }
    break;
    
    /* second phase: add copies to next array */
    case COPY_TO_U2:  out=u2;   goto further;
    case COPY_TO_U4:  out=u4;   goto further;
    case COPY_TO_U6:  out=u6;   goto further;
    case COPY_TO_U8:  out=u8;   goto further;
    case COPY_TO_U10: out=u10;  
	
    further:
    for (j=0; j<num_processors; j++)
	for (i=2*domain[my_idx]; i<2*domain[my_idx+1]; i++)
	    out[i] += *(copy[j]+i);
    break;

    /* zeroth phase: clear u2 to u10 */
    case ZERO_U2_U10:
	memset(u2+2*domain[my_idx],0,(domain[my_idx+1]-domain[my_idx])
	       *sizeof(double)*2);   
	memset(u4+2*domain[my_idx],0,(domain[my_idx+1]-domain[my_idx])
	       *sizeof(double)*2);   
	memset(u6+2*domain[my_idx],0,(domain[my_idx+1]-domain[my_idx])
	       *sizeof(double)*2);   
	memset(u8+2*domain[my_idx],0,(domain[my_idx+1]-domain[my_idx])
	       *sizeof(double)*2);   
	memset(u10+2*domain[my_idx],0,(domain[my_idx+1]-domain[my_idx])
	       *sizeof(double)*2);   
	break;
    }
    return;
} /* do_my_job() */


void master_sleep()
{
    /* wait till his semaphore is 0 */
    semop_zero.sem_num = (ushort) 0;
    semop(sem_ID, &semop_zero, 1);
} /* master_sleep() */


/* a slave hands over system resources
   and sleep until semaphore = 0 */
void slave_sleep()
{
    semop_zero.sem_num = (ushort) my_idx;
    semop (sem_ID, &semop_zero, 1);
} /* slave_sleep() */


/* after a slave finishes his real work: */
void slave_touch_base()
{
    /* decreases master's semaphore */
    semop_minus.sem_num = 0;
    semop (sem_ID, &semop_minus, 1);
    /* sets his semaphore to be 1, meaning he can work again */
    semop_add.sem_num = (ushort) my_idx;
    semop (sem_ID, &semop_add, 1);
} /* slave_touch_base() */


void my_error_handler(int signal_number)
{
    printf("Exceptions %d occured: use my own error handler.\n",
	   signal_number);
    free_shm();
    free_semaphore();
    exit(1);
} /* my_error_handler() */


void set_my_own_error_handler()
{
    signal(SIGINT, &my_error_handler);
    signal(SIGTERM, &my_error_handler);
} /* set_my_own_error_handler() */


/* reset signal handler to system default */
void reset_error_handler()
{
    signal(SIGINT, SIG_DFL);
    signal(SIGTERM, SIG_DFL); 
} /* reset_error_handler() */

/* High resolution timers */
/*
void startwatch()
{
    real_t_start = gethrtime();
    return;       
}

void checkwatch(double *time_passed_in_seconds)
{
  real_t_end = gethrtime();
  (*time_passed_in_seconds) =
      (double) (real_t_end-real_t_start)*1.e-9;
    return;
}
*/ /* for Xolas */

/*
void startwatch()
{
    getrusage(RUSAGE_SELF,&rus);
    return;   
} 

void checkwatch (double *time_passed_in_seconds)
{
  struct timeval utime;
  getrusage(RUSAGE_SELF,&rue);
  utime.tv_sec = rue.ru_utime.tv_sec - rus.ru_utime.tv_sec ;
  if ( rue.ru_utime.tv_usec < rus.ru_utime.tv_usec ) {
    utime.tv_sec--;
    utime.tv_usec = 1000000l - rus.ru_utime.tv_usec +
      rue.ru_utime.tv_usec;
  } else
    utime.tv_usec = rue.ru_utime.tv_usec -
      rus.ru_utime.tv_usec;
  *time_passed_in_seconds = (double)utime.tv_sec
      + (double)utime.tv_usec*1e-6;
  return;
} 
*/ /* for SGI and Linux */

void startwatch()
{
    time(&real_t_start);
    return;   
} /* startwatch() */

void checkwatch (double *time_passed_in_seconds)
{
    time(&real_t_end);
    (*time_passed_in_seconds) =
	(double) (real_t_end-real_t_start);
  return;
} /* checkwatch() */