/* MOLecular DYnamics simulation code, Moldy.
Copyright (C) 1988, 1992, 1993 Keith Refson
 
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 2, or (at your option) any
later version.
 
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
 
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 
In other words, you are welcome to use, share and improve this program.
You are forbidden to forbid anyone else to use, share and improve
what you give them.   Help stamp out software-hoarding!  */
/******************************************************************************
 * Ewald	The reciprocal-space part of the standard Ewald sum technique *
 ******************************************************************************
 *      Revision Log
 *       $Log: ewald.c,v $
 *       Revision 2.13  1996/11/05 16:45:49  keith
 *       - Reorganized code by extracting generation of sin and cosine tables
 *         into new function trig_recur()
 *       - Optimized main loops and trig_recur to avoid cache conflicts.  Small
 *         gain on the T3D, HUGE gain (x4) on the IBM RS6000.
 *       - New function allocate_arrays() sets out arrays in memory.
 *       _ Preprocessor macros NCACHE and NLINE are used to tune this. NCACHE
 *         is a (possibly sub-multiple) of the cache size, and array dimensions
 *         are rounded up to this. NLINE is larger than the cache-line size and
 *         this space is inserted between arrays as padding.  Both are
 *         specified in WORDS (sizeof(real)).  Also used in accel.c for
 *         declaration of site force arrays.
 *
 *       Revision 2.12  1996/03/19 12:27:48  keith
 *       Parallelized trig function generation loops.  This is conditional
 *       on macro MPPMANY and is only a gain on the T3D. Requires
 *       par_collect_all() in parallel.c.
 *
 *       Revision 2.11  1996/02/07 18:26:58  keith
 *       Restructured for convergence of std and RIL versions
 *        - eliminated ewald-inner.
 *
 *       Revision 2.10  1994/12/30 11:46:08  keith
 *       Fixed bug which caused core dump for very small k-cutoff (hmax=0)
 *
 * Revision 2.9  1994/07/07  16:58:14  keith
 * Versions for SPMD parallel machines.
 * These are based on the *_parallel.c versions for shared-memory but
 * with the parallel loops and compiler directives deleted.
 *
 * Ewald.c has some more optimization of the serial bits.
 *
 * Revision 2.7  1994/06/08  13:13:59  keith
 * New version of array allocator which breaks up requests for DOS.
 * Now must use specific "afree()" paired with arralloc().
 *
 * Revision 2.6  1994/02/17  16:38:16  keith
 * Significant restructuring for better portability and
 * data modularity.
 *
 * Got rid of all global (external) data items except for
 * "control" struct and constant data objects.  The latter
 * (pot_dim, potspec, prog_unit) are declared with CONST
 * qualifier macro which evaluates to "const" or nil
 * depending on ANSI/K&R environment.
 * Also moved as many "write" instantiations of "control"
 * members as possible to "startup", "main" leaving just
 * "dump".
 *
 * Declared as "static"  all functions which should be.
 *
 * Added CONST qualifier to (re-)declarations of ANSI library
 * emulation routines to give reliable compilation even
 * without ANSI_LIBS macro. (#define's away for K&R
 * compilers)
 *
 * Revision 2.5  1994/01/26  16:34:36  keith
 * Fixed non-ansi #endif.
 *
 * Revision 2.3  93/10/28  10:28:59  keith
 * Corrected declarations of stdargs functions to be standard-conforming
 * 
 * Revision 2.1  93/09/02  12:34:44  keith
 * Optimized qsincos() -- should give up to 25% speed improvement on
 * compilers without assert no aliasing options.
 * 
 * Revision 2.0  93/03/15  14:49:49  keith
 * Added copyright notice and disclaimer to apply GPL
 * to all modules. (Previous versions licensed by explicit 
 * consent only).
 * 
 * Revision 1.22  93/03/12  12:23:05  keith
 * Reorganized defines to recognise all ANSI (__type__) forms.
 * Moved spxpy() from aux.c to force.c and force_parallel.c
 * 
 * 
 * Revision 1.21  93/03/09  15:59:58  keith
 * Changed all *_t types to *_mt for portability.
 * Reordered header files for GNU CC compatibility.
 * 
 * Revision 1.20  93/03/05  15:01:53  keith
 * Added CRAY parallelising directives
 * 
 * Revision 1.19  92/08/13  17:56:58  keith
 * Modified nprocessors to limit execution to 1 proc
 * unless env var THREADS explicitly set.
 * 
 * Revision 1.18  92/06/26  17:03:02  keith
 * Got rid of assumption that memory returned by talloc() or
 * arralloc() is zeroed.  This enhances ANSI compatibility.
 * Removed memory zeroing from alloc.c() in consequence.
 * 
 * Revision 1.17  92/02/26  14:33:48  keith
 * Got rid of pstrip pragmas for convex -- they just broke things
 * 
 * Revision 1.16  91/11/27  15:15:56  keith
 * Corrected calculation of sheet energy term for charged framework.
 * Split force loop so as to omit frame-frame force (and stress) terms.
 * Added spaces so that Stellix compiler doesn't choke on "=*"
 * Replaced main loop variable with pointer to work round Stellix 2.3 bug
 * 
 * Revision 1.16  91/11/26  12:48:43  keith
 * Put #ifdefs around machine-specific pragmas for portability.  Now
 * supports Stardent 1000,2000 & 3000 (titan) series and convex.
 * 
 * Revision 1.13  91/08/24  16:55:18  keith
 * Added pragmas for convex C240 parallelization
 * 
 * Revision 1.12  91/08/15  18:13:17  keith
 * Modifications for better ANSI/K&R compatibility and portability
 * --Changed sources to use "gptr" for generic pointer -- typedefed in "defs.h"
 * --Tidied up memcpy calls and used struct assignment.
 * --Moved defn of NULL to stddef.h and included that where necessary.
 * --Eliminated clashes with ANSI library names
 * --Modified defs.h to recognise CONVEX ANSI compiler
 * --Modified declaration of size_mt and inclusion of sys/types.h in aux.c
 *   for GNU compiler with and without fixed includes.
 * 
 * Revision 1.11  91/05/29  17:02:00  keith
 * Modified for minor speed improvement on Stardent titan
 * 
 * Revision 1.10  91/03/12  16:30:08  keith
 * Stardent Titan (ST3000) version.
 * 
 * Revision 1.9  90/09/28  13:29:19  keith
 * Inserted braces around VECTORIZE directives and changed include files
 * for STARDtardent 3000 series (via cond. comp symbol "ardent").
 * 
 * Revision 1.8  90/08/29  11:00:49  keith
 * Speeded up loop at 231 to improve parallel efficiency.
 * 
 * Revision 1.7  90/08/01  19:11:40  keith
 * Modified to exclude framework-framework interactions.
 * N.B. Excluded from pe and stress but NOT forces (as they sum to 0).
 * 
 * Revision 1.6  90/05/16  18:40:57  keith
 * Renamed own freer from cfree to tfree.
 * 
 * Revision 1.5  90/05/16  14:20:47  keith
 * *** empty log message ***
 * 
 * Revision 1.4  90/05/02  15:37:31  keith
 * Removed references to size_mt and time_t typedefs, no longer in "defs.h"
 * 
 * Revision 1.3  90/04/26  15:29:48  keith
 * Changed declaration of arralloc back to char*
 * 
 * Revision 1.2  90/04/25  10:37:06  keith
 * Fixed bug which led to ihkl[] accessing outside bounds of chx[] etc arrays
 * when in const-pressure mode and MD cell expanded between steps.
 * 
 * Revision 1.1  90/01/31  13:18:59  keith
 * Initial revision
 * 
 * Revision 1.6  89/12/15  12:56:26  keith
 * Added conditional ionclusion of <fastmath.h> for stellar
 * 
 * Revision 1.5  89/11/01  17:29:10  keith
 * Sin and cos loop vectorised - mat_vec_mul extracted from loop.
 * 'Uniform charge sheet' term added in case of electrically charged system.
 * 
 * Revision 1.5  89/10/26  11:29:26  keith
 * Sin and cos loop vectorised - mat_vec_mul extracted from loop.
 * 'Uniform charge sheet' term added in case of electrically charged layer.
 * 
 * Revision 1.5  89/10/26  11:27:31  keith
 * Sin and cos loop vectorised - mat_vec_mul extracted from loop.
 * 
 * Revision 1.4  89/10/02  11:39:14  keith
 * Fixed error in *star macros which assumed rlv's were cols of h(-1) r.t. rows.
 * 
 * Revision 1.3  89/06/09  13:38:17  keith
 * Older code for computation of q cos/sin k.r restored conditionally by
 * use of macro OLDEWALD.  This is for more primitive vectorising compilers.
 * 
 * Revision 1.2  89/06/08  10:42:51  keith
 * Modified to circumvent compiler bug in VMS/VAXC 2.4-026
 * 
 * Revision 1.1  89/04/20  16:00:39  keith
 * Initial revision
 * 
 */
#ifndef lint
static char *RCSid = "$Header: /home/eeyore_data/keith/md/moldy/RCS/ewald.c,v 2.13 1996/11/05 16:45:49 keith Exp $";
#endif
/*========================== Program include files ===========================*/
#include 	"defs.h"
/*========================== Library include files ===========================*/
#ifdef stellar
#   include 	<fastmath.h>
#else
#ifdef titan
#   include 	<vmath.h>
#else
#   include 	<math.h>
#endif
#endif
#include	"stddef.h"
#include 	"stdlib.h"
#include        "string.h"
/*========================== Program include files ===========================*/
#include 	"structs.h"
#include 	"messages.h"
/*========================== External function declarations ==================*/
gptr            *talloc();	       /* Interface to memory allocator       */
void            tfree();	       /* Free allocated memory	      	      */
void            afree();	       /* Free allocated array	      	      */
double	err_fn();			/* Error function		      */
double	det();				/* Determinant of 3x3 matrix	      */
void	invert();			/* Inverts a 3x3 matrix		      */
void	mat_vec_mul();			/* Multiplies a 3x3 matrix by 3xN vect*/
void	mat_sca_mul();			/* Multiplies a 3x3 matrix by scalar  */
void	transpose();			/* Transposes a 3x3 matrix	      */
void    zero_real();            	/* Initialiser                        */
void    zero_double();          	/* Initialiser                        */
double	sum();				/* Sum of elements of 'real' vector   */
#if defined(ANSI) || defined(__STDC__)
gptr	*arralloc(size_mt,int,...); 	/* Array allocator		      */
void	note(char *,...);		/* Write a message to the output file */
void	message(int *,...);		/* Write a warning or error message   */
#else
gptr	*arralloc();	        	/* Array allocator		      */
void	note();				/* Write a message to the output file */
void	message();			/* Write a warning or error message   */
#endif
/*========================== External data references ========================*/
extern	contr_mt	control;       	/* Main simulation control record     */
extern int		ithread, nthreads;
/*========================== Macros ==========================================*/
#define astar hinvp[0]
#define bstar hinvp[1]
#define cstar hinvp[2]
#define moda(hmat) (hmat[0][0])
#define modb(hmat) sqrt(SQR(hmat[0][1]) + SQR(hmat[1][1]))
#define modc(hmat) sqrt(SQR(hmat[0][2]) + SQR(hmat[1][2]) + SQR(hmat[2][2]))
/*========================== Cache Parameters=================================*/
/* The default values are for the Cray T3D but are probably good enough 
 *  for most other systems too. */
#ifndef NCACHE
#   define NCACHE (256*sizeof(double)/sizeof(real))
#endif
#ifndef NLINE
#   define NLINE  (4*sizeof(double)/sizeof(real))
#endif
/*============================================================================*/
   struct s_hkl {double kx, ky, kz; int h,k,l;};
/*****************************************************************************
 * qsincos().  Evaluate q sin(k.r) and q cos(k.r).  This is in a separate    *
 * function because some compilers (notably Stellar's) generate MUCH better  *
 * vector code this way. 						     *
 *****************************************************************************/
static
void      qsincos(coshx,sinhx,cosky,sinky,coslz,sinlz,chg,
		  qcoskr,qsinkr,k,l,nsites)
real coshx[], sinhx[], cosky[], sinky[], coslz[], sinlz[],
     chg[], qcoskr[], qsinkr[];
int  k,l,nsites;
{
   int is;
   real qckr;
   
   if( k >= 0 )
      if( l >= 0 )
      {
VECTORIZE
	 for(is = 0; is < nsites; is++)
	 {
	    qckr = chg[is]*(
		  (coshx[is]*cosky[is] - sinhx[is]*sinky[is])*coslz[is] 
                - (sinhx[is]*cosky[is] + coshx[is]*sinky[is])*sinlz[is]);
	    qsinkr[is] = chg[is]*(
                  (sinhx[is]*cosky[is] + coshx[is]*sinky[is])*coslz[is] 
		+ (coshx[is]*cosky[is] - sinhx[is]*sinky[is])*sinlz[is]);
	    qcoskr[is] = qckr;
	 }
      }
      else
      {
VECTORIZE
	 for(is = 0; is < nsites; is++)
	 {
	    qckr = chg[is]*(
		  (coshx[is]*cosky[is] - sinhx[is]*sinky[is])*coslz[is] 
                + (sinhx[is]*cosky[is] + coshx[is]*sinky[is])*sinlz[is]);
	    qsinkr[is] = chg[is]*(
                  (sinhx[is]*cosky[is] + coshx[is]*sinky[is])*coslz[is] 
		- (coshx[is]*cosky[is] - sinhx[is]*sinky[is])*sinlz[is]);
	    qcoskr[is] = qckr;
	 }
      }
   else
      if( l >= 0 )
      {
VECTORIZE
	 for(is = 0; is < nsites; is++)
	 {
	    qckr = chg[is]*(
		  (coshx[is]*cosky[is] + sinhx[is]*sinky[is])*coslz[is] 
                - (sinhx[is]*cosky[is] - coshx[is]*sinky[is])*sinlz[is]);
	    qsinkr[is] = chg[is]*(
                  (sinhx[is]*cosky[is] - coshx[is]*sinky[is])*coslz[is] 
		+ (coshx[is]*cosky[is] + sinhx[is]*sinky[is])*sinlz[is]);
	    qcoskr[is] = qckr;
	 }
      }
      else
      {
VECTORIZE
	 for(is = 0; is < nsites; is++)
	 {
	    qckr = chg[is]*(
		  (coshx[is]*cosky[is] + sinhx[is]*sinky[is])*coslz[is] 
                + (sinhx[is]*cosky[is] - coshx[is]*sinky[is])*sinlz[is]);
	    qsinkr[is] = chg[is]*(
                  (sinhx[is]*cosky[is] - coshx[is]*sinky[is])*coslz[is] 
		- (coshx[is]*cosky[is] + sinhx[is]*sinky[is])*sinlz[is]);
	    qcoskr[is] = qckr;
	 }
     }
}

/******************************************************************************
 * Calculate cos and sin of astar*x, bstar*y & cstar*z for each charged site  *
 * Use addition formulae to get sin(h*astar*x)=sin(Kx*x) etc for each site    *
 ******************************************************************************/
static
void trig_recur(chx,shx,sin1x,cos1x,site0,site1,site2,kstar,hmax,ns0,ns1)
real **chx, **shx, sin1x[],cos1x[], *site0, *site1, *site2, *kstar;
int ns0, ns1, hmax;
{
   int h, is;
   real *coshx, *sinhx, *cm1, *sm1, coss, kr;
   real ksx = kstar[0], ksy = kstar[1], ksz = kstar[2];

   sinhx = shx[0];    coshx = chx[0]; 
VECTORIZE
   for(is = ns0; is < ns1; is++)
   {
      coshx[is] = 1.0;
      sinhx[is] = 0.0;
   }      

   if( hmax >= 1 )
   {
      coshx = chx[1]; sinhx = shx[1];
VECTORIZE
      for(is = ns0; is < ns1; is++)
      {
	 kr = ksx*site0[is]+ksy*site1[is]+ksz*site2[is];
	 coshx[is] = cos(kr); sinhx[is] = sin(kr);
      }

      memcp(cos1x+ns0, coshx+ns0, (ns1-ns0)*sizeof(real));
      memcp(sin1x+ns0, sinhx+ns0, (ns1-ns0)*sizeof(real));

      for(h = 2; h <= hmax; h++)
      {
         coshx = chx[h]; sinhx = shx[h];
         cm1 = chx[h-1]; sm1 = shx[h-1];
VECTORIZE
         for(is = ns0; is < ns1; is++)
      	 {
      	    coss      = cm1[is]*cos1x[is] - sm1[is]*sin1x[is];
      	    sinhx[is] = sm1[is]*cos1x[is] + cm1[is]*sin1x[is];
      	    coshx[is] = coss;
      	 }
      }
   }
}

/******************************************************************************
 * allocate_arrays(). Memory allocation for Ewald's arrays.  We malloc one    *
 * large block and divide it out so as to precisely control the relative      *
 * offsets of the arrays.  This is to avoid cache conflicts.                  *
 * Revert to bog-standard method for MSDOS				      *
 ******************************************************************************/
static real*  allocate_arrays(nsarray, hmax, kmax, lmax,
			      chx, cky, clz, shx, sky, slz, chg,
			      cos1x, cos1y, cos1z, sin1x, sin1y, sin1z, 
			      qcoskr, qsinkr)
int	nsarray, hmax, kmax, lmax;
real	***chx, ***cky, ***clz, ***shx, ***sky, ***slz;
real	**cos1x, **cos1y, **cos1z, **sin1x, **sin1y, **sin1z;
real	**chg, **qcoskr, **qsinkr;
{
   int h, k,l; 
   real *csp, *base;

#ifndef __MSDOS__
   /*
    * Attempt to cache-align these arrays for optimum performance.
    * This requires NON-ANSI pointer conversion and arithmetic.
    * It should work with any UNIX address space, so make it conditional.
    */
   base = dalloc((2*(hmax+kmax+lmax)+15)*nsarray+10*NLINE+NCACHE);
#if defined(unix) && !defined(__BOUNDS_CHECKING_ON)
   csp = (real*)0 + (((base - (real*)0) - 1 | NCACHE-1) + 1);
#else
   csp = base;
#endif

   *chx = (real**)arralloc(sizeof(real*),1,0,hmax);
   *cky = (real**)arralloc(sizeof(real*),1,0,kmax);
   *clz = (real**)arralloc(sizeof(real*),1,0,lmax);
   *shx = (real**)arralloc(sizeof(real*),1,0,hmax);
   *sky = (real**)arralloc(sizeof(real*),1,0,kmax);
   *slz = (real**)arralloc(sizeof(real*),1,0,lmax);
 
   *qcoskr = csp; csp += nsarray + NLINE;
   *qsinkr = csp; csp += nsarray + NLINE;
   *chg = csp;    csp += nsarray + NLINE;
   for(h = 0; h <= hmax; h++, csp += nsarray)
     (*chx)[h] = csp;
   csp += NLINE;
   for(h = 0; h <= hmax; h++, csp += nsarray)
     (*shx)[h] = csp;
   csp += NLINE;
   for(k = 0; k <= kmax; k++, csp += nsarray)
     (*cky)[k] = csp;
   csp += NLINE;
   for(k = 0; k <= kmax; k++, csp += nsarray)
     (*sky)[k] = csp;
   csp += NLINE;
   for(l = 0; l <= lmax; l++, csp += nsarray)
     (*clz)[l] = csp;
   csp += NLINE;
   for(l = 0; l <= lmax; l++, csp += nsarray)
     (*slz)[l] = csp;
   csp += NLINE;
   *cos1x = csp;  csp += nsarray;
   *cos1y = csp;  csp += nsarray;
   *cos1z = csp;  csp += nsarray + NLINE;
   *sin1x = csp;  csp += nsarray;
   *sin1y = csp;  csp += nsarray;
   *sin1z = csp;  csp += nsarray;
#else
   *chx = (real**)arralloc((size_mt)sizeof(real),2, 0, hmax, 0, nsarray-1);
   *cky = (real**)arralloc((size_mt)sizeof(real),2, 0, kmax, 0, nsarray-1);
   *clz = (real**)arralloc((size_mt)sizeof(real),2, 0, lmax, 0, nsarray-1);
   *shx = (real**)arralloc((size_mt)sizeof(real),2, 0, hmax, 0, nsarray-1);
   *sky = (real**)arralloc((size_mt)sizeof(real),2, 0, kmax, 0, nsarray-1);
   *slz = (real**)arralloc((size_mt)sizeof(real),2, 0, lmax, 0, nsarray-1);
   *cos1x = (*chx)[1]; *cos1y = (*cky)[1]; *cos1z = (*clz)[1];
   *sin1x = (*shx)[1]; *sin1y = (*sky)[1]; *sin1z = (*slz)[1];

   csp = base = dalloc(3*nsarray);
   *chg = csp;    csp += nsarray;
   *qcoskr = csp; csp += nsarray;
   *qsinkr = csp; csp += nsarray;
#endif

   return base;
}
/******************************************************************************
 *  Ewald  Calculate reciprocal-space part of coulombic forces		      *
 ******************************************************************************/
void	ewald(site,site_force,system,species,chgx,pe,stress)
real		**site,			/* Site co-ordinate arrays	 (in) */
		**site_force;		/* Site force arrays		(out) */
system_mp	system;			/* System record		 (in) */
spec_mt	species[];			/* Array of species records	 (in) */
real		chgx[];			/* Array of site charges	 (in) */
double		*pe;			/* Potential energy		(out) */
mat_mt		stress;			/* Stress virial		(out) */
{
   mat_mt	hinvp;			/* Matrix of reciprocal lattice vects*/
   int		h, k, l;		/* Recip. lattice vector indices     */
   int		i, j, is, ssite;	/* Counters.			     */
   spec_mp	spec;			/* species[ispec]		     */
   int		nsites = system->nsites;
   double	pe_k,			/* Pot'l energy for current K vector */
		coeff, coeff2;		/* 2/(e0V) * A(K) & similar	     */
   double	r_4_alpha = -1.0/(4.0 * control.alpha * control.alpha);
   double	sqcoskr,sqsinkr,	/* Sum q(i) sin/cos(K.r(i))          */
		sqcoskrn, sqsinkrn,
		sqcoskrf, sqsinkrf;
   double	ksq,			/* Squared magnitude of K vector     */
		kcsq = SQR(control.k_cutoff);
   double	kx,ky,kz,kzt;
   vec_mt	kv;			/* (Kx,Ky,Kz)  			     */
   real		force_comp, kv0, kv1, kv2, sfx, sfy;
   struct	s_hkl *hkl, *phkl;
   int		nhkl = 0;
/*
 * Maximum values of h, k, l  s.t. |k| < k_cutoff
 */
   int		hmax = floor(control.k_cutoff/(2*PI)*moda(system->h)),
		kmax = floor(control.k_cutoff/(2*PI)*modb(system->h)),
		lmax = floor(control.k_cutoff/(2*PI)*modc(system->h));
/*
 * lower and upper limits for parallel loops.   
 */
#if defined(SPMD) && defined(MPPMANY)
   int  nsnode = (nsites+nthreads-1)/nthreads;
   int  nsarr0 = nsnode * nthreads;
   int  ns0 = MIN(nsites, nsnode * ithread), 
        ns1 = MIN(nsites, nsnode * (ithread+1));
#else	    
   int		ns0 = 0, ns1 = nsites;
   int		nsarr0 = nsites;
#endif
/*
 * Round up size of arrays to cache sub-multiple size.
 */
#if 1
   int		nsarray = (nsarr0 - 1 | NCACHE - 1) + 1;
#else
   int		nsarray = nsarr0;
#endif
/*
 * Arrays for cos & sin (h x(i)), (k y(i)) and (l z(i)) eg chx[h][isite]
 * and pointers to a particular h,k or l eg coshx[is] = chh[2][is]
 */
   real		**chx, **cky, **clz, **shx, **sky, **slz;
   real		*cshkl;
   real		*coshx, *cosky, *coslz, *sinhx, *sinky, *sinlz;
   real		*cos1x, *cos1y, *cos1z, *sin1x, *sin1y, *sin1z;
   real		*chg;
   real		*site_fx = site_force[0],
   		*site_fy = site_force[1],
   		*site_fz = site_force[2];
   real		*qcoskr,		/* q(i) cos(K.R(i))	      */
		*qsinkr;		/* q(i) sin(K.R(i))	      */
   double	vol = det(system->h);	/* Volume of MD cell		      */
   static	double	self_energy,	/* Constant self energy term	      */
   			sheet_energy;	/* Correction for non-neutral system. */
   static	boolean init = true;	/* Flag for the first call of function*/
   static       int     nsitesxf;       /* Number of non-framework sites.     */
/*
 * Trig array set-up.  The precise storage layout is to avoid cache conflicts.
 */
   cshkl = allocate_arrays(nsarray, hmax, kmax, lmax,
			   &chx, &cky, &clz, &shx, &sky, &slz, &chg,
			   &cos1x, &cos1y, &cos1z, &sin1x, &sin1y, &sin1z, 
			   &qcoskr, &qsinkr);
   memcp(chg, chgx, nsites*lsizeof(real));
/*
 * First call only - evaluate self energy term and store for subsequent calls
 * Self energy includes terms for non-framework sites only.
 */
   if(init)
   {
      double	sqsq = 0, sq = 0, sqxf, intra, r;
      int	js, frame_flag;

      self_energy = sheet_energy = 0;
      ssite = 0;
      spec = species; 
      while( spec < species+system->nspecies && ! spec->framework)
      {
	 intra = 0.0;
	 for(is = 0; is < spec->nsites; is++)
	    for(js = is+1; js < spec->nsites; js++)
	    {
	       r = DISTANCE(spec->p_f_sites[is], spec->p_f_sites[js]);
	       intra += chg[ssite+is] * chg[ssite+js]
		       *err_fn(control.alpha * r) / r;
	    }
	 self_energy += spec->nmols * intra;
	 ssite += spec->nsites*spec->nmols;
	 spec++;
      }
      nsitesxf = ssite;
      frame_flag = (spec != species+system->nspecies);

      for(is = 0; is < nsitesxf; is++)
      {
	 sq += chg[is];
	 sqsq += SQR(chg[is]);
      }
      self_energy += control.alpha / sqrt(PI) * sqsq;
      /*
       * Sqxf is total non-framework charge.  Calculate grand total in sq.
       */
      sqxf = sq;
      for(; is < nsites; is++)
	 sq += chg[is];
      /*
       *  Charged-system/uniform sheet correction terms (really in direct
       *  space term but included here for convenience).
       *  1) For charged framework only.
       */
      if( frame_flag )
      {
	 sheet_energy = PI*SQR(sq-sqxf) / (2.0*SQR(control.alpha));
	 message(NULLI, NULLP, INFO, FRACHG, 
		 (sq-sqxf)*CONV_Q, sheet_energy/vol*CONV_E);
      }
      /* 
       *  2) Case of entire system non-neutral.
       */
      if( fabs(sq)*CONV_Q > 1.0e-5)
      {
	 sheet_energy -= intra = PI*SQR(sq) / (2.0*SQR(control.alpha));
	 message(NULLI, NULLP, WARNING, SYSCHG, sq*CONV_Q, intra/vol*CONV_E);
      }

      note("Ewald self-energy = %f Kj/mol",self_energy*CONV_E);
   }

   if( ithread == 0 )
   {
      *pe -= self_energy;		/* Subtract self energy term	      */
      *pe += sheet_energy/vol;		/* Uniform charge correction	      */
      for(i=0; i<3; i++)
	 stress[i][i] += sheet_energy/vol;
   }
   
   invert(system->h, hinvp);		/* Inverse of h is matrix of r.l.v.'s */
   mat_sca_mul(2*PI, hinvp, hinvp);
   /*
    * Build array hkl[] of k vectors within cutoff. 
    * N. B. This code presupposes upper-triangular H matrix.
    */
   hkl = aalloc(4*(hmax+1)*(kmax+1)*(lmax+1), struct s_hkl);
   for(h = 0; h <= hmax; h++)
      for(k = (h==0 ? 0 : -kmax); k <= kmax; k++)
      {
	 kx = h*astar[0] + k*bstar[0];
	 ky = h*astar[1] + k*bstar[1];
	 kzt = h*astar[2] + k*bstar[2];
	 ksq = SQR(kx) + SQR(ky);
	 for(l = (h==0 && k==0 ? 1 : -lmax); l <= lmax; l++)
	 {
	    kz = kzt + l*cstar[2];
	    if( SQR(kz)+ksq < kcsq )
	    {
	       hkl[nhkl].h = h; hkl[nhkl].k = k; hkl[nhkl].l = l;
	       hkl[nhkl].kx = kx;
	       hkl[nhkl].ky = ky;
	       hkl[nhkl].kz = kz;
	       nhkl++;
	    }
	 }
      }
   if( init )
      note("%d K-vectors included in reciprocal-space sum",nhkl);
   init = false;
/*
 * Calculate cos and sin of astar*x, bstar*y & cstar*z for each charged site
 * Use addition formulae to get sin(h*astar*x)=sin(Kx*x) etc for each site
 */
   trig_recur(chx,shx,sin1x,cos1x,site[0],site[1],site[2],astar,hmax,ns0,ns1);
   trig_recur(cky,sky,sin1y,cos1y,site[0],site[1],site[2],bstar,kmax,ns0,ns1);
   trig_recur(clz,slz,sin1z,cos1z,site[0],site[1],site[2],cstar,lmax,ns0,ns1);

#if defined(SPMD) && defined(MPPMANY)
   par_collect_all(chx[0]+ns0, chx[0], nsnode, nsarray, hmax+1);
   par_collect_all(shx[0]+ns0, shx[0], nsnode, nsarray, hmax+1);
   par_collect_all(cky[0]+ns0, cky[0], nsnode, nsarray, kmax+1);
   par_collect_all(sky[0]+ns0, sky[0], nsnode, nsarray, kmax+1);
   par_collect_all(clz[0]+ns0, clz[0], nsnode, nsarray, lmax+1);
   par_collect_all(slz[0]+ns0, slz[0], nsnode, nsarray, lmax+1);
#endif
/*
 * Start of main loops over K vector indices h, k, l between -*max, *max etc.
 * To avoid calculating K and -K, only half of the K-space box is covered. 
 * Points on the axes are included once and only once. (0,0,0) is omitted.
 */
   for(phkl = hkl+ithread; phkl < hkl+nhkl; phkl += nthreads)
   {
/*
 * Calculate actual K vector and its squared magnitude.
 */
      h  = phkl->h;	    k     = phkl->k;  l     = phkl->l;
      kv0 = kv[0] = phkl->kx; 
      kv1 = kv[1] = phkl->ky; 
      kv2 = kv[2] = phkl->kz;

      ksq = SUMSQ(kv);

/*
 * Calculate pre-factors A(K) etc
 */
      coeff  = 2.0 / (EPS0 * vol) * exp(ksq * r_4_alpha) / ksq;
      coeff2 = 2.0 * (1.0 - ksq * r_4_alpha) / ksq;
      
/*
 * Set pointers to array of cos (h*astar*x) (for efficiency & vectorisation)
 */
      coshx = chx[h];  cosky = cky[abs(k)]; coslz = clz[abs(l)];
      sinhx = shx[h];  sinky = sky[abs(k)]; sinlz = slz[abs(l)];

/*
 * Calculate q(i)*cos/sin(K.R(i)) by addition formulae. Note handling of
 * negative k and l by using sin(-x) = -sin(x), cos(-x) = cos(x). For
 * efficiency & vectorisation there is a loop for each case.
 */
      qsincos(coshx,sinhx,cosky,sinky,coslz,sinlz,chg,
	      qcoskr,qsinkr,k,l,nsites);
      sqcoskrn = sum(nsitesxf, qcoskr, 1);
      sqsinkrn = sum(nsitesxf, qsinkr, 1);
      sqcoskrf = sum(nsites-nsitesxf, qcoskr+nsitesxf, 1);
      sqsinkrf = sum(nsites-nsitesxf, qsinkr+nsitesxf, 1);
      sqcoskr = sqcoskrn + sqcoskrf;
      sqsinkr = sqsinkrn + sqsinkrf;
      
/*
 * Evaluate potential energy contribution for this K and add to total.
 * Exclude frame-frame interaction terms.
 */
      pe_k = 0.5 * coeff * (sqcoskrn*(sqcoskrn+sqcoskrf+sqcoskrf) +
			    sqsinkrn*(sqsinkrn+sqsinkrf+sqsinkrf));
      *pe += pe_k;

      sqsinkr *= coeff; sqcoskr *= coeff;
      sqsinkrn *= coeff; sqcoskrn *= coeff;
/*
 * Calculate long-range coulombic contribution to stress tensor
 */
NOVECTOR
      for(i = 0; i < 3; i++)
      {
	 stress[i][i] += pe_k;
NOVECTOR
	 for(j = i; j < 3; j++)
	    stress[i][j] -= pe_k * coeff2 * kv[i] * kv[j];
      }
/*
 * Evaluation of site forces.   Non-framework sites interact with all others
 */
VECTORIZE
      for(is = 0; is < nsitesxf; is++)
      {
	 force_comp = qsinkr[is]*sqcoskr - qcoskr[is]*sqsinkr;
	 sfx =  site_fx[is] + kv0 * force_comp;
	 sfy =  site_fy[is] + kv1 * force_comp;
	 site_fz[is] +=       kv2 * force_comp;
	 site_fx[is] = sfx;
	 site_fy[is] = sfy;
      }
/*
 *  Framework sites -- only interact with non-framework sites
 */
VECTORIZE
      for(is = nsitesxf; is < nsites; is++)
      {
	 force_comp = qsinkr[is]*sqcoskrn - qcoskr[is]*sqsinkrn;
	 sfx =  site_fx[is] + kv0 * force_comp;
	 sfy =  site_fy[is] + kv1 * force_comp;
	 site_fz[is] +=       kv2 * force_comp;
	 site_fx[is] = sfx;
	 site_fy[is] = sfy;
      }
/*
 * End of loop over K vectors.
 */
   }
   afree((gptr*)chx); afree((gptr*)cky); afree((gptr*)clz); 
   afree((gptr*)shx); afree((gptr*)sky); afree((gptr*)slz);
   xfree(cshkl);
   xfree(hkl);
}