/*----------------------------------------------------------------------------*/

/*
 * Performance-Monitoring Counters Library, for Intel/AMD Processors and Linux
 * Author:  Don Heller, dheller@scl.ameslab.gov
 * Last revised:  19 January 2001
 */

/*----------------------------------------------------------------------------*/

/* usage, Pentium Pro/II/III:
 *   rabbit6.sh [trials] [cutoff]
 *   rabbit6 -i scripts/in.perinstr0 0 [trials] [cutoff]
 *   rabbit6 -i scripts/in.perinstr1 1 [trials] [cutoff]
 *
 * usage, Pentium:
 *   rabbit6.5.sh [trials] [cutoff]
 *   rabbit6 -i scripts/in.perinstr0.5 0 [trials] [cutoff]
 *
 * usage, Athlon:
 *   (someone needs to write the input files and modify this code)
 */

/*----------------------------------------------------------------------------*/

/* measure a function, print "interesting" ratios of events */

void Test_info(int len);	// general initialization
void Test_init(int len);	// before each test
void Test(int len);		// the test itself

/* default values for command line */
#define TRIALS 3
#define CUTOFF 0.01

#ifdef DEBUG
  // look for problems with mis-aligned local double
  int fht_first = 0;
  char debug_fmt[] = "%4s &%-4s = 0x%08x\n";
#define ADDR(W,X) \
  if ((int)(&X) & 7) { printf(debug_fmt, W, #X, (unsigned int)&X); }
#endif

#ifndef STATIC
#define STATIC
#endif

/* this particular example is an FFT and its inverse
 * try it on various sizes
 */
#define TestLen		(256*1024)
	/* power of 2 */
#define TestStart	(16*1024)
#define TestStop	(16*1024)
	/* up to TestLen */

/*----------------------------------------------------------------------------*/

#include <pmc_lib.h>

/* for sqrt() */
#include <math.h>

/* for atoi(), atof() */
#include <stdlib.h>

/*----------------------------------------------------------------------------*/

int main(int argc, char * argv[])
{
  pmc_control_t Ctl = pmc_control_null;
  pmc_counter_t *a, c;
  pmc_data_t t0, t1;

  int i,j,k, m,n, len, out, e;

  /* command-line default values */
  int flag = 0, trials = TRIALS;
  double cutoff = CUTOFF;

  /* initialize internal data structures, read command-line arguments */

  if (pmc_getargs(stderr, argv[0], &argc, &argv, &Ctl) == FALSE)
    { exit(RABBIT_FAILURE); }

  if (argc > 0) { flag   = atoi(argv[0]); } else { exit(RABBIT_FAILURE); }
  if (argc > 1) { trials = atoi(argv[1]); }
  if (argc > 2) { cutoff = atof(argv[2]); }

  if (pmc_open(0) == FALSE)		/* open /dev/pmc */
    { exit(RABBIT_FAILURE); }

  for (len = TestStart; len <= TestStop; len *= 2) {
    Test_info(len);		// prepare for the test, once only

    pmc_counter_init(&c, &Ctl);

    /* for (out = 0; out < Ctl.num_counters; out++) { ... } */

    for (i = 0, out = 0; i < Ctl.event_pairs; i++) {
      for (j = 0; j < Ctl.replication; j++, out++) {
	a = &Ctl.counters[out];
	pmc_counter_reset(a);
	pmc_select(a);
	for (k = 0; k < trials; k++) {

	  Test_init(len);	// prepare for the test

	  pmc_read(&t0);
	    Test(len);		// the test itself
	  pmc_read(&t1);

	  pmc_accumulate(a,&t1,&t0);
	  // pmc_accumulate(&c,&t1,NULL);
	}
	pmc_accumulate_counter(&c,a);
      }
    }

    // print cycles, instruction count
    printf("min, mean, max, std. dev. over %lld trials\n", pmc_intervals(&c));
    printf("cycles:       ");
    printf(" %10lld", pmc_min_cycles(&c));
    printf(" %10.0f", pmc_mean_cycles(&c));
    printf(" %10lld", pmc_max_cycles(&c));
    printf(" %10.0f", sqrt(pmc_variance_cycles(&c)));
    printf("\n");

    e = 1 - flag;
    printf("instructions: ");
    printf(" %10lld", pmc_min_events(&c,e));
    printf(" %10.0f", pmc_mean_events(&c,e));
    printf(" %10lld", pmc_max_events(&c,e));
    printf(" %10.0f", sqrt(pmc_variance_events(&c,e)));
    printf("\n");

    printf("events per cycle\n");
    printf("    min, mean, max, std. dev. over %d trials, cutoff = %f\n",
	trials, cutoff);

    for (i = 0, out = 0; i < Ctl.event_pairs; i++) {
      for (j = 0; j < Ctl.replication; j++, out++) {
	a = &Ctl.counters[out];
	if (pmc_mean_events_per_cycle(a,flag) > cutoff) {
	  e = Ctl.events[i][flag];
	  printf("0x%02x %-30.30s", e, pmc_event_name(e,flag));
	  printf(" %10.6f", pmc_min_events_per_cycle(a,flag));
	  printf(" %10.6f", pmc_mean_events_per_cycle(a,flag));
	  printf(" %10.6f", pmc_max_events_per_cycle(a,flag));
	  printf(" %10.6f", sqrt(pmc_variance_events_per_cycle(a,flag)));
	  printf("\n");
	}
      }
    }

    printf("events per instruction retired\n");
    printf("    min, mean, max, std. dev. over %d trials, cutoff = %f\n",
	trials, cutoff);

    if (flag == 0)
      { m = 0; n = 1; }
    else
      { m = 1; n = 0; }

    for (i = 0, out = 0; i < Ctl.event_pairs; i++) {
      for (j = 0; j < Ctl.replication; j++, out++) {
	a = &Ctl.counters[out];
	if (pmc_mean_ratio_events(a,m,n) > cutoff) {
	  e = Ctl.events[i][m];
	  printf("0x%02x %-30.30s", e, pmc_event_name(e,m));
	  printf(" %10.6f", pmc_min_ratio_events(a,m,n));
	  printf(" %10.6f", pmc_mean_ratio_events(a,m,n));
	  printf(" %10.6f", pmc_max_ratio_events(a,m,n));
	  printf(" %10.6f", sqrt(pmc_variance_ratio_events(a,m,n)));
	  printf("\n");
	}
      }
    }

  }

  pmc_close();				/* close /dev/pmc */

  exit(RABBIT_SUCCESS);
}

/*----------------------------------------------------------------------------*/

/* This example is modified from one of Al Aburto's benchmark programs.
 * From all the comments in the code, I hope it's legal to do this!
 */

double real[2*TestLen + 1];
double imag[2*TestLen + 1];

void fht(double * fz, int n);
void ifft(int n, double * real, double * imag);
void  fft(int n, double * real, double * imag);
// void  realfft(int n, double * real);
// void realifft(int n, double * real);

void Test_info(int len)
{
  printf("len = %d\n", len);
#ifdef DEBUG
  printf("addr real = 0x%08x\n", (unsigned int)&real);	// aligned properly?
  printf("addr imag = 0x%08x\n", (unsigned int)&imag);
#endif
}

void Test_init(int len)
{
  int i;

  for (i = 0; i < len; i++)
    {
      real[i] = (double) i;
      imag[i] = 0.0;
    }
}

void Test(int len)	/* at most TestLen */
{
  fft(len, real, imag);
  ifft(len, real, imag);
}

/*----------------------------------------------------------------------------*/

/*
** FFT and FHT routines
**  Copyright 1988, 1993; Ron Mayer
**
**  fht(fz,n);
**      Does a hartley transform of "n" points in the array "fz".
**  fft(n,real,imag)
**      Does a fourier transform of "n" points of the "real" and
**      "imag" arrays.
**  ifft(n,real,imag)
**      Does an inverse fourier transform of "n" points of the "real"
**      and "imag" arrays.
**  realfft(n,real)
**      Does a real-valued fourier transform of "n" points of the
**      "real" and "imag" arrays.  The real part of the transform ends
**      up in the first half of the array and the imaginary part of the
**      transform ends up in the second half of the array.
**  realifft(n,real)
**      The inverse of the realfft() routine above.
**
**
** NOTE: This routine uses at least 2 patented algorithms, and may be
**       under the restrictions of a bunch of different organizations.
**       Although I wrote it completely myself; it is kind of a derivative
**       of a routine I once authored and released under the GPL, so it
**       may fall under the free software foundation's restrictions;
**       it was worked on as a Stanford Univ project, so they claim
**       some rights to it; it was further optimized at work here, so
**       I think this company claims parts of it.  The patents are
**       held by R. Bracewell (the FHT algorithm) and O. Buneman (the
**       trig generator), both at Stanford Univ.
**       If it were up to me, I'd say go do whatever you want with it;
**       but it would be polite to give credit to the following people
**       if you use this anywhere:
**           Euler     - probable inventor of the fourier transform.
**           Gauss     - probable inventor of the FFT.
**           Hartley   - probable inventor of the hartley transform.
**           Buneman   - for a really cool trig generator
**           Mayer(me) - for authoring this particular version and
**                       including all the optimizations in one package.
**       Thanks,
**       Ron Mayer; mayer@acuson.com
**
*/

#define GOOD_TRIG
// #include "trigtbl.h"

/* start of trigtbl.h */
/*
** Please only distribute this with it's associated FHT routine.
** This algorithm is apparently patented(!) and the code copyrighted.
** See the comment with the fht routine for more info.
**  -Thanks,
**   Ron Mayer
*/

#ifdef GOOD_TRIG
#else
#define FAST_TRIG
#endif

#if defined(GOOD_TRIG)
#define FHT_SWAP(a,b,t) {(t)=(a); (a)=(b); (b)=(t);}
#define TRIG_VARS                                                \
      int t_lam = 0;
#define TRIG_INIT(k,c,s)                                         \
     {                                                           \
      int i;                                                     \
      for (i = 2 ; i<=k ; i++)                                   \
	  {coswrk[i] = costab[i]; sinwrk[i] = sintab[i];}        \
      t_lam = 0;                                                 \
      c = 1;                                                     \
      s = 0;                                                     \
     }
#define TRIG_NEXT(k,c,s)                                         \
     {                                                           \
	 int i,j;                                                \
	 (t_lam)++;                                              \
	 for (i = 0 ; !((1<<i)&t_lam) ; i++);                    \
	 i = k-i;                                                \
	 s = sinwrk[i];                                          \
	 c = coswrk[i];                                          \
	 if (i>1)                                                \
	    {                                                    \
	     for (j = k-i+2 ; (1<<j)&t_lam ; j++);               \
	     j         = k - j;                                  \
	     sinwrk[i] = halsec[i] * (sinwrk[i-1] + sinwrk[j]);  \
	     coswrk[i] = halsec[i] * (coswrk[i-1] + coswrk[j]);  \
	    }                                                    \
     }
#define TRIG_RESET(k,c,s)
#endif

#if defined(FAST_TRIG)
#define TRIG_VARS                                        \
      double t_c,t_s;
#define TRIG_INIT(k,c,s)                                 \
    {                                                    \
     t_c  = costab[k];                                   \
     t_s  = sintab[k];                                   \
     c    = 1;                                           \
     s    = 0;                                           \
    }
#define TRIG_NEXT(k,c,s)                                 \
    {                                                    \
     double t = c;                                       \
     c   = t*t_c - s*t_s;                                \
     s   = t*t_s + s*t_c;                                \
    }
#define TRIG_RESET(k,c,s)
#endif

static double halsec[20]=
    {
     0,
     0,
     .54119610014619698439972320536638942006107206337801,
     .50979557910415916894193980398784391368261849190893,
     .50241928618815570551167011928012092247859337193963,
     .50060299823519630134550410676638239611758632599591,
     .50015063602065098821477101271097658495974913010340,
     .50003765191554772296778139077905492847503165398345,
     .50000941253588775676512870469186533538523133757983,
     .50000235310628608051401267171204408939326297376426,
     .50000058827484117879868526730916804925780637276181,
     .50000014706860214875463798283871198206179118093251,
     .50000003676714377807315864400643020315103490883972,
     .50000000919178552207366560348853455333939112569380,
     .50000000229794635411562887767906868558991922348920,
     .50000000057448658687873302235147272458812263401372
    };
static double costab[20]=
    {
     .00000000000000000000000000000000000000000000000000,
     .70710678118654752440084436210484903928483593768847,
     .92387953251128675612818318939678828682241662586364,
     .98078528040323044912618223613423903697393373089333,
     .99518472667219688624483695310947992157547486872985,
     .99879545620517239271477160475910069444320361470461,
     .99969881869620422011576564966617219685006108125772,
     .99992470183914454092164649119638322435060646880221,
     .99998117528260114265699043772856771617391725094433,
     .99999529380957617151158012570011989955298763362218,
     .99999882345170190992902571017152601904826792288976,
     .99999970586288221916022821773876567711626389934930,
     .99999992646571785114473148070738785694820115568892,
     .99999998161642929380834691540290971450507605124278,
     .99999999540410731289097193313960614895889430318945,
     .99999999885102682756267330779455410840053741619428
    };
static double sintab[20]=
    {
     1.0000000000000000000000000000000000000000000000000,
     .70710678118654752440084436210484903928483593768846,
     .38268343236508977172845998403039886676134456248561,
     .19509032201612826784828486847702224092769161775195,
     .09801714032956060199419556388864184586113667316749,
     .04906767432741801425495497694268265831474536302574,
     .02454122852291228803173452945928292506546611923944,
     .01227153828571992607940826195100321214037231959176,
     .00613588464915447535964023459037258091705788631738,
     .00306795676296597627014536549091984251894461021344,
     .00153398018628476561230369715026407907995486457522,
     .00076699031874270452693856835794857664314091945205,
     .00038349518757139558907246168118138126339502603495,
     .00019174759731070330743990956198900093346887403385,
     .00009587379909597734587051721097647635118706561284,
     .00004793689960306688454900399049465887274686668768
    };
static double coswrk[20]=
    {
     .00000000000000000000000000000000000000000000000000,
     .70710678118654752440084436210484903928483593768847,
     .92387953251128675612818318939678828682241662586364,
     .98078528040323044912618223613423903697393373089333,
     .99518472667219688624483695310947992157547486872985,
     .99879545620517239271477160475910069444320361470461,
     .99969881869620422011576564966617219685006108125772,
     .99992470183914454092164649119638322435060646880221,
     .99998117528260114265699043772856771617391725094433,
     .99999529380957617151158012570011989955298763362218,
     .99999882345170190992902571017152601904826792288976,
     .99999970586288221916022821773876567711626389934930,
     .99999992646571785114473148070738785694820115568892,
     .99999998161642929380834691540290971450507605124278,
     .99999999540410731289097193313960614895889430318945,
     .99999999885102682756267330779455410840053741619428
    };
static double sinwrk[20]=
    {
     1.0000000000000000000000000000000000000000000000000,
     .70710678118654752440084436210484903928483593768846,
     .38268343236508977172845998403039886676134456248561,
     .19509032201612826784828486847702224092769161775195,
     .09801714032956060199419556388864184586113667316749,
     .04906767432741801425495497694268265831474536302574,
     .02454122852291228803173452945928292506546611923944,
     .01227153828571992607940826195100321214037231959176,
     .00613588464915447535964023459037258091705788631738,
     .00306795676296597627014536549091984251894461021344,
     .00153398018628476561230369715026407907995486457522,
     .00076699031874270452693856835794857664314091945205,
     .00038349518757139558907246168118138126339502603495,
     .00019174759731070330743990956198900093346887403385,
     .00009587379909597734587051721097647635118706561284,
     .00004793689960306688454900399049465887274686668768
    };
/* end of trigtbl.h */

char fht_version[] = "Brcwl-Hrtly-Ron-dbld";

const double SQRT2 = 1.41421356237309504880168872420969807856967187537694;

void fht(double * fz, int n)
{
  // these could also be declared local to each loop, as noted
  STATIC double a,b;
  STATIC double c1,s1,s2,c2,s3,c3,s4,c4;
  STATIC double f0,g0,f1,g1,f2,g2,f3,g3;

  double *fi,*fn,*gi;
  int i,k,k1,k2,k3,k4,kx;
  TRIG_VARS

  for (k1 = 1, k2 = 0; k1 < n; k1++)
    {
      for (k = n>>1; (!((k2^=k)&k)); k >>= 1);
      if (k1 > k2)
	{
	  // double a;
	  a = fz[k1]; fz[k1] = fz[k2]; fz[k2] = a;
	}
    }
  for (k = 0; (1<<k) < n; k++);
  k &= 1;
  if (k == 0)
    {
      for (fi = fz, fn = fz + n; fi < fn; fi += 4)
	{
	  // double f0, f1, f2, f3;
	  f0    = fi[0] + fi[1];
	  f1    = fi[0] - fi[1];
	  f2    = fi[2] + fi[3];
	  f3    = fi[2] - fi[3];
	  fi[0] = (f0 + f2);
	  fi[1] = (f1 + f3);
	  fi[2] = (f0 - f2);
	  fi[3] = (f1 - f3);
	}
    }
  else
    {
      for (fi = fz, fn = fz + n, gi = fi + 1; fi < fn; fi += 8, gi += 8)
	{
	  // double s1,c1,s2,c2,s3,c3,s4,c4,g0,f0,f1,g1,f2,g2,f3,g3;
	  c1    = fi[0] - gi[0];
	  s1    = fi[0] + gi[0];
	  c2    = fi[2] - gi[2];
	  s2    = fi[2] + gi[2];
	  c3    = fi[4] - gi[4];
	  s3    = fi[4] + gi[4];
	  c4    = fi[6] - gi[6];
	  s4    = fi[6] + gi[6];
	  f1    = (s1 - s2);
	  f0    = (s1 + s2);
	  g1    = (c1 - c2);
	  g0    = (c1 + c2);
	  f3    = (s3 - s4);
	  f2    = (s3 + s4);
	  g3    = SQRT2 * c4;
	  g2    = SQRT2 * c3;
	  fi[4] = f0 - f2;
	  fi[0] = f0 + f2;
	  fi[6] = f1 - f3;
	  fi[2] = f1 + f3;
	  gi[4] = g0 - g2;
	  gi[0] = g0 + g2;
	  gi[6] = g1 - g3;
	  gi[2] = g1 + g3;
	}
    }
 if (n < 16) return;

 do
    {
     // double s1,c1;
     k  += 2;
     k1  = 1  << k;
     k2  = k1 << 1;
     k4  = k2 << 1;
     k3  = k2 + k1;
     kx  = k1 >> 1;
	fi  = fz;
	gi  = fi + kx;
	fn  = fz + n;
	do
	   {
	    // double g0,f0,f1,g1,f2,g2,f3,g3;
	    f1      = fi[0 ] - fi[k1];
	    f0      = fi[0 ] + fi[k1];
	    f3      = fi[k2] - fi[k3];
	    f2      = fi[k2] + fi[k3];
	    fi[k2]  = f0         - f2;
	    fi[0 ]  = f0         + f2;
	    fi[k3]  = f1         - f3;
	    fi[k1]  = f1         + f3;
	    g1      = gi[0 ] - gi[k1];
	    g0      = gi[0 ] + gi[k1];
	    g3      = SQRT2  * gi[k3];
	    g2      = SQRT2  * gi[k2];
	    gi[k2]  = g0         - g2;
	    gi[0 ]  = g0         + g2;
	    gi[k3]  = g1         - g3;
	    gi[k1]  = g1         + g3;
	    gi     += k4;
	    fi     += k4;
	   } while (fi<fn);
     TRIG_INIT(k,c1,s1);
     for (i = 1; i<kx; i++)
       {
	// double c2,s2;
	TRIG_NEXT(k,c1,s1);
	c2 = c1*c1 - s1*s1;
	s2 = 2*(c1*s1);
	    fn = fz + n;
	    fi = fz +i;
	    gi = fz +k1-i;
	    do
	       {
		// double a,b,g0,f0,f1,g1,f2,g2,f3,g3;
		b       = s2*fi[k1] - c2*gi[k1];
		a       = c2*fi[k1] + s2*gi[k1];
		f1      = fi[0 ]    - a;
		f0      = fi[0 ]    + a;
		g1      = gi[0 ]    - b;
		g0      = gi[0 ]    + b;
		b       = s2*fi[k3] - c2*gi[k3];
		a       = c2*fi[k3] + s2*gi[k3];
		f3      = fi[k2]    - a;
		f2      = fi[k2]    + a;
		g3      = gi[k2]    - b;
		g2      = gi[k2]    + b;
		b       = s1*f2     - c1*g3;
		a       = c1*f2     + s1*g3;
		fi[k2]  = f0        - a;
		fi[0 ]  = f0        + a;
		gi[k3]  = g1        - b;
		gi[k1]  = g1        + b;
		b       = c1*g2     - s1*f3;
		a       = s1*g2     + c1*f3;
		gi[k2]  = g0        - a;
		gi[0 ]  = g0        + a;
		fi[k3]  = f1        - b;
		fi[k1]  = f1        + b;
		gi     += k4;
		fi     += k4;
	       } while (fi<fn);
       }
     TRIG_RESET(k,c1,s1);
    } while (k4<n);

#ifdef DEBUG
  if (fht_first) {
    fht_first = 0;
    ADDR(" fht",a)
    ADDR(" fht",b)
    ADDR(" fht",c1)
    ADDR(" fht",s1)
    ADDR(" fht",s2)
    ADDR(" fht",c2)
    ADDR(" fht",s3)
    ADDR(" fht",c3)
    ADDR(" fht",s4)
    ADDR(" fht",c4)
    ADDR(" fht",f0)
    ADDR(" fht",g0)
    ADDR(" fht",f1)
    ADDR(" fht",g1)
    ADDR(" fht",f2)
    ADDR(" fht",g2)
    ADDR(" fht",f3)
    ADDR(" fht",g3)
  }
#endif
}


void ifft(int n, double * real, double * imag)
{
 STATIC double a,b,c,d;
 STATIC double q,r,s,t;
 int i,j,k;

#ifdef DEBUG
 fht_first = 1;
#endif

 fht(real,n);

#ifdef DEBUG
 fht_first = 1;
#endif

 fht(imag,n);

 for (i = 1,j = n-1,k = n/2; i<k; i++,j--) {
  a = real[i]; b = real[j];  q = a+b; r = a-b;
  c = imag[i]; d = imag[j];  s = c+d; t = c-d;
  imag[i] = (s+r)*0.5;  imag[j] = (s-r)*0.5;
  real[i] = (q-t)*0.5;  real[j] = (q+t)*0.5;
 }

#ifdef DEBUG
  ADDR("ifft",a)
  ADDR("ifft",b)
  ADDR("ifft",c)
  ADDR("ifft",d)
  ADDR("ifft",q)
  ADDR("ifft",r)
  ADDR("ifft",s)
  ADDR("ifft",t)
#endif
}

/*
void realfft(int n, double * real)
{
 double a,b;
 int i,j,k;
 fht(real,n);
 for (i = 1, j = n-1, k = n/2; i < k; i++, j--) {
  a = real[i];
  b = real[j];
  real[j] = (a-b)*0.5;
  real[i] = (a+b)*0.5;
 }
}
*/

void fft(int n, double * real, double * imag)
{
 STATIC double a,b,c,d;
 STATIC double q,r,s,t;
 int i,j,k;

 for (i = 1,j = n-1,k = n/2; i<k; i++,j--) {
  a = real[i]; b = real[j];  q = a+b; r = a-b;
  c = imag[i]; d = imag[j];  s = c+d; t = c-d;
  real[i] = (q+t)*.5; real[j] = (q-t)*.5;
  imag[i] = (s-r)*.5; imag[j] = (s+r)*.5;
 }

#ifdef DEBUG
 fht_first = 1;
#endif

 fht(real,n);

#ifdef DEBUG
 fht_first = 1;
#endif

 fht(imag,n);

#ifdef DEBUG
  ADDR(" fft",a)
  ADDR(" fft",b)
  ADDR(" fft",c)
  ADDR(" fft",d)
  ADDR(" fft",q)
  ADDR(" fft",r)
  ADDR(" fft",s)
  ADDR(" fft",t)
#endif
}

/*
void realifft(int n, double * real)
{
 double a,b;
 int i,j,k;
 for (i = 1, j = n-1, k = n/2; i < k; i++, j--) {
  a = real[i];
  b = real[j];
  real[j] = (a-b);
  real[i] = (a+b);
 }
 fht(real,n);
}
*/
