/*----------------------------------------------------------------------------*/

/*
 * Performance-Monitoring Counters Library, for Intel/AMD Processors and Linux
 * Author:  Don Heller, dheller@scl.ameslab.gov
 * Last revised:  19 January 2001
 */

/*----------------------------------------------------------------------------*/

/* test memory access, print "interesting" events
 * compile:	gcc -DTEST=1 -O `pmc_options` -o mem_events mem_events.c -lpmc -lm
 * usage:	mem_events [min_length] [max_length] [alignment] [trials]
 *	min_length and max_length must be multiples of 8
 *	min_length must be greater than 16 (a length of exactly 16 is rejected)
 * examples, Pentium Pro/II/III:
 *	mem_events --e 0x62 --bus_agent 0,1 1m
 *	mem_events 1m
 * examples, Athlon:
 *	mem_events -g 0 1m
 */

/*----------------------------------------------------------------------------*/

/* default values */

/* TEST selects which body of access() is compiled:
 *   0      empty body (nothing is copied)
 *   1      assembly, rep/movsl
 *   2      assembly loop, one word per iteration
 *   3, 4   assembly loop, 2 or 4 copies per iteration, index incremented
 *          after each copy
 *   5      assembly loop, 4 words per iteration via displacements,
 *          one temporary register
 *   6, 7   assembly loop, 4 or 8 words per iteration, two temporary registers
 *   8      C loop, one word per iteration
 *   9-11   C loop, 2, 4 or 8 words per iteration
 * It can be overridden on the compiler command line (-DTEST=n).
 */
#ifndef TEST
#define TEST 2
#endif

#define MIN_LENGTH 128
#define MAX_LENGTH 128
	/* min and max number of words to copy */
	/* used when no lengths are given on the command line */

/* default alignment handed to pmc_calloc() for both buffers;
 * PMC_CACHE_LINE is not defined in this file (presumably from pmc_lib.h) */
#define ALIGN  PMC_CACHE_LINE

/* default number of measured calls to access() per counter selection */
#define TRIALS 100

#define CUTOFF 0.0001
	/* events per word copied, ignore anything less; 1/32 = 0.03125 */
	/* an event is printed only when its mean per word exceeds CUTOFF */

/* When USE_CORRECTION is defined, main() also measures a 16-word copy and
 * subtracts those statistics before dividing by (n - 16).  Disabled here. */
// #define USE_CORRECTION

/* Copy n words from src to dst using the variant selected by TEST;
 * defined at the end of this file. */
void access(int *src, int *dst, int n);
	/* n must be a multiple of 8, at least 8 */

/*----------------------------------------------------------------------------*/

#include <pmc_lib.h>
#include <ctype.h>
#include <limits.h>
#include <math.h>
#include <stdlib.h>

/*----------------------------------------------------------------------------*/

/*
 * Convert a leading run of decimal digits in `in` to an int, with an optional
 * single-letter scale suffix immediately after the digits:
 *   k or K  multiplies by 1024
 *   m or M  multiplies by 1024*1024
 * Anything after the suffix (or after the digits, if there is no suffix) is
 * ignored.  A string with no leading digit yields 0.  There is no sign
 * handling.  Values that do not fit in an int saturate at INT_MAX instead of
 * overflowing.
 */
int Atoi(char *in)
{
  int n = 0;

  /* isdigit() is only defined for unsigned char values and EOF, so the
   * (possibly signed) char must be converted before the call */
  while (isdigit((unsigned char) *in)) {
    int digit = *in - '0';

    /* would 10*n + digit exceed INT_MAX? */
    if (n > (INT_MAX - digit) / 10) { return INT_MAX; }

    n = 10*n + digit;
    in++;
  }

  if (*in == 'k' || *in == 'K') {
    n = (n > INT_MAX / 1024) ? INT_MAX : n * 1024;
  } else if (*in == 'm' || *in == 'M') {
    n = (n > INT_MAX / (1024*1024)) ? INT_MAX : n * (1024*1024);
  }

  return n;
}

/*----------------------------------------------------------------------------*/

/*
 * Measure performance-counter events and cycles per word copied by access().
 *
 * Positional arguments remaining after pmc_getargs() has processed the
 * command line:
 *   [min_length] [max_length] [alignment] [trials]
 * Lengths are in words; max_length defaults to min_length when only one
 * length is given.  Both lengths must be multiples of 8 and greater than 16.
 *
 * For every length n from min_length to max_length, and for every counter
 * selection in Ctl, access() is called once untimed and then `trials` times
 * between pmc_read() calls.  Per-word min/mean/max/std.dev. are printed for
 * each event whose mean exceeds CUTOFF, followed by the cycle statistics.
 *
 * Exits with RABBIT_FAILURE on bad arguments, allocation failure or failure
 * to open the counter device; RABBIT_SUCCESS otherwise.
 */
int main(int argc, char * argv[])
{
  pmc_control_t Ctl = pmc_control_null;

  int *src, *dst, incr = 8;
  int min_length = MIN_LENGTH, max_length = MAX_LENGTH;
  int align = ALIGN;
  int i,j,k, m = 0, n, out, trials = TRIALS;
  unsigned long src_addr, dst_addr;
  pmc_data_t t0, t1;
  pmc_counter_t c;

  /* initialize internal data structures, read command-line arguments */

  Ctl.clean = 10;
  Ctl.stats = 1;
  for (i = 0; i < pmc_event_counters; i++) { Ctl.os[i] = 0; }

  if (pmc_getargs(stderr, argv[0], &argc, &argv, &Ctl) == FALSE)
    { exit(RABBIT_FAILURE); }

  /* argc/argv were passed by address to pmc_getargs(); the code below treats
   * argv[0] as the first remaining positional argument */
  if (argc > 0) { min_length = Atoi(argv[0]); max_length = min_length; }
  if (argc > 1) { max_length = Atoi(argv[1]); }
  if (argc > 2) { align      = Atoi(argv[2]); }
  if (argc > 3) { trials     = Atoi(argv[3]); }

  printf("test memory access, #%d, length = %d : %d, alignment = %d, trials = %d\n",
    TEST, min_length, max_length, align, trials);
#ifdef USE_CORRECTION
  m = 16;   /* length of the reference copy subtracted from each measurement */
  printf("  correction from %d words\n", m);
#endif

  /* The unrolled access() variants copy up to 8 words per iteration, so a
   * length that is not a multiple of 8 would run past the end of the buffers.
   * Lengths must also exceed 16 (the correction length, when enabled). */
  if (min_length <= 16 || max_length <= 16 || max_length < min_length
      || min_length % 8 != 0 || max_length % 8 != 0
      || align < 0 || trials <= 0)
    {
      printf("  invalid request\n");
      exit(RABBIT_FAILURE);
    }

  src = (int *) pmc_calloc(max_length, 4, align);
  dst = (int *) pmc_calloc(max_length, 4, align);

  /* %lx requires unsigned long arguments; the difference wraps modulo
   * ULONG_MAX+1, which prints the same low-order hex digits as before */
  src_addr = (unsigned long) src; dst_addr = (unsigned long) dst;
  printf("  src = 0x%08lx, dst = 0x%08lx, dst-src = 0x%08lx\n",
    src_addr, dst_addr, dst_addr - src_addr);

  if (src == NULL || dst == NULL)
    {
      printf("  memory allocation failed\n");
      exit(RABBIT_FAILURE);
    }

  /* touch every word of both buffers before measuring */
  for (i = 0; i < max_length; i++) {
    dst[i] = src[i] = 0xbabef00d;
  }

  if (pmc_open(0) == FALSE)             /* open /dev/pmc */
    { exit(RABBIT_FAILURE); }

  /* the step between tested lengths grows with the length; every step is a
   * multiple of 8, so n stays a multiple of 8 */
  for (n = min_length; n <= max_length; n += incr) {
    if (n >  1*1024) { incr =  32; }
    if (n >  4*1024) { incr = 128; }
    if (n > 16*1024) { incr = 512; }

    /* NOTE(review): unlike `correction` below, c.clean and c.stats are never
     * set before this reset -- confirm pmc_counter_reset() does not need them */
    pmc_counter_reset(&c);      /* for the number of cycles */

    for (out = 0; out < Ctl.num_counters; out++) {
      pmc_counter_reset(&Ctl.counters[out]);
      pmc_select(&Ctl.counters[out]);

      access(src, dst, n);      /* first time doesn't count */

      for (k = 0; k < trials; k++) {
        pmc_read(&t0);

        access(src, dst, n);

        pmc_read(&t1);
        pmc_accumulate(&Ctl.counters[out], &t1, &t0);
        /* NOTE(review): NULL is passed instead of &t0 here -- confirm that
         * pmc_accumulate() gives the intended cycle count without a baseline */
        pmc_accumulate(&c, &t1, NULL);
      }
    }

    {
      int e;
      double Min, Mean, Max, StdDev, len;
      pmc_counter_t correction;

      printf("Copying %d words\n", n);
      printf("Events per word, cutoff %5.4f            Min      Mean     Max      Std.Dev.\n",
        CUTOFF);

#ifdef USE_CORRECTION
      /* statistics are reported per word beyond the m-word reference copy */
      len = (double) (n-m);
      for (i = 0, out = 0; i < Ctl.event_pairs; i++) {
        for (j = 0; j < Ctl.replication; j++, out++) {
          /* estimate function call overhead */
          correction.selector = Ctl.counters[out].selector;
          correction.clean = 10;
          correction.stats = 1;
          pmc_counter_reset(&correction);
          pmc_select(&correction);
          access(src, dst, m);  /* first time doesn't count */
          for (k = 0; k < trials; k++) {
             pmc_read(&t0);
             access(src, dst, m);
             pmc_read(&t1);
             pmc_accumulate(&correction, &t1, &t0);
          }

          for (k = 0; k < pmc_event_counters; k++) {
            Min  = ((double) pmc_min_events(&Ctl.counters[out],k) -
                    (double) pmc_min_events(&correction,k))             / len;
            Mean = (pmc_mean_events(&Ctl.counters[out],k) -
                    pmc_mean_events(&correction,k))                     / len;
            Max  = ((double) pmc_max_events(&Ctl.counters[out],k) -
                    (double) pmc_max_events(&correction,k))             / len;
            StdDev = (sqrt(pmc_variance_events(&Ctl.counters[out],k)) -
                      sqrt(pmc_variance_events(&correction,k)))         / len;
            if (Mean > CUTOFF) {
              e = Ctl.events[i][k];
              printf("0x%02x %3d %-30s %8.4f %8.4f %8.4f %10.6f\n",
                    e, e, pmc_event_name(e,k),
                    Min, Mean, Max, StdDev);
            }
          }
        }
      }
#else
      len = (double) n;
      /* `out` walks Ctl.counters in step with (event pair, replication) */
      for (i = 0, out = 0; i < Ctl.event_pairs; i++) {
        for (j = 0; j < Ctl.replication; j++, out++) {
          for (k = 0; k < pmc_event_counters; k++) {
            Min  = (double)    pmc_min_events(&Ctl.counters[out],k)     / len;
            Mean =            pmc_mean_events(&Ctl.counters[out],k)     / len;
            Max  = (double)    pmc_max_events(&Ctl.counters[out],k)     / len;
            StdDev = sqrt(pmc_variance_events(&Ctl.counters[out],k))    / len;
            if (Mean > CUTOFF) {
              e = Ctl.events[i][k];
              printf("0x%02x %3d %-30s %8.4f %8.4f %8.4f %10.6f\n",
                    e, e, pmc_event_name(e,k),
                    Min, Mean, Max, StdDev);
            }
          }
        }
      }
#endif

      printf("Cycles                                    Min      Mean      Max     Std.Dev.\n");
      Min  = (double)    pmc_min_cycles(&c)     / len;
      Mean =            pmc_mean_cycles(&c)     / len;
      Max  = (double)    pmc_max_cycles(&c)     / len;
      StdDev = sqrt(pmc_variance_cycles(&c))    / len;
      printf("cycles                                  %8.4f %8.4f %8.4f %10.6f\n",
                    Min, Mean, Max, StdDev);

      fflush(stdout);
    }
  }

  pmc_close();                          /* close /dev/pmc */

  exit(RABBIT_SUCCESS);
}

/*----------------------------------------------------------------------------*/

/*
 * Copy n 32-bit words from src to dst using the variant selected at compile
 * time by TEST (0..11):
 *   0      nothing at all
 *   1      rep/movsl
 *   2      assembly loop, one word per iteration
 *   3, 4   assembly loop, 2 or 4 copies per iteration, index incremented
 *          after each copy
 *   5      assembly loop, 4 words per iteration via displacements, using eax
 *   6, 7   assembly loop, 4 or 8 words per iteration, using eax and ebx
 *   8      C loop, one word per iteration
 *   9-11   C loop, 2, 4 or 8 words per iteration
 *
 * n must be a multiple of 8, at least 8: the unrolled variants copy a whole
 * group of words per iteration, and every assembly loop tests its condition
 * at the bottom, so one iteration always executes.
 *
 * The assembly variants use 32-bit x86 registers and AT&T syntax.  Each one
 * lists "memory" as clobbered because it reads *src and writes *dst through
 * registers rather than through declared operands; without it the compiler
 * may assume the buffers are untouched by the asm statement.
 */
void access(int *src, int *dst, int n)
{
#if TEST < 0 || TEST > 11
#error "TEST must be in the range 0..11"
#endif

#if TEST == 0
  /* nothing at all */
#endif

#if TEST == 1
  __asm__ __volatile__ (
    "movl %0,%%esi\n\t"         /* src */
    "movl %1,%%edi\n\t"         /* dst */
    "movl %2,%%ecx\n\t"         /* n */

    "cld\n\t"           /* rep/movsl will increment esi, edi by 4 */
    "rep\n\t"
    "movsl"
    :                                   /* output */
    : "g" (src), "g" (dst), "g" (n)     /* input */
    : "esi", "edi", "ecx", "memory"     /* clobbered */
    );
#endif

#if TEST == 2
  __asm__ __volatile__ (
    "movl %0,%%esi\n\t"         /* src */
    "movl %1,%%edi\n\t"         /* dst */
    "movl %2,%%ecx\n\t"         /* n */

    "xorl %%edx,%%edx\n\t"      /* int j = 0 */

    ".align 4\n"
"1:\n\t"
    "movl (%%esi,%%edx,4),%%eax\n\t"    /* eax = src[j] */
    "movl %%eax,(%%edi,%%edx,4)\n\t"    /* dst[j] = eax */
    "incl %%edx\n\t"                    /* j++ */
    "cmpl %%ecx,%%edx\n\t"              /* j cmp n */
    "jl 1b"
    :
    : "g" (src), "g" (dst), "g" (n)
    : "esi", "edi", "ecx", "eax", "edx", "memory"
    );
#endif

#if TEST == 3
  __asm__ __volatile__ (
    "movl %0,%%esi\n\t"     /* src */
    "movl %1,%%edi\n\t"     /* dst */
    "movl %2,%%ecx\n\t"     /* n */

    "xorl %%edx,%%edx\n\t"  /* int j = 0 */

    ".align 4\n"
"1:\n\t"
    "movl (%%esi,%%edx,4),%%eax\n\t"        /* eax = src[j] */
    "movl %%eax,(%%edi,%%edx,4)\n\t"        /* dst[j] = eax */
    "incl %%edx\n\t"                        /* j++ */
    "movl (%%esi,%%edx,4),%%eax\n\t"        /* eax = src[j] */
    "movl %%eax,(%%edi,%%edx,4)\n\t"        /* dst[j] = eax */
    "incl %%edx\n\t"                        /* j++ */
    "cmpl %%ecx,%%edx\n\t"                  /* j cmp n */
    "jl 1b"
    :
    : "g" (src), "g" (dst), "g" (n)
    : "esi", "edi", "ecx", "eax", "edx", "memory"
    );
#endif

#if TEST == 4
  __asm__ __volatile__ (
    "movl %0,%%esi\n\t"     /* src */
    "movl %1,%%edi\n\t"     /* dst */
    "movl %2,%%ecx\n\t"     /* n */

    "xorl %%edx,%%edx\n\t"  /* int j = 0 */

    ".align 4\n"
"1:\n\t"
    "movl (%%esi,%%edx,4),%%eax\n\t"        /* eax = src[j] */
    "movl %%eax,(%%edi,%%edx,4)\n\t"        /* dst[j] = eax */
    "incl %%edx\n\t"                        /* j++ */
    "movl (%%esi,%%edx,4),%%eax\n\t"        /* eax = src[j] */
    "movl %%eax,(%%edi,%%edx,4)\n\t"        /* dst[j] = eax */
    "incl %%edx\n\t"                        /* j++ */
    "movl (%%esi,%%edx,4),%%eax\n\t"        /* eax = src[j] */
    "movl %%eax,(%%edi,%%edx,4)\n\t"        /* dst[j] = eax */
    "incl %%edx\n\t"                        /* j++ */
    "movl (%%esi,%%edx,4),%%eax\n\t"        /* eax = src[j] */
    "movl %%eax,(%%edi,%%edx,4)\n\t"        /* dst[j] = eax */
    "incl %%edx\n\t"                        /* j++ */
    "cmpl %%ecx,%%edx\n\t"                  /* j cmp n */
    "jl 1b"
    :
    : "g" (src), "g" (dst), "g" (n)
    : "esi", "edi", "ecx", "eax", "edx", "memory"
    );
#endif

#if TEST == 5
  __asm__ __volatile__ (
    "movl %0,%%esi\n\t"     /* src */
    "movl %1,%%edi\n\t"     /* dst */
    "movl %2,%%ecx\n\t"     /* n */

    "xorl %%edx,%%edx\n\t"  /* int j = 0 */

    ".align 4\n"
"1:\n\t"
    "movl (%%esi,%%edx,4),%%eax\n\t"        /* eax = src[j] */
    "movl %%eax,(%%edi,%%edx,4)\n\t"        /* dst[j] = eax */
    "movl 4(%%esi,%%edx,4),%%eax\n\t"       /* eax = src[j+1] */
    "movl %%eax,4(%%edi,%%edx,4)\n\t"       /* dst[j+1] = eax */
    "movl 8(%%esi,%%edx,4),%%eax\n\t"       /* eax = src[j+2] */
    "movl %%eax,8(%%edi,%%edx,4)\n\t"       /* dst[j+2] = eax */
    "movl 12(%%esi,%%edx,4),%%eax\n\t"      /* eax = src[j+3] */
    "movl %%eax,12(%%edi,%%edx,4)\n\t"      /* dst[j+3] = eax */
    "addl $4,%%edx\n\t"                     /* j += 4 */
    "cmpl %%ecx,%%edx\n\t"                  /* j cmp n */
    "jl 1b"
    :
    : "g" (src), "g" (dst), "g" (n)
    : "esi", "edi", "ecx", "eax", "edx", "memory"
    );
#endif

#if TEST == 6
  __asm__ __volatile__ (
    "movl %0,%%esi\n\t"     /* src */
    "movl %1,%%edi\n\t"     /* dst */
    "movl %2,%%ecx\n\t"     /* n */

    "xorl %%edx,%%edx\n\t"  /* int j = 0 */

    ".align 4\n"
"1:\n\t"
    "movl (%%esi,%%edx,4),%%eax\n\t"        /* eax = src[j] */
    "movl 4(%%esi,%%edx,4),%%ebx\n\t"       /* ebx = src[j+1] */
    "movl %%eax,(%%edi,%%edx,4)\n\t"        /* dst[j] = eax */
    "movl %%ebx,4(%%edi,%%edx,4)\n\t"       /* dst[j+1] = ebx */
    "movl 8(%%esi,%%edx,4),%%eax\n\t"       /* eax = src[j+2] */
    "movl 12(%%esi,%%edx,4),%%ebx\n\t"      /* ebx = src[j+3] */
    "movl %%eax,8(%%edi,%%edx,4)\n\t"       /* dst[j+2] = eax */
    "movl %%ebx,12(%%edi,%%edx,4)\n\t"      /* dst[j+3] = ebx */
    "addl $4,%%edx\n\t"                     /* j += 4 */
    "cmpl %%ecx,%%edx\n\t"                  /* j cmp n */
    "jl 1b"
    :
    : "g" (src), "g" (dst), "g" (n)
    : "esi", "edi", "ecx", "eax", "ebx", "edx", "memory"
    );
#endif

#if TEST == 7
  __asm__ __volatile__ (
    "movl %0,%%esi\n\t"     /* src */
    "movl %1,%%edi\n\t"     /* dst */
    "movl %2,%%ecx\n\t"     /* n */

    "xorl %%edx,%%edx\n\t"  /* int j = 0 */

    ".align 4\n"
"1:\n\t"
    "movl (%%esi,%%edx,4),%%eax\n\t"        /* eax = src[j] */
    "movl 4(%%esi,%%edx,4),%%ebx\n\t"       /* ebx = src[j+1] */
    "movl %%eax,(%%edi,%%edx,4)\n\t"        /* dst[j] = eax */
    "movl %%ebx,4(%%edi,%%edx,4)\n\t"       /* dst[j+1] = ebx */
    "movl 8(%%esi,%%edx,4),%%eax\n\t"       /* eax = src[j+2] */
    "movl 12(%%esi,%%edx,4),%%ebx\n\t"      /* ebx = src[j+3] */
    "movl %%eax,8(%%edi,%%edx,4)\n\t"       /* dst[j+2] = eax */
    "movl %%ebx,12(%%edi,%%edx,4)\n\t"      /* dst[j+3] = ebx */
    "movl 16(%%esi,%%edx,4),%%eax\n\t"      /* eax = src[j+4] */
    "movl 20(%%esi,%%edx,4),%%ebx\n\t"      /* ebx = src[j+5] */
    "movl %%eax,16(%%edi,%%edx,4)\n\t"      /* dst[j+4] = eax */
    "movl %%ebx,20(%%edi,%%edx,4)\n\t"      /* dst[j+5] = ebx */
    "movl 24(%%esi,%%edx,4),%%eax\n\t"      /* eax = src[j+6] */
    "movl 28(%%esi,%%edx,4),%%ebx\n\t"      /* ebx = src[j+7] */
    "movl %%eax,24(%%edi,%%edx,4)\n\t"      /* dst[j+6] = eax */
    "movl %%ebx,28(%%edi,%%edx,4)\n\t"      /* dst[j+7] = ebx */
    "addl $8,%%edx\n\t"                     /* j += 8 */
    "cmpl %%ecx,%%edx\n\t"                  /* j cmp n */
    "jl 1b"
    :
    : "g" (src), "g" (dst), "g" (n)
    : "esi", "edi", "ecx", "eax", "ebx", "edx", "memory"
    );
#endif

#if TEST == 8
  register int i;
  for (i = 0; i < n; i++) { dst[i] = src[i]; }
#endif

#if TEST == 9
  register int i;
  for (i = 0; i < n; i += 2) { dst[i] = src[i]; dst[i+1] = src[i+1]; }
#endif

#if TEST == 10
  register int i;
  for (i = 0; i < n; i += 4) {
    dst[i] = src[i];     dst[i+1] = src[i+1];
    dst[i+2] = src[i+2]; dst[i+3] = src[i+3];
  }
#endif

#if TEST == 11
  register int i;
  for (i = 0; i < n; i += 8) {
    dst[i] = src[i];     dst[i+1] = src[i+1];
    dst[i+2] = src[i+2]; dst[i+3] = src[i+3];
    dst[i+4] = src[i+4]; dst[i+5] = src[i+5];
    dst[i+6] = src[i+6]; dst[i+7] = src[i+7];
  }
#endif

  return;
}

/*----------------------------------------------------------------------------*/
