/*----------------------------------------------------------------------------*/

/*
 * Performance-Monitoring Counters Library, for Intel/AMD Processors and Linux
 * Author:  Don Heller, dheller@scl.ameslab.gov
 * Last revised:  18 May 2001
 */

/*----------------------------------------------------------------------------*/

/*
 * gcc -O `pmc_options` -o mem_test mem_test.c -lpmc
 * mem_test [1]
 *   'mem_test 1' will flush the cache before the timing test
 */

/*
 * Memory size, in MB, reported by main() and used to bound the largest
 * copy: the final sweep in main() runs up to MEM/2 megabytes per buffer.
 */
#define MEM 16

/*----------------------------------------------------------------------------*/

#include <pmc_private.h>

/*
 * Time several src -> dst copy kernels over `words` 32-bit words and print
 * min/mean/max cycles per word for each.  A nonzero `flush` requests a cache
 * flush before every trial; words <= 0 releases the internal buffers.
 */
void pmc_guess_memory_speed(const int flush, const int words);

/*----------------------------------------------------------------------------*/

/*
 * Program entry point.
 *
 * Parses the command line with pmc_getargs(), opens the library with
 * pmc_open(), prints a processor speed estimate, and then runs
 * pmc_guess_memory_speed() over three sweeps of copy sizes:
 *   - 8 words up to half of the L1 data cache, in steps of 8 words;
 *   - from there up to half of the L2 cache, in steps of 32 words;
 *   - from there up to half of MEM megabytes, in steps of 128 words.
 *
 * If the first remaining argument starts with '1', a cache flush is
 * attempted once here; when that succeeds, every timing trial is also
 * preceded by a flush.
 * NOTE(review): presumably pmc_getargs() strips the program name so that
 * argv[0] is the first user argument - confirm against libpmc.
 *
 * Exits with status 1 if argument parsing or pmc_open() fails, 0 otherwise.
 */
int main(int argc, char *argv[])
{
  pmc_control_t control = pmc_control_null;
  const int mem_mb = MEM;
  int do_flush = FALSE;
  int l1_kb, l2_kb;
  int l1_words, l2_words, mem_words;
  int n;

  if (pmc_getargs(stdout, "mem_test", &argc, &argv, &control) == FALSE)
    {
      exit(1);
    }

  if (pmc_open(0) == FALSE)
    {
      exit(1);
    }

  printf("\nprocessor speed estimate: %d MHz\n", pmc_guess_mhz(5));

  printf("\nmemory speed estimates:\n");

  /* flushing is enabled only if it was requested AND the trial flush worked */
  if (argv != NULL && argv[0] != NULL && argv[0][0] == '1')
    {
      const int rc = pmc_configure(PMC_CONFIGURE_CACHE, PMC_CONFIGURE_FLUSH);

      if (rc != -1)
        {
          do_flush = TRUE;
        }
    }

  if (!do_flush)
    {
      printf("  no cache flush before test\n");
    }
  else
    {
      printf("  cache flush before test\n");
    }

  l1_kb = pmc_cache_size(1);
  l2_kb = pmc_cache_size(2);

  printf("  L1 dcache = %d KB, L2 = %d KB, mem = %d MB\n", l1_kb, l2_kb, mem_mb);

  /* sweep limits in words: half of each capacity, since src and dst share it */
  l1_words = l1_kb*1024/2/sizeof(int);
  l2_words = l2_kb*1024/2/sizeof(int);
  mem_words = mem_mb*1024*1024/2/sizeof(int);

  /* sizes that fit in L1 */
  n = 8;
  while (n < l1_words)
    {
      pmc_guess_memory_speed(do_flush, n);
      n += 8;
    }

  /* sizes that fit in L2 */
  n = l1_words;
  while (n < l2_words)
    {
      pmc_guess_memory_speed(do_flush, n);
      n += 32;
    }

  /* sizes that spill to main memory (upper limit inclusive) */
  n = l2_words;
  while (n <= mem_words)
    {
      pmc_guess_memory_speed(do_flush, n);
      n += 128;
    }

  pmc_guess_memory_speed(do_flush, 0);	/* release allocated storage */

  pmc_close();

  exit(0);
}

/*----------------------------------------------------------------------------*/

/*
 * Reset *c and select the clock, so that it can accumulate a fresh set of
 * timing trials.
 * NOTE(review): the meaning of clean = 10 and stats = 1 is defined by
 * libpmc and is not visible here - confirm against pmc_private.h.
 */
static void mem_test_counter_reset(pmc_counter_t *c)
{
  pmc_counter_zero(c);
  c->clean = 10;
  c->stats = 1;
  pmc_select_clock(c);
}

/*
 * Print one result line: the (pre-padded) kernel label followed by the
 * min, mean and max cycle counts of *c, each divided by dwords.
 */
static void mem_test_report(const char *label, const pmc_counter_t *c,
  const double dwords)
{
  printf("%s %.3f %.3f %.3f\n", label,
    (double)(c->min.cycles)/dwords, c->mean.cycles/dwords,
    (double)(c->max.cycles)/dwords);
}

/*
 * Time a set of src -> dst copy kernels and print cycles per word for each.
 *
 *   flush  nonzero: request a cache flush (pmc_configure) before every trial
 *   words  number of 32-bit words to copy; words <= 0 only releases the
 *          buffers kept between calls
 *
 * The source and destination buffers are static and are reused across
 * calls; they are reallocated (in 32K-word chunks) only when a call needs
 * more room than is currently held.  On allocation failure a message is
 * printed, the buffers are released and the function returns without timing.
 *
 * Kernels timed, 10 trials each: rep/movsl, movl unrolled 1/2/4/8 times
 * (three variants of 4x), fldl/fstpl unrolled 1/2/4 times, and memcpy().
 *
 * The unrolled kernels are do-while loops that copy in groups of up to 8
 * words, so they may touch up to 7 words beyond `words` when it is not a
 * multiple of 8.  The buffers are therefore sized from `words` rounded up
 * to a multiple of 8.  The reported cycles-per-word figures always divide
 * by `words` itself.
 *
 * The inline assembly is i386-specific (32-bit registers and pointers).
 * NOTE(review): the fldl kernels push up to four values on the x87 register
 * stack without declaring any st(i) clobber; this presumes the compiler
 * leaves that many slots free across the asm - confirm.
 */
void pmc_guess_memory_speed(const int flush, const int words)
{
  static int *src = NULL, *dst = NULL, size = 0;

  pmc_data_t t[2];
  pmc_counter_t c;
  int i, trials = 10;
  double dwords = (double) words;
  const int alloc = 32*1024;		/* allocation chunk */
  const int align = sizeof(int);	/* allocation alignment */
  const int unroll = 8;			/* widest kernel: words per loop pass */
  int need;				/* words the kernels may touch */

  if (words <= 0)	/* cleanup */
    {
      pmc_free(src); src = NULL;
      pmc_free(dst); dst = NULL;
      size = 0;
      return;
    }

  /* round up so the unrolled kernels never run past the end of a buffer */
  need = (words + unroll - 1)/unroll*unroll;

  if (size < need)
    {
      pmc_free(src); src = NULL;
      pmc_free(dst); dst = NULL;
      size = alloc*(need/alloc + 1);
    }

  printf("  copying %d words; %d trials; min, mean, max cycles per word\n",
    words, trials);

  if (src == NULL)
    {
      unsigned long int s, d;
      /* 4 = bytes per word, matching the movsl / scale-4 addressing below */
      s = (unsigned long int) (src = (int *) pmc_calloc(size, 4, align));
      d = (unsigned long int) (dst = (int *) pmc_calloc(size, 4, align));
      printf("    allocated %d words, src, dst = 0x%08lx, 0x%08lx, dst - src = 0x%08lx\n",
	size, s, d, d-s);
    }

  if (src == NULL || dst == NULL)
    {
      pmc_free(src); src = NULL;
      pmc_free(dst); dst = NULL;
      size = 0;
      printf("    allocation failed\n");
      return;
    }

  /*
   * Every asm kernel below reads src[] and writes dst[] through registers
   * only, so each one declares a "memory" clobber (plus "cc" for the
   * flags changed by the loop arithmetic).
   */

  /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

  mem_test_counter_reset(&c);

  /* note - pmc_read_clock() is always inline */

  /* test memory speed using rep/movsl */
  for (i = 0; i < trials; i++)
    {
      if (flush) pmc_configure(PMC_CONFIGURE_CACHE, PMC_CONFIGURE_FLUSH);

      pmc_read_clock(&t[0]);

      __asm__ __volatile__ (
	"movl %0,%%esi\n\t"	/* src */
	"movl %1,%%edi\n\t"	/* dst */
	"movl %2,%%ecx\n\t"	/* words */

	"cld\n\t"
	"rep\n\t"
	"movsl"
	:
	: "g" (src), "g" (dst), "g" (words)
	: "esi", "edi", "ecx", "cc", "memory"
	);

      pmc_read_clock(&t[1]);

      pmc_accumulate_clock(&c, &t[1], &t[0]);
    }

  mem_test_report("     movsl", &c, dwords);

  /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

  mem_test_counter_reset(&c);

  /* test memory speed using movl */
  for (i = 0; i < trials; i++)
    {
      if (flush) pmc_configure(PMC_CONFIGURE_CACHE, PMC_CONFIGURE_FLUSH);

      pmc_read_clock(&t[0]);

      __asm__ __volatile__ (
	"movl %0,%%esi\n\t"	/* src */
	"movl %1,%%edi\n\t"	/* dst */
	"movl %2,%%ecx\n\t"	/* words */

	"xorl %%edx,%%edx\n\t"	/* int j = 0 */

	".p2align 4,,7\n"
"1:\n\t"
	"movl (%%esi,%%edx,4),%%eax\n\t"	/* eax = src[j] */
	"movl %%eax,(%%edi,%%edx,4)\n\t"	/* dst[j] = eax */
	"incl %%edx\n\t"			/* j++ */
	"cmpl %%ecx,%%edx\n\t"			/* j cmp words */
	"jl 1b"
	:
	: "g" (src), "g" (dst), "g" (words)
	: "esi", "edi", "ecx", "eax", "edx", "cc", "memory"
	);

      pmc_read_clock(&t[1]);

      pmc_accumulate_clock(&c, &t[1], &t[0]);
    }

  mem_test_report("    1xmovl", &c, dwords);

  /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

  mem_test_counter_reset(&c);

  /* test memory speed using movl unrolled twice */
  for (i = 0; i < trials; i++)
    {
      if (flush) pmc_configure(PMC_CONFIGURE_CACHE, PMC_CONFIGURE_FLUSH);

      pmc_read_clock(&t[0]);

      __asm__ __volatile__ (
	"movl %0,%%esi\n\t"	/* src */
	"movl %1,%%edi\n\t"	/* dst */
	"movl %2,%%ecx\n\t"	/* words */

	"xorl %%edx,%%edx\n\t"	/* int j = 0 */

	".p2align 4,,7\n"
"1:\n\t"
	"movl (%%esi,%%edx,4),%%eax\n\t"	/* eax = src[j] */
	"movl %%eax,(%%edi,%%edx,4)\n\t"	/* dst[j] = eax */
	"incl %%edx\n\t"			/* j++ */
	"movl (%%esi,%%edx,4),%%eax\n\t"	/* eax = src[j] */
	"movl %%eax,(%%edi,%%edx,4)\n\t"	/* dst[j] = eax */
	"incl %%edx\n\t"			/* j++ */
	"cmpl %%ecx,%%edx\n\t"			/* j cmp words */
	"jl 1b"
	:
	: "g" (src), "g" (dst), "g" (words)
	: "esi", "edi", "ecx", "eax", "edx", "cc", "memory"
	);

      pmc_read_clock(&t[1]);

      pmc_accumulate_clock(&c, &t[1], &t[0]);
    }

  mem_test_report("    2xmovl", &c, dwords);

  /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

  mem_test_counter_reset(&c);

  /* test memory speed using movl unrolled 4 times, index bumped per word */
  for (i = 0; i < trials; i++)
    {
      if (flush) pmc_configure(PMC_CONFIGURE_CACHE, PMC_CONFIGURE_FLUSH);

      pmc_read_clock(&t[0]);

      __asm__ __volatile__ (
	"movl %0,%%esi\n\t"	/* src */
	"movl %1,%%edi\n\t"	/* dst */
	"movl %2,%%ecx\n\t"	/* words */

	"xorl %%edx,%%edx\n\t"	/* int j = 0 */

	".p2align 4,,7\n"
"1:\n\t"
	"movl (%%esi,%%edx,4),%%eax\n\t"	/* eax = src[j] */
	"movl %%eax,(%%edi,%%edx,4)\n\t"	/* dst[j] = eax */
	"incl %%edx\n\t"			/* j++ */
	"movl (%%esi,%%edx,4),%%eax\n\t"	/* eax = src[j] */
	"movl %%eax,(%%edi,%%edx,4)\n\t"	/* dst[j] = eax */
	"incl %%edx\n\t"			/* j++ */
	"movl (%%esi,%%edx,4),%%eax\n\t"	/* eax = src[j] */
	"movl %%eax,(%%edi,%%edx,4)\n\t"	/* dst[j] = eax */
	"incl %%edx\n\t"			/* j++ */
	"movl (%%esi,%%edx,4),%%eax\n\t"	/* eax = src[j] */
	"movl %%eax,(%%edi,%%edx,4)\n\t"	/* dst[j] = eax */
	"incl %%edx\n\t"			/* j++ */
	"cmpl %%ecx,%%edx\n\t"			/* j cmp words */
	"jl 1b"
	:
	: "g" (src), "g" (dst), "g" (words)
	: "esi", "edi", "ecx", "eax", "edx", "cc", "memory"
	);

      pmc_read_clock(&t[1]);

      pmc_accumulate_clock(&c, &t[1], &t[0]);
    }

  mem_test_report("   4axmovl", &c, dwords);

  /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

  mem_test_counter_reset(&c);

  /* test memory speed using movl unrolled 4 times, displacement addressing */
  for (i = 0; i < trials; i++)
    {
      if (flush) pmc_configure(PMC_CONFIGURE_CACHE, PMC_CONFIGURE_FLUSH);

      pmc_read_clock(&t[0]);

      __asm__ __volatile__ (
	"movl %0,%%esi\n\t"	/* src */
	"movl %1,%%edi\n\t"	/* dst */
	"movl %2,%%ecx\n\t"	/* words */

	"xorl %%edx,%%edx\n\t"	/* int j = 0 */

	".p2align 4,,7\n"
"1:\n\t"
	"movl (%%esi,%%edx,4),%%eax\n\t"	/* eax = src[j] */
	"movl %%eax,(%%edi,%%edx,4)\n\t"	/* dst[j] = eax */
	"movl 4(%%esi,%%edx,4),%%eax\n\t"	/* eax = src[j+1] */
	"movl %%eax,4(%%edi,%%edx,4)\n\t"	/* dst[j+1] = eax */
	"movl 8(%%esi,%%edx,4),%%eax\n\t"	/* eax = src[j+2] */
	"movl %%eax,8(%%edi,%%edx,4)\n\t"	/* dst[j+2] = eax */
	"movl 12(%%esi,%%edx,4),%%eax\n\t"	/* eax = src[j+3] */
	"movl %%eax,12(%%edi,%%edx,4)\n\t"	/* dst[j+3] = eax */
	"addl $4,%%edx\n\t"			/* j += 4 */
	"cmpl %%ecx,%%edx\n\t"			/* j cmp words */
	"jl 1b"
	:
	: "g" (src), "g" (dst), "g" (words)
	: "esi", "edi", "ecx", "eax", "edx", "cc", "memory"
	);

      pmc_read_clock(&t[1]);

      pmc_accumulate_clock(&c, &t[1], &t[0]);
    }

  mem_test_report("   4bxmovl", &c, dwords);

  /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

  mem_test_counter_reset(&c);

  /* test memory speed using movl unrolled 4 times, two data registers */
  for (i = 0; i < trials; i++)
    {
      if (flush) pmc_configure(PMC_CONFIGURE_CACHE, PMC_CONFIGURE_FLUSH);

      pmc_read_clock(&t[0]);

      __asm__ __volatile__ (
	"movl %0,%%esi\n\t"	/* src */
	"movl %1,%%edi\n\t"	/* dst */
	"movl %2,%%ecx\n\t"	/* words */

	"xorl %%edx,%%edx\n\t"	/* int j = 0 */

	".p2align 4,,7\n"
"1:\n\t"
	"movl (%%esi,%%edx,4),%%eax\n\t"	/* eax = src[j] */
	"movl 4(%%esi,%%edx,4),%%ebx\n\t"	/* ebx = src[j+1] */
	"movl %%eax,(%%edi,%%edx,4)\n\t"	/* dst[j] = eax */
	"movl %%ebx,4(%%edi,%%edx,4)\n\t"	/* dst[j+1] = ebx */
	"movl 8(%%esi,%%edx,4),%%eax\n\t"	/* eax = src[j+2] */
	"movl 12(%%esi,%%edx,4),%%ebx\n\t"	/* ebx = src[j+3] */
	"movl %%eax,8(%%edi,%%edx,4)\n\t"	/* dst[j+2] = eax */
	"movl %%ebx,12(%%edi,%%edx,4)\n\t"	/* dst[j+3] = ebx */
	"addl $4,%%edx\n\t"			/* j += 4 */
	"cmpl %%ecx,%%edx\n\t"			/* j cmp words */
	"jl 1b"
	:
	: "g" (src), "g" (dst), "g" (words)
	: "esi", "edi", "ecx", "eax", "ebx", "edx", "cc", "memory"
	);

      pmc_read_clock(&t[1]);

      pmc_accumulate_clock(&c, &t[1], &t[0]);
    }

  mem_test_report("   4cxmovl", &c, dwords);

  /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

  mem_test_counter_reset(&c);

  /* test memory speed using movl unrolled 8 times, two data registers */
  for (i = 0; i < trials; i++)
    {
      if (flush) pmc_configure(PMC_CONFIGURE_CACHE, PMC_CONFIGURE_FLUSH);

      pmc_read_clock(&t[0]);

      __asm__ __volatile__ (
	"movl %0,%%esi\n\t"	/* src */
	"movl %1,%%edi\n\t"	/* dst */
	"movl %2,%%ecx\n\t"	/* words */

	"xorl %%edx,%%edx\n\t"	/* int j = 0 */

	".p2align 4,,7\n"
"1:\n\t"
	"movl (%%esi,%%edx,4),%%eax\n\t"	/* eax = src[j] */
	"movl 4(%%esi,%%edx,4),%%ebx\n\t"	/* ebx = src[j+1] */
	"movl %%eax,(%%edi,%%edx,4)\n\t"	/* dst[j] = eax */
	"movl %%ebx,4(%%edi,%%edx,4)\n\t"	/* dst[j+1] = ebx */
	"movl 8(%%esi,%%edx,4),%%eax\n\t"	/* eax = src[j+2] */
	"movl 12(%%esi,%%edx,4),%%ebx\n\t"	/* ebx = src[j+3] */
	"movl %%eax,8(%%edi,%%edx,4)\n\t"	/* dst[j+2] = eax */
	"movl %%ebx,12(%%edi,%%edx,4)\n\t"	/* dst[j+3] = ebx */
	"movl 16(%%esi,%%edx,4),%%eax\n\t"	/* eax = src[j+4] */
	"movl 20(%%esi,%%edx,4),%%ebx\n\t"	/* ebx = src[j+5] */
	"movl %%eax,16(%%edi,%%edx,4)\n\t"	/* dst[j+4] = eax */
	"movl %%ebx,20(%%edi,%%edx,4)\n\t"	/* dst[j+5] = ebx */
	"movl 24(%%esi,%%edx,4),%%eax\n\t"	/* eax = src[j+6] */
	"movl 28(%%esi,%%edx,4),%%ebx\n\t"	/* ebx = src[j+7] */
	"movl %%eax,24(%%edi,%%edx,4)\n\t"	/* dst[j+6] = eax */
	"movl %%ebx,28(%%edi,%%edx,4)\n\t"	/* dst[j+7] = ebx */
	"addl $8,%%edx\n\t"			/* j += 8 */
	"cmpl %%ecx,%%edx\n\t"			/* j cmp words */
	"jl 1b"
	:
	: "g" (src), "g" (dst), "g" (words)
	: "esi", "edi", "ecx", "eax", "ebx", "edx", "cc", "memory"
	);

      pmc_read_clock(&t[1]);

      pmc_accumulate_clock(&c, &t[1], &t[0]);
    }

  mem_test_report("    8xmovl", &c, dwords);

  /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

  mem_test_counter_reset(&c);

  /* test memory speed using fld (8 bytes = 2 words per load/store) */
  for (i = 0; i < trials; i++)
    {
      if (flush) pmc_configure(PMC_CONFIGURE_CACHE, PMC_CONFIGURE_FLUSH);

      pmc_read_clock(&t[0]);

      __asm__ __volatile__ (
	"movl %0,%%esi\n\t"	/* src */
	"movl %1,%%edi\n\t"	/* dst */
	"movl %2,%%ecx\n\t"	/* words/2 */

	"xorl %%edx,%%edx\n\t"	/* int j = 0 */

	".p2align 4,,7\n"
"1:\n\t"
	"fldl (%%esi,%%edx,8)\n\t"		/* src[2*j], src[2*j+1] */
	"fstpl (%%edi,%%edx,8)\n\t"		/* dst[2*j], dst[2*j+1] */
	"incl %%edx\n\t"			/* j++ */
	"cmpl %%ecx,%%edx\n\t"			/* j cmp words/2 */
	"jl 1b"
	:
	: "g" (src), "g" (dst), "g" (words/2)
	: "esi", "edi", "ecx", "edx", "cc", "memory"
	);

      pmc_read_clock(&t[1]);

      pmc_accumulate_clock(&c, &t[1], &t[0]);
    }

  mem_test_report("    1xfldl", &c, dwords);

  /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

  mem_test_counter_reset(&c);

  /* test memory speed using fld unrolled twice */
  for (i = 0; i < trials; i++)
    {
      if (flush) pmc_configure(PMC_CONFIGURE_CACHE, PMC_CONFIGURE_FLUSH);

      pmc_read_clock(&t[0]);

      __asm__ __volatile__ (
	"movl %0,%%esi\n\t"	/* src */
	"movl %1,%%edi\n\t"	/* dst */
	"movl %2,%%ecx\n\t"	/* words/2 */

	"xorl %%edx,%%edx\n\t"	/* int j = 0 */

	".p2align 4,,7\n"
"1:\n\t"
	"fldl (%%esi,%%edx,8)\n\t"		/* src[2*j], src[2*j+1] */
	"fldl 8(%%esi,%%edx,8)\n\t"		/* src[2*j+2], src[2*j+3] */
	"fstpl 8(%%edi,%%edx,8)\n\t"		/* dst[2*j+2], dst[2*j+3] */
	"fstpl (%%edi,%%edx,8)\n\t"		/* dst[2*j], dst[2*j+1] */
	"addl $2,%%edx\n\t"			/* j += 2 */
	"cmpl %%ecx,%%edx\n\t"			/* j cmp words/2 */
	"jl 1b"
	:
	: "g" (src), "g" (dst), "g" (words/2)
	: "esi", "edi", "ecx", "edx", "cc", "memory"
	);

      pmc_read_clock(&t[1]);

      pmc_accumulate_clock(&c, &t[1], &t[0]);
    }

  mem_test_report("    2xfldl", &c, dwords);

  /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

  mem_test_counter_reset(&c);

  /* test memory speed using fld unrolled 4 times */
  for (i = 0; i < trials; i++)
    {
      if (flush) pmc_configure(PMC_CONFIGURE_CACHE, PMC_CONFIGURE_FLUSH);

      pmc_read_clock(&t[0]);

      __asm__ __volatile__ (
	"movl %0,%%esi\n\t"	/* src */
	"movl %1,%%edi\n\t"	/* dst */
	"movl %2,%%ecx\n\t"	/* words/2 */

	"xorl %%edx,%%edx\n\t"	/* int j = 0 */

	".p2align 4,,7\n"
"1:\n\t"
	"fldl (%%esi,%%edx,8)\n\t"		/* src[2*j], src[2*j+1] */
	"fldl 8(%%esi,%%edx,8)\n\t"		/* src[2*j+2], src[2*j+3] */
	"fldl 16(%%esi,%%edx,8)\n\t"		/* src[2*j+4], src[2*j+5] */
	"fldl 24(%%esi,%%edx,8)\n\t"		/* src[2*j+6], src[2*j+7] */
	"fstpl 24(%%edi,%%edx,8)\n\t"		/* dst[2*j+6], dst[2*j+7] */
	"fstpl 16(%%edi,%%edx,8)\n\t"		/* dst[2*j+4], dst[2*j+5] */
	"fstpl 8(%%edi,%%edx,8)\n\t"		/* dst[2*j+2], dst[2*j+3] */
	"fstpl (%%edi,%%edx,8)\n\t"		/* dst[2*j], dst[2*j+1] */
	"addl $4,%%edx\n\t"			/* j += 4 */
	"cmpl %%ecx,%%edx\n\t"			/* j cmp words/2 */
	"jl 1b"
	:
	: "g" (src), "g" (dst), "g" (words/2)
	: "esi", "edi", "ecx", "edx", "cc", "memory"
	);

      pmc_read_clock(&t[1]);

      pmc_accumulate_clock(&c, &t[1], &t[0]);
    }

  mem_test_report("    4xfldl", &c, dwords);

  /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

  mem_test_counter_reset(&c);

  /* test memory speed using memcpy */
  for (i = 0; i < trials; i++)
    {
      if (flush) pmc_configure(PMC_CONFIGURE_CACHE, PMC_CONFIGURE_FLUSH);

      pmc_read_clock(&t[0]);

      memcpy(dst, src, words*sizeof(int));

      pmc_read_clock(&t[1]);

      pmc_accumulate_clock(&c, &t[1], &t[0]);
    }

  mem_test_report("    memcpy", &c, dwords);

  /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
}

/*----------------------------------------------------------------------------*/
