/*----------------------------------------------------------------------------*/

/*
 * Performance-Monitoring Counters Library, for Intel/AMD Processors and Linux
 * Author:  Don Heller, dheller@scl.ameslab.gov
 * Last revised:  15 March 2001
 */

/* This example shows that integer multiply uses the floating-point unit on
 *   a Pentium II.
 * The C-shell script menu12.csh includes all the commands (read it first!).
 *
 * Compile with
 *	cc `pmc_options` -O -o version1 -DTYPE=int menu12.c -lpmc -lm
 *	cc `pmc_options` -O -S -DTYPE=int menu12.c
 * Run with (assuming C-shell syntax)
 *	version1 -C 10 --o 0 1 2 >& int.1
 * The relevant assembly code is for c += a*b (in menu12.s, comments added)
 *	movl -1008(%ebp),%eax		# load a, EAX = a
 *	imull -1012(%ebp),%eax		# load b, EAX = EAX*b
 *	addl %eax,-1016(%ebp)		# load c, c + EAX, store to c
 * You can verify that there are no floating-point instructions in menu12.s.
 *   All Intel fp instructions start with f.
 * Edit menu12.s to remove the imull instruction, and recompile with
 *	cc `pmc_options` -O -o version2 menu12.s -lpmc -lm
 * Rerun with
 *	version2 -C 10 --o 0 1 2 >& int.2
 * Finally,
 *	more int.1 int.2
 *	diff int.1 int.2
 * Note that each measurement is taken 10 times.  There is a compulsory
 *   cache miss on the first measurement, which is a cache hit on the next
 *   9 measurements.
 * Here is the result of diff on a Pentium II:  (some spaces removed)
 *	5c5
 *	< Command executed: version1 1 2
 *	---
 *	> Command executed: version2 1 2
 *	14,15c14,15
 *	< 0x43  67  data_mem_refs                 50        247252747.25
 *	< 0x45  69  dcu_lines_in                   1          4945054.95
 *	---
 *	> 0x43  67  data_mem_refs                 40        290322580.65
 *	> 0x45  69  dcu_lines_in                   1          7258064.52
 *	22c22
 *	< 0x86 134  ifu_mem_stall                650       3250000000.00
 *	---
 *	> 0x86 134  ifu_mem_stall                900       6750000000.00
 *	24c24
 *	< 0x79 121  cpu_clk_unhalted              90        450000000.00
 *	---
 *	> 0x79 121  cpu_clk_unhalted              60        450000000.00
 *	58,59c58,59
 *	< 0x10  16  fp_comp_ops_exe               10         50000000.00
 *	< 0x12  18  mul                           10         50000000.00
 *	---
 *	> 0x10  16  fp_comp_ops_exe                0                0.00
 *	> 0x12  18  mul                            0                0.00
 *	65,67c65,67
 *	< 0xd0 208  inst_decoder                  40        200000000.00
 *	< 0xc0 192  inst_retired                  40        200000000.00
 *	< 0xc2 194  uops_retired                  80        400000000.00
 *	---
 *	> 0xd0 208  inst_decoder                  30        225000000.00
 *	> 0xc0 192  inst_retired                  30        225000000.00
 *	> 0xc2 194  uops_retired                  60        450000000.00
 *	80c80
 *	< 0xa2 162  resource_stalls               50        250000000.00
 *	---
 *	> 0xa2 162  resource_stalls               20        150000000.00
 *	94c94
 *	< c = 800
 *	---
 *	> c = 400
 * You can see that "imull -1012(%ebp),%eax" uses 1 data memory reference,
 *   3 cycles, 1 floating-point multiply operation, 2 micro-ops (load and
 *   fp multiply), and will stall the processor for 3 cycles if the result
 *   is needed in the next instruction.
 * There is one memory reference per trial related to pmc_read() that is not
 *   being removed by the overhead calculation.  The Level 1 instruction cache
 *   stalls are generally unimportant.
 *
 * Now change the type of a,b,c from int to double and repeat the experiment.
 *	cc `pmc_options` -O -o version1 -DTYPE=double menu12.c -lpmc -lm
 *	cc `pmc_options` -O -S -DTYPE=double menu12.c
 * The relevant lines from menu12.s are (comments added)
 *	fldl -1032(%ebp)	# load a
 *	fmull -1040(%ebp)	# load b, a*b
 *	faddl -1048(%ebp)	# load c, c + a*b
 *	fstpl -1048(%ebp)	# store c
 * Edit menu12.s to remove the fmull instruction.
 * Here is the result of diff on a Pentium II:  (some spaces removed)
 *	5c5
 *	< Command executed: version1 1 2
 *	---
 *	> Command executed: version2 1 2
 *	14,15c14,15
 *	< 0x43  67  data_mem_refs                 50        187500000.00
 *	< 0x45  69  dcu_lines_in                   1          3750000.00
 *	---
 *	> 0x43  67  data_mem_refs                 40        225000000.00
 *	> 0x45  69  dcu_lines_in                   1          5625000.00
 *	22c22
 *	< 0x86 134  ifu_mem_stall                900       3375000000.00
 *	---
 *	> 0x86 134  ifu_mem_stall                650       3656250000.00
 *	24c24
 *	< 0x79 121  cpu_clk_unhalted             120        450000000.00
 *	---
 *	> 0x79 121  cpu_clk_unhalted              80        450000000.00
 *	56c56
 *	< 0xc1 193  flops                         20         75000000.00
 *	---
 *	> 0xc1 193  flops                         10         56250000.00
 *	58,59c58,59
 *	< 0x10  16  fp_comp_ops_exe               20         75000000.00
 *	< 0x12  18  mul                           10         37500000.00
 *	---
 *	> 0x10  16  fp_comp_ops_exe               10         56250000.00
 *	> 0x12  18  mul                            0                0.00
 *	65,67c65,67
 *	< 0xd0 208  inst_decoder                  50        187500000.00
 *	< 0xc0 192  inst_retired                  50        187500000.00
 *	< 0xc2 194  uops_retired                  80        300000000.00
 *	---
 *	> 0xd0 208  inst_decoder                  40        225000000.00
 *	> 0xc0 192  inst_retired                  40        225000000.00
 *	> 0xc2 194  uops_retired                  60        337500000.00
 *	80c80
 *	< 0xa2 162  resource_stalls               70        262500000.00
 *	---
 *	> 0xa2 162  resource_stalls               50        281250000.00
 *	94c94
 *	< c = 800
 *	---
 *	> c = 400
 * You can see that "fmull -1040(%ebp)" uses 1 data memory reference,
 *   4 cycles, 1 floating-point multiply operation, 2 micro-ops (load and
 *   fp multiply), and will stall the processor for 2 cycles if the result
 *   is needed in the next instruction.
 * There is one main difference then between imull and fmull:  imull does
 *   not count as a floating-point operation (event 0xc1, flops) even though
 *   it causes a computational operation in the floating-point unit (event
 *   0x10, fp_comp_ops_exe), and counts as a floating-point multiply (event
 *   0x12, mul).
 *
 * On the Pentium, a similar analysis shows 1 data memory reference (a read),
 *   1 instruction executed, and 1 floating-point operation, for both imull
 *   and fmull.
 */

/*----------------------------------------------------------------------------*/

/* Number of times the measured statement is timed for each selected counter
 *   configuration (the bound of the innermost loop in main()).
 * NOTE(review): this is defined before pmc_lib.h is included; confirm whether
 *   the header depends on it before moving or retyping it.
 */
#define TRIALS 10

/*----------------------------------------------------------------------------*/

#include <pmc_lib.h>

/*----------------------------------------------------------------------------*/

/*
 * Measure the statement  c += a*b  with the performance-monitoring counters.
 *
 * Command line:  program [pmc options] a b
 *   The pmc options are consumed by pmc_getargs(); the two remaining
 *   arguments are the integer operands a and b.  TYPE (int or double) is
 *   supplied on the compiler command line with -DTYPE=...
 *
 * For every entry of Ctl.counters selected by the options, the statement is
 * bracketed by pmc_read() calls TRIALS times and the differences are
 * accumulated; the totals are printed by pmc_print_results().
 *
 * Exits with RABBIT_FAILURE if the options cannot be parsed, if the two
 * operands are missing, or if pmc_open() fails.
 */
int main(int argc, char * argv[])
{
  int status = RABBIT_NO_STATUS;
  pmc_control_t Ctl = pmc_control_null;

  int Argc = argc;
  char ** Argv = argv;

  int i,j,k, out, t;
  pmc_data_t t0, t1;

  TYPE a, b, c;

  /* initialize internal data structures, read command-line arguments */

  if (pmc_getargs(stderr, argv[0], &Argc, &Argv, &Ctl) == FALSE)
    { exit(RABBIT_FAILURE); }

  /* remove the pmc options from the command line:  the last Argc arguments
   *   are shifted down so that they directly follow argv[0]
   */

  t = argc - (Argc + 1);
  for (i = 1; i <= Argc; i++)
    { argv[i] = argv[t + i]; }
  argc = Argc + 1;

  /* keep the argv[argc] == NULL convention after compaction; when t > 0 the
   *   new argc is smaller than the original, so the slot is inside the array
   */
  if (t > 0)
    { argv[argc] = NULL; }

  /* both operands are required; without this check atoi() would be handed
   *   NULL or a stale option string
   */
  if (argc < 3) {
    fprintf(stderr, "usage: %s [pmc options] a b\n", argv[0]);
    exit(RABBIT_FAILURE);
  }

  a = atoi(argv[1]); b = atoi(argv[2]); c = 0;
  printf("a = %d, b = %d, c = %d\n", (int)a, (int)b, (int)c);

  if (pmc_open(0) == FALSE)		/* open /dev/pmc */
    { exit(RABBIT_FAILURE); }

  /* for (out = 0; out < Ctl.num_counters; out++) { ... } */

  /* out runs through Ctl.counters in step with the (i,j) iteration */
  for (i = 0, out = 0; i < Ctl.event_pairs; i++) {
    for (j = 0; j < Ctl.replication; j++, out++) {
      pmc_select(&Ctl.counters[out]);
      for (k = 0; k < TRIALS; k++) {
	pmc_read(&t0);

        c += a*b;			/* the statement being measured */

	pmc_read(&t1);
	pmc_accumulate(&Ctl.counters[out],&t1,&t0);
      }
    }
  }

  pmc_close();				/* close /dev/pmc */

  printf("c = %d\n", (int)c);		/* prevent over-optimization */

  pmc_print_results(argc, argv, &Ctl);	/* ignore the return value */

  exit(status);
}

/*----------------------------------------------------------------------------*/
