/*----------------------------------------------------------------------------*/

/*
 * Performance-Monitoring Counters Library, for Intel/AMD Processors and Linux
 * Author:  Don Heller, dheller@scl.ameslab.gov
 * Last revised:  15 March 2001
 */

/*----------------------------------------------------------------------------*/

/* This example shows that integer divide uses the floating-point unit on a
 *   Pentium II.
 * The C-shell script menu13.csh includes all the commands (read it first!).
 *
 * Compile with
 *	cc `pmc_options` -O -o version1 -DTYPE=int menu13.c -lpmc -lm
 *	cc `pmc_options` -O -S -DTYPE=int menu13.c
 * Run with (assuming C-shell syntax)
 *	version1 -C 10 --o 0 10 5 >& int.1
 * The relevant assembly code is for c += a/b (in menu13.s, comments added)
 *	movl -1008(%ebp),%eax	# load a, EAX = a
 *	cltd			# EDX = sign-extension of EAX
 *	idivl -1012(%ebp)	# load b, EDX:EAX/b, EAX = a/b, EDX = a%b 
 *	addl %eax,-1016(%ebp)	# load c, c + EAX, store to c
 * You can verify that there are no floating-point instructions in menu13.s.
 *   All Intel fp instructions start with f.
 * Edit menu13.s to remove the idivl instruction, and recompile with
 *	cc `pmc_options` -O -o version2 menu13.s -lpmc -lm
 * Rerun with
 *	version2 -C 10 --o 0 10 5 >& int.2
 * Finally,
 *	more int.1 int.2
 *	diff int.1 int.2
 * Note that each measurement is taken 10 times.  There is a compulsory
 *   cache miss on the first measurement, which is a cache hit on the next
 *   9 measurements.
 * Here is the result of diff on a Pentium II:  (some spaces removed)
 *	5c5
 *	< Command executed: version1 10 5
 *	---
 *	> Command executed: version2 10 5
 *	14,15c14,15
 *	< 0x43  67  data_mem_refs                 50         51136363.64
 *	< 0x45  69  dcu_lines_in                   1          1022727.27
 *	---
 *	> 0x43  67  data_mem_refs                 40        333333333.33
 *	> 0x45  69  dcu_lines_in                   1          8333333.33
 *	22c22
 *	< 0x86 134  ifu_mem_stall                650        664772727.27
 *	---
 *	> 0x86 134  ifu_mem_stall                900       8100000000.00
 *	24c24
 *	< 0x79 121  cpu_clk_unhalted             440        450000000.00
 *	---
 *	> 0x79 121  cpu_clk_unhalted              50        450000000.00
 *	58c58
 *	< 0x10  16  fp_comp_ops_exe               10         10227272.73
 *	---
 *	> 0x10  16  fp_comp_ops_exe                0                0.00
 *	60,61c60,61
 *	< 0x14  20  cycles_div_busy              350        357954545.45
 *	< 0x13  19  div                           10         10227272.73
 *	---
 *	> 0x14  20  cycles_div_busy                0                0.00
 *	> 0x13  19  div                            0                0.00
 *	65,67c65,67
 *	< 0xd0 208  inst_decoder                  50         51136363.64
 *	< 0xc0 192  inst_retired                  50         51136363.64
 *	< 0xc2 194  uops_retired                 120        122727272.73
 *	---
 *	> 0xd0 208  inst_decoder                  40        360000000.00
 *	> 0xc0 192  inst_retired                  40        360000000.00
 *	> 0xc2 194  uops_retired                  70        630000000.00
 *	80c80
 *	< 0xa2 162  resource_stalls              380        388636363.64
 *	---
 *	> 0xa2 162  resource_stalls               10         90000000.00
 *	94c94
 *	< c = 800
 *	---
 *	> c = 4000
 * You can see that "idivl -1012(%ebp)" uses 1 data memory reference, 39
 *   cycles (of which 35 occupy the fl.pt. divide unit), 1 fl.pt. operation
 *   (a divide), 5 micro-ops (load, 4 others which cannot be determined),
 *   and will stall the processor for 35-37 cycles if the result is needed
 *   in the next instruction.
 * There is one memory reference per trial related to pmc_read() that is not
 *   being removed by the overhead calculation.  The Level 1 instruction cache
 *   stalls are generally unimportant.
 * The experiment could also be done by removing both the cltd and idivl
 *   instructions.  The clock time actually increases to 6 cycles because of
 *   another stall.
 *
 * Now change the type of a,b,c from int to double and repeat the experiment.
 *	cc `pmc_options` -O -o version1 -DTYPE=double menu13.c -lpmc -lm
 *	cc `pmc_options` -O -S -DTYPE=double menu13.c
 * The relevant lines from menu13.s are (comments added)
 *	fldl -1032(%ebp)	# load a
 *	fdivl -1040(%ebp)	# load b, a/b
 *	faddl -1048(%ebp)	# load c, c + a/b
 *	fstpl -1048(%ebp)	# store to c
 * Edit menu13.s to remove the fdivl instruction.
 * Here is the result of diff on a Pentium II:  (some spaces removed)
 *	5c5
 *	< Command executed: version1 10 5
 *	---
 *	> Command executed: version2 10 5
 *	14,15c14,15
 *	< 0x43  67  data_mem_refs                 50         50000000.00
 *	< 0x45  69  dcu_lines_in                   1          1000000.00
 *	---
 *	> 0x43  67  data_mem_refs                 40        225000000.00
 *	> 0x45  69  dcu_lines_in                   1          5625000.00
 *	22c22
 *	< 0x86 134  ifu_mem_stall                900        900000000.00
 *	---
 *	> 0x86 134  ifu_mem_stall                650       3656250000.00
 *	24c24
 *	< 0x79 121  cpu_clk_unhalted             450        450000000.00
 *	---
 *	> 0x79 121  cpu_clk_unhalted              80        450000000.00
 *	56c56
 *	< 0xc1 193  flops                         20         20000000.00
 *	---
 *	> 0xc1 193  flops                         10         56250000.00
 *	58c58
 *	< 0x10  16  fp_comp_ops_exe               20         20000000.00
 *	---
 *	> 0x10  16  fp_comp_ops_exe               10         56250000.00
 *	60,61c60,61
 *	< 0x14  20  cycles_div_busy              360        360000000.00
 *	< 0x13  19  div                           10         10000000.00
 *	---
 *	> 0x14  20  cycles_div_busy                0                0.00
 *	> 0x13  19  div                            0                0.00
 *	65,67c65,67
 *	< 0xd0 208  inst_decoder                  50         50000000.00
 *	< 0xc0 192  inst_retired                  50         50000000.00
 *	< 0xc2 194  uops_retired                  80         80000000.00
 *	---
 *	> 0xd0 208  inst_decoder                  40        225000000.00
 *	> 0xc0 192  inst_retired                  40        225000000.00
 *	> 0xc2 194  uops_retired                  60        337500000.00
 *	80c80
 *	< 0xa2 162  resource_stalls              400        400000000.00
 *	---
 *	> 0xa2 162  resource_stalls               50        281250000.00
 *	94c94
 *	< c = 800
 *	---
 *	> c = 4000
 * You can see that "fdivl -1040(%ebp)" uses 1 data memory reference, 37
 *   cycles (of which 36 occupy the fl.pt. divide unit), 1 fl.pt. operation
 *   (a divide), 2 micro-ops (load, fp divide), and will stall the processor
 *   for 35 cycles if the result is needed in the next instruction.
 * There is one main difference then between idivl and fdivl: idivl does not
 *   count as a floating-point operation even though it causes an execution
 *   in the floating-point unit.
 *
 * On the Pentium, a similar analysis shows 1 data memory reference (a read),
 *   1 instruction executed, for both idivl and fdivl, and 1 floating-point
 *   operation for fdivl.
 */

/*----------------------------------------------------------------------------*/

/* Number of times the measured statement is timed for each selected counter
 * set; main() uses it as the bound of its innermost loop. */
#define TRIALS 10

/*----------------------------------------------------------------------------*/

#include <pmc_lib.h>

/*----------------------------------------------------------------------------*/

/*
 * Measure the cost of one divide, c += a/b, with the performance counters.
 *
 * Usage:  program [pmc options] a b
 *
 * pmc_getargs() processes the pmc options; the arguments that remain are
 * shifted down to argv[1..], and the first two are converted with atoi() and
 * stored in variables of type TYPE (a macro supplied at compile time, e.g.
 * -DTYPE=int or -DTYPE=double).
 *
 * For every entry of Ctl.counters that is visited, the statement c += a/b is
 * bracketed by two pmc_read() calls TRIALS times, and each pair of readings
 * is handed to pmc_accumulate().  The final value of c and the accumulated
 * results are then printed.
 *
 * Exit status:  RABBIT_FAILURE if pmc_getargs() or pmc_open() fails, if fewer
 * than two operands remain, or if b is zero; otherwise `status`, which is
 * never changed from RABBIT_NO_STATUS.
 */
int main(int argc, char * argv[])
{
  int status = RABBIT_NO_STATUS;
  pmc_control_t Ctl = pmc_control_null;

  int Argc = argc;
  char ** Argv = argv;

  int i, j, k, out, t;
  pmc_data_t t0, t1;

  TYPE a, b, c;

  /* initialize internal data structures, read command-line arguments */

  if (pmc_getargs(stderr, argv[0], &Argc, &Argv, &Ctl) == FALSE)
    { exit(RABBIT_FAILURE); }

  /* remove the pmc options from the command line:  the last Argc entries of
   * argv are moved down so that they start at argv[1] */

  t = argc - (Argc + 1);
  for (i = 1; i <= Argc; i++)
    { argv[i] = argv[t + i]; }
  argc = Argc + 1;

  /* both operands must be present; without this check a missing operand
   * would hand a null (or stale) pointer to atoi() */

  if (argc < 3) {
    fprintf(stderr, "usage: %s [pmc options] a b\n", argv[0]);
    exit(RABBIT_FAILURE);
  }

  a = atoi(argv[1]); b = atoi(argv[2]); c = 0;

  /* a zero divisor makes the measured statement undefined for integer TYPE
   * (the idivl instruction faults) and makes the later (int)c conversion
   * undefined for floating-point TYPE, so reject it before measuring */

  if (b == 0) {
    fprintf(stderr, "%s: b must be nonzero\n", argv[0]);
    exit(RABBIT_FAILURE);
  }

  printf("a = %d, b = %d, c = %d\n", (int)a, (int)b, (int)c);

  if (pmc_open(0) == FALSE)             /* open /dev/pmc */
    { exit(RABBIT_FAILURE); }

  /* `out` advances once per (event pair, replication) and indexes
   * Ctl.counters; the original author's note for this nest was
   *   for (out = 0; out < Ctl.num_counters; out++) { ... }            */

  for (i = 0, out = 0; i < Ctl.event_pairs; i++) {
    for (j = 0; j < Ctl.replication; j++, out++) {
      pmc_select(&Ctl.counters[out]);
      for (k = 0; k < TRIALS; k++) {
        pmc_read(&t0);

        c += a/b;                       /* the statement being measured */

        pmc_read(&t1);
        pmc_accumulate(&Ctl.counters[out], &t1, &t0);
      }
    }
  }

  pmc_close();                          /* close /dev/pmc */

  printf("c = %d\n", (int)c);           /* prevent over-optimization */

  pmc_print_results(argc, argv, &Ctl);  /* ignore the return value */

  exit(status);
}

/*----------------------------------------------------------------------------*/
