\ Floating point benchmarks in ANS Forth


0 [IF] ========================================================
This file is maintained by:
      ...
      ...  
===============================================================
[THEN]

DECIMAL

\ ************************************************
\ Select system to be tested, set FORTHSYSTEM
\ to value of selected target.
\ Set SPECIFICS FALSE to avoid system dependencies.
\ Set SPECIFICS TRUE to show off implementation tricks.
\ Set HACKING FALSE to use the base source code.
\ Set HACKING TRUE to optimise the source code.
\ ************************************************

 \ Identification numbers of the target systems this file knows about.
 \ One of them is chosen below and stored in ForthSystem; every harness
 \ section later in the file compares its own number with ForthSystem.
 1  CONSTANT VfxForth3		\ MPE VFX Forth v3.x
 2  CONSTANT Pfw22		\ MPE ProForth 2.2
 3  CONSTANT SwiftForth20	\ FI SwiftForth 2.0
 4  CONSTANT SwiftForth15	\ FI SwiftForth 1.5
 5  CONSTANT Win32Forth		\ Win32Forth 4.2
 6  CONSTANT BigForth		\ BigForth 11 July 1999
 7  CONSTANT BigForth-Linux	\ BigForth 11 July 1999
 8  CONSTANT iForth		\ iForth 1.12 5 Aug 2001
 9  CONSTANT iForth20		\ iForth 2.0 8 June 2002
10  CONSTANT SwiftForth22	\ FI SwiftForth 2.2.2.9
11  CONSTANT gForth-fast	\ gForth 0.6.9

\ Automatic selection: when the environment query "gforth" succeeds, the
\ gforth harness is chosen (2DROP discards the two cells the query leaves
\ under its flag); otherwise the iForth 2.0 harness is chosen.  To test
\ another system, replace iforth20 with one of the commented-out names.
\ The lower-case spellings rely on case-insensitive dictionary lookup.
\ Only the systems that have a harness section further down can be used.
s" gforth" environment? [if]
 2drop
 gforth-fast
[else]
 iforth20
\ VfxForth3    			\ select system to test
\ SwiftForth22 
\ Win32Forth   
[then]
CONSTANT ForthSystem

\ Build options.  In the code visible in this file, specifics and hacking
\ are only read by .specifics and .hacking, and ANSSystem is not read at all.
 FALSE CONSTANT specifics	\ TRUE to use system dependent code
 FALSE CONSTANT hacking		\ TRUE to use "guru" level code that
				\ makes assumptions of an optimising compiler.
 TRUE  CONSTANT ANSSystem	\ Some Forth 83 systems cannot compile
				\ all the test examples without carnal
				\ knowledge, especially if the compiler
				\ checks control structures.

: .specifics ( -- )
	\ Report the SPECIFICS switch: prints "using extensions" when it is
	\ non-zero and "using no extensions" when it is zero.
	S" using" TYPE
	specifics 0= IF  S"  no" TYPE  THEN
	S"  extensions" TYPE ;

: .hacking ( -- )
	\ Report the HACKING switch: prints " using hackery" when it is
	\ non-zero and " using no hackery" when it is zero.
	S"  using" TYPE
	hacking 0= IF  S"  no" TYPE  THEN
	S"  hackery" TYPE ;

: .testcond ( -- )
	\ One-line summary of the test conditions: the extension state,
	\ the word " and", then the hack state.
	.specifics
	S"  and" TYPE
	.hacking ;


\ *****************************
\ VFX Forth for Windows harness
\ *****************************

\ Compiled only when ForthSystem equals VfxForth3.  Loads a floating
\ point package, defines the millisecond COUNTER used by the timing
\ words, and supplies the helper words the common code expects.
VfxForth3 ForthSystem = [IF]

[defined] +idata [if]
  +idata	\ enable P4 data options
  variable zzz	\ preallocate first IDATA buffer
[then]

TRUE CONSTANT ndp?	\ -- flag ; TRUE if NDP stack version

\ Both text macros point at the same directory of the VFX evaluation
\ install; only NdpDir is expanded below.
c" C:\Program Files\mpe\VfxEval\Lib"
  setmacro LibDir
c" C:\Program Files\mpe\VfxEval\Lib"
  setmacro NdpDir

\ Load one of the two floating point packages, selected by ndp? above.
ndp? [if]
  S" %NdpDir%\Ndp387" INCLUDED
[else]
  S" %NdpDir%\Hfp387" INCLUDED
[then]

  char . dp-char !			\ select ANS number conversion
  char . fp-char !

-short-branches				\ disable short forward branches

\ Import of the Windows tick counter.
extern: DWORD PASCAL GetTickCount( void );

: COUNTER 	\ -- ms
  GetTickCount ;

\ Compatibility words used by the common benchmark source.
: DOC ;					\ no-op
: ENDDOC ;				\ no-op
: =: CONSTANT ;				\ x "name" -- ; synonym for CONSTANT
: -- POSTPONE \ ; IMMEDIATE		\ comment to end of line
: F1+    1e F+ ;			\ F: r -- r+1
: F2DUP  FOVER FOVER ;			\ F: r1 r2 -- r1 r2 r1 r2
: F2DROP FDROP FDROP ;			\ F: r1 r2 --
: FSQR   FDUP F* ; 			\ F: r -- r*r
: EOL ;					\ no-op, used in MAIN before printing

1e fatan 4e f* FCONSTANT PI		\ pi computed as 4*atan(1)

[THEN]


\ ******************************
\ iForth 2.0 for Windows harness
\ ******************************

\ Compiled only when ForthSystem equals iForth20.  Defines the
\ millisecond COUNTER by looking up GetTickCount in kernel32.dll.
\ No other helper words are defined here, so the rest of the file's
\ vocabulary is presumably supplied by iForth itself -- TODO confirm.
iForth20 ForthSystem = [IF]

NEEDS -miscutil
NEEDS -dynlink
  0 VALUE 'counter			\ holds what LIBRARY-FIND returns for GetTickCount
  \ Either lookup step THROWs on a non-zero result code.
  S" kernel32.dll" LIBRARY-OPEN THROW ( dll)
  S" GetTickCount" ROT LIBRARY-FIND THROW TO 'counter 

\ Calls the stored entry through FOREIGN with a leading 0.
\ NOTE(review): the 0 is presumably the argument count -- confirm against
\ the iForth documentation.
: COUNTER ( -- ms ) 0 'counter FOREIGN ;

[THEN]


\ **********************
\ SwiftForth 2.0 harness
\ **********************


\ Compiled only when ForthSystem equals SwiftForth20.  Prints a warning
\ that this configuration has not been tested, then loads two library files.
\ NOTE(review): both paths contain a blank ("program Files"); whether
\ INCLUDE accepts such a name depends on how SwiftForth parses it -- confirm.
\ NOTE(review): no COUNTER or helper words are defined in this section;
\ presumably the system or the included files supply them -- confirm.
SwiftForth20 ForthSystem = [IF]

CR .( Not tested )

include C:\program Files\SwiftForth20\Lib\Options\fpmath.f
include C:\Program Files\SwiftForth20\Lib\FSLib\Library\fsl-util.f

[THEN]


\ **********************
\ SwiftForth 2.2 harness
\ **********************


\ Compiled only when ForthSystem equals SwiftForth22.  Prints a warning
\ that this configuration has not been tested, then loads two library files.
\ NOTE(review): both paths contain a blank; whether INCLUDE accepts such a
\ name depends on how SwiftForth parses it -- confirm.
\ NOTE(review): no COUNTER or helper words are defined in this section;
\ presumably the system or the included files supply them -- confirm.
SwiftForth22 ForthSystem = [IF]

CR .( Not tested )

\ FPCONFIG.F should be in the BENCHMRK folder
include C:\Program files\SwiftForth2229\Lib\Options\fpmath.f
include C:\Program Files\SwiftForth2229\Unsupported\FSLib\Library\fsl-util.f

[THEN]


\ ******************
\ Win32Forth harness
\ ******************

\ Compiled only when ForthSystem equals Win32Forth.  Prints a warning
\ that this configuration has not been tested and defines COUNTER on
\ top of the Windows GetTickCount call.
\ NOTE(review): none of the other helper words the common code uses are
\ defined in this section -- confirm that Win32Forth provides them.
Win32Forth ForthSystem = [IF]

CR .( Not tested )

: COUNTER 	\ -- ms
  Call GetTickCount ;

[THEN]

\ **************
\ gforth harness
\ **************

\ Start of the section compiled only when ForthSystem equals gForth-fast.
\ This first part keeps a running output column in OUT by re-vectoring
\ EMIT and TYPE and by wrapping CR.
gForth-fast ForthSystem = [IF]

variable out	\ -- addr ; number of characters sent since the last CR

\ Count one character, then pass it to the underlying (emit).
: temit		\ char --
  1 out +!  (emit)
; ' temit is emit

\ Count len characters, then pass the string to the underlying (type).
: ttype		\ addr len --
  dup out +!  (type)
; ' ttype is type

\ New CR: performs the previous CR and resets the column count.
: cr		\ --
  cr  out off
;

\ Emit as many spaces as lie between the current column and column n.
: >pos          \ n -- ; step to position n
  out @ - spaces
;

decimal

\ Pushing 0 on top of a single cell extends it to a double-cell number
\ with a zero high part, so a constant 0 serves as unsigned-to-double.
0 CONSTANT U>D

\ Millisecond counter for the timing words: adds the two double numbers
\ returned by cputime, divides the sum by 1000 and keeps the quotient.
\ NOTE(review): presumably cputime returns user and system CPU time in
\ microseconds, which would make this CPU time and not wall-clock time
\ as in the GetTickCount based harnesses -- confirm in the gforth manual.
: counter	\ -- ms
  cputime d+ 1000 um/mod nip
;

create pocket 256 allot		\ buffer for interpreted counted strings

\ State-smart c" : while compiling it defers to the system's c" ;
\ while interpreting it parses up to the next quote, stores the text as
\ a counted string in POCKET and returns POCKET's address.
: c"		\ -- [comp] ; -- addr [interp]
  state @ if
    postpone c"
  else
    [char] " parse   pocket place  pocket
  endif
; immediate

\ Immediate word that does nothing.
: [o/n] ( -- ) ; IMMEDIATE

\ Symmetric division of a double by a single, returning the quotient only.
: M/ ( d n1 -- quot )
  sm/rem  swap drop
;

\ Defining word: "n buffer: name" creates NAME with n bytes of zeroed
\ data space; executing NAME returns the address of that space.
: buffer: ( n "name" -- ; -- addr )
  create
    here swap  dup allot  erase
;

\ The next three words are immediate text macros: each one EVALUATEs
\ its replacement source text at the point where it is used.
: 2-		\ n -- n-2
  s" 2 -" evaluate
; immediate

: u2/		\ u -- u'
  s" 1 RSHIFT" evaluate
; immediate

: not		\ x -- x'
  s" invert" evaluate
; immediate

\ Stand-ins for Windows names; nothing in the code visible in this file
\ uses them.
0 constant HWND_DESKTOP
16 constant WM_CLOSE

\ Stub: discards its four arguments and returns 0.
: SendMessage	\ h m w l -- flag
  2drop 2drop  0
;

\ Compatibility words used by the common benchmark source.
: EOL ;				\ no-op, used in MAIN before printing
: DOC ;				\ no-op
: ENDDOC ;			\ no-op
: =: CONSTANT ;			\ x "name" -- ; synonym for CONSTANT
: -- POSTPONE \ ; IMMEDIATE	\ comment to end of line
: F1+    1e F+ ;		\ F: r -- r+1
: F2DUP  FOVER FOVER ;		\ F: r1 r2 -- r1 r2 r1 r2
: F2DROP FDROP FDROP ;		\ F: r1 r2 --
: FSQR   FDUP F* ; 		\ F: r -- r*r

\ Defining word for a changeable float: NAME stores r in its body and
\ returns the stored value when executed.
\ NOTE(review): F, is applied directly after CREATE, so this assumes the
\ body address CREATE yields is float-aligned -- confirm for the target.
: FVALUE ( r "name" -- )
    create f,
  does>
    f@ ;

\ Compile-time store into an FVALUE: looks up NAME while compiling and
\ compiles its body address followed by F! .
: [fto] ( r "name" -- )
    ' >body postpone literal postpone f! ; immediate compile-only

\ Integer/float conversions built from the double-cell conversion words.
: s>f ( n -- r ) s>d d>f ;
: f>s ( r -- n ) f>d drop ;


[THEN]

\ ********************
\ Start of common code
\ ********************

0 VALUE _time_		\ COUNTER reading taken by the last TIMER-RESET
VARIABLE TotalTime	\ running sum of the intervals printed by .ELAPSED
0 TotalTime !		\ a new VARIABLE has no defined contents, and
			\ .ELAPSED only ever adds to it with +!

: TIMER-RESET ( -- )
	\ Start a new timing interval: remember the current COUNTER value.
	COUNTER TO _time_  ;

: #? ( d1 -- d2 )
	\ For use between <# and #> : converts one more digit while the
	\ remaining number is non-zero, otherwise holds a blank so that
	\ leading zeros come out as spaces.
	2DUP OR
	IF  #  ELSE  BL HOLD  THEN ;

: .secs ( ms -- )
	\ Print a millisecond count as seconds with three decimals,
	\ followed by a blank and the word "seconds".  Pictured output
	\ is built right to left.
	0 <#				\ extend ms to an unsigned double
	   BL HOLD			\ blank in front of "seconds"
	   # # #			\ the three millisecond digits
	   [CHAR] . HOLD		\ decimal point
	   #  #?  #?			\ units, then tens and hundreds or blanks
	#> TYPE
	S" seconds" TYPE ;

: MS? ( -- ms )
	\ Milliseconds elapsed since the last TIMER-RESET.
	COUNTER _time_ - ;

: .ELAPSED ( -- )
	\ Add the time elapsed since the last TIMER-RESET to TotalTime
	\ and print it in seconds.
	MS?  DUP TotalTime +!  .secs ;

\ ==============================================================
\ ============ Finally -- the benchmark code ===================
\ ==============================================================

( *
  * LANGUAGE    : ANS Forth
  * PROJECT     : Forth Environments
  * DESCRIPTION : Estimate MFLOPS rating 
  * CATEGORY    : Benchmark
  * AUTHOR      : Marcel Hendrix 
  * LAST CHANGE : January 13, 2001, Marcel Hendrix 
  * )


\ ANEW -flops 

0 [IF]
   Flops.c is a 'c' program which attempts to estimate your system's
   floating-point 'MFLOPS' rating for the FADD, FSUB, FMUL, and FDIV
   operations based on specific 'instruction mixes' (discussed below).
   The program provides an estimate of PEAK MFLOPS performance by making
   maximal use of register variables with minimal interaction with main
   memory. The execution loops are all small so that they will fit in
   any cache. Flops.c can be used along with Linpack and the Livermore
   kernels (which exercise memory much more extensively) to gain further
   insight into the limits of system performance. The flops.c execution
   modules also include various percent weightings of FDIV's (from 0% to
   25% FDIV's) so that the range of performance can be obtained when
   using FDIV's. FDIV's, being computationally more intensive than
   FADD's or FMUL's, can impact performance considerably on some systems.
   
   Flops.c consists of 8 independent modules (routines) which, except for
   module 2, conduct numerical integration of various functions. Module
   2, estimates the value of pi based upon the Maclaurin series expansion
   of atan(1). MFLOPS ratings are provided for each module, but the
   program's overall results are summarized by the MFLOPS(1), MFLOPS(2),
   MFLOPS(3), and MFLOPS(4) outputs.

   The MFLOPS(1) result is identical to the result provided by all
   previous versions of flops.c. It is based only upon the results from
   modules 2 and 3. Two problems surfaced in using MFLOPS(1). First, it
   was difficult to completely 'vectorize' the result due to the 
   recurrence of the 's' variable in module 2. This problem is addressed
   in the MFLOPS(2) result which does not use module 2, but maintains
   nearly the same weighting of FDIV's (9.2%) as in MFLOPS(1) (9.6%).
   The second problem with MFLOPS(1) centers around the percentage of
   FDIV's (9.6%) which was viewed as too high for an important class of
   problems. This concern is addressed in the MFLOPS(3) result, which
   does only 3.4% FDIV's, and in the MFLOPS(4) result, where NO FDIV's
   are conducted at all.
   
   The number of floating-point instructions per iteration (loop) is
   given below for each module executed:

   MODULE   FADD   FSUB   FMUL   FDIV   TOTAL  Comment
     1        7      0      6      1      14   7.1%  FDIV's
     2        3      2      1      1       7   difficult to vectorize.
     3        6      2      9      0      17   0.0%  FDIV's
     4        7      0      8      0      15   0.0%  FDIV's
     5       13      0     15      1      29   3.4%  FDIV's
     6       13      0     16      0      29   0.0%  FDIV's
     7        3      3      3      3      12   25.0% FDIV's
     8       13      0     17      0      30   0.0%  FDIV's
   
   A*2+3     21     12     14      5      52   A=5, MFLOPS(1), Same as
	   40.4%  23.1%  26.9%  9.6%          previous versions of the
						flops.c program. Includes
						only Modules 2 and 3, does
						9.6% FDIV's, and is not
						easily vectorizable.
   
   1+3+4     58     14     66     14     152   A=4, MFLOPS(2), New output
   +5+6+    38.2%  9.2%   43.4%  9.2%          does not include Module 2,
   A*7                                         but does 9.2% FDIV's.
   
   1+3+4     62      5     74      5     146   A=0, MFLOPS(3), New output
   +5+6+    42.5%  3.4%   50.7%  3.4%          does not include Module 2,
   7+8                                         but does 3.4% FDIV's.

   3+4+6     39      2     50      0      91   A=0, MFLOPS(4), New output
   +8       42.9%  2.2%   54.9%  0.0%          does not include Module 2,
						and does NO FDIV's.

   NOTE: Various timer routines are included as indicated below. The
	timer routines, with some comments, are attached at the end 
	of the main program.

   NOTE: Please do not remove any of the printouts.

   EXAMPLE COMPILATION:
   UNIX based systems
       cc -DUNIX -O flops.c -o flops
       cc -DUNIX -DROPT flops.c -o flops 
       cc -DUNIX -fast -O4 flops.c -o flops 
       .
       .
       .
     etc.

   Al Aburto
   aburto@nosc.mil

[THEN]

 \ A0..A6: coefficients of the polynomial in w = u*u that Modules 3, 5, 6
 \ and 8 multiply by u; the Module 3 header describes that integrand as
 \ sin(x).  A3 and A5 are FVALUEs because MODULE-4 negates them and MAIN
 \ restores the values given here.
 1e 			FCONSTANT A0
-0.1666666666671334e 	FCONSTANT A1
 0.833333333809067e-2	FCONSTANT A2 
 0.198412715551283e-3	FVALUE    A3 -- changeable ...
 0.27557589750762e-5	FCONSTANT A4 
 0.2507059876207e-7	FVALUE    A5 -- changeable ...
 0.164105986683e-9	FCONSTANT A6 

\ B0..B6: coefficients of the polynomial in w = u*u used by Modules 4, 5,
\ 6 and 8; the Module 4 header describes that integrand as cos(x).
 1e			FCONSTANT B0 
-0.4999999999982e	FCONSTANT B1 
 0.4166666664651e-1	FCONSTANT B2 
-0.1388888805755e-2	FCONSTANT B3 
 0.24801428034e-4	FCONSTANT B4 
-0.2754213324e-6	FCONSTANT B5 
 0.20189405e-8		FCONSTANT B6 

\ D1..D3 and E2..E3: numerator and denominator coefficients of the
\ quotient integrated by MODULE-1; no other module reads them.
 0.3999999946405e-1	FCONSTANT D1
 0.96e-3		FCONSTANT D2 
 0.1233153e-5		FCONSTANT D3

 0.48e-3		FCONSTANT E2
 0.411051e-6		FCONSTANT E3

( * **************************************************
   Set Variable Values.                             
   T[1] references all timing results relative to   
   one million loops.                               
                                                  
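   The program will execute from 31250 to 64000000
   loops based on a runtime of Module 1 of at least
   TLimit = 1.5 seconds. That is, a runtime of 1.5
   seconds for Module 1 is used to determine the
   number of loops to execute. The original flops.c
   used 15 seconds and 512000000 loops.

  The loop count doubles from 15625 and doubling stops
  once it reaches or exceeds NLimit = 51200000, which
  happens at 64000000 loops.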
  ************************************************** * )

\ Calibration limits read by MODULE-1.  TLimit is compared with a
\ millisecond count.  The loop count starts at loops and is doubled
\ before every calibration pass; because NLimit is not loops times a
\ power of two, the first doubled count that reaches it is 64000000.
1500     =: TLimit 	-- 1.5 seconds ( was 15000 )
51200000 =: NLimit	-- maximum number of loops ( was 512000000 )
15625    =: loops	-- Initial number of loops, DO NOT CHANGE!
 
\ n FVALUES name1 ... namen : defines n FVALUEs, each initialised to 0e.
\ Each FVALUE call parses the next name from the input that follows.
: FVALUES  0 ?DO  0e FVALUE  LOOP ;

\ Timing results.  T[1] holds the scale factor; the modules and the
\ MFLOPS(n) words fill T[2] .. T[34].  T[0] and T[35] are not written
\ by the code visible in this file.
	8 FVALUES  T[0]  T[1]  T[2]  T[3]  T[4]  T[5]  T[6]  T[7]
	8 FVALUES  T[8]  T[9]  T[10] T[11] T[12] T[13] T[14] T[15]
	8 FVALUES  T[16] T[17] T[18] T[19] T[20] T[21] T[22] T[23]
	8 FVALUES  T[24] T[25] T[26] T[27] T[28] T[29] T[30] T[31]
	4 FVALUES  T[32] T[33] T[34] T[35]

0e FVALUE scale		\ 1e6 / loop count; set by MAIN and again by MODULE-1
0e FVALUE nulltime	\ scaled time of an empty DO LOOP; set by MODULE-1

loops VALUE m		\ iteration count shared by Modules 2..8

\ Floating point work registers shared by all modules.  MODULE-1
\ declares a local named sa that hides the FVALUE sa inside that word.
6 FVALUES sa xx uu ss vv ww

-- There exists a magical n for which FLOPS is much, much, faster...
-- Use findbest.frt  ( set Nlimit lower )

\ FMS?  : time since the last TIMER-RESET as a float, in seconds.
\ SFMS? : the same time multiplied by scale, minus nulltime.
: FMS?  ( F: -- time  ) MS?  S>F 1e-3 F* ;
: SFMS? ( F: -- stime ) FMS? scale F*  nulltime F- ;

\ ( * *************************************************
\  Module 1.  Calculate integral of df(x)/f(x) defined  
\             below.  Result is ln(f(1)). There are 14  
\             double precision operations per loop      
\             ( 7 +, 0 -, 6 *, 1 / ) that are included  
\             in the timing.                            
\             50.0% +, 00.0% -, 42.9% *, and 07.1% /    
\    ************************************************* * )

\ Calibrates the loop count and times Module 1.
\ Side effects: sets scale, T[1]..T[4], nulltime and m; prints one row
\ of the result table (error, T[2], T[4]).
: MODULE-1 ( -- )
	\ Locals: sa = milliseconds of the last timed pass (starts at 0),
	\         n  = loop count (starts at loops).  This sa hides the
	\         FVALUE of the same name for the rest of the definition.
	loops 0 LOCALS| sa n |
	\ xx uu ss vv = 0e, ww = 1e
	1e 0e 0e 0e 0e [FTO] xx [FTO] uu [FTO] ss [FTO] vv [FTO] ww

	\ Calibration: double n and repeat the timed loop until a pass
	\ takes at least TLimit ms or n has reached NLimit.
	BEGIN 
	  sa TLimit 
      <	WHILE
   	  0e [FTO] ss
   	  0e [FTO] vv
   	  1e [FTO] ww
	  n 2* DUP TO n  S>F 1/F [FTO] xx
	  TIMER-RESET       
	  \ Timed kernel, n-1 iterations: vv counts up by ww, uu = vv*xx,
	  \ ss accumulates (D1+uu*(D2+uu*D3)) / (ww+uu*(D1+uu*(E2+uu*E3))).
       	        n 1 DO  ww vv F+ [FTO] vv 
			vv xx F* ( uu )
			D3 FOVER F* D2 F+  FOVER F* D1 F+   
			FSWAP ( t uu )
			E3 FOVER F* E2 F+  FOVER F* D1 F+  F* ww F+   
			F/ ss F+ [FTO] ss
		  LOOP
          MS? TO sa
	  n NLimit
     >=	UNTIL THEN

	\ scale converts a measured time in seconds to the T[n] units.
	1e6  n S>F F/ FDUP [FTO] scale [FTO] T[1]

	\ Time an empty loop of n iterations; clamp the result at zero.
	TIMER-RESET  
	   n 0 DO LOOP  
	FMS?  	        scale F*  0e FMAX  [FTO] nulltime
	\ T[2] = scaled kernel time minus loop overhead,
	\ T[3] = T[2] per floating point operation, T[4] = 1/T[3].
	sa S>F 1e-3 F*  scale F*  nulltime F-  [FTO] T[2]
	T[2] 14e ( #flops?) F/  FDUP [FTO] T[3]  1/F [FTO] T[4]

	\ Trapezoid end terms plus 2*ss, halved and scaled by xx, then
	\ inverted.  The inverse also sets m for the following modules;
	\ it is not truncated to an integer before the multiplication
	\ by 40e3.
	D1 D2 F+ D3 F+   D1 E2 F+ E3 F+ F1+  F/ 
	D1 F+  ss F2* F+  F2/  xx F* 1/F 
	FDUP 40e3 F*  scale F/  F>S TO m  ( note: m is multiple of 4 )
	( sb) 25.2e F- ( error)

	."     1   " ( error) FE. 7 SPACES 
			 T[2] FE. 5 SPACES  
			 T[4] FE. 5 SPACES ;

\ ( * *****************************************************
\    Module 2.  Calculate value of PI from Taylor Series 
\               expansion of atan(1.0).  There are 7     
\               double precision operations per loop     
\               ( 3 +, 2 -, 1 *, 1 / ) that are included 
\               in the timing.                           
\               42.9% +, 28.6% -, 14.3% *, and 14.3% /   
\    ***************************************************** * )


\ Times Module 2.  Uses m, scale and PI.
\ Side effects: sets T[5]..T[8], recomputes m, changes the shared work
\ registers, and prints one row of the result table.
: MODULE-2 ( -- )
	\ sa = -1e, xx uu ss vv = 0e, ww = 1e
	1e 0e 0e 0e 0e -1e [FTO] sa [FTO] xx [FTO] uu [FTO] ss [FTO] vv [FTO] ww

	\ Reference loop: m/2 passes of two additions each; its scaled
	\ time, clamped at zero, becomes T[5].
	TIMER-RESET ( note: m is multiple of 4 )
	   m 2/ 0 DO  2e uu F+ [FTO] uu  
	  	      2e uu F+ [FTO] uu  
		LOOP
	FMS? scale F*  0e FMAX [FTO] T[5]

	sa [FTO] uu  0e [FTO] vv  0e [FTO] ww  0e [FTO] xx

	\ Timed kernel, unrolled twice.  The float stack holds -5e 5e u
	\ throughout; the first half of each pass works with 5e and the
	\ second half with -5e, so the sign alternates without negation.
	TIMER-RESET 
	-5e 5e uu
	m 2/ 0 DO  2e F+
		   F2DUP         F- xx F+ [FTO] xx
		   2 FPICK FOVER F* vv F+ [FTO] vv
		   F2DUP   	 F/ ww F+ [FTO] ww

		   2e F+ 
		   2 FPICK FOVER F- xx F+ [FTO] xx
		   F2DUP	 F* vv F+ [FTO] vv
		   2 FPICK FOVER F/ ww F+ [FTO] ww   ( a b c -- a b c a/c )
	  LOOP
	FDROP F2DROP

	\ T[6] = scaled kernel time, T[7] = (T[6]-T[5]) per operation,
	\ T[8] = 1/T[7].  m is then recomputed as sa*xx/m.
	FMS? scale F* [FTO] T[6]
	T[6] T[5] F-  7e ( #flops) F/   FDUP [FTO] T[7]   1/F [FTO] T[8]
	sa xx F*  m S>F F/  F>S TO m

	\ Computed value 4*ww/5 + 5/vv - 31.25/vv^3, minus PI, is the error.
	ww 4e F* 5e F/  5e vv F/  F+ ( sb)
	31.25e  vv FDUP FSQR F*  F/  F-  PI F- ( pi_error)

	CR ."     2   " FE. 7 SPACES  
	T[6] T[5] F- 	FE. 5 SPACES  
	T[8]		FE. 5 SPACES ;

\ ( * ******************************************************
\    Module 3.  Calculate integral of sin(x) from 0.0 to
\               PI/3.0 using Trapezoidal Method. Result 
\               is 0.5. There are 17 double precision   
\               operations per loop (6 +, 2 -, 9 *, 0 /)
\               included in the timing.                 
\               35.3% +, 11.8% -, 52.9% *, and 00.0% /  
\    ****************************************************** * )

\ Times Module 3.  Uses m, scale, nulltime, PI and A1..A6.
\ Side effects: sets T[9]..T[11], changes the shared work registers,
\ and prints one row of the result table.
\ A3 and A5 are subtracted here, so this word expects them to hold the
\ values MAIN stores, i.e. it must run before MODULE-4 negates them.
: MODULE-3 ( -- )
	\ xx = PI/(3*m) is the step width; uu ss vv ww = 0e
	PI  m 3 * S>F  F/
	0e 0e 0e 0e [FTO] uu [FTO] ss [FTO] vv [FTO] ww [FTO] xx

	\ Timed kernel, m-1 iterations: vv counts up by one, uu = vv*xx,
	\ ss accumulates uu times the polynomial in uu*uu.
	TIMER-RESET
	  m 1 DO  1e       vv F+ [FTO] vv
		  vv xx F*	 [FTO] uu
		  uu FSQR 
		  A6 FOVER F* A5 F-  
		     FOVER F* A4 F+  
		     FOVER F* A3 F-  
		     FOVER F* A2 F+  
		     FOVER F* A1 F+  
		  F* F1+ uu F*	ss F+ [FTO] ss 
	    LOOP
	\ T[9] = scaled time, T[10] = T[9] per operation, T[11] = 1/T[10]
	SFMS? FDUP [FTO] T[9]  17e F/ FDUP [FTO] T[10]  1/F [FTO] T[11]

	\ End-point term at uu = PI/3, then the trapezoid sum; the error
	\ is the difference from 0.5.
	PI 3e F/ [FTO] uu
	uu FSQR  [FTO] ww
	A6 ww F* A5 F-  
	   ww F* A4 F+  
	   ww F* A3 F-  
	   ww F* A2 F+  
	   ww F* A1 F+  
	   ww F* 1e F+ uu F* ( sa)
	ss F2* F+ F2/  xx F*  0.5e F- ( error )
	
	CR ."     3   " FE. 7 SPACES  
	    	  T[9]	FE. 5 SPACES  
	          T[11]	FE. 5 SPACES ;

\ ( * ***********************************************************
\    Module 4.  Calculate Integral of cos(x) from 0.0 to PI/3 
\               using the Trapezoidal Method. Result is       
\               sin(PI/3). There are 15 double precision      
\               operations per loop (7 +, 0 -, 8 *, and 0 / ) 
\               included in the timing.                       
\               46.7% +, 00.0% -, 53.3% *, and 00.0% /
\    *********************************************************** * )

\ Times Module 4.  Uses m, scale, nulltime, PI, A0..A6 and B1..B6.
\ Side effects: negates the FVALUEs A3 and A5 (MAIN restores them at
\ the start of a run), sets T[12]..T[14], changes the shared work
\ registers, and prints one row of the result table.
: MODULE-4 ( -- )
	\ From here on every A term is added, so A3 and A5 change sign.
	A3 FNEGATE [FTO] A3
	A5 FNEGATE [FTO] A5
	\ xx = PI/(3*m) is the step width; uu ss vv ww = 0e
	PI  m 3 * S>F  F/
	0e 0e 0e 0e [FTO] uu [FTO] ss [FTO] vv [FTO] ww [FTO] xx
	
	\ Timed kernel, m-1 iterations.  The loop counter is kept as a
	\ float on the float stack; ss accumulates the B polynomial in
	\ the square of counter*xx.
	TIMER-RESET
	  0e
	  m 1 DO  
	  	  F1+ FDUP xx F*  FSQR
		  B6 FOVER F* B5 F+  
		     FOVER F* B4 F+  
		     FOVER F* B3 F+  
		     FOVER F* B2 F+  
		     FOVER F* B1 F+  
		           F* F1+ ss F+ [FTO] ss 
	    LOOP
	  FDROP
	\ T[12] = scaled time, T[13] = T[12] per operation, T[14] = 1/T[13]
	SFMS? FDUP [FTO] T[12]  15e F/ FDUP [FTO] T[13]  1/F [FTO] T[14]
	
	\ Trapezoid result: B polynomial at PI/3, plus 1, plus 2*ss,
	\ halved and scaled by xx.
	PI 3e F/ [FTO] uu
	uu FSQR  [FTO] ww
	B6 ww F* B5 F+  
	   ww F* B4 F+  
	   ww F* B3 F+  
	   ww F* B2 F+  
	   ww F* B1 F+  
	   ww F* F1+ ( sa)
	ss F2* F1+ F+ F2/  xx F*  ( sa )
	
	\ Reference value: uu times the A polynomial at PI/3; the error
	\ is the difference between the two.
	A6 ww F* A5 F+  
	   ww F* A4 F+  
	   ww F* A3 F+  
	   ww F* A2 F+  
	   ww F* A1 F+  
	   ww F* A0 F+ uu F* ( sb) F- ( error )

	CR ."     4   " FE. 7 SPACES  
		  T[12]	FE. 5 SPACES  
	    	  T[14]	FE. 5 SPACES ;

\ ( * ***********************************************************
\    Module 5.  Calculate Integral of tan(x) from 0.0 to PI/3 
\               using the Trapezoidal Method. Result is       
\               ln(cos(PI/3)). There are 29 double precision  
\               operations per loop (13 +, 0 -, 15 *, and 1 /)
\               included in the timing.                       
\               44.8% +, 00.0% -, 51.7% *, and 03.4% /
\    *********************************************************** * )

\ Times Module 5.  Uses m, scale, nulltime, PI, A1..A6 and B1..B6.
\ Side effects: sets T[15]..T[17], changes the shared work registers,
\ and prints one row of the result table.
\ Every A term is added here, so this word expects A3 and A5 in the
\ negated state that MODULE-4 leaves behind.
: MODULE-5 ( -- )
	\ xx = PI/(3*m) is the step width; uu ss vv ww = 0e
	PI  m 3 * S>F  F/
	0e 0e 0e 0e [FTO] uu [FTO] ss [FTO] vv [FTO] ww [FTO] xx

	\ Timed kernel, m-1 iterations.  The loop counter is kept as a
	\ float on the float stack.  With uu = counter*xx and ww = uu*uu,
	\ ss accumulates (uu * A polynomial) / (B polynomial).
	TIMER-RESET
	  0e
	  m 1 DO  
	  	  F1+ FDUP xx F* FDUP FSQR FSWAP FOVER ( ww uu ww )
		  A6 FOVER F* A5 F+  
		     FOVER F* A4 F+  
		     FOVER F* A3 F+  
		     FOVER F* A2 F+  
		     FOVER F* A1 F+  
		           F* F1+ 
		           F* ( ww vv ) FSWAP

		  B6 FOVER F* B5 F+  
		     FOVER F* B4 F+  
		     FOVER F* B3 F+  
		     FOVER F* B2 F+  
		     FOVER F* B1 F+  
		           F* F1+ F/ ss F+ [FTO] ss 
	    LOOP
	  FDROP
	\ T[15] = scaled time, T[16] = T[15] per operation, T[17] = 1/T[16]
	SFMS? FDUP [FTO] T[15]  29e F/ FDUP [FTO] T[16]  1/F [FTO] T[17]
	
	\ End-point term at uu = PI/3: the same quotient evaluated once.
	PI 3e F/ [FTO] uu
	uu FSQR  [FTO] ww

	A6 ww F* A5 F+  
	   ww F* A4 F+  
	   ww F* A3 F+  
	   ww F* A2 F+  
	   ww F* A1 F+  
	   ww F* F1+ uu F* ( sa ) 

	B6 ww F* B5 F+  
	   ww F* B4 F+  
	   ww F* B3 F+  
	   ww F* B2 F+  
	   ww F* B1 F+  
	   ww F* F1+ F/ ( sa/sb )

	\ Trapezoid sum; the error is the difference from the constant below.
	ss F2* F+ F2/  xx F*  0.6931471805599453e F- ( error )
	
	CR ."     5   " FE. 7 SPACES  
		  T[15]	FE. 5 SPACES  
		  T[17]	FE. 5 SPACES ;

\ ( * ***********************************************************
\    Module 6.  Calculate Integral of sin(x)*cos(x) from 0.0  
\               to PI/4 using the Trapezoidal Method. Result  
\               is sin(PI/4)^2. There are 29 double precision 
\               operations per loop (13 +, 0 -, 16 *, and 0 /)
\               included in the timing.                       
\               44.8% +, 00.0% -, 55.2% *, and 00.0% /
\    *********************************************************** * )

\ Times Module 6.  Uses m, scale, nulltime, PI, A1..A6 and B1..B6.
\ Side effects: sets T[18]..T[20], changes the shared work registers,
\ and prints one row of the result table.
\ Every A term is added here, so this word expects A3 and A5 in the
\ negated state that MODULE-4 leaves behind.
: MODULE-6 ( -- )
	\ xx = PI/(4*m) is the step width; uu ss vv ww = 0e
	PI  m 4 * S>F  F/
	0e 0e 0e 0e [FTO] uu [FTO] ss [FTO] vv [FTO] ww [FTO] xx

	\ Timed kernel, m-1 iterations.  The loop counter is kept as a
	\ float on the float stack.  With uu = counter*xx and ww = uu*uu,
	\ ss accumulates (uu * A polynomial) * (B polynomial).
	TIMER-RESET
	  0e
	  m 1 DO
	  	  F1+ FDUP xx F* FDUP
		  FSQR ( uu ww ) FSWAP FOVER ( ww uu ww )
		  A6 FOVER F* A5 F+  
		     FOVER F* A4 F+  
		     FOVER F* A3 F+  
		     FOVER F* A2 F+  
		     FOVER F* A1 F+  
		           F* F1+ 
		  F* ( vv ) FSWAP

		  B6 FOVER F* B5 F+  
		     FOVER F* B4 F+  
		     FOVER F* B3 F+  
		     FOVER F* B2 F+  
		     FOVER F* B1 F+  
		     	   F* F1+ F* ss F+ [FTO] ss 
	    LOOP
	  FDROP
	\ T[18] = scaled time, T[19] = T[18] per operation, T[20] = 1/T[19]
	SFMS? FDUP [FTO] T[18]  29e F/ FDUP [FTO] T[19]  1/F [FTO] T[20]

	\ End-point term at uu = PI/4: the same product evaluated once.
	PI 4e F/ [FTO] uu
	uu FSQR  [FTO] ww

	A6 ww F* A5 F+  
	   ww F* A4 F+  
	   ww F* A3 F+  
	   ww F* A2 F+  
	   ww F* A1 F+  
	   ww F* F1+ uu F* ( sa ) 

	B6 ww F* B5 F+  
	   ww F* B4 F+  
	   ww F* B3 F+  
	   ww F* B2 F+  
	   ww F* B1 F+  
	   ww F* F1+ F* ( sa*sb )

	\ Trapezoid sum; the error is the difference from 0.25.
	ss F2* F+ F2/  xx F*  0.25e F- ( error )

	CR ."     6   " FE. 7 SPACES  
		  T[18]	FE. 5 SPACES  
		  T[20]	FE. 5 SPACES ;

\ ( * ******************************************************
\    Module 7.  Calculate value of the definite integral 
\               from 0 to sa of 1/(x+1), x/(x*x+1), and  
\               x*x/(x*x*x+1) using the Trapezoidal Rule.
\               There are 12 double precision operations 
\               per loop ( 3 +, 3 -, 3 *, and 3 / ) that 
\               are included in the timing.              
\               25.0% +, 25.0% -, 25.0% *, and 25.0% /   
\    ****************************************************** * )

\ Times Module 7.  Uses m, scale and nulltime.
\ Side effects: sets T[21]..T[23], recomputes m, changes the shared
\ work registers, and prints one row of the result table.
: MODULE-7 ( -- )
	\ sa = upper integration limit, ww = 1e, ss uu vv xx = 0e
	0e 0e 0e 0e 1e 102.3321513995275e [FTO] sa [FTO] ww [FTO] ss [FTO] uu [FTO] vv [FTO] xx
	\ vv = sa/m is the step width
	sa m S>F F/ [FTO] vv

	\ Timed kernel, m-1 iterations.  The loop counter is kept as a
	\ float on the float stack.  With xx = counter*vv and uu = xx*xx,
	\ ss = ss - ( ww/(ww+xx) + xx/(uu+ww) + uu/(uu*xx+ww) ).
	TIMER-RESET
	0e
	m 1 DO  
		F1+ FDUP vv F* FDUP [FTO] xx
		( xx) FSQR          [FTO] uu
		ss
		ww  FDUP       xx F+  F/  
		xx  uu         ww F+  F/  F+
		uu  FDUP xx F* ww F+  F/  F+ F- [FTO] ss
	  LOOP
	FDROP

	\ T[21] = scaled time, T[22] = T[21] per operation, T[23] = 1/T[22]
	SFMS? FDUP [FTO] T[21]  12e F/ FDUP [FTO] T[22]  1/F [FTO] T[23]

	\ End-point term at xx = sa, negated, plus 2*ss, times 18*vv.
	sa      [FTO] xx
	xx FSQR [FTO] uu
	ww	ww  xx       ww F+ F/  F+
		xx  uu       ww F+ F/  F+
		uu  xx uu F* ww F+ F/  F+  FNEGATE ( sa)
	ss F2* F+  vv F* 18e F* [FTO] sa

	\ New m for MODULE-8; sa is not truncated to an integer before the
	\ multiplication by -2000e.
	sa -2000e F*  scale F/ F>S TO m 

	sa 500.2e F+ ( error)

	CR ."     7   " FE. 7 SPACES  
		  T[21] FE. 5 SPACES  
		  T[23]	FE. 5 SPACES ;

\ ( * ***********************************************************
\    Module 8.  Calculate Integral of sin(x)*cos(x)*cos(x)    
\               from 0 to PI/3 using the Trapezoidal Method.  
\               Result is (1-cos(PI/3)^3)/3. There are 30     
\               double precision operations per loop included 
\               in the timing:                                
\                  13 +,     0 -,    17 *          0 /        
\               43.3% +, 00.0% -, 56.7% *, and 00.0% /
\    *********************************************************** * )

\ Times Module 8.  Uses m, scale, nulltime, PI, A1..A6 and B1..B6.
\ Side effects: sets T[24]..T[26], changes the shared work registers,
\ and prints one row of the result table.
\ Every A term is added here, so this word expects A3 and A5 in the
\ negated state that MODULE-4 leaves behind.
: MODULE-8 ( -- )
	\ xx = PI/(3*m) is the step width; uu ss vv ww = 0e
	PI  m 3 * S>F  F/
	0e 0e 0e 0e [FTO] uu [FTO] ss [FTO] vv [FTO] ww [FTO] xx

	\ Timed kernel, m-1 iterations.  The loop counter is kept as a
	\ float on the float stack.  With uu = counter*xx and ww = uu*uu,
	\ ss accumulates (uu * A polynomial) * (B polynomial) squared.
	TIMER-RESET
	  0e
	  m 1 DO
	  	  F1+ FDUP xx F*
		  FDUP FSQR  ( uu ww )
		  B6 FOVER F* B5 F+  
		     FOVER F* B4 F+  
		     FOVER F* B3 F+  
		     FOVER F* B2 F+  
		     FOVER F* B1 F+  
		     FOVER F* F1+ 
		  FSQR ( vv^2 )	FSWAP

		  A6 FOVER F* A5 F+  
		     FOVER F* A4 F+  
		     FOVER F* A3 F+  
		     FOVER F* A2 F+  
		     FOVER F* A1 F+  
		           F* F1+ ( uu vv^2 w )
		  F* F* ss F+ [FTO] ss
	    LOOP
	  FDROP
	\ T[24] = scaled time, T[25] = T[24] per operation, T[26] = 1/T[25]
	SFMS? FDUP [FTO] T[24]  30e F/ FDUP [FTO] T[25]  1/F [FTO] T[26]

	\ End-point term at uu = PI/3: the same product evaluated once.
	PI 3e F/ [FTO] uu
	uu FSQR  [FTO] ww

	A6 ww F* A5 F+  
	   ww F* A4 F+  
	   ww F* A3 F+  
	   ww F* A2 F+  
	   ww F* A1 F+  
	   ww F* F1+ uu F* ( sa ) 

	B6 ww F* B5 F+  
	   ww F* B4 F+  
	   ww F* B3 F+  
	   ww F* B2 F+  
	   ww F* B1 F+  
	   ww F* F1+ FSQR F* ( sa*sb^2 )

	\ Trapezoid sum; the error is the difference from the constant below.
	ss F2* F+ F2/  xx F*  0.29166666666666667e F- ( error )

	CR ."     8   " FE. 7 SPACES  
		  T[24]	FE. 5 SPACES  
		  T[26]	FE. 5 SPACES ;

\ *************************************************   
\   MFLOPS(1) output. This is the same weighting   
\   used for all previous versions of the flops.c  
\   program. Includes Modules 2 and 3 only.        
\ *************************************************  
\ Stores (5*(T[6]-T[5]) + T[9]) / 52 in T[27] and its reciprocal in
\ T[28]; returns the reciprocal.
: MFLOPS(1) ( F: -- t )
	T[6] T[5] F- 5e F*  T[9] F+  52e F/  
	FDUP [FTO] T[27]  1/F FDUP [FTO] T[28] ;

\ *************************************************
\   MFLOPS(2) output. This output does not include 
\   Module 2, but it still does 9.2% FDIV's.       
\ ************************************************* 
\ Stores (T[2]+T[9]+T[12]+T[15]+T[18] + 4*T[21]) / 152 in T[29] and its
\ reciprocal in T[30]; returns the reciprocal.
: MFLOPS(2) ( F: -- t )
	T[2] T[9] F+ T[12] F+ T[15] F+ T[18] F+ T[21] 4e F* F+  152e F/  
	FDUP [FTO] T[29]  1/F FDUP [FTO] T[30] ;

\ *************************************************
\   MFLOPS(3) output. This output does not include 
\   Module 2, but it still does 3.4% FDIV's.       
\ *************************************************
\ Stores (T[2]+T[9]+T[12]+T[15]+T[18]+T[21]+T[24]) / 146 in T[31] and
\ its reciprocal in T[32]; returns the reciprocal.
: MFLOPS(3) ( F: -- t )
	T[2] T[9] F+ T[12] F+ T[15] F+ T[18] F+ T[21] F+ T[24] F+ 146e F/  
	FDUP [FTO] T[31]  1/F FDUP [FTO] T[32] ;
	
\ *************************************************
\   MFLOPS(4) output. This output does not include 
\   Module 2, and it does NO FDIV's.               
\ *************************************************
\ Stores (T[9]+T[12]+T[18]+T[24]) / 91 in T[33] and its reciprocal in
\ T[34]; returns the reciprocal.
: MFLOPS(4) ( F: -- t )
	T[9] T[12] F+  T[18] F+ T[24] F+ 91e F/  
 	FDUP [FTO] T[33] 1/F FDUP [FTO] T[34] ;


\ Runs the whole benchmark and prints the result table.
\ The modules must run in this order: MODULE-1 sets scale, nulltime and
\ m; MODULE-2 and MODULE-7 recompute m; MODULE-4 negates A3 and A5 for
\ the modules that follow it.
: MAIN ( -- )
	\ Print floats with 4 digits; the caller's precision is restored
	\ on the last line.
	PRECISION >R 4 SET-PRECISION
	CR ."    FLOPS Forth Program (Double Precision), V2.0 14 Jan 2001" 
	CR 
	\ Reset the state a previous run may have changed.
	1e6 loops S>F F/ [FTO] scale
	loops TO m
	0.198412715551283e-3 [FTO] A3
 	0.2507059876207e-7   [FTO] A5

	CR ."   Module     Error        RunTime      MFLOPS" 
	CR ."                            (usec)" 
	CR
	MODULE-1	MODULE-2	
	MODULE-3	MODULE-4
	MODULE-5	MODULE-6
	MODULE-7	MODULE-8
	CR 
	\ m is the value left by MODULE-7.  Each MFLOPS(n) call also
	\ stores its two results in the T[n] table.
	CR ."    Iterations      = " m         EOL 10  .R 
	CR ."    NullTime (usec) = " nulltime  EOL FE.
	CR ."    MFLOPS(1)       = " MFLOPS(1) EOL FE.  
	CR ."    MFLOPS(2)       = " MFLOPS(2) EOL FE.
	CR ."    MFLOPS(3)       = " MFLOPS(3) EOL FE.
	CR ."    MFLOPS(4)       = " MFLOPS(4) EOL FE.
	CR
	R> SET-PRECISION ;

0 [IF]

  ( P54c )
FORTH> main
   FLOPS Forth Program (Double Precision), V2.0 14 Jan 2001

  Module     Error        RunTime      MFLOPS
                           (usec)
    1      4.4764E-13      0.4775     29.3194
    2     -6.9400E-14      0.3472     20.1584
    3     -8.7708E-15      0.3845     44.2133
    4      4.2299E-14      0.3290     45.5927
    5      2.4980E-14      0.7865     36.8722
    6     -2.8311E-15      0.6035     48.0530
    7     -5.1273E-11      0.8840     13.5747
    8      3.0198E-14      0.5985     50.1253

   Iterations      =    4001600
   NullTime (usec) =     0.0365
   MFLOPS(1)       =    24.5196
   MFLOPS(2)       =    24.8488
   MFLOPS(3)       =    35.9296
   MFLOPS(4)       =    47.5072

  ( PII - 350 MHz ) ---------------------------------------------
   FLOPS Forth Program (Double Precision), V2.0 14 Jan 2001

  Module     Error        RunTime      MFLOPS
                           (usec)
    1      2.5935E-13      0.1568     89.3142
    2      1.6160E-13      0.0729     96.0549
    3     -1.9445E-14      0.1365    124.5421
    4     -1.4575E-13      0.1266    118.5185
    5     -1.1170E-13      0.2874    100.8915
    6     -1.3711E-15      0.2406    120.5195
    7     -6.2357E-11      0.3480     34.4828
    8     -1.2270E-14      0.2268    132.3043

   Iterations      =   16006400
   NullTime (usec) =     0.0114
   MFLOPS(1)       =   103.8183
   MFLOPS(2)       =    64.9607
   MFLOPS(3)       =    95.8870
   MFLOPS(4)       =   124.5829

  ( Athlon 900 MHz ) ------------------------------------------
   FLOPS Forth Program (Double Precision), V2.0 14 Jan 2001

  Module     Error        RunTime      MFLOPS
                           (usec)
    1      4.8317E-13      0.0260    538.4615
    2      1.8135E-13      0.0150    466.1811
    3     -9.6647E-15      0.0394    431.9174
    4      1.5201E-13      0.0263    570.7491
    5     -1.2225E-13      0.0498    582.0006
    6     -6.9949E-15      0.0454    638.4589
    7     -1.1880E-11      0.0782    153.5386
    8     -8.7737E-14      0.0452    662.9834

   Iterations      =   64025600
   NullTime (usec) =     0.0073
   MFLOPS(1)       =   454.3965
   MFLOPS(2)       =   304.2948
   MFLOPS(3)       =   470.5171
   MFLOPS(4)       =   582.1671

   ( lcc on p54c ) --------------------------------------------
   FLOPS C Program (Double Precision), V2.0 18 Dec 1992

   Module     Error        RunTime      MFLOPS
                            (usec)
     1    -4.6896e-013      0.8074     17.3388
     2     2.2160e-013      0.6187     11.3148
     3    -6.9944e-015      0.8015     21.2102
     4    -9.7256e-014      0.5705     26.2913
     5    -1.6542e-014      1.2869     22.5352
     6     4.3632e-014      1.1051     26.2429
     7    -4.9454e-011      1.3232      9.0692
     8     7.2164e-014      1.2262     24.4661

   Iterations      =   32000000
   NullTime (usec) =     0.0181
   MFLOPS(1)       =    13.3512
   MFLOPS(2)       =    15.4095
   MFLOPS(3)       =    20.5035
   MFLOPS(4)       =    24.5728


"c:\lcc\src\flops\lcc\flops.exe"
Return code 33

   ( ms vc++ 6.0 on Athlon 900 MHz )


   FLOPS C Program (Double Precision), V2.0 18 Dec 1992

   Module     Error        RunTime      MFLOPS
                            (usec)
     1     1.3358e-012      0.0311    449.5735
     2     2.0517e-013      0.0188    373.3333
     3     1.7542e-014      0.0423    402.2181
     4    -5.4512e-014      0.0340    441.7855
     5     3.3307e-016      0.0631    459.7473
     6    -1.9040e-014      0.0617    470.3497
     7     2.6034e-011      0.0521    230.2849
     8    -5.4068e-014      0.0617    486.5687

   Iterations      =   64000000
   NullTime (usec) =     0.0052
   MFLOPS(1)       =   382.3090
   MFLOPS(2)       =   345.0380
   MFLOPS(3)       =   422.1369
   MFLOPS(4)       =   456.0689

[THEN]

\ Usage hint, printed once when the file has finished loading.
: .ABOUT ( -- )
	CR  S" Try: MAIN " TYPE ;

.ABOUT

                              ( * End of Source * )