From: Tom Barrett <barrett@pacific.mps.ohio-state.edu>
Date: Tue, 26 Jul 1994 10:06:11 -0400

This is an update to info-mac/sci/fft-in-asm-src.txt

This file contains three routines:
* void tb_68k_four1_extended(long double *data, long nn, long isign);
* void tb_68k_four1_single(float *data, long nn, long isign);
* void tb_68k_fourn_single(float *data, long *nn, long ndim, long isign)

------------------------------ CUT HERE ------------------------------

/* This code is a hand-assembled version of the fft routine from Numerical
Recipes.  See the book for information about how it works.  All variable
names in comments refer to those in the book.

To use this routine:
* You must have a math coprocessor.
* Use Think C (users of other compilers may be able to adapt it).

This file contains three routines:
* void tb_68k_four1_extended(long double *data, long nn, long isign);
* void tb_68k_four1_single(float *data, long nn, long isign);
* void tb_68k_fourn_single(float *data, long *nn, long ndim, long isign)

The data to be processed is stored in an array of nn complex numbers, so
that data[] is an array of 2*nn extendeds or floats.

Notes:

* Set "Native floating-point format" under "Compiler Settings" in the
	Options... dialog box for the extended routine.  This uses the
	12-byte format which the math coprocessor uses internally.

* This routine is DESTRUCTIVE.  The output data is placed in the space
	where the input data was.  If you still want the input, make a copy
	and pass the copy to the routine.

* In the book Numerical recipes in C, from which this routine is taken,
	the first element of the array is accessed as data[1].  This is due
	the the book's Fortran heritage.  In the the first element is data[0].
	One way to compensate is to subtract one from the pointer before calling
	the FFT routine.  Another way would be to use data[i-1] and data[i]
	instead of data[i] and data[i+1] (they always occur in pairs).  This
	routine expects 'data' to be a pointer to the first element of the array.
	If you are replacing the C version, and compensated for this in the
	routine that called four1 (like the book suggests), then this is an issue.

* 'nn' must be a power of 2 (like 8, 16, 32...).  Useable range is between
	8 and 128M (2^27 complex numbers) for four1.  Fourn has been tested up
	to 1024^2.

* 'isign' must be 1 or -1, where -1 corresponds to an inverse fft.


Test results (Quadra 950) for 1024-point IFFT:

four1		extended		single

C			56 msec			27
asm			29				20
DSP			---				1.9 *

* this number from August 92 Byte magazine

fourn		32^2		128^2		512^2		1024^2

C			25 msec		557			12.8 sec	59.4
asm			19			425			10.14		47.1

---> about 30% faster !

Thomas Barrett
Physics Dept, Ohio State University
barrett@pacific.mps.ohio-state.edu
1994

Thanks to:  Dan Flatin & Pascal Laubin.
*/

#define	PI	3.1415926535897932384626433

/* ----------------------- tb_four1.c -----------------------------

optimized version of Numerical Recipes' fft routine

Thomas Barrett, 1993

written for Think C
this routine assumes that data contains 12-byte 6888x-native long doubles
also, you must have a math coprocessor to run this routine

-------------------------------------------------------------------
register usage:

d0	I				a0	data[I]			fp0	WPR
d1	J				a1	data[J]			fp1	WPI
d2	M				a2	data[0]			fp2	WR
d3	loop, MMAX							fp3	WI
d4	ISTEP								fp4	TEMPR	\
d5	NN,N								fp5	TEMPI	 \	internal
d6	internal							fp6			 /	calculations
										fp7			/
---------------------------------------------------------------- */

void tb_68k_four1_extended(long double *data, long nn, long isign)
{
	long double twopi = 2.0 * PI * isign;
	
	asm 68020, 68881 {
		movem.l	a2/d3-d6,-(sp)
		fmovem.x	fp4-fp7,-(sp)
		
		move.l	nn,d5
		clr.l	d3
		move.l	d5,d3		; d3 = loop counter
		move.l	#-1,d0		; i(d0) = -1
		movea.l	data,a0
		suba.l	#12,a0		; pointer to array indexed by 0
		movea.l	a0,a2		; a2 = *(data[0])
		suba.l	#12,a0
		
	; ------------ re-order values ---------------------------------
		
		move.l	#1,d1		; j(d1) = 1
@bits	adda.l	#24,a0		; a0 = *(data[i])
		addq.l	#2,d0		; i += 2
		cmp.l	d1,d0		; cmp j,i
		bge		@nosw		; branch if i(d0) >= j(d1)
@swap	movea.l	a2,a1
		move.l	d1,d6
	;	mulu.l	#12,d6
		lsl.l	#2,d6		; these four instructions are equivalent to
		adda.l	d6,a1		; the mulu.l #12 and save a dozen cycles
		lsl.l	#1,d6
		adda.l	d6,a1		; a1 = *(data[j])

		fmove.x	(a0),fp0	; swap
		fmove.x	(a1),fp1
		fmove.x	fp1,(a0)
		fmove.x	fp0,(a1)
		fmove.x	12(a0),fp0
		fmove.x	12(a1),fp1
		fmove.x	fp1,12(a0)
		fmove.x	fp0,12(a1)

@nosw	move.l	d5,d2		; m(d2) = nn(d5) = #points
@jloop	cmp.l	#2,d2
		blt		@jrdy		; branch if m(d2) < 2
		cmp.l	d2,d1
		ble		@jrdy		; branch if j(d1) <= m(d2)
@fixj	sub.l	d2,d1		; j -= m
		lsr.l	#1,d2		; m /= 2
		bra		@jloop
@jrdy	add.l	d2,d1		; j += m
		subq.l	#1,d3
		bne		@bits
		
	; --------------- order is now ready -------------------------
		
		lsl.l	#1,d5		; n(d5) = 2*nn(was d5) = #long doubles
		move.l	#2,d3		; mmax(d3) = 2
		
	; -------------------- outer loop -----------------------------
		
@loop	cmp.l	d3,d5
		ble		@done		; branch if n(d5) <= mmax(d3)
		move.l	d3,d4
		lsl.l	#1,d4		; istep(d4) = 2*mmax(d3)
		fmove.x	twopi,fp1
		fmove.l	d3,fp0
		fdiv.x	fp0,fp1		; theta(fp1) = 2 pi / mmax(d3)
		fmove.x	fp1,fp0
		fmove.w	#2,fp2
		fdiv.x	fp2,fp0		; fp0 = 1/2 theta
		fsin.x	fp0
		fmul.x	fp0,fp0
		fmul.x	fp2,fp0
		fneg.x	fp0			; wpr(fp0) = -2 sin^2(1/2 theta)
		fsin.x	fp1			; wpi(fp1) = sin(theta)
		fmove.w	#1,fp2		; wr(fp2) = 1
		fmove.w	#0,fp3		; wi(fp3) = 0
		
	; ------------------ inner loops -------------------------
		
		move.l	#1,d2		; m(d2) = 1
@mloop	move.l	d2,d0		; i(d0) = m(d2)

		move.l	d0,d6		; i(d0)
		movea.l	a2,a0
		lsl.l	#2,d6
		adda.l	d6,a0
		lsl.l	#1,d6
		adda.l	d6,a0		; a0 = pointer to 1st i
		movea.l	a0,a1
		move.l	d3,d6		; mmax(d3)
		lsl.l	#2,d6
		adda.l	d6,a1
		lsl.l	#1,d6
		adda.l	d6,a1		; a1 = pointer to 1st j
		move.l	d4,d6		; istep(d4)
		mulu.l	#12,d6		; 12 * istep. pointer increment

@iloop	move.l	d0,d1
		add.l	d3,d1		; j(d1) = i(d0) + mmax(d3)
		
		fmove.x	(a1),fp4	; fp4 = data[j]
		fmove.x	fp4,fp7
		fmul.x	fp2,fp4
		fmove.x	12(a1),fp6	; fp6 = data[j+1]
		fmove.x	fp6,fp5
		fmul.x	fp3,fp6
		fsub.x	fp6,fp4		; tempr(fp4) = wr(fp2)*data[j] - wi(fp3)*data[j+1]
		fmul.x	fp2,fp5
		fmul.x	fp3,fp7
		fadd.x	fp7,fp5		; tempi(fp5) = wr*data[j+1] + wi*data[j]
		
		fmove.x	(a0),fp6	; fp6 = data[i]
		fmove.x	fp6,fp7
		fadd.x	fp4,fp6
		fmove.x	fp6,(a0)	; data[i] = data[i] + tempr(fp4)
		fsub.x	fp4,fp7
		fmove.x	fp7,(a1)	; data[j] = data[i] - tempr(fp4)

		fmove.x	12(a0),fp6	; fp6 = data[i+1]
		fmove.x	fp6,fp7
		fadd.x	fp5,fp6
		fmove.x	fp6,12(a0)	; data[i+1] = data[i+1] + tempi(fp5)
		fsub.x	fp5,fp7
		fmove.x	fp7,12(a1)	; data[j+1] = data[i+1] - tempi(fp5)
		
		adda.l	d6,a0
		adda.l	d6,a1

		add.l	d4,d0		; i(d0) += istep(d4)
		cmp.l	d5,d0
		ble		@iloop		; branch if i(d0) <= n(d5)
		
	; ---------------update wr & wi ------------------------
		
		fmove.x	fp2,fp5		; wtemp(fp5) = wr(fp2)
		fmove.x	fp2,fp6
		fmul.x	fp0,fp6
		fadd.x	fp6,fp2		; wr(fp2) += wr(fp2) * wpr(fp0)
		fmove.x	fp3,fp6
		fmul.x	fp1,fp6
		fsub.x	fp6,fp2		; wr(fp2) -= wi(fp3) * wpi(fp1)
		fmove.x	fp3,fp6
		fmul.x	fp0,fp6
		fadd.x	fp6,fp3		; wi(fp3) += wi(fp3) * wpr(fp0)
		fmul.x	fp1,fp5
		fadd.x	fp5,fp3		; wi(fp3) += wtemp(fp5) * wpi(fp1)
		
		addq.l	#2,d2		; m(d2) += 2
		cmp.l	d3,d2
		blt		@mloop		; branch if m(d2) < mmax(d3)
		move.l	d4,d3		; mmax(d3) = istep(d4)
		bra		@loop

	; -------------------- done ----------------------------

@done	fmovem.x	(sp)+,fp4-fp7
		movem.l	(sp)+,a2/d3-d6
	}
}


void tb_68k_four1_single(float *data, long nn, long isign)
{
	float twopi = 2.0 * PI * isign;
	
	asm 68020, 68881 {
		movem.l	a2/d3-d6,-(sp)
		fmovem.x	fp4-fp7,-(sp)
		
		move.l	nn,d5
		clr.l	d3
		move.l	d5,d3		; d3 = loop counter
		move.l	#-1,d0		; i(d0) = -1
		movea.l	data,a0
		suba.l	#4,a0		; pointer to array indexed by 0
		movea.l	a0,a2		; a2 = *(data[0])
		suba.l	#4,a0
		
	; ------------ re-order values ---------------------------------
		
		move.l	#1,d1		; j(d1) = 1
@bits	adda.l	#8,a0		; a0 = *(data[i])
		addq.l	#2,d0		; i += 2
		cmp.l	d1,d0		; cmp j,i
		bge		@nosw		; branch if i(d0) >= j(d1)
@swap	movea.l	a2,a1
		move.l	d1,d6
	;	mulu.l	#4,d6
		lsl.l	#2,d6
		adda.l	d6,a1		; a1 = *(data[j])

		move.l	(a0),d2
		move.l	(a1),d4
		move.l	d4,(a0)
		move.l	d2,(a1)
		move.l	4(a0),d2
		move.l	4(a1),d4
		move.l	d4,4(a0)
		move.l	d2,4(a1)
		
@nosw	move.l	d5,d2		; m(d2) = nn(d5) = #points
@jloop	cmp.l	#2,d2
		blt		@jrdy		; branch if m(d2) < 2
		cmp.l	d2,d1
		ble		@jrdy		; branch if j(d1) <= m(d2)
@fixj	sub.l	d2,d1		; j -= m
		lsr.l	#1,d2		; m /= 2
		bra		@jloop
@jrdy	add.l	d2,d1		; j += m
		subq.l	#1,d3
		bne		@bits
		
	; --------------- order is now ready -------------------------
		
		lsl.l	#1,d5		; n(d5) = 2*nn(was d5) = #long doubles
		move.l	#2,d3		; mmax(d3) = 2
		
	; -------------------- outer loop -----------------------------
		
@loop	cmp.l	d3,d5
		ble		@done		; branch if n(d5) <= mmax(d3)
		move.l	d3,d4
		lsl.l	#1,d4		; istep(d4) = 2*mmax(d3)
		fmove.s	twopi,fp1
		fmove.l	d3,fp0
		fdiv.x	fp0,fp1		; theta(fp1) = 2 pi / mmax(d3)
		fmove.x	fp1,fp0
		fmove.w	#2,fp2
		fdiv.x	fp2,fp0		; fp0 = 1/2 theta
		fsin.x	fp0
		fmul.x	fp0,fp0
		fmul.x	fp2,fp0
		fneg.x	fp0			; wpr(fp0) = -2 sin^2(1/2 theta)
		fsin.x	fp1			; wpi(fp1) = sin(theta)
		fmove.w	#1,fp2		; wr(fp2) = 1
		fmove.w	#0,fp3		; wi(fp3) = 0
		
	; ------------------ inner loops -------------------------
		
		move.l	#1,d2		; m(d2) = 1
@mloop	move.l	d2,d0		; i(d0) = m(d2)

		move.l	d0,d6		; i(d0)
		movea.l	a2,a0
		lsl.l	#2,d6
		adda.l	d6,a0		; a0 = pointer to 1st i
		movea.l	a0,a1
		move.l	d3,d6		; mmax(d3)
		lsl.l	#2,d6
		adda.l	d6,a1		; a1 = pointer to 1st j
		move.l	d4,d6		; istep(d4)
		mulu.l	#4,d6		; 4 * istep. pointer increment

@iloop	move.l	d0,d1
		add.l	d3,d1		; j(d1) = i(d0) + mmax(d3)

		fmove.s	(a1),fp4	; fp4 = data[j]
		fmove.x	fp4,fp7
		fmul.x	fp2,fp4
		fmove.s	4(a1),fp6	; fp6 = data[j+1]
		fmove.x	fp6,fp5
		fmul.x	fp3,fp6
		fsub.x	fp6,fp4		; tempr(fp4) = wr(fp2)*data[j] - wi(fp3)*data[j+1]
		fmul.x	fp2,fp5
		fmul.x	fp3,fp7
		fadd.x	fp7,fp5		; tempi(fp5) = wr*data[j+1] + wi*data[j]
		
		fmove.s	(a0),fp6	; fp6 = data[i]
		fmove.x	fp6,fp7
		fadd.x	fp4,fp6
		fmove.s	fp6,(a0)	; data[i] = data[i] + tempr(fp4)
		fsub.x	fp4,fp7
		fmove.s	fp7,(a1)	; data[j] = data[i] - tempr(fp4)

		fmove.s	4(a0),fp6	; fp6 = data[i+1]
		fmove.x	fp6,fp7
		fadd.x	fp5,fp6
		fmove.s	fp6,4(a0)	; data[i+1] = data[i+1] + tempi(fp5)
		fsub.x	fp5,fp7
		fmove.s	fp7,4(a1)	; data[j+1] = data[i+1] - tempi(fp5)
		
		adda.l	d6,a0
		adda.l	d6,a1

		add.l	d4,d0		; i(d0) += istep(d4)
		cmp.l	d5,d0
		ble		@iloop		; branch if i(d0) <= n(d5)
		
	; ---------------update wr & wi ------------------------
		
		fmove.x	fp2,fp5		; wtemp(fp5) = wr(fp2)
		fmove.x	fp2,fp6
		fmul.x	fp0,fp6
		fadd.x	fp6,fp2		; wr(fp2) += wr(fp2) * wpr(fp0)
		fmove.x	fp3,fp6
		fmul.x	fp1,fp6
		fsub.x	fp6,fp2		; wr(fp2) -= wi(fp3) * wpi(fp1)
		fmove.x	fp3,fp6
		fmul.x	fp0,fp6
		fadd.x	fp6,fp3		; wi(fp3) += wi(fp3) * wpr(fp0)
		fmul.x	fp1,fp5
		fadd.x	fp5,fp3		; wi(fp3) += wtemp(fp5) * wpi(fp1)
		
		addq.l	#2,d2		; m(d2) += 2
		cmp.l	d3,d2
		blt		@mloop		; branch if m(d2) < mmax(d3)
		move.l	d4,d3		; mmax(d3) = istep(d4)
		bra		@loop

	; -------------------- done ----------------------------

@done	fmovem.x	(sp)+,fp4-fp7
		movem.l	(sp)+,a2/d3-d6
	}
}


/*-------------------------------------------------------------------
register usage:

d0	nprev	ip3		ip3				fp0	WPR
d1			i1		i1				fp1	WPI
d2			i2		i2				fp2	WR
d3			i3		i3				fp3	WI
d4			i2rev	ifp1			fp4	TEMPR	\
d5	idim,n	ip2		ip2				fp5	TEMPI	 \	internal
d6	*		*,ibit	ifp2			fp6			 /	calculations
d7	*		*		*				fp7			/

		* = temp
---------------------------------------------------------------- */

void tb_68k_fourn_single(float *data, long *nn, long ndim, long isign)
{
	float twopi = 2.0 * PI * isign;
	long ip1,idim,n,nprev,ntot;

	asm 68020, 68881 {
		movem.l	a2/d3-d7,-(sp)
		fmovem.x	fp4-fp7,-(sp)
		
		move.l	#1,d6		; d6 = ntot
		movea.l	data,a2
		suba.l	#4,a2		; a2 = pointer to array indexed by zero
		movea.l	nn,a1
		move.l	ndim,d7
		subq.l	#1,d7
@1		mulu.l	(a1),d6
		adda.l	#4,a1
		dbra	d7,@1
		move.l	d6,ntot
		
	; -------------------- begin outer loop  ---------------
	
		move.l	#1,d0		; nprev(d0) = 1
		move.l	ndim,d5		; idim(d5) = ndim
@loop0	move.l	d5,idim		; save idim
		move.l	d0,nprev	; save nprev
		
		subq.l	#1,d5
		lsl.l	#2,d5
		movea.l	nn,a1
		adda.l	d5,a1
		move.l	(a1),d5		; n(d5)=nn[idim]
		move.l	d5,n
		
		move.l	d0,d7
		move.l	d0,d4
		mulu.l	d5,d4
		move.l	ntot,d0
		divu.l	d4,d0		; nrem(d0) = ntot/(n*nprev)
		
		lsl.l	#1,d7		; ip1 = nprev << 1
		move.l	d7,ip1
		mulu.l	d7,d5		; ip2(d5) = ip1 * n
		mulu.l	d5,d0		; ip3(d0) = ip2 * nrem
		move.l	#1,d4		; i2rev(d4) = 1
		
	; -------------------- loop 1 --------------------------

		move.l	#1,d2		; i2(d2) = 1
@loop1	cmp.l	d4,d2
		bge		@l1a		; skip if i2(d2) >= i2rev(d4)

		move.l	d2,d1
@l1b	move.l	d1,d3
		
@l1c	move.l	d4,d7
		add.l	d3,d7
		sub.l	d2,d7		; i3rev(d7) = i2rev(d4) + i3(d3) - i2(d2)

@swap	move.l	d3,d6
		lsl.l	#2,d6
		movea.l	a2,a0
		adda.l	d6,a0		; a0 -> data[i3]
		move.l	d7,d6
		lsl.l	#2,d6
		movea.l	a2,a1
		adda.l	d6,a1		; a1 -> data[i3rev]

		move.l	(a0),d6
		move.l	(a1),d7
		move.l	d6,(a1)
		move.l	d7,(a0)
		move.l	4(a0),d6
		move.l	4(a1),d7
		move.l	d6,4(a1)
		move.l	d7,4(a0)

		add.l	d5,d3
		cmp.l	d0,d3
		ble		@l1c
		
		add.l	#2,d1
		move.l	d2,d7
		add.l	ip1,d7
		subq.l	#2,d7
		cmp.l	d7,d1
		ble		@l1b

@l1a	move.l	d5,d6
		lsr.l	#1,d6		; ibit(d6) = ip2 >> 1
			
@l1y	cmp.l	ip1,d6
		blt		@l1x
		cmp.l	d4,d6
		bge		@l1x
		
		sub.l	d6,d4		; i2rev(d4) -= ibit(d6)
		lsr.l	#1,d6		; ibit >>= 1
		bra		@l1y
		
@l1x	add.l	d6,d4		; i2rev += ibit

		add.l	ip1,d2
		cmp.l	d5,d2
		ble		@loop1		; loop if i2(d2) <= ip2(d5)

	; -------------------- loop 2 --------------------------

		move.l	ip1,d4		; ifp1(d4) = ip1
@loop2	cmp.l	d5,d4
		bge		@l2x		; branch if ifp1(d4) >= ip2(d5)
		
		move.l	d4,d6
		lsl.l	#1,d6		; ifp2(d6) = ifp1 << 1
		move.l	d6,d7

		fmove.s	twopi,fp1
		divu.l	ip1,d7
		fmove.l	d7,fp0
		fdiv.x	fp0,fp1		; theta(fp1) = 2 pi / (ifp2/ip1)
		fmove.x	fp1,fp0
		fmove.w	#2,fp2
		fdiv.x	fp2,fp0		; fp0 = 1/2 theta
		fsin.x	fp0
		fmul.x	fp0,fp0
		fmul.x	fp2,fp0
		fneg.x	fp0			; wpr(fp0) = -2 sin^2(1/2 theta)
		fsin.x	fp1			; wpi(fp1) = sin(theta)
		fmove.w	#1,fp2		; wr(fp2) = 1
		fmove.w	#0,fp3		; wi(fp3) = 0
		
		move.l	#1,d3
@l2a	move.l	d3,d1
@l2b	move.l	d1,d2
@l2c	move.l	d2,d7		; k1(d7) = i2(d2)
		lsl.l	#2,d7
		movea.l	a2,a0
		adda.l	d7,a0		; a0 -> data[k1]
		move.l	d4,d7
		add.l	d2,d7		; k2(d7) = k1(=i2(d2))+ifp1(d4)
		lsl.l	#2,d7
		movea.l	a2,a1
		adda.l	d7,a1		; a1 -> data[k2]

		fmove.s	(a1),fp4	; fp4 = data[k2]
		fmove.x	fp4,fp7
		fmul.x	fp2,fp4
		fmove.s	4(a1),fp6	; fp6 = data[k2+1]
		fmove.x	fp6,fp5
		fmul.x	fp3,fp6
		fsub.x	fp6,fp4		; tempr(fp4) = wr(fp2)*data[k2] - wi(fp3)*data[k2+1]
		fmul.x	fp2,fp5
		fmul.x	fp3,fp7
		fadd.x	fp7,fp5		; tempi(fp5) = wr*data[k2+1] + wi*data[k2]
		
		fmove.s	(a0),fp6	; fp6 = data[k1]
		fmove.x	fp6,fp7
		fadd.x	fp4,fp6
		fmove.s	fp6,(a0)	; data[k1] = data[k1] + tempr(fp4)
		fsub.x	fp4,fp7
		fmove.s	fp7,(a1)	; data[k2] = data[k1] - tempr(fp4)

		fmove.s	4(a0),fp6	; fp6 = data[k1+1]
		fmove.x	fp6,fp7
		fadd.x	fp5,fp6
		fmove.s	fp6,4(a0)	; data[k1+1] = data[k1+1] + tempi(fp5)
		fsub.x	fp5,fp7
		fmove.s	fp7,4(a1)	; data[k2+1] = data[k1+1] - tempi(fp5)
		
		add.l	d6,d2		; i2(d2) += ifp2(d6)
		cmp.l	d0,d2
		ble		@l2c		; branch if i2(d2) <= ip3(d0)
			
		add.l	#2,d1
		move.l	d3,d7
		add.l	ip1,d7
		subq.l	#2,d7
		cmp.l	d7,d1
		ble		@l2b		; branch if i1(d1) <= (i3(d3) + ip1 - 2)(d7)

	; ---------------update wr & wi ------------------------
		
		fmove.x	fp2,fp5		; wtemp(fp5) = wr(fp2)
		fmove.x	fp2,fp6
		fmul.x	fp0,fp6
		fadd.x	fp6,fp2		; wr(fp2) += wr(fp2) * wpr(fp0)
		fmove.x	fp3,fp6
		fmul.x	fp1,fp6
		fsub.x	fp6,fp2		; wr(fp2) -= wi(fp3) * wpi(fp1)
		fmove.x	fp3,fp6
		fmul.x	fp0,fp6
		fadd.x	fp6,fp3		; wi(fp3) += wi(fp3) * wpr(fp0)
		fmul.x	fp1,fp5
		fadd.x	fp5,fp3		; wi(fp3) += wtemp(fp5) * wpi(fp1)
		
		add.l	ip1,d3		; i3(d3) += ip1
		cmp.l	d4,d3
		ble		@l2a		; branch if i3(d3) <= ifp1(d4)

		move.l	d6,d4		; ifp1(d4) = ifp2(d6)
		bra		@loop2

	; -------------------- end outer loop  -----------------

@l2x	move.l	nprev,d0
		mulu.l	n,d0		; nprev(d0) *= n
		move.l	idim,d5
		subq.l	#1,d5
		bne		@loop0

	; -------------------- done ----------------------------

@done	fmovem.x	(sp)+,fp4-fp7
		movem.l	(sp)+,a2/d3-d7
	}
}