; 32-bit math
;
; The 32-bit evaluation routines represent numbers from -2.0 to +2.0 using
; a fixed-point format containing one sign bit, one bit to the left of the
; binary point, and 30 bits to the right of the binary point.
;
; This fixed-point format is the same as Apple's Fract format (see Inside
; Mac vol. 4, p. 63.)  My routines are faster than Apple's routines because
; they are register-based, and because they need to compute only three
; partial products in the case when both factors are equal.

; Macro to multiply a longint by itself.  This is used often enough to
; justify the macro. (32 x 32 -> 64 version)
;
; s, high, low and tmp must be data ragisters; no two may be the same.
;
macro	s32_1 s,high,low,tmp =
	tst.l {s}	  ; If argument is negative, make it positive
	bpl.s @1
	neg.l {s}
@1	move {s},{low}	  ; Compute b^2 part of  (a+b)^2 = a^2 + 2ab + b^2
	mulu {s},{low}

	swap {s}	  ; Compute a^2 part.
	move {s},{high}
	mulu {s},{high}
	|

macro	s32_2 s,high,low,tmp =
	move {s},{tmp}	  ; Compute 2ab part.
	swap {s}
	mulu {s},{tmp}	  ; {tmp} = a*b
	add.l {tmp},{tmp} ; {tmp} = 2*a*b

	move.l {tmp},{s}  ; Clone this result so we can split it into two
			  ; halves.

	clr {tmp}	  ; Shift the MS half of 2ab into position
	swap {tmp}

	swap {s}	  ; Shift less significant half of 2ab into position
	clr {s}

	add.l {s},{low}	  ; Finally, add them together!
	addx.l {tmp},{high}
	|

macro	square_32 s,high,low,tmp =
	s32_1 {s},{high},{low},{tmp}
	s32_2 {s},{high},{low},{tmp}
	|

; Longint x Longint multiply subroutine, with a little bit (but not much)
; in common with the LongMul routine in the ROM, (C) Apple Computer Co.
; Oops, I guess I shouldn't say that I looked at the ROM (-:
;
;  D6	   used as sign flag.
;
; factor 1			a	b
; times factor 2	     x	c	d
;			     ------------
; b * d				p	q
; a * d			r	s
; b * c			t	u
; a * c	     +	v	w
;	     ----------------------------
; product	v	r+t+w	p+s+u	q
;
;			; D0	D1	D2	D3	D4	D5
;			;------	-------	-------	-------	-------	-------
_long_mult 		; a:b	c:d
	tst.l d0	; Make both factors positive and remember sign
	smi d6		; of product.
	bpl.s _lm_0
	neg.l d0
_lm_0	tst.l d1
	bpl.s _lm_00
	not.b d6
	neg.l d1
			; D0	D1	D2	D3	D4	D5
			;------	-------	-------	-------	-------	-------
_lm_00	move d0,d3	; a:b	c:d		0:b
	mulu d1,d3	; a:b	c:d		p:q

	swap d0		; b:a	c:d		p:q
	swap d1		; b:a	d:c		p:q
	move d0,d2	; b:a	d:c	0:a	p:q
	mulu d1,d2	; b:a	d:c	v:w	p:q

			; D0	D1	D2	D3	D4	D5
			;------	-------	-------	-------	-------	-------
	swap d0		; a:b	d:c	v:w	p:q
	move d0,d4	; a:b	d:c	v:w	p:q	0:b
	mulu d1,d4	; a:b	d:c	v:w	p:q	t:u
	move.l d4,d5	; a:b	d:c	v:w	p:q	t:u	t:u
	clr d4		; a:b	d:c	v:w	p:q	t:0	t:u
	swap d4		; a:b	d:c	v:w	p:q	0:t	t:u
	swap d5		; a:b	d:c	v:w	p:q	0:t	u:t
	clr d5		; a:b	d:c	v:w	p:q	0:t	u:0
	add.l d5,d3	; a:b	d:c	v:w	pu:q	0:t	u:0
	addx.l d4,d2	; a:b	d:c	v:tw	pu:q	0:t	u:0

			; D0	D1	D2	D3	D4	D5
			;------	-------	-------	-------	-------	-------
	swap d0		; b:a	d:c	v:tw	pu:q	0:t	u:0
	swap d1		; b:a	c:d	v:tw	pu:q	0:t	u:0
	move d0,d4	; b:a	c:d	v:tw	pu:q	0:a	u:0
	mulu d1,d4	; b:a	c:d	v:tw	pu:q	r:s	u:0
	move.l d4,d5	; b:a	c:d	v:tw	pu:q	r:s	r:s
	swap d5		; b:a	c:d	v:tw	pu:q	r:s	s:r
	clr d5		; b:a	c:d	v:tw	pu:q	r:s	s:0
	clr d4		; b:a	c:d	v:tw	pu:q	r:0	s:0
	swap d4		; b:a	c:d	v:tw	pu:q	0:r	s:0
	add.l d5,d3	; b:a	c:d	v:tw	psu:q	0:r	s:0
	addx.l d4,d2	; b:a	c:d	v:rtw	psu:q	0:r	s:0

	tst.b d6	; Set sign of product correctly.
	beq.s _lm_3
	neg.l d3
	negx.l d2
_lm_3	rts

; Subroutine to compute Z', given Z and C
;
;               Z = a + b i
;
;                     2         2    2
;               Z' = Z + C = ( a  - b  + Cr ) + ( 2ab + Ci ) i
;
; In the comments below, a^-b^ means a^2-b^2, a' means a^2+b^2+Cr, h means high 32 bits, l means low 32 bits, J
; means high 32 bits adjusted to put decimal in correct position, * means this step trashes the register.
;
;			; D0	D1	D2	D3	D4	D5	D6	A0	A1	A2	A3	A4
; At startup		;		a	b				Cr	Ci
; At finish		; *	*	a'	b'	*	*	*	Cr	Ci	a	b	a'
;			;------	-------	-------	-------	-------	-------	-------	-------	-------	-------	-------	-------
; At startup		;		a	b				Cr	Ci
_z_prime_32 
	move.l d2,a2	;		a	b				Cr	Ci	a
	move.l d3,a3	;		a	b				Cr	Ci	a	b
	;
  square_32 d3,d0,d1,d5	; b^2h	b^2l	a	*		*		Cr	Ci	a	b
	;
  square_32 d2,d5,d6,d3	; b^2h	b^2l	*	*		a^2h	a^2l	Cr	Ci	a	b
	;
	sub.l d1,d6
	subx.l d0,d5	; b^2h	b^2l				a^-b^h	a^-b^l	Cr	Ci	a	b
			;					60 frac. bits
			;					D5 has 28 frac bits
			
	swap d6
	add.w d6,d6
	addx.l d5,d5	;					D5 has 29 frac bits
	
	add.l a0,d5	; b^2h	b^2l				a'h	a'l 	Cr	Ci	a	b
	bvs.s _z_exit	; Check for overflow.
	
			; D0	D1	D2	D3	D4	D5	D6	A0	A1	A2	A3	A4
	add.w d6,d6	;------	-------	-------	-------	-------	-------	-------	-------	-------	-------	------- -------
	addx.l d5,d5	; b^2h	b^2l				a'J	*	Cr	Ci	a	b
	bvs.s _z_exit	;					D5 has 30 frac bits
	move.l d5,a4	; b^2h	b^2l				a'J	*	Cr	Ci	a	b	a'

	move.l a2,d0
	move.l a3,d1	; a	b				a'	 	Cr	Ci	a	b	a'
	jsr _long_mult	; *	*	abh	abl	*	*	*	Cr	Ci	a	b	a'

	swap d3	
	add.w d3,d3
	addx.l d2,d2	;  		2abh	2abl				Cr	Ci	a	b	a'
			;		60 frac. bits
			;		D2 has 28 frac bits
	add.w d3,d3
	addx.l d2,d2	;		D2 has 29 frac bits
	bvs.s _z_exit	; [ Leave this out for neat little patterns... ]

	add.l a1,d2	;		b'h	b'l				Cr	Ci	a	b	a'
	bvs.s _z_exit
	add.w d3,d3
	addx.l d2,d2	;		b'J	*				Cr	Ci	a	b	a'
	bvs.s _z_exit	;		D2 has 30 frac bits

	move.l d2,d3	; 		b'	b'				Cr	Ci	a	b	a'
	move.l a4,d2	;		a'	b'				Cr	Ci	a	b	a'
_z_exit	rts

; This subroutine is like _in_set, but uses 32-bit math for all compu-
; tations.
;
; Reg.	Input				Output
; ----	-------------------------	------
; D0	Real component of point		Unchanged.
;	   (29 frac. bits)
; D1	Imag. component of point	Unchanged.
;	   (29 frac. bits)
; D2	   used as longint real component of Z
; D3	   used as longint imag. component of Z
; D4	   used as longint real component of Z'
; D5	   used as longint imag. component of Z'
; D6
; D7.w	Ignored.			Number of iterations it took.
;					(If this is -1, assume that the
;					point is in the Mandelbrot Set.)
; A0	   used as real component of point
; A1	   used as imag. component of point
;
;
_in_set_32 movem.l d0-d6/a0-a4,-(sp)
	moveq #0,d2		; Initial Z in Ls1.30 format (real)
	moveq #0,d3		; (imag.)
	move n_max,d7
	move.l d0,a0
	move.l d1,a1

_is3_loop jsr _z_prime_32
	bvs.s _is3_exit

_is3_next dbf d7,_is3_loop
	move #-1,d7		; -1 means it's in the set.
	bra _is3_rts

_is3_exit sub n_max,d7		; Compute n = n_max - d7
	neg d7

	jsr _is32_rpm

	move.l a2,d2		; Get old a and b
	move.l a3,d3
_is3_z1				; Compute a^2 + b^2 for old Z
	square_32 d3,d0,d1,d5	; d0,d1 = b^2 (60 frac. bits)
_is3_z2
	square_32 d2,d3,d4,d5	; d3,d4 = a^2 (60 frac. bits)
_is3_z3
	add.l d3,d0		; d0 = MS word of a^2 + b^2, 28 frac. bits
	compare_l d0,#$40000000	; Is it less than 4.0000 ?
	blt.s _is3_rts		; If so, it was still inside the circle.
	subq #1,d7	; Otherwise Z was outside the circle; adjust count
			; to make countour lines smooth.
_is3_rts movem.l (sp)+,d0-d6/a0-a4
	rts

; This routine makes my initials "RPM" appear at certain places in the
; picture.  The initials can only be found if you use 32-bit math, and
; inspect a region near a lemniscate of order greater than 20.
;
_is32_rpm movem.l d2-d3/a0,-(sp)
	compare d7,#20
	ble _is3r_rts
	move.l a2,d2
	move.l a3,d3
	swap d2
	swap d3
	compare d3,#$7c00
	blt _is3r_rts
	compare d3,#$7e00
	bge _is3r_rts
	tst d2
	blt _is3r_rts
	compare d2,#$0800
	bge _is3r_rts

	sub #$7c00,d3
	lsr #6,d3
	lsr #6,d2

	lea _is3r_data,a0
	not d3
	and #$0007,d3
	asl #2,d3
	adda d3,a0
	move.l (a0),d3
	asl.l d2,d3
	tst.l d3
	bpl _is3r_rts
	subq #1,d7
_is3r_rts 
	movem.l (sp)+,d2-d3/a0
	rts
_is3r_data	dc.l $f1e20800, $89131800, $8912a800, $f1e24800
		dc.l $a1020800, $91020800, $89020800, $00000000


; Subroutine to compute Z', given Z and C
;
;               Z = a + b i
;
;                     2         2    2
;               Z' = Z + C = ( a  - b  + Cr ) + ( 2ab + Ci ) i
;
; In the comments below, a^-b^ means a^2-b^2, a' means a^2+b^2+Cr, h means high 32 bits, l means low 32 bits, J
; means high 32 bits adjusted to put decimal in correct position, * means this step trashes the register.
;
;			; D0	D1	D2	D3	D4	D5	D6	A0	A1	A2	A3	A4
; At startup		;		a	b				Cr	Ci
; At finish		; *	*	a'	b'		a'	*	Cr	Ci	a	b
;			;------	-------	-------	-------	-------	-------	-------	-------	-------	-------	-------	-------
; At startup		;		a	b				Cr	Ci
_z_prime_020:
	move.l d2,a2	;		a	b				Cr	Ci	a
	move.l d3,a3	;		a	b				Cr	Ci	a	b

	move.l	d3,d1
	muls3264 3,0,1	; b^2h	b^2l	a	b				Cr	Ci	a	b

	move.l	d2,d6
	muls3264 2,5,6	; b^2h	b^2l	a	b		a^2h	a^2l	Cr	Ci	a	b

	sub.l d1,d6
	subx.l d0,d5	; b^2h	b^2l	a	b		a^-b^h	a^-b^l	Cr	Ci	a	b
			;					60 frac. bits
			;					D5 has 28 frac bits

	; At this point, we swap d6 and use add.w rather than add.l, because add.w is faster and we
	; only need the top two bits anyway.
	swap d6
	add.w d6,d6
	addx.l d5,d5	;		a	b		D5 has 29 frac bits

	add.l a0,d5	; b^2h	b^2l	a	b		a'h	a'l 	Cr	Ci	a	b
	bvs.s _z020_exit ; Check for overflow.
	
			; D0	D1	D2	D3	D4	D5	D6	A0	A1	A2	A3	A4
	add.w d6,d6	;------	-------	-------	-------	-------	-------	-------	-------	-------	-------	------- -------
	addx.l d5,d5	; b^2h	b^2l	a	b		a'J	*	Cr	Ci	a	b
	bvs.s _z020_exit ;					D5 has 30 frac bits

	move.l	d2,d0	; a	b^2l	a	b		a'J	 	Cr	Ci	a	b
	muls3264 0,2,3	; a	b^2l	abh	abl		a'J	 	Cr	Ci	a	b

	swap d3
	add.w d3,d3
	addx.l d2,d2	;  		2abh	2abl		a'		Cr	Ci	a	b
			;		60 frac. bits
			;		D2 has 28 frac bits
	add.w d3,d3
	addx.l d2,d2	;		D2 has 29 frac bits
	bvs.s _z020_exit ; [ Leave this out for neat little patterns... ]

			; D0	D1	D2	D3	D4	D5	D6	A0	A1	A2	A3	A4
			;------	-------	-------	-------	-------	-------	-------	-------	-------	-------	------- -------
	add.l a1,d2	;		b'h	b'l		a'		Cr	Ci	a	b
	bvs.s _z020_exit
	add.w d3,d3
	addx.l d2,d2	;		b'J	*		a'		Cr	Ci	a	b
	bvs.s _z020_exit ;		D2 has 30 frac bits

	move.l d2,d3	; 		b'	b'		a'		Cr	Ci	a	b
	move.l d5,d2	;		a'	b'		a'		Cr	Ci	a	b
_z020_exit:
	rts

; This subroutine is like _in_set, but uses 32-bit math for all compu-
; tations.  It also uses 68020 instructions to do the multiplications.
;
; Reg.	Input				Output
; ----	-------------------------	------
; D0	Real component of point		Unchanged.
;	   (29 frac. bits)
; D1	Imag. component of point	Unchanged.
;	   (29 frac. bits)
; D2	   used as longint real component of Z
; D3	   used as longint imag. component of Z
; D4	   used as longint real component of Z'
; D5	   used as longint imag. component of Z'
; D6
; D7.w	Ignored.			Number of iterations it took.
;					(If this is -1, assume that the
;					point is in the Mandelbrot Set.)
; A0	   used as real component of point
; A1	   used as imag. component of point
;
;
_in_set_020 movem.l d0-d6/a0-a4,-(sp)
	moveq #0,d2		; Initial Z in Ls1.30 format (real)
	moveq #0,d3		; (imag.)
	move n_max,d7
	move.l d0,a0
	move.l d1,a1

_is0_loop jsr _z_prime_020
	bvs.s _is0_exit

_is0_next dbf d7,_is0_loop
	move #-1,d7		; -1 means it's in the set.
	bra _is0_rts

_is0_exit sub n_max,d7		; Compute n = n_max - d7
	neg d7

	jsr _is32_rpm

	move.l a2,d2		; Get old a and b
	move.l a3,d3
_is0_z1				; Compute a^2 + b^2 for old Z
	square_32 d3,d0,d1,d5	; d0,d1 = b^2 (60 frac. bits)
_is0_z2
	square_32 d2,d3,d4,d5	; d3,d4 = a^2 (60 frac. bits)
_is0_z3
	add.l d3,d0		; d0 = MS word of a^2 + b^2, 28 frac. bits
	compare_l d0,#$40000000	; Is it less than 4.0000 ?
	blt.s _is0_rts		; If so, it was still inside the circle.
	subq #1,d7	; Otherwise Z was outside the circle; adjust count
			; to make countour lines smooth.
_is0_rts movem.l (sp)+,d0-d6/a0-a4
	rts