/* gsetup.s -- RSP triangle setup (repository-viewer header text removed) */



	/****** NOTE:
	 ******
	 ******	This code is the optimized version for HARDWARE 2!
	 ******
	 ****** It won't run (100%) on hardware 1.
	 ******
	 ******/

 ##########################################################################
 #
 # Triangle Setup Routine.
 # When entering this code we have a points buffer full of points,
 # and registers r1, r2, r3 point to the three vertices of a triangle.
 #
 ##########################################################################

#ifdef SETUP_ALONE
#include <rsp.h>
#include "mbi.h"

		.text	beginSetup

#include "gdmem.h"
#include "gfx_regs.h"
#endif


 # ########################### CLIP TEST #################################
.name   minp,           $1
.name   midp,           $2
.name   maxp,           $3
.name   tmp,            $8
 #cc,name	rejectmask,	$6
.name   tmp2,           $9
.name   ccor,           $11     # OR of all points' clip codes
.name   ccand,          $12     # AND of all points' clip codes


                .ent    clipAndSetup
clipAndSetup:
 # ########################### CLIP TEST #################################
 #
 # Combines the clip codes of the three vertices addressed by
 # minp, midp and maxp:
 #   ccand = AND of the three codes, masked to the reject bits.
 #           Non-zero branches to GfxDone (trivial rejection).
 #   ccor  = OR of the three codes, masked to the clip bits.
 #           Non-zero branches to startClip.
 # When neither branch is taken, execution runs off the end of this
 # routine into the code that follows it.

        lh      ccor, (RSP_PTS_CC)(maxp)	# or Clip Codes together &
        lh      tmp, (RSP_PTS_CC)(midp)	# and Clip Codes together
        lh      tmp2, (RSP_PTS_CC)(minp)	# tmp2 = clip code of minp
	and	ccand, ccor, tmp		# ccand = maxp & midp
	or      ccor, ccor, tmp                 # ccor  = maxp | midp
	and	ccand, ccand, tmp2		# ccand &= minp
#ifdef NEAR_CLIP_OFF
	andi	ccand, ccand, 0x7030		# only see reject + xyz, - xy
#else /* NEAR_CLIP_OFF */
	andi	ccand, ccand, 0x7070		# only see reject +/- xyz
#endif /* NEAR_CLIP_OFF */
	bne	ccand, zero, GfxDone		# Trivial rejection ?
	or      ccor, ccor, tmp2                # (delay slot) ccor |= minp
 ### BRANCH OCCURS TO GfxDone: IF TRIVIALLY REJECTED

 	andi	ccor, ccor, 0x4343		# only see clip/accept +/- xyz
	bne     ccor, zero, startClip           # if ccor is 0, no clipping

 ### JUMP OCCURS to doClip or startClip: IF clipping is necessary
 ### NOTE: delay slot is first instruction of beginSetup:
 ### (that instruction therefore executes on both the clip path and the
 ### fall-through path)

                .end    clipAndSetup

.unname ccor
.unname ccand
.unname minp
.unname midp
.unname maxp
.unname tmp
.unname tmp2
 #cc.unname rejectmask
 # ########################### END CLIP TEST #############################


#define one4th	vconst[4]


#define	LOWX	0	/* used to index elements of edge vectors */
#define	LOWY	1
#define MIDX	2
#define MIDY	3
#define HIGHX	4
#define HIGHY	5

/* scalar registers: */
.name	minp,		$1
.name	midp,		$2
.name	maxp,		$3
.name	miny, 		$9

.name	tmp,		$7
.name	flatp,		$4
.name	rdp_cmd,	$5
.name	rdp_flg,	$6
.name	dscratchp,	$8
.name	midy, 		$10
.name	maxy, 		$11
.name	negR, 		$12
.name	rendState,	$13

/* these are "global", used for both edge and attribute setup */
.name	DxXDyi,		$v0
.name	DxXDyf,		$v1
.name	yf,		$v2
.name	xHighf,		$v3
.name	EDel,		$v4
.name	invri,		$v27
.name	invrf,		$v26

/* these registers are dynamic, allocated and released as they are used */
.name	ri,		$v29
.name	rf,		$v28
.name	Hd,		$v9
.name	Md,		$v10
.name	Ld,		$v11
.name	td,		$v12
.name	vmin, 		$v13
.name	vmid, 		$v14
.name	vmax, 		$v15

.name	frontrej, $14
.name	backrej,  $15
.name	doreject, $16
.name	bsignr,    $17

.name	jnk,	$v16
.name	t1i,	$v17
.name	t1f,	$v18
.name	t2i,	$v19
.name	t2f,	$v20

.name	allWi,	$v5
.name	allWf,	$v6
.name   wscl,   $v21

		.ent	beginSetup

beginSetup:
	# First phase of triangle setup.  minp/midp/maxp still address the
	# vertices in their incoming (unsorted) order.  This section:
	#   - loads the screen x,y of each vertex and forms the deltas
	#     Md = mid - min, Hd = max - min, td = min - mid;
	#   - starts the two partial products whose sum (formed at swap2)
	#     has element 0 equal to Hd.x*Md.y - Md.x*Hd.y;
	#   - loads the per-vertex W values and the perspective-norm scale;
	#   - fetches the y coordinates and cull flags on the scalar unit.

	# load screen coordinates (pre-sort):
	llv	vmin[0], RSP_PTS_XS(minp)
	llv	vmid[0], RSP_PTS_XS(midp)	# element 1 is y
	llv	vmax[0], RSP_PTS_XS(maxp)	# element 0 is x,
	lw	rendState, RSP_STATE_RENDER(rsp_state)

	addi	dscratchp, zero, RSP_SETUP_TMP_OFFSET	# scratch area base

	# load all the W's and Wscale for texture perspective while
	# doing this.
	# allWi/allWf end up with min, mid, max W in elements 0, 1, 2
	# (byte offsets 0, 2, 4).
	lsv     wscl[0], RSP_STATE_PERSPNORM(rsp_state)
	lsv	allWi[0], RSP_PTS_W_INT(minp)

			vsub	Md, vmid, vmin
	lsv	allWf[0], RSP_PTS_W_FRAC(minp)
			vsub	Hd, vmax, vmin
	lsv	allWi[2], RSP_PTS_W_INT(midp)
			vsub	td, vmin, vmid
	lsv	allWf[2], RSP_PTS_W_FRAC(midp)

	lsv	allWi[4], RSP_PTS_W_INT(maxp)
	lsv	allWf[4], RSP_PTS_W_FRAC(maxp)

	# compute the partial products...
	# careful with the math here...
	# jnk is only a throw-away destination; the products are pulled
	# out of the accumulator with vsar (presumably [0] = upper slice,
	# [1] = middle slice -- confirm against the RSP manual).
			vmudh	jnk, Hd, Md[1]		# Hd * Md.y
	lh	miny, RSP_PTS_YS(minp)		# get the y's (BEGIN SETUP)
			vsar	t1f, t1f, t1f[1]
	lh	midy, RSP_PTS_YS(midp)
			vsar	t1i, t1i, t1i[0]
	lh	maxy, RSP_PTS_YS(maxp)
			vmudh	jnk, td, Hd[1]		# (min - mid) * Hd.y
	andi	frontrej, rendState, G_CULL_FRONT
			vsar	t2f, t2f, t2f[1]
	andi	backrej, rendState, G_CULL_BACK
			vsar	t2i, t2i, t2i[0]
	addi	negR, zero, 0			# swap parity, toggled by the sort

	# begin back-face test:
	#
	# Back-face test is the sign of the plane equation BEFORE VERTEX
	# SORT, tested with the CULL_FRONT or CULL_BACK flags.
	# We toggle a bit during the sort and possibly correct the
	# pleq sign afterwards...
	#
	# Actual back-face computation is SU code weaved in among
	# the VU code.

	# y-sort. Remember, input screen coords are S11.2
	#
	# This sort code was written by Gudrun Achtenhagen, gudrun@engr.sgi.com
	# Contact her if there are any bugs. :-)
	#
    swap1:	slt	tmp, midy, miny		# tmp = 1 if midy < miny, else 0
	    	blez	tmp, swap2		# tmp == 0: min/mid already ordered
		add	tmp, midy, $0		# (delay slot) put midy in tmp
		add	midy, miny, $0		#put miny in midy
		add	miny, tmp, $0		#put tmp in miny
		addu	tmp, midp, $0		#put midp in tmp
		addu	midp, minp, $0		#put minp in midp
		addu	minp, tmp, $0		#put tmp in minp
		xori	negR, negR, 0x0001	# each swap toggles the parity bit

		.align	8	# ensure dual-issue of branch target
    swap2:
		# r = t1 + t2 (fraction with carry, then integer).  This is
		# re-executed from the same inputs if the sort loops back.
			vaddc	rf, t1f, t2f
		slt	tmp, maxy, midy		# tmp = 1 if maxy < midy, else 0
			vadd	ri, t1i, t2i
		blez	tmp, sortDone		# tmp == 0: mid/max already ordered
		add	tmp, maxy, $0		# (delay slot) put maxy in tmp
		add	maxy, midy, $0		#put midy in maxy
		add	midy, tmp, $0		#put tmp (old maxy) in midy
		addu	tmp, maxp, $0		#put maxp in tmp
		addu	maxp, midp, $0		#put midp in maxp
		addu	midp, tmp, $0		#put tmp in midp
		j	swap1			# re-check min/mid after this swap
		xori	negR, negR, 0x0001	# (delay slot) toggle parity
    sortDone:
	# this branch target is aligned for dual-issue (see above)
	# NOTE(review): the only .align visible here precedes swap2, not
	# this label -- confirm the alignment claim.
	#
	# After the sort: miny <= midy <= maxy and minp/midp/maxp follow.
	# invri/invrf are used as temporaries here: invri = vlt(ri, 0)
	# captures the sign of r BEFORE any negation below, and
	# invrf = ri | rf is zero only when r is zero.

	# load screen coordinates:	(S11.2)
			vlt	invri, ri, vconst[0]
		llv	vmax[0], RSP_PTS_XS(maxp)	# element 0 is x,
	 		vor	invrf, ri, rf
		llv	vmid[0], RSP_PTS_XS(midp)	# element 1 is y
		llv	vmin[0], RSP_PTS_XS(minp)

	# possibly negate R
	# (negR == 0 means an even number of swaps: keep r as it is)
		blez	negR, posiR
 		vsub	EDel, vmax, vmid	# delay slot, low deltas
		vmudn	rf, rf, vconst[3]	# negate R
		vmadh	ri, ri, vconst[3]
 		vmadn	rf, vconst, vconst[0]
	posiR:

	# compute edge deltas:		(S11.2)
	# (Need to do this again after the sort)
	# save out vertex pointers for attribute processing
	# while doing this.
	# NOTE(review): no pointer store happens in these four lines; the
	# scalar side only moves the sign and non-zero tests out of the VU.
			 		vsub	Md, vmid, vmin
	mfc2	bsignr, invri[0]		# pre-negation sign test of r
			 		vsub	Hd, vmax, vmin
	mfc2	doreject, invrf[0]		# zero iff r == 0

.unname jnk
.unname t1i
.unname t1f
.unname t2i
.unname t2f
.unname	td
.unname	vmin
.unname	vmid
.unname	vmax

	# if (r < 0) then triangle is a back-face.
	# finish back-face processing in the SU.
	#
	# On entry: bsignr = element 0 of vlt(ri, 0) taken before r was
	# negated for the sort, doreject = element 0 of (ri | rf),
	# backrej/frontrej = rendState masked with G_CULL_BACK/G_CULL_FRONT.
	# Exits to SetupReject when r == 0 or when the enabled cull mode
	# matches the facing.  Meanwhile the VU copies r into element 3
	# and packs the mid/high edge deltas into EDel.

/* DELAY HERE! (2 clocks) */
			# align these for Newton
	sra	bsignr, bsignr, 31		# all ones if r < 0, else 0
			vmov	ri[3], ri[0]
	and	backrej, backrej, bsignr	# keep CULL_BACK only if r < 0
			vmov	rf[3], rf[0]

	# If (r == 0), triangle is NULL, we should bail out completely.
			vmov	EDel[MIDX], Md[0]
 	beq	doreject, zero, SetupReject
	# note delay slot
	# (the xori below is the delay slot and runs on both paths)

			# align these for speed later.
	xori	bsignr, bsignr, 0xffff		# invert the low 16 bits of the mask
	# re-test the sign of r *after* the sort for left/right-ness
			vlt	invri, ri, vconst[0]
	and	frontrej, frontrej, bsignr	# keep CULL_FRONT only if r >= 0
			vmov	EDel[MIDY], Md[1]
	or	doreject, backrej, frontrej
			vmov	EDel[HIGHX], Hd[0]
	bgtz	doreject, SetupReject		# culled
			vmov	EDel[HIGHY], Hd[1]	# (delay slot)
	mfc2	tmp, invri[0]			# post-sort sign test, used for major-ness

.unname	negR

.unname	Hd
.unname	Md
.unname	Ld

.unname ri
.unname rf

.name	invEDeli,	$v7
.name	invEDelf,	$v8
.name	EDeli,		$v9
.name	EDelf,		$v10

	# these registers are for some attribute (texture) computation
	# that we are going to sneak in during edge setup.
.name	nearWi,	$v19
.name	nearWf,	$v20

	#
	# compute 1/r
	# R is about 10 bits accurate coming from the rcp table.
	# We need to do a Newton's iteration pass here to get more
	# precision. Each iteration should get another 10 bits...
	#
	# (r and invr registers are coordinated with Newton routine)
	#
		jal	NewtonDiv		# external routine; refines 1/r
		addi	rdp_flg, zero, 0x80	# delay slot

	# identify left- or right-major triangle:
	# if (r < 0) dir = 0 else dir = 1
	#
	# As coded: tmp < 0 branches with rdp_flg still 0x80; otherwise
	# rdp_flg is reset to 0.  tmp was loaded from vlt(ri, 0) before the
	# call (assumes NewtonDiv leaves it intact -- TODO confirm).
	# NOTE(review): that appears to be the opposite polarity from the
	# "dir" wording above -- confirm which one is intended.
		bltz	tmp, rightMajor
		lb	rdp_cmd, RSP_STATE_TRI(rsp_state)	# delay slot
		addi	rdp_flg, zero, 0x0	# left-major
    rightMajor:
	# Ldx/Ldy, Mdx/Mdy, Hdx/Hdy:
	#
	# Since the rcp ROM is 10 bits, that's good enough for
	# the edge slopes. Newton's doesn't help.
	#
	# Get triangle command from state and construct the proper RDP
	# command while we do this.
	#
			vmudm	EDeli, EDel, vconst[4]	# make S15.16
			vmadn	EDelf, vconst, vconst[0]

			vrcp	invEDelf[LOWY], EDel[LOWY]	# 1.0/Ldy
			vrcph	invEDeli[LOWY], vconst[0]
	ori	rdp_cmd, rdp_cmd, G_TRI_FILL

	# stick in tile number
	lb	tmp, RSP_STATE_TEX_TILE(rsp_state)
			vrcp	invEDelf[MIDY], EDel[MIDY]	# 1.0/Mdy
			vrcph	invEDeli[MIDY], vconst[0]

			vrcp	invEDelf[HIGHY], EDel[HIGHY]	# 1.0/Hdy
			vrcph	invEDeli[HIGHY], vconst[0]
	or	rdp_flg, rdp_flg, tmp	# 3 cycles after load

	# open for output
	# ($18 is set in the delay slot and is presumably the byte count
	# argument of OutputOpen -- confirm against that routine)
#if !(defined(OUTPUT_DRAM)||defined(OUTPUT_FIFO))
		jal	OutputOpen
		addi	$18, zero, 176 	# worst case guess (delay slot)
#endif /* !(OUTPUT_DRAM || OUTPUT_FIFO) */

	#
	# We used to shift down the rcp results all the way,
	# then do the multiply. If we don't shift it down all the
	# way, do the mult, then shift some more, we get better
	# precision on the degenerate cases.
	#
	# The first two bytes of the command (command byte, then the
	# major/tile flag byte) are written to outp while the VU shifts.
#if 0
	 		vmudl	invEDelf, invEDelf, vconst1[2]	# make S15.16
	sb	rdp_cmd, 0(outp)	# output rdp command
	 		vmadm	invEDeli, invEDeli, vconst1[2]
	sb	rdp_flg, 1(outp)	# output poly flag
	  		vmadn	invEDelf, vconst, vconst[0]
#else
	 		vmudl	invEDelf, invEDelf, vconst1[4]	# make S15.16
	sb	rdp_cmd, 0(outp)	# output rdp command
	 		vmadm	invEDeli, invEDeli, vconst1[4]
	sb	rdp_flg, 1(outp)	# output poly flag
	  		vmadn	invEDelf, vconst, vconst[0]
#endif

	# Do some other work during the pipeline delay:
	# We scale up EDel so that later, during the attribute computation,
	# the 1/r multiply gives us the right S15.16 aligned answer.

		vmudh	EDel, EDel, vconst[5]	# mult by 4 for attributes

.name 	xi,	$v12
.name 	xf,	$v13

.unname	frontrej
.unname	backrej
.unname	doreject
.unname	bsignr

	# x setup:	(S15.16)
	# we load these into unusual elements so we can group the
	# multiplies later during the X adjust step...
	# (finish edge slopes while we do this)
	# The slope answer will end up in the Y element...
        # scale W's down to match 1/w for texture perspective,
	# while doing this.

	lsv	xi[(LOWX*2)],  RSP_PTS_XS(midp)
		                vmudl   allWf, allWf, wscl[0]
	lsv	xi[(MIDX*2)],  RSP_PTS_XS(minp) # same as high
		                vmadm   allWi, allWi, wscl[0]
	lsv	xi[(HIGHX*2)], RSP_PTS_XS(minp)
		                vmadn   allWf, vconst, vconst[0]

	# y setup:	(S11.2)
	# Shifting left by 14 leaves the two fraction bits of miny at the
	# top of the low halfword, which is what mtc2 places in yf[0].
	sll	tmp, miny, 14	# get frac part of high y-coord
	# Slopes: the [0q] broadcast pairs each 1/dy (odd element) with the
	# dx in the even element below it, so each quotient lands in the
	# Y element of its edge.
			vmudl	DxXDyf, invEDelf, EDelf[0q]	# Ldx / Ldy
			vmadm	DxXDyf, invEDeli, EDelf[0q]	# Mdx / Mdy
			vmadn	DxXDyf, invEDelf, EDeli[0q]	# Hdx / Hdy
			vmadh	DxXDyi, invEDeli, EDeli[0q]
	mtc2	tmp, yf[0]
			vmadn	DxXDyf, vconst, vconst[0]
	sw	maxp, 0(dscratchp)	# scratch+0 = maxp (midp, minp follow below)

#if 0
	# no downshift needed
#else
	# shift down some more...
	#
	# we may be able to tolerate some slop, and not do a 2-part
	# shift, once the bow-tie fix is in hardware
	# Tue May 16 15:14:34 PDT 1995
	#
 		vmudl	invEDelf, invEDelf, vconst[4]
 		vmadm	invEDeli, invEDeli, vconst[4]
  		vmadn	invEDelf, vconst, vconst[0]

 		vmudl	DxXDyf, DxXDyf, vconst[4]
 		vmadm	DxXDyi, DxXDyi, vconst[4]
  		vmadn	DxXDyf, vconst, vconst[0]
#endif


.unname	EDeli
.unname	EDelf

.name vtmp,	$v16

	# clear lower bits of slope fractions to match the edgewalker
	# only use this chopped frac to back up the starting point.
	# pass the complete slope to the stepper.
	# The three y coordinates go out at command offsets 2 (maxy),
	# 4 (midy) and 6 (miny).
	sh	maxy, 2(outp)	# output y coords S11.2
			vand	vtmp, DxXDyf, vconst1[1]

	# translate S11.2 x's to S15.16.
	sh	miny, 6(outp)
		vmudm	xi, xi, one4th
	sw	midp, 4(dscratchp)	# scratch+4 = midp
		vmadn	xf, vconst, vconst[0]
	sw	minp, 8(dscratchp)	# scratch+8 = minp

	# Check DxXDy for "nearly-horizontal". Make horizontal, if so.
	# (only a single-precision clamp)
	sh	midy, 4(outp)
			vcr	DxXDyi, DxXDyi, vconst1[6]

.unname miny
.unname midy
.unname maxy

	# adjust x's to proper place:
	#
	# xHigh = xHigh - DxXHDy * yHigh.frac
	# xMid  = xHigh - DxXMDy * yHigh.frac
	# xLow  = xMid 			(already on sub-pixel grid)
	#
	# Start the output while we do this...
	#
.name xHighi,	$v9
.name t1i,	$v10
.name t1f,	$v11

	# Clever use of registers, careful where answer ends up.
	# Remember, the DxXDy slopes are in the Y elements...
	# Do the mult for both equations at once, since we
	# lined up the registers that way.
	#
		ssv	xi[(LOWX*2)],  8(outp)	# output xLow
				vmudl	t1f, vtmp, yf[0]	# slope * yHigh.frac
		ssv	xf[(LOWX*2)], 10(outp)
				vmadm	t1i, DxXDyi, yf[0]
		ssv	DxXDyi[(LOWY*2)], 12(outp)	# low edge slope, int
				vmadn	t1f, vconst, vconst[0]
		ssv	DxXDyf[(LOWY*2)], 14(outp)	# low edge slope, frac

	# do both subtracts at the same time, since we sneakily
	# lined up xi/xf that way...
	# find the nearest W while we do this.

	# pre-compute these in the stall delay
.name	ptpp,	$15
.name	toutp,	$16
.name	stmaxi,	$v17
.name	stmaxf,	$v18

		# tmp is consumed by the blez that starts texture setup
		andi	tmp, rdp_cmd, G_RDP_TRI_TXTR_MASK

	addi	ptpp, dscratchp, 8		# also the loop counter
	addi	toutp, dscratchp, 16


	# The W search keeps the smallest of the three W values:
	# allW[1] is the mid vertex, allW[2] the max vertex.  The vsubc
	# result written to wscl is presumably a throw-away, with only the
	# carry state mattering to the vlt that follows -- TODO confirm.
 				vsubc	xHighf, xf, t1f[1q]
		ssv	DxXDyi[(HIGHY*2)], 20(outp)
 				vsub	xHighi, xi, t1i[1q]
		ssv	DxXDyf[(HIGHY*2)], 22(outp)
				vsubc	wscl,  allWf, allWf[1]
		ssv	DxXDyi[(MIDY*2)], 28(outp)
				vlt	nearWi, allWi, allWi[1]
		ssv	DxXDyf[(MIDY*2)], 30(outp)
				vmrg	nearWf, allWf, allWf[1]
		ssv	xHighi[(HIGHX*2)], 16(outp)	# output xHigh
				vsubc	wscl,  nearWf, allWf[2]
		ssv	xHighf[(HIGHX*2)], 18(outp)
				vlt	nearWi, nearWi, allWi[2]
		ssv	xHighi[(MIDX*2)],  24(outp)	# output xMid
				vmrg	nearWf, nearWf, allWf[2]
		ssv	xHighf[(MIDX*2)],  26(outp)
		# the 32-byte edge record is complete
		addi	outp, outp, 32	# increment output pointer
.unname xHighi
.unname vtmp
.unname t1i
.unname t1f
.unname xi
.unname xf

	#
	# Begin attribute setup:
	#
/*
 * at this point, the only registers in use should be:
 *
 *	$v0	DxXDyi
 *	$v1	DxXDyf
 *	$v2	yf
 *	$v3	xHighf
 *	$v4	EDel
 *	$v27	invri
 *	$v26	invrf
 *	$v7	invEDeli
 *	$v8	invEDelf
 *
 * plus these texture things we already computed:
 *
 *	$v5	allWi
 *	$v6	allWf
 *      $v17	stmaxi
 *      $v18    stmaxf
 *	$v19	nearWi
 *	$v20	nearWf
 *	$v21	wscl
 *
 */


#define FAST_TXTR_SETUP
#ifdef  FAST_TXTR_SETUP
	#
	# Texture setup. If we aren't doing texturing, we can skip
	# around this.
	#
	# This version is unrolled, pipelining two groups of
	# calculations. This is 23 instructions more, but
	# 50 clock cycles faster.
	#
		blez	tmp, AttributeSetup	# tmp = rdp_cmd & G_RDP_TRI_TXTR_MASK
		# note delay slot.
		# (the first vmudl below is the delay slot, so nearWf is
		# scaled even when the branch is taken)

.name	ptp1,	$14
.name	ptp2,	$17
.name	ptp3,	$18

.name	ptTX1i,	$v9	# these registers hold S, T, 1/W
.name	ptTX1f,	$v10	# for each vertex.
.name	invW1f,	$v11
.name	invW1i,	$v12
.name	vtmpi,	$v15
.name	vtmpf,	$v16

.name	ptTX2i,	$v22	# these registers hold S, T, 1/W
.name	ptTX2f,	$v23	# for each vertex.
.name	invW2f,	$v24
.name	invW2i,	$v25

	# make sure nearW < 1.0. The reason we do this is
	# to safeguard against numerical inaccuracies due to
	# w divide, etc. If nearW/(nearest W) is not less than
	# 1.0, the perspective scale will go the wrong direction,
	# resulting in scrolling textures and wobbles.
	# We've tried lots of methods to tune this, eventually
	# settling on a scaling operation; we used to scale by 0.8
	# but still see wobbles on large texture coordinates (repeated
	# textures) so we now use 0.5.
	#
	# (get the point pointers while doing that)
	# ptpp = scratch+8, so ptp1/ptp2/ptp3 are the min/mid/max vertex
	# pointers saved to scratch earlier.
			vmudl	nearWf, nearWf, vconst1[5]
	lw	ptp1,  0(ptpp)
			vmadm	nearWi, nearWi, vconst1[5]
	lw	ptp2, -4(ptpp)
			vmadn	nearWf, vconst, vconst[0]
	lw	ptp3, -8(ptpp)

	# load S and T:
	# ptTX1 carries two vertices: ptp1 in elements 0-1 and ptp2 in
	# elements 4-5 (byte offset 8).  ptTX2 carries ptp3.
		llv	ptTX1i[0], RSP_PTS_S(ptp1)
		llv	ptTX1i[8], RSP_PTS_S(ptp2)
		llv	ptTX2i[0], RSP_PTS_S(ptp3)

	# Load 1/W saved from vertex transform.
	# Stick a magic number in for w. Later during the vmult
	# this will scale and shift the frac up where we want it
	# for the attribute calculations.
		lsv	invW1f[0], RSP_PTS_INVW_FRAC(ptp1)
		lsv	invW1i[0], RSP_PTS_INVW_INT(ptp1)

		lsv	invW1f[8], RSP_PTS_INVW_FRAC(ptp2)
			vmov	ptTX1i[2], vconst1[0]	# W slot for ptp1
		lsv	invW1i[8], RSP_PTS_INVW_INT(ptp2)
			vmov	ptTX1i[6], vconst1[0]	# W slot for ptp2

		lsv	invW2f[0], RSP_PTS_INVW_FRAC(ptp3)
			vmov	ptTX2i[2], vconst1[0]	# W slot for ptp3
		lsv	invW2i[0], RSP_PTS_INVW_INT(ptp3)

	# normalize the W's:
	# (this is NW/W)
	# (we can't cheat here; we need the double-precision multiply
	# in order to handle all kinds of w's, including orthographic...)
	# Redundant store of nearW by the SU in the loop saves time later.
	# The two ssv stores save nearW to scratch+68 / scratch+76 before
	# the second product below overwrites nearWf/nearWi.
			vmudl	allWf, invW1f, nearWf[0]
			vmadm	allWf, invW1i, nearWf[0]
	ssv	nearWi[0], 68(dscratchp)
			vmadn	allWf, invW1f, nearWi[0]
	ssv	nearWf[0], 76(dscratchp)
			vmadh	allWi, invW1i, nearWi[0]

			vmudl	vtmpf, invW2f, nearWf[0]
			vmadm	vtmpf, invW2i, nearWf[0]
			vmadn	nearWf, invW2f, nearWi[0]
			vmadh	nearWi, invW2i, nearWi[0]

	# multiply (S, T, W) by normalized 1/W's
	# ([0h] applies element 0 to the lower half and element 4 to the
	# upper half, matching the two vertices packed in ptTX1)
			vmudm	vtmpf, ptTX1i, allWf[0h]
			vmadh	ptTX1i, ptTX1i, allWi[0h]
			vmadn	ptTX1f, vconst, vconst[0]

			vmudm	vtmpf, ptTX2i, nearWf[0]
			vmadh	ptTX2i, ptTX2i, nearWi[0]
			vmadn	ptTX2f, vconst, vconst[0]

	# output to scratch memory
	# toutp = scratch+16: ptp1 at +0/+8, ptp2 at +16/+24 and
	# ptp3 at +32/+40 (integer part, then fraction part)
	sdv	ptTX1i[8],  (16)(toutp)
	sdv	ptTX1f[8],  (24)(toutp)

	sdv	ptTX1i[0],  (0)(toutp)
	sdv	ptTX1f[0],  (8)(toutp)

	sdv	ptTX2i[0],  (32)(toutp)
	sdv	ptTX2f[0],  (40)(toutp)

	# single precision, yuk
	# (the ptp2 S,T just stored are reloaded into elements 0-1 of
	# nearWi/nearWf so all three vertices line up for the max)
			vabs	ptTX1i, ptTX1i, ptTX1i
	llv	nearWi[0], (16)(toutp)
			vabs	ptTX2i, ptTX2i, ptTX2i
	llv	nearWf[0], (24)(toutp)
			vabs	nearWi, nearWi, nearWi

	# find max S' and T' coordinates for LOD normalization
			vge	stmaxi, ptTX1i, ptTX2i
			vmrg	stmaxf, ptTX1f, ptTX2f

	/* DELAY HERE! */
			vge	stmaxi, stmaxi, nearWi
			vmrg	stmaxf, stmaxf, nearWf

	# store of stmax happens below

.unname	ptp1
.unname	ptp2
.unname	ptp3
.unname	ptpp
.unname	toutp

.unname	ptTX1i
.unname	ptTX1f
.unname	ptTX2i
.unname	ptTX2f
.unname	allWi
.unname	allWf
.unname	vtmpi
.unname	vtmpf
.unname	invW1f
.unname	invW1i
.unname	invW2f
.unname	invW2i
.unname wscl

#else
	#
	# Texture setup. If we aren't doing texturing, we can skip
	# around this.
	#
	# We write out the vertex pointers, then loop through each
	# vertex. This makes for the most compact code.
	#
		blez	tmp, AttributeSetup
		# note delay slot.
		# NOTE: this compact variant is the #else side of
		# FAST_TXTR_SETUP, which is defined unconditionally just
		# above the #ifdef, so it is not assembled as the file stands.

.name	ptp,	$14

.name	ptTXi,	$v9	# these registers hold S, T, 1/W
.name	ptTXf,	$v10	# for each vertex.
.name	invWf,	$v11
.name	invWi,	$v12
.name	vtmpi,	$v15
.name	vtmpf,	$v16

		vmov	stmaxi[0], vconst1[5]	# max neg. num
		vmov	stmaxi[1], vconst1[5]	# max neg. num

	# make sure nearW < 1.0
		vmudl	nearWf, nearWf, vconst1[7]
		vmadm	nearWi, nearWi, vconst1[7]
		vmadn	nearWf, vconst, vconst[0]

	# loop through min, mid, and max:
	# ptpp starts at scratch+8 and steps down by 4 in the branch
	# delay slot until it equals dscratchp: three passes.
TexPerspLoop:
		lw	ptp, 0(ptpp)	# get point pointer

	# load S and T:
		llv	ptTXi[0], RSP_PTS_S(ptp)

	# Load 1/W saved from vertex transform.
	# Stick a magic number in for w. Later during the vmult
	# this will scale and shift the frac up where we want it
	# for the attribute calculations.
	lsv	invWf[0], RSP_PTS_INVW_FRAC(ptp)
	lsv	invWi[0], RSP_PTS_INVW_INT(ptp)
			vmov	ptTXi[2], vconst1[0]

	# normalize the W's:
	# (this is NW/W)
	# (we can't cheat here; we need the double-precision multiply
	# in order to handle all kinds of w's, including orthographic...)
	# Redundant store of nearW by the SU in the loop saves time later.
/* DELAY HERE! */
			vmudl	allWf, nearWf, invWf[0]
			vmadm	allWf, nearWi, invWf[0]
	ssv	nearWi[0], 68(dscratchp)
			vmadn	allWf, nearWf, invWi[0]
	ssv	nearWf[0], 76(dscratchp)
			vmadh	allWi, nearWi, invWi[0]

	# multiply (S, T, W, L) by normalized 1/W's
			vmudm	vtmpf, ptTXi, allWf[0]
			vmadh	ptTXi, ptTXi, allWi[0]
	addi	toutp, toutp, 16	# advance first; stores use -16 offsets
			vmadn	ptTXf, vconst, vconst[0]

	# single precision, yuk
			vabs	vtmpi, ptTXi, ptTXi

	# output to scratch memory and
	# find max S' and T' coordinates for LOD normalization

	sdv	ptTXi,  (0-16)(toutp)
			vge	stmaxi, stmaxi, vtmpi
	sdv	ptTXf,  (8-16)(toutp)
			vmrg	stmaxf, stmaxf, ptTXf

	bne	ptpp, dscratchp, TexPerspLoop
	addi	ptpp, ptpp, -4	# delay slot

	# store of stmax happens below

.unname	ptp
.unname	ptpp
.unname	toutp

.unname	ptTXi
.unname	ptTXf
.unname	allWi
.unname	allWf
.unname	vtmpi
.unname	vtmpf
.unname	invWf
.unname wscl
.unname	invWi
#endif

.name	Hdai,	$v9
.name	Hdaf,	$v10
.name	Mdai,	$v11
.name	Mdaf,	$v12
.name	adei,	$v13
.name	adef,	$v14
.name	amin,	$v15
.name	aminf,	$v16

.name	tMdai, 	$v21
.name	tMdaf, 	$v22
.name	amid,	$v23
.name	amidf,	$v24
.name	amax,	$v25
.name	amaxf,	$v5
.name	vjunk,	$v6
.name	vjunkf,	$v28

AttributeSetup:
	# Attribute (shade / texture / z) setup.  Reached directly when the
	# command has no texture bit, otherwise by falling through from
	# texture setup.  The attributes of each vertex are gathered into
	# one vector pair: colors in elements 0-3, S,T,W in elements 4-6
	# (byte offset 8) and z in element 7 (byte offset 14).

	# store max S' and T' coordinates for LOD normalization
	# (stored unconditionally, even on the non-textured path)
		slv	stmaxi[0], 64(dscratchp)
		slv	stmaxf[0], 72(dscratchp)
.unname	stmaxi
.unname	stmaxf
.unname	nearWi
.unname	nearWf

.name	ainiti, $v17
.name	ainitf, $v18
.name	tHdai, 	$v19
.name	tHdaf, 	$v20

	#
	# If we aren't doing any attributes at all, let's
	# bail out early.
	# (clear out all the bits while we do this)
	#
		andi	tmp, rdp_cmd, (G_RDP_TRI_ZBUFF_MASK | G_RDP_TRI_TXTR_MASK  | G_RDP_TRI_SHADE_MASK)
		blez	tmp, SetupDone
			vxor	ainitf,	vconst, vconst	# (delay slot) ainitf = 0

	#
	# Collect all the attributes
	# in a vector (r,g,b,a,s,t,w,z) with one left over (l).
	# l (and z again) are computed in a second pass.

	# load attributes:
	# load smooth-shading colors first.
	# RGBA, use fancy packed load, then shift.
	# DMEM alignment is crucial here!

	# add .5 to the colors in order to work around a hardware
	# bug regarding span start color imprecision
	# Thu Jun  8 18:49:52 PDT 1995
	# (each fraction vector starts as ainitf, which is zero, plus
	# vconst1[5])

		luv	amax[0], RSP_PTS_R_NX(maxp)
		 		vadd	aminf, ainitf, vconst1[5]
		luv	amin[0], RSP_PTS_R_NX(minp)
		 		vadd	amidf, ainitf, vconst1[5]
		andi	tmp, rendState, G_SHADING_SMOOTH
		 		vadd	amaxf, ainitf, vconst1[5]

	# test for flat shading
		bgtz	tmp, smoothShade
		luv	amid[0], RSP_PTS_R_NX(midp)	# delay slot

	# load flat-shading colors instead: (use same vertex)
		luv	amax[0], RSP_PTS_R_NX(flatp)
		luv	amin[0], RSP_PTS_R_NX(flatp)
		luv	amid[0], RSP_PTS_R_NX(flatp)

	smoothShade:
		vmudm	amax, amax, vconst[7]	# multiply by 1/512.0 to
		vmudm	amin, amin, vconst[7]	# move things into lower byte.
		vmudm	amid, amid, vconst[7]

	# load S, T, and W:
	# These have been previously computed and stored in scratch memory.
	# (scratch+16.. holds min, mid, max as integer/fraction pairs)
		ldv	aminf[8], (16 +  8)(dscratchp)
		ldv	amin[8],  (16 +  0)(dscratchp)
		ldv	amidf[8], (16 + 24)(dscratchp)
		ldv	amid[8],  (16 + 16)(dscratchp)
		ldv	amaxf[8], (16 + 40)(dscratchp)
		ldv	amax[8],  (16 + 32)(dscratchp)

	# load z's.
	# Use the proper 'screen-space' Z.
		lsv	aminf[14], RSP_PTS_ZSF(minp)
		lsv	amin[14], RSP_PTS_ZS(minp)
		lsv	amidf[14], RSP_PTS_ZSF(midp)
		lsv	amid[14], RSP_PTS_ZS(midp)
		lsv	amaxf[14], RSP_PTS_ZSF(maxp)
		lsv	amax[14], RSP_PTS_ZS(maxp)

	# compute attribute deltas: (S15.16) watch alignment!
	# Mda = mid - min and Hda = max - min; tHda and tMda are the
	# negated forms (min - max, min - mid), which let the cross
	# products below be formed with multiply-accumulates only.
/* DELAY HERE! */
  		vsubc	Mdaf, amidf, aminf
 		vsub	Mdai, amid, amin
 		vsubc	tHdaf, aminf, amaxf
		vsub	tHdai, amin, amax
 		vsubc	Hdaf, amaxf, aminf
		vsub	Hdai, amax, amin
		vsubc	tMdaf, aminf, amidf
		vsub	tMdai, amin, amid

	#
	# These multiplies use the full precision of the accumulator.
	# They are basically 32-bit integer multiplies, but the
	# fractional component is also included, although only the
	# upper 32-bits of answer are used.
	#
	# See note up above about why EDel is being scaled up.
	#
	# S15.16 * S11.4 = SS26.20
	# (we only use the upper SS26.4, which we'll multiply
	# by 1/r below)
	#
#if 0
		# compute DeAtt directly, divide Hda/ydelta,
		# instead of:	  de = dy + dx * DxXHDy
		vmudl	vjunk, Hdaf, invEDelf[HIGHY]
		vmadm	vjunk, Hdai, invEDelf[HIGHY]
		vmadn	adef,  Hdaf, invEDeli[HIGHY]
		vmadh	adei,  Hdai, invEDeli[HIGHY]

		# DxAtt = Mdy*Hda - Hdy*Mda
		vmudn	vjunk, Hdaf, EDel[MIDY]
		vmadh	vjunk, Hdai, EDel[MIDY]
		vmadn	vjunk, tMdaf, EDel[HIGHY]
 		vmadh	vjunk, tMdai, EDel[HIGHY]
		vsar	Hdai, Hdai, Hdai[0]
		vsar	Hdaf, Hdaf, Hdaf[1]

		# DyAtt = Hdx*Mda - Mdx*Hda
		vmudn	vjunk, Mdaf, EDel[HIGHX]
		vmadh	vjunk, Mdai, EDel[HIGHX]
		vmadn	vjunk, tHdaf, EDel[MIDX]
		vmadh	vjunk, tHdai, EDel[MIDX]
		vsar	Mdai, Mdai, Mdai[0]
		vsar	Mdaf, Mdaf, Mdaf[1]

	# divide by r (S4.27)
	# This multiply results in the proper S15.16 attributes
	# that we need (texture is S10.21)
	#
		vmudl	vjunk, Hdaf, invrf[3]
		vmadm	vjunk, Hdai, invrf[3]
		vmadn	Hdaf, Hdaf, invri[3]
		vmadh	Hdai,  Hdai, invri[3]

		vmudl	vjunk, Mdaf, invrf[3]
		vmadm	vjunk, Mdai, invrf[3]
		vmadn	Mdaf, Mdaf, invri[3]
		vmadh	Mdai,  Mdai, invri[3]
#else
	/*
	 * We'd like to do it the above way, it's faster and
	 * shorter. But the precision errors keep biting us.
	 * If we do it this way, the errors are minimized:
	 */
		# DxAtt = Mdy*Hda - Hdy*Mda
		# (the subtraction is done by accumulating tMda, the negated
		# Mda; the result is read back out of the accumulator by
		# vsar and overwrites Hdai/Hdaf)
		vmudn	vjunk, Hdaf, EDel[MIDY]
		vmadh	vjunk, Hdai, EDel[MIDY]
		vmadn	vjunk, tMdaf, EDel[HIGHY]
		vmadh	vjunk, tMdai, EDel[HIGHY]
		vsar	Hdai, Hdai, Hdai[0]
		vsar	Hdaf, Hdaf, Hdaf[1]

		# DyAtt = Hdx*Mda - Mdx*Hda
		# (same scheme with tHda; the result overwrites Mdai/Mdaf)
		vmudn	vjunk, Mdaf, EDel[HIGHX]
		vmadh	vjunk, Mdai, EDel[HIGHX]
		vmadn	vjunk, tHdaf, EDel[MIDX]
		vmadh	vjunk, tHdai, EDel[MIDX]
		vsar	Mdai, Mdai, Mdai[0]
		vsar	Mdaf, Mdaf, Mdaf[1]

	# divide by r (S4.27)
	#
	# This multiply results in the proper S15.16 attributes
	# that we need (texture is S10.21)
	#
	# begin storing colors as we do this:
	#
	# From here on Hda holds the per-x gradient and Mda the per-y
	# gradient.  The sdv stores write elements 0-3 (the colors) into
	# the record at outp: d/dx at 8 and 24, d/dy at 40 and 56.
			vmudl	vjunk, Hdaf, invrf[3]
			vmadm	vjunk, Hdai, invrf[3]
			vmadn	Hdaf,  Hdaf, invri[3]
			vmadh	Hdai,  Hdai, invri[3]

			vmudl	vjunk, Mdaf, invrf[3]
			vmadm	vjunk, Mdai, invrf[3]
			vmadn	Mdaf,  Mdaf, invri[3]
	sdv	Hdai[0],    8(outp)
			vmadh	Mdai,  Mdai, invri[3]
	sdv	Hdaf[0],   24(outp)

		# convert to edge slope representation:
		#   de = dy + dx * DxXHDy
		# (dy is brought into the accumulator by multiplying with
		# vconst[1], then dx * DxXHDy is accumulated on top)
/* DELAY HERE! */
			vmudn	vjunk, Mdaf, vconst[1]
			vmadh	vjunk, Mdai, vconst[1]	# use accum for add...
			vmadl	vjunk, Hdaf, DxXDyf[HIGHY]
			vmadm	vjunk, Hdai, DxXDyf[HIGHY]
			vmadn	adef,  Hdaf, DxXDyi[HIGHY]
	sdv	Mdai[0],   40(outp)
			vmadh	adei,  Hdai, DxXDyi[HIGHY]
	sdv	Mdaf[0],   56(outp)
#endif

.unname vjunk
.unname vjunkf
.name	pp1i,	$v6
.name	pp1f,	$v28
		# attribute X adjust:
		#   att = att - (de * yHigh.frac)
		# pp1 = ade * yf[0]; ainit = amin - pp1 is the attribute
		# value at the starting point.  The color part of the edge
		# slope (elements 0-3) is stored at 32/48 meanwhile.
/* DELAY HERE! */
				vmudl	pp1f, adef,   yf[0]
		sdv	adei[0],   32(outp)
				vmadm	pp1i, adei,   yf[0]
		sdv	adef[0],   48(outp)
				vmadn	pp1f, vconst, vconst[0]
/* DELAY HERE! */
				vsubc	ainitf, aminf, pp1f
				vsub	ainiti, amin,   pp1i	# (no branch precedes this; not a delay slot)
.unname	pp1i
.unname	pp1f

		andi	tmp, rdp_cmd, G_RDP_TRI_SHADE_MASK
	#
	# All done.
	# Write out the proper record to the RDP, based on the drawing
	# modes.
	#
	# (get ready for next test in the branch delay slots)

	# write out the rest of shade and increment outp
	# Without shading, outp is not advanced, so the color data already
	# stored at outp is not counted and later records start there.
		blez	tmp, outputTXTR
		andi	tmp, rdp_cmd, G_RDP_TRI_TXTR_MASK	# delay

		addi	outp, outp, 64	# increment output pointer
		sdv	ainiti[0], (0-64)(outp)	# 0
		sdv	ainitf[0], (16-64)(outp)	# 16

	# write out texture
	# (skipped when the texture bit tested above is clear; the delay
	# slot prepares the z-buffer test for outputZBUF)
  outputTXTR:	blez	tmp, outputZBUF
		andi	tmp, rdp_cmd, G_RDP_TRI_ZBUFF_MASK	# delay
	#
	# Scale texture parameters to ensure that they all remain
	# in-bounds for the hardware LOD computation:
	#

	# free up some registers
.unname	amaxf
.unname	tHdai
.unname	tHdaf
.unname	tMdai
.unname	tMdaf
.unname	amid
.unname	amidf
.unname	amax
.unname	invri
.unname	invrf

.name scalei,	$v5
.name scalef,	$v6
.name vtmpf,	$v19
.name coordMi,	$v20
.name coordMf,	$v21
.name t1i,	$v22
.name t1f,	$v23
.name absdxi,	$v24
.name absdyi,	$v25
.name absdxf,	$v26
.name absdyf,	$v27

	# shift >> 5 to get some guard bits for this computation:
	# (0x0800 used as a vmudm fraction is 2^11 / 2^16 = 1/32.
	# $16 is used as a plain scratch register here.)
	addi	$16, zero, 0x0800
	mtc2	$16, vtmpf[0]

	# find abs() of all the slopes:
	# (sloppy, single-precision test only)
	# use the original fractional vector when needed
	# load maxS', maxT', and nearW into vector
	# (elements 4-6 of coordMi/coordMf come from scratch+64 / +72)
			vabs	absdxi, Hdai, Hdai
	ldv	coordMi[8], 64(dscratchp)
			vabs	absdyi, Mdai, Mdai
	ldv	coordMf[8], 72(dscratchp)

	# shift >> 5 to get some guard bits for this computation:
	                vmudm   absdxi, absdxi, vtmpf[0]
	                vmadn   absdxf, vconst, vconst[0]

	                vmudm   absdyi, absdyi, vtmpf[0]
	                vmadn   absdyf, vconst, vconst[0]

	                vmudl   coordMf, coordMf, vtmpf[0]
	                vmadm   coordMi, coordMi, vtmpf[0]
	                vmadn   coordMf, vconst, vconst[0]

			vmudn	absdxf, absdxf, vconst[2]
			vmadh	absdxi, absdxi, vconst[2]
			vmadn	absdxf, vconst, vconst[0]

	# compute |coordMax| + 2*|d?dx| + |d?dy|
	# add using the accumulator:
			vmadn	t1f, absdyf, vconst[1]
			vmadh	t1i, absdyi, vconst[1]

	# (0x0040 used as a fraction is 2^6 / 2^16 = 1/1024, for the
	# >> 10 below)
	addi	$16, zero, 0x0040
			vmadn	scalef, coordMf, vconst[1]
	mtc2	$16, vtmpf[0]
			vmadh	scalei, coordMi, vconst[1]

	# find max of scale factors
	# (elements 5 and 6 are each compared against every element, so
	# element 4 ends up holding the largest of elements 4, 5 and 6)
			vsubc	t1f, scalef, scalef[5]
			vge	scalei, scalei, scalei[5]
			vmrg	scalef, scalef, scalef[5]
			vsubc	t1f, scalef, scalef[6]
			vge	scalei, scalei, scalei[6]
			vmrg	scalef, scalef, scalef[6]

	# shift >> 10 to get the scale ratio:
	                vmudl   scalef, scalef, vtmpf[0]
	                vmadm   scalei, scalei, vtmpf[0]
	                vmadn   scalef, vconst, vconst[0]

	# compute 1/scalefactor
	# sloppy, Newton's not needed.
	# (reciprocal of element 4; the result lands in element 0)
			vrcph	t1f[0], scalei[4]
			vrcpl	scalef[0], scalef[4]
			vrcph	scalei[0], vconst[0]

	# convert to s15.16
			vmudn	scalef, scalef, vconst[2]
			vmadh	scalei, scalei, vconst[2]

	# if scale > 1.0, make 1.0 (don't want to scale)
			vlt	scalei, scalei, vconst[1]
			vmrg	scalef, scalef, vconst[0]

.unname absdxi
.unname absdyi
.unname absdxf
.unname absdyf
.unname coordMi
.unname coordMf
.unname t1i
.unname t1f
.unname vtmpf

.name tiniti,	$v19
.name tinitf,	$v20
.name tHdai,	$v21
.name tHdaf,	$v22
.name tMdai,	$v23
.name tMdaf,	$v24
.name tadei,	$v25
.name tadef,	$v26

	# scale init, dx, dy, de
	# Each of the four attribute vectors is multiplied by the scale
	# factor in element 0 of scalei/scalef.  Only elements 4-7 (byte
	# offset 8) are stored, forming a 64-byte record at outp:
	# init at 0/16, dx at 8/24, de at 32/48, dy at 40/56.
/* DELAY HERE! */
			vmudl	tinitf, ainitf, scalef[0]
			vmadm	tinitf, ainiti, scalef[0]
			vmadn	tinitf, ainitf, scalei[0]
			vmadh	tiniti, ainiti, scalei[0]

			vmudl	tHdaf, Hdaf, scalef[0]
			vmadm	tHdaf, Hdai, scalef[0]
			vmadn	tHdaf, Hdaf, scalei[0]
	sdv	tiniti[8],  0(outp)
			vmadh	tHdai, Hdai, scalei[0]
	sdv	tinitf[8], 16(outp)

			vmudl	tMdaf, Mdaf, scalef[0]
			vmadm	tMdaf, Mdai, scalef[0]
			vmadn	tMdaf, Mdaf, scalei[0]
	sdv	tHdai[8],    8(outp)
			vmadh	tMdai, Mdai, scalei[0]
	sdv	tHdaf[8],   24(outp)

			vmudl	tadef, adef, scalef[0]
			vmadm	tadef, adei, scalef[0]
			vmadn	tadef, adef, scalei[0]
	sdv	tMdai[8],   40(outp)
			vmadh	tadei, adei, scalei[0]
	sdv	tMdaf[8],   56(outp)

	# write out the rest of texture parameters and increment outp:
		addi	outp, outp, 64	# increment output pointer
		sdv	tadei[8],   (32-64)(outp)	# 32
		sdv	tadef[8],   (48-64)(outp)	# 48

.unname scalei
.unname scalef
.unname tiniti
.unname tinitf
.unname tHdai
.unname tHdaf
.unname tMdai
.unname tMdaf
.unname tadei
.unname tadef

  outputZBUF:
	# Emits the 16-byte z record when the z-buffer bit is set
	# (tmp = rdp_cmd & G_RDP_TRI_ZBUFF_MASK, prepared in the delay slot
	# of the branch that leads here).  Record layout as stored below:
	# z at 0/2, dz/dx at 4/6, dz/de at 8/10, dz/dy at 12/14.
	# Only element 7 (byte offset 14) of each vector is written.
		blez	tmp, SetupDone
		# note delay slot
		# (the first vmudn below runs on both paths)
	#
	# Scale Z-values up, screen coordinates were limited
	# to 10 integer bits, but the hardware floating point format
	# needs valid bits in the upper range for best performance.
	#
				vmudn	adef, adef, vconst1[4]
				vmadh	adei, adei, vconst1[4]
				vmadn	adef, vconst, vconst[0]

				vmudn	aminf, aminf, vconst1[4]
				vmadh	amin, amin, vconst1[4]
				vmadn	aminf, vconst, vconst[0]

	ssv	adei[14],    8(outp)	# output z stuff.
				vmudn	Hdaf, Hdaf, vconst1[4]
	ssv	adef[14],   10(outp)
				vmadh	Hdai, Hdai, vconst1[4]
				vmadn	Hdaf, vconst, vconst[0]

				vmudn	Mdaf, Mdaf, vconst1[4]
				vmadh	Mdai, Mdai, vconst1[4]
				vmadn	Mdaf, vconst, vconst[0]

	# clamp dzdy if near zero, for decal mode:
	# tmp = -(byte at RSP_STATE_TEX_LOD); zero skips the clamp.
	# NOTE(review): the state field is named TEX_LOD but is used as a
	# z clamp bound here -- confirm that is the intended field.
		lbu	tmp, RSP_STATE_TEX_LOD(rsp_state)
		sub	tmp, zero, tmp
		beq	tmp, zero, noZClamp
		mtc2	tmp, $v6[0]		# (delay slot) runs on both paths
		vch	Mdai, Mdai, $v6[0]
		vcl	Mdaf, Mdaf, vconst[0]

.name	pp1i,	$v6
.name	pp1f,	$v28

    noZClamp:
		# re-compute attribute X adjust after the Z scale:
		#   att = att - (de * yHigh.frac)
	ssv	Hdai[14],    4(outp)
				vmudl	pp1f, adef,   yf[0]
	ssv	Hdaf[14],    6(outp)
				vmadm	pp1i, adei,   yf[0]
	ssv	Mdai[14],   12(outp)
				vmadn	pp1f, vconst, vconst[0]
	ssv	Mdaf[14],   14(outp)
				vsubc	ainitf, aminf, pp1f
				vsub	ainiti, amin,   pp1i
.unname	pp1i
.unname	pp1f
		addi	outp, outp, 16	# increment output pointer
		ssv	ainiti[14],  (0-16)(outp)	# 0
		ssv	ainitf[14],  (2-16)(outp)	# 2

SetupDone:			# done or rejected. do any clean-up.
		# Normal exit: calls OutputClose, then shares the return
		# sequence below.
		jal	OutputClose
		# note delay slot
		# (the nop under SetupReject is that delay slot; OutputClose
		# returns to the jr that follows it)

SetupReject:	# no OutputClose needed...
		# Rejected triangles enter here and return without calling
		# OutputClose.
		nop

 		jr	return_save
		nop			# (delay slot)

		.end	beginSetup

/* un-name scalar registers: */
.unname	minp
.unname	midp
.unname	maxp
.unname	flatp
.unname	rdp_cmd
.unname	rdp_flg
.unname	tmp
.unname	dscratchp
.unname	rendState

/* un-name vector registers: */
.unname	DxXDyi
.unname	DxXDyf
.unname	yf
.unname	xHighf
.unname	EDel
.unname	invEDeli
.unname	invEDelf

.unname	Hdai
.unname	Hdaf
.unname	Mdai
.unname	Mdaf
.unname	adei
.unname	adef
.unname	ainiti
.unname	ainitf
.unname	amin
.unname	aminf

#if 0
	# test for thorough register un-naming.
.name r1, $1
.name r2, $2
.name r3, $3
.name r4, $4
.name r5, $5
.name r6, $6
.name r7, $7
.name r8, $8
.name r9, $9
.name r10, $10
.name r11, $11
.name r12, $12
.name r13, $13
.name r14, $14
.name r15, $15
.name r16, $16
.name r17, $17
.name r18, $18
.name r19, $19
.name r20, $20

.name vv0, $v0
.name vv1, $v1
.name vv2, $v2
.name vv3, $v3
.name vv4, $v4
.name vv5, $v5
.name vv6, $v6
.name vv7, $v7
.name vv8, $v8
.name vv9, $v9
.name vv10, $v10
.name vv11, $v11
.name vv12, $v12
.name vv13, $v13
.name vv14, $v14
.name vv15, $v15
.name vv16, $v16
.name vv17, $v17
.name vv18, $v18
.name vv19, $v19
.name vv20, $v20
.name vv21, $v21
.name vv22, $v22
.name vv23, $v23
.name vv24, $v24
.name vv25, $v25
.name vv26, $v26
.name vv27, $v27
.name vv28, $v28
.name vv29, $v29

#endif