gtsetup.s 18.4 KB

Raw Blame History Permalink


/*
 * Copyright 1995, Silicon Graphics, Inc.
 * ALL RIGHTS RESERVED
 *
 * UNPUBLISHED -- Rights reserved under the copyright laws of the United
 * States.   Use of a copyright notice is precautionary only and does not
 * imply publication or disclosure.
 *
 * U.S. GOVERNMENT RESTRICTED RIGHTS LEGEND:
 * Use, duplication or disclosure by the Government is subject to restrictions
 * as set forth in FAR 52.227.19(c)(2) or subparagraph (c)(1)(ii) of the Rights
 * in Technical Data and Computer Software clause at DFARS 252.227-7013 and/or
 * in similar or successor clauses in the FAR, or the DOD or NASA FAR
 * Supplement.  Contractor/manufacturer is Silicon Graphics, Inc.,
 * 2011 N. Shoreline Blvd. Mountain View, CA 94039-7311.
 *
 * THE CONTENT OF THIS WORK CONTAINS CONFIDENTIAL AND PROPRIETARY
 * INFORMATION OF SILICON GRAPHICS, INC. ANY DUPLICATION, MODIFICATION,
 * DISTRIBUTION, OR DISCLOSURE IN ANY FORM, IN WHOLE, OR IN PART, IS STRICTLY
 * PROHIBITED WITHOUT THE PRIOR EXPRESS WRITTEN PERMISSION OF SILICON
 * GRAPHICS, INC.
 *
 */

/*
 * File:		gtsetup.s
 * Creator:		hsa@sgi.com
 * Create Date:		Thu Oct 12 13:34:53 PDT 1995
 *
 */

 ##########################################################################
 #
 # TURBO Triangle Setup Routine.
 # When entering this code we have a points buffer full of points,
 # and registers r1, r2, r3 point to the three vertices of a triangle.
 #
 ##########################################################################

/* one4th: the vconst element that "x setup" multiplies the S11.2 x values by */
#define one4th	vconst[4]

#define	LOWX	0	/* used to index elements of edge vectors */
#define	LOWY	1
#define MIDX	2
#define MIDY	3
#define HIGHX	4
#define HIGHY	5

	/*
	 * Register Allocation:
	 * Longer living registers are assigned to the lower registers,
	 * except the common edge/attribute vector registers are assigned
	 * to the high registers.
	 *
	 * Attribute (vector) registers grow down from the top, in order
	 * not to interfere with edge registers.
	 *
	 * Names are bound with .name and released with .unname as the
	 * code proceeds, so one physical register carries several names
	 * over the course of the routine.
	 */

/* scalar registers: */
.name	trip,		$1	# walks the triangle list, 4 bytes per record
.name	tricnt,		$2	# loaded as a count, then turned into the list end address
.name	trin,		$3	# named here but not referenced in the code shown

.name	minp,		$4	# vertex pointers; after the y-sort minp has the smallest y
.name	midp,		$5
.name	maxp,		$6
.name	flatp,		$7	# vertex whose color is used when smooth shading is off
.name	rdp_cmd,	$8	# stored as byte 0 of the triangle command
.name	rdp_flg,	$9	# stored as byte 1: 0x80/0x00 flag OR'ed with the tile number
.name	tmp,		$10
.name	rendState,	$11	# copy of RSP_STATE_RENDER
.name	miny, 		$12	# screen y values matching minp/midp/maxp
.name	midy, 		$13
.name	maxy, 		$14
.name	backrej,	$15	# GT_CULL_BACK bit of rendState, later AND'ed with the sign of r
.name	doreject,	$16	# element 0 of (ri | rf); zero means a null triangle
.name	bsignr,		$17	# sign source for the pre-sort r (back-face test)
.name	negR, 		$18	# toggled once per vertex swap in the y-sort

/* these are "global", used for both edge and attribute setup */
.name	EDel,		$v0	# edge deltas, indexed with LOWX..HIGHY
.name	ri,		$v29	# registers fixed by Newton's
.name	rf,		$v28
.name	invri,		$v27
.name	invrf,		$v26

/* these registers are dynamic, allocated and released as they are used */
/* (yf, xHighf and Ld are named but never referenced in the code shown) */
.name	DxXDyi,		$v1
.name	DxXDyf,		$v2
.name	yf,		$v3
.name	xHighf,		$v4
.name	Hd,		$v5
.name	Md,		$v6
.name	Ld,		$v7
.name	td,		$v8
.name	vmin, 		$v9
.name	vmid, 		$v10
.name	vmax, 		$v11
.name	jnk,		$v12
.name	t1i,		$v13
.name	t1f,		$v14
.name	t2i,		$v15
.name	t2f,		$v16

TrinProc:
	# Entry point for triangle-list processing. Walks the 4-byte
	# triangle records that start at RSP_TRIN_OFFSET, runs beginSetup
	# on each one, then emits a single 8-byte sync command and
	# returns through the address saved at RSP_RETURN_SAVE.

	# check state for XFM_ONLY
		lb	tmp, RSP_STATE_FLAG(rsp_state)
		andi	tmp, tmp, GT_FLAG_XFM_ONLY
		bgtz	tmp, GfxDone		# flag set: skip triangle setup


		sw	return, RSP_RETURN_SAVE(zero)	# first instruction after the bgtz (its delay slot)
		addi	trip, zero, RSP_TRIN_OFFSET
		lb	tricnt, RSP_STATE_TRICOUNT(rsp_state)
		sll	tricnt, tricnt, 2	# count * 4 bytes per record
		beq	tricnt, zero, TriSkip	# no tris
		add	tricnt, tricnt, trip	# delay slot: tricnt becomes the end address
		lw	rendState, RSP_STATE_RENDER(rsp_state)
   NextTri:
		beq	trip, tricnt, TriDone	# done with tris, do sync
	# get index
	# (the first lb follows the beq directly, so it is the delay slot)
		lb	minp, 0(trip)
		lb	midp, 1(trip)
		lb	maxp, 2(trip)

	# index*sizeof(point)
		sll	minp, minp, 4
		sll	midp, midp, 4
		sll	maxp, maxp, 4

	# add point buffer offset
		addi	minp, minp, RSP_POINTS_OFFSET
		addi	midp, midp, RSP_POINTS_OFFSET
		addi	maxp, maxp, RSP_POINTS_OFFSET

	# flat shading needs the extra vertex pointer set up by loadFlat
		andi	tmp, rendState, G_SHADING_SMOOTH
		beq	tmp, zero, loadFlat
    loadFlatDone:
		nop				# beq delay slot; run again when loadFlat jumps back

	# do setup
		j	beginSetup
		addi	trip, trip, 4		# delay slot
		# beginSetup returns to NextTri
    TriDone:
	# send a pipe sync after group of primitives.
	# this is somewhat wasteful, but safer.
#if !(defined(OUTPUT_DRAM)||defined(OUTPUT_FIFO))
		jal	OutputOpen
		addi	$18, zero, 8		# delay slot; presumably the byte count wanted by OutputOpen
#endif /* !(OUTPUT_DRAM || OUTPUT_FIFO) */
		lui	$1, 0xe700		# $1 (trip) is reused; the list walk is over
		sw	$1, 0(outp)
		sw	zero, 4(outp)
		jal	OutputClose
		addi	outp, outp, 8		# delay slot: account for the 8 bytes written

    TriSkip:
		lw	return, RSP_RETURN_SAVE(zero)
		jr	return
	# NOTE(review): nothing is placed after the jr, so the lb at
	# loadFlat below would occupy its delay slot -- confirm that
	# this stray load is harmless.

	# load flat shade pointer
	# Byte 3 of the triangle record selects which of the three vertex
	# pointers becomes flatp: the pointers are spilled to the scratch
	# area and the chosen one is read back by word index.
    loadFlat:
		lb	flatp, 3(trip)
		sll	flatp, flatp, 2		# selector * 4 = byte offset into the spill
		sw	minp, (0 + RSP_SCRATCH_OFFSET)(zero)
		sw	midp, (4 + RSP_SCRATCH_OFFSET)(zero)
		sw	maxp, (8 + RSP_SCRATCH_OFFSET)(zero)
		j	loadFlatDone
		lw	flatp, RSP_SCRATCH_OFFSET(flatp)	# delay slot


		.ent	beginSetup
beginSetup:
	# Sets up one triangle from the vertex pointers minp/midp/maxp
	# and writes its coefficients at outp. Leaves through SetupDone
	# or SetupReject, both of which end by jumping to NextTri.

	# load screen coordinates (pre-sort):
	llv	vmin[0], RSP_PTS_XS(minp)
	llv	vmid[0], RSP_PTS_XS(midp)	# element 1 is y
	llv	vmax[0], RSP_PTS_XS(maxp)	# element 0 is x,

/* DELAY HERE! */
#ifdef OUTPUT_FIFO
	# restore this, it was blown away by another
	# proc...
	lw	rendState, RSP_STATE_RENDER(rsp_state)
#endif
			vsub	Md, vmid, vmin
			vsub	Hd, vmax, vmin
			vsub	td, vmin, vmid	# td = -Md, used as the subtract term below
	addi	negR, zero, 0		# no swaps yet

/* DELAY HERE! */
	# compute the partial products...
	# careful with the math here...
	# Element 0 of the two products adds up to
	#   Hd[0]*Md[1] + td[0]*Hd[1] = Hdx*Mdy - Mdx*Hdy
	# The pieces are collected in t1i/t1f and t2i/t2f and summed into
	# ri/rf at swap2.
			vmudh	jnk, Hd, Md[1]
	lh	miny, RSP_PTS_YS(minp)		# get the y's
			vsar	t1f, t1f, t1f[1]
	lh	midy, RSP_PTS_YS(midp)
			vsar	t1i, t1i, t1i[0]
	lh	maxy, RSP_PTS_YS(maxp)
			vmudh	jnk, td, Hd[1]
			vsar	t2f, t2f, t2f[1]
	andi	backrej, rendState, GT_CULL_BACK
			vsar	t2i, t2i, t2i[0]

	# begin back-face test:
	#
	# Back-face test is the sign of the plane equation BEFORE VERTEX
	# SORT, tested with the CULL_BACK flag.
	# We toggle a bit during the sort and possibly correct the
	# pleq sign afterwards...
	#
	# Actual back-face computation is SU code weaved in among
	# the VU code.

	# y-sort. Remember, input screen coords are S11.2
	#
	# Orders (miny,minp) <= (midy,midp) <= (maxy,maxp). Each swap
	# toggles bit 0 of negR.
	#
    swap1:	slt	tmp, midy, miny		# tmp = 1 when midy < miny, else 0
	    	blez	tmp, swap2		# tmp == 0: pair already ordered, skip the swap
		add	tmp, midy, $0		#put midy in tmp (blez delay slot, always runs)
		add	midy, miny, $0		#put miny in midy
		add	miny, tmp, $0		#put tmp in miny
		addu	tmp, midp, $0		#put midp in tmp
		addu	midp, minp, $0		#put minp in midp
		addu	minp, tmp, $0		#put tmp in minp
		xori	negR, negR, 0x0001

		.align	8	# ensure dual-issue of branch target
    swap2:
			vaddc	rf, t1f, t2f	# r = t1 + t2 (same result each time swap2 is re-entered)
		slt	tmp, maxy, midy		# tmp = 1 when maxy < midy, else 0
			vadd	ri, t1i, t2i
		blez	tmp, sortDone		# tmp == 0: pair already ordered, sort finished
		add	tmp, maxy, $0		#put maxy in tmp (blez delay slot, always runs)
		add	maxy, midy, $0		#put midy in maxy
		add	midy, tmp, $0		#put tmp in midy
		addu	tmp, maxp, $0		#put maxp in tmp
		addu	maxp, midp, $0		#put midp in maxp
		addu	midp, tmp, $0		#put tmp in midp
		j	swap1			# new mid may be below min: re-check first pair
		xori	negR, negR, 0x0001	# delay slot
    sortDone:
	# this branch target is aligned for dual-issue (see above)

	# load screen coordinates:	(S11.2)
	# invri/invrf are used as scratch here: they carry the sign test
	# and the (ri | rf) zero test of the pre-sort r.
			vlt	invri, ri, vconst[0]
		llv	vmax[0], RSP_PTS_XS(maxp)	# element 0 is x,
	 		vor	invrf, ri, rf
		llv	vmid[0], RSP_PTS_XS(midp)	# element 1 is y
		llv	vmin[0], RSP_PTS_XS(minp)

	# possibly negate R
	# (negR is non-zero after an odd number of swaps)
		blez	negR, posiR
 		vsub	EDel, vmax, vmid	# delay slot, low deltas
		vmudn	rf, rf, vconst[3]	# negate R
		vmadh	ri, ri, vconst[3]
 		vmadn	rf, vconst, vconst[0]
	posiR:

	# compute edge deltas:		(S11.2)
	# (Need to do this again after the sort)
	# save out vertex pointers for attribute processing
	# while doing this.
			 		vsub	Md, vmid, vmin
	mfc2	bsignr, invri[0]	# sign test result of the pre-sort r
			 		vsub	Hd, vmax, vmin
	mfc2	doreject, invrf[0]	# zero only when r == 0

.unname jnk
.unname t1i
.unname t1f
.unname t2i
.unname t2f

.unname	td
.unname	vmin
.unname	vmid
.unname	vmax

/* attribute registers; $v16..$v14 are reused from the names released above */
.name	vzeros,	$v20
.name	amin,	$v19
.name	amid,	$v18
.name	amax,	$v17
.name	vjunk,	$v16
.name	tMdai, 	$v15
.name	tHdai, 	$v14

	#
	# Collect all the attributes
	# in a vector (r,g,b,a,s,t,w,?)
	# (the last element receives z from the lsv loads further down)
	# load smooth-shading colors first.
	# RGBA, use fancy packed load, then shift.
	# DMEM alignment is crucial here!
	# usage of tmp pointer is to make alignments work!
	addi	tmp, maxp, 4
	luv	amax[0], RSP_PTS_R(tmp)
	addi	tmp, minp, 4
	luv	amin[0], RSP_PTS_R(tmp)
		vxor	vzeros, vconst, vconst	# x ^ x: all-zero vector
	addi	tmp, midp, 4
	luv	amid[0], RSP_PTS_R(tmp)

	# check for flat shading:
	# when smooth shading is off, all three colors are reloaded
	# from the flatp vertex.
	andi	tmp, rendState, G_SHADING_SMOOTH
	bgtz	tmp, smoothShade
	addi	tmp, flatp, 4		# delay slot (unused on the smooth path)
	luv	amax[0], RSP_PTS_R(tmp)
	luv	amin[0], RSP_PTS_R(tmp)
	luv	amid[0], RSP_PTS_R(tmp)

smoothShade:

	# if (r < 0) then triangle is a back-face.
	# finish back-face processing in the SU.

			# align these for Newton
	sra	bsignr, bsignr, 31	# spread the sign bit over the whole word
			vmov	ri[3], ri[0]
	and	backrej, backrej, bsignr	# cull flag survives only when r < 0
			vmov	rf[3], rf[0]

	# If (r == 0), triangle is NULL, we should bail out completely.
			vmov	EDel[MIDX], Md[0]
 	beq	doreject, zero, SetupReject
	# note delay slot

		vmudm	amax, amax, vconst[7]	# multiply by 1/512.0 to
		vmudm	amin, amin, vconst[7]	# move things into lower byte.
	bgtz	backrej, SetupReject	# back-face with culling enabled
		vmudm	amid, amid, vconst[7]	# delay slot

			# align these for speed later.
	# re-test the sign of r *after* the sort for left/right-ness
			vlt	invri, ri, vconst[0]
	andi	miny, miny, 0xfffc	# keep 16 bits, clear the low two
	andi	midy, midy, 0xfffc
			vmov	EDel[MIDY], Md[1]
			vmov	EDel[HIGHX], Hd[0]
	andi	maxy, maxy, 0xfffc
			vmov	EDel[HIGHY], Hd[1]
	mfc2	tmp, invri[0]		# held in tmp until the bltz at the left/right test

.unname	negR

.unname	Hd
.unname	Md
.unname	Ld

	#
	# compute 1/r
	# R is about 10 bits accurate coming from the rcp table.
	# We need to do a Newton's iteration pass here to get more
	# precision. Each iteration should get another 10 bits...
	#
	# In the comments below X is r (ri:rf) and R is the reciprocal
	# estimate (invri:invrf); the refined value is R * (2 - R*X).
	#
.name	r2i, 	$v25	# intermediate value
.name	r2f, 	$v24
.name	vtmpi, 	$v23	# constant 2.0
.name	vtmpf, 	$v22


	# load S and T:
	llv	amin[8],  RSP_PTS_S(minp)
			vrcph	invri[3], ri[3]
	llv	amid[8],  RSP_PTS_S(midp)
			vrcpl	invrf[3], rf[3]
	llv	amax[8],  RSP_PTS_S(maxp)
			vrcph	invri[3], vconst[0]

	# stick 1.0 in for W:
		vmov	amin[6], vconst1[0]
		vmov	amid[6], vconst1[0]

	# load z's:
	lsv	amin[14], RSP_PTS_ZS(minp)
			vmudn	invrf, invrf, vconst[2]
	lsv	amid[14], RSP_PTS_ZS(midp)
			vmadh	invri, invri, vconst[2]
	lsv	amax[14], RSP_PTS_ZS(maxp)
			vmadn	invrf, vconst, vconst[0]

		vmov	amax[6], vconst1[0]

			lqv	vtmpi[0], VNEWT_OFFSET(zero)
 			vxor	vtmpf, vconst, vconst	# fraction part of the constant = 0
			vmudl	r2f, invrf, rf		#  R*X
			vmadm	r2f, invri, rf
			vmadn	r2f, invrf, ri
			vmadh	r2i, invri, ri
	# compute attribute deltas: (S15.16) watch alignment!
	# tHdai/tMdai are the negated deltas (amin - a*); tMdai is the
	# subtract term of the DxAtt multiply-accumulate further down.
	vsub	tHdai, amin, amax
	vsub	tMdai, amin, amid
			vsubc	r2f, vtmpf, r2f		#  2 - (R*X)
			vsub	r2i, vtmpi, r2i
.unname vtmpi
.unname vtmpf
.name	Hdai,	$v22
.name	Mdai,	$v21
	vsub	Mdai, amid, amin
	vsub	Hdai, amax, amin
			vmudl	vjunk, invrf, r2f		#  R * (2-R*X)
			vmadm	vjunk, invri, r2f
	addi	rdp_flg, zero, 0x80	# preset; cleared later unless the sign test is negative
			vmadn	invrf, invrf, r2i
			vmadh	invri, invri, r2i
.unname r2i
.unname r2f
.unname ri
.unname rf
/* $v29/$v28 (ri/rf) and $v25/$v24 (r2i/r2f) are recycled from here on */
.name	invEDeli,	$v29
.name	invEDelf,	$v28
.name	adei,		$v25
.name	adef,		$v24
.name	Hdaf,		$v23
.name	EDeli,		$v5
.name	EDelf,		$v6

	# identify left- or right-major triangle:
	# if (r < 0) dir = 0 else dir = 1
	# tmp still holds the post-sort sign test of r. rdp_flg was
	# preset to 0x80: it stays 0x80 when tmp < 0 and is cleared
	# otherwise.
	# NOTE(review): the "dir" line above and the label names do not
	# obviously agree on which case keeps 0x80 -- confirm against
	# the RDP flag definition.
		bltz	tmp, rightMajor
		lb	rdp_cmd, RSP_STATE_TRI(rsp_state)	# delay slot
		addi	rdp_flg, zero, 0x0	# left-major
    rightMajor:
	# Ldx/Ldy, Mdx/Mdy, Hdx/Hdy:
	#
	# Since the rcp ROM is 10 bits, that's good enough for
	# the edge slopes. Newton's doesn't help.
	#
	# Get triangle command from state and construct the proper RDP
	# command while we do this.
	#
			vmudm	EDeli, EDel, vconst[4]	# make S15.16
			vmadn	EDelf, vconst, vconst[0]

			vrcp	invEDelf[LOWY], EDel[LOWY]	# 1.0/Ldy
	# stick in tile number
	lb	tmp, RSP_STATE_TEX_TILE(rsp_state)
			vrcph	invEDeli[LOWY], vconst[0]
	# Always assumes we're doing at least color attribute
	ori	rdp_cmd, rdp_cmd, G_TRI_SHADE

			vrcp	invEDelf[MIDY], EDel[MIDY]	# 1.0/Mdy
			vrcph	invEDeli[MIDY], vconst[0]

			vrcp	invEDelf[HIGHY], EDel[HIGHY]	# 1.0/Hdy
	or	rdp_flg, rdp_flg, tmp	# 3 cycles after load
			vrcph	invEDeli[HIGHY], vconst[0]

	# open for output
	# 176 = 32 (edges) + 64 (shade) + 64 (texture) + 16 (z), the most
	# this routine advances outp for one triangle.
#if !(defined(OUTPUT_DRAM)||defined(OUTPUT_FIFO))
		jal	OutputOpen
		addi	$18, zero, 176 	# worst case guess (delay slot)
#endif /* !(OUTPUT_DRAM || OUTPUT_FIFO) */

	#
	# We used to shift down the rcp results all the way,
	# then do the multiply. If we don't shift it down all the
	# way, do the mult, then shift some more, we get better
	# precision on the degenerate cases.
	#
#if 1
	 		vmudl	invEDelf, invEDelf, vconst1[2]	# make S15.16
	sb	rdp_cmd, 0(outp)	# output rdp command
	 		vmadm	invEDeli, invEDeli, vconst1[2]
	sb	rdp_flg, 1(outp)	# output poly flag
	  		vmadn	invEDelf, vconst, vconst[0]
#else
	 		vmudl	invEDelf, invEDelf, vconst1[4]	# make S15.16
	sb	rdp_cmd, 0(outp)	# output rdp command
	 		vmadm	invEDeli, invEDeli, vconst1[4]
	sb	rdp_flg, 1(outp)	# output poly flag
	  		vmadn	invEDelf, vconst, vconst[0]
#endif

	# y setup:	(S11.2)
	# (we removed fractional bits in vertex loop)

	# Do some other work during the pipeline delay:
	# We scale up EDel so that later, during the attribute computation,
	# the 1/r multiply gives us the right S15.16 aligned answer.

		vmudh	EDel, EDel, vconst[5]	# mult by 4 for attributes

.name 	xi,	$v7
.name 	xf,	$v8

.unname	backrej
.unname	doreject
.unname	bsignr

	# x setup:	(S15.16)
	# (finish edge slopes while we do this)
	# The slope answer will end up in the Y element...
	# The low edge starts at the mid vertex; the mid and high edges
	# both start at the min vertex.

	lsv	xi[(LOWX*2)],  RSP_PTS_XS(midp)
/* DELAY HERE! */
			vmudl	DxXDyf, invEDelf, EDelf[0q]	# Ldx / Ldy
	lsv	xi[(MIDX*2)],  RSP_PTS_XS(minp) # same as high
			vmadm	DxXDyf, invEDeli, EDelf[0q]	# Mdx / Mdy
	lsv	xi[(HIGHX*2)], RSP_PTS_XS(minp)
			vmadn	DxXDyf, invEDelf, EDeli[0q]	# Hdx / Hdy
			vmadh	DxXDyi, invEDeli, EDeli[0q]
	sh	maxy, 2(outp)	# output y coords S11.2
			vmadn	DxXDyf, vconst, vconst[0]
	# translate S11.2 x's to S15.16.
	sh	miny, 6(outp)

.unname	EDeli
.unname	EDelf

.name	Mdaf,	$v5

#if 1
	# no shift down needed...
#else
	# shift down some more...
 		vmudl	invEDelf, invEDelf, vconst[4]
 		vmadm	invEDeli, invEDeli, vconst[4]
  		vmadn	invEDelf, vconst, vconst[0]

 		vmudl	DxXDyf, DxXDyf, vconst[4]
 		vmadm	DxXDyi, DxXDyi, vconst[4]
  		vmadn	DxXDyf, vconst, vconst[0]
#endif

		vmudm	xi, xi, one4th
		vmadn	xf, vconst, vconst[0]

	# Check DxXDy for "nearly-horizontal". Make horizontal, if so.
	# (only a single-precision clamp)
	sh	midy, 4(outp)
			vcr	DxXDyi, DxXDyi, vconst1[6]

.unname miny
.unname midy
.unname maxy
	#
	# These attribute multiplies use the full precision of the
	# accumulator. They are basically integer multiplies,
	# with only the upper 32 bits retrieved from the accumulator.
	#
	# See note up above about why EDel is being scaled up.
	#
	# S15.16 * S11.4 = SS26.20
	# (we only use the upper SS26.4, which we'll multiply
	# by 1/r below)
	#
	# (consider other method of computing this?)
	#
	# The scalar stores interleaved below fill the 32-byte edge
	# record at outp: x at 8/16/24 (int) and 10/18/26 (frac),
	# slopes at 12/20/28 (int) and 14/22/30 (frac).

		ssv	DxXDyf[(LOWY*2)], 14(outp)
	# compute DeAtt directly, divide Hda/ydelta,
	# instead of:	  de = dy + dx * DxXHDy
	vmudm	vjunk, Hdai, invEDelf[HIGHY]
		ssv	DxXDyf[(HIGHY*2)], 22(outp)
	vmadh	adei,  Hdai, invEDeli[HIGHY]
		ssv	DxXDyf[(MIDY*2)], 30(outp)
	vmadn	adef,  vconst, vconst[0]
		ssv	xi[(LOWX*2)],  8(outp)	# output xLow
	# DxAtt = Mdy*Hda - Hdy*Mda
	# we don't need DyAtt for no-AA case
	# (tMdai is amin - amid, so accumulating it performs the subtract;
	# the vsar pair then overwrites Hdai/Hdaf with the result)
	vmudh	vjunk, Hdai, EDel[MIDY]
		ssv	xf[(LOWX*2)], 10(outp)
	vmadh	vjunk, tMdai, EDel[HIGHY]
		ssv	DxXDyi[(LOWY*2)], 12(outp)
	vsar	Hdai, Hdai, Hdai[0]
		ssv	DxXDyi[(HIGHY*2)], 20(outp)
	vsar	Hdaf, Hdaf, Hdaf[1]
		ssv	DxXDyi[(MIDY*2)], 28(outp)
		ssv	xi[(HIGHX*2)], 16(outp)	# output xHigh
		ssv	xf[(HIGHX*2)], 18(outp)
		ssv	xi[(MIDX*2)],  24(outp)	# output xMid
		ssv	xf[(MIDX*2)],  26(outp)
		addi	outp, outp, 32	# increment output pointer

.unname xi
.unname xf
.unname	DxXDyi
.unname	DxXDyf
.unname	xHighf
.unname	yf

	# divide by r (S4.27)
	# This multiply results in the proper S15.16 attributes
	# that we need (texture is S10.21)
	# Write out the proper record to the RDP, based on the drawing
	# modes, and increment outp.
	#
	# Shade record (64 bytes, low four elements of each vector):
	#   0 start value   8 Dx int   16 zero     24 Dx frac
	#  32 De int       40 zero     48 De frac  56 zero
	vmudl	vjunk, Hdaf, invrf[3]
		sdv	vzeros[0], 40(outp)	# DyAtt
	vmadm	vjunk, Hdai, invrf[3]
		sdv	vzeros[0], 56(outp)	# DyAtt
	vmadn	Hdaf,  Hdaf, invri[3]
		sdv	adei[0],   32(outp)
	vmadh	Hdai,  Hdai, invri[3]
		sdv	adef[0],   48(outp)
		sdv	amin[0],    0(outp)
		sdv	vzeros[0], 16(outp)	# init fracs
		andi	tmp, rdp_cmd, G_RDP_TRI_TXTR_MASK
		sdv	Hdai[0],    8(outp)
		sdv	Hdaf[0],   24(outp)

	# write out texture
	# (same layout as the shade record, taken from the upper four
	# elements; skipped when the texture bit is clear in rdp_cmd)
		blez	tmp, outputZBUF
		addi	outp, outp, 64	# increment output pointer (delay)

		sdv	amin[8],    0(outp)
		sdv	vzeros[8], 16(outp)	# init fracs
		sdv	Hdai[8],    8(outp)
		sdv	Hdaf[8],   24(outp)
		sdv	vzeros[0], 40(outp)	# DyAtt
		sdv	vzeros[0], 56(outp)	# DyAtt
		sdv	adei[8],   32(outp)
		sdv	adef[8],   48(outp)
		addi	outp, outp, 64	# increment output pointer

outputZBUF:
	# Emit the 16-byte Z coefficient record when rdp_cmd has the
	# Z-buffer bit set; otherwise go straight to SetupDone.
		andi	tmp, rdp_cmd, G_RDP_TRI_ZBUFF_MASK
		blez	tmp, SetupDone

		# we need DyAtt for zbuffer, it's the delta-z
		# DyAtt = Hdx*Mda - Mdx*Hda
		# The subtraction is done by accumulating with tHdai, which
		# holds (amin - amax) = -Hda, the same way DxAtt uses tMdai.
		# Hdai cannot be used here: it was overwritten with DxAtt.
		# (The first vmudh directly follows the blez, so it is the
		# delay slot; it only writes vjunk and the accumulator.)
		vmudh	vjunk, Mdai, EDel[HIGHX]
		vmadh	vjunk, tHdai, EDel[MIDX]
		vsar	Mdai, Mdai, Mdai[0]
		vsar	Mdaf, Mdaf, Mdaf[1]

		# divide by r, same sequence as used for DxAtt
		vmudl	vjunk, Mdaf, invrf[3]
		vmadm	vjunk, Mdai, invrf[3]
		vmadn	Mdaf,  Mdaf, invri[3]
		vmadh	Mdai,  Mdai, invri[3]

	#
	# Scale Z-values up, screen coordinates were limited
	# to 10 integer bits, but the hardware floating point format
	# needs valid bits in the upper range for best performance.
	#
	# NOTE(review): adei/adef, amin and Hdai/Hdaf are scaled by
	# vconst1[4] below but Mdai/Mdaf are stored unscaled -- confirm
	# that this is intended.
	#
.name aminf,	$v1
			vmudn	adef, adef, vconst1[4]
			vmadh	adei, adei, vconst1[4]
			vmadn	adef, vconst, vconst[0]

			vmudn	aminf, vzeros, vconst1[4]
			vmadh	amin, amin, vconst1[4]
			vmadn	aminf, vconst, vconst[0]

	# Z record layout (element 7 of each vector):
	#   0 z int      2 z frac     4 Dx int    6 Dx frac
	#   8 De int    10 De frac   12 Dy int   14 Dy frac
	ssv	adei[14],    8(outp)	# output z stuff.
			vmudn	Hdaf, Hdaf, vconst1[4]
	ssv	adef[14],   10(outp)
			vmadh	Hdai, Hdai, vconst1[4]
			vmadn	Hdaf, vconst, vconst[0]

	ssv	Hdai[14],    4(outp)
	ssv	Hdaf[14],    6(outp)
	ssv	Mdai[14],   12(outp)
	ssv	Mdaf[14],   14(outp)

	addi	outp, outp, 16			# increment output pointer
	# outp already points past the record, hence the (n-16) offsets
	ssv	amin[14],  (0-16)(outp)	# 0
	ssv	aminf[14],  (2-16)(outp)	# 2
.unname aminf
SetupDone:	# done or rejected. do any clean-up.
		jal	OutputClose
		# note delay slot
		# (the nop at SetupReject fills it; OutputClose then returns
		# to the "j NextTri" below)

SetupReject:	# no OutputClose needed...
		nop

		j	NextTri		# continue with the next triangle record
		nop			# delay slot

		.end	beginSetup

/* un-name scalar registers: */
/* (the other scalar names were already released inside beginSetup) */
.unname	trip
.unname	tricnt
.unname	trin
.unname	minp
.unname	midp
.unname	maxp
.unname	flatp
.unname	rdp_cmd
.unname	rdp_flg
.unname	tmp
.unname	rendState

/* un-name vector registers: */
.unname	EDel
.unname	invEDeli
.unname	invEDelf

.unname	Hdai
.unname	Hdaf
.unname	Mdai
.unname	Mdaf
.unname	adei
.unname	adef
.unname	amin
.unname	vzeros
.unname vjunk
.unname	tMdai
.unname	amid
.unname	amax
.unname	tHdai
.unname	invri
.unname	invrf

#if 1
	# test for thorough register un-naming.
	# Re-binds $1..$20 and $v0..$v29 to throwaway names; presumably
	# the assembler complains when a register still carries one of
	# the names used above -- TODO confirm.
	# NOTE(review): these test names are not released in the code
	# shown here.
.name r1, $1
.name r2, $2
.name r3, $3
.name r4, $4
.name r5, $5
.name r6, $6
.name r7, $7
.name r8, $8
.name r9, $9
.name r10, $10
.name r11, $11
.name r12, $12
.name r13, $13
.name r14, $14
.name r15, $15
.name r16, $16
.name r17, $17
.name r18, $18
.name r19, $19
.name r20, $20

.name vv0, $v0
.name vv1, $v1
.name vv2, $v2
.name vv3, $v3
.name vv4, $v4
.name vv5, $v5
.name vv6, $v6
.name vv7, $v7
.name vv8, $v8
.name vv9, $v9
.name vv10, $v10
.name vv11, $v11
.name vv12, $v12
.name vv13, $v13
.name vv14, $v14
.name vv15, $v15
.name vv16, $v16
.name vv17, $v17
.name vv18, $v18
.name vv19, $v19
.name vv20, $v20
.name vv21, $v21
.name vv22, $v22
.name vv23, $v23
.name vv24, $v24
.name vv25, $v25
.name vv26, $v26
.name vv27, $v27
.name vv28, $v28
.name vv29, $v29

#endif