/* gtsetup.s */

/*
 * Copyright 1995, Silicon Graphics, Inc.
 * ALL RIGHTS RESERVED
 *
 * UNPUBLISHED -- Rights reserved under the copyright laws of the United
 * States.   Use of a copyright notice is precautionary only and does not
 * imply publication or disclosure.
 *
 * U.S. GOVERNMENT RESTRICTED RIGHTS LEGEND:	
 * Use, duplication or disclosure by the Government is subject to restrictions
 * as set forth in FAR 52.227.19(c)(2) or subparagraph (c)(1)(ii) of the Rights
 * in Technical Data and Computer Software clause at DFARS 252.227-7013 and/or
 * in similar or successor clauses in the FAR, or the DOD or NASA FAR
 * Supplement.  Contractor/manufacturer is Silicon Graphics, Inc.,
 * 2011 N. Shoreline Blvd. Mountain View, CA 94039-7311.
 *
 * THE CONTENT OF THIS WORK CONTAINS CONFIDENTIAL AND PROPRIETARY
 * INFORMATION OF SILICON GRAPHICS, INC. ANY DUPLICATION, MODIFICATION,
 * DISTRIBUTION, OR DISCLOSURE IN ANY FORM, IN WHOLE, OR IN PART, IS STRICTLY
 * PROHIBITED WITHOUT THE PRIOR EXPRESS WRITTEN PERMISSION OF SILICON
 * GRAPHICS, INC.
 *
 */
	
/*
 * File:		gtsetup.s
 * Creator:		hsa@sgi.com
 * Create Date:		Thu Oct 12 13:34:53 PDT 1995
 *
 */
	
 ##########################################################################
 #
 # TURBO Triangle Setup Routine.
 # When entering this code we have a points buffer full of points,
 # and registers r1, r2, r3 point to the three vertices of a triangle.
 #
 ##########################################################################
	
/*
 * Symbolic names used by the triangle setup code below.
 * one4th aliases vconst[4]; it is the multiplier applied to the S11.2
 * x coordinates when they are converted to S15.16 (vmudm xi, xi, one4th).
 */
#define one4th	vconst[4]

#define	LOWX	0	/* used to index elements of edge vectors */
#define	LOWY	1
#define MIDX	2
#define MIDY	3
#define HIGHX	4
#define HIGHY	5

	/*
	 * Register Allocation:
	 * Longer living registers are assigned to the lower registers,
	 * except the common edge/attribute vector registers are assigned
	 * to the high registers.
	 *
	 * Attribute (vector) registers grow down from the top, in order
	 * not to interfere with edge registers.
	 *
	 * NOTE(review): trin, Ld, yf and xHighf are named here (and
	 * un-named later) but no instruction in this file references them.
	 */

/* scalar registers: */
.name	trip,		$1	# walks the triangle list, 4 bytes per entry
.name	tricnt,		$2	# count*4, then turned into the list end pointer
.name	trin,		$3
	
.name	minp,		$4	# vertex pointers; sorted by y in beginSetup
.name	midp,		$5
.name	maxp,		$6
.name	flatp,		$7	# vertex supplying the color when flat shading
.name	rdp_cmd,	$8	# first byte written to the output command
.name	rdp_flg,	$9	# second byte: 0x80/0 major flag OR-ed with tile
.name	tmp,		$10	# short-lived scratch
.name	rendState,	$11	# word loaded from RSP_STATE_RENDER
.name	miny, 		$12	# screen y of the three vertices
.name	midy, 		$13
.name	maxy, 		$14
.name	backrej,	$15	# GT_CULL_BACK bits, later masked by the sign of R
.name	doreject,	$16	# zero when R is zero (triangle is dropped)
.name	bsignr,		$17	# element 0 of min(R, vconst[0]), then its sign mask
.name	negR, 		$18	# bit 0 toggles on every swap during the y-sort

/* these are "global", used for both edge and attribute setup */
.name	EDel,		$v0
.name	ri,		$v29	# registers fixed by Newton's
.name	rf,		$v28
.name	invri,		$v27
.name	invrf,		$v26

/* these registers are dynamic, allocated and released as they are used */
.name	DxXDyi,		$v1
.name	DxXDyf,		$v2
.name	yf,		$v3
.name	xHighf,		$v4
.name	Hd,		$v5	# vmax - vmin
.name	Md,		$v6	# vmid - vmin
.name	Ld,		$v7
.name	td,		$v8	# vmin - vmid (pre-sort only)
.name	vmin, 		$v9
.name	vmid, 		$v10
.name	vmax, 		$v11	
.name	jnk,		$v12	# throw-away multiply destination
.name	t1i,		$v13	# partial products read back from the accumulator
.name	t1f,		$v14
.name	t2i,		$v15
.name	t2f,		$v16
	
TrinProc:

	# Triangle-list driver.
	# Does nothing but branch to GfxDone when GT_FLAG_XFM_ONLY is set.
	# Otherwise it walks the 4-byte triangle records that start at
	# RSP_TRIN_OFFSET, hands each one to beginSetup (which jumps back
	# to NextTri) and finishes with one pipe-sync command.
	# The return address is parked in DMEM at RSP_RETURN_SAVE because
	# jal is used again before this routine returns.

	# check state for XFM_ONLY
		lb	tmp, RSP_STATE_FLAG(rsp_state)
		andi	tmp, tmp, GT_FLAG_XFM_ONLY
		bgtz	tmp, GfxDone


	# NOTE(review): this sw is the instruction right after the bgtz,
	# so it presumably runs as the delay slot even when the branch to
	# GfxDone is taken -- looks harmless, confirm.
		sw	return, RSP_RETURN_SAVE(zero)
		addi	trip, zero, RSP_TRIN_OFFSET
	# lb sign-extends; assumes the triangle count stays below 128
	# -- TODO confirm against the host-side limit.
		lb	tricnt, RSP_STATE_TRICOUNT(rsp_state)
		sll	tricnt, tricnt, 2	# 4 bytes per triangle record
		beq	tricnt, zero, TriSkip	# no tris
		add	tricnt, tricnt, trip	# (delay slot) tricnt = end of list
		lw	rendState, RSP_STATE_RENDER(rsp_state)
   NextTri:
		beq	trip, tricnt, TriDone	# done with tris, do sync
	# get index
	# (the first lb sits in the beq delay slot; it only writes minp)
		lb	minp, 0(trip)
		lb	midp, 1(trip)
		lb	maxp, 2(trip)
	
	# index*sizeof(point)
		sll	minp, minp, 4
		sll	midp, midp, 4
		sll	maxp, maxp, 4
	
	# add point buffer offset
		addi	minp, minp, RSP_POINTS_OFFSET
		addi	midp, midp, RSP_POINTS_OFFSET
		addi	maxp, maxp, RSP_POINTS_OFFSET
	
	# flat shading needs one more pointer (flatp); loadFlat fetches it
	# and jumps back to loadFlatDone.
		andi	tmp, rendState, G_SHADING_SMOOTH
		beq	tmp, zero, loadFlat
    loadFlatDone:
		nop			# delay slot of the beq above
	
	# do setup
		j	beginSetup
		addi	trip, trip, 4		# delay slot
		# beginSetup returns to NextTri
    TriDone:	
	# send a pipe sync after group of primitives.
	# this is somewhat wasteful, but safer.
	# Raw register numbers are used here: $18 (the register named negR)
	# is set to 8 ahead of OutputOpen, presumably the byte count being
	# requested -- confirm; $1 (trip, no longer needed) holds the
	# command word.
#if !(defined(OUTPUT_DRAM)||defined(OUTPUT_FIFO))
		jal	OutputOpen
		addi	$18, zero, 8
#endif /* !(OUTPUT_DRAM || OUTPUT_FIFO) */
		lui	$1, 0xe700	# $1 = 0xe7000000
		sw	$1, 0(outp)
		sw	zero, 4(outp)
		jal	OutputClose
		addi	outp, outp, 8	# (delay slot) account for the 8 bytes
	
    TriSkip:
	# NOTE(review): nothing is placed after the jr, so the lb at
	# loadFlat below lands in its delay slot.  It only writes flatp,
	# which looks harmless -- confirm.
		lw	return, RSP_RETURN_SAVE(zero)
		jr	return

	# load flat shade pointer
	# Byte 3 of the triangle record selects one of the three vertex
	# pointers: they are spilled to scratch DMEM as a 3-entry word
	# table and the selected entry is read back into flatp.
    loadFlat:
		lb	flatp, 3(trip)
		sll	flatp, flatp, 2		# selector -> word offset
		sw	minp, (0 + RSP_SCRATCH_OFFSET)(zero)
		sw	midp, (4 + RSP_SCRATCH_OFFSET)(zero)
		sw	maxp, (8 + RSP_SCRATCH_OFFSET)(zero)
		j	loadFlatDone
		lw	flatp, RSP_SCRATCH_OFFSET(flatp)	# delay slot
	

		.ent	beginSetup
beginSetup:
	#
	# Per-triangle setup.  Inputs are the vertex pointers minp, midp
	# and maxp (plus flatp when flat shading) prepared by NextTri.
	# Control leaves through SetupDone or SetupReject, both of which
	# jump back to NextTri.
	#
	# Scalar (SU) and vector (VU) instructions are interleaved; the
	# deeper indentation mostly marks the VU stream.
	#
	
	# load screen coordinates (pre-sort):
	llv	vmin[0], RSP_PTS_XS(minp)
	llv	vmid[0], RSP_PTS_XS(midp)	# element 1 is y
	llv	vmax[0], RSP_PTS_XS(maxp)	# element 0 is x,

/* DELAY HERE! */
#ifdef OUTPUT_FIFO
	# restore this, it was blown away by another
	# proc...
	lw	rendState, RSP_STATE_RENDER(rsp_state)
#endif
	# pre-sort edge deltas; td is Md with the sign flipped
			vsub	Md, vmid, vmin
			vsub	Hd, vmax, vmin
			vsub	td, vmin, vmid
	addi	negR, zero, 0		# no swaps yet
	
/* DELAY HERE! */
	# compute the partial products...
	# careful with the math here...
	# Element 0 of the two products is Hdx*Mdy and (-Mdx)*Hdy; they are
	# added at swap2 to form R.  Each product is read back from the
	# accumulator with a pair of vsar (f and i halves, going by the
	# register names).
			vmudh	jnk, Hd, Md[1]
	lh	miny, RSP_PTS_YS(minp)		# get the y's
			vsar	t1f, t1f, t1f[1]
	lh	midy, RSP_PTS_YS(midp)
			vsar	t1i, t1i, t1i[0]
	lh	maxy, RSP_PTS_YS(maxp)
			vmudh	jnk, td, Hd[1]
			vsar	t2f, t2f, t2f[1]
	andi	backrej, rendState, GT_CULL_BACK
			vsar	t2i, t2i, t2i[0]
	
	# begin back-face test:
	#
	# Back-face test is the sign of the plane equation BEFORE VERTEX
	# SORT, tested with the CULL_BACK flag.
	# We toggle a bit during the sort and possibly correct the
	# pleq sign afterwards...
	#
	# Actual back-face computation is SU code weaved in among
	# the VU code.

	# y-sort. Remember, input screen coords are S11.2
	#
	# Three-element sort on (miny, midy, maxy) that carries the vertex
	# pointers along.  swap1 orders min/mid, swap2 orders mid/max and
	# jumps back to swap1 whenever it had to swap.  Every swap toggles
	# bit 0 of negR.
	#
    swap1:	slt	tmp, midy, miny		#tmp = 1 if midy < miny, else 0
	    	blez	tmp, swap2		#tmp == 0: already ordered, skip swap
		add	tmp, midy, $0		#(delay slot) put midy in tmp
		add	midy, miny, $0		#put miny in midy	
		add	miny, tmp, $0		#put tmp in miny	
		addu	tmp, midp, $0		#put midp in tmp	
		addu	midp, minp, $0		#put minp in midp	
		addu	minp, tmp, $0		#put tmp in minp	
		xori	negR, negR, 0x0001
	
		.align	8	# ensure dual-issue of branch target
    swap2:
	# R = t1 + t2 (fraction first, then integer).  These two adds are
	# repeated when the sort loops back through swap1; their inputs do
	# not change, so the result is the same.
			vaddc	rf, t1f, t2f
		slt	tmp, maxy, midy		#tmp = 1 if maxy < midy, else 0
			vadd	ri, t1i, t2i
		blez	tmp, sortDone		#tmp == 0: already ordered, done
		add	tmp, maxy, $0		#(delay slot) put maxy in tmp
		add	maxy, midy, $0		#put midy in maxy
		add	midy, tmp, $0		#put tmp in midy
		addu	tmp, maxp, $0		#put maxp in tmp
		addu	maxp, midp, $0		#put midp in maxp
		addu	midp, tmp, $0		#put tmp in midp
		j	swap1			#re-check min/mid after this swap
		xori	negR, negR, 0x0001	#(delay slot)
    sortDone:
	# this branch target is aligned for dual-issue (see above)
	
	# invri and invrf are borrowed as scratch here, before they hold
	# the reciprocal: invri = min(ri, vconst[0]) and invrf = ri | rf.
	# Both are taken from R as computed from the PRE-sort vertex order,
	# i.e. before the optional negate below; their element 0 is moved
	# to bsignr (sign test) and doreject (zero test).
	# load screen coordinates:	(S11.2)
			vlt	invri, ri, vconst[0]
		llv	vmax[0], RSP_PTS_XS(maxp)	# element 0 is x,
	 		vor	invrf, ri, rf
		llv	vmid[0], RSP_PTS_XS(midp)	# element 1 is y
		llv	vmin[0], RSP_PTS_XS(minp)

	# possibly negate R
	# (skipped when negR is 0, i.e. an even number of swaps; the
	# multiplier vconst[3] is presumably -1 -- confirm)
		blez	negR, posiR
 		vsub	EDel, vmax, vmid	# delay slot, low deltas
		vmudn	rf, rf, vconst[3]	# negate R
		vmadh	ri, ri, vconst[3]
 		vmadn	rf, vconst, vconst[0]
	posiR:
	
	# compute edge deltas:		(S11.2)	
	# (Need to do this again after the sort)
	# save out vertex pointers for attribute processing
	# while doing this.
			 		vsub	Md, vmid, vmin
	mfc2	bsignr, invri[0]
			 		vsub	Hd, vmax, vmin
	mfc2	doreject, invrf[0]
	
	# the sort temporaries and vertex coordinate registers are
	# released here
.unname jnk
.unname t1i
.unname t1f
.unname t2i
.unname t2f
	
.unname	td
.unname	vmin
.unname	vmid
.unname	vmax
	
/* attribute registers: one vector per (sorted) vertex plus temporaries */
.name	vzeros,	$v20
.name	amin,	$v19
.name	amid,	$v18
.name	amax,	$v17
.name	vjunk,	$v16
.name	tMdai, 	$v15
.name	tHdai, 	$v14

	#
	# Collect all the attributes
	# in a vector (r,g,b,a,s,t,w,?) 
	# load smooth-shading colors first.
	# RGBA, use fancy packed load, then shift.
	# DMEM alignment is crucial here!
	# usage of tmp pointer is to make alignments work!
	addi	tmp, maxp, 4
	luv	amax[0], RSP_PTS_R(tmp)
	addi	tmp, minp, 4
	luv	amin[0], RSP_PTS_R(tmp)
		vxor	vzeros, vconst, vconst	# x ^ x: all elements zero
	addi	tmp, midp, 4
	luv	amid[0], RSP_PTS_R(tmp)

	# check for flat shading:
	# When G_SHADING_SMOOTH is clear, all three color loads are redone
	# from the single vertex flatp points at.  The addi sits in the
	# branch delay slot, so tmp is overwritten on both paths.
	andi	tmp, rendState, G_SHADING_SMOOTH
	bgtz	tmp, smoothShade
	addi	tmp, flatp, 4
	luv	amax[0], RSP_PTS_R(tmp)
	luv	amin[0], RSP_PTS_R(tmp)
	luv	amid[0], RSP_PTS_R(tmp)
smoothShade:
	
	# if (r < 0) then triangle is a back-face.
	# finish back-face processing in the SU.
	# bsignr becomes all ones when its sign bit is set, so backrej
	# keeps its GT_CULL_BACK bits only for a negative (pre-sort) R.

			# align these for Newton
			# (element 0 of R is copied to element 3, which is
			# the element the reciprocal instructions below use)
	sra	bsignr, bsignr, 31
			vmov	ri[3], ri[0]
	and	backrej, backrej, bsignr
			vmov	rf[3], rf[0]
	
	# If (r == 0), triangle is NULL, we should bail out completely.
			vmov	EDel[MIDX], Md[0]
 	beq	doreject, zero, SetupReject
	# note delay slot
	
		vmudm	amax, amax, vconst[7]	# multiply by 1/512.0 to
		vmudm	amin, amin, vconst[7]	# move things into lower byte.
	bgtz	backrej, SetupReject	# culled back-face
		vmudm	amid, amid, vconst[7]	# (delay slot)
	
			# align these for speed later.
			# EDel ends up as (Lx, Ly, Mx, My, Hx, Hy): the low
			# edge deltas were written at sortDone, mid and high
			# are copied in from Md and Hd here.
	# re-test the sign of r *after* the sort for left/right-ness
	# (the result is read into tmp and branched on further down)
			vlt	invri, ri, vconst[0]
	andi	miny, miny, 0xfffc	# clear the two low bits, keep 16 bits
			vmov	EDel[MIDY], Md[1]
	andi	midy, midy, 0xfffc
			vmov	EDel[HIGHX], Hd[0]
	andi	maxy, maxy, 0xfffc
			vmov	EDel[HIGHY], Hd[1]
	mfc2	tmp, invri[0]
	
.unname	negR
	
.unname	Hd
.unname	Md
.unname	Ld
	
	#
	# compute 1/r
	# R is about 10 bits accurate coming from the rcp table.
	# We need to do a Newton's iteration pass here to get more
	# precision. Each iteration should get another 10 bits...
	#
	# One pass is performed below:  X' = X * (2 - R*X), with X the
	# table result held in invri/invrf and R in ri/rf.  Attribute loads
	# and attribute deltas are interleaved with it.
	#
.name	r2i, 	$v25	# intermediate value
.name	r2f, 	$v24
.name	vtmpi, 	$v23	# constant 2.0
.name	vtmpf, 	$v22
	

	# load S and T:
	# (byte offset 8 = elements 4 and 5 of the attribute vectors)
	llv	amin[8],  RSP_PTS_S(minp)
			vrcph	invri[3], ri[3]
	llv	amid[8],  RSP_PTS_S(midp)
			vrcpl	invrf[3], rf[3]
	llv	amax[8],  RSP_PTS_S(maxp)
			vrcph	invri[3], vconst[0]
	
	# stick 1.0 in for W:
		vmov	amin[6], vconst1[0]
		vmov	amid[6], vconst1[0]
	
	# load z's:
	# (byte offset 14 = element 7); the interleaved VU triple rescales
	# the raw reciprocal by vconst[2]
	lsv	amin[14], RSP_PTS_ZS(minp)
			vmudn	invrf, invrf, vconst[2]
	lsv	amid[14], RSP_PTS_ZS(midp)
			vmadh	invri, invri, vconst[2]
	lsv	amax[14], RSP_PTS_ZS(maxp)
			vmadn	invrf, vconst, vconst[0]

		vmov	amax[6], vconst1[0]
	
			# vtmpi:vtmpf = the constant 2.0 (integer part loaded
			# from VNEWT_OFFSET, fraction zeroed)
			lqv	vtmpi[0], VNEWT_OFFSET(zero)
 			vxor	vtmpf, vconst, vconst
			vmudl	r2f, invrf, rf		#  R*X
			vmadm	r2f, invri, rf
			vmadn	r2f, invrf, ri
			vmadh	r2i, invri, ri
	# compute attribute deltas: (S15.16) watch alignment!
	# tHdai and tMdai are the deltas with the sign flipped
	# (amin - amax, amin - amid).
	vsub	tHdai, amin, amax
	vsub	tMdai, amin, amid
			vsubc	r2f, vtmpf, r2f		#  2 - (R*X)
			vsub	r2i, vtmpi, r2i
	# $v22 is handed from vtmpf to Hdai; it is only written again
	# after the last read of vtmpf above.
.unname vtmpi
.unname vtmpf
.name	Hdai,	$v22
.name	Mdai,	$v21
	vsub	Mdai, amid, amin
	vsub	Hdai, amax, amin
			vmudl	vjunk, invrf, r2f		#  R * (2-R*X)
			vmadm	vjunk, invri, r2f
	addi	rdp_flg, zero, 0x80	# default flag; may be reset to 0 below
			vmadn	invrf, invrf, r2i
			vmadh	invri, invri, r2i
	# R itself is no longer needed: $v29/$v28 are re-used for the edge
	# reciprocals, $v25/$v24 for the DeAtt results.
.unname r2i
.unname r2f
.unname ri
.unname rf
.name	invEDeli,	$v29
.name	invEDelf,	$v28
.name	adei,		$v25
.name	adef,		$v24
.name	Hdaf,		$v23
.name	EDeli,		$v5
.name	EDelf,		$v6
	
	# identify left- or right-major triangle:
	# if (r < 0) dir = 0 else dir = 1
	# tmp holds the post-sort sign test of R.  What the code does:
	# negative -> rdp_flg stays 0x80, otherwise rdp_flg = 0.
	# NOTE(review): the dir values in the line above and the 0x80
	# polarity do not obviously agree -- confirm which is intended.
		bltz	tmp, rightMajor
		lb	rdp_cmd, RSP_STATE_TRI(rsp_state)	# delay slot
		addi	rdp_flg, zero, 0x0	# left-major
    rightMajor:
	# Ldx/Ldy, Mdx/Mdy, Hdx/Hdy:
	#
	# Since the rcp ROM is 10 bits, that's good enough for
	# the edge slopes. Newton's doesn't help.
	#
	# Get triangle command from state and construct the proper RDP
	# command while we do this.
	#
			vmudm	EDeli, EDel, vconst[4]	# make S15.16
			vmadn	EDelf, vconst, vconst[0]
	
	# Only the Y elements (LOWY, MIDY, HIGHY) of invEDeli/invEDelf are
	# written by the reciprocals; the other elements keep whatever
	# $v29/$v28 held before.
			vrcp	invEDelf[LOWY], EDel[LOWY]	# 1.0/Ldy
	# stick in tile number
	lb	tmp, RSP_STATE_TEX_TILE(rsp_state)
			vrcph	invEDeli[LOWY], vconst[0]
	# Always assumes we're doing at least color attribute
	ori	rdp_cmd, rdp_cmd, G_TRI_SHADE
	
			vrcp	invEDelf[MIDY], EDel[MIDY]	# 1.0/Mdy
			vrcph	invEDeli[MIDY], vconst[0]
	
			vrcp	invEDelf[HIGHY], EDel[HIGHY]	# 1.0/Hdy
	or	rdp_flg, rdp_flg, tmp	# 3 cycles after load
			vrcph	invEDeli[HIGHY], vconst[0]
	
	# open for output
	# 176 = 32 (edge) + 64 (shade) + 64 (texture) + 16 (z), the sum of
	# every outp increment made below.  $18 is addressed by number
	# because its name (negR) has already been released.
#if !(defined(OUTPUT_DRAM)||defined(OUTPUT_FIFO))
		jal	OutputOpen
		addi	$18, zero, 176 	# worst case guess (delay slot)
#endif /* !(OUTPUT_DRAM || OUTPUT_FIFO) */

	#
	# We used to shift down the rcp results all the way,
	# then do the multiply. If we don't shift it down all the
	# way, do the mult, then shift some more, we get better
	# precision on the degenerate cases.
	#
	# (the #else arm is the older variant, kept for reference; it
	# differs only in the vconst1 element used as the multiplier)
#if 1
	 		vmudl	invEDelf, invEDelf, vconst1[2]	# make S15.16
	sb	rdp_cmd, 0(outp)	# output rdp command
	 		vmadm	invEDeli, invEDeli, vconst1[2]
	sb	rdp_flg, 1(outp)	# output poly flag
	  		vmadn	invEDelf, vconst, vconst[0]
#else
	 		vmudl	invEDelf, invEDelf, vconst1[4]	# make S15.16
	sb	rdp_cmd, 0(outp)	# output rdp command
	 		vmadm	invEDeli, invEDeli, vconst1[4]
	sb	rdp_flg, 1(outp)	# output poly flag
	  		vmadn	invEDelf, vconst, vconst[0]
#endif

	# y setup:	(S11.2)
	# (we removed fractional bits in vertex loop)
	
	# Do some other work during the pipeline delay:
	# We scale up EDel so that later, during the attribute computation,
	# the 1/r multiply gives us the right S15.16 aligned answer.
	
		vmudh	EDel, EDel, vconst[5]	# mult by 4 for attributes
	
.name 	xi,	$v7
.name 	xf,	$v8
	
.unname	backrej
.unname	doreject
.unname	bsignr
	
	# x setup:	(S15.16)
	# (finish edge slopes while we do this)
	# The slope answer will end up in the Y element...
	# xi collects the three edge start x values: the low edge starts
	# at the mid vertex, the mid and high edges at the min vertex.
	
	lsv	xi[(LOWX*2)],  RSP_PTS_XS(midp)
/* DELAY HERE! */
			vmudl	DxXDyf, invEDelf, EDelf[0q]	# Ldx / Ldy
	lsv	xi[(MIDX*2)],  RSP_PTS_XS(minp) # same as high
			vmadm	DxXDyf, invEDeli, EDelf[0q]	# Mdx / Mdy
	lsv	xi[(HIGHX*2)], RSP_PTS_XS(minp)
			vmadn	DxXDyf, invEDelf, EDeli[0q]	# Hdx / Hdy
			vmadh	DxXDyi, invEDeli, EDeli[0q]
	sh	maxy, 2(outp)	# output y coords S11.2
			vmadn	DxXDyf, vconst, vconst[0]
	# translate S11.2 x's to S15.16.
	sh	miny, 6(outp)

.unname	EDeli
.unname	EDelf
	
	# $v5 (EDeli until just above) is re-used for the DyAtt fraction
.name	Mdaf,	$v5

#if 1
	# no shift down needed...
#else
	# shift down some more...
 		vmudl	invEDelf, invEDelf, vconst[4]
 		vmadm	invEDeli, invEDeli, vconst[4]
  		vmadn	invEDelf, vconst, vconst[0]
	
 		vmudl	DxXDyf, DxXDyf, vconst[4]
 		vmadm	DxXDyi, DxXDyi, vconst[4]
  		vmadn	DxXDyf, vconst, vconst[0]
#endif

		vmudm	xi, xi, one4th
		vmadn	xf, vconst, vconst[0]
	
	# Check DxXDy for "nearly-horizontal". Make horizontal, if so.
	# (only a single-precision clamp)
	sh	midy, 4(outp)
			vcr	DxXDyi, DxXDyi, vconst1[6]
	
.unname miny
.unname midy
.unname maxy
	#
	# These attribute multiplies use the full precision of the
	# accumulator. They are basically integer multiplies,
	# with only the upper 32 bits retrieved from the accumulator.
	#
	# See note up above about why EDel is being scaled up.
	#
	# S15.16 * S11.4 = SS26.20
	# (we only use the upper SS26.4, which we'll multiply
	# by 1/r below)
	#
	# (consider other method of computing this?)
	#
	# Edge block layout written here (byte offsets from outp; bytes
	# 0-7 were written earlier):
	#    8/10  xLow   i/f     12/14  low edge slope  i/f
	#   16/18  xHigh  i/f     20/22  high edge slope i/f
	#   24/26  xMid   i/f     28/30  mid edge slope  i/f
	
		ssv	DxXDyf[(LOWY*2)], 14(outp)
	# compute DeAtt directly, divide Hda/ydelta, 
	# instead of:	  de = dy + dx * DxXHDy
	vmudm	vjunk, Hdai, invEDelf[HIGHY]
		ssv	DxXDyf[(HIGHY*2)], 22(outp)
	vmadh	adei,  Hdai, invEDeli[HIGHY]
		ssv	DxXDyf[(MIDY*2)], 30(outp)
	vmadn	adef,  vconst, vconst[0]
		ssv	xi[(LOWX*2)],  8(outp)	# output xLow
	# DxAtt = Mdy*Hda - Hdy*Mda
	# we don't need DyAtt for no-AA case
	# (the subtraction is done by adding tMdai, the negated Mda; the
	# two vsar then overwrite Hdai/Hdaf with the accumulator result,
	# so Hdai no longer holds the raw delta from here on)
	vmudh	vjunk, Hdai, EDel[MIDY]
		ssv	xf[(LOWX*2)], 10(outp)
	vmadh	vjunk, tMdai, EDel[HIGHY]
		ssv	DxXDyi[(LOWY*2)], 12(outp)
	vsar	Hdai, Hdai, Hdai[0]
		ssv	DxXDyi[(HIGHY*2)], 20(outp)
	vsar	Hdaf, Hdaf, Hdaf[1]
		ssv	DxXDyi[(MIDY*2)], 28(outp)
		ssv	xi[(HIGHX*2)], 16(outp)	# output xHigh
		ssv	xf[(HIGHX*2)], 18(outp)
		ssv	xi[(MIDX*2)],  24(outp)	# output xMid
		ssv	xf[(MIDX*2)],  26(outp)
		addi	outp, outp, 32	# increment output pointer

.unname xi
.unname xf
.unname	DxXDyi
.unname	DxXDyf
.unname	xHighf
.unname	yf
	
	# divide by r (S4.27)
	# This multiply results in the proper S15.16 attributes
	# that we need (texture is S10.21)
	# Write out the proper record to the RDP, based on the drawing
	# modes, and increment outp.
	#
	# Each 64-byte attribute block is laid out as (byte offsets):
	#    0 start value (int)     16 start value (frac, zero)
	#    8 DxAtt (int)           24 DxAtt (frac)
	#   32 DeAtt (int)           48 DeAtt (frac)
	#   40 DyAtt (int, zero)     56 DyAtt (frac, zero)
	# The first block takes elements 0-3 of each vector, the optional
	# texture block elements 4-7 (byte offset 8 in the registers).
	vmudl	vjunk, Hdaf, invrf[3]
		sdv	vzeros[0], 40(outp)	# DyAtt
	vmadm	vjunk, Hdai, invrf[3]
		sdv	vzeros[0], 56(outp)	# DyAtt
	vmadn	Hdaf,  Hdaf, invri[3]
		sdv	adei[0],   32(outp)
	vmadh	Hdai,  Hdai, invri[3]
		sdv	adef[0],   48(outp)
		sdv	amin[0],    0(outp)
		sdv	vzeros[0], 16(outp)	# init fracs
		andi	tmp, rdp_cmd, G_RDP_TRI_TXTR_MASK
		sdv	Hdai[0],    8(outp)
		sdv	Hdaf[0],   24(outp)	

	# write out texture
	# (skipped when the texture bit is clear in rdp_cmd; the shade
	# block is accounted for in the delay slot on both paths)
		blez	tmp, outputZBUF
		addi	outp, outp, 64	# increment output pointer (delay)
	
		sdv	amin[8],    0(outp)
		sdv	vzeros[8], 16(outp)	# init fracs
		sdv	Hdai[8],    8(outp)
		sdv	Hdaf[8],   24(outp)	
		sdv	vzeros[0], 40(outp)	# DyAtt
		sdv	vzeros[0], 56(outp)	# DyAtt
		sdv	adei[8],   32(outp)
		sdv	adef[8],   48(outp)
		addi	outp, outp, 64	# increment output pointer
outputZBUF:
	# Optional 16-byte Z coefficient block, emitted only when the
	# z-buffer bit is set in rdp_cmd.  Z lives in element 7 (byte
	# offset 14) of the attribute vectors.  Layout (byte offsets):
	#    0/2  z      i/f      4/6   DxAtt(z) i/f
	#    8/10 DeAtt(z) i/f   12/14  DyAtt(z) i/f
		andi	tmp, rdp_cmd, G_RDP_TRI_ZBUFF_MASK
		blez	tmp, SetupDone

		# we need DyAtt for zbuffer, it's the delta-z
		# DyAtt = Hdx*Mda - Mdx*Hda
		#
		# The subtraction is done by accumulating with tHdai, which
		# holds amin - amax (Hda with the sign flipped), the same way
		# the DxAtt sum uses tMdai.  Hdai cannot be used here: it was
		# overwritten with the DxAtt result.  This used to accumulate
		# Mdai a second time, which produced Mda*(Hdx + Mdx) instead
		# of the expression above.
		#
		# The first vmudh is also the delay slot of the blez; on the
		# taken path it only disturbs vjunk and the accumulator.
		vmudh	vjunk, Mdai, EDel[HIGHX]
		vmadh	vjunk, tHdai, EDel[MIDX]
		vsar	Mdai, Mdai, Mdai[0]
		vsar	Mdaf, Mdaf, Mdaf[1]

		# divide by r, as was done for DxAtt
		vmudl	vjunk, Mdaf, invrf[3]
		vmadm	vjunk, Mdai, invrf[3]
		vmadn	Mdaf,  Mdaf, invri[3]
		vmadh	Mdai,  Mdai, invri[3]

	#
	# Scale Z-values up, screen coordinates were limited
	# to 10 integer bits, but the hardware floating point format
	# needs valid bits in the upper range for best performance.
	#
	# NOTE(review): z, DxAtt and DeAtt are scaled by vconst1[4] below
	# but DyAtt (Mdai/Mdaf) is stored unscaled -- confirm whether that
	# is intended.
	#
.name aminf,	$v1
			vmudn	adef, adef, vconst1[4]
			vmadh	adei, adei, vconst1[4]
			vmadn	adef, vconst, vconst[0]
	
			vmudn	aminf, vzeros, vconst1[4]
			vmadh	amin, amin, vconst1[4]
			vmadn	aminf, vconst, vconst[0]

	ssv	adei[14],    8(outp)	# output z stuff.
			vmudn	Hdaf, Hdaf, vconst1[4]
	ssv	adef[14],   10(outp)
			vmadh	Hdai, Hdai, vconst1[4]
			vmadn	Hdaf, vconst, vconst[0]
	
	ssv	Hdai[14],    4(outp)
	ssv	Hdaf[14],    6(outp)	
	ssv	Mdai[14],   12(outp)
	ssv	Mdaf[14],   14(outp)	

	# outp is advanced first; the last two stores reach back into the
	# block with negative offsets.
	addi	outp, outp, 16			# increment output pointer
	ssv	amin[14],  (0-16)(outp)	# 0
	ssv	aminf[14],  (2-16)(outp)	# 2
.unname aminf
	
SetupDone:	# done or rejected. do any clean-up.
		# The nop at SetupReject doubles as the delay slot of this
		# jal, so OutputClose presumably returns to the j NextTri
		# below -- confirm.
		jal	OutputClose
		# note delay slot

SetupReject:	# no OutputClose needed...
		nop
	
		# back to the triangle loop for the next list entry
		j	NextTri
		nop		# delay slot

		.end	beginSetup

/* un-name scalar registers: */
/* (negR, backrej, doreject, bsignr, miny, midy and maxy were already
 * released inside beginSetup) */
.unname	trip
.unname	tricnt
.unname	trin
.unname	minp
.unname	midp
.unname	maxp
.unname	flatp
.unname	rdp_cmd
.unname	rdp_flg
.unname	tmp
.unname	rendState
	
/* un-name vector registers: */
.unname	EDel
.unname	invEDeli
.unname	invEDelf
	
.unname	Hdai
.unname	Hdaf
.unname	Mdai
.unname	Mdaf
.unname	adei
.unname	adef
.unname	amin
.unname	vzeros
.unname vjunk
.unname	tMdai
.unname	amid
.unname	amax
.unname	tHdai
.unname	invri
.unname	invrf
	
#if 1
	# test for thorough register un-naming.
	# Every register this file aliased ($1-$18, $v0-$v29) is given a
	# fresh name, along with $19 and $20.  Presumably the assembler
	# rejects naming a register that still carries a name, so a
	# forgotten .unname above would show up here -- confirm.
	# NOTE(review): these test names are not released again in the
	# text visible here.
.name r1, $1
.name r2, $2
.name r3, $3
.name r4, $4
.name r5, $5
.name r6, $6
.name r7, $7
.name r8, $8
.name r9, $9
.name r10, $10
.name r11, $11
.name r12, $12
.name r13, $13
.name r14, $14
.name r15, $15
.name r16, $16
.name r17, $17
.name r18, $18
.name r19, $19
.name r20, $20

.name vv0, $v0
.name vv1, $v1
.name vv2, $v2
.name vv3, $v3
.name vv4, $v4
.name vv5, $v5
.name vv6, $v6
.name vv7, $v7
.name vv8, $v8
.name vv9, $v9
.name vv10, $v10
.name vv11, $v11
.name vv12, $v12
.name vv13, $v13
.name vv14, $v14
.name vv15, $v15
.name vv16, $v16
.name vv17, $v17
.name vv18, $v18
.name vv19, $v19
.name vv20, $v20
.name vv21, $v21
.name vv22, $v22
.name vv23, $v23
.name vv24, $v24
.name vv25, $v25
.name vv26, $v26
.name vv27, $v27
.name vv28, $v28
.name vv29, $v29
	
#endif