gsprite.s 46.1 KB

Raw Blame History Permalink


 ############################################################################
 #
 # Process the G_SPRITE command.
 #
 # 	in_bufp holds pointer to sprite data structure
 #
 # Kevin Luster, kluster@sgi.com
 #
 #
 ############################################################################

 # scalar registers
 #
 # Symbolic names for the RSP scalar registers used by the sprite code.
 # The UNUSEDnn names are placeholders: routines below .unname them,
 # bind a routine-local name to the same register, and put the
 # placeholder back when they are finished with it.

.name	ScaleY,			$1
.name	YStartP,		$2
.name	YEndR,			$3
.name	TEndR,      		$4
.name	TEndP,	        	$5
.name	PScreenY,		$6
.name	TStartP,		$7
.name	temp,			$8
.name	FractionalOffset,	$9
.name	NumLinesPerLoad,	$10
.name	UNUSED11,             	$11
.name	UNUSED12,      		$12
.name	UNUSED13,		$13
.name	SubImageHeight,		$14
.name	SourceImageOffsetT,	$15
.name	UNUSED16,		$16
.name	TexLoadStart,		$17
.name	TexLoadEnd,		$18
.name	UNUSED19,		$19
.name	TStartR,		$20
.name   YStartR,                $21

 # vector registers
 #
 # vtmp1/vtmp2 are general temporaries; they are unnamed and rebound
 # to other names (vTempi/vTempf) for the Newton iterations below.

.name	vzero,			$v0
.name	vtmp1,			$v1
.name	vtmp2,			$v2
.name	NumLinesPerLoadi,	$v5
.name	iNumLinesPerRectanglei,	$v7
.name	iNumLinesPerRectanglef,	$v8
.name	NumLoadsi,		$v9

 #	.ent	case_G_SPRITE
 # case_G_SPRITE:
 #
 # Top-level sequence for the G_SPRITE command. in_bufp points at the
 # sprite structure. A color-index source first goes through
 # LoadSpriteTLUT, which rejoins at Resume with temp = G_TT_RGBA16;
 # any other source type reaches Resume with temp = G_TT_NONE. temp is
 # the argument consumed by WriteOutSetTextureLUT. The sequence ends
 # by jumping to ExitSprite, which is defined outside this section.

	# Check if we have a color index texture
	lb	temp,	RSP_SP_SOURCETYPE(in_bufp)
	addi	temp,	temp,	-G_IM_FMT_CI
	bne	temp,	zero,	NoCITexture

	# zero out vzero, delay slot (so it runs on both paths)
	vxor	vzero,	vzero,	vzero

	j	LoadSpriteTLUT
	nop	# delay slot

NoCITexture:

	# No palette needed: select the "no texture LUT" mode
	addi	temp,	zero,	G_TT_NONE

Resume:
	jal	WriteOutSetTextureLUT
	nop	# delay slot

  	jal	ComputeLoadingParameters
 	nop	# delay slot

  	jal	InitialSetup

	# Store done=0 into memory location. ComputeRectangles reads this
	# halfword back and returns once it is greater than zero
	sh	zero,	(6+RSP_SCRATCH_OFFSET)(zero)	# delay slot

  	jal	ComputeRectangles
  	nop	# delay slot

	j	ExitSprite
	nop	# delay slot


 ############################################################################

.unname	UNUSED11
.unname	UNUSED19

.name	modewd,	$11
.name	mask,	$19

 # WriteOutSetTextureLUT
 #
 # Merges the texture LUT mode passed in temp into the high word of the
 # saved other-mode state, writes that word back to the state, and
 # emits both other-mode words (8 bytes) to the output buffer.
 #
 # In:       temp = LUT mode value (only its low 16 bits are used)
 # Out:      outp advanced by 8
 # Clobbers: temp, modewd ($11), mask ($19), and $18 when the
 #           OutputOpen call is assembled in
 # The return address is parked in the scratch area at offset 10
 # because the nested jal instructions overwrite the return register.

WriteOutSetTextureLUT:

	# store return address since jump below will clobber it
	sh	return,		(10+RSP_SCRATCH_OFFSET)(zero)

#if !(defined(OUTPUT_DRAM)||defined(OUTPUT_FIFO))
	jal	OutputOpen
	addi	$18, zero, 8 	# Total size of all commands written out
#endif /* !(OUTPUT_DRAM || OUTPUT_FIFO) */

	# Write out gDPSetTextureLUT command

	lw	modewd,	RSP_STATE_OTHER_H(rsp_state)

	# addiu appears to be broken, so have to work around
	# Build mask = 0xffff3fff: everything except bits 14-15
	addi	mask,	zero,	-1
	sll	mask,	mask,	16
	addi	mask,	mask,	0x3fff

	# Clear bits 14-15 of the saved high word
	and	mask,	modewd,	mask

	# Drop any sign extension in temp, then merge it in
	andi	temp,	temp,	0xffff
	or	modewd,	mask,	temp
	sw	modewd,	RSP_STATE_OTHER_H(rsp_state)

	sw	modewd,	0(outp)	# output rdp command

	lw	temp,	RSP_STATE_OTHER_L(rsp_state)
	sw	temp,	4(outp) # rest of otherword command

    	jal	OutputClose
	# delay slot
	addi	outp,	outp,	8	# increment output pointer

	# retrieve return address
	lh	return,		(10+RSP_SCRATCH_OFFSET)(zero)

	jr	return
	nop	# delay slot

.unname	modewd
.unname	mask

.name	UNUSED11,	$11
.name	UNUSED19,	$19

 ############################################################################

.unname	UNUSED16

.name	TLUTCount,	$16

 # LoadSpriteTLUT
 #
 # Reached by a plain jump from the G_SPRITE entry code when the source
 # image is color-index, and leaves by jumping to Resume. Emits six
 # 8-byte RDP commands (48 bytes in total) that load the sprite
 # palette: SetTextureImage, TileSync, SetTile, LoadSync, LoadTLUT and
 # PipeSync.
 #
 # In:       in_bufp = sprite structure, outp = output pointer
 # Out:      temp = G_TT_RGBA16 (argument for WriteOutSetTextureLUT),
 #           outp advanced by 48
 # Clobbers: temp, TLUTCount ($16), addr ($19), return, $18 when the
 #           OutputOpen call is assembled in, plus whatever AddrFixup
 #           uses ($11,$12,$13,$19 according to the note below)

LoadSpriteTLUT:

#if !(defined(OUTPUT_DRAM)||defined(OUTPUT_FIFO))
	jal	OutputOpen
	addi	$18, zero, 48 	# Total size of all commands written out
#endif /* !(OUTPUT_DRAM || OUTPUT_FIFO) */

	# SourceImageBitSize == 1 selects the 256 entry palette; every
	# other value takes the 16 entry path
	lb	temp,	RSP_SP_SOURCEBITSIZE(in_bufp)
	addi	temp,	temp,	-1
	bne	temp,	zero,	Load4BitPalette
	nop	# delay slot

Load8BitPalette:

	j	WriteOutTLUTCommands

	# TLUTCount = 255, delay slot
	addi	TLUTCount,	zero,	255


Load4BitPalette:

	# Note that we are using a hard coded palette number of 0

	# TLUTCount = 15
	addi	TLUTCount,	zero,	15

WriteOutTLUTCommands:

	# Write out the gDPSetTextureImage command
	addi	temp,	zero,	G_SETTIMG
	sb	temp,	0(outp)	# output rdp command

	# Now write out format and size values

	# Load in SourceImageType
	# The palette is always described as RGBA / 16 bit. The byte
	# written is ((G_IM_FMT_RGBA << 2) | G_IM_SIZ_16b) << 3
	addi	temp,	zero,	G_IM_FMT_RGBA << 2
	# sll	temp,	temp,	2

	ori	temp,	temp,	G_IM_SIZ_16b
	sll	temp,	temp,	3

	sb	temp,	1(outp)	# output format & size

	# for some bizarre reason, the gsSetImage() gbi macro
	# subtracts 1 from the image width. Why?
	# (so a width field of 0 is written here)
	addi	temp,	zero,	0
	sh	temp,	2(outp)	# output width

.unname	UNUSED19
.name	addr,		$19

	# Load in SourceImagePointer
	lw	addr,	RSP_SP_TLUTP(in_bufp)

	# Subroutine uses registers $11,$12,$13,$19
	jal	AddrFixup
	nop	# delay slot

	sw	addr,	4(outp)	# output DRAM address

	# pop back return address stored earlier
	# NOTE(review): no store to this scratch slot is visible on the
	# path from the entry code to here, and the jal OutputClose below
	# overwrites return again before this routine leaves via
	# "j Resume", so this reload looks unused - confirm before removing
	lh	return,		(10+RSP_SCRATCH_OFFSET)(zero)

.unname	addr
.name	UNUSED19,	$19

	addi	outp,	outp,	8	# increment output pointer

	# Write out the gDPTileSync command
	addi	temp,	zero,	G_RDPTILESYNC
	sll	temp,	temp,	24 # shift over command
	sw	temp,	0(outp)	# output rdp command
	sw	zero,	4(outp)
	addi	outp,	outp,	8	# increment output pointer

	# Write out the gDPSetTile command
	addi	temp,	zero,	G_SETTILE
	sb	temp,	0(outp) # output rdp command

	# Now write out 4 bytes, these will overlap
	# with the next 4 bytes written out, so we
	# will set up the tile parameter below

	# Everything but the Tmem adrs = 0
	# (256 << 8 stored at byte offset 1 puts 0x000100 into bytes 1-3
	# of the command; this is a deliberately unaligned word store)
	addi	temp,	zero,	256
	sll	temp,	temp,	8
	sw	temp,	1(outp)

	# Put in Tile number = G_TX_LOADTILE
	addi	temp,	zero,	G_TX_LOADTILE
	sll	temp,	temp,	24

	# Now write out second set of overlapping 4 bytes
	# This write has the correct Tile information

	sw	temp,	4(outp)

	addi	outp,	outp,	8	# increment output pointer

	# Write out the gDPLoadSync command
	addi	temp,	zero,	G_RDPLOADSYNC
	sll	temp,	temp,	24 # shift over command
	sw	temp,	0(outp)	# output rdp command
	sw	zero,	4(outp)
	addi	outp,	outp,	8	# increment output pointer

	# Write out the gDPLoadTLUTCmd command
	addi	temp,	zero,	G_LOADTLUT
	sll	temp,	temp,	24 # shift over command
	sw	temp,	0(outp)	# output rdp command

	# Second word = (G_TX_LOADTILE << 24) | (TLUTCount << 14)
	addi	temp,	zero,	G_TX_LOADTILE
	sll	temp,	temp,	10
	or	temp,	temp,	TLUTCount
	sll	temp,	temp,	14
	sw	temp,	4(outp)

	addi	outp,	outp,	8	# increment output pointer

	# Write out the gDPipeSync command
	addi	temp,	zero,	G_RDPPIPESYNC
	sll	temp,	temp,	24 # shift over command
	sw	temp,	0(outp)	# output rdp command
	sw	zero,	4(outp)

	addi	outp,	outp,	8	# increment output pointer

    	jal	OutputClose
	# delay slot, set up for WriteOutSetTextureLUT
	addi	temp,	zero,	G_TT_RGBA16

.unname	TLUTCount
.name	UNUSED16,	$16

.unname	UNUSED11
.unname	UNUSED19

.name	modewd,	$11
.name	mask,	$19

	# Rejoin the main sequence with temp = G_TT_RGBA16
	j	Resume
 	nop	# delay slot

.unname	modewd
.unname	mask

.name	UNUSED11,	$11
.name	UNUSED19,	$19

 ############################################################################

ComputeLoadingParameters:

 # ComputeLoadingParameters, first phase: load ScaleY, PScreenY and the
 # sub-image width, derive FractionalOffset, and apply the 32 bit
 # texture width / alignment workarounds.
 #
 # In:       in_bufp = sprite structure
 # Out:      ScaleY, PScreenY, FractionalOffset (= PScreenY & 3, or 0
 #           after hack 2), NumWordsPerLine (= sub-image width here)
 # Side effect: hack 1 overwrites RSP_SP_SUBWIDTH in the structure
 #           with 512
 # Clobbers: temp

.unname	UNUSED16
.name	NumWordsPerLine,	$16

	# Load ScaleY now since we need it for the hacks
	lh	ScaleY,			RSP_SP_SCALEY(in_bufp)

	# Compute FractionalOffset first part: the low two bits of the
	# (sign extended) screen Y position
	lh	PScreenY,		RSP_SP_PSCREENY(in_bufp)
	andi	FractionalOffset,	PScreenY,	0x03

	# Now we start the disgusting hacks section
	# The above hacks were just to whet your appetite

	lh	NumWordsPerLine,	RSP_SP_SUBWIDTH(in_bufp)

	# Only need to do hacks if we're rendering a 32 bit image
	lb	temp,	RSP_SP_SOURCEBITSIZE(in_bufp)
	addi	temp,	temp,	-3
	bne	temp,	zero,	DisgustingHackDone

	# delay slot
	# Check if we have > 512 width
	addi	temp,	NumWordsPerLine,	-512
	# The delay slot of this branch is the first instruction of
	# Hack1 below; it only writes temp, which is dead on the taken path
	blez	temp,	DisgustingHackDone

Hack1:
	# If you're trying to render a 32 bit texture, *and* you wish to
	# scale in the
	# Y axis, then you're limited to a maximum width of 512
	# pixels. Have to track down and figure out why this is causing a
	# problem

	# lb	temp,	RSP_SP_FLIPY(in_bufp)
	# bne	temp,	zero,	DoDisgustingHack1
	# nop	# delay slot

	# ScaleY == 1024 means no Y scaling: try hack 2 instead
	addi	temp,	ScaleY,	-1024
	beq	temp,	zero,	Hack2
	nop	# delay slot

DoDisgustingHack1:
	# Clamp the width to 512, both in the register and in the
	# sprite structure (later code reloads it from there)
	addi	NumWordsPerLine,	zero,	512
	j	DisgustingHackDone
	# delay slot
	sh	NumWordsPerLine,	RSP_SP_SUBWIDTH(in_bufp)

Hack2:
	# If you're trying to render a 32 bit texture, *and* you wish the screen
	# rectangle to be aligned on a fractional pixel in the vertical direction
	# *and* you want to display more than 512 pixels per row, then you would
	# need to load more than 4K of texture into TMEM, which won't fit. So you
	# either have to special case that situation and tile the image horizontally
	# as well as vertically, or you take the weenie way out and either clamp
	# the width to being 512, or clamp the vertical alignment to being on a
	# pixel boundary. Otherwise you will get a divide by 0 error when computing
	# the NumLoads parameter in ComputeLoadingParameters(). We take the visually
	# better approach of clamping the vertical placement to an integer boundary

	beq	FractionalOffset,	zero,	DisgustingHackDone
	nop	# delay slot

DoDisgustingHack2:
	# Round PScreenY down to a whole pixel by removing the fraction.
	# This must be a subtract rather than "andi ..., 0xfffc": andi
	# zero extends its immediate, which would turn a negative
	# PScreenY (lh sign extends it) into a large positive value and
	# make the later Y range check throw the sprite away instead of
	# letting the YStartR < 0 clip code handle it. For non-negative
	# values the result is identical.
	sub	PScreenY,	PScreenY,	FractionalOffset
	addi	FractionalOffset,	zero,	0

DisgustingHackDone:

.unname	UNUSED11

 # ComputeLoadingParameters, second phase: compute 1/ScaleY and
 # 1/ScaleX in the vector unit (reciprocal estimate followed by one
 # Newton step), leave them in iScaleXYi/iScaleXYf for later code, and
 # derive TargetRectangleWidth, which is stored as a halfword in the
 # scratch area at offset 0.
 #
 # In:       ScaleY, in_bufp, vzero
 # Out:      iScaleXYi/iScaleXYf ($v10/$v11), scratch+0
 # Clobbers: temp, $v1-$v4, $v12, $v13

.name	iScaleXYi,		$v10
.name	iScaleXYf,		$v11
.name	ScaleXYi,	$v3
.name	ScaleXYf,	$v4

	# Compute 1/ScaleY, and 1/ScaleX

	# We load the ScaleXY values into a fractional register so that we
	# get maximal precision later out of the Newton iteration (loading
	# into a fractional register causes the inverse to end up mainly in
	# the integer result register, which gives you many low order bits
	# to play with when doing the newton correction

	# move ScaleY into fractional register
	mtc2	ScaleY,		ScaleXYf[0]

	# move ScaleX into fractional register
	# NOTE(review): the [2] on mtc2 is presumably a byte offset, i.e.
	# the same lane that vrcpl reads as ScaleXYf[1] below - confirm
	# against the assembler's element syntax
	lh	temp,		RSP_SP_SCALEX(in_bufp)
	mtc2	temp,		ScaleXYf[2]

	# zero out ScaleXYi
	vxor	ScaleXYi,	ScaleXYi,	ScaleXYi

	# Now compute inverse of ScaleXY
	vrcph	iScaleXYi[0],	ScaleXYi[0]
	vrcpl	iScaleXYf[0],	ScaleXYf[0]
	vrcph	iScaleXYi[0],	ScaleXYi[1]
	vrcpl	iScaleXYf[1],	ScaleXYf[1]
	vrcph	iScaleXYi[1],	vzero[0]

	# Now scale up the inverse by 2
	vmudn	iScaleXYf,	iScaleXYf,	vconst[2]
	vmadh	iScaleXYi,	iScaleXYi,	vconst[2]
	vmadn	iScaleXYf,	vzero,		vzero[0]

.unname	vtmp1
.unname	vtmp2

.name	vTempi,			$v1
.name	vTempf,			$v2
.name	r2i,			$v12
.name	r2f,			$v13

	# Now do a Newton iteration on this inverse
	# vTempi:vTempf is the constant the product is subtracted from:
	# integer part loaded from VNEWT_OFFSET, fractional part zero
	lqv	vTempi[0],	VNEWT_OFFSET(zero)
	vxor	vTempf,	vTempf,	vTempf

	vmudl	r2f,		iScaleXYf,	ScaleXYf		#  R*X
	vmadm	r2f,		iScaleXYi,	ScaleXYf
	vmadn	r2f,		iScaleXYf,	ScaleXYi
	vmadh	r2i,		iScaleXYi,	ScaleXYi

	vsubc	r2f,		vTempf,		r2f		#  2 - (R*X)
	vsub	r2i,		vTempi,		r2i

	# The first two steps of this chain write into vTempf/vTempi,
	# which are not read again; the kept results are the last two
	vmudl	vTempf,		iScaleXYf,	r2f		#  R * (2-R*X)
	vmadm	vTempi,		iScaleXYi,	r2f
	vmadn	iScaleXYf,	iScaleXYf,	r2i
	vmadh	iScaleXYi,	iScaleXYi,	r2i

.unname	ScaleXYi
.unname	ScaleXYf
.unname	vTempi
.unname	vTempf
.unname	r2i
.unname	r2f

.name	vtmp1,			$v1
.name	vtmp2,			$v2

.name	TargetRectangleWidthi,	$v3
.name	TargetRectangleWidthf,	$v4

	# Compute TargetRectangleWidth = (int) ((float) (SubImageWidth << 12) / (float) ScaleX) ;

	# Work on a copy so iScaleXYi/iScaleXYf keep the full inverse
	vadd	TargetRectangleWidthi,	vzero,	iScaleXYi
	vadd	TargetRectangleWidthf,	vzero,	iScaleXYf

	# Now shift the computed iScaleX to the right by 4, IFxF = IF

	vmudl	TargetRectangleWidthf,	TargetRectangleWidthf,	vconst1[1]
	vmadm	TargetRectangleWidthi,	TargetRectangleWidthi,	vconst1[1]
	vmadn	TargetRectangleWidthf,	vzero,			vzero[0]

	# Load SubImageWidth into vector register
	# (reloaded from the structure, so it sees the 512 clamp if the
	# width hack above was applied)
	lh	temp,		RSP_SP_SUBWIDTH(in_bufp)
	mtc2	temp,		vtmp1[0]

	# Now do IFxI multiplication of 1/ScaleX by SubImageWidth
	vmudn	TargetRectangleWidthf,	TargetRectangleWidthf,	vtmp1[0]
	vmadh	TargetRectangleWidthi,	TargetRectangleWidthi,	vtmp1[0]
	vmadn	TargetRectangleWidthf,	vzero,			vzero[0]

	# Now store the TargetRectangleWidth into scratch buffer
	mfc2	temp,	TargetRectangleWidthi[2]
	sh	temp,	(0+RSP_SCRATCH_OFFSET)(zero)

.unname	TargetRectangleWidthi
.unname	TargetRectangleWidthf

.name	TextureLoadSize,	$11

 # ComputeLoadingParameters, third phase: pick TextureLoadSize.
 # It is 256 when the source type is G_IM_FMT_CI or when
 # SourceImageBitSize == 3, and 512 otherwise.
 #
 # Out:      TextureLoadSize ($11)
 # Clobbers: temp

	# Compute TextureLoadSize

	# THIS GETS LOADED IN A LOT, TRY TO MAKE IT A PERMANENT VALUE

	# Load in SourceImageType
	lb	temp,	RSP_SP_SOURCETYPE(in_bufp)
	addi	temp,	temp,	-G_IM_FMT_CI
	beq	temp,	zero,	TextureLoadSize256

	# Load in SourceImageBitSize
	# (this load sits in the delay slot of the branch above, so it
	# also runs when that branch is taken; temp is not used there)
	lb	temp,	RSP_SP_SOURCEBITSIZE(in_bufp)
	addi	temp,	temp,	-3
	beq	temp,	zero,	TextureLoadSize256
	nop	# delay slot

TextureLoadSize512:
	j	TextureLoadSizeDone
	# delay slot
	addi	TextureLoadSize,	zero,	512

TextureLoadSize256:

	addi	TextureLoadSize,	zero,	256

TextureLoadSizeDone:

	# Compute NumWordsPerLine
	#
	# On entry NumWordsPerLine holds the sub-image width. It is scaled
	# to a bit count according to SourceImageBitSize (x4 plus odd
	# width / odd start padding for 0, x8 for 1, x16 for anything
	# else) and then rounded up to whole 64 bit words.
	# Clobbers: temp, temp2 ($12)

	# Load in SourceImageBitSize
	lb	temp,	RSP_SP_SOURCEBITSIZE(in_bufp)
	bne	temp,	zero,	TextureisNot4Bit
	nop	# delay slot

Textureis4Bit:

.unname	UNUSED12
.name	temp2,		$12

	# NumWordsPerLine = (SubImageWidth * 4  +
	#	 ((SubImageWidth & 0x01) ? 4 : 0) +
	#	 (((SubImageWidth + SourceImageOffsetS) & 0x01) ? 4 : 0) + 63) >> 6;

	# NumWordsPerLine already contains SubImageWidth from above

	# temp2 = SubImageWidth, NumWordsPerLine = SubImageWidth * 4
	addi	temp2,	NumWordsPerLine,	0
	sll	NumWordsPerLine,	NumWordsPerLine,	2

	# add 4 if the width is odd
	andi	temp,	temp2,	0x01
	sll	temp,	temp,	2
	add	NumWordsPerLine,	NumWordsPerLine,	temp

	# add 4 if (width + S offset) is odd; the final add is done in
	# the delay slot of the jump
	lh	temp,	RSP_SP_SOURCEOFFSETS(in_bufp)
	add	temp,	temp,	temp2
	andi	temp,	temp,	0x01
	sll	temp,	temp,	2

	j	SourceBitSizeDone
	add	NumWordsPerLine,	NumWordsPerLine,	temp

TextureisNot4Bit:

	# NumWordsPerLine = (SubImageWidth  * (TileBitSize[SourceImageBitSize] == 32 ?
	# 16 : TileBitSize[SourceImageBitSize])  + 63) >> 6;

	# NumWordsPerLine already contains SubImageWidth from above

	# temp still holds SourceImageBitSize; == 1 takes the x8 path
	addi	temp,	temp,	-1
	beq	temp,	zero,	SourceBitSize1
	nop	# delay slot

	j	SourceBitSizeDone
	# delay slot
	sll	NumWordsPerLine,	NumWordsPerLine,	4


SourceBitSize1:
	sll	NumWordsPerLine,	NumWordsPerLine,	3

SourceBitSizeDone:
	# round up to a multiple of 64 bits and convert to words
	addi	NumWordsPerLine,	NumWordsPerLine,	63
	srl	NumWordsPerLine,	NumWordsPerLine,	6

	# Compute NumLinesPerLoad
	#
	# NumLinesPerLoad = TextureLoadSize / NumWordsPerLine, computed as
	# TextureLoadSize * (1/NumWordsPerLine) with a reciprocal estimate
	# and one Newton step. Then NumLoads = SubImageHeight /
	# NumLinesPerRectangle is left in NumLoadsi ($v9).
	#
	# In:       NumWordsPerLine ($16), TextureLoadSize ($11), vzero
	# Out:      NumLinesPerLoad, NumLinesPerLoadi, SubImageHeight,
	#           NumLoadsi
	# Clobbers: temp, $v1-$v4, $v7, $v8, $v12-$v15

.unname	vtmp1
.unname vtmp2

.name	NumWordsPerLinei,	$v14
.name	NumWordsPerLinef,	$v15
.name	iNumWordsPerLinei,	$v3
.name	iNumWordsPerLinef,	$v4

	# Compute 1/NumWordsPerLine

	# mtc2	NumWordsPerLine,	vtmp1[0]
	# vrcp	iNumWordsPerLinef[0],	vtmp1[0]
	# vrcph	iNumWordsPerLinei[0],	vconst[0]


	# We load the NumWordsPerLine into a fractional register so that we
	# get maximal precision later out of the Newton iteration (loading
	# into a fractional register causes the inverse to end up mainly in
	# the integer result register, which gives you many low order bits
	# to play with when doing the newton correction

	# move NumWordsPerLine into fractional register
	mtc2	NumWordsPerLine,	NumWordsPerLinef[0]

	# zero out NumWordsPerLinei
	vxor	NumWordsPerLinei,	NumWordsPerLinei,	NumWordsPerLinei

	# Now compute inverse of NumWordsPerLine
	vrcph	iNumWordsPerLinei[0],	NumWordsPerLinei[0]
	vrcpl	iNumWordsPerLinef[0],	NumWordsPerLinef[0]
	vrcph	iNumWordsPerLinei[0],	vzero[0]

	# Now scale up the inverse by 2
	vmudn	iNumWordsPerLinef,	iNumWordsPerLinef,	vconst[2]
	vmadh	iNumWordsPerLinei,	iNumWordsPerLinei,	vconst[2]
	vmadn	iNumWordsPerLinef,	vzero,		vzero[0]

.name	vTempi,			$v1
.name	vTempf,			$v2
.name	r2i,			$v12
.name	r2f,			$v13

	# Now do a Newton iteration on this inverse
	# (same sequence as used for 1/ScaleXY earlier in this routine)
	lqv	vTempi[0],	VNEWT_OFFSET(zero)
	vxor	vTempf,	vTempf,	vTempf

	vmudl	r2f,	iNumWordsPerLinef,	NumWordsPerLinef		#  R*X
	vmadm	r2f,	iNumWordsPerLinei,	NumWordsPerLinef
	vmadn	r2f,	iNumWordsPerLinef,	NumWordsPerLinei
	vmadh	r2i,	iNumWordsPerLinei,	NumWordsPerLinei

	vsubc	r2f,	vTempf,		r2f		#  2 - (R*X)
	vsub	r2i,	vTempi,		r2i

	vmudl	vTempf,	iNumWordsPerLinef,	r2f		#  R * (2-R*X)
	vmadm	vTempi,	iNumWordsPerLinei,	r2f
	vmadn	iNumWordsPerLinef,	iNumWordsPerLinef,	r2i
	vmadh	iNumWordsPerLinei,	iNumWordsPerLinei,	r2i

.unname	NumWordsPerLinei
.unname	NumWordsPerLinef
.unname	vTempi
.unname	vTempf
.unname	r2i
.unname	r2f

.name	vtmp1,			$v1
.name	vtmp2,			$v2
.name	vtmp3,			$v12

 # no longer need NumWordsPerLine
.unname	NumWordsPerLine
	# move TextureLoadSize into temp vector register
	mtc2	TextureLoadSize,	vtmp2[0]

	# Multiply 1/NumWordsPerLine by TextureLoadSize, do IFxF
	# just care about integer part

	vmudl	vtmp1,			iNumWordsPerLinef,	vtmp2[0]
	vmadm	NumLinesPerLoadi,	iNumWordsPerLinei,	vtmp2[0]

 # no longer need TextureLoadSize
.unname	TextureLoadSize
.name	UNUSED11,	$11

 # no longer need iNumWordsPerLine
.unname	iNumWordsPerLinef
.unname iNumWordsPerLinei

	# NumLinesPerRectangle = NumLinesPerLoad - (FractionalOffset ? 1 : 0)
	# Need to move NumLinesPerLoadi into scalar register
	# NOTE(review): the branch below tests NumLinesPerLoad, not
	# FractionalOffset, so as written temp = NumLinesPerLoad - 1
	# whenever NumLinesPerLoad is non-zero. Confirm which of the
	# formula above and the code is intended before changing either
	mfc2	NumLinesPerLoad,	NumLinesPerLoadi
	beq	NumLinesPerLoad,	zero,		NumLinesPerRectangleAdjusted

	# delay slot
	addi	temp,	NumLinesPerLoad,	0

	# 0xffff is presumably taken as the 16 bit immediate -1 here
	# (see the "addiu appears to be broken" note in
	# WriteOutSetTextureLUT)
	addi	temp,	NumLinesPerLoad,	0xffff

NumLinesPerRectangleAdjusted:

	# Now compute NumLoads = SubImageHeight / NumLinesPerRectangle
	lh	SubImageHeight,		RSP_SP_SUBHEIGHT(in_bufp)

	# Compute 1/NumLinesPerRectangle
	mtc2	temp,				vtmp1[0]
	vrcp	iNumLinesPerRectanglef[0],	vtmp1[0]
	vrcph	iNumLinesPerRectanglei[0],	vconst[0]

	# Result from reciprocal needs to be multiplied by 2
	# this is equivalent to multiplying SubImageHeight by 2

	sll	temp,	SubImageHeight,		1

	# move SubImageHeight*2 into temp vector register
	mtc2	temp,	vtmp2[0]

	# Multiply 1/NumLinesPerRectangle by SubImageHeight, do IFxF
	# just care about integer part

	vmudl	vtmp1,		iNumLinesPerRectanglef,	vtmp2[0]
	vmadm	NumLoadsi,	iNumLinesPerRectanglei,	vtmp2[0]

 # no longer need iNumLinesPerRectangle
.unname iNumLinesPerRectanglef
.unname	iNumLinesPerRectanglei

	# Compute SStartR
	#
	# SStartR = (offset + (FlipTextureX ? SubImageWidth - 1 : 0)) << 5
	# where offset is (SourceImageOffsetS & 1) when SourceImageBitSize
	# is 0 and 0 otherwise. The result is stored as a halfword in the
	# scratch area at offset 2.
	# Clobbers: temp, temp2

	lb	temp2,	RSP_SP_SOURCEBITSIZE(in_bufp)
	bne	temp2,	zero,	NoSStartROffset

	# delay slot. Set up temp in case we have jumped to no offset
	addi	temp2,	zero,	0

	lh	temp2,	RSP_SP_SOURCEOFFSETS(in_bufp)
	andi	temp2,	temp2,	0x01

	# At this point temp2 contains either 0 or 1, which is what we want

NoSStartROffset:

	# Check to see if we have FlipTextureX == 1
	lb	temp,	RSP_SP_FLIPX(in_bufp)
	beq	temp,	zero,	WriteSStartR

	# delay slot, copy over possible offset into temp
	addi	temp,	temp2,	0

FlipX:
	# When flipped, start from the last texel: add SubImageWidth - 1
	lh	temp2,	RSP_SP_SUBWIDTH(in_bufp)
	addi	temp2,	temp2,	-1
	add	temp,	temp,	temp2	# falls through to WriteSStartR

WriteSStartR:

	# Shift partial result left by 5
	sll	temp,	temp,	5

	# We don't need SStartR again until we are actually
	# writing out the gSPTextureRectangle command, so we will store
	# this result away in the scratch area and retrieve it later
	sh	temp,	(2+RSP_SCRATCH_OFFSET)(zero)

	# Here we do the hack to prevent negative starting X values for
	# the displayed rectangle. We take the quick and dirty route of
	# clamping PScreenX to 0, adjusting SStartR and shortening
	# TargetRectangleWidth
	#
	# This final phase of ComputeLoadingParameters updates the
	# scratch halfwords at offset 0 (TargetRectangleWidth) and offset
	# 2 (SStartR), may write 0 to RSP_SP_PSCREENX in the structure,
	# and either returns to the caller or jumps straight to
	# ExitSprite when nothing would be visible.
	# Clobbers: temp, temp2, vtmp1, vtmp2, vtmp3

	#    if (PScreenX < 0)
	#      {
	#	TargetRectangleWidth += PScreenX;
	#	SStartR -= ((FlipTextureX ? -1 : 1) * PScreenX * TextureScaleX) >> 7;
	#	PScreenX = 0;
	#      }

	# bgtz only skips for PScreenX > 0; PScreenX == 0 runs the clamp
	# code as well, where every adjustment is by zero
	lh	temp2,	RSP_SP_PSCREENX(in_bufp)
	bgtz	temp2,	PScreenXNonNegative
	nop	# delay slot

	# Load in TargetRectangleWidth
	lh	temp,	(0+RSP_SCRATCH_OFFSET)(zero)
	add	temp,	temp,	temp2
	sh	temp,	(0+RSP_SCRATCH_OFFSET)(zero)

	# Check if we have FlipTextureX
	lb	temp,	RSP_SP_FLIPX(in_bufp)
	beq	temp,	zero,	NoFlipX3
	nop	# delay slot

	# Negate PScreenX
	sub	temp2,	zero,	temp2

NoFlipX3:

	# Load PScreenX into vector register
	mtc2	temp2,	vtmp1[0]

	# Load TextureScaleX into vector register
	lh	temp2,		RSP_SP_SCALEX(in_bufp)
	mtc2	temp2,		vtmp2[0]

	# Now do IxI multiplication
	# vmudh	vtmp1,	vtmp1,		vtmp2[0]

	# Now we do a IxI = 32 bit I multiply here
	# (vmudh forms the product, the two vsar reads then copy the two
	# halves of it out of the accumulator into vtmp2 and vtmp3)
	vmudh	vtmp2,	vtmp1,	vtmp2[0]
	vsar	vtmp2,	vtmp1,	vtmp2[0]
	vsar	vtmp3,	vtmp1,	vtmp2[1]

	# Now shift answer down by 7 bits
	vmudl	vtmp3,	vtmp3,	vconst[7]
	vmadm	vtmp2,	vtmp2,	vconst[7]
	vmadn	vtmp3,	vzero,	vzero[0]

	# Move result into scalar register
	mfc2	temp2,	vtmp3
	# srl	temp2,	temp2,	7

	# Now load in stored SStartR, update and restore
	lh	temp,	(2+RSP_SCRATCH_OFFSET)(zero)
	sub	temp,	temp,	temp2
	sh	temp,	(2+RSP_SCRATCH_OFFSET)(zero)

	# Now set PScreenX to 0
	sh	zero,	RSP_SP_PSCREENX(in_bufp)

PScreenXNonNegative:

	# Now we do the hack to prevent the rectangle end points from
	# wrapping past the end of the coordinate range

	#    if (PScreenX + TargetRectangleWidth > 4095)
	#      {
	#	TargetRectangleWidth = 4095 - PScreenX;
	#      }

	# temp2 = 4095 - PScreenX
	addi	temp,	zero,	4095
	lh	temp2,	RSP_SP_PSCREENX(in_bufp)
	sub	temp2,	temp,	temp2

	# Load in stored away TargetRectangleWidth
	lh	temp,	(0+RSP_SCRATCH_OFFSET)(zero)
	sub	temp,	temp,	temp2

	# temp = TargetRectangleWidth - (4095 - PScreenX); <= 0 fits
	blez	temp,	NoEndPointOverflow
	nop	# delay slot

	# Store 4095 - PScreenX into TargetRectangleWidth
	sh	temp2,	(0+RSP_SCRATCH_OFFSET)(zero)

NoEndPointOverflow:

	# Now we do the hack to prevent the rightmost endpoint from
	# having a negative value. If that occurs then we short
	# circuit the rest of the sprite calculations and return

	#    /* Do Clipping hack for negative ending points */
	#    if (PScreenX + TargetRectangleWidth < 0)
	#      {
	#	/* Short circuit and return */
	#	return;
	#      }

	# As coded, a right edge of exactly 0 also bails out (bgtz)
	lh	temp,	RSP_SP_PSCREENX(in_bufp)
	lh	temp2,	(0+RSP_SCRATCH_OFFSET)(zero)
	add	temp,	temp,	temp2
	bgtz	temp,	NoShortCircuit
	nop	# delay slot

	# Otherwise bail out of rest of sprite ucode
	j	ExitSprite
	nop	# delay slot

NoShortCircuit:

	jr	return	# End of ComputeLoadingParameters
  	nop	# delay slot

 ############################################################################

InitialSetup:

 # InitialSetup
 #
 # Seeds the per-sprite state that ComputeRectangles iterates on.
 #
 # In:       in_bufp, PScreenY, FractionalOffset, ScaleY and
 #           SubImageHeight as left by ComputeLoadingParameters
 # Out:      SourceImageOffsetT, YEndR (= PScreenY), TEndR,
 #           TEndP (Y-flip path only), TexLoadStart (also stored as a
 #           halfword in the scratch area at offset 18)
 # Clobbers: temp, vtmp1, vtmp2
 #
 # Without Y flip the result is simply TEndR = 0 and
 # TexLoadStart = SourceImageOffsetT.

	lh	SourceImageOffsetT,	RSP_SP_SOURCEOFFSETT(in_bufp)

	# We setup TexLoadStart since both paths below add in SourceImageOffsetT
	add	TexLoadStart,		zero,	SourceImageOffsetT

 	# Check if we have FlipTextureY
	lb	temp,	RSP_SP_FLIPY(in_bufp)
	beq	temp,	zero,	NoFlipY3

	# ComputeRectangles starts each pass from YEndR, so seed it with
	# the screen Y position (runs on both paths)
	addi	YEndR,	PScreenY,	0	# delay slot

FlipY3:
	# Compute  TEndR = - ((FractionalOffset * ScaleY) >> 6), # IxI = I
	# and also TEndP  = (-(FractionalOffset * ScaleY)) >> 7;
	mtc2	FractionalOffset,	vtmp1[0]
	mtc2	ScaleY,			vtmp2[0]

	vmudh	vtmp1,	vtmp1,		vtmp2[0]

	# Now move partial result into scalar registers
	mfc2	TEndR,	vtmp1
	mfc2	TEndP,	vtmp1

	# Now shift TEndR right by 6 places
	sra	TEndR,	TEndR,	6
	# Now negate answer
	sub	TEndR,	zero,	TEndR

	# Now we compute the YFlipMirror hack
	#      if (FractionalOffset == 1)
	#	TEndR -= 16 ;
	#      else if (FractionalOffset == 3)
	#	TEndR += 16 ;

	beq	FractionalOffset,	zero,	NoYFlipMirrorHack

	# delay slot
	# (FractionalOffset - 2) << 4 gives -16, 0, +16 for 1, 2, 3
	addi	temp,	FractionalOffset,	-2
	sll	temp,	temp,	4
	#	sub	temp,	zero,	temp
	add	TEndR,	TEndR,	temp

NoYFlipMirrorHack:

	# Now compute second half of TEndR
	# TEndR -= (TempOldTEndR == 0) * 32 ;
	bne	TEndR,	zero,	TEndRDone

	# Now we copy TEndR over to temp to be used as TempOldTEndR later
	# delay slot
	addi	temp,	TEndR,	0

	addi	TEndR,	TEndR,	-32

TEndRDone:

	# Finish computation of TEndP
	sub	TEndP,	zero,	TEndP
	sra	TEndP,	TEndP,	7

	# Now compute TexLoadStart = SourceImageOffsetT + SubImageHeight - 1 +
	#			(TempOldTEndR == 0) + (FractionalOffset != 0)
	# Make use of partial result already sitting in TexLoadStart and temp

	bne	temp,	zero,	TempOldTEndRNEZero
	addi	TexLoadStart,	TexLoadStart,	-1	# delay slot

	addi	TexLoadStart,	TexLoadStart,	1

TempOldTEndRNEZero:

	beq	FractionalOffset,	zero,	FractionalOffsetZero
	add	TexLoadStart,	TexLoadStart,	SubImageHeight	# delay slot

	addi	TexLoadStart,	TexLoadStart,	1

FractionalOffsetZero:

	# One further +1, not shown in the formula above, is applied when
	# TempOldTEndR == 0 and ScaleY != 1024
	bne	temp,	zero,	NoTexLoadStartAdjust
	nop	# delay slot

	addi	temp,	ScaleY,	-1024
	beq	temp,	zero,	NoTexLoadStartAdjust
	nop	# delay slot

	addi	TexLoadStart,	TexLoadStart,	1

NoTexLoadStartAdjust:

	j	InitialSetupDone
	nop	# delay slot

NoFlipY3:
	# Hard coded for non Y flipped texture
	addi	TEndR,		zero,		0


InitialSetupDone:

	# PUSH TEXLOADSTART
	sh	TexLoadStart,		(18+RSP_SCRATCH_OFFSET)(zero)


 	jr	return
 	nop	# delay slot

  ############################################################################

ComputeRectangles:

 # ComputeRectangles, first phase.
 #
 # Computes the placement of one sub-rectangle of the sprite. The
 # pass starts at the previous YEndR, returns to the caller once the
 # "done" halfword in the scratch area (offset 6) is greater than
 # zero, and otherwise continues into DoATextureRectangle. This phase
 # derives YStartR, FractionalOffset, YStartP, TStartR and TStartP and
 # leaves in temp the partial value used for the approximate height.
 #
 # In:       YEndR, TEndR, TEndP, NumLinesPerLoad, ScaleY, in_bufp
 # Clobbers: temp, temp2, vtmp1, vtmp2

	# POP TEXLOADSTART
	lh	TexLoadStart,		(18+RSP_SCRATCH_OFFSET)(zero)

	# Check to see if we're done
	lh	temp,	(6+RSP_SCRATCH_OFFSET)(zero)
	bgtz	temp,	ComputeRectanglesDone

	# delay slot: this rectangle starts where the last one ended
	addi	YStartR,		YEndR,		0

	#      /* Hack to prevent wraparound past end of coordinate range */
	#      if (YStartR >= 4094) return;

	addi	temp,	YStartR,	-4094
	bgez	temp,	ExitSprite

	# delay slot: split YStartR into whole pixel part and fraction
	andi	FractionalOffset,	YStartR,	0x03
	sub	YStartP,		YStartR,	FractionalOffset

	# Now we compute TStartR

	# Check if we have FlipTextureY
	lb	temp,	RSP_SP_FLIPY(in_bufp)
	beq	temp,	zero,	NoFlipY4
	nop	# delay slot

	# Compute FlipTextureY TStartR here

	# TStartR starts as -1 when (TEndP & 0x1f) != 0, else 0: the -1
	# is set in the delay slot and undone on the fall through path
	andi	temp,	TEndP,	0x1f
	bne	temp,	zero,	NoTStartROffset
	addi	TStartR,	zero,		-1
	addi	TStartR,	TStartR,	1

NoTStartROffset:

	# TStartR = ((TStartR + NumLinesPerLoad) << 5) + TEndR
	add	TStartR,	TStartR,	NumLinesPerLoad
	sll	TStartR,	TStartR,	5
	add	TStartR,	TStartR,	TEndR

	# If TStartR > ((NumLinesPerLoad - 1) << 5), pull it back by 32
	add	temp,	zero,	NumLinesPerLoad
	addi	temp,	temp,	-1
	sll	temp,	temp,	5
	sub	temp,	temp,	TStartR
	bgez	temp,	NoTStartROffset2
	nop	# delay slot

	addi	TStartR,	TStartR,	-32

NoTStartROffset2:

	j	TStartRDone
	nop	# delay slot

NoFlipY4:
	j	TStartRDone
	# delay slot
	andi	TStartR,		TEndR,		0x1f

TStartRDone:

	# compute TStartP = TStartR +/- ((FractionalOffset * ScaleY) >> 7);
	mtc2	FractionalOffset,	vtmp1[0]

	mtc2	ScaleY,			vtmp2[0]

	# Now do an IXI Multiplication
	vmudh	vtmp1,	vtmp1,		vtmp2[0]

	# Now shift partial result down by 7 bits, equivalent to doing a
	# IXF by 1/128. Only care about integer result

	vmudm	vtmp1,	vtmp1,		vconst[7]

	# Now move result into scalar register
	mfc2	temp2,	vtmp1

	# Compute approximate first pass height
	# We break this up into two sections. The first section computes an
	# intermediate result depending on whether we have YFlipping or not
	# then we compute 1/ScaleY, then later on we finish computing the
	# approximate height by multiplying the intermediate value by 1/ScaleY
	# and then adding 4.

	# Check if we have FlipTextureY
	lb	temp,	RSP_SP_FLIPY(in_bufp)
	beq	temp,	zero,	NoFlipY5

	# Now add partial product from TStartR, delay slot
	# (also runs when the branch is taken; NoFlipY5 then overwrites
	# TStartP with the subtracted form)
	add	TStartP,	TStartR,	temp2

	# SubRectangleHeight = ((TStartP + 1) << 7) / ScaleY + 4	;
	j	ApproximateHeightDone

	# delay slot

	# addi	temp,	TStartP,	0
	addi	temp,	TStartP,	1


NoFlipY5:

	# Now subtract partial product from TStartR
	sub	TStartP,	TStartR,	temp2

	# SubRectangleHeight = ((((NumLinesPerLoad - 1) << 5) - TStartR) << 7) / ScaleY + 4;

	# 0xffff is presumably taken as the 16 bit immediate -1 here
	addi	temp,	NumLinesPerLoad,	0xffff
	sll	temp,	temp,	5

	j	ApproximateHeightDone
	sub	temp,	temp,	TStartR		# delay slot

ApproximateHeightDone:

 # ComputeRectangles, second phase: finish the approximate
 # sub-rectangle height (temp * 1/ScaleY, plus 4), derive YEndR
 # rounded down to a multiple of 4, and put YEndR - YStartR into
 # SubRectangleHeighti for the branch that follows.
 #
 # In:       temp = partial height value, iScaleXYi/iScaleXYf, YStartR
 # Out:      YEndR, SubRectangleHeighti[0]
 # Clobbers: temp, vtmp1, $v7, $v8, scratch halfword at offset 4

	# Now store this result in the scratch area since we'll be using the
	# temp register below
	sh	temp,	(4+RSP_SCRATCH_OFFSET)(zero)

.name	SubRectangleHeighti,	$v7
.name	SubRectangleHeightf,	$v8

	# Now, because we've started out with ScaleY sitting in the fractional
	# register, which means the inverse result ended up largely in an integer
	# register, we need to scale it down to get the real answer. However, we
	# are also supposed to multiply the partial product above by 128. So we
	# can combine these two effects by moving the 1/ScaleY results down 9 bits

	# Move 1/ScaleY into SubRectangle registers so that we can preserve the
	# full precision of the 1/ScaleY calculation for later

	vadd	SubRectangleHeighti,	iScaleXYi,	vzero
	vadd	SubRectangleHeightf,	iScaleXYf,	vzero

	# Scale 1/ScaleY down by 512, move into fractional bits, IFXF
	vmudl	SubRectangleHeightf,	SubRectangleHeightf,	vconst1[6]
	vmadm	SubRectangleHeighti,	SubRectangleHeighti,	vconst1[6]
	vmadn	SubRectangleHeightf,	vzero,			vzero[0]

	# retrieve our stored partial product out of scratch
	lh	temp,	(4+RSP_SCRATCH_OFFSET)(zero)

	# Now multiply partial product with 1/ScaleY, IxIF
	mtc2	temp,	vtmp1[0]
	#	vmudm	SubRectangleHeightf,	vtmp1,	iScaleXYf
	#	vmadh	SubRectangleHeighti,	vtmp1,	iScaleXYi
	#	vmadn	SubRectangleHeightf,	vzero,	vzero[0]
	vmudm	SubRectangleHeightf,	vtmp1,	SubRectangleHeightf
	vmadh	SubRectangleHeighti,	vtmp1,	SubRectangleHeighti
	vmadn	SubRectangleHeightf,	vzero,	vzero[0]

	# Now add 4 to SubRectangleHeighti
	vadd	SubRectangleHeighti,	SubRectangleHeighti,	vconst[5]

	# Compute YEndR = ((YStartR + SubRectangleHeight) >> 2) << 2 ;

	mfc2	YEndR,	SubRectangleHeighti
	add	YEndR,	YStartR,	YEndR
	srl	YEndR,	YEndR,		2
	sll	YEndR,	YEndR,		2

	#      /* Hack to prevent wraparound past end of coordinate range */
	#      if (YEndR >= 4095) YEndR = 4095;
	# NOTE(review): the code does not clamp as the line above says;
	# it leaves through ExitSprite when YEndR >= 4095. Confirm which
	# behaviour is intended. The sub below is the branch delay slot
	addi	temp,	YEndR,	-4095
	bgez	temp,	ExitSprite

	# Compute SubRectangleHeight = YEndR - YStartR
	# stick back into vector register since we're going to use it immediately

	sub	temp,	YEndR,	YStartR
	mtc2	temp,	SubRectangleHeighti[0]

	# Check if we have FlipTextureY
	lb	temp,	RSP_SP_FLIPY(in_bufp)
	beq	temp,	zero,	NoFlipY6
	nop	# delay slot

	# Compute Y flip TEndR, TEndP, TexLoadStart, TexLoadEnd, TStartP
	#
	# Y-flipped branch of ComputeRectangles. Updates TEndP, TEndR,
	# TexLoadEnd and TexLoadStart for this rectangle. When the load
	# window has reached SourceImageOffsetT it also trims TStartP,
	# recomputes YEndR for the final rectangle and stores done=1 in
	# the scratch area (offset 6). Both outcomes continue at
	# DoATextureRectangle.
	# Clobbers: temp, vtmp1, vtmp2, vtmp3, vtmp4 ($v14)

	# TEndP = TStartP + (((YStartP - YEndR) * ScaleY) >> 7)	;
	sub	TEndP,	YStartP,	YEndR

	# reload ScaleY value into vector register
	mtc2	ScaleY,	vtmp1[0]

	# Move partial product into vector register
	mtc2	TEndP,	vtmp2[0]

	# Now we do a IxI = 32 bit I multiply here
	vmudh	vtmp2,	vtmp1,	vtmp2[0]
	vsar	vtmp2,	vtmp1,	vtmp2[0]
	vsar	vtmp3,	vtmp1,	vtmp2[1]

	# Now shift answer down by 7 bits
	vmudl	vtmp3,	vtmp3,	vconst[7]
	vmadm	vtmp2,	vtmp2,	vconst[7]
	vmadn	vtmp3,	vzero,	vzero[0]

	# Now move partial product into scalar register, scaled down answer
	# should be in the fractional result register

	mfc2	TEndP,	vtmp3
	add	TEndP,	TEndP,	TStartP


	#  TexLoadEnd   = TexLoadStart;
	#  if ((((TEndR & 0x1f) == 0) && (ScaleY != 1024)) ||
	#      ((((TEndR | FractionalOffset ) & 0x1f) == 0)))
	#    TexLoadEnd--;

	# Second condition first; TexLoadEnd = TexLoadStart is done in
	# the delay slot so it holds on every path
	or	temp,	TEndR,	FractionalOffset
	andi	temp,	temp,	0x1f
	beq	temp,	zero,	DoTexLoadEndOffset
	addi	TexLoadEnd,	TexLoadStart,	0

	andi	temp,	TEndR,	0x1f
	bne	temp,	zero,	NoTexLoadEndOffset
	nop	# delay slot

	addi	temp,	ScaleY,	-1024
	beq	temp,	zero,	NoTexLoadEndOffset
	nop	# delay slot

DoTexLoadEndOffset:

	addi	TexLoadEnd,	TexLoadEnd,	-1

NoTexLoadEndOffset:

	# TexLoadStart = TexLoadEnd - NumLinesPerLoad + 1 ;
	addi	TexLoadStart,	TexLoadEnd,	1
	sub	TexLoadStart,	TexLoadStart,	NumLinesPerLoad

	# TEndR = TEndP
	add	TEndR,	zero,	TEndP

	# Now check to see if we are finished
	# (not finished while TexLoadStart is still above
	# SourceImageOffsetT)
	sub	temp,	TexLoadStart,	SourceImageOffsetT
	bgtz	temp,	DoATextureRectangle
	nop	# delay slot

	# TStartP -= (SourceImageOffsetT - TexLoadStart) << 5 ;
	sub	temp,	SourceImageOffsetT,	TexLoadStart
	sll	temp,	temp,	5
	sub	TStartP,	TStartP,	temp

 # HERE
.name	vtmp4,	$v14

	# YEndR = ((ScaleY == 1024) ? 4 : 0) + YStartR + (TStartP << 7) / ScaleY;

	# Load TStartP into register
	mtc2	TStartP,	vtmp4

	# The 1/ScaleY calculated previously has already
	# been shifted by the equivalent of 16, so we
	# Now multiply TStartP by 128, IxI = 32 bit I

	# load 128 into vtmp1
	vadd	vtmp1,	vzero,	vconst1[6]

	vmudh	vtmp3,	vtmp1,	vtmp4[0]
	vsar	vtmp3,	vtmp1,	vtmp4[0]
	vsar	vtmp4,	vtmp1,	vtmp4[1]

	# Now multiply scaled height by previous calculated
	# 1/ScaleY, IFxIF = IF
	vmudl	vtmp2,	vtmp4,	iScaleXYf[0]
	vmadm	vtmp2,	vtmp3,	iScaleXYf[0]
	vmadn	vtmp2,	vtmp4,	iScaleXYi[0]
	vmadh	vtmp1,	vtmp3,	iScaleXYi[0]
	vmadn	vtmp2,	vzero,	vzero[0]

	# Now move the integer portion of the result out to
	# a scalar register and then add in YStartR

	mfc2	YEndR,	vtmp1

	# now add YStartR
	add	YEndR,	YEndR,	YStartR

	# now optionally add 4
	addi	temp,	ScaleY,	-1024
	bne	temp,	zero,	NoOffsetYEndR
	nop	# delay slot

	addi	YEndR,	YEndR,	4

NoOffsetYEndR:

	# Now store done=1 into memory location
	addi	temp,	zero,	1
	sh	temp,	(6+RSP_SCRATCH_OFFSET)(zero)

	j	DoATextureRectangle
	# delay slot
	# TexLoadStart = SourceImageOffsetT
	addi	TexLoadStart,	SourceImageOffsetT,	0

NoFlipY6:

 # Non-flipped branch of ComputeRectangles. Advances TexLoadStart by
 # TEndR >> 5, sets TexLoadEnd, and computes the new TEndR from the
 # rectangle height. When this load reaches the end of the sub-image
 # it trims TexLoadEnd, recomputes YEndR from the full image height
 # and stores done=1 in the scratch area (offset 6). Both outcomes
 # continue at DoATextureRectangle.
 # Clobbers: temp, vtmp1, vtmp2, vtmp3, vtmp4, $v7, $v8

	# Compute TexLoadStart and TexLoadEnd, before we compute TEndR

	# TexLoadStart
	addi	temp,		TEndR,			0
	srl	temp,		temp,			5
	add	TexLoadStart,	TexLoadStart,		temp

	# TexLoadEnd
	addi	TexLoadEnd,	NumLinesPerLoad,	-1
	add	TexLoadEnd,	TexLoadEnd,		TexLoadStart

	# Now compute TEndR = TStartR + ((ScaleY * SubRectangleHeight) >> 7) ;

	# Hard coded no Y Flip
	# reload ScaleY value into vector register
	mtc2	ScaleY,	vtmp1[0]

	# Now we do a IxI = 32 bit I multiply here
	vmudh	SubRectangleHeighti,	vtmp1,	SubRectangleHeighti[0]
	vsar	SubRectangleHeighti,	vtmp1,	SubRectangleHeighti[0]
	vsar	SubRectangleHeightf,	vtmp1,	SubRectangleHeighti[1]

	# Now shift answer down by 7 bits
	vmudl	SubRectangleHeightf,	SubRectangleHeightf,	vconst[7]
	vmadm	SubRectangleHeighti,	SubRectangleHeighti,	vconst[7]
	vmadn	SubRectangleHeightf,	vzero,			vzero[0]

	# Now move partial product into scalar register, scaled down answer
	# should be in the fractional result register

	mfc2	temp,	SubRectangleHeightf
	add	TEndR,	temp,	TStartR

.unname SubRectangleHeighti
.unname	SubRectangleHeightf

	# Now compute equivalent of CleanUpYEndR
	# compute TexLoadStart + (TEndR >> 5) - SourceImageOffsetT - SubImageHeight >= 0

	addi	temp,	TEndR,	0
	srl	temp,	temp,	5
	add	temp,	temp,	TexLoadStart
	sub	temp,	temp,	SourceImageOffsetT
	sub	temp,	temp,	SubImageHeight

	# Negative means more of the image remains after this rectangle
	bltz	temp,	DoATextureRectangle
	nop	# delay slot

	# Compute YEndR = PScreenY + (SubImageHeight << 12) / ScaleY

	# Shorten amount of texture loaded in, optimization for small sprites
	# TexLoadEnd = SourceImageOffsetT + SubImageHeight - ((FractionalOffset == 0) && (ScaleY == 1024));

	lh	TexLoadEnd,	RSP_SP_SUBHEIGHT(in_bufp)
	lh	temp,		RSP_SP_SOURCEOFFSETT(in_bufp)

	bne	FractionalOffset,	zero,	NoTexLoadEndOffset2

	# delay slot
	add	TexLoadEnd,	TexLoadEnd,	temp

	# NOTE(review): the formula above says ScaleY == 1024, but bltz
	# only skips the decrement for ScaleY < 1024, so the decrement
	# is applied for every ScaleY >= 1024 - confirm intent
	addi	temp,	ScaleY,	-1024
	bltz	temp,	NoTexLoadEndOffset2
	nop	# delay slot

	addi	TexLoadEnd,	TexLoadEnd,	-1

NoTexLoadEndOffset2:

	# Load SubImageHeight into register
	mtc2	SubImageHeight,		vtmp4

	# The 1/ScaleY calculated previously has already
	# been shifted by the equivalent of 16, so we
	# Now multiply SubImageHeight by 4096, IxI = 32 bit I

	# load 4096 into vtmp1

	vadd	vtmp1,	vzero,	vconst1[1]

	vmudh	vtmp3,	vtmp1,	vtmp4[0]
	vsar	vtmp3,	vtmp1,	vtmp4[0]
	vsar	vtmp4,	vtmp1,	vtmp4[1]

	# Now multiply scaled height by previous calculated
	# 1/ScaleY, IFxIF = IF
	vmudl	vtmp2,	vtmp4,	iScaleXYf[0]
	vmadm	vtmp2,	vtmp3,	iScaleXYf[0]
	vmadn	vtmp2,	vtmp4,	iScaleXYi[0]
	vmadh	vtmp1,	vtmp3,	iScaleXYi[0]
	vmadn	vtmp2,	vzero,	vzero[0]

	# Now move the integer portion of the result out to
	# a scalar register and then add in PScreenY

	mfc2	YEndR,	vtmp1

	# now add PScreenY
	add	YEndR,	YEndR,	PScreenY

	# Now store done=1 into memory location
	addi	temp,	zero,	1

	j	DoATextureRectangle
	sh	temp,	(6+RSP_SCRATCH_OFFSET)(zero)	# delay slot


.unname	vtmp3
.unname	vtmp4

ComputeRectanglesDone:

	# Reached from the top of ComputeRectangles once the done flag
	# is set: return to the caller
	jr	return
 	nop	# delay slot

 # now clean up unneeded registers
.unname	FractionalOffset
.unname SubImageHeight
.unname	SourceImageOffsetT

 ############################################################################

DoATextureRectangle:

	# Write the RDP commands that load texture rows TexLoadStart..TexLoadEnd
	# and draw one textured rectangle covering screen rows YStartR..YEndR,
	# then jump back to ComputeRectangles.
	#
	# Commands written, in order (8 bytes each unless noted):
	#   SetTextureImage, SetTile (load tile), LoadSync, LoadTile, PipeSync,
	#   SetTile (render tile), SetTileSize, TextureRectangle (16 bytes)
	#
	# Registers read: YStartR, YEndR, TStartP, ScaleY, TexLoadStart,
	#   TexLoadEnd, in_bufp, outp, return
	# Registers changed: temp, temp2, temp3, outp, the locally named
	#   registers below, and YStartR/TStartP when the top edge is clipped
	#
	# Scratch slots (offsets from RSP_SCRATCH_OFFSET) touched here:
	#    0  read:  target rectangle width (added to PScreenX to form XH)
	#    2  read:  SStartR
	#    8  write/read: TexLoadEnd saved around OutputOpen
	#   10  write/read: return address
	#   12, 14, 16  write/read: TileSize, Stride, TextureShift around AddrFixup
	#   18  write: TexLoadStart (not read back in this routine)
	#
	# Exit is always through DoATextureRectangleDone, which restores the
	# return register from slot 10 and jumps to ComputeRectangles.

.name	LRSourceImageBitSize,	$9
.name	temp3,			$16

.unname	UNUSED11
.unname	UNUSED13
.unname	UNUSED19

.name	Stride,			$13
.name	TileSize,		$19
.name	TextureShift,		$11

.name	vtmp3,			$v13

	# PUSH TEXLOADSTART
	sh	TexLoadStart,		(18+RSP_SCRATCH_OFFSET)(zero)

	# Do YClip hacks here

	# First check if YEndR < 0, if so then return
	# (nothing has been written to the output buffer at this point)
	bltz	YEndR,	DoATextureRectangleDone

	# store return address since jumps below will clobber it, delay slot
	# (executed on both paths, so the reload at DoATextureRectangleDone
	# always finds a valid value)
	sh	return,		(10+RSP_SCRATCH_OFFSET)(zero)

	# Second check if YStartR < 0, if so then clip
	# TStartP += ((-YStartR * ScaleY) >> 7) * (FlipTextureY ? -1 : 1);
	# YStartR = 0;

	bgez	YStartR,	NoYClipHack

	# negate YStartR (make > 0)
	# This sits in the delay slot of the branch above, so temp is
	# overwritten even when the branch is taken.
	sub	temp,	zero,	YStartR

	mtc2	temp,		vtmp1[0]
	mtc2	ScaleY,		vtmp2[0]

	# Now we do a IxI = 32 bit I multiply here
	# The two vsar reads fetch two parts of the accumulator into vtmp2
	# and vtmp3; the shift below treats vtmp2 as the upper half and
	# vtmp3 as the lower half of the product.
	vmudh	vtmp2,	vtmp1,	vtmp2[0]
	vsar	vtmp2,	vtmp1,	vtmp2[0]
	vsar	vtmp3,	vtmp1,	vtmp2[1]

	# Now shift answer down by 7 bits
	# NOTE(review): presumably vconst[7] holds the multiplier that turns
	# this multiply-accumulate into a right shift by 7; vconst is defined
	# outside this section, confirm there.
	vmudl	vtmp3,	vtmp3,	vconst[7]
	vmadm	vtmp2,	vtmp2,	vconst[7]
	vmadn	vtmp3,	vzero,	vzero[0]

	# Move result into scaler register
	mfc2	temp2,	vtmp3

	# Check if we have FlipTextureY
	lb	temp,	RSP_SP_FLIPY(in_bufp)
	beq	temp,	zero,	NoFlipY7
	nop	# delay slot

	# FlipTextureY is set, so the T adjustment goes the other way
	sub	temp2,	zero,	temp2

	# vtmp1 and vtmp2 are not referenced again in this routine
.unname	vtmp1
.unname	vtmp2

NoFlipY7:

	# Advance the starting T by the amount clipped off and pin the
	# rectangle top to screen row 0
	add	TStartP,	TStartP,	temp2
	addi	YStartR,	zero,		0

NoYClipHack:

	# open for output, need to save and restore register $18
	# since it holds the TexLoadEnd value needed later
	sh	TexLoadEnd,	(8+RSP_SCRATCH_OFFSET)(zero)

	# The 72 loaded into $18 in the delay slot below is the byte count
	# of everything this routine writes: seven 8 byte commands plus the
	# 16 byte texture rectangle (7 * 8 + 16 = 72).
#if !(defined(OUTPUT_DRAM)||defined(OUTPUT_FIFO))
	jal	OutputOpen
	addi	$18, zero, 72 	# Total size of all commands written out
#endif /* !(OUTPUT_DRAM || OUTPUT_FIFO) */

	# We delay popping back the return address until after we've
	# done the SETTIMG address fixup routine below

	# pop back TexLoadEnd
	lh	TexLoadEnd,	(8+RSP_SCRATCH_OFFSET)(zero)

	# Now we start computing and writing out the necessary texture commands

	# Do common setup of TextureShift
	addi	TextureShift,	zero,	G_TEXTURE_IMAGE_FRAC

	# Load in SourceImageBitSize, use it to determine if we have a 4 bit texture
	# (a value of zero selects the 4 bit path)
	lb	LRSourceImageBitSize,	RSP_SP_SOURCEBITSIZE(in_bufp)
	beq	LRSourceImageBitSize,	zero,	FourBitTexture

	# delay slot, Common setup of Stride
	lh	Stride,		RSP_SP_STRIDE(in_bufp)


Non4BitTexture:

	# LRSourceImageBitSize = ((((((SourceImageOffsetS + SubImageWidth - 1)) -
	# (SourceImageOffsetS)+1) * TileBytes[SourceImageBitSize])+7)>>3);
	# Why this awful mess is defined this way in the gbi macros, I have no clue
	# the simplification of this is:
	# LRSourceImageBitSize = (SubImageWidth * TileBytes[SourceImageBitSize] + 7 ) >> 3;

	# Note that since TileBytes[SourceImageBitSize] = LineBytes[SourceImageBitSize]
	# and that TileBytes[0,1,2,3] = {0,1,2,2}, we are just going to special case
	# this rather than doing a true array lookup. Also note that after the array
	# lookup, we are going to be multiplying by a power of two. But note that since
	# we are assuming we don't have a 4 bit texture then we can change to doing
	# a logical shift. But we only need to do a shift if the index = 2 or 3. So we
	# special case index == 1 below and avoid the array lookup and multiplication.

	# If we loaded a 1, then jump past left shift
	addi	temp,	zero,	1
	beq	LRSourceImageBitSize,	temp,	LRAdjustDone

	# delay slot
	# load SubImageWidth into register
	# (the branch above has already compared the old register value, so
	# reusing the register here is safe; from now on it holds a width)
	lh	LRSourceImageBitSize,	RSP_SP_SUBWIDTH(in_bufp)

	# We have a 2 or 3, so we left shift LRSourceImageBitSize by 1
	sll	LRSourceImageBitSize,	LRSourceImageBitSize,	1

LRAdjustDone:

	# Now we add 7
	addi	LRSourceImageBitSize,	LRSourceImageBitSize,	7

	# now right shift 3
	srl	LRSourceImageBitSize,	LRSourceImageBitSize,	3

	j	SizeSpecificSetupDone

	# delay slot
	# TileSize for the load tile is the sprite's own bit size code
	lb	TileSize,	RSP_SP_SOURCEBITSIZE(in_bufp)

FourBitTexture:

	# LRSourceImageBitSize = NumWordsPerLine = (SubImageWidth * 4 + 63) >> 6 ;
	# load SubImageWidth into register
	lh	LRSourceImageBitSize,	RSP_SP_SUBWIDTH(in_bufp)
	sll	LRSourceImageBitSize,	LRSourceImageBitSize,	2
	addi	LRSourceImageBitSize,	LRSourceImageBitSize,	63
	srl	LRSourceImageBitSize,	LRSourceImageBitSize,	6

	# Now setup TileSize and finish TextureShift and finish Stride
	# The 4 bit image is loaded as G_IM_SIZ_8b: S coordinates get one
	# less fractional shift and Stride becomes (Stride + 1) >> 1.
	addi	TileSize,	zero,	G_IM_SIZ_8b
	addi	TextureShift,	TextureShift,	-1

	addi	Stride,		Stride,		1
	srl	Stride,		Stride,		1

SizeSpecificSetupDone:

	# At this point:
	#   LRSourceImageBitSize = value for the SetTile line field
	#   TileSize             = size code used for loading
	#   Stride               = image width used for SetTextureImage
	#   TextureShift         = left shift applied to S coordinates in LoadTile

	# Now write out the gDPSetTextureImage command
	addi	temp,	zero,	G_SETTIMG
	sb	temp,	0(outp)	# output rdp command

	# Now write out format and size values

	# Load in SourceImageType
	lb	temp,	RSP_SP_SOURCETYPE(in_bufp)
	sll	temp,	temp,	2

	# byte 1 = (type << 5) | (size << 3)
	or	temp,	temp,	TileSize
	sll	temp,	temp,	3

	sb	temp,	1(outp)	# output format & size

	# for some bizarre reason, the gsSetImage() gbi macro
	# subtracts 1 from the image width. Why?
	addi	temp,	Stride,	-1
	sh	temp,	2(outp)	# output width

	# Now we temporarily store away TileSize, Stride and TextureShift
	# since address fixup procedure stomps on those
	sh	TileSize,	(12+RSP_SCRATCH_OFFSET)(zero)
	sh	Stride,		(14+RSP_SCRATCH_OFFSET)(zero)
	sh	TextureShift,	(16+RSP_SCRATCH_OFFSET)(zero)

	# $19 is reused as the address argument/result of AddrFixup
.unname	TileSize
.name	addr,		$19

	# Load in SourceImagePointer
	lw	addr,	RSP_SP_SOURCEP(in_bufp)

	# Subroutine uses registers $11,$12,$13,$19
	jal	AddrFixup
	nop	# delay slot

	sw	addr,	4(outp)	# output DRAM address

	# pop back return address stored earlier
	lh	return,		(10+RSP_SCRATCH_OFFSET)(zero)

.unname	addr
.name	TileSize,	$19

	# retrieve stored TileSize, stride and TextureShift
	lh	TileSize,	(12+RSP_SCRATCH_OFFSET)(zero)
	lh	Stride,		(14+RSP_SCRATCH_OFFSET)(zero)
	lh	TextureShift,	(16+RSP_SCRATCH_OFFSET)(zero)

	addi	outp,	outp,	8	# increment output pointer

	# HOW EXPENSIVE IS IT TO RELOAD VALUES FROM MEMORY?
	# SHOULD I KEEP THESE IN REGISTERS FOR REUSE?

	# SOME OF THESE CONSTANTS ARE DEFINED AS ZERO, DON'T
	# NEED TO WRITE THOSE OUT, LOOK FOR THOSE LATER

	# Store pattern used for most of the commands below:
	#   sb at 0(outp) writes the command byte,
	#   sw at 1(outp) writes bytes 1..4 (an unaligned word store),
	#   sw at 4(outp) writes bytes 4..7 and so replaces byte 4.
	# Because the first word lands one byte late, every field in it is
	# shifted 8 bits further left than its position in the command word.

	# Now write out gDPSetTile command

	addi	temp,	zero,	G_SETTILE
	sb	temp,	0(outp) # output rdp command

	# Load in SourceImageType
	lb	temp,	RSP_SP_SOURCETYPE(in_bufp)
	sll	temp,	temp,	2

	# Load tile uses TileSize (G_IM_SIZ_8b on the 4 bit path)
	or	temp,	temp,	TileSize
	sll	temp,	temp,	27

	# Line field
	add	temp2,	zero,	LRSourceImageBitSize
	sll	temp2,	temp2,	17
	or	temp,	temp,	temp2

	# Tmem adrs == 0, so no need to do anything

	# Now write out 4 bytes, these will overlap
	# with the next 4 bytes written out, so we
	# will set up the tile parameter below

	sw	temp,	1(outp)

	# Put in Tile number = G_TX_LOADTILE
	addi	temp,	zero,	G_TX_LOADTILE
	sll	temp,	temp,	24

	# Palette == 0, so no need to do anything

	# Now set up ct&mt = G_TX_CLAMP | G_TX_NOMIRROR
	addi	temp2,	zero,	G_TX_CLAMP | G_TX_NOMIRROR
	sll	temp2,	temp2,	18
	or	temp,	temp,	temp2

	# Mask  T == 0, so no need to do anything
	# Shift T == 0, so no need to do anything

	# Now set up cs&ms = G_TX_CLAMP | G_TX_NOMIRROR
	addi	temp2,	zero,	G_TX_CLAMP | G_TX_NOMIRROR
	sll	temp2,	temp2,	8
	or	temp,	temp,	temp2

	# Mask  S == 0, so no need to do anything
	# Shift S == 0, so no need to do anything

	# Now write out second set of overlapping 4 bytes
	# This write has the correct Tile information

	sw	temp,	4(outp)

	addi	outp,	outp,	8	# increment output pointer

	# Now write out gDPLoadSync command

	addi	temp,	zero,	G_RDPLOADSYNC
	sll	temp,	temp,	24 # shift over command
	sw	temp,	0(outp)	# output rdp command
	sw	zero,	4(outp)

	addi	outp,	outp,	8	# increment output pointer

	# Now write out gDPLoadTile command

	addi	temp,	zero,	G_LOADTILE
	sb	temp,	0(outp)	# output rdp command

	# Load in SL, save it for use later
	# (temp3 keeps the unshifted SourceImageOffsetS for the SH computation)
	lh	temp3,	RSP_SP_SOURCEOFFSETS(in_bufp)
	sllv	temp,	temp3,	TextureShift
	sll	temp,	temp,	20

	# Load in TL
	add	temp2,	zero,	TexLoadStart
	sll	temp2,	temp2,	10 # includes shift by G_TEXTURE_IMAGE_FRAC
	or	temp,	temp2,	temp

	# Now write out 4 bytes, these will overlap
	# with the next 4 bytes written out, so we
	# will set up the tile parameter below

	sw	temp,	1(outp)

	addi	temp,	zero,	G_TX_LOADTILE
	sll	temp,	temp,	24

	# Load in SH, make use of stored SourceImageOffsetS from earlier
	# SH = (SubImageWidth + SourceImageOffsetS - 1) << TextureShift
	lh	temp2,	RSP_SP_SUBWIDTH(in_bufp)
	add	temp2,	temp2,	temp3
	addi	temp2,	temp2,	-1
	sllv	temp2,	temp2,	TextureShift
	sll	temp2,	temp2,	12
	or	temp,	temp2,	temp

	# Load in TH
	add	temp2,	zero,	TexLoadEnd
	sll	temp2,	temp2,	2  # shift by G_TEXTURE_IMAGE_FRAC
	or	temp,	temp2,	temp

	# Now write out second set of overlapping 4 bytes
	# This write has the correct Tile information

	sw	temp,	4(outp)

	addi	outp,	outp,	8	# increment output pointer

	# The load-specific names are finished with; LRSourceImageBitSize
	# and temp3 stay bound for the commands below
.unname	TileSize
.unname	Stride
.unname	TextureShift

	# Now write out gDPPipeSync command

	addi	temp,	zero,	G_RDPPIPESYNC
	sll	temp,	temp,	24 # shift over command
	sw	temp,	0(outp)	# output rdp command
	sw	zero,	4(outp)
	addi	outp,	outp,	8	# increment output pointer

	# Now write out second gDPSetTile command

	addi	temp,	zero,	G_SETTILE
	sb	temp,	0(outp) # output rdp command

	# Load in SourceImageType
	lb	temp,	RSP_SP_SOURCETYPE(in_bufp)
	sll	temp,	temp,	2

	# Load in SourceImageBitSize
	# (the render tile takes the size straight from the sprite structure,
	# not the TileSize value used for the load tile)
	lb	temp2,	RSP_SP_SOURCEBITSIZE(in_bufp)

	or	temp,	temp,	temp2
	sll	temp,	temp,	27

	# Line field, same value as for the load tile
	add	temp2,	zero,	LRSourceImageBitSize
	sll	temp2,	temp2,	17
	or	temp,	temp,	temp2

	# Tmem adrs == 0, so no need to do anything

	# Now write out 4 bytes, these will overlap
	# with the next 4 bytes written out, so we
	# will set up the tile parameter below

	sw	temp,	1(outp)

	# Put in Tile number = G_TX_RENDERTILE
	addi	temp,	zero,	G_TX_RENDERTILE
	sll	temp,	temp,	24

	# Palette == 0, so no need to do anything

	# Now set up ct&mt = G_TX_CLAMP | G_TX_NOMIRROR
	addi	temp2,	zero,	G_TX_CLAMP | G_TX_NOMIRROR
	sll	temp2,	temp2,	18
	or	temp,	temp,	temp2

	# Mask  T == 0, so no need to do anything
	# Shift T == 0, so no need to do anything

	# Now set up cs&ms = G_TX_CLAMP | G_TX_NOMIRROR
	addi	temp2,	zero,	G_TX_CLAMP | G_TX_NOMIRROR
	sll	temp2,	temp2,	8
	or	temp,	temp,	temp2

	# Mask  S == 0, so no need to do anything
	# Shift S == 0, so no need to do anything

	# Now write out second set of overlapping 4 bytes
	# This write has the correct Tile information

	sw	temp,	4(outp)

	addi	outp,	outp,	8	# increment output pointer

	# Now write out gDPSetTileSize command

	addi	temp,	zero,	G_SETTILESIZE
	sb	temp,	0(outp)	# output rdp command

	# SL and TL == 0, so just need to write out zeros
	sw	zero,	1(outp)

	addi	temp,	zero,	G_TX_RENDERTILE
	sll	temp,	temp,	24

	# Load in SH = SubImageWidth - 1 (SL was written as zero above,
	# so SourceImageOffsetS is not added in here)
	lh	temp2,	RSP_SP_SUBWIDTH(in_bufp)
	addi	temp2,	temp2,	-1
	sll	temp2,	temp2,	14 # includes shift by 2
	or	temp,	temp2,	temp

	# Load in TH = TexLoadEnd - TexLoadStart (TL was written as zero above)
	add	temp2,	zero,	TexLoadEnd
	sub	temp2,	temp2,	TexLoadStart
	sll	temp2,	temp2,	2  # shift by G_TEXTURE_IMAGE_FRAC
	or	temp,	temp2,	temp

	# Now write out second set of overlapping 4 bytes
	# This write has the correct Tile information

	sw	temp,	4(outp)

	addi	outp,	outp,	8	# increment output pointer

	# Now write out the gSPTextureRectangle command
	# (16 bytes: two command words followed by S, T, dsdx, dtdy halfwords)

	addi	temp,	zero,	G_TEXRECT
	sb	temp,	0(outp)	# output rdp command

	# Now compute XH, load TargetRectangleWidth from scratch area
	# store PScreenX into temp3 for use later
	# XH = TargetRectangleWidth + PScreenX
	lh	temp,	(0+RSP_SCRATCH_OFFSET)(zero)
	# sll	temp,	temp,	2
	lh	temp3,	RSP_SP_PSCREENX(in_bufp)
	add	temp,	temp,	temp3
	sll	temp,	temp,	20

	# Load in YH
	# NOTE(review): YEndR, YStartR and PScreenX are merged into the
	# command words without masking; presumably earlier code keeps them
	# inside their fields -- confirm against ComputeRectangles.
	add	temp2,	zero,	YEndR
	sll	temp2,	temp2,	8
	or	temp,	temp,	temp2

	# Now write out 4 bytes, these will overlap
	# with the next 4 bytes written out, so we
	# will set up the tile parameter below

	sw	temp,	1(outp)

	# Now set up tile

	addi	temp,	zero,	G_TX_RENDERTILE
	sll	temp,	temp,	24

	# Now set up XL
	sll	temp3,	temp3,	12
	or	temp,	temp,	temp3

	# Now set up YL
	or	temp,	temp,	YStartR

	# Now write out second set of overlapping 4 bytes
	# This write has the correct Tile information

	sw	temp,	4(outp)

	# Now write out S
	# Previously we had computed and stored away SStartR, retrieve from scratch
	lh	temp,	(2+RSP_SCRATCH_OFFSET)(zero)
	sh	temp,	8(outp)

	# Now write out dsdx, (T Written out in delay slot)
	lh	temp,	RSP_SP_SCALEX(in_bufp)
	# Check if we have FlipTextureX
	lb	temp2,	RSP_SP_FLIPX(in_bufp)
	beq	temp2,	zero,	NoFlipX2
	# Now write out T
	sh	TStartP,	10(outp) # delay slot

	# We have FlipTextureX, so negate dsdx
	sub	temp,	zero,	temp

NoFlipX2:
	sh	temp,	12(outp)

	# Now write out dtdy

	# Check if we have FlipTextureY
	lb	temp2,	RSP_SP_FLIPY(in_bufp)
	beq	temp2,	zero,	NoFlipY2

	# delay slot
	# temp = ScaleY, the value written when there is no flip
	addi	temp,	ScaleY,	0

	# We have FlipTextureY, so negate dtdy
	sub	temp,	zero,	ScaleY

NoFlipY2:
	sh	temp,	14(outp)


	# store return address since jump below will clobber it
	sh	return,		(10+RSP_SCRATCH_OFFSET)(zero)

	# outp is advanced past the 16 byte texture rectangle in the delay slot
    	jal	OutputClose
	addi	outp,	outp,	16	# delay slot

DoATextureRectangleDone:

	# Reached both after a rectangle has been written and directly from
	# the YEndR < 0 early exit at the top of the routine.

	# jump back to start of ComputeRectangles
  	j	ComputeRectangles
	# pop back return address, delay slot
	lh	return,		(10+RSP_SCRATCH_OFFSET)(zero)


 ############################################################################

 # .end case_G_SPRITE

ExitSprite:

	# Fall through to where we came from

.unname ScaleY
.unname	TEndP
.unname	YStartP
.unname YEndR
.unname	TEndR
.unname	PScreenY
.unname	TStartP
.unname	temp
.unname	TexLoadEnd