msp_setup.txt 2.52 KB

triangle setup
divide throughput and latency: 16b 4T/4L, 32b 12T/16L

register usage
	vhp, vmp, vlp;	/* point xy */
	dhp, dmp, dlp;	/* delta xy */
	fhp, fmp, flp;	/* fractional delta xy */
	vhc, vmc, vlc;	/* point rgba */
	dhc, dmc;	/* delta rgba */
	vht, vmt, vlt;	/* point stwz */
	dht, dmt;	/* delta stwz */

	rhy, rmy, rly;		/* reciprocal of y length */
	rq;		/* reciprocal of plane equation */
	qp, qf;		/* coefficients of plane equation */
	dyc, dxc, dyt, dxt;	/* plane equation attributes */
	fyc, fxc, fyt, fxt;	

/* sort in su instructions not shown */
/* note that plane equation dx and dy terms can proceed during vertex sort,
	then x slopes later after sort */
/* load vertex point and subtract delta */
	/* php/pmp/plp are not in the register list; presumably the sources of
	   the high/mid/low vertex points -- TODO confirm */
	vload( php, vhp);
	vload( pmp, vmp);
	vload( plp, vlp);
	/* NOTE(review): only dhp and dmp are written here; dlp is read by the
	   "ly divide" below but is not written by any instruction shown in
	   this sketch -- confirm where it is produced */
	vsub( vhp, vlp, dhp);
	vsub( vhp, vmp, dmp);
/* load vertex color and subtract delta h, m */
	/* 5 */
	/* 3 noop for dmp to complete */
/* cross hx * my - mx * hy, save hx, hy, mx, my */
	vmuls( dhp, dmp, 1, rq);
	vmacs( dmp, dhp, 1, rq);	/* TODO: is rq a 30-bit number? */
	vrotate( dhp, dmp, 2, qp);
/* load vertex texture and subtract delta h, m*/
	/* 5 */
	/* 2 noop for rq to complete */
/* launch rq divide */
	/* rq is both source and destination of this divide */
	vdiv( rq, 1, rq);	/* rq needs at least 20 frac bits */
	/* 16 noop */
	/* reload and subtract x slopes after sort here */
/* compute dq */
	vmuls( qp, rq, 0, qp);
	/* presumably a no-op multiply that writes the fraction to qf, like the
	   vmac on fhp further down -- TODO confirm */
	vmac( 0, 0, qf);
	/* 7 noop */
/* interleave the following 3 divides with color, texture dx, dy */
/* launch hy divide */
	vdiv( dhp, 1, rhy);
/* launch my divide */
	vdiv( dmp, 1, rmy);
/* launch ly divide */
	vdiv( dlp, 1, rly);
/* compute color and texture dx and dy */
	/* only the dxc sequence is written out; dyc, dxt and dyt are covered
	   by the cycle count below */
	/* NOTE(review): the 2nd and 4th instructions are vmuls into the same
	   destination dxc; the cross-product sequence above uses vmuls
	   followed by vmacs, so these may have been meant as vmacs --
	   confirm against the instruction definitions */
	vmuls( dhc, qp, 0, dxc);
	vmuls( dhc, qf, 0, dxc);
	vmacs( dmc, qp, 1, dxc);
	vmuls( dmc, qf, 1, dxc);
	vmac( 0, 0,  fxc);
	/* 20 total for dxc, dyc, dxt, dyt, m/h i/f fout */
/* high edge delta_y x */
	/* NOTE(review): shp is not in the register list, and SH_INT / SH_FRAC
	   are not defined in this note -- confirm their meaning */
	vmuls( shp, rhy, 0, dhp, SH_INT);
	vmac(0, 0, fhp, SH_FRAC);	/* no-op multiply, write fraction */
/* mid edge delta_y x */
	/* 2 */
/* low edge delta_y x */
	/* 2 */

/* adjust point to first scanline */
/* as vh* = vh* + y_frac * (dy* + fy*); */
	/* 9 clocks */
/* adjust point to first pixel center */
/* as vh* = vh* + x_frac * (dx* + fx*); */
	/* 9 clocks */

/* delta_y_0  = dy + int(dyx) * (dx* + fx*) */
	/* 6 clocks */
/* delta_y_1 = delta_y_0 + (dx* + fx*) */
	/* 6 clocks */

/* store: point, delta_x, delta_y0, delta_y1 * 3 == 12 words */


/* totals
	6	ld,sub xy
	8	ld,sub rgba and wait
	4	pleq
	8	ld,sub stwz and wait
	17	1/pleq
	8	times xy
	20	dx,dy rgba stwz
	6	x slopes
	18	point adjust
	12	dxy adjust
	12	store

	125	total (NOTE: the rows above sum to 119, not 125 -- recheck)