msp_transform.txt 3.03 KB

/* try with radix4 divider, 8 clocks throughput for 16 bit W,
	pipelined for 4 clocks latency */

/* register usage */
	px, py, pz, pw;		/* vectors of x, y, z, w (px, py, pz are loaded; pw is computed) */
	ox, oy, oz;		/* output x, y, z (stored to outputx, outputy, outputz) */
	mtxa, mtxb, mtxc, mtxd;	/* integer transform matrix; a, b, c, d are multiplied with px, py, pz, a1 */
	mtfa, mtfb, mtfc, mtfd;	/* fractional transform matrix; a, b, c, d are multiplied with px, py, pz, a1 */
	sca, trl;		/* device scale and translate; elements 0, 1, 2 are used for x, y, z */
	rw;			/* reciprocal of w; multiplied with px, py, pz for the projection */
	nw;			/* -pw for clip test, computed as 0 - pw */
	a1;			/* vector of one; multiplicand for the mtxd/mtfd and trl terms */

/* load inputs, constants, matrix, scale and translate into registers */
/* operand order is vload( memory source, destination register) */
/* some loads could parallelize with other computation... */
/* ???? store matrix column order for load latency */
	vload( inputx, px);
	vload( inputy, py);
	vload( inputz, pz);
	/* was vload( a1, one): a1 is the declared register, so it is the
	   destination, matching the operand order of every other vload */
	vload( one, a1);	/* constant '1' vector? */
	vload( matrix, mtxa);
	/* 7 more matrix loads (mtxb, mtxc, mtxd, mtfa, mtfb, mtfc, mtfd) */
	vload( scale, sca);
	vload( translate, trl);
/* compute w */
/* Eight multiply steps: integer then fractional matrix register for each of
   px, py, pz and the one-vector a1.  Only the last step names a destination
   register (pw); every earlier step passes 0 in that position. */
/* NOTE(review): presumably the third operand (3) selects the matrix element
   used for w and a destination of 0 means "keep the result in the
   accumulator only" -- confirm against the instruction definitions. */
/* NOTE(review): the scale/translate sequence later in this file uses vmuls
   to begin each new sum.  If vmuls restarts the accumulator, the vmuls steps
   for y, z and one below would drop the products issued before them --
   confirm whether those three should be vmacs. */
	vmuls( px, mtxa, 3, 0);		/* mtx integer  0,3 times x */
	vmacs( px, mtfa, 3, 0);		/* mtx fraction 0,3 times x */
	vmuls( py, mtxb, 3, 0);		/* mtx integer  1,3 times y */
	vmacs( py, mtfb, 3, 0);		/* mtx fraction 1,3 times y */
	vmuls( pz, mtxc, 3, 0);		/* mtx integer  2,3 times z */
	vmacs( pz, mtfc, 3, 0);		/* mtx fraction 2,3 times z */
	vmuls( a1, mtxd, 3, 0);		/* mtx integer  3,3 times w (multiplicand is the one-vector a1) */
	vmacs( a1, mtfd, 3, pw);	/* mtx fraction 3,3 times w; result written to pw */
/* Reciprocal of w, interleaved with the remaining transform work and the
   clip test.  Eight vdiv are issued, numbered 0..7 in their second operand
   (presumably the element of pw being divided -- confirm).
   The divides now target rw: the original destination 'sw' is never declared
   or read, while rw (reciprocal of w) is what the 1/w multiplies and the
   temp4 store consume. */
/* compute z, matrix element 2, while W completes */
	/* 8 clocks */
		vdiv( pw, 0, rw);	 /* launch w divide 0 */
/* compute y, matrix element 1 */
	/* 8 clocks */
		vdiv( pw, 1, rw);	 /* launch w divide 1 */
		vdiv( pw, 2, rw);	 /* launch w divide 2 */
/* negate w for clip test */
	vsub( 0, pw, nw);		/* nw = 0 - pw */
/* compute x, matrix element 0 */
	/* 8 clocks */
		vdiv( pw, 3, rw);	 /* launch w divide 3 */
		vdiv( pw, 4, rw);	 /* launch w divide 4 */
/* clip test -w <= x,y <= w, 0 < z <= w (bounds as coded: le/ge, gt for z > 0) */
/* each compare result in vcc is copied to su_rega (upper bound) or
   su_regb (lower bound); the same two registers are reused for x, y and z */
/* meanwhile, su does and/or of codes for trivial accept/reject */
	vcmp_le( px, pw);		/* x <= w */
	vmov( vcc, su_rega);
	vcmp_ge( px, nw);		/* x >= -w */
	vmov( vcc, su_regb);
		vdiv( pw, 5, rw);	 /* launch w divide 5 */
	vcmp_le( py, pw);		/* y <= w */
	vmov( vcc, su_rega);
	vcmp_ge( py, nw);		/* y >= -w */
	vmov( vcc, su_regb);
		vdiv( pw, 6, rw);	 /* launch w divide 6 */
	vcmp_le( pz, pw);		/* z <= w */
	vmov( vcc, su_rega);
	vcmp_gt( pz, 0);		/* z > 0 */
	vmov( vcc, su_regb);
		vdiv( pw, 7, rw);	 /* launch w divide 7 */
/* save x,y,z,w for clipping and texture usage
   (1/w is saved to temp4 further down, during the scale/translate phase) */
/* operand order is vstore( memory destination, source register) */
	vstore( temp0, px);
	vstore( temp1, py);
	vstore( temp2, pz);
	vstore( temp3, pw);
	/* 4 clocks while div 7 completes */
/* 1/w times x, y, z: perspective projection into ox, oy, oz */
	vmul( rw, px, ox);
	vmul( rw, py, oy);
	/* was 'ow', which is not a declared register; the z result must land in
	   oz because oz is what gets scaled, translated and stored as outputz */
	vmul( rw, pz, oz);
	/* 6 clocks noop while rw*px completes */
/* scale and translate x,y,z */
/* One vmuls/vmacs pair per coordinate: the vmuls multiplies the coordinate
   with element 0, 1 or 2 of sca and names no destination (0); the vmacs
   multiplies the one-vector a1 with the same element of trl and writes the
   coordinate register.
   NOTE(review): presumably this yields coord * scale + translate, with vmuls
   starting the sum and vmacs adding to it -- confirm against the instruction
   definitions. */
	vmuls( ox, sca, 0, 0);
	vmacs( a1, trl, 0, ox);
	vmuls( oy, sca, 1, 0);
	vmacs( a1, trl, 1, oy);
	vmuls( oz, sca, 2, 0);
	vmacs( a1, trl, 2, oz);
	vstore( temp4, rw);	/* save 1/w alongside the x,y,z,w temporaries */
	/* 5 clocks noop */
/* store output points */
	vstore( outputx, ox);
	vstore( outputy, oy);
	vstore( outputz, oz);

/* counts
	load	14	could be somewhat parallel
	xyzw	32
	divide	 8	interleaved
	clip	13	1 negate, 6 cmp, 6 mov
	proj	 3
	sca/trl	 6
	store	 8	5 temp, 3 out
	noop	 6	while *rw completes
	noop	 9	4 while div 7 completes, 5 while *sc completes

	total	99

if 8 point vector, about 13 clocks per point (99/8); if 4 then about 25 (99/4).
approaches 10 clocks per point with parallel load.