msp_setup.txt
2.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
triangle setup
divide throughput and latency: 16b 4T/4L, 32b 12T/16L
register usage
vhp, vmp, vlp; /* point xy */
dhp, dmp, dlp; /* delta xy */
fhp, fmp, flp; /* fractional delta xy */
vhc, vmc, vlc; /* point rgba */
dhc, dmc; /* delta rgba */
vht, vmt, vlt; /* point stwz */
dht, dmt; /* delta stwz */
rhy, rmy, rly; /* reciprocal of y length (high, mid, low edge) */
rq; /* reciprocal of plane equation */
qp, qf; /* coefficients of plane equation */
dyc, dxc, dyt, dxt; /* plane equation attributes */
fyc, fxc, fyt, fxt;
/* sort in su instructions not shown */
/* note: the plane equation dx and dy terms can be computed during the vertex
sort; the x slopes must wait until after the sort */
/* load vertex point and subtract delta */
vload( php, vhp);
vload( pmp, vmp);
vload( plp, vlp);
vsub( vhp, vlp, dhp);
vsub( vhp, vmp, dmp);
/* load vertex color and subtract delta h, m */
/* 5 */
/* 3 noop for dmp to complete */
/* cross hx * my - mx * hy, save hx, hy, mx, my */
vmuls( dhp, dmp, 1, rq);
vmacs( dmp, dhp, 1, rq); /* TODO: confirm width of rq -- is it a 30-bit number? */
vrotate( dhp, dmp, 2, qp);
/* load vertex texture and subtract delta h, m*/
/* 5 */
/* 2 noop for rq to complete */
/* launch rq divide */
vdiv( rq, 1, rq); /* rq needs at least 20 frac bits */
/* 16 noop */
/* reload and subtract x slopes after sort here */
/* compute dq */
vmuls( qp, rq, 0, qp);
vmac( 0, 0, qf);
/* 7 noop */
/* interleave the following 3 divides with color, texture dx, dy */
/* launch hy divide */
vdiv( dhp, 1, rhy);
/* launch my divide */
vdiv( dmp, 1, rmy);
/* launch ly divide */
vdiv( dlp, 1, rly);
/* compute color and texture dx and dy */
vmuls( dhc, qp, 0, dxc);
vmuls( dhc, qf, 0, dxc);
vmacs( dmc, qp, 1, dxc);
vmuls( dmc, qf, 1, dxc);
vmac( 0, 0, fxc);
/* 20 total for dxc, dyc, dxt, dyt, m/h i/f fout */
/* high edge delta_y x */
vmuls( shp, rhy, 0, dhp, SH_INT);
vmac(0, 0, fhp, SH_FRAC); /* no-op multiply, write fraction */
/* mid edge delta_y x */
/* 2 */
/* low edge delta_y x */
/* 2 */
/* adjust point to first scanline */
/* as vh* = vh* + y_frac * (dy* + fy*); */
/* 9 clocks */
/* adjust point to first pixel center */
/* as vh* = vh* + x_frac * (dx* + fx*); */
/* 9 clocks */
/* delta_y_0 = dy + int(dyx) * (dx* + fx*) */
/* 6 clocks */
/* delta_y_1 = delta_y_0 + (dx* + fx*) */
/* 6 clocks */
/* store: point, delta_x, delta_y0, delta_y1 * 3 == 12 words */
/* totals
6 ld,sub xy
8 ld,sub rgba and wait
4 pleq
8 ld,sub stwz and wait
17 1/pleq
8 times xy
20 dx,dy rgba stwz
6 x slopes
18 point adjust
12 dxy adjust
12 store
125 total (NOTE: the entries listed above sum to 119 -- reconcile)