msp_transform.txt
3.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
/* try with radix4 divider, 8 clocks throughput for 16 bit W,
pipelined for 4 clocks latency */
/* register usage */
/* Names below are the vector registers referenced by the sequence that follows. */
px, py, pz, pw; /* vectors of x, y, z, w */
ox, oy, oz; /* output x, y, z */
mtxa, mtxb, mtxc, mtxd; /* integer transform matrix; a/b/c/d multiply x, y, z and the ones vector */
mtfa, mtfb, mtfc, mtfd; /* fractional transform matrix, paired with mtxa..mtxd */
sca, trl; /* device scale and translate */
rw; /* reciprocal of w (1/w); multiplied into x, y, z after the divide */
nw; /* -pw for clip test */
a1; /* vector of ones */
/*
 * Load the input points, the constant ones vector, the transform matrix and
 * the device scale/translate. Operand order is vload( source, register).
 */
/* some loads could parallelize with other computation... */
/* ???? store matrix column order for load latency */
vload( inputx, px);
vload( inputy, py);
vload( inputz, pz);
vload( one, a1); /* constant '1' vector? a1 is the register, so it is the second operand */
vload( matrix, mtxa);
/* 7 more matrix loads (the remaining mtx and mtf registers), not written out */
vload( scale, sca);
vload( translate, trl);
/* compute w */
/*
 * w is built from four terms (x, y, z and the ones vector a1). Each term is
 * issued as a vmuls with the integer matrix register followed by a vmacs
 * with the fraction register. Only the final vmacs names a destination (pw);
 * the others pass 0 in that slot.
 * NOTE(review): the third operand (3) appears to select the matrix element
 * used for w -- confirm against the vmuls/vmacs definition.
 * NOTE(review): this assumes a vmuls issued mid-sequence keeps the running
 * sum; if vmuls restarts the accumulation, the earlier terms are lost --
 * confirm.
 */
vmuls( px, mtxa, 3, 0); /* mtx integer 0,3 times x */
vmacs( px, mtfa, 3, 0); /* mtx fraction 0,3 times x */
vmuls( py, mtxb, 3, 0); /* mtx integer 1,3 times y */
vmacs( py, mtfb, 3, 0); /* mtx fraction 1,3 times y */
vmuls( pz, mtxc, 3, 0); /* mtx integer 2,3 times z */
vmacs( pz, mtfc, 3, 0); /* mtx fraction 2,3 times z */
vmuls( a1, mtxd, 3, 0); /* mtx integer 3,3 times w (input w taken as the ones vector a1) */
vmacs( a1, mtfd, 3, pw); /* mtx fraction 3,3 times w; pw is the destination operand */
/*
 * Divide and clip-test section. Eight vdiv launches (second operand 0..7)
 * are interleaved with the remaining coordinate math and the clip compares.
 * NOTE(review): the "8 clocks" markers look like placeholders for the z, y
 * and x computations, which are not written out in this sketch -- confirm.
 * NOTE(review): the vdiv destination sw is not in the register list and is
 * never read, while rw (1/w) is read later but never written; sw may be
 * meant to be rw -- confirm before relying on either name.
 */
/* compute z, matrix element 2, while W completes */
/* 8 clocks */
vdiv( pw, 0, sw); /* launch w divide, index 0 */
/* compute y, matrix element 1 */
/* 8 clocks */
vdiv( pw, 1, sw); /* launch w divide, index 1 */
vdiv( pw, 2, sw); /* launch w divide, index 2 */
/* negate w for clip test */
vsub( 0, pw, nw); /* nw = 0 - pw */
/* compute x, matrix element 0 */
/* 8 clocks */
vdiv( pw, 3, sw); /* launch w divide, index 3 */
vdiv( pw, 4, sw); /* launch w divide, index 4 */
/* clip test -w < xy < w, 0 < z < w */
/* meanwhile, su does and/or of codes for trivial accept/reject */
/* Each compare is followed by a vmov that copies vcc to su_rega or su_regb. */
vcmp_le( px, pw); /* x against +w */
vmov( vcc, su_rega);
vcmp_ge( px, nw); /* x against -w */
vmov( vcc, su_regb);
vdiv( pw, 5, sw); /* launch w divide, index 5 */
vcmp_le( py, pw); /* y against +w */
vmov( vcc, su_rega);
vcmp_ge( py, nw); /* y against -w */
vmov( vcc, su_regb);
vdiv( pw, 6, sw); /* launch w divide, index 6 */
vcmp_le( pz, pw); /* z against +w */
vmov( vcc, su_rega);
vcmp_gt( pz, 0); /* z against 0 */
vmov( vcc, su_regb);
vdiv( pw, 7, sw); /* launch w divide, index 7 (last) */
/* save x, y, z, w for clipping and texture usage (1/w is stored later, to temp4) */
vstore( temp0, px);
vstore( temp1, py);
vstore( temp2, pz);
vstore( temp3, pw);
/* 4 clocks while div 7 completes */
/* 1/w times x, y, z */
/* Each product goes to the matching declared output register ox, oy, oz. */
vmul( rw, px, ox);
vmul( rw, py, oy);
vmul( rw, pz, oz); /* destination is oz (not the undeclared 'ow'): oz is what gets scaled and stored as z */
/* 6 clocks noop while rw*px completes */
/* scale and translate x,y,z */
/*
 * Per axis: vmuls multiplies the projected coordinate by sca, then vmacs
 * adds a1 (ones) times trl and names the same coordinate as destination.
 * NOTE(review): the third operand (0, 1, 2) presumably selects the x, y or
 * z element of sca/trl -- confirm against the vmuls/vmacs definition.
 */
vmuls( ox, sca, 0, 0);
vmacs( a1, trl, 0, ox); /* ox = ox*sca + trl */
vmuls( oy, sca, 1, 0);
vmacs( a1, trl, 1, oy); /* oy = oy*sca + trl */
vmuls( oz, sca, 2, 0);
vmacs( a1, trl, 2, oz); /* oz = oz*sca + trl */
vstore( temp4, rw); /* save 1/w next to the x, y, z, w saved in temp0..temp3 */
/* 5 clocks noop (presumably waiting for the last scale/translate result -- confirm) */
/* store output points */
vstore( outputx, ox);
vstore( outputy, oy);
vstore( outputz, oz);
/* counts
load 14 could be somewhat parallel
xyzw 32
divide 8 interleaved
clip 13 1 negate, 6 cmp, 6 mov
proj 3
sca/trl 6
store 8 5 temp, 3 out
noop 6 while *rw completes
noop 9 while *sc completes
total 99
if 8 point vector, about 13 clocks, if 4 then 25.
approaches 10 clocks with parallel load.