# iquant.s
##############################################################
#
# Inverse quantization
#
# NOTES:
# ------
#
# - The CPU puts the inputs and the required coefficients
# (i.e. C[], D[] and scalefactors[]) into the order
# required by the DCT.
#
# - The code can be interleaved for big improvements in efficiency;
# the current arrangement is for debugging only.
#
# - The stores at the end can be saved by making the DCT use the
# results directly from registers.
# IQuant: inverse-quantization stage; falls through to DCT32 via a jump.
#
# Processing order (all straight-line, no loops):
#   1. Load four vectors in0..in3 from RSP_IQIN_OFFSET (+0, +16, +32, +48).
#   2. XOR each with vconst[5] (described below as inverting the high bit).
#   3. Add the four D[] vectors loaded from RSP_IQD_OFFSET.
#   4. Multiply by C[] loaded from RSP_IQC_OFFSET, producing the pairs
#      outNh / outNl.
#   5. Multiply those pairs by the scalefactors loaded from
#      RSP_IQSCALE_OFFSET, overwriting outNh / outNl in place.
#   6. Jump to DCT32. The stores to dctIA are commented out, so the results
#      are left in out0h..out3h / out0l..out3l ($v1-$v8).
#
# Inputs:   none in registers; every address comes from an RSP_*_OFFSET
#           constant and vconst is referenced by name (all defined outside
#           this block).
# Writes:   $v0 (scratch destination), $v1-$v20, $1-$5.
#
# NOTE(review): the .ent/.end directives name `iquant` while the label is
# `IQuant` - confirm the assembler accepts/intends the differing case.
.ent iquant
# Symbolic register names, released again by the .unname list at the end
# of the block.
# Results: the "h"/"l" suffixes presumably mean high/low 16-bit words of a
# 32-bit value per lane - TODO confirm.
.name out0h, $v1
.name out1h, $v2
.name out0l, $v3
.name out1l, $v4
.name out2h, $v5
.name out3h, $v6
.name out2l, $v7
.name out3l, $v8
# Coefficient registers, reused for D[], then C[], then the scalefactors.
.name c0l, $v9
.name c1l, $v10
.name c2l, $v11
.name c3l, $v12
.name c0h, $v13
.name c1h, $v14
.name c2h, $v15
.name c3h, $v16
# Input samples.
.name in0, $v17
.name in1, $v18
.name in2, $v19
.name in3, $v20
# Scalar address registers.
.name dctIA, $1
.name cAddr, $2
.name dAddr, $3
.name scAddr, $4
.name iAddr, $5
IQuant:
# Materialise the five table addresses (constant + zero register).
addi iAddr, zero, RSP_IQIN_OFFSET
addi cAddr, zero, RSP_IQC_OFFSET
addi dAddr, zero, RSP_IQD_OFFSET
addi scAddr, zero, RSP_IQSCALE_OFFSET
# NOTE(review): dctIA is only referenced by the commented-out sqv stores
# below; whether DCT32 relies on $1 being set here cannot be told from
# this block.
addi dctIA, zero, RSP_DCTIN_OFFSET
# Load the four input vectors, 16 bytes apart.
lqv in0[0], 0(iAddr)
lqv in1[0], 16(iAddr)
lqv in2[0], 32(iAddr)
lqv in3[0], 48(iAddr)
# Invert the high bit
# (presumably vconst[5] holds 0x8000 - vconst is defined elsewhere, confirm)
vxor in0, in0, vconst[5]
vxor in1, in1, vconst[5]
vxor in2, in2, vconst[5]
vxor in3, in3, vconst[5]
# Add D[]
# D[] is loaded into the c*l registers, which are free at this point and
# are reloaded with C[] immediately afterwards.
lqv c0l[0], 0(dAddr)
lqv c1l[0], 16(dAddr)
lqv c2l[0], 32(dAddr)
lqv c3l[0], 48(dAddr)
vadd in0, in0, c0l
vadd in1, in1, c1l
vadd in2, in2, c2l
vadd in3, in3, c3l
# Multiply by C[]
# 16x32 to give 32 multiplies - any way to do this in less than 3 inst.?
# C[] occupies eight vectors: +0..+48 go to c*l and +64..+112 go to c*h
# (presumably the low and high 16-bit halves of 32-bit coefficients, per
# the "16x32" remark - TODO confirm the table layout with the CPU side).
lqv c0l[0], 0(cAddr)
lqv c1l[0], 16(cAddr)
lqv c2l[0], 32(cAddr)
lqv c3l[0], 48(cAddr)
lqv c0h[0], 64(cAddr)
lqv c1h[0], 80(cAddr)
lqv c2h[0], 96(cAddr)
lqv c3h[0], 112(cAddr)
# Per vector, a three-instruction chain that must stay in this order:
#   vmudh  in * c_h, destination discarded into scratch $v0
#   vmadm  accumulates in * c_l on top and writes outNl
#   vsaw   reads a word of the accumulator into outNh (the [0] element
#          selects which word; the register operands are presumably
#          ignored apart from that - see the RSP documentation)
# NOTE(review): as far as I know vmadm writes a signed-clamped value, so
# outNl may saturate rather than hold the raw middle accumulator word when
# the product is large - confirm this is acceptable for the value range.
vmudh $v0, in0, c0h
vmadm out0l, in0, c0l
vsaw out0h, $v0, out0h[0]
vmudh $v0, in1, c1h
vmadm out1l, in1, c1l
vsaw out1h, $v0, out1h[0]
vmudh $v0, in2, c2h
vmadm out2l, in2, c2l
vsaw out2h, $v0, out2h[0]
vmudh $v0, in3, c3h
vmadm out3l, in3, c3l
vsaw out3h, $v0, out3h[0]
# Multiply by scalefactors
# The scalefactors use the same eight-vector layout as C[] and reuse the
# c*l / c*h registers.
lqv c0l[0], 0(scAddr)
lqv c1l[0], 16(scAddr)
lqv c2l[0], 32(scAddr)
lqv c3l[0], 48(scAddr)
lqv c0h[0], 64(scAddr)
lqv c1h[0], 80(scAddr)
lqv c2h[0], 96(scAddr)
lqv c3h[0], 112(scAddr)
# Per vector, a four-instruction multiply-accumulate chain combining all
# four partial products of (c_h:c_l) x (outNh:outNl):
#   vmudl  c_l * outNl   -> scratch $v0
#   vmadm  c_h * outNl   -> scratch $v0
#   vmadn  c_l * outNh   -> writes the new outNl
#   vmadh  c_h * outNh   -> writes the new outNh
# Order matters: outNl is overwritten by vmadn only after its last read
# (vmadm), and outNh is read by both vmadn and vmadh before vmadh finally
# overwrites it.
vmudl $v0, c0l, out0l
vmadm $v0, c0h, out0l
vmadn out0l, c0l, out0h
vmadh out0h, c0h, out0h
vmudl $v0, c1l, out1l
vmadm $v0, c1h, out1l
vmadn out1l, c1l, out1h
vmadh out1h, c1h, out1h
vmudl $v0, c2l, out2l
vmadm $v0, c2h, out2l
vmadn out2l, c2l, out2h
vmadh out2h, c2h, out2h
vmudl $v0, c3l, out3l
vmadm $v0, c3h, out3l
vmadn out3l, c3l, out3h
vmadh out3h, c3h, out3h
# Disabled stores of the results to the DCT input area. With these
# commented out, the results exist only in $v1-$v8 when DCT32 is entered.
# sqv out0h, 0(dctIA)
# sqv out1h, 16(dctIA)
# sqv out2h, 32(dctIA)
# sqv out3h, 48(dctIA)
#
# sqv out0l, 64(dctIA)
# sqv out1l, 80(dctIA)
# sqv out2l, 96(dctIA)
# sqv out3l, 112(dctIA)
# Continue with the DCT (plain jump, no link); the nop fills the jump's
# delay slot.
j DCT32
nop
# Release every symbolic register name declared at the top of the block.
.unname c0l
.unname c1l
.unname c2l
.unname c3l
.unname c0h
.unname c1h
.unname c2h
.unname c3h
.unname in0
.unname in1
.unname in2
.unname in3
.unname out0l
.unname out1l
.unname out2l
.unname out3l
.unname out0h
.unname out1h
.unname out2h
.unname out3h
.unname dctIA
.unname cAddr
.unname dAddr
.unname scAddr
.unname iAddr
.end iquant