# dct32.s
##############################################################
#
# 32-point DCT based on Lee's algorithm.
#
# NOTES:
# ------
#
# - This is a lot slower and bigger than it could be if we had
# alternate (strided) loads/stores
#
# - The scalar code could generate the folded output sequence at
# the same time
#
# - The DCT coefficient loads should be hoisted to the beginning
# to avoid repeated loads
#
# - We also need to look at other algorithms that won't need the
# stream of scalar code at the end
#
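# For orientation, a reference sketch in C of one step of Lee's
# decomposition (textbook form - the names are illustrative, not from
# this code base, and the data ordering in this file differs; the
# 1/(2*cos) factors below are precomputed as the fixed-point
# DCT*_LOW/HIGH tables):
#
#   #include <math.h>
#   /* Split an n-point DCT-II input into sum and difference halves. */
#   static void lee_split(const double *x, double *g, double *h, int n)
#   {
#       int i;
#       for (i = 0; i < n / 2; i++) {
#           g[i] = x[i] + x[n - 1 - i];
#           h[i] = (x[i] - x[n - 1 - i])
#                / (2.0 * cos((2 * i + 1) * M_PI / (2.0 * n)));
#       }
#   }
#   /* Recurse on g and h (length n/2), then fold:
#        X[2i] = G[i];  X[2i+1] = H[i] + H[i+1]  (H[n/2] = 0)
#      The scalar stores into outsam[] below implement this folding. */
#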
.ent dct32
.name dcti0h, $v1
.name dcti1h, $v2
.name dcti0l, $v3
.name dcti1l, $v4
.name dcti2h, $v5
.name dcti3h, $v6
.name dcti2l, $v7
.name dcti3l, $v8
.name dct0h, $v9
.name dct1h, $v10
.name dct0l, $v11
.name dct1l, $v12
.name dct2h, $v13
.name dct2l, $v14
.name dct3h, $v15
.name dct3l, $v16
.name coef0h, $v17
.name coef1h, $v18
.name coef0l, $v19
.name coef1l, $v20
.name dctIA, $1
.name dctOA, $2
.name t0, $3
.name t1, $4
.name t2, $5
.name t3, $6
.name t4, $7
.name t5, $8
.name t6, $9
.name t7, $10
.name t8, $11
.name t9, $12
.name t10, $13
.name t11, $14
.name t12, $15
.name t13, $16
.name t14, $17
.name t15, $18
.name tmp0, $19
.name tmp1, $20
DCT32:
addi dctIA, zero, RSP_DCTIN_OFFSET
addi dctOA, zero, RSP_DCTOUT_OFFSET
# Process top 16
# First butterfly - both halves: X(0) + X(16) and X(0) - X(16)
# Assumes data was previously processed by IQUANT and is in registers
# lqv dcti0h, 0(dctIA)
# lqv dcti0l, 64(dctIA)
# lqv dcti1h, 16(dctIA)
# lqv dcti1l, 80(dctIA)
# lqv dcti2h, 32(dctIA)
# lqv dcti2l, 96(dctIA)
# lqv dcti3h, 48(dctIA)
# lqv dcti3l, 112(dctIA)
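# Each 32-bit value lives in a high/low register pair: vaddc adds
# the low 16-bit halves and leaves the carries in VCO, which the
# following vadd folds into the high-half sum (vsubc/vsub likewise
# propagate the borrow for the difference).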
vaddc dct0l, dcti0l, dcti2l
vadd dct0h, dcti0h, dcti2h
vaddc dct1l, dcti1l, dcti3l
vadd dct1h, dcti1h, dcti3h
# Second half result - keep it around
vsubc dcti0l, dcti0l, dcti2l
vsub dcti0h, dcti0h, dcti2h
vsubc dcti1l, dcti1l, dcti3l
vsub dcti1h, dcti1h, dcti3h
jal LASTBut
nop
# First output stage - uses scalar code
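# This is the final fold for the sum half: the partial results
# p0..p15 left in DMEM by LASTBut combine into the even-indexed
# outputs (outsam[n] sits at byte offset 4*n from dctOA).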
lw t0, 24(dctIA) # p6
lw t1, 56(dctIA) # p7
add tmp0, t0, t1 # p6+p7
sw t1, 112(dctOA) # outsam[28]
lw t0, 40(dctIA) # p5
add t3, t0, tmp0
sw t3, 48(dctOA) # outsam[12]
add t3, t0, t1 # p5+p7
sw t3, 80(dctOA) # outsam[20]
lw t0, 8(dctIA) # p4
add t3, t0, tmp0
sw t3, 16(dctOA) # outsam[4]
lw t0, 52(dctIA) # p11
lw t1, 60(dctIA) # p15
add tmp0, t1, t0 # p15+p11
lw t2, 44(dctIA) # p13
add t3, t2, tmp0
sw t3, 88(dctOA) # outsam[22]
sw tmp0, 104(dctOA) # outsam[26]
sw t1, 120(dctOA) # outsam[30]
lw tmp0, 28(dctIA) # p14
add tmp0, tmp0, t1 # p14+p15
lw t3, 36(dctIA) # p9
add t3, t3, t2 # p9+p13
add t4, t3, tmp0
sw t4, 56(dctOA) # outsam[14]
add t4, t3, t1 # p9+p13+p15
sw t4, 72(dctOA) # outsam[18]
lw t1, 12(dctIA) # p12
lw t3, 4(dctIA) # p8
add t3, t3, t1 # p8+p12
add t3, t3, tmp0
sw t3, 8(dctOA) # outsam[2]
lw t3, 20(dctIA) # p10
add t3, t3, t0 # p10+p11
add tmp0, tmp0, t3 # tmp += p10+p11
add t3, t2, tmp0
sw t3, 40(dctOA) # outsam[10]
add t3, t1, tmp0
sw t3, 24(dctOA) # outsam[6]
lw t0, 0(dctIA) # p0
sw t0, 0(dctOA) # outsam[0]
lw t0, 32(dctIA) # p1
sw t0, 64(dctOA) # outsam[16]
lw t0, 48(dctIA) # p3
sw t0, 96(dctOA) # outsam[24]
lw t1, 16(dctIA) # p2
add t0, t0, t1 # p2+p3
sw t0, 32(dctOA) # outsam[8]
# Process bottom 16
lqv coef0l[0], DCT64_LOW_OFFSET(zero)
lqv coef0h[0], DCT64_HIGH_OFFSET(zero)
lqv coef1l[0], DCT64_LOWU_OFFSET(zero)
lqv coef1h[0], DCT64_HIGHU_OFFSET(zero)
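# Double-precision multiply idiom: the four ops per value accumulate
# the partial products of a 32-bit (high;low) sample with a 32-bit
# fixed-point coefficient; vmadn/vmadh then write the low and high
# 16 bits of the product out of the accumulator. This scales the
# saved difference half by the 1/(2*cos) factors in the DCT64 tables.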
vmudl $v0, coef0l, dcti0l
vmadm $v0, coef0h, dcti0l
vmadn dct0l, coef0l, dcti0h
vmadh dct0h, coef0h, dcti0h
vmudl $v0, coef1l, dcti1l
vmadm $v0, coef1h, dcti1l
vmadn dct1l, coef1l, dcti1h
vmadh dct1h, coef1h, dcti1h
jal LASTBut
nop
# Second output stage - uses scalar code
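# Final fold for the difference half: these combine into the
# odd-indexed outputs.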
lw t0, 0(dctIA)
lw t1, 32(dctIA)
lw t2, 16(dctIA)
lw t3, 48(dctIA)
lw t4, 8(dctIA)
lw t5, 40(dctIA)
lw t6, 24(dctIA)
lw t7, 56(dctIA)
lw t8, 4(dctIA)
lw t9, 36(dctIA)
lw t10, 20(dctIA)
lw t11, 52(dctIA)
lw t12, 12(dctIA)
lw t13, 44(dctIA)
lw t14, 28(dctIA)
lw t15, 60(dctIA)
add tmp0, t13, t15
add tmp1, t1, t9
add tmp1, tmp1, tmp0
sw tmp1, 68(dctOA) # outsam[17]
add tmp1, t5, t7
add tmp1, tmp1, t11
add tmp1, tmp1, tmp0
sw tmp1, 84(dctOA) # outsam[21]
add tmp0, tmp0, t9
add tmp1, t1, t14
add tmp1, tmp1, tmp0
sw tmp1, 60(dctOA) # outsam[15]
add tmp0, tmp0, t5
add tmp0, tmp0, t7
sw tmp0, 76(dctOA) # outsam[19]
add tmp1, t6, t14
add tmp1, tmp1, tmp0
sw tmp1, 52(dctOA) # outsam[13]
add tmp0, t10, t11
add tmp0, tmp0, t12
add tmp0, tmp0, t13
add tmp0, tmp0, t14
add tmp0, tmp0, t15
add tmp1, t2, t3
add tmp1, tmp1, tmp0
sub tmp1, tmp1, t12
sw tmp1, 36(dctOA) # outsam[9]
# Re-use t1
add t1, t6, t7
add tmp1, t4, t1
add tmp1, tmp1, tmp0
sub tmp1, tmp1, t13
sw tmp1, 20(dctOA) # outsam[5]
add tmp1, t5, t1
add tmp1, tmp1, tmp0
sub tmp1, tmp1, t12
sw tmp1, 44(dctOA) # outsam[11]
add tmp1, t2, t3
add tmp1, tmp1, tmp0
sub tmp1, tmp1, t13
sw tmp1, 28(dctOA) # outsam[7]
add tmp0, t8, t12
add tmp0, tmp0, t14
add tmp0, tmp0, t15
add tmp1, tmp0, t0
sw tmp1, 4(dctOA) # outsam[1]
add tmp1, tmp0, t4
add tmp1, tmp1, t1
sw tmp1, 12(dctOA) # outsam[3]
add tmp0, t11, t15
add tmp1, tmp0, t7
sw tmp1, 108(dctOA) # outsam[27]
add tmp0, tmp0, t3
sw tmp0, 100(dctOA) # outsam[25]
add tmp1, tmp0, t13
sw tmp1, 92(dctOA) # outsam[23]
add tmp1, t7, t15
sw tmp1, 116(dctOA) # outsam[29]
sw t15, 124(dctOA) # outsam[31]
j Remap
nop
#
# Common code for the middle butterfly stages
#
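# Runs butterflies two through five on data already in registers:
# each stage halves the butterfly stride, regrouping operands with
# ldv/llv/lsv element loads, and scales the difference terms by the
# matching coefficient table. Called twice - once on the sum half
# and once on the coefficient-scaled difference half of the first
# butterfly.
#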
LASTBut:
# Second butterfly: X(0) + X(8) and cos32*(X(0) - X(8))
vaddc dct2l, dct0l, dct1l
vadd dct2h, dct0h, dct1h
vsubc dct3l, dct0l, dct1l
vsub dct3h, dct0h, dct1h
sqv dct2h[0], 0(dctIA)
sqv dct2l[0], 64(dctIA)
lqv coef0l[0], DCT32_LOW_OFFSET(zero)
lqv coef0h[0], DCT32_HIGH_OFFSET(zero)
vmudl $v0, coef0l, dct3l
vmadm $v0, coef0h, dct3l
vmadn dct1l, coef0l, dct3h
vmadh dct1h, coef0h, dct3h
sqv dct1h[0], 16(dctIA)
sqv dct1l[0], 80(dctIA)
# Third butterfly: X(0) + X(8) and cos16*(X(0) - X(8))
# X(0) and X(8) formed from the two halves of the previous 8-element vectors
ldv dct0h[0], 0(dctIA)
ldv dct0h[8], 16(dctIA)
ldv dct1h[0], 8(dctIA)
ldv dct1h[8], 24(dctIA)
ldv dct0l[0], 64(dctIA)
ldv dct0l[8], 80(dctIA)
ldv dct1l[0], 72(dctIA)
ldv dct1l[8], 88(dctIA)
vaddc dct2l, dct0l, dct1l
vadd dct2h, dct0h, dct1h
vsubc dct3l, dct0l, dct1l
vsub dct3h, dct0h, dct1h
ldv coef0l[0], DCT16_LOW_OFFSET(zero)
ldv coef0h[0], DCT16_HIGH_OFFSET(zero)
ldv coef0l[8], DCT16_LOW_OFFSET(zero)
ldv coef0h[8], DCT16_HIGH_OFFSET(zero)
sqv dct2h[0], 0(dctIA)
sqv dct2l[0], 64(dctIA)
vmudl $v0, coef0l, dct3l
vmadm $v0, coef0h, dct3l
vmadn dct1l, coef0l, dct3h
vmadh dct1h, coef0h, dct3h
sqv dct1h[0], 16(dctIA)
sqv dct1l[0], 80(dctIA)
# Fourth butterfly: X(0) + X(8) and cos8*(X(0) - X(8))
# X(0) and X(8) formed from the four quarters of the previous 8-element vectors
llv dct0h[0], 0(dctIA)
llv dct0h[4], 8(dctIA)
llv dct0h[8], 16(dctIA)
llv dct0h[12], 24(dctIA)
llv dct0l[0], 64(dctIA)
llv dct0l[4], 72(dctIA)
llv dct0l[8], 80(dctIA)
llv dct0l[12], 88(dctIA)
llv dct1h[0], 4(dctIA)
llv dct1h[4], 12(dctIA)
llv dct1h[8], 20(dctIA)
llv dct1h[12], 28(dctIA)
llv dct1l[0], 68(dctIA)
llv dct1l[4], 76(dctIA)
llv dct1l[8], 84(dctIA)
llv dct1l[12], 92(dctIA)
vaddc dct2l, dct0l, dct1l
vadd dct2h, dct0h, dct1h
vsubc dct3l, dct0l, dct1l
vsub dct3h, dct0h, dct1h
# These coefficient loads can move to the beginning - using more memory to reduce repeated loads
llv coef0l[0], DCT8_LOW_OFFSET(zero)
llv coef0h[0], DCT8_HIGH_OFFSET(zero)
llv coef0l[4], DCT8_LOW_OFFSET(zero)
llv coef0h[4], DCT8_HIGH_OFFSET(zero)
llv coef0l[8], DCT8_LOW_OFFSET(zero)
llv coef0h[8], DCT8_HIGH_OFFSET(zero)
llv coef0l[12], DCT8_LOW_OFFSET(zero)
llv coef0h[12], DCT8_HIGH_OFFSET(zero)
sqv dct2h[0], 0(dctIA)
sqv dct2l[0], 64(dctIA)
vmudl $v0, coef0l, dct3l
vmadm $v0, coef0h, dct3l
vmadn dct1l, coef0l, dct3h
vmadh dct1h, coef0h, dct3h
sqv dct1h[0], 16(dctIA)
sqv dct1l[0], 80(dctIA)
# Fifth butterfly - load alternate would be really useful here
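# lsv loads one 16-bit element, so the even- and odd-indexed values
# are gathered a slot at a time - a strided load would collapse this
# whole block.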
lsv dct0h[0], 0(dctIA)
lsv dct0h[2], 4(dctIA)
lsv dct0h[4], 8(dctIA)
lsv dct0h[6], 12(dctIA)
lsv dct0h[8], 16(dctIA)
lsv dct0h[10], 20(dctIA)
lsv dct0h[12], 24(dctIA)
lsv dct0h[14], 28(dctIA)
lsv dct0l[0], 64(dctIA)
lsv dct0l[2], 68(dctIA)
lsv dct0l[4], 72(dctIA)
lsv dct0l[6], 76(dctIA)
lsv dct0l[8], 80(dctIA)
lsv dct0l[10], 84(dctIA)
lsv dct0l[12], 88(dctIA)
lsv dct0l[14], 92(dctIA)
lsv dct1h[0], 2(dctIA)
lsv dct1h[2], 6(dctIA)
lsv dct1h[4], 10(dctIA)
lsv dct1h[6], 14(dctIA)
lsv dct1h[8], 18(dctIA)
lsv dct1h[10], 22(dctIA)
lsv dct1h[12], 26(dctIA)
lsv dct1h[14], 30(dctIA)
lsv dct1l[0], 66(dctIA)
lsv dct1l[2], 70(dctIA)
lsv dct1l[4], 74(dctIA)
lsv dct1l[6], 78(dctIA)
lsv dct1l[8], 82(dctIA)
lsv dct1l[10], 86(dctIA)
lsv dct1l[12], 90(dctIA)
lsv dct1l[14], 94(dctIA)
vaddc dct2l, dct0l, dct1l
vadd dct2h, dct0h, dct1h
vsubc dct3l, dct0l, dct1l
vsub dct3h, dct0h, dct1h
# Store alternate would be really useful here!
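# ssv stores one 16-bit element, interleaving the high and low halves
# back into 32-bit words that the scalar lw code in the output stages
# reads directly.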
ssv dct2h[0], 0(dctIA)
ssv dct2h[2], 4(dctIA)
ssv dct2h[4], 8(dctIA)
ssv dct2h[6], 12(dctIA)
ssv dct2h[8], 16(dctIA)
ssv dct2h[10], 20(dctIA)
ssv dct2h[12], 24(dctIA)
ssv dct2h[14], 28(dctIA)
ssv dct2l[0], 2(dctIA)
ssv dct2l[2], 6(dctIA)
ssv dct2l[4], 10(dctIA)
ssv dct2l[6], 14(dctIA)
ssv dct2l[8], 18(dctIA)
ssv dct2l[10], 22(dctIA)
ssv dct2l[12], 26(dctIA)
ssv dct2l[14], 30(dctIA)
vmudl $v0, dct3l, vconst[4] # 0.7071...
vmadm $v0, dct3h, vconst[4] # This is zero
vmadn dct1l, dct3l, vconst[0]
vmadh dct1h, dct3h, vconst[0]
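# vconst[4] presumably holds the fraction 0.7071 (1/(2*cos(pi/4)) has
# no integer part) and vconst[0] is presumably zero, so the vmadn/vmadh
# pair adds nothing - it just writes the low and high halves of the
# accumulated product into dct1l/dct1h.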
ssv dct1h[0], 32(dctIA)
ssv dct1h[2], 36(dctIA)
ssv dct1h[4], 40(dctIA)
ssv dct1h[6], 44(dctIA)
ssv dct1h[8], 48(dctIA)
ssv dct1h[10], 52(dctIA)
ssv dct1h[12], 56(dctIA)
ssv dct1h[14], 60(dctIA)
ssv dct1l[0], 34(dctIA)
ssv dct1l[2], 38(dctIA)
ssv dct1l[4], 42(dctIA)
ssv dct1l[6], 46(dctIA)
ssv dct1l[8], 50(dctIA)
ssv dct1l[10], 54(dctIA)
ssv dct1l[12], 58(dctIA)
ssv dct1l[14], 62(dctIA)
jr return
nop
.unname dcti0h
.unname dcti1h
.unname dcti0l
.unname dcti1l
.unname dcti2h
.unname dcti3h
.unname dcti2l
.unname dcti3l
.unname dct0h
.unname dct1h
.unname dct0l
.unname dct1l
.unname dct2h
.unname dct2l
.unname dct3h
.unname dct3l
.unname coef0h
.unname coef1h
.unname coef0l
.unname coef1l
.unname dctIA
.unname dctOA
.unname t0
.unname t1
.unname t2
.unname t3
.unname t4
.unname t5
.unname t6
.unname t7
.unname t8
.unname t9
.unname t10
.unname t11
.unname t12
.unname t13
.unname t14
.unname t15
.unname tmp0
.unname tmp1
.end dct32