ola.s
10 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
#############################################################
#
# Window and overlap-add
#
# Multiplies the input groups found at RSP_OLAIN_OFFSET by successive
# groups of window coefficients, accumulates the products into a
# circular overlap-add (OLA) buffer that is kept as separate low and
# high 16 bit halves, and finally rescales one 64 byte block of that
# buffer to the area at RSP_DCTIN_OFFSET.
#
# NOTES:
# ------
# - The scalar register names declared here are shared by OLA and by
#   the SingleMul, SingleMulEnd and DoubleMul helpers.
.ent ola
# bAddrl, bAddrh: current position in the low and high halves of the
# OLA buffer (loaded from and saved to the parameter area)
.name bAddrl, $1
.name bAddrh, $2
# iAddrl, iAddrh: addresses of the low and high halves of the input group
.name iAddrl, $3
.name iAddrh, $4
# wAddr: address of the window coefficients for the next helper call
.name wAddr, $5
# pBase: base address of the parameter area (RSP_PAR_OFFSET)
.name pBase, $6
# bEnd: address of the last byte of the low half of the OLA buffer
.name bEnd, $7
# tmp0: scratch register for the buffer wrap test
.name tmp0, $8
# oAddr: destination address of the rescaled output
.name oAddr, $9
OLA:
# Entry point of the window and overlap-add pass. Makes 16 calls to the
# multiply helpers (6 single, 4 double, 5 single, then SingleMulEnd) and
# continues at OLAcont4. The instruction that follows each jal or branch
# is its delay slot and executes before the target is reached.
addi pBase, zero, RSP_PAR_OFFSET # point to parameters
# Saved buffer pointers: high half at offset 0, low half at offset 2.
# Every helper advances both by 64 before using them, which is why the
# saved values are one step behind the first group to be updated.
lhu bAddrh, 0(pBase) # make these 1 step behind
lhu bAddrl, 2(pBase)
# bEnd is the last byte of the low buffer and drives the wrap test
addi bEnd, zero, eval(RSP_OLABUFF_LOW_OFFSET+RSP_OLABUFF_SIZE8-1)
# Window coefficients are consumed sequentially starting here
addi wAddr, zero, eval(WIN0_LOW_OFFSET)
# First 6 are single precision coefficients
# Every pair of calls below uses the same input addressing:
#   1st call: iAddrh = RSP_OLAIN_OFFSET,      iAddrl = iAddrh + 128
#   2nd call: iAddrh = RSP_OLAIN_OFFSET + 64, iAddrl = iAddrl + 64
addi iAddrh, zero, RSP_OLAIN_OFFSET
jal SingleMul
addi iAddrl, iAddrh, 128 # delay slot
addi iAddrh, iAddrh, 64
jal SingleMul
addi iAddrl, iAddrl, 64 # delay slot
addi iAddrh, zero, RSP_OLAIN_OFFSET
jal SingleMul
addi iAddrl, iAddrh, 128 # delay slot
addi iAddrh, iAddrh, 64
jal SingleMul
addi iAddrl, iAddrl, 64 # delay slot
addi iAddrh, zero, RSP_OLAIN_OFFSET
jal SingleMul
addi iAddrl, iAddrh, 128 # delay slot
addi iAddrh, iAddrh, 64
jal SingleMul
addi iAddrl, iAddrl, 64 # delay slot
# Next 4 are double precision coefficients
# (DoubleMul advances wAddr by 128 per call instead of 64)
addi iAddrh, zero, RSP_OLAIN_OFFSET
jal DoubleMul
addi iAddrl, iAddrh, 128 # delay slot
addi iAddrh, iAddrh, 64
jal DoubleMul
addi iAddrl, iAddrl, 64 # delay slot
addi iAddrh, zero, RSP_OLAIN_OFFSET
jal DoubleMul
addi iAddrl, iAddrh, 128 # delay slot
addi iAddrh, iAddrh, 64
jal DoubleMul
addi iAddrl, iAddrl, 64 # delay slot
# Last 6 are single precision coefficients
addi iAddrh, zero, RSP_OLAIN_OFFSET
jal SingleMul
addi iAddrl, iAddrh, 128 # delay slot
addi iAddrh, iAddrh, 64
jal SingleMul
addi iAddrl, iAddrl, 64 # delay slot
addi iAddrh, zero, RSP_OLAIN_OFFSET
jal SingleMul
addi iAddrl, iAddrh, 128 # delay slot
addi iAddrh, iAddrh, 64
jal SingleMul
addi iAddrl, iAddrl, 64 # delay slot
addi iAddrh, zero, RSP_OLAIN_OFFSET
jal SingleMul
addi iAddrl, iAddrh, 128 # delay slot
# Last one does multiply and store instead of add
addi iAddrh, iAddrh, 64
jal SingleMulEnd
addi iAddrl, iAddrl, 64 # delay slot
# Store the OLA buffer address, but first update to point to the
# output area.
# Same advance-and-wrap sequence that the helpers run on entry.
addi bAddrl, bAddrl, 64
sub tmp0, bEnd, bAddrl
bgtz tmp0, OLAcont4 # still inside the buffer, no wrap needed
addi bAddrh, bAddrh, 64 # delay slot, executed on both paths
# Moved past bEnd: restart both halves at the buffer base
addi bAddrl, zero, RSP_OLABUFF_LOW_OFFSET
addi bAddrh, zero, RSP_OLABUFF_HIGH_OFFSET
########################################################################
#
# Rescale the output from the OLA process. The OLA uses 32 bit
# resolution for the buffer, and results are stored shifted up
# by 8 bits (ie 24 bits). Rescale requires making the output
# into a 16 bit quantity which can be used for the DACs
#
# NOTES:
# ------
# - There are a lot of stalls here which could be fixed by using
# more vector registers and interleaving the different pieces
# - Reached from OLA by branch or by fall-through. On entry bAddrl and
#   bAddrh point at the block to output and pBase at the parameter area.
# - vconst is defined outside this section; element 6 is presumably the
#   rescale factor (TODO confirm against its definition).
.name iBufl, $v1
.name iBufh, $v2
.name oBuf, $v3
OLAcont4:
addi oAddr, zero, RSP_DCTIN_OFFSET
# Four identical steps, one per 16 byte quarter of the block: load the
# low and high halves, multiply both by vconst[6] (vmudn for the low
# half, vmadh accumulating the high half) and store the result.
lqv iBufl[0], 0(bAddrl)
vmudn oBuf, iBufl, vconst[6]
lqv iBufh[0], 0(bAddrh)
vmadh oBuf, iBufh, vconst[6]
sqv oBuf[0], 0(oAddr)
lqv iBufl[0], 16(bAddrl)
vmudn oBuf, iBufl, vconst[6]
lqv iBufh[0], 16(bAddrh)
vmadh oBuf, iBufh, vconst[6]
sqv oBuf[0], 16(oAddr)
lqv iBufl[0], 32(bAddrl)
vmudn oBuf, iBufl, vconst[6]
lqv iBufh[0], 32(bAddrh)
vmadh oBuf, iBufh, vconst[6]
sqv oBuf[0], 32(oAddr)
lqv iBufl[0], 48(bAddrl)
vmudn oBuf, iBufl, vconst[6]
lqv iBufh[0], 48(bAddrh)
vmadh oBuf, iBufh, vconst[6]
sqv oBuf[0], 48(oAddr)
# OLA is done store the buffer pointer and return
# The pointers are written back to the slots OLA loaded them from.
sh bAddrh, 0(pBase)
j decodeDone
sh bAddrl, 2(pBase) # delay slot
# Release the vector names so $v1-$v3 can be renamed by the helpers
.unname iBufl
.unname iBufh
.unname oBuf
#
# Implements group (ie 32 values) multiply and add assuming the window
# coefficients are signed 16 bit
#
# Inputs:
# wAddr - start of group window coefficients. Only 64 bytes of
# 16 bit values are read (there is no separate high
# part here). Advanced by 64 on return.
# iAddrl - start of input coefficients (low and high halves).
# iAddrh
#
# bAddrl - overlap-add buffer pointers, one group behind the
# bAddrh group to update. Both are advanced by 64, wrapping
# to the buffer base, before the buffer is accessed.
# bEnd - the end of the overlap-add buffer (low)
# NOTES:
# - Check if there is a way to save some instructions on the
# 16x32 multiply!
#
# Vector register names shared by SingleMul, SingleMulEnd and DoubleMul.
# Prefix: w = window coefficients, i = input, b = overlap-add buffer.
# Digit 0-3: which 16 byte quarter of the 64 byte group (offsets 0, 16,
# 32 and 48). Suffix l or h: low or high 16 bit half of each value.
# The w?h registers are only loaded by DoubleMul and the b?? registers
# are not used by SingleMulEnd.
.name w0l, $v1
.name w1l, $v2
.name w2l, $v3
.name w3l, $v4
.name i0l, $v5
.name i1l, $v6
.name i2l, $v7
.name i3l, $v8
.name b0l, $v9
.name b1l, $v10
.name b2l, $v11
.name b3l, $v12
.name w0h, $v13
.name w1h, $v14
.name w2h, $v15
.name w3h, $v16
.name i0h, $v17
.name i1h, $v18
.name i2h, $v19
.name i3h, $v20
.name b0h, $v21
.name b1h, $v22
.name b2h, $v23
.name b3h, $v24
SingleMul:
# Multiply one 64 byte input group by 16 bit window coefficients and add
# the products into the next overlap-add group. Returns through the
# register named return with wAddr advanced by 64.
# Update the OLA buffer pointer
addi bAddrl, bAddrl, 64
sub tmp0, bEnd, bAddrl
bgtz tmp0, OLAcont1 # still inside the buffer, no wrap needed
addi bAddrh, bAddrh, 64 # delay slot, executed on both paths
# Moved past bEnd: restart both halves at the buffer base
addi bAddrl, zero, RSP_OLABUFF_LOW_OFFSET
addi bAddrh, zero, RSP_OLABUFF_HIGH_OFFSET
OLAcont1:
# Load input (low then high), current buffer contents (low then high)
# and the window coefficients, four 16 byte quarters each
lqv i0l[0], 0(iAddrl)
lqv i1l[0], 16(iAddrl)
lqv i2l[0], 32(iAddrl)
lqv i3l[0], 48(iAddrl)
lqv i0h[0], 0(iAddrh)
lqv i1h[0], 16(iAddrh)
lqv i2h[0], 32(iAddrh)
lqv i3h[0], 48(iAddrh)
lqv b0l[0], 0(bAddrl)
lqv b1l[0], 16(bAddrl)
lqv b2l[0], 32(bAddrl)
lqv b3l[0], 48(bAddrl)
lqv b0h[0], 0(bAddrh)
lqv b1h[0], 16(bAddrh)
lqv b2h[0], 32(bAddrh)
lqv b3h[0], 48(bAddrh)
lqv w0l[0], 0(wAddr)
lqv w1l[0], 16(wAddr)
lqv w2l[0], 32(wAddr)
lqv w3l[0], 48(wAddr)
# Multiply input by window
# Per quarter: vmudm starts the accumulator with window times input low,
# vmadh accumulates window times input high, then the two vsaw reads
# copy accumulator slices back into i?h and i?l (selectors [0] and [1],
# presumably the high and middle slices - confirm against the RSP
# manual). $v0 only serves as a throwaway destination and vsaw operand.
vmudm $v0, w0l, i0l
vmadh i0l, w0l, i0h
vsaw i0h, $v0, i0h[0]
vsaw i0l, $v0, i0l[1]
vmudm $v0, w1l, i1l
vmadh i1l, w1l, i1h
vsaw i1h, $v0, i1h[0]
vsaw i1l, $v0, i1l[1]
vmudm $v0, w2l, i2l
vmadh i2l, w2l, i2h
vsaw i2h, $v0, i2h[0]
vsaw i2l, $v0, i2l[1]
vmudm $v0, w3l, i3l
vmadh i3l, w3l, i3h
vsaw i3h, $v0, i3h[0]
vsaw i3l, $v0, i3l[1]
# Add to buffer
# Low halves first with vaddc, then high halves with vadd, so that the
# carry out of the low half reaches the high half. Keep each pair
# adjacent and in this order.
vaddc b0l, b0l, i0l
vadd b0h, b0h, i0h
vaddc b1l, b1l, i1l
vadd b1h, b1h, i1h
vaddc b2l, b2l, i2l
vadd b2h, b2h, i2h
vaddc b3l, b3l, i3l
vadd b3h, b3h, i3h
# Replace the buffer
sqv b0l[0], 0(bAddrl)
sqv b1l[0], 16(bAddrl)
sqv b2l[0], 32(bAddrl)
sqv b3l[0], 48(bAddrl)
sqv b0h[0], 0(bAddrh)
sqv b1h[0], 16(bAddrh)
sqv b2h[0], 32(bAddrh)
sqv b3h[0], 48(bAddrh)
jr return
addi wAddr, wAddr, 64 # delay slot: step to the next window group
#
# Implements group (ie 32 values) multiply with *NO* add assuming the window
# coefficients are signed 16 bit.
#
# The products overwrite the overlap-add group instead of being added
# to it; the previous buffer contents are never loaded.
#
# Inputs:
# wAddr - start of group window coefficients. Only 64 bytes of
# 16 bit values are read. Not advanced by this routine.
# iAddrl - start of input coefficients (low and high halves).
# iAddrh
#
# bAddrl - overlap-add buffer pointers, one group behind the
# bAddrh group to write. Both are advanced by 64, wrapping
# to the buffer base, before the buffer is accessed.
# bEnd - the end of the overlap-add buffer (low)
#
SingleMulEnd:
# Update the OLA buffer pointer
addi bAddrl, bAddrl, 64
sub tmp0, bEnd, bAddrl
bgtz tmp0, OLAcont2 # still inside the buffer, no wrap needed
addi bAddrh, bAddrh, 64 # delay slot, executed on both paths
# Moved past bEnd: restart both halves at the buffer base
addi bAddrl, zero, RSP_OLABUFF_LOW_OFFSET
addi bAddrh, zero, RSP_OLABUFF_HIGH_OFFSET
OLAcont2:
# Load input (low then high) and the window coefficients
lqv i0l[0], 0(iAddrl)
lqv i1l[0], 16(iAddrl)
lqv i2l[0], 32(iAddrl)
lqv i3l[0], 48(iAddrl)
lqv i0h[0], 0(iAddrh)
lqv i1h[0], 16(iAddrh)
lqv i2h[0], 32(iAddrh)
lqv i3h[0], 48(iAddrh)
lqv w0l[0], 0(wAddr)
lqv w1l[0], 16(wAddr)
lqv w2l[0], 32(wAddr)
lqv w3l[0], 48(wAddr)
# Multiply input by window
# Same vmudm, vmadh, vsaw, vsaw sequence as SingleMul: the product is
# built in the accumulator and read back into i?h and i?l.
vmudm $v0, w0l, i0l
vmadh i0l, w0l, i0h
vsaw i0h, $v0, i0h[0]
vsaw i0l, $v0, i0l[1]
vmudm $v0, w1l, i1l
vmadh i1l, w1l, i1h
vsaw i1h, $v0, i1h[0]
vsaw i1l, $v0, i1l[1]
vmudm $v0, w2l, i2l
vmadh i2l, w2l, i2h
vsaw i2h, $v0, i2h[0]
vsaw i2l, $v0, i2l[1]
vmudm $v0, w3l, i3l
vmadh i3l, w3l, i3h
vsaw i3h, $v0, i3h[0]
vsaw i3l, $v0, i3l[1]
# Replace the buffer
# The products themselves are stored; nothing is accumulated.
sqv i0l[0], 0(bAddrl)
sqv i1l[0], 16(bAddrl)
sqv i2l[0], 32(bAddrl)
sqv i3l[0], 48(bAddrl)
sqv i0h[0], 0(bAddrh)
sqv i1h[0], 16(bAddrh)
sqv i2h[0], 32(bAddrh)
sqv i3h[0], 48(bAddrh)
jr return
nop
#
# Implements group multiply and add assuming the window
# coefficients are signed 32 bit
#
# Inputs:
# wAddr - start of group window coefficients. High
# values are assumed to be stored 64 bytes above
# low values. Advanced by 128 on return.
# iAddrl - start of input coefficients (low and high halves).
# iAddrh
#
# bAddrl - overlap-add buffer pointers, one group behind the
# bAddrh group to update. Both are advanced by 64, wrapping
# to the buffer base, before the buffer is accessed.
# bEnd - the end of the overlap-add buffer (low)
DoubleMul:
# Update the OLA buffer pointer
addi bAddrl, bAddrl, 64
sub tmp0, bEnd, bAddrl
bgtz tmp0, OLAcont3 # still inside the buffer, no wrap needed
addi bAddrh, bAddrh, 64 # delay slot, executed on both paths
# Moved past bEnd: restart both halves at the buffer base
addi bAddrl, zero, RSP_OLABUFF_LOW_OFFSET
addi bAddrh, zero, RSP_OLABUFF_HIGH_OFFSET
OLAcont3:
# Load input (low then high), current buffer contents (low then high)
# and both halves of the window coefficients
lqv i0l[0], 0(iAddrl)
lqv i1l[0], 16(iAddrl)
lqv i2l[0], 32(iAddrl)
lqv i3l[0], 48(iAddrl)
lqv i0h[0], 0(iAddrh)
lqv i1h[0], 16(iAddrh)
lqv i2h[0], 32(iAddrh)
lqv i3h[0], 48(iAddrh)
lqv b0l[0], 0(bAddrl)
lqv b1l[0], 16(bAddrl)
lqv b2l[0], 32(bAddrl)
lqv b3l[0], 48(bAddrl)
lqv b0h[0], 0(bAddrh)
lqv b1h[0], 16(bAddrh)
lqv b2h[0], 32(bAddrh)
lqv b3h[0], 48(bAddrh)
lqv w0l[0], 0(wAddr)
lqv w1l[0], 16(wAddr)
lqv w2l[0], 32(wAddr)
lqv w3l[0], 48(wAddr)
lqv w0h[0], 64(wAddr)
lqv w1h[0], 80(wAddr)
lqv w2h[0], 96(wAddr)
lqv w3h[0], 112(wAddr)
# Multiply input by window
# Per quarter, four partial products are accumulated in this order:
# low x low (vmudl), high x low (vmadm), low x high (vmadn) and
# high x high (vmadh). The last two write the low and high halves of
# the product into i?l and i?h. vmadn overwrites i?l only after the
# first two steps have consumed it, so the order must be kept.
vmudl $v0, w0l, i0l
vmadm $v0, w0h, i0l
vmadn i0l, w0l, i0h
vmadh i0h, w0h, i0h
vmudl $v0, w1l, i1l
vmadm $v0, w1h, i1l
vmadn i1l, w1l, i1h
vmadh i1h, w1h, i1h
vmudl $v0, w2l, i2l
vmadm $v0, w2h, i2l
vmadn i2l, w2l, i2h
vmadh i2h, w2h, i2h
vmudl $v0, w3l, i3l
vmadm $v0, w3h, i3l
vmadn i3l, w3l, i3h
vmadh i3h, w3h, i3h
# Add to buffer
# Low halves first with vaddc, then high halves with vadd, so that the
# carry out of the low half reaches the high half.
vaddc b0l, b0l, i0l
vadd b0h, b0h, i0h
vaddc b1l, b1l, i1l
vadd b1h, b1h, i1h
vaddc b2l, b2l, i2l
vadd b2h, b2h, i2h
vaddc b3l, b3l, i3l
vadd b3h, b3h, i3h
# Replace the buffer
sqv b0l[0], 0(bAddrl)
sqv b1l[0], 16(bAddrl)
sqv b2l[0], 32(bAddrl)
sqv b3l[0], 48(bAddrl)
sqv b0h[0], 0(bAddrh)
sqv b1h[0], 16(bAddrh)
sqv b2h[0], 32(bAddrh)
sqv b3h[0], 48(bAddrh)
jr return
addi wAddr, wAddr, 128 # delay slot: skip the low and high coefficients
# Release the vector register names used by SingleMul, SingleMulEnd and
# DoubleMul, in the same order in which they were declared.
.unname w0l
.unname w1l
.unname w2l
.unname w3l
.unname i0l
.unname i1l
.unname i2l
.unname i3l
.unname b0l
.unname b1l
.unname b2l
.unname b3l
.unname w0h
.unname w1h
.unname w2h
.unname w3h
.unname i0h
.unname i1h
.unname i2h
.unname i3h
.unname b0h
.unname b1h
.unname b2h
.unname b3h
# Release the scalar register names declared after .ent ola, in
# declaration order. oAddr ($9) is released as well: it is declared with
# the other names and used by the rescale code, so leaving it named
# would leak the alias past the end of this section.
.unname bAddrl
.unname bAddrh
.unname iAddrl
.unname iAddrh
.unname wAddr
.unname pBase
.unname bEnd
.unname tmp0
.unname oAddr
.end ola