adpcm.s
8.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
/*
* adpcm.s
*
* Simple adpcm using a switched set of predictors and
* forward adaptive gain. Implemented using 8x10 matrix/vector
* multiply to create 8 samples at a time.
*
*/
/*
 * Size of the saved ADPCM state as passed to DMAread/DMAwrite in this file.
 * The state occupies 32 bytes; the value below is one less than that.
 */
/* 32 bytes of state */
#define ADPCM_STATE_SIZE 31 /* Actually one less for DMA */
/*
 * Register aliases that stay bound for the whole routine. They are
 * released by the .unname list at the end of the file.
 */
/* Semi-permanent scalar registers */
.name header_base, $21 # DMEM Input - points to header
.name dm_in, $20 # DMEM Input - points to samples
.name dm_out, $19 # DMEM output
.name count, $18 # Number to process (reduced by 32 per frame in the loop)
.name state_addr, $17 # DRAM address of the saved state
.name static_base, $16 # Set to DEC_MASK_OFFSET: base of the mask/iscale data
.name coef_base, $15 # Set to RSP_ADPCMTABLE_OFFSET: base of the coefficient table
.name scalei, $14 # 12 minus the upper nibble of the frame header
.name coef_index, $13 # Address of the coefficient entry picked by the header
.name ncoef_index, $12 # Scratch pointer used while reloading col2..col8 in the loop
.name tindex, $11 # (header & 0x0f) << 5: byte offset into the coefficient table
/* Semi-permanent vector registers */
.name vconst, $v31 # Contains some useful constants
.name idata0, $v30 # Two vectors of 8 inputs
.name idata1, $v29
.name odata0, $v28 # Two vectors of 8 outputs
.name odata1, $v27
.name odataf, $v26 # intermediate fractional value for output
.name imask1, $v25 # Used to mask the bitstream, lower 4x4 bits,
.name imask2, $v24 # .... upper 4x4 bits
.name iscale1, $v23 # Multiplied with the masked bits (vmudn/vmadn) to form idata0/idata1
.name vscale, $v22 # Frame scale factor, written with mtc2 and used as vscale[0]
.name col0, $v21 # Matrix columns
.name col1, $v20
.name col2, $v19
.name col3, $v18
.name col4, $v17
.name col5, $v16
.name col6, $v15
.name col7, $v14
.name col8, $v13
# Get address of input and output
#
# case_A_ADPCM
#   Presumably the handler for the ADPCM audio command; aud0, aud1 and
#   parbase are defined outside this section.
#   Steps performed here:
#     - fetch the DMEM input/output pointers and the count from the
#       parameter block at parbase
#     - resolve the DRAM address of the saved state from aud1
#     - zero the 32-byte state area at dm_out
#     - unless A_INIT is set, DMA the saved state into dm_out (from the
#       loop-state address when A_LOOP is set)
#   Scalar and vector instructions are interleaved throughout, and branch
#   delay slots are in use, so the instruction order must be kept as is.
case_A_ADPCM:
# important constants:
#
lqv vconst[0], VCONST_OFFSET(zero)
vxor odata1, odata1, odata1 # Clear last output
lhu header_base, RSP_PARAMETER_DMEMIN(parbase)
vxor imask1, imask1, imask1 # Clear the masks; each ldv later fills only 8 bytes of them
vxor imask2, imask2, imask2
addi dm_in, header_base, 1 # Sample data starts one byte after the header byte
lhu dm_out, RSP_PARAMETER_DMEMOUT(parbase)
vxor col8, col8, col8 # Clear columns 2..8 (continues below, interleaved)
vxor col7, col7, col7
lhu count, RSP_PARAMETER_COUNT(parbase)
vxor col6, col6, col6
# Resolve DRAM address for state:
#   state_addr = (aud1 & 0x00ffffff) + word at RSP_SEG_OFFSET((aud1 >> 24) * 4)
.name mask, $1
.name seg_id, $2
.name seg_addr, $3
lui mask, 0x00ff # mask = 0x00ffffff once the ori below has run
vxor col5, col5, col5
ori mask, mask, 0xffff
vxor col4, col4, col4
and state_addr, aud1, mask # Low 24 bits of aud1: offset within the segment
vxor col3, col3, col3
srl seg_id, aud1, 24 # must clear bits
vxor col2, col2, col2
sll seg_id, seg_id, 2 # leave mult. by 4 for offset
lw seg_addr, RSP_SEG_OFFSET(seg_id) # Segment base from the table at RSP_SEG_OFFSET
add state_addr, state_addr, seg_addr
.unname mask
.unname seg_id
.unname seg_addr
# Load state if required
.name tmp0, $1
.name tmp1, $2
.name ssize, $3
sqv odata1[0], 0(dm_out) # Store cleared last output
sqv odata1[0], 16(dm_out) # Both 16-byte halves of the state area are now zero
srl tmp0, aud0, 16 # Check A_INIT flag
andi tmp0, tmp0, A_INIT
bgtz tmp0, ADPCMno_load # A_INIT set: skip the DMA, state area stays zeroed
srl tmp0, aud0, 16 # Check A_LOOP flag (branch delay slot)
andi tmp0, tmp0, A_LOOP
beq zero, tmp0, ADPCMload # A_LOOP clear: load from state_addr
addi tmp1, state_addr, 0 # Branch delay slot: default DRAM source is state_addr
# If it is a loop then load up state from loop state
lw tmp1, RSP_PARAMETER_LSTATE(parbase)
# Load up the state
# DMAread is defined elsewhere. NOTE(review): presumably it takes
# $1 = DMEM address, $2 = DRAM address, $3 = length - 1; confirm.
ADPCMload:
addi tmp0, dm_out, 0 # DMEM destination is the state area at dm_out
jal DMAread
addi ssize, zero, ADPCM_STATE_SIZE # Jump delay slot: transfer size argument
# Wait here for state to load up
ADPCMwait1:
mfc0 $5, DMA_BUSY # Poll until DMA_BUSY reads zero
bne $5, zero, ADPCMwait1
nop
# release the semaphore
mtc0 $0, SP_RESERVED
.unname tmp0
.unname tmp1
.unname ssize
# ADPCMno_load
#   Reached with the state area at dm_out either zeroed (A_INIT) or filled
#   by the DMA above. Sets up the static masks and scales, picks up the
#   previous outputs from the state, then parses the header of the first
#   frame and loads its coefficients so that the loop can always work one
#   frame ahead.
ADPCMno_load:
# Initialize registers
# Addresses for static data and coefficients
addi static_base, zero, DEC_MASK_OFFSET
addi coef_base, zero, RSP_ADPCMTABLE_OFFSET
# Set-up the static data (masks, iscale) in registers
ldv imask1[0], 0(static_base) # 8 mask bytes into the lower half of imask1
ldv imask2[8], 0(static_base) # The same 8 mask bytes into the upper half of imask2
ldv iscale1[0], 8(static_base) # iscale1 gets the same 8 bytes in both halves
ldv iscale1[8], 8(static_base)
lqv odata1[0], 16(dm_out) # Second 16 bytes of the state area become the previous outputs
# Increment dm_out to reflect the first location after the
# stored state
addi dm_out, dm_out, 32
# Now if count is zero I do not have to do anything
beq count, zero, ADPCMdone
# delay slot: the ldv of bitdata below, after the .name aliases
.name bitdata, $v1
.name vtmp0, $v2
.name vtmp1, $v3
.name vtmp2, $v4
.name vtmp3, $v5
.name vtmp4, $v6
.name vtmp5, $v7
.name headr, $1
.name tmp1, $2
.name tmp2, $3
.name scale, $4
# Parse the first header to give predictor and scale - this gets
# us one frame ahead so this stuff can be done in parallel with
# vector instructions in the loop.
ldv bitdata[0], 0(dm_in) # 8 data bytes of the first frame
lbu headr, 0(header_base) # Header byte of the first frame
andi tindex, headr, 0x0f # Low nibble of the header picks the table entry
sll tindex, tindex, 5 # each table entry is 32 bytes
vand vtmp1, imask1, bitdata[0] # vtmp1..vtmp4: masked bit fields, scaled later in the loop
add coef_index, tindex, coef_base
vand vtmp2, imask2, bitdata[1]
srl scalei, headr, 0x04 # High nibble of the header
vand vtmp3, imask1, bitdata[2]
addi tmp1, zero, 12
vand vtmp4, imask2, bitdata[3]
sub scalei, tmp1, scalei # scalei = 12 - high nibble
addi tmp1, scalei, -1
addi tmp2, zero, 1
sll tmp2, tmp2, 15 # tmp2 = 0x8000
srlv scale, tmp2, tmp1 # scale = 0x8000 shifted right by (scalei - 1)
mtc2 scale, vscale # vscale is meaningless if scalei is zero
# Load up the matrix coefficients for the first frame into registers.
# This is overlaid in the loop.
# col0 and col1 take the two 16-byte halves of the 32-byte table entry.
# Each lrv after that uses an address 2 bytes lower than the previous one.
# NOTE(review): presumably this gives col2..col8 the second half shifted
# by one more element per column, with the elements not written keeping
# the zeros from the earlier vxor; confirm against the lrv definition.
lqv col0[0], 0(coef_index)
lqv col1[0], 16(coef_index)
addi coef_index, coef_index,-2
lrv col2[0], 32(coef_index)
addi coef_index, coef_index,-2
lrv col3[0], 32(coef_index)
addi coef_index, coef_index,-2
lrv col4[0], 32(coef_index)
addi coef_index, coef_index,-2
lrv col5[0], 32(coef_index)
addi coef_index, coef_index,-2
lrv col6[0], 32(coef_index)
addi coef_index, coef_index,-2
lrv col7[0], 32(coef_index)
addi coef_index, coef_index,-2
lrv col8[0], 32(coef_index)
# ADPCMfloop
#   One iteration consumes one 9-byte input frame (1 header byte plus 8
#   data bytes) and stores 32 bytes of output at dm_out.
#   On entry vtmp1..vtmp4, scalei, vscale and col0..col8 describe the
#   CURRENT frame; they were prepared by the code before the loop or by
#   the previous iteration. While the vector unit works on the current
#   frame, the scalar side parses the NEXT header and reloads the columns.
#   The scalar/vector interleaving is deliberate; do not reorder.
ADPCMfloop:
# Parse the next header to give predictor and scale
# and parse the current data from the vtmp[1-4] registers.
# Use the current scale to scale the input data.
addi dm_in, dm_in, 9 # 9 bytes/frame of input
vmudn idata0, vtmp1, iscale1 # Parse the data
addi header_base, header_base, 9
vmadn idata0, vtmp2, iscale1 # idata0 = vtmp1*iscale1 + vtmp2*iscale1
ldv bitdata[0], 0(dm_in) # Data bytes of the next frame
vmudn idata1, vtmp3, iscale1
lbu headr, 0(header_base) # Header byte of the next frame
vmadn idata1, vtmp4, iscale1 # idata1 = vtmp3*iscale1 + vtmp4*iscale1
# Scale the data
blez scalei, ADPCMnoscale # scalei <= 0: leave idata0/idata1 unscaled
andi tindex, headr, 0x0f # Branch delay slot: low nibble of the next header
vmudm idata0, idata0, vscale[0]
vmudm idata1, idata1, vscale[0]
ADPCMnoscale:
# Multiply out to give the 16 output values
# vtmp0 and vtmp5 are used to avoid stalling
# First set of 8 outputs, accumulated as
#   col0*odata1[6] + col1*odata1[7] + col2..col8 * idata0[0..6]
#   + idata0*vconst[5]
# where odata1 still holds the outputs produced before this frame.
# The scalar instructions in between compute tindex, coef_index, scalei
# and vscale for the next frame, exactly as done before the loop.
sll tindex, tindex, 5
vand vtmp1, imask1, bitdata[0]
add coef_index, tindex, coef_base
vand vtmp2, imask2, bitdata[1]
vand vtmp3, imask1, bitdata[2]
vand vtmp4, imask2, bitdata[3]
srl scalei, headr, 0x04
vmudh vtmp0, col0, odata1[6]
addi tmp1, zero, 12
vmadh vtmp0, col1, odata1[7]
sub scalei, tmp1, scalei # scalei = 12 - high nibble of the next header
vmadh vtmp0, col2, idata0[0]
addi tmp1, scalei, -1
vmadh vtmp0, col3, idata0[1]
addi tmp2, zero, 1
vmadh vtmp0, col4, idata0[2]
sll tmp2, tmp2, 15
vmadh vtmp0, col5, idata0[3]
srlv scale, tmp2, tmp1 # scale = 0x8000 shifted right by (scalei - 1)
# NOTE(review): the next vmadh writes odata0 instead of vtmp0. Presumably
# harmless because the running sum lives in the accumulator and odata0
# is rewritten by the vsaw below; confirm.
vmadh odata0, col6, idata0[4]
mtc2 scale, vscale # vscale is meaningless if scalei is zero
vmadh vtmp0, col7, idata0[5]
vmadh vtmp0, col8, idata0[6]
vmadh vtmp0, idata0, vconst[5]
# Read the accumulated result back into odataf and odata0.
# NOTE(review): the [1] and [0] selectors presumably pick two different
# slices of the accumulator; confirm against the vsaw definition.
vsaw odataf, vtmp5, odata0[1]
vsaw odata0, vtmp5, odata0[0]
# Rescale first output
vmudn vtmp0, odataf, vconst[4]
vmadh odata0, odata0, vconst[4]
# Second set of 8 outputs
# Accumulated as col2..col8 * idata1[0..6] + idata1*vconst[5]
#   + col0*odata0[6] + col1*odata0[7]
# using the odata0 just produced. Each column is reloaded for the next
# frame (through ncoef_index) right after its last use in this frame.
vmudh vtmp0, col2, idata1[0]
addi ncoef_index, coef_index,-2
vmadh vtmp0, col3, idata1[1]
lrv col2[0], 32(ncoef_index)
vmadh vtmp0, col4, idata1[2]
addi ncoef_index, ncoef_index,-2
vmadh vtmp0, col5, idata1[3]
lrv col3[0], 32(ncoef_index)
vmadh vtmp0, col6, idata1[4]
addi ncoef_index, ncoef_index,-2
vmadh vtmp0, col7, idata1[5]
lrv col4[0], 32(ncoef_index)
vmadh vtmp0, col8, idata1[6]
addi ncoef_index, ncoef_index,-2
vmadh vtmp0, idata1, vconst[5]
lrv col5[0], 32(ncoef_index)
vmadh vtmp0, col0, odata0[6]
addi ncoef_index, ncoef_index,-2
vmadh vtmp0, col1, odata0[7]
lrv col6[0], 32(ncoef_index)
vsaw odataf, vtmp5, odata1[1]
addi ncoef_index, ncoef_index,-2
vsaw odata1, vtmp5, odata1[0]
lrv col7[0], 32(ncoef_index)
addi ncoef_index, ncoef_index,-2
lrv col8[0], 32(ncoef_index)
lqv col0[0], 0(coef_index) # col0/col1 for the next frame
# Rescale output
vmudn vtmp0, odataf, vconst[4]
lqv col1[0], 16(coef_index)
vmadh odata1, odata1, vconst[4]
addi count, count, -32 # count drops by 32 per frame, matching the 32 bytes stored below
# Store both output vectors of this frame (4 x 8 bytes) at dm_out
sdv odata0[0], 0(dm_out)
sdv odata0[8], 8(dm_out)
sdv odata1[0], 16(dm_out)
sdv odata1[8], 24(dm_out)
# Increment output and input and goto floop
bgtz count, ADPCMfloop
addi dm_out, dm_out, 32 # 32 bytes/frame of output
# Release the aliases used for parsing and the loop
.unname bitdata
.unname vtmp0
.unname vtmp1
.unname vtmp2
.unname vtmp3
.unname vtmp4
.unname vtmp5
.unname headr
.unname tmp1
.unname tmp2
.unname scale
# ADPCMdone
#   Writes the 32 bytes that precede dm_out back to state_addr as the new
#   saved state, waits for the DMA to finish, then jumps to AudDone.
#   After at least one loop iteration those bytes are the last frame of
#   output; when count was zero on entry they are the state area itself.
#   DMAwrite and AudDone are defined elsewhere. NOTE(review): presumably
#   DMAwrite takes $1 = DMEM address, $2 = DRAM address, $3 = length - 1;
#   confirm.
.name tmp0, $1
.name tmp1, $2
.name ssize, $3
ADPCMdone:
addi tmp0, dm_out, -32 # Back to before the last frame
addi tmp1, state_addr, 0 # DRAM destination is the resolved state address
jal DMAwrite
addi ssize, zero, ADPCM_STATE_SIZE # Jump delay slot: transfer size argument
ADPCMwait2:
mfc0 $5, DMA_BUSY # Poll until DMA_BUSY reads zero
bne $5, zero, ADPCMwait2
nop
j AudDone
mtc0 $0, SP_RESERVED # Jump delay slot: same SP_RESERVED write as after the state load
.unname tmp0
.unname tmp1
.unname ssize
/* Release the semi-permanent scalar register aliases */
.unname header_base
.unname dm_in
.unname dm_out
.unname count
.unname state_addr
.unname static_base
.unname coef_base
.unname scalei
.unname coef_index
.unname ncoef_index
.unname tindex
/* Release the semi-permanent vector register aliases */
.unname vconst
.unname idata0
.unname idata1
.unname odata0
.unname odata1
.unname odataf
.unname imask1
.unname imask2
.unname iscale1
.unname vscale
.unname col0
.unname col1
.unname col2
.unname col3
.unname col4
.unname col5
.unname col6
.unname col7
.unname col8