adpcm.s 8.93 KB

Raw Blame History Permalink


/*
 * adpcm.s
 *
 * Simple adpcm using a switched set of predictors and
 * forward adaptive gain. Implemented using 8x10 matrix/vector
 * multiply to create 8 samples at a time.
 *
 */

/*
 * Length handed to DMAread/DMAwrite when the decoder state is loaded
 * or saved. The state itself is 32 bytes; the constant is 31 because
 * the DMA length is specified as one less than the number of bytes.
 */

#define	ADPCM_STATE_SIZE 31	/* Actually one less for DMA */

/*
 * Semi-permanent scalar registers.
 * These aliases stay bound for the whole ADPCM command and are
 * released by the .unname list at the end of the routine.
 */

.name	header_base,	$21	# DMEM Input - points to the header byte of a frame
.name	dm_in,		$20	# DMEM Input - points to samples (header_base + 1)
.name   dm_out,		$19	# DMEM output
.name	count,		$18	# Number to process; reduced by 32 for every decoded frame
.name	state_addr,	$17	# DRAM address of the saved decoder state
.name	static_base,	$16	# DMEM address of the mask/iscale constants (DEC_MASK_OFFSET)
.name	coef_base,	$15	# DMEM address of the coefficient table (RSP_ADPCMTABLE_OFFSET)
.name	scalei,		$14	# 12 minus the upper header nibble; scaling is skipped when <= 0
.name	coef_index,	$13	# coef_base + tindex: table entry of the frame being set up
.name	ncoef_index,	$12	# Walking pointer for the lrv column reloads inside the loop
.name	tindex,		$11	# (header & 0x0f) << 5: byte offset of the 32-byte table entry

/*
 * Semi-permanent vector registers.
 * Like the scalar aliases, these are released by the .unname list at
 * the end of the routine.
 */

.name	vconst,		$v31	# Contains some useful constants (elements 4 and 5 are used here)
.name	idata0,	 	$v30	# Two vectors of 8 inputs
.name 	idata1, 	$v29
.name 	odata0,		$v28  	# Two vectors of 8 outputs
.name 	odata1,		$v27
.name	odataf,		$v26	# intermediate fractional value for output
.name	imask1,		$v25	# Used to mask the bitstream, lower 4x4 bits,
.name	imask2,		$v24	# .... upper 4x4 bits
.name	iscale1,	$v23	# Multiplier applied to the masked bits (vmudn/vmadn in the loop)
.name	vscale,		$v22	# Per-frame gain derived from the header; read as vscale[0]
.name	col0,		$v21	# Matrix columns
.name	col1,		$v20	# col0/col1 are loaded whole with lqv,
.name	col2,		$v19	# col2..col8 are cleared once and then filled with lrv
.name	col3,		$v18
.name	col4,		$v17
.name	col5,		$v16
.name	col6,		$v15
.name	col7,		$v14
.name	col8,		$v13

  # Entry point of the ADPCM decode command.
  # Loads the constant vector, reads the DMEM input address, DMEM output
  # address and count from the parameter area at parbase, and zeroes the
  # vector registers that later receive ldv/lrv loads.
  # Scalar and vector instructions are interleaved here and in the rest
  # of the routine; the indented column holds the vector instructions.

case_A_ADPCM:
  # Load the constant vector from DMEM
  #
	lqv	vconst[0], VCONST_OFFSET(zero)
				vxor	odata1, odata1, odata1	# Clear last output

	lhu	header_base, RSP_PARAMETER_DMEMIN(parbase)	# input starts with the first header byte
				vxor	imask1, imask1, imask1	# Clear some columns
				vxor	imask2, imask2, imask2
	addi	dm_in, header_base, 1		# sample data follows the header byte
	lhu	dm_out, RSP_PARAMETER_DMEMOUT(parbase)
				vxor	col8, col8, col8
				vxor	col7, col7, col7
	lhu	count, RSP_PARAMETER_COUNT(parbase)
				vxor	col6, col6, col6

  # Resolve DRAM address for state
  #
  # aud1 carries a segment id in its top 8 bits and an offset in its low
  # 24 bits. The offset is kept with the 0x00ffffff mask, the segment id
  # is turned into a word index, and the word read from the table at
  # RSP_SEG_OFFSET is added to give state_addr.
  # The interleaved vxor instructions finish clearing col5..col2.

.name	mask,		$1
.name	seg_id,		$2
.name	seg_addr,	$3

	lui	mask, 0x00ff			# Load up the state address
				vxor	col5, col5, col5
	ori	mask, mask, 0xffff		# mask = 0x00ffffff
				vxor	col4, col4, col4
	and	state_addr, aud1, mask		# low 24 bits: offset
				vxor	col3, col3, col3
	srl	seg_id, aud1, 24		# must clear bits
				vxor	col2, col2, col2
	sll	seg_id, seg_id, 2		# leave mult. by 4 for offset
	lw	seg_addr, RSP_SEG_OFFSET(seg_id)
	add	state_addr, state_addr, seg_addr

.unname mask
.unname seg_id
.unname seg_addr

  # Load state if required
  #
  # The first 32 bytes at dm_out hold the decoder state. They are zeroed
  # first (odata1 was cleared at entry). Then:
  #   - A_INIT set in the upper half of aud0: keep the zeroed state and
  #     skip the DMA entirely.
  #   - A_LOOP set: DMA the state from the address stored at
  #     RSP_PARAMETER_LSTATE(parbase).
  #   - otherwise: DMA the state from state_addr.
  # $1/$2/$3 are set up before the call to DMAread with the DMEM address,
  # the DRAM address and ADPCM_STATE_SIZE respectively (DMAread itself
  # is defined elsewhere - presumably these are its arguments; confirm).

.name	tmp0,	$1
.name	tmp1,	$2
.name	ssize,	$3

	sqv	odata1[0], 0(dm_out)		# Store cleared last output
	sqv	odata1[0], 16(dm_out)
	srl	tmp0, aud0, 16			# Check A_INIT flag
	andi	tmp0, tmp0, A_INIT
	bgtz	tmp0, ADPCMno_load
	srl	tmp0, aud0, 16			# Check A_LOOP flag (delay slot, runs on both paths)
	andi	tmp0, tmp0, A_LOOP
	beq	zero, tmp0, ADPCMload
	addi	tmp1, state_addr, 0		# delay slot: default source is state_addr

  # If its a loop then load up state from loop state

	lw	tmp1, RSP_PARAMETER_LSTATE(parbase)

  # Load up the state

ADPCMload:
	addi	tmp0, dm_out, 0
	jal	DMAread
	addi	ssize, zero, ADPCM_STATE_SIZE	# delay slot: length is set before DMAread runs

 # Wait here for state to load up
 # ($5 is used directly as scratch for the DMA_BUSY poll)

ADPCMwait1:
	mfc0	$5, DMA_BUSY
	bne	$5, zero, ADPCMwait1
	nop
 # release the semaphore
	mtc0	$0, SP_RESERVED

.unname	tmp0
.unname	tmp1
.unname	ssize

  # Reached both after a state load and directly when A_INIT skipped it.
ADPCMno_load:

  # Initialize registers
  # Addresses for static data and coefficients

	addi	static_base, zero, DEC_MASK_OFFSET
	addi	coef_base, zero, RSP_ADPCMTABLE_OFFSET

  # Set-up the static data (masks, iscale) in registers
  # The same 8 bytes at 0(static_base) go to byte offset 0 of imask1 and
  # to byte offset 8 of imask2; the other half of each register keeps
  # the zero written at entry. iscale1 gets the 8 bytes at
  # 8(static_base) in both halves.

	ldv	imask1[0], 0(static_base)
	ldv	imask2[8], 0(static_base)
	ldv	iscale1[0], 8(static_base)
	ldv	iscale1[8], 8(static_base)

  # Second 16 bytes of the state become the "previous outputs" vector
	lqv	odata1[0], 16(dm_out)

  # Increment dm_out to reflect the first location after the
  # stored state
	addi	dm_out, dm_out, 32
  # Now if count is zero I don't have to do anything
	beq	count, zero, ADPCMdone
	# delay slot: no instruction is placed here, so the next instruction
	# in the file (the ldv of bitdata in the first-header parse) runs in
	# the delay slot on both paths.

  # Temporaries used by the first-frame setup and by the decode loop.
  # They are released again just before ADPCMdone.

.name	bitdata,	$v1
.name	vtmp0, 		$v2
.name	vtmp1,		$v3
.name	vtmp2, 		$v4
.name	vtmp3,		$v5
.name	vtmp4,		$v6
.name   vtmp5,		$v7

.name	headr,		$1
.name	tmp1,		$2
.name	tmp2,		$3
.name	scale,		$4


  # Parse the first header to give predictor and scale - this gets
  # us one frame ahead so this stuff can be done in parallel with
  # vector instructions in the loop.
  #
  # Header byte layout as decoded here:
  #   low nibble  -> tindex = nibble << 5, offset of a 32-byte table entry
  #   high nibble -> scalei = 12 - nibble
  # The gain written to vscale is 0x8000 >> (scalei - 1).
  # The first ldv also sits in the delay slot of the preceding
  # "beq count, zero, ADPCMdone".
	ldv	bitdata[0], 0(dm_in)
	lbu	headr, 0(header_base)
	andi	tindex, headr, 0x0f
	sll	tindex, tindex, 5	# each table entry is 32 bytes
					vand	vtmp1, imask1, bitdata[0]
	add	coef_index, tindex, coef_base
					vand	vtmp2, imask2, bitdata[1]
	srl	scalei, headr, 0x04
					vand	vtmp3, imask1, bitdata[2]
	addi	tmp1, zero, 12
					vand	vtmp4, imask2, bitdata[3]
	sub	scalei, tmp1, scalei
	addi	tmp1, scalei, -1
	addi	tmp2, zero, 1
  	sll	tmp2, tmp2, 15
	srlv	scale, tmp2, tmp1
	mtc2	scale, vscale			# vscale is meaningless if scalei is zero


  # Load up the matrix coefficients for the first frame into registers.
  # This is overlayed in the loop.
  #
  # col0 and col1 are the two 16-byte halves of the table entry.
  # col2..col8 are loaded with lrv from 32(coef_index) after stepping
  # coef_index back by 2 bytes (one 16-bit element) per column.
  # NOTE(review): presumably each lrv yields the second half of the
  # entry shifted by one more element, with the rest of the register
  # keeping the zero from the vxor at entry - confirm against the RSP
  # documentation for lrv.
  # coef_index is modified in place here; the loop recomputes it from
  # tindex before it is used again.

	lqv	col0[0], 0(coef_index)
	lqv	col1[0], 16(coef_index)
	addi	coef_index, coef_index,-2
	lrv	col2[0], 32(coef_index)
	addi	coef_index, coef_index,-2
	lrv	col3[0], 32(coef_index)

	addi	coef_index, coef_index,-2
	lrv	col4[0], 32(coef_index)
	addi	coef_index, coef_index,-2
	lrv	col5[0], 32(coef_index)
	addi	coef_index, coef_index,-2
	lrv	col6[0], 32(coef_index)
	addi	coef_index, coef_index,-2
	lrv	col7[0], 32(coef_index)
	addi	coef_index, coef_index,-2
	lrv	col8[0], 32(coef_index)

  # Main decode loop: one pass consumes a 9-byte input frame (1 header
  # byte + 8 data bytes) and stores 32 bytes of output.
  # The loop is software pipelined: the scalar side fetches and parses
  # the NEXT frame (header, masked data in vtmp1..vtmp4, scalei, vscale,
  # coefficient columns) while the vector side decodes the CURRENT one.
  # Instruction order matters for that overlap and for the delay slots.
ADPCMfloop:

  # Parse the next header to give predictor and scale
  # and parse the current data from the vtmp[1-4] registers.
  # Use the current scale to scale the input data.
	addi	dm_in, dm_in, 9			# 9 bytes/frame of input
					vmudn	idata0, vtmp1, iscale1		# Parse the data
	addi	header_base, header_base, 9
					vmadn	idata0, vtmp2, iscale1
	ldv	bitdata[0], 0(dm_in)
					vmudn	idata1, vtmp3, iscale1
	lbu	headr, 0(header_base)
					vmadn	idata1, vtmp4, iscale1

  # Scale the data
  # scalei/vscale still belong to the current frame at this point; the
  # andi in the delay slot already starts on the next frame's header
  # and executes whether or not the branch is taken.
	blez	scalei, ADPCMnoscale
	andi	tindex, headr, 0x0f

					vmudm	idata0, idata0, vscale[0]
					vmudm	idata1, idata1, vscale[0]

ADPCMnoscale:

  # Multiply out to give the 16 output values
  # vtmp0 and vtmp5 are used to avoid stalling
  #
  # First 8 outputs: one multiply-accumulate chain over
  #   col0*odata1[6] + col1*odata1[7]        (taken from the previous outputs)
  #   col2..col8 * idata0[0..6]              (taken from the current inputs)
  #   idata0 * vconst[5]
  # The scalar instructions in between recompute tindex, coef_index,
  # scalei and vscale for the next frame exactly as the first-frame
  # setup does.
  # NOTE(review): the col6 step names odata0 as destination instead of
  # vtmp0. The values used afterwards are read back with vsaw, so this
  # looks harmless - confirm it is intentional.

	sll	tindex, tindex, 5
					vand	vtmp1, imask1, bitdata[0]
	add	coef_index, tindex, coef_base
					vand	vtmp2, imask2, bitdata[1]
					vand	vtmp3, imask1, bitdata[2]
					vand	vtmp4, imask2, bitdata[3]
	srl	scalei, headr, 0x04
					vmudh	vtmp0, col0, odata1[6]
	addi	tmp1, zero, 12
					vmadh	vtmp0, col1, odata1[7]
	sub	scalei, tmp1, scalei
					vmadh	vtmp0, col2, idata0[0]
	addi	tmp1, scalei, -1
					vmadh	vtmp0, col3, idata0[1]
	addi	tmp2, zero, 1
					vmadh	vtmp0, col4, idata0[2]
  	sll	tmp2, tmp2, 15
					vmadh	vtmp0, col5, idata0[3]
	srlv	scale, tmp2, tmp1
					vmadh	odata0, col6, idata0[4]
	mtc2	scale, vscale			# vscale is meaningless if scalei is zero
					vmadh	vtmp0, col7, idata0[5]
					vmadh	vtmp0, col8, idata0[6]
					vmadh	vtmp0, idata0, vconst[5]

 # Read the accumulated result back: fractional part to odataf, the
 # rest to odata0 (vtmp5 is only a placeholder operand)
					vsaw	odataf, vtmp5, odata0[1]
					vsaw	odata0, vtmp5, odata0[0]
 # Rescale first output
					vmudn	vtmp0, odataf, vconst[4]
					vmadh	odata0, odata0, vconst[4]

 # Second set of 8 outputs
 # Same chain with idata1, fed back from odata0[6]/odata0[7] (the two
 # newest outputs of the first set). Each column is reloaded for the
 # next frame by the lrv/lqv that follows its last use in this chain,
 # walking ncoef_index back 2 bytes per column from the new coef_index.
					vmudh	vtmp0, col2, idata1[0]
	addi	ncoef_index, coef_index,-2
					vmadh	vtmp0, col3, idata1[1]
	lrv	col2[0], 32(ncoef_index)
					vmadh	vtmp0, col4, idata1[2]
	addi	ncoef_index, ncoef_index,-2
					vmadh	vtmp0, col5, idata1[3]
	lrv	col3[0], 32(ncoef_index)
					vmadh	vtmp0, col6, idata1[4]
	addi	ncoef_index, ncoef_index,-2
					vmadh	vtmp0, col7, idata1[5]
	lrv	col4[0], 32(ncoef_index)
					vmadh	vtmp0, col8, idata1[6]
	addi	ncoef_index, ncoef_index,-2
					vmadh	vtmp0, idata1, vconst[5]
	lrv	col5[0], 32(ncoef_index)
					vmadh	vtmp0, col0, odata0[6]
	addi	ncoef_index, ncoef_index,-2
					vmadh	vtmp0, col1, odata0[7]
	lrv	col6[0], 32(ncoef_index)
					vsaw	odataf, vtmp5, odata1[1]
	addi	ncoef_index, ncoef_index,-2
					vsaw	odata1, vtmp5, odata1[0]
	lrv	col7[0], 32(ncoef_index)
	addi	ncoef_index, ncoef_index,-2
	lrv	col8[0], 32(ncoef_index)
	lqv	col0[0], 0(coef_index)

 # Rescale output
					vmudn	vtmp0, odataf, vconst[4]
	lqv	col1[0], 16(coef_index)
					vmadh	odata1, odata1, vconst[4]
	addi	count, count, -32		# 32 per frame, matching the 32 output bytes (16 samples)

  # Store the 16 decoded outputs of this frame (odata0 then odata1).
  # odata1 stays live as the "previous outputs" for the next pass.
	sdv	odata0[0], 0(dm_out)
	sdv	odata0[8], 8(dm_out)
	sdv	odata1[0], 16(dm_out)
	sdv	odata1[8], 24(dm_out)

  # Increment output and input and goto floop
  # The addi is in the branch delay slot, so dm_out is advanced on the
  # final pass as well; ADPCMdone compensates by subtracting 32.

	bgtz	count, ADPCMfloop
	addi	dm_out, dm_out, 32		# 32 bytes/frame of output

  # Release the temporaries used by the first-frame setup and the loop

.unname	bitdata
.unname	vtmp0
.unname	vtmp1
.unname	vtmp2
.unname	vtmp3
.unname	vtmp4
.unname	vtmp5

.unname	headr
.unname	tmp1
.unname	tmp2
.unname scale

  # $1/$2/$3 are set up before the call to DMAwrite in the same way as
  # for DMAread: DMEM address, DRAM address, length.

.name	tmp0,	$1
.name	tmp1,	$2
.name	ssize,	$3

  # Save the decoder state back to DRAM and finish the command.
  # dm_out - 32 is the last 32 bytes written by the loop, or the
  # loaded/cleared state itself when count was zero and the loop never
  # ran. Those 32 bytes are written to state_addr.
ADPCMdone:
	addi	tmp0, dm_out, -32	# Back to before the last frame
	addi	tmp1, state_addr, 0
	jal	DMAwrite
	addi	ssize, zero, ADPCM_STATE_SIZE	# delay slot: length is set before DMAwrite runs

  # Wait for the DMA to finish ($5 is used directly as scratch)
ADPCMwait2:
	mfc0	$5, DMA_BUSY
	bne	$5, zero, ADPCMwait2
	nop

	j	AudDone
	mtc0	$0, SP_RESERVED		# delay slot: release the semaphore on the way out

.unname tmp0
.unname tmp1
.unname	ssize

  # Release the semi-permanent scalar register names declared at the
  # top of the file

.unname	header_base
.unname	dm_in
.unname dm_out
.unname	count
.unname	state_addr
.unname	static_base
.unname	coef_base
.unname	scalei
.unname	coef_index
.unname	ncoef_index
.unname	tindex

  # Release the semi-permanent vector register names

.unname vconst
.unname	idata0
.unname idata1
.unname odata0
.unname odata1
.unname	odataf
.unname	imask1
.unname	imask2
.unname	iscale1
.unname	vscale
.unname	col0
.unname	col1
.unname	col2
.unname	col3
.unname	col4
.unname	col5
.unname	col6
.unname	col7
.unname	col8