#include "idct.h"

/*
    Inputs:

	idct_cbp	- Coded Block Pattern (bit N is a flag for IDCTing block N)
	idct_in_dat	- Input Data address
	idct_out_dat	- Output Data Address (can be the same as idct_in_dat)
    
    Outputs:

	data at idct_out_dat - IDCT'ed Input Data
*/
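
/*
    For reference, a minimal C model of the dispatch behaviour described
    above, assuming each 8x8 block is 64 signed 16-bit coefficients
    (128 bytes) and that bit 0 of the coded block pattern selects the
    first block.  idct_8x8() is a hypothetical stand-in for the per-block
    transform implemented below.

	extern void idct_8x8(const short *in, short *out);

	void idct_blocks(unsigned cbp, const short *in, short *out)
	{
	    while (cbp != 0) {
	        if (cbp & 1)
	            idct_8x8(in, out);	// only blocks flagged in the CBP
	        cbp >>= 1;		// next block's flag
	        in  += 64;		// pointers advance one block (128 bytes)
	        out += 64;		// whether or not it was transformed
	    }
	}
*/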

#define _FIDCT
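
/*
    When _FIDCT is defined, the fast path is built: the final down-scaling
    is folded into the second-pass butterflies via the SCALEDN/NSCALEDN
    multiplies, and the explicit vrndn rounding sequences below are
    compiled out.
*/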
	
idct:	
	addi caddr, rzero, IDCT_CONST_BASE
	lqv a00[0], AMAT0L(caddr)
	lqv a08[0], AMAT0H(caddr)
	lqv a10[0], AMAT1L(caddr)
	lqv a18[0], AMAT1H(caddr)

	lqv consts[0], IDCT_CONSTS(caddr)

#ifndef _FIDCT
	vadd vone, vzero, consts[ONE]
#endif
	
	lqv	p0[0], ROW0(idct_in_dat)
	lqv	p1[0], ROW4(idct_in_dat)
	lqv	p2[0], ROW2(idct_in_dat)
	lqv	p3[0], ROW6(idct_in_dat)

idct_loop:
	andi	dum, idct_in_cbp, 1
	bgtz	dum, idct_do
	srl	idct_in_cbp, idct_in_cbp, 1
	blez	idct_in_cbp, idct_done
	addi	idct_in_dat, idct_in_dat, 128
	j	idct_loop
	addi	idct_out_dat, idct_out_dat, 128

idct_done:
	jr	return
	nop




idct_do:
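	/* Each output row below is a 4-term dot product formed in the vector
	   accumulator: the leading vmulf and intermediate vmacf ops target a
	   dummy register (vdum); only the final vmacf writes the result. */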
			lqv	q0[0], ROW1(idct_in_dat)	
	vmulf vdum, p0, a00[0]
			lqv	q1[0], ROW5(idct_in_dat)
	vmacf vdum, p1, a00[1]
			lqv	q2[0], ROW3(idct_in_dat)
	vmacf vdum, p2, a00[2]
			lqv	q3[0], ROW7(idct_in_dat)
	vmacf x00, p3, a00[3]
			addi	idct_in_dat, idct_in_dat, 128
	vmulf vdum, p0, a00[4]
	vmacf vdum, p1, a00[5]
	vmacf vdum, p2, a00[6]
	vmacf x01, p3, a00[7]

	vmulf vdum, p0, a08[0]
	vmacf vdum, p1, a08[1]
	vmacf vdum, p2, a08[2]
	vmacf x02, p3, a08[3]
	
	vmulf vdum, p0, a08[4]
	vmacf vdum, p1, a08[5]
	vmacf vdum, p2, a08[6]
	vmacf x03, p3, a08[7]

	
	/* 2nd Matrix */
	vmulf vdum, q0, a10[0]
	vmacf vdum, q1, a10[1]
	vmacf vdum, q2, a10[2]
	vmacf x10, q3, a10[3]

	vmulf vdum, q0, a10[4]
	vmacf vdum, q1, a10[5]
	vmacf vdum, q2, a10[6]
	vmacf x11, q3, a10[7]

	vmulf vdum, q0, a18[0]
	vmacf vdum, q1, a18[1]
	vmacf vdum, q2, a18[2]
	vmacf x12, q3, a18[3]

	vmulf vdum, q0, a18[4]
	vmacf vdum, q1, a18[5]
	vmacf vdum, q2, a18[6]
	vmacf x13, q3, a18[7]

	
	vadd x23, x03, x10		/* Butterflies with no rounding */
	vadd x22, x02, x11
	vadd x21, x01, x12
	vadd x20, x00, x13
	vsub x24, x03, x10
	vsub x25, x02, x11
	vsub x26, x01, x12
	vsub x27, x00, x13
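
/*
    A minimal C model of the 1-D pass above, under the assumption that the
    AMAT* tables hold a 4x4 "even" cosine matrix E (applied to input rows
    0,4,2,6) and a 4x4 "odd" cosine matrix O (applied to rows 1,5,3,7);
    the fixed-point contents of those tables are not reproduced here, and
    the Q15 vmulf/vmacf arithmetic is modelled with plain integer math.
    The RSP version runs this on eight columns at once, one per vector lane.

	void idct_1d_pass(const short in[8], int out[8],
	                  const int E[4][4], const int O[4][4])
	{
	    static const int even_idx[4] = { 0, 4, 2, 6 };
	    static const int odd_idx[4]  = { 1, 5, 3, 7 };
	    int e[4], o[4];

	    for (int k = 0; k < 4; k++) {
	        e[k] = o[k] = 0;
	        for (int j = 0; j < 4; j++) {
	            e[k] += E[k][j] * in[even_idx[j]];
	            o[k] += O[k][j] * in[odd_idx[j]];
	        }
	    }
	    // Butterflies: sums give outputs 0..3, differences give 7..4.
	    for (int k = 0; k < 4; k++) {
	        out[k]     = e[k] + o[3 - k];
	        out[7 - k] = e[k] - o[3 - k];
	    }
	}
*/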

#ifndef _FIDCT
	vmudm	vdum, x20, consts[HALF]
	vmadm	vdum, vone, consts[SUBHALF]
	vrndn	x20, $v0, consts[ONE]
	
	vmudm	vdum, x21, consts[HALF]
	vmadm	vdum, vone, consts[SUBHALF]
	vrndn	x21, $v0, consts[ONE]
	
	vmudm	vdum, x22, consts[HALF]
	vmadm	vdum, vone, consts[SUBHALF]
	vrndn	x22, $v0, consts[ONE]
	
	vmudm	vdum, x23, consts[HALF]
	vmadm	vdum, vone, consts[SUBHALF]
	vrndn	x23, $v0, consts[ONE]
	
	vmudm	vdum, x24, consts[HALF]
	vmadm	vdum, vone, consts[SUBHALF]
	vrndn	x24, $v0, consts[ONE]
	
	vmudm	vdum, x25, consts[HALF]
	vmadm	vdum, vone, consts[SUBHALF]
	vrndn	x25, $v0, consts[ONE]
	
	vmudm	vdum, x26, consts[HALF]
	vmadm	vdum, vone, consts[SUBHALF]
	vrndn	x26, $v0, consts[ONE]
		
	vmudm	vdum, x27, consts[HALF]
	vmadm	vdum, vone, consts[SUBHALF]
	vrndn	x27, $v0, consts[ONE]
#endif
			
	/* Transpose */

	addi	caddr, rzero, TEMP_BASE

	stv	x20[0], ROW0(caddr)
	stv	x20[2], ROW1(caddr)
	stv	x20[4], ROW2(caddr)
	stv	x20[6], ROW3(caddr)
	stv	x20[8], ROW4(caddr)
	stv	x20[10], ROW5(caddr)
	stv	x20[12], ROW6(caddr)
	stv	x20[14], ROW7(caddr)
	
	nop
	nop

	ltv	p0[0], ROW0(caddr)
	ltv	p0[14], ROW1(caddr)
	ltv	p0[12], ROW2(caddr)
	ltv	p0[10], ROW3(caddr)
	ltv	p0[8], ROW4(caddr)
	ltv	p0[6], ROW5(caddr)
	ltv	p0[4], ROW6(caddr)
	ltv	p0[2], ROW7(caddr)
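
/*
    The staggered element offsets on the stv/ltv pairs above transpose the
    8x8 block held in x20..x27 back into the p/q row registers for the
    column pass.  A C model of this step:

	void transpose_8x8(const short in[8][8], short out[8][8])
	{
	    // The second pass then reads the former columns as rows.
	    for (int r = 0; r < 8; r++)
	        for (int c = 0; c < 8; c++)
	            out[c][r] = in[r][c];
	}
*/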

		/* 2nd Pass */

	vmulf vdum, p0, a00[0]
	vmacf vdum, p1, a00[1]
	vmacf vdum, p2, a00[2]
	vmacf x00, p3, a00[3]

	vmulf vdum, p0, a00[4]
	vmacf vdum, p1, a00[5]
	vmacf vdum, p2, a00[6]
	vmacf x01, p3, a00[7]

	vmulf vdum, p0, a08[0]
	vmacf vdum, p1, a08[1]
	vmacf vdum, p2, a08[2]
	vmacf x02, p3, a08[3]

	vmulf vdum, p0, a08[4]
	vmacf vdum, p1, a08[5]
	vmacf vdum, p2, a08[6]
	vmacf x03, p3, a08[7]

	vmulf vdum, q0, a10[0]		/* 2nd Matrix */
	vmacf vdum, q1, a10[1]
	vmacf vdum, q2, a10[2]
	vmacf x10, q3, a10[3]

	vmulf vdum, q0, a10[4]
	vmacf vdum, q1, a10[5]
	vmacf vdum, q2, a10[6]
	vmacf x11, q3, a10[7]

	vmulf vdum, q0, a18[0]
	vmacf vdum, q1, a18[1]
	vmacf vdum, q2, a18[2]
	vmacf x12, q3, a18[3]

			lqv	p0[0],	ROW0(idct_in_dat)
	vmulf vdum, q0, a18[4]
			lqv	p1[0],	ROW4(idct_in_dat)
	vmacf vdum, q1, a18[5]
			lqv	p2[0],	ROW2(idct_in_dat)
	vmacf vdum, q2, a18[6]
			lqv	p3[0],	ROW6(idct_in_dat)
	vmacf x13, q3, a18[7]

	
#ifdef _FIDCT
	vmulf x23, x03, consts[SCALEDN]
	vmacf x23, x10, consts[SCALEDN]
	vmacf x24, x10, consts[NSCALEDN]

	vmulf x22, x02, consts[SCALEDN]
	vmacf x22, x11, consts[SCALEDN]
	vmacf x25, x11, consts[NSCALEDN]
			sqv	x23[0],	ROW3(idct_out_dat)

	vmulf x21, x01, consts[SCALEDN]
			sqv	x24[0], ROW4(idct_out_dat)
	vmacf x21, x12, consts[SCALEDN]
	vmacf x26, x12, consts[NSCALEDN]
			sqv	x22[0], ROW2(idct_out_dat)
	
	vmulf x20, x00, consts[SCALEDN]
			sqv	x25[0], ROW5(idct_out_dat)
	vmacf x20, x13, consts[SCALEDN]
	vmacf x27, x13, consts[NSCALEDN]
			sqv	x21[0], ROW1(idct_out_dat)

	sqv	x26[0], ROW6(idct_out_dat)
	sqv	x20[0], ROW0(idct_out_dat)
	sqv	x27[0], ROW7(idct_out_dat)
	
#else
	vadd x23, x03, x10		/* Butterflies with no rounding */
	vadd x22, x02, x11
	vadd x21, x01, x12
	vadd x20, x00, x13
	vsub x24, x03, x10
	vsub x25, x02, x11
	vsub x26, x01, x12
	vsub x27, x00, x13
#endif
/*
	Final Rounding and Scaling
*/

#define bit0	x00
#define bit1	x01
#define bit2	x02
#define bit3	x03

#define bit4	x10
#define bit5	x11
#define bit6	x12
#define bit7	x13

	/*  The x2's now have 6 bits of fraction that need to be
	    properly rounded.  Symmetric rounding is very important
	    here since a bias will affect about 1% (1/128) of the
	    values.

	    The following gives symmetric rounding:

	    if( x2 < 0 )
		result = (x2 + 0x1f) >> 6;
	    else
		result = (x2 + 0x20) >> 6;
	    
	    I have implemented this via:

	    accum = 0
	    accum += x2*(-1)
	    bit   = accum>>15;	(-1 if x2 > 0, else 0 )

	    accum = (x2 + 0x20) * (1<<(15-6));
	    accum += bit
	    result = accum >> 15;
	*/
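
	/*  A minimal C model of the symmetric rounding described above
	    (assuming arithmetic right shift on negative values); the
	    accumulator-based trick in the commented-out code below aims to
	    produce the same results without a data-dependent branch.

		short round6(int x2)
		{
		    if (x2 < 0)
		        return (short)((x2 + 0x1f) >> 6);
		    else
		        return (short)((x2 + 0x20) >> 6);
		}

	    For example, round6(32) == 1 and round6(-32) == -1, whereas the
	    biased (x2 + 0x20) >> 6 would give 1 and 0 for the same inputs.
	*/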

	/* Sets bit0 to -1 if x20>0 */
/*	
	vmudh bit0, x20, consts[ZERO]
	vmacf bit0, x20, consts[NEG1]	
	vmudh bit1, x21, consts[ZERO]
	vmacf bit1, x21, consts[NEG1]
	vmudh bit2, x22, consts[ZERO]
	vmacf bit2, x22, consts[NEG1]
	vmudh bit3, x23, consts[ZERO]
	vmacf bit3, x23, consts[NEG1]
	vmudh bit4, x24, consts[ZERO]
	vmacf bit4, x24, consts[NEG1]
	vmudh bit5, x25, consts[ZERO]
	vmacf bit5, x25, consts[NEG1]
	vmudh bit6, x26, consts[ZERO]
	vmacf bit6, x26, consts[NEG1]
	vmudh bit7, x27, consts[ZERO]
	vmacf bit7, x27, consts[NEG1]
*/
	/* Decrements x20>>6 if x20>0 */
/*	
	vmulf vdum,  x20, consts[SCALEDN]
	vmacf x20, bit0, consts[ONE]	
	vmulf vdum,  x21, consts[SCALEDN]
	vmacf x21, bit1, consts[ONE]
	vmulf vdum,  x22, consts[SCALEDN]
	vmacf x22, bit2, consts[ONE]
	vmulf vdum,  x23, consts[SCALEDN]
	vmacf x23, bit3, consts[ONE]
	vmulf vdum,  x24, consts[SCALEDN]
	vmacf x24, bit4, consts[ONE]
	vmulf vdum,  x25, consts[SCALEDN]
	vmacf x25, bit5, consts[ONE]
	vmulf vdum,  x26, consts[SCALEDN]
	vmacf x26, bit6, consts[ONE]
	vmulf vdum,  x27, consts[SCALEDN]
	vmacf x27, bit7, consts[ONE]
*/

#ifndef _FIDCT
	vmudm	vdum, x20, consts[SHIFT5]
	vmadm	vdum, vone, consts[SUBHALF]
	vrndn	x20, $v0, consts[ONE]
	
	vmudm	vdum, x21, consts[SHIFT5]
	vmadm	vdum, vone, consts[SUBHALF]
	vrndn	x21, $v0, consts[ONE]
	
	vmudm	vdum, x22, consts[SHIFT5]
	vmadm	vdum, vone, consts[SUBHALF]
	vrndn	x22, $v0, consts[ONE]
	
	vmudm	vdum, x23, consts[SHIFT5]
	vmadm	vdum, vone, consts[SUBHALF]
	vrndn	x23, $v0, consts[ONE]
	
	vmudm	vdum, x24, consts[SHIFT5]
	vmadm	vdum, vone, consts[SUBHALF]
	vrndn	x24, $v0, consts[ONE]
	
	vmudm	vdum, x25, consts[SHIFT5]
	vmadm	vdum, vone, consts[SUBHALF]
	vrndn	x25, $v0, consts[ONE]
	
	vmudm	vdum, x26, consts[SHIFT5]
	vmadm	vdum, vone, consts[SUBHALF]
	vrndn	x26, $v0, consts[ONE]
		
	vmudm	vdum, x27, consts[SHIFT5]
	vmadm	vdum, vone, consts[SUBHALF]
	vrndn	x27, $v0, consts[ONE]
	
	sqv	x20[0], ROW0(idct_out_dat)
	sqv	x21[0], ROW1(idct_out_dat)
	sqv	x22[0], ROW2(idct_out_dat)
	sqv	x23[0], ROW3(idct_out_dat)
	sqv	x24[0], ROW4(idct_out_dat)
	sqv	x25[0], ROW5(idct_out_dat)
	sqv	x26[0], ROW6(idct_out_dat)
	sqv	x27[0], ROW7(idct_out_dat)
#endif

	blez	idct_in_cbp, idct_done
	addi	idct_out_dat, idct_out_dat, 128
	j	idct_loop
	nop
	
#include "idct_un.h"