/* mc.s */
#include "rsp.h"

#include "mc.h"

.base           MSPBOOTBASE

/*
 * mc_start -- one-time setup at the entry point.
 *
 * Sets store_flag to 1 and fills the four-word jump table at SWITCH_TABLE
 * with the addresses named X0Y0, X1Y0, X0Y1 and X1Y1, in that order, so the
 * table can be indexed with (2*yh)+xh.  Execution then continues with the
 * code that follows this block.
 *
 * Writes: store_flag, dum, dum2, SWITCH_TABLE+0 .. SWITCH_TABLE+15.
 *
 * Each address is assembled as lui (upper half) followed by ori (lower
 * half).  lui replaces the whole register, so it has to come first; loading
 * the lower half first and then issuing lui would discard the lower half.
 * ori is used for the second step because it does not sign-extend its
 * immediate.
 *
 * NOTE(review): the table entries are spelled in upper case (X0Y0 ...)
 * while the routines in this file are labelled in lower case (x0y0 ...).
 * Presumably a header maps one onto the other -- confirm.
 */
mc_start:

	nop
	addi	store_flag , r0, 1		/* Always store results */

	addi	dum2,	r0, SWITCH_TABLE	/* dum2 = table base */

	lui	dum,	    UPPER( X0Y0 )	/* entry 0: xh=0, yh=0 */
	ori	dum,	dum, LOWER( X0Y0 )
	sw	dum,	0(dum2)

	lui	dum,	    UPPER( X1Y0 )	/* entry 1: xh=1, yh=0 */
	ori	dum,	dum, LOWER( X1Y0 )
	sw	dum,	4(dum2)

	lui	dum,	    UPPER( X0Y1 )	/* entry 2: xh=0, yh=1 */
	ori	dum,	dum, LOWER( X0Y1 )
	sw	dum,	8(dum2)

	lui	dum,	    UPPER( X1Y1 )	/* entry 3: xh=1, yh=1 */
	ori	dum,	dum, LOWER( X1Y1 )
	sw	dum,	12(dum2)

/*
 * next_block -- main loop; one pass per block.  Ends with an unconditional
 * jump back to itself, so it never returns.
 *
 * Per pass:
 *   1. Load mvx / mvy from MVX / MVY.
 *   2. Luma: form the index (2*yh)+xh from the low bit of each component,
 *      fetch the routine address from SWITCH_TABLE into mc4, set up
 *      xLbase / xRbase / xObase, call mcFixYAddrs, then call the selected
 *      routine four times.
 *   3. Chroma: halve mvx / mvy with rounding toward zero, compare the
 *      result with MVCX / MVCY and branch to error on a mismatch, then
 *      repeat the selection with the chroma bases, call mcFixCrAddrs and
 *      call the selected routine twice.
 *
 * The nop triples marked "Break here" are, per their comments, the places
 * where input is supplied and output is collected.
 *
 * Writes: mvx, mvy, xh, yh, dum, dum2 (alias xyh), mc4, xLbase, xRbase,
 * xObase, plus whatever the called routines write.
 */
next_block:

	nop
	nop					/* Break here to get input */
	nop

	/* Fetch the motion vector for this block. */
	lw	mvx,	MVX(r0)
	lw	mvy,	MVY(r0)

	/* xyh is just another name for dum2; it is consumed by the sll below. */
#define	xyh	dum2

	/* Low (fraction) bit of each component selects the routine. */
	andi	xh,	mvx, 1
	andi	yh,	mvy, 1
	sll	xyh,	yh,  1
	or	xyh,	xyh, xh			/* XYH = (2*yh)+xh */

	sll	dum,	xyh,  2			/* index -> byte offset of a word entry */
	lw	mc4, SWITCH_TABLE(dum)		/* mc4 = address of the selected routine */

	/* Luma source / destination pointers. */
	sra	dum,	mvx, 1			/* toss the fraction */
	andi	dum,	dum, 0xf		/* remaining offset within MB */
	addi	xLbase,	dum, LBASE
	addi	xRbase,	dum, RBASE
	addi	xObase,	r0, OBASE
	jal	mcFixYAddrs			/* sets Lbasep1 / Rbasep1 */
	nop
	/* Four calls; each one fills v0..v3 and advances the pointers by 64. */
	jalr	mc4
	nop
	jalr	mc4
	nop
	jalr	mc4
	nop
	jalr	mc4
	nop

	/* Chroma */

	/* mvx = mvx / 2, rounded toward zero. */
	sra	dum2,	mvx, 31			/* dum2 = -1 if mvx < 0, else 0 */
	and	dum2,	dum2, xh		/* dum2 = 1 only when mvx is negative and odd */
	add	mvx,	dum2, mvx
	sra	mvx,	mvx, 1

	/* mvy = mvy / 2, rounded toward zero (same steps as for mvx). */
	sra	dum,	mvy, 31
	and	dum,	dum, yh
	add	mvy,	dum, mvy
	sra	mvy,	mvy, 1

	/* Cross-check against the chroma vector supplied in MVCX / MVCY. */
	lw	dum,	MVCX(r0)
	lw	dum2,	MVCY(r0)

	bne	dum,  mvx, error
	nop
	bne	dum2, mvy, error
	nop

	/* Routine selection again, now from the halved vector. */
	andi	xh,	mvx, 1
	andi	yh,	mvy, 1
	sll	xyh,	yh,  1
	or	xyh,	xyh, xh			/* XYH = (2*yh)+xh */

	sll	dum,	xyh,  2			/* index -> byte offset of a word entry */
	lw	mc4, SWITCH_TABLE(dum)

	/* Chroma source / destination pointers: offset is 0, 2, ... 14. */
	sra	dum,	mvx, 1			/* toss the fraction */
	andi	dum,	dum, 0x7		/* remaining offset within MB */
	sll	dum,	dum, 1			/* Fix for UV interleave */
	addi	xLbase,	dum, LCrBASE
	addi	xRbase,	dum, RCrBASE
	addi	xObase,	r0, OCrBASE

	jal	mcFixCrAddrs			/* sets Lbasep1 / Rbasep1 */
	nop
	/* Two calls for chroma. */
	jalr	mc4
	nop
	jalr	mc4
	nop

	nop
	nop					/* Break here to write output */
	nop

	j	next_block			/* loop forever */
	nop

/*
 * mcFixYAddrs -- set up the "+1" source pointers for the luma pass.
 *
 * In:       mvx, xLbase, xRbase
 * Out:      Lbasep1, Rbasep1
 * Clobbers: dum
 *
 * Lbasep1 = xLbase + 1 is written on every call: the addi sits directly
 * after the first blez (assuming the usual MIPS branch delay slot, which the
 * nop after every jump in this file suggests).  Rbasep1 = xRbase + 1 is
 * written only when mvx is odd.
 *
 * When mvx is odd and (mvx & 0x1f) == 31, Lbasep1 is replaced by
 * Rbasep1 - 16.  With the luma setup in next_block (offset (mvx >> 1) & 0xf
 * added to LBASE / RBASE) that is the offset-15 case, and the result equals
 * RBASE.
 */
mcFixYAddrs:
	andi	dum,	mvx,	1		/* fraction bit of mvx */
	blez	dum,	no_fix			/* even: nothing more to do */
	    addi	Lbasep1, xLbase, 1
	    addi	Rbasep1, xRbase, 1

	    andi	dum,	mvx, 0x1f	/* What about 15.5 case? */
	    addi	dum,	dum, -30	/* > 0 only when the masked value is 31 */
	    blez 	dum,	no_fix
	    nop					/* Could be useful */
		addi	Lbasep1, Rbasep1, -16
no_fix:	ret
	nop

/*
 * mcFixCrAddrs -- set up the "+1 pixel" source pointers for the chroma pass.
 *
 * In:       mvx (the halved, chroma vector), xLbase, xRbase
 * Out:      Lbasep1, Rbasep1
 * Clobbers: dum
 *
 * The step is 2 bytes because the chroma data is UV interleaved.
 * Lbasep1 = xLbase + 2 is written on every call (the addi directly follows
 * the first blez); Rbasep1 = xRbase + 2 is written only when mvx is odd.
 *
 * The chroma offset set up by the caller is ((mvx >> 1) & 7) << 1, i.e.
 * 0..14.  Adding 2 leaves the 16-byte row only when that offset is 14,
 * which for an odd mvx means (mvx & 0xf) == 0xf.  In that case Lbasep1 is
 * redirected to Rbasep1 - 16.
 *
 * The test uses a 4-bit mask and a threshold of 14.  The luma constants
 * (0x1f / -30) would catch only (mvx & 0x1f) == 0x1f and miss
 * (mvx & 0x1f) == 0x0f, although both produce the same xLbase and Lbasep1.
 */
mcFixCrAddrs:
	andi	dum,	mvx,	1		/* fraction bit of mvx */
	blez	dum,	no_fix2			/* even: nothing more to do */
	    addi	Lbasep1, xLbase, 2
	    addi	Rbasep1, xRbase, 2

	    andi	dum,	mvx, 0xf	/* 3 offset bits + fraction bit */
	    addi	dum,	dum, -14	/* > 0 only when the masked value is 15 */
	    blez 	dum,	no_fix2
	    nop					/* Could be useful */
		addi	Lbasep1, Rbasep1, -16

					/* We don't need to fix Rbasep1 since
					   it is QW aligned already, so
					   LRQ has no effect! */
no_fix2: ret
	nop

/*
 * error -- reached when the halved luma vector does not match the chroma
 * vector read from MVCX / MVCY.
 *
 * The nop triple is kept as a place to stop on.  Execution then parks in a
 * tight loop: without the jump it would run on into the code that follows
 * this block and return through a link register that was not set up for it.
 */
error:
	nop
	nop					/* OOPS! */
	nop
	j	error				/* stay here */
	nop

/*
 * x0y0 -- routine for xh = 0, yh = 0: plain copy of four rows.
 *
 * In:   xLbase, xRbase, xObase, store_flag
 * Out:  v0..v3 = rows at byte offsets 0, 16, 32, 48
 *       xLbase, xRbase advanced by 64; xObase advanced by 64 when storing
 *
 * Each row is assembled in one register by an lqv from xLbase followed by an
 * lrv from xRbase.  Presumably lqv supplies the part of the row up to the
 * next quadword boundary and lrv the remainder -- confirm against the ISA
 * documentation.
 *
 * finish is the common tail that the other routines in this file jump to.
 * It advances the source pointers, stores v0..v3 at xObase when
 * store_flag > 0, and returns.
 */
x0y0:
	lqv	v0[0],	 0(xLbase)
	lrv	v0[0],	 0(xRbase)
	lqv	v1[0],	16(xLbase)
	lrv	v1[0],	16(xRbase)
	lqv	v2[0],	32(xLbase)
	lrv	v2[0],	32(xRbase)
	lqv	v3[0],	48(xLbase)
	lrv	v3[0],	48(xRbase)
finish:
	/* Step both source pointers past the four rows just consumed. */
	addi	xLbase,	xLbase, 64
	addi	xRbase,	xRbase, 64
	blez	store_flag,	skip_stores	/* store_flag <= 0: keep results in registers only */
	nop

	/* Write the four result rows and step the output pointer. */
	sqv	v0[0],	 0(xObase)
	sqv	v1[0],	16(xObase)
	sqv	v2[0],	32(xObase)
	sqv	v3[0],	48(xObase)
	addi	xObase,	xObase, 64

skip_stores:
	ret
	nop

/*
 * x1y0 -- routine for xh = 1, yh = 0: four rows, each combined with the row
 * read through the "+1" pointers.
 *
 * In:   xLbase, xRbase, Lbasep1, Rbasep1
 * Out:  v0..v3; Lbasep1 and Rbasep1 advanced by 64; then continues at
 *       finish, which advances xLbase / xRbase, stores and returns.
 * Uses: t1 as scratch
 *
 * Per row: the row at xLbase/xRbase goes into vN, the row at
 * Lbasep1/Rbasep1 goes into t1, and vaddb combines them ("Avg" per the
 * original comments).  The [1] element field presumably selects the
 * averaging behaviour -- confirm against the ISA documentation.
 * The nop before each vaddb separates it from the preceding load of t1.
 */
x1y0:
	/* Row 0 */
	lqv	v0[0],	 0(xLbase)
	lrv	v0[0],	 0(xRbase)
	lqv	t1[0],	 0(Lbasep1)
	lrv	t1[0],	 0(Rbasep1)
	nop
	vaddb	v0,	v0, t1[1]		/* Avg */

	/* Row 1 */
	lqv	v1[0],	 16(xLbase)
	lrv	v1[0],	 16(xRbase)
	lqv	t1[0],	 16(Lbasep1)
	lrv	t1[0],	 16(Rbasep1)
	nop
	vaddb	v1,	v1, t1[1]		/* Avg */

	/* Row 2 */
	lqv	v2[0],	 32(xLbase)
	lrv	v2[0],	 32(xRbase)
	lqv	t1[0],	 32(Lbasep1)
	lrv	t1[0],	 32(Rbasep1)
	nop
	vaddb	v2,	v2, t1[1]		/* Avg */

	/* Row 3 */
	lqv	v3[0],	 48(xLbase)
	lrv	v3[0],	 48(xRbase)
	lqv	t1[0],	 48(Lbasep1)
	lrv	t1[0],	 48(Rbasep1)
	nop
	vaddb	v3,	v3, t1[1]		/* Avg */

	/* finish only advances xLbase / xRbase, so step the "+1" pointers here. */
	addi	Lbasep1, Lbasep1, 64
	addi	Rbasep1, Rbasep1, 64

	j	finish
	nop

#define t3 t1p1

/*
 * x0y1 -- routine for xh = 0, yh = 1: four output rows, each combined from
 * two consecutive input rows.
 *
 * In:   xLbase, xRbase
 * Out:  v0..v3; then continues at finish, which advances xLbase / xRbase,
 *       stores and returns.
 * Uses: t1, t2, t3 as scratch (t3 is an alias of t1p1, see the #define
 *       just before this routine)
 *
 * Five input rows are read (offsets 0, 16, 32, 48, 64).  The three scratch
 * registers are rotated so each row is loaded once:
 *   v0 = rows 0,1   v1 = rows 1,2   v2 = rows 2,3   v3 = rows 3,4
 * The [1] element field on vaddb presumably selects the averaging
 * behaviour ("Avg" per the original comments) -- confirm against the ISA
 * documentation.
 */
x0y1:
	lqv	t1[0],	 0(xLbase)		/* t1 = row 0 */
	lrv	t1[0],	 0(xRbase)
	lqv	t2[0],	 16(xLbase)		/* t2 = row 1 */
	lrv	t2[0],	 16(xRbase)
	nop
	vaddb	v0,	t1, t2[1]		/* Avg */

	lqv	t3[0],	 32(xLbase)		/* t3 = row 2 */
	lrv	t3[0],	 32(xRbase)
	nop
	vaddb	v1,	t2, t3[1]		/* Avg */

	lqv	t1[0],	 48(xLbase)		/* t1 = row 3 */
	lrv	t1[0],	 48(xRbase)
	nop
	vaddb	v2,	t3, t1[1]		/* Avg */

	lqv	t2[0],	 64(xLbase)		/* t2 = row 4 */
	lrv	t2[0],	 64(xRbase)
	nop
	vaddb	v3,	t1, t2[1]		/* Avg */

	j	finish
	nop

/*
 * x1y1 -- routine for xh = 1, yh = 1: four output rows, each combined from
 * two consecutive input rows and their "+1" counterparts.
 *
 * In:   xLbase, xRbase, Lbasep1, Rbasep1
 * Out:  v0..v3; Lbasep1 and Rbasep1 advanced by 64; then continues at
 *       finish, which advances xLbase / xRbase, stores and returns.
 * Uses: t1, t1p1, t2, t2p1 as scratch
 *
 * Five input rows are read (offsets 0, 16, 32, 48, 64) through both pointer
 * pairs.  The t1/t1p1 and t2/t2p1 pairs alternate so each row is loaded
 * once:
 *   v0 = rows 0,1   v1 = rows 2,1   v2 = rows 2,3   v3 = rows 4,3
 * Each output is a vaddb of one pair followed by a vaccb of the other pair
 * into the same register ("Avg of 4" per the original comments).  The [2]
 * element field presumably selects the matching scaling -- confirm against
 * the ISA documentation.
 */
x1y1:
	lqv	t1[0],	 0(xLbase)		/* Prefetch */
	lrv	t1[0],	 0(xRbase)
	lqv	t1p1[0], 0(Lbasep1)
	lrv	t1p1[0], 0(Rbasep1)

	lqv	t2[0],	 16(xLbase)		/* t2 / t2p1 = row 1 */
	lrv	t2[0],	 16(xRbase)
	lqv	t2p1[0], 16(Lbasep1)
	lrv	t2p1[0], 16(Rbasep1)
	/* No nop here: the vaddb reads only t1 / t1p1, which were loaded earlier. */
	vaddb	v0,	t1, t1p1[2]		/* Avg of 4 */
	vaccb	v0,	t2, t2p1[2]		/* Avg of 4 */

	lqv	t1[0],	 32(xLbase)		/* t1 / t1p1 = row 2 */
	lrv	t1[0],	 32(xRbase)
	lqv	t1p1[0], 32(Lbasep1)
	lrv	t1p1[0], 32(Rbasep1)
	nop
	vaddb	v1,	t1, t1p1[2]		/* Avg of 4 */
	vaccb	v1,	t2, t2p1[2]		/* Avg of 4 */

	lqv	t2[0],	 48(xLbase)		/* t2 / t2p1 = row 3 */
	lrv	t2[0],	 48(xRbase)
	lqv	t2p1[0], 48(Lbasep1)
	lrv	t2p1[0], 48(Rbasep1)
	nop
	vaddb	v2,	t1, t1p1[2]		/* Avg of 4 */
	vaccb	v2,	t2, t2p1[2]		/* Avg of 4 */

	lqv	t1[0],	 64(xLbase)		/* t1 / t1p1 = row 4 */
	lrv	t1[0],	 64(xRbase)
	lqv	t1p1[0], 64(Lbasep1)
	lrv	t1p1[0], 64(Rbasep1)
	nop
	vaddb	v3,	t1, t1p1[2]		/* Avg of 4 */
	vaccb	v3,	t2, t2p1[2]		/* Avg of 4 */

	/* finish only advances xLbase / xRbase, so step the "+1" pointers here. */
	addi	Lbasep1, Lbasep1, 64
	addi	Rbasep1, Rbasep1, 64

	j	finish
	nop