/* vmult.c 9.63 KB */

/**************************************************************************
 *                                                                        *
 *               Copyright (C) 1994, Silicon Graphics, Inc.               *
 *                                                                        *
 *  These coded instructions, statements, and computer programs  contain  *
 *  unpublished  proprietary  information of Silicon Graphics, Inc., and  *
 *  are protected by Federal copyright  law.  They  may not be disclosed  *
 *  to  third  parties  or copied or duplicated in any form, in whole or  *
 *  in part, without the prior written consent of Silicon Graphics, Inc.  *
 *                                                                        *
 *************************************************************************/

/*
 * File:	vmult.c
 * Creator:	hsa@sgi.com
 * Create Date:	Tue Mar  8 10:58:45 PST 1994
 *
 * This file holds the multiply instructions for the VU.
 *
 */

#include <stdio.h>
#include "rsp.h"
#include "i128.h"
#include "rspctl.h"
#include "opcode.h"
#include "vu.h"
#include "trace_print.h"


/*
 * Extract the inclusive bit field [high:low] (bit 0 = LSB) of a 64-bit
 * value, right-justified.  Requires 0 <= low <= high <= 63.
 *
 * Every argument is parenthesized so expressions such as `a | b` or
 * `n + 4` may be passed safely, and the shift is done on an unsigned
 * value so a negative (signed) dword does not depend on how the compiler
 * implements right shifts of negative numbers.
 */
#define ExtractBits64(dword, high, low) 	\
		(((unsigned long long)(dword) >> (low)) & \
		 (0xffffffffffffffffULL >> (63 - ((high) - (low)))))

/*
 * Slots holding the multiply instructions currently in flight.
 * The functions in this file scan indices 0 .. rsp_VUPIPEDEPTH-1; the
 * extra (+1) element is not touched by any code visible here.
 */
rsp_vuPipe_t	vu_MultPipe[rsp_VUPIPEDEPTH+1];


/*
 * Decode and execute one multiply-class VU instruction held in *mp.
 *
 * This code is ugly. All the multiply variants share one data path that is
 * steered by a handful of parameters chosen in the first switch; the
 * alternative is lots more code, 90% of it repeated for each case.
 *
 * Parameters selected per opcode:
 *   pshift      shift applied to the partial product (left if > 0,
 *               logical right if < 0)
 *   round       constant added to the shifted product
 *   doaccum_rnd forces accumulation with rsp_ACC[] (vrndp / vrndn)
 *   ashift      bit position in the accumulator where the 16-bit result
 *               is taken from
 *   clamplo     lowest bit of the "overflow area" tested when clamping
 *   clampMin/   values substituted (shifted up by ashift) when the
 *   clampMax    accumulator overflows negative / positive
 *   type_u      when set, any negative accumulator yields a result of 0
 *   mulqMask    mask ANDed with the final 16-bit result
 *
 * Side effects: fills in mp->format/rt/rs/rd/opString, locks mp->rd via
 * rsp_VURegLock() on every call (including calls that end up stalling).
 * If rt or rs is locked the instruction is marked stalled, rsp_VUStalled
 * is set and mp->delay is left unchanged.  Otherwise all 8 elements are
 * computed, rsp_ACC[] and mp->result are updated, and mp->delay is
 * decremented.  Write-back of mp->result to the register file is not
 * done here.
 */
static void
rsp_VUMultExec(rsp_vuPipe_t *mp)
{
    boolean     doaccum;
    int         fmt, element, i, round, pshift, ashift, clamplo, type_u, doaccum_rnd;
    u16         du, su, tu, mulqMask;
    i16         si, ti;
    u64         clampMin, clampMax;
    i64         taccum, clampMask;
    i32         macq_bits;
    int         acc_bit21;

    /* decode instruction */
    doaccum = ExtractBits(mp->inst, 3, 3);
    fmt = ExtractBits(mp->inst, 2, 0);
    mp->format = ExtractBits(mp->inst, 24, 21);
    mp->rt = ExtractBits(mp->inst, 20, 16);
    mp->rs = ExtractBits(mp->inst, 15, 11);
    mp->rd = ExtractBits(mp->inst, 10,  6);

    /* initialize parameters to most common values */
    type_u = 0;
    pshift = 0;
    round = 0;
    doaccum_rnd = 0;
    mulqMask = 0xffff;
    ashift = 0;
    clamplo = 31;
    clampMin = 0xffffffffffff8000;      /* most negative signed 16-bit number */
    clampMax = 0x0000000000007fff;      /* most positive signed 16-bit number */

    /* tweak parameters based on what kind of mult */
    switch (fmt) {
      case 0x00:        /* vmulf vmacf */
        strcpy(mp->opString, (doaccum ? "vmacf" : "vmulf"));
        round = (doaccum ? 0 : 32768);
        ashift = 16;
        pshift = 1;
        break;

      case 0x01:        /* vmulu vmacu */
        strcpy(mp->opString, (doaccum ? "vmacu" : "vmulu"));
        round = (doaccum ? 0 : 32768);
        ashift = 16;
        type_u = 1;
        pshift = 1;
        clampMin = 0x0000000000000000;  /* most negative unsigned 16-bit number */
        /* all ones: after << ashift the extracted 16-bit field is 0xffff */
        clampMax = 0xffffffffffffffff;  /* most positive unsigned 16-bit number */
        break;

      case 0x02:        /* vrndp, vrndn */
        strcpy(mp->opString, (doaccum ? "vrndn" : "vrndp"));
        doaccum_rnd = 1;        /* used for ACCU always when vrndn or vrndp */
        ashift = 16;
        if (mp->rs & 1)         /* Shift Round amount up by 16 sometimes */
            pshift = 16;
        else
            pshift = 0;
        break;

      case 0x03:        /* vmulq, vmacq */
        strcpy(mp->opString, (doaccum ? "vmacq" : "vmulq"));
        /* for vmulq this is recomputed per element in the loop below */
        round = (doaccum ? 0 : (31 << 16));
        ashift = 17;
        pshift = 16;
        clamplo = 32;
        mulqMask = 0xfff0;
        break;

      case 0x04:        /* vmudl vmadl */
        strcpy(mp->opString, (doaccum ? "vmadl" : "vmudl"));
        pshift = -16;
        clampMin = 0x0000;
        clampMax = 0xffff;
        break;

      case 0x05:        /* vmudm vmadm */
        strcpy(mp->opString, (doaccum ? "vmadm" : "vmudm"));
        ashift = 16;
        break;

      case 0x06:        /* vmudn vmadn */
        strcpy(mp->opString, (doaccum ? "vmadn" : "vmudn"));
        clampMin = 0x0000;
        clampMax = 0xffff;
        break;

      case 0x07:        /* vmudh vmadh */
        strcpy(mp->opString, (doaccum ? "vmadh" : "vmudh"));
        pshift = 16;
        ashift = 16;
        break;
    }

    /* mask covering every accumulator bit at or above clamplo */
    clampMask = ~(((u64)0x0000000000000001 << clamplo) - 1);

    /* NOTE(review): the destination is locked before the stall check and
     * again on every retry of a stalled instruction -- confirm intended. */
    rsp_VURegLock(mp->rd, mp->pc);

    /* check for stall. If okay, execute and decrement delay field */
    if (rsp_VURegIsLocked(mp->rt, mp->pc) || rsp_VURegIsLocked(mp->rs, mp->pc)) {
        /* can't do anything right now... */
        mp->stalled = TRUE;
        rsp_VUStalled = TRUE;
        rsp_Verbose(stderr,"VU mult stalled... (%08x)\n",mp->pc);
    } else {
        /* execute */
        for (i=0; i<8; i++) {

            /* handle vt[e] as a scalar: pick the vt element for lane i */
            element = ((i & cmask_tab[(mp->format & 0x0e) >> 1]) +
                       (mp->format & emask_tab[(mp->format & 0x0e) >> 1]));

            /* form the (unshifted) partial product for this lane */
            switch (fmt) {
              case 0x00:        /* vmulf vmacf */
                si = (i16) Get128By16(&(rsp_VUR[mp->rs]), i);
                ti = (i16) Get128By16(&(rsp_VUR[mp->rt]), element);
                taccum = (i64) ((i64)si * (i64)ti);
                break;

              case 0x01:        /* vmulu vmacu */
                si = (i16) Get128By16(&(rsp_VUR[mp->rs]), i);
                ti = (i16) Get128By16(&(rsp_VUR[mp->rt]), element);
                taccum = (i64) ((i64)si * (i64)ti);
                break;

              case 0x02:        /* vrndn, vrndp */
                /* add vt only when the accumulator has the matching sign */
                ti = (i16) Get128By16(&(rsp_VUR[mp->rt]), element);
                if (doaccum) {                  /* vrndn */
                    if (rsp_ACC[i] < 0)
                        taccum = ti;
                    else
                        taccum = 0;
                } else {                        /* vrndp */
                    if (rsp_ACC[i] >= 0)
                        taccum = ti;
                    else
                        taccum = 0;
                }
                break;

              case 0x03:        /* vmulq, vmacq */
                if (doaccum) {
                    /* vmacq just does oddification... */
                    acc_bit21 = ExtractBits64(rsp_ACC[i], 21, 21);
                    macq_bits = rsp_ACC[i] >> 22;
                    if (macq_bits < 0 && acc_bit21 == 0) {
                        taccum = 32;
                    } else if (macq_bits > 0 && acc_bit21 == 0) {
                        taccum = -32;
                    } else {
                        taccum = 0;
                    }
                } else {
                    si = (i16) Get128By16(&(rsp_VUR[mp->rs]), i);
                    ti = (i16) Get128By16(&(rsp_VUR[mp->rt]), element);
                    taccum = (i64) ((i64)si * (i64)ti);
                    /* rounding constant depends on the sign of the product */
                    if (taccum >= 0)
                        round = 0;
                    else
                        round = 31<<16;
                }
                break;

              case 0x04:        /* vmudl vmadl */
                su = (u16) Get128By16(&(rsp_VUR[mp->rs]), i);
                tu = (u16) Get128By16(&(rsp_VUR[mp->rt]), element);
                taccum = (i64) ((u64)su * (u64)tu);
                break;

              case 0x05:        /* vmudm vmadm */
                si = (i16) Get128By16(&(rsp_VUR[mp->rs]), i);
                tu = (u16) Get128By16(&(rsp_VUR[mp->rt]), element);
                taccum = (i64) ((i64)si * (u64)tu);
                break;

              case 0x06:        /* vmudn vmadn */
                su = (u16) Get128By16(&(rsp_VUR[mp->rs]), i);
                ti = (i16) Get128By16(&(rsp_VUR[mp->rt]), element);
                taccum = (i64) ((u64)su * (i64)ti);
                break;

              case 0x07:        /* vmudh vmadh */
                si = (i16) Get128By16(&(rsp_VUR[mp->rs]), i);
                ti = (i16) Get128By16(&(rsp_VUR[mp->rt]), element);
                taccum = (i64) ((i64)si * (i64)ti);
                break;
            }

            if (pshift > 0) {   /* shift partial products */
                /*
                 * Shift in the unsigned domain: left-shifting a negative
                 * signed value is undefined in ISO C.  The resulting bit
                 * pattern is the same two's-complement value.
                 */
                taccum = (i64)((u64)taccum << pshift);
            } else if (pshift < 0) {
                taccum = (u64)taccum >> (-pshift);
            } /* else no shift */

            taccum += round;    /* do rounding... */

            if (doaccum || doaccum_rnd)
                taccum += rsp_ACC[i];

            /* The following change adjusts the taccum value to reflect the hardware.
               Hardware has 48 bit acc. with bit47 as sign, while C-sim has 64 bit
               acc. To match the behaviour, bit47 is sign extended in this model.
             */

            if ((taccum>>47) & 0x1)
                taccum = 0xffff000000000000 | (taccum & 0xffffffffffff);
            else
                taccum = taccum & 0xffffffffffff;

            rsp_ACC[i] = taccum;

            /* clamp output (the accumulator itself was stored unclamped) */
            if (taccum < 0) {   /* negative */
                if (type_u) {
                    taccum = 0;
                } else if (~taccum & clampMask) {
                    /* there are some 0's in the overflow area */
                    /* unsigned shift: same bits as the signed form, no UB */
                    taccum = (i64)(clampMin << ashift);
                }
            } else {            /* positive */
                if (taccum & clampMask) {
                    /* there are some 1's in the overflow area */
                    /* unsigned shift: clampMax may be all ones (vmulu) */
                    taccum = (i64)(clampMax << ashift);
                }
            }

            /* take the 16-bit result field starting at bit ashift */
            du = ExtractBits64(taccum, (ashift+16-1), ashift);
            du &= mulqMask;
            Set128By16(&(mp->result), du, i);
        }

        mp->stalled = FALSE;
        mp->delay--;
    }
}


/* PUBLIC FUNCTIONS */

/*
 * Check the multiply pipeline for stalls.
 * Returns TRUE if any slot that still has work pending (delay > 0) is
 * flagged as stalled, FALSE otherwise.
 */
boolean
rsp_VUMultCheckStall(void)
{
    int		slot = 0;

    while (slot < rsp_VUPIPEDEPTH) {
	/* only slots with an instruction in flight count */
	if (vu_MultPipe[slot].delay > 0) {
	    if (vu_MultPipe[slot].stalled) {
		return TRUE;
	    }
	}
	slot++;
    }

    return FALSE;
}

/*
 * Install a VU multiply instruction into the pipeline.
 *
 * The instruction word and pc are stored in the first free slot
 * (delay == 0).  When VUZeroPipe is set, the first slot is always used,
 * its delay is set to 1 and the pipeline is stepped immediately;
 * otherwise the delay starts at rsp_VUPIPEDEPTH.  If no slot qualifies,
 * nothing is installed.
 */
void
rsp_VUMultInstall(u32 inst, u32 pc)
{
    int		slot;

    for (slot = 0; slot < rsp_VUPIPEDEPTH; slot++) {
	/* skip busy slots unless running in zero-pipe mode */
	if (vu_MultPipe[slot].delay != 0 && !(VUZeroPipe)) {
	    continue;
	}

	vu_MultPipe[slot].inst = inst;
	vu_MultPipe[slot].pc = pc;
	vu_MultPipe[slot].stalled = FALSE;
	/* opcode name is not known until the instruction is decoded */
	strcpy(vu_MultPipe[slot].opString, "?");

	if (VUZeroPipe) {
	    vu_MultPipe[slot].delay = 1;
	    rsp_VUMultPipeStep();
	} else {
	    vu_MultPipe[slot].delay = rsp_VUPIPEDEPTH;
	}

	/* only one slot is ever filled per call */
	return;
    }
}

/*
 * This function is called once per clock and advances the multiply
 * pipeline one step.
 *
 * For every slot with delay > 0:
 *   - at the EX stage (or always, when VUZeroPipe is set) the instruction
 *     is decoded and executed via rsp_VUMultExec(), which decrements the
 *     delay itself unless the instruction stalls;
 *   - otherwise the delay is decremented, except that slots which have
 *     not yet reached EX are held while rsp_VUStalled is set;
 *   - when the delay reaches the WB stage the result is written to the
 *     destination vector register, traced, and the register unlocked.
 *
 * Returns TRUE if, after this step, any in-flight slot is stalled (the
 * same condition rsp_VUMultCheckStall() tests), FALSE otherwise.  The
 * function previously fell off the end without returning a value even
 * though it is declared boolean; the caller visible in this file ignores
 * the result.
 */
boolean
rsp_VUMultPipeStep(void)
{
    int		i;
    boolean	anyStalled = FALSE;

    /* advance all the things in the pipe */
    for (i=0; i<rsp_VUPIPEDEPTH; i++) {
	if (vu_MultPipe[i].delay > 0) {

	    if (vu_MultPipe[i].delay == rsp_VUPIPE_STAGE_EX || VUZeroPipe) {
		/* do decode and exec, if possible */
		rsp_VUMultExec(&(vu_MultPipe[i]));
	    } else {
		/*
		 * fakes pipelining by waiting to write-back answer
		 */
		if (rsp_VUStalled==FALSE ||
			vu_MultPipe[i].delay<rsp_VUPIPE_STAGE_EX) {
		    vu_MultPipe[i].delay--;
		}
	    }

	    if (vu_MultPipe[i].delay == rsp_VUPIPE_STAGE_WB) {
		/* do write-back */
		rsp_Verbose(stderr,"VU MULT (%s) did write-back. (v%d)\n",
			    vu_MultPipe[i].opString, vu_MultPipe[i].rd);
		rsp_VUR[vu_MultPipe[i].rd] = vu_MultPipe[i].result;

		traceVUbyVU(vu_MultPipe[i].rd,0xff,&(vu_MultPipe[i].result),
					vu_MultPipe[i].pc);

		/* mark registers not in use */
		rsp_VURegUnLock(vu_MultPipe[i].rd, vu_MultPipe[i].pc);
	    }

	    /* remember whether this slot is still pending and stalled */
	    if (vu_MultPipe[i].delay > 0 && vu_MultPipe[i].stalled) {
		anyStalled = vu_MultPipe[i].stalled;
	    }
	}
    }

    return anyStalled;
}