/* vdiv.c 9.13 KB */

/**************************************************************************
 *                                                                        *
 *               Copyright (C) 1994, Silicon Graphics, Inc.               *
 *                                                                        *
 *  These coded instructions, statements, and computer programs  contain  *
 *  unpublished  proprietary  information of Silicon Graphics, Inc., and  *
 *  are protected by Federal copyright  law.  They  may not be disclosed  *
 *  to  third  parties  or copied or duplicated in any form, in whole or  *
 *  in part, without the prior written consent of Silicon Graphics, Inc.  *
 *                                                                        *
 *************************************************************************/

/*
 * File:	vdiv.c
 * Creator:	hsa@sgi.com
 * Create Date:	Tue Mar  8 11:00:31 PST 1994
 *
 * This file holds the divide instructions for the VU.
 *
 */


#include <stdio.h>
#include <math.h>
#include "rsp.h"
#include "i128.h"
#include "rspctl.h"
#include "opcode.h"
#include "vu.h"
#include "trace_print.h"

/*
 * ACC_LOW(indx, val): the value of accumulator element `indx` with its
 * low 16 bits replaced by the low 16 bits of `val`; all higher bits of
 * the accumulator element are kept.  The macro only computes the value,
 * the caller stores it back into rsp_ACC[].
 *
 * Both arguments are parenthesized so that an expression argument such
 * as `a | b` is masked as a whole.
 */
#define ACC_LOW(indx,val)       ((rsp_ACC[(indx)] & ~0xffffLL) | ((val) & 0xffff))


/*
 * Divide ROM geometry: ROM_ADRS address bits and ROM_DATA data bits.
 */
#define ROM_ADRS 9
#define ROM_DATA 16
#define ROM_DEPTH (1<<ROM_ADRS)		/* 512 */
#define ROM_WIDTH (1<<ROM_DATA)		/* 65536 */

/*
 * Values carried from one divide instruction to the next:
 *   DivIn   operand handed to GetDivResult()
 *   MovIn   operand of the last vmove
 *   DivOut  last value returned by GetDivResult()
 */
static i32 DivIn = 0;
static i32 MovIn = 0;
static i32 DivOut = 0;

/* TRUE once InitROM() has been called */
static boolean RomValid = FALSE;

/* table currently in use; set by InitROM() to rom_o or rom_a */
static u16 *rom;

/* set to 1 by vrcph/vrsqh, cleared by the other reciprocal ops */
static int dph_flag = 0;

/* when set, InitROM() selects rom_a instead of filling rom_o */
extern boolean use_alt_divrom;

/* table filled in by InitROM() */
static u16 rom_o[ROM_DEPTH*2];

/* alternate table, contents supplied by divrominit.h */
static u16 rom_a[] =
{
#include "divrominit.h"
};

/* divide pipeline slots */
rsp_vuPipe_t	vu_DivPipe[rsp_VUPIPEDEPTH+1];

/* maps the 4-bit instruction format field to a source element index */
int div_input_table[16] = {
    0, 1, 2, 3, 4, 5, 6, 7,
    0, 1, 2, 3, 4, 5, 6, 7
};

/*
 * decode and execute a Div instruction
 */
static void
rsp_VUDivExec(rsp_vuPipe_t *mp)
{
    int		i, type, element;
    i16		ti;
    int sqrt,sp;
    i128 dest;
    u16 vd[8];

    mp->format = ExtractBits(mp->inst, 24, 21);
    type = ExtractBits(mp->inst, 2, 0);
    mp->rt = ExtractBits(mp->inst, 20, 16);
    mp->rs = ExtractBits(mp->inst, 15, 11);
    mp->rd = ExtractBits(mp->inst, 10,  6);
    rsp_VURegLock(mp->rd, mp->pc);
    
    /* check for stall. If okay, execute and decrement delay field */
    if (rsp_VURegIsLocked(mp->rt, mp->pc)) {
	/* can't do anything right now... */
	mp->stalled = TRUE;
	rsp_VUStalled = TRUE;
	rsp_Verbose(stderr,"VU divide stalled... (%08x)\n",mp->pc);
    } else {

	if (!RomValid){
	    RomValid = TRUE;
	    InitROM(); 	
	   }

	 mp->inhibit_wb = FALSE;

	/* get scaler value */
        element = mp->format; /*always a whole scaler*/
	element = div_input_table[ element ];

	ti = (i16) Get128By16(&(rsp_VUR[mp->rt]), element);	

	switch(type) {

	  case 0x0: 	/*vrcp*/
	  case 0x4: 	/*vrsq*/
		strcpy(mp->opString,(type==0x4) ? "vrsq" : "vrcp");
		DivIn = (i32) ti;
		sqrt = (type==0x4); 
	        sp = 1;
		DivOut = GetDivResult(DivIn,sqrt,sp);

		mp->res16 = (u16) DivOut&0xffff;
		mp->res16_element = (u16) mp->rs&0x7;

		dph_flag = 0;
	        break;

	  case 0x1: 	/*vrcpl*/
	  case 0x5: 	/*vrsql*/
		strcpy(mp->opString,(type==0x5) ? "vrsql" : "vrcpl");
		/*If the previous devide is not high, then this is forced
                  to single precision. For HW compatibility
                 */
		if (dph_flag) {
		   DivIn = DivIn | ti&0xffff;
		   sp = 0;
		}
		else {
		   DivIn =  (i32) ti;
		   sp = 1;
		}

		sqrt = (type==0x5); 
		DivOut = GetDivResult(DivIn,sqrt,sp);

		mp->res16 = (u16) DivOut&0xffff;
		mp->res16_element = (u16) mp->rs&0x7;

		dph_flag = 0;
	        break;

	  case 0x2: 	/*vrcph*/
	  case 0x6: 	/*vrsqh*/
		strcpy(mp->opString,(type==0x6) ? "vrsqh" : "vrcph");
		DivIn = ti<<16;
		sqrt = (type==0x6); 
	        sp = 0;

		mp->res16 = (u16) (DivOut>>16)&0xffff;
		mp->res16_element = (u16) mp->rs&0x7;

		dph_flag = 1;
	        break;

	  case 0x3: 	/*vmove*/
		strcpy(mp->opString, "vmove");
		MovIn = (i32) ti;

		mp->res16 = (u16) MovIn&0xffff;
		mp->res16_element = (u16) mp->rs&0x7;

	        break;

	  case 0x7: 	/*vnop*/
		strcpy(mp->opString, "vnop");
		mp->inhibit_wb = TRUE;
	        break;

	   default: break;

	} /*switch*/

	if( type != 7 )		/* vnop has no effect */
	  for (i=0; i<8; i++) {

	    element = ((i & cmask_tab[(mp->format & 0x0e) >> 1]) +
		       (mp->format & emask_tab[(mp->format & 0x0e) >> 1]));

	    ti = (i16) Get128By16(&(rsp_VUR[mp->rt]), element);

            rsp_ACC[i] = ACC_LOW(i,ti); /* HW compatibility */

	    if( (type == 3) && (i == ((u16) mp->rs&0x7) ))
		mp->res16 = (u16) (( (i32) ti ) &0xffff );
	};

	mp->stalled = FALSE;
	mp->delay--;
    }
}

/*
 * Look up a reciprocal or reciprocal-square-root result in the divide ROM.
 *
 * divin: input operand.
 * sqrt:  non-zero selects the reciprocal square root half of the ROM,
 *        zero selects the reciprocal half.
 * sp:    non-zero for a single-precision input, zero for double precision.
 *
 * Returns the 32-bit result as an int.  The table pointed to by `rom`
 * must already have been set up by InitROM().
 *
 * Steps: take the magnitude of the input, find its leading one bit,
 * use the nine bits below the leading one as the ROM address, then
 * shift the ROM value back according to the position of the leading
 * one.  Negative inputs yield the one's complement of the result.
 */
int
GetDivResult(i32 divin, int sqrt, int sp)
{
    i32 in_data, result;
    int j, lshift, romdata, rshift, addr;

#if 0
    /* do a "real" divide, for debugging precision problems */
    result = (int) ((65536.0/(float)divin)*65536.0);
    result >>= 1;
    return (result);
#endif

    /*
     * Magnitude of the input.  Double-precision inputs below -32768 are
     * complemented rather than negated.
     */
    in_data = divin;
    if (divin < 0) {
	if (sp || divin >= -32768)
	    in_data = -in_data;
	else
	    in_data = ~in_data;
    }

    /*
     * Position of the leading one, counted from bit 31.  The test mask is
     * built from an unsigned constant so that bit 31 can be formed
     * without shifting into the sign bit.
     */
    lshift = 0;
    for (j = 0; j < 32; j++) {
	if (in_data & (1UL << (31 - j))) {
	    lshift = j;
	    break;
	}
    }

    /* This is what hardware does on zero input */
    if (in_data == 0x0) {
	if (sp)
	    lshift = 0x10;
	else
	    lshift = 0x00;
    }

    /*
     * Normalize so the leading one sits in bit 31 and take bits 30..22 as
     * the ROM address.  The shift is done on an unsigned value; only the
     * masked bits are used, so the result matches the signed shift.
     */
    addr = (int) ((((unsigned long) in_data << lshift) & 0x7fc00000UL) >> 22);

    /* sqrt entries live in the upper half; the low bit is the shift parity */
    if (sqrt)
	addr = ((addr | 0x200) & 0x3fe) | (lshift % 2);

    romdata = rom[addr];

    rshift = (sqrt) ? ((~lshift & 0x1f) / 2) : (~lshift & 0x1f);
    result = (0x40000000 | (romdata << 14)) >> rshift;
    if (divin < 0)
	result = ~result;

    /* Final fix to handle corner cases */
    if (divin == 0)
	result = 0x7fffffff;
    else if (divin == 0xffff8000)
	result = 0xffff0000;

    return result;
}

/*
 * Select and, if necessary, compute the divide lookup table.
 *
 * When use_alt_divrom is set, `rom` is pointed at the pre-built table
 * rom_a and nothing is computed.  Otherwise `rom` is pointed at rom_o,
 * which is filled as follows:
 *
 *   rom[0 .. ROM_DEPTH-1]            reciprocal, input range 1:(2-e)
 *   rom[ROM_DEPTH + odd offsets]     1/sqrt, input range 1:(2-e)
 *   rom[ROM_DEPTH + even offsets]    1/sqrt, input range 2:(4-e)
 *
 * Entries are u16, so only the low 16 bits of each computed value are
 * stored.  All arithmetic is done in float, exactly as written, so the
 * table contents do not change.
 *
 * Returns 0 always; the value carries no meaning and exists only
 * because the function has int return type.
 */
int
InitROM(void)
{
    int i, j;
    float F1, F2;

    if (use_alt_divrom) {
	rsp_eprintf(stdout,"rsp : init'ing alt div rom.\n");
	rom = &(rom_a[0]);
	return 0;
    }

    rom = &(rom_o[0]);

    /*********************************************
     * reciprocal  Input range 1:(2-e)
     **********************************************/
    rom[0] = (2*ROM_WIDTH)-1;   /* maximum value */
    for (i=1; i<ROM_DEPTH; i++) {
	F1 = (float) ROM_DEPTH/(ROM_DEPTH+i) ;
	F2 = (float) 2.0 * ROM_WIDTH * F1;
	rom[i] = (int)(F2);
    }

    /*********************************************
     * 1/sqrt Input range 1:(2-e)
     **********************************************/
    rom[ROM_DEPTH+1] = (2*ROM_WIDTH)-1;   /* maximum value */
    for (i=1; i<ROM_DEPTH/2; i++) {
	j = i*2 ;
	F1 = (float) ROM_DEPTH/(ROM_DEPTH+j) ;
	F2 = (float) 2.0 * ROM_WIDTH*sqrtf(F1);
	rom[ROM_DEPTH+1+j] = (int)(F2);
    }

    /*********************************************
     * 1/sqrt  Input range 2:(4-e)
     **********************************************/
    F1 = 0.5;
    F2 = (float)  2.0 * ROM_WIDTH*sqrtf(F1);
    rom[ROM_DEPTH] = (int)(F2);
    for (i=1; i<ROM_DEPTH/2; i++) {
	j = i*2 ;
	F1 = (float) ROM_DEPTH/((float) 2.0*(ROM_DEPTH+j)) ;
	F2 = (float) 2.0 * ROM_WIDTH*sqrtf(F1);
	rom[ROM_DEPTH+j] = (int)(F2);
    }

    /*
     * Debug aid: dump the table.
     *
    printf("\n\n");
    for (i=0; i<1024; i++) printf("rom[%03x]=%04x \n",i,rom[i]);
    printf("\n\n");
    */

    return 0;
}

/* PUBLIC FUNCTIONS */

/*
 * Report whether the divide pipeline is stalled.
 *
 * An entry counts as stalled when its delay is still positive and its
 * stalled flag is set.  Returns TRUE if any of the first
 * rsp_VUPIPEDEPTH entries is stalled, FALSE otherwise.
 */
boolean
rsp_VUDivCheckStall(void)
{
    boolean	anyStalled = FALSE;
    int		slot = 0;

    while (!anyStalled && slot < rsp_VUPIPEDEPTH) {
	if (vu_DivPipe[slot].stalled && vu_DivPipe[slot].delay > 0)
	    anyStalled = TRUE;
	slot++;
    }

    return anyStalled;
}

/*
 * Place a divide instruction into the pipeline.
 *
 * inst: instruction word
 * pc:   program counter of the instruction
 *
 * The first entry whose delay is zero is used; when VUZeroPipe is set,
 * entry 0 is always used and the pipeline is stepped immediately.  If
 * no entry qualifies the instruction is not installed.
 */
void
rsp_VUDivInstall(u32 inst, u32 pc)
{
    int		slot;

    /* find the entry to use */
    for (slot = 0; slot < rsp_VUPIPEDEPTH; slot++) {
	if (VUZeroPipe || vu_DivPipe[slot].delay == 0)
	    break;
    }
    if (slot >= rsp_VUPIPEDEPTH)
	return;

    vu_DivPipe[slot].inst = inst;
    vu_DivPipe[slot].pc = pc;
    if (VUZeroPipe)
	vu_DivPipe[slot].delay = 1;
    else
	vu_DivPipe[slot].delay = rsp_VUPIPEDEPTH;
    vu_DivPipe[slot].stalled = FALSE;
    strcpy(vu_DivPipe[slot].opString, "?");

    if (VUZeroPipe)
	rsp_VUDivPipeStep();
}

/*
 * Advance the divide pipeline by one step; called once per clock.
 *
 * For every active entry (delay > 0):
 *   - at the EX stage (or always, when VUZeroPipe is set) the
 *     instruction is decoded and executed by rsp_VUDivExec(), which
 *     decrements delay itself unless the entry stalls;
 *   - otherwise delay is decremented, unless the VU is stalled and the
 *     entry has not yet passed the EX stage;
 *   - when delay reaches the WB stage the 16-bit result is written to
 *     the destination register (unless write-back is inhibited) and the
 *     destination register is unlocked.
 *
 * Returns FALSE.  The function is declared boolean but never defined a
 * return value; FALSE is returned so a caller reading the result gets a
 * deterministic value.  NOTE(review): confirm whether callers expect a
 * particular meaning.
 */
boolean
rsp_VUDivPipeStep(void)
{
    int		i;

    /* advance all the things in the pipe */
    for (i=0; i<rsp_VUPIPEDEPTH; i++) {
	if (vu_DivPipe[i].delay > 0) {

	    if (vu_DivPipe[i].delay == rsp_VUPIPE_STAGE_EX || VUZeroPipe) {
		/* do decode and exec, if possible */
		rsp_VUDivExec(&(vu_DivPipe[i]));
	    } else {
		/*
		 * fakes pipelining by waiting to write-back answer
		 */
		if (rsp_VUStalled==FALSE ||
		    vu_DivPipe[i].delay<rsp_VUPIPE_STAGE_EX)
		    vu_DivPipe[i].delay--;
	    }

	    if (vu_DivPipe[i].delay == rsp_VUPIPE_STAGE_WB) {
		/* do write-back */
		rsp_Verbose(stderr,"VU DIV (%s) did write-back. (v%d)\n",
			    vu_DivPipe[i].opString, vu_DivPipe[i].rd);

		if (vu_DivPipe[i].inhibit_wb==FALSE) {
		    Set128By16 (&rsp_VUR[vu_DivPipe[i].rd], vu_DivPipe[i].res16,
				vu_DivPipe[i].res16_element);

		    traceVUbyVU(vu_DivPipe[i].rd, 1<<vu_DivPipe[i].res16_element,
				&rsp_VUR[vu_DivPipe[i].rd], vu_DivPipe[i].pc);
		}

		/* mark registers not in use */
		rsp_VURegUnLock(vu_DivPipe[i].rd, vu_DivPipe[i].pc);
	    }
	}
    }

    return FALSE;
}