translator.c 133 KB

Raw Blame History Permalink

/*
 * Copyright (C) 1996-1998 by the Board of Trustees
 *    of Leland Stanford Junior University.
 *
 * This file is part of the SimOS distribution.
 * See LICENSE file for terms of the license.
 *
 */

/**************************************************************************
*      File:           translator.c
* This is a major file for Embra
*
*  This file gets called from the main simulator loop
* (continue_run) with a virtual PC from which we read a basic block
* of instructions, write them into the translation cache, and then
* return the address of the start of the block in the translation
* cache.

* The main structure of the module is a switch statement that takes
* each instruction and writes its translation.

* Some optimization, like not emitting a load nop when it is not
* needed, has been implemented, but there is much more room for
* optimization, especially avoiding unnecessary loads (and stores)
* when two adjacent instructions depend on each other

* We implement chaining in this module.  The idea of chaining is,
* instead of jumping back to the main simulator loop at the close of a
* basic block, jump directly to the next block.

* D_Cache_Check and I_Cache_Check contain the very delicate quick
* check code that sees if we need to call out to mem_ref.
* These routines also ensure that the line in question is on a TLB
* mapped page

* When a jump or branch is the last instruction on a page, we want to avoid
* using physical addresses since it is possible that the delay slot
* instruction is on different physical pages for different processes.
* Thus we don't insert blocks that have jumps as their last instruction
* into the lookup tables under their physical address, so each ASID has
* their own translated version of these blocks.

* I assume that all basic blocks are less than (1<<15)-1 bytes

* $author mencer $
* $date 5/21/96  $
*
 ***************************************************************************/

#include <unistd.h>
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <bstring.h>
#include <sys/cachectl.h>
#include <sys/signal.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <stdlib.h>

#include "simmisc.h"
#include "embra.h"
#include "annotations.h"

#include "translator.h"
#include "decoder.h"
#include "mem_control.h"
#include "qc.h"
#include "cp0.h"
#include "driver.h"
#include "callout.h"
#include "main_run.h"
#include "tc_coherence.h"
#include "stats.h"
#include "cpu_interface.h"
#include "clock.h"
#include "userflush.h"

#include "fpu.h"

#include "vcode.h"

/* note: following included just for the assertion at the beginning
 * of translate().
 */
#include "machine_defs.h"


/* Compile-time switch for the register-checking debug support below. */
#define CHECKREGS 1

#if CHECKREGS
/* Prototype only; the definition is not in this part of the file. */
void CheckRegs(int current_target, int current_pc, int physaddr);
int check_regs=0;   /* when non-zero, Translator_Init opens the "regs" file */
int check_every=1;  /* check every basic block? */
int nblocks=0;
int print_blocks=1;
int die_reg=1;
uint64 min_cycles=0;
#endif

/* **************GLOBALS***********/


/*
 * Alias for v_ip.  The code in this file saves, restores and advances
 * memptr as the position at which the next instruction word is emitted.
 */
#define memptr v_ip

/*
 * Lengths of the successive prelude sections of a translated block.
 * Each starts at 0, is measured (as a difference of memptr values) the
 * first time the prelude emitters generate that section, and is asserted
 * to be the same on every later translation.
 */

int SPECULATIVE_ENTRY = 0;	/* Is PC correct? */
int CHECK_MAPPING_ENTRY = 0;	/* Is physical addr correct? */
int SAME_PAGE_ENTRY = 0;	/* No checks required */

/* XXX These are REAL IMPORTANT */
/* This is wrong
   #define I_QC_LEN (embra.MPinUP?12:10)
   #define I_PA_LEN (embra.MPinUP?18:16)
   #define IREF_LEN (embra.useVQC?I_QC_LEN:I_PA_LEN)
*/

/* Both lengths are measured by I_Memory_Check on first use. */
int I_QC_LEN = 0;		/* length of I-cache quick check */
int I_PA_LEN = 0;		/* length of I-cache cache check */
#define IREF_LEN (embra.useVQC?I_QC_LEN:I_PA_LEN)	/* length of I-cache check */

/* What chaining mode are we in? */
/* NO_C - all BB go to dispatch */
/* BB_C - BB link to each other */
enum {NO_C=0, BB_C=1} chainMode;

/* These function as labels for backward branches */

/* Management of labels */

/*
 * One vcode label plus one raw address per named branch target.
 * NOTE(review): presumably these fields are accessed through the
 * SET_LABEL / USE_LABEL / USE_LABEL_VALUE macros, which are defined
 * outside this file -- confirm.
 */
static struct {
   v_label_type cont_run_adj_clock;
   v_label_type cont_run;
   v_label_type do_periodic;
   v_label_type rewind_dqc;
   v_label_type rewind_iqc;
   uint* cont_run_adj_clock_addr;
   uint* cont_run_addr;
   uint* do_periodic_addr;
   uint* rewind_dqc_addr;
   uint* rewind_iqc_addr;
} labels;

/*
 * Record of the most recent Store(): `real` is the simulated register
 * that was written and `used` the host register that held the value.
 * Store() clears both to 0 unless the host register was SIM_T2; Load()
 * consults the record to avoid re-reading a value that was just stored.
 */
typedef struct {
  int real;
  int used;
}reg_t;
static reg_t prev_store;

/* These are function pointers used in the translation process */
/* They are setup in Translator_Init */
void* dispatchChain;
void* dispatchNoChain;
void* periodicCallout;

/* Turn this to 0 to stop register allocation. Current max == 8 */

/* XXX This may blow us away, but I'm trying it right now for
 *  exact compatibility.....
   */
#define NUM_ALLOCATABLE_REGS 4

/* Actual space for register allocation structures, one per simulated */
/* register; Translator_Init sets reg_num of entry i to i */
alloc_reg_t reg2alloc[32];
/* Pointers to register allocation structures, so it can be sorted by */
/* register use (see set_src2alloc) */

/* Array size must be greater than 0 */
#if NUM_ALLOCATABLE_REGS > 0
  alloc_reg_t* src2alloc[NUM_ALLOCATABLE_REGS];
#else
/* Not used, just here for the compiler */
  alloc_reg_t* src2alloc[1];
#endif

/* Mask selecting the upper PC bits (everything above the low 28) */
#if defined(SIM_MIPS32)
#define JMP_PC_MASK 0xf0000000
#else
#define JMP_PC_MASK 0xfffffffff0000000LL
#endif

/* Always allocate registers in this order.  */
/* Any registers in this list should be saved and restored on a
   callout.  See callout.s */
unsigned reg_alloc[] = REGALLOC_LIST;

/* Kind of control flow in effect; COMPOSE_PC tests it against SEQ_FLOW */
typedef enum {SEQ_FLOW, JMP_FLOW, BRANCH_FLOW, REGINDIRECT_FLOW,
              BRANCH_TAKEN, BRANCH_UNTAKEN} flow_t;


/* Per-translation state threaded through the emit helpers */
typedef struct TransState {
   InstrGrp *instrGrp;       /* group being translated (virt_pc, cpuNum, ...) */
   flow_t flow;              /* see COMPOSE_PC */
   VA curPC;                 /* see COMPOSE_PC */
   int cycle_correction;     /* passed to the memory callouts in A3 */
   VA composePC;
   unsigned branch_instr;
   int fp_tested;            /* reset to 0 by I_Memory_Check */
} TransState;

/* **************END GLOBALS***********/

/****************************************************************/
/* Local Functions */
/****************************************************************/

/* Register Allocation functions */


/* *************************************************************
 * Some type checking for the register allocation
 * (the hacked up style)
 * *************************************************************/

static int Load( int sim, int real );
static void Load_Move( int sim, int real );
static void Store( int used, int real);
/* Use this to move a known register to a possibly allocated register */
static void Store_Move( int contents, int store_or_alloc );
/* Load value from simulated regs into real regs */
static int Preload_Regs(void);
static int Set_Destination( int suggestion, int reg );
#if defined(SIM_MIPS64)
static void Load_64_Bit_Immed( int reg, Reg64 immed );
/* NOTE(review): the expansion carries its own trailing ';', so a use */
/* followed by ';' yields an extra empty statement. */
/* NOTE(review): no non-MIPS64 definition of Load_Reg_Immed appears in */
/* this file section; presumably it comes from a header -- confirm. */
#define Load_Reg_Immed(_reg,_imm) Load_64_Bit_Immed(_reg,_imm);
/* Number of instruction slots Load_64_Bit_Immed reserves */
#define LOAD_REG_SIZE  2
#endif
static void Load_Op_Immed( int loadOpCode, int reg, uint addr );

/* Allows all forms of chaining */
static void Page_Prelude_Chain_Check (InstrGrp *instrGrp, int cycles );
static void Cache_Prelude_Chain_Check(InstrGrp *instrGrp, int cycles );

/* Emits the chaining  jump */
static void Transfer_To( TransState *trans,VA newPC );

/* Function for manipulating the PC */
static void Update_PC( TransState *trans,
                       flow_t flow, VA next_PC);

/* Callout functions */
static void Do_Callout          (TransState *trans, int callout_code );
static void Do_Exception_Callout(TransState *trans, int exception_code);

/* Quick Check Functions */
static void Page_D_Cache_Check  (TransState *trans,char new_state,int init_reg);
static void Cache_D_Cache_Check (TransState *trans,char new_state,int init_reg);
static int  D_Memory_Check      (TransState *trans,unsigned instr, char new_state);
static void I_Memory_Check      (TransState *trans,int cpuNum, VA imm);

/* Clock maintenance */
static void Check_Timer( int num_cycles, uint pc, int bd );

/* Pipeline timing model */
static uint Pipe_Time(InstrGrp* thisGrp, int is_delay_slot_instr );

/****************************************************************/
/* END local functions */
/****************************************************************/

/* This disables speculative (jr) chaining */
/* #define DISABLE_SPECULATIVE_CHAINING */

/* extract PC from TransState structure: curPC with its low bit set */
/* whenever the flow is anything other than SEQ_FLOW */
#define COMPOSE_PC(_tr) ((_tr)->curPC | ((_tr)->flow!=SEQ_FLOW))

/* actual mem address of the pointer passed in */
/* #define mem_size(x)  sizeof(int)*(x)  */

/* Multiplies x by 4 (hard-coded replacement for the sizeof(int) form) */
#define mem_size(x)  ((x)<<2)

/*
 * Offset into CPUState of a (32 or 64-bit) register.
 */
#define REG_OFFSET(_x) (((_x)*sizeof(Reg)) + GP_OFF)

/*
 * Deferred 64-bit constant loads.  Load_64_Bit_Immed records each
 * request with AddLongConst and reserves instruction slots;
 * FillInLongConst later emits the actual load into those slots.
 * Note that `memptr` is a macro for v_ip, so the struct field and the
 * accesses to it below are all spelled v_ip after preprocessing.
 */
#if defined(SIM_MIPS64)
#define MAX_LONG_CONST 16
#define MAX_SIZE_LONG_CONST (MAX_LONG_CONST*8)
static struct LongConstTable {
    Reg64   value;      /* Value to be loaded */
    TCA     memptr;    /* Where the instruction sequence doing the load goes */
    int     regno;     /* Where to put value */
} longConstTable[MAX_LONG_CONST];

/* Number of entries of longConstTable currently in use */
static int nextLongConstIndex;

#define InitLongConst()  {nextLongConstIndex = 0;}
#define AddLongConst(_val, _memptr, _regno) { \
                 ASSERT(nextLongConstIndex < MAX_LONG_CONST); \
                 longConstTable[nextLongConstIndex].value = (_val); \
                 longConstTable[nextLongConstIndex].memptr = (_memptr); \
                 longConstTable[nextLongConstIndex].regno = (_regno); \
                 nextLongConstIndex++; }
static void FillInLongConst(void);
#else
/* Not used or needed in 32bit mode */
#define MAX_SIZE_LONG_CONST 0
#define InitLongConst()
#define AddLongConst(_val, _memptr, _regno) ASSERT(0);
#define FillInLongConst()
#endif


/* *******************************************************************
 * Longest translation (expressed in instructions)
 *     longest instruction : ldc1_op :                   1
 *                                     D_Memory_Check = 23
 *                                    Check_C1_Usable = 15 (=9+callout)
 *                                Annotation(callout) =  6
 *                                                    ------------
 *                                                      45
 * This is the max. possible value, and is checked anyway post-facto
 * ******************************************************************/

/* Fixed per-block instruction allowances added by LONGEST_TRANS. */
#define CHECK_TIMER  3
#define INCREMENT_MEM_ACCESS_COUNT 2
/*
 * Upper bound, in emitted instructions, on the translation of a block of
 * `grp` simulated instructions: the I-cache check, the three prelude
 * sections, the fixed allowances above, and 45 instructions per simulated
 * instruction.  `grp` is parenthesized so that an expression argument such
 * as LONGEST_TRANS(n + 1) is multiplied as a whole.
 */
#define LONGEST_TRANS(grp) (IREF_LEN + SPECULATIVE_ENTRY + CHECK_MAPPING_ENTRY + SAME_PAGE_ENTRY + CHECK_TIMER + INCREMENT_MEM_ACCESS_COUNT + (grp) * 45)

/* Value 0 under the name _nop */
#define _nop 0

/* I am no longer R3000 compatible, so this is near useless */
/* With R3000_COMPAT, TRAILING_LOAD(u) assigns u to prev_load; */
/* otherwise it expands to nothing. */
#ifdef R3000_COMPAT
/* No load NOPs needed on the R4000 */
#define TRAILING_LOAD( u ) prev_load = u;
#else
#define TRAILING_LOAD( u )
#endif


/*-----------------------------------------------------------------------------
 *
 *  This is the register allocation section
 *
 *----------------------------------------------------------------------------*/

/* Allocate registers for vcode */


/* vcode register descriptors; VC_Allocate_Regs sets entry i's .reg to i */
v_reg_type VREGS[32];		/* integer registers */
v_reg_type FVREGS[32];		/* floating point registers */

v_label_type label;		/* universal label to branch to */

void VC_Allocate_Regs(void)
{
  /*
   * Give every vcode register descriptor an identity mapping: integer
   * descriptor i and floating point descriptor i both name register i.
   *
   * The original intent was to obtain each register the emulator uses
   * (VSS_BASE, QC_REG, MMU_REG, PC_REG, CLOCK_REG, SIM_T1, SIM_T2,
   * SIM_T4, BRANCHREG, SHADOW0..3, and every FP register) from vcode with
   * v_getreg(..., V_TEMP).  That did not work, so the numbers are simply
   * hard-wired here instead.
   */
  int regno;

  for (regno = 0; regno < 32; regno++) {
    VREGS[regno].reg = regno;     /* integer register */
    FVREGS[regno].reg = regno;    /* floating point register */
  }
}


/*
 * Load - make simulated register `real` available for reading and return
 * the host register that holds it.  Code is emitted only when needed:
 *   - simulated register 0 is read directly from host register 0;
 *   - a register that has an allocated host register is read from there;
 *   - if prev_store says `real` was just stored, the host register used
 *     for that store is reused, copied into `sim` (or with G0) when it is
 *     a different register;
 *   - otherwise `sim` is loaded from the register file at VSS_BASE.
 */
static int
Load( int sim, int real )
{
  int allocated;

  /* If you want register 0, use the real thing */
  if( real == 0 )
    return 0;

  allocated = reg2alloc[real].alloc_reg;
  if( allocated )
    return allocated;

  if( prev_store.real != real ) {
    /* Nothing cached: fetch from the simulated register file */
    ECi( REG_LD_OP, sim, VSS_BASE, REG_OFFSET(real) );
  } else if( prev_store.used != sim ) {
    /* Value is still live in another host register: copy it over */
    ECs( or_op_, sim, prev_store.used, G0 );
  }
  return sim;
}

/*
 * Load_Move - emit code that puts the current value of simulated register
 * `real` into host register `sim`.  Nothing is emitted when `sim` is 0.
 * The value is copied from the allocated host register when `real` has
 * one, and loaded from the register file at VSS_BASE otherwise.
 */
static void
Load_Move( int sim, int real )
{
   if( sim == 0 )
      return;

   if( reg2alloc[real].alloc_reg == 0 ) {
      ECi( REG_LD_OP, sim, VSS_BASE, REG_OFFSET(real) );
   } else {
      ECs( or_op_, sim, reg2alloc[real].alloc_reg, G0 );
   }
}

static int Preload_Regs(void)
{
  int i;
  int alloc = 0;
  for(i = 0; i < NUM_ALLOCATABLE_REGS ; i++)
	if( src2alloc[i]->alloc_reg )
	   {
		  ECi( REG_LD_OP, src2alloc[i]->alloc_reg, VSS_BASE,
			   REG_OFFSET(src2alloc[i]->reg_num) );
          alloc++;
	   }
  return alloc;
}

/*
 * Store - emit a store of host register `used` into simulated register
 * `real` in the register file at VSS_BASE.  Stores to simulated register
 * 0 are dropped.
 *
 * prev_store remembers the store only when the value came from SIM_T2
 * (a non-allocated register); any other store clears the record.
 * XXX- Make this trick real (i.e. use register), or possibly discard
 */
static void
Store( int used, int real)
{
  int remember;

  if( real == 0 )
    return;

  ECi( REG_ST_OP, used, VSS_BASE, REG_OFFSET(real) );

  remember = ( used == SIM_T2 );
  prev_store.real = remember ? real : 0;
  prev_store.used = remember ? used : 0;
}

/*
 * Store_Move - write host register `contents` to simulated register
 * `store_or_alloc`.  The register file copy at VSS_BASE is always
 * updated; when the simulated register also has an allocated host
 * register, that register is refreshed first (or with G0).
 */
static void Store_Move( int contents, int store_or_alloc )
{
   if( reg2alloc[store_or_alloc].alloc_reg != 0 ) {
      ECs( or_op_, reg2alloc[store_or_alloc].alloc_reg, G0, contents );
   }

   ECi( REG_ST_OP, contents, VSS_BASE, REG_OFFSET(store_or_alloc) );
}

/*
 * Set_Destination - choose the host register an instruction should write
 * its result for simulated register `reg` into: 0 for simulated register
 * 0, the allocated host register when `reg` has one, and the caller's
 * `suggestion` otherwise.  No code is emitted.
 */
static int
Set_Destination( int suggestion, int reg )
{
  int chosen = suggestion;

  if( reg == 0 )
    return 0;

  if( reg2alloc[reg].alloc_reg )
    chosen = reg2alloc[reg].alloc_reg;

  return chosen;
}


#if defined(SIM_MIPS64)
/*
 * Load_64_Bit_Immed - request that `immed` be loaded into `reg`.
 *
 * No load is emitted here.  The request is recorded with AddLongConst
 * together with the current emit position, and two instruction slots are
 * skipped without being written; FillInLongConst fills them in later.
 */
static void
Load_64_Bit_Immed( int reg, Reg64 immed )
{
  AddLongConst(immed, memptr, reg);

  /* Reserve the slots; they stay unwritten until FillInLongConst runs */
  memptr += 2;
  ASSERT(LOAD_REG_SIZE == 2);
}

/*
 * FillInLongConst - emit the loads that Load_64_Bit_Immed deferred.
 *
 * For every recorded request, memptr is pointed at the slots reserved
 * for it and the load is emitted there.  Values that fit in a signed
 * 32-bit integer go through Load_32_Bit_Immed.  Other values are written
 * as data after the current end of the translation and fetched with a
 * Load_Op_Immed(ld_op, ...) from that address.  On return memptr points
 * past any constant data that was appended.
 */
static void FillInLongConst(void)
{
  int i;
  TCA current_memptr;

  current_memptr = memptr; /* We use the EcX() macros so we need to
                            * reset memptr to point at the place to
                            * insert. Save current_memptr before hand.
                            */
  for (i = 0; i < nextLongConstIndex; i++) {
     Reg64 val = longConstTable[i].value;
     memptr = longConstTable[i].memptr;
     if ((int64)val == (signed int)val) {
        /* Is really a 32bit const */
        Load_32_Bit_Immed(longConstTable[i].regno, val);
     } else {
        /* Is 64bits - Allocate space for const at end of translation */
       /* Step forward one element if the address is not 8-byte aligned */
       /* (assumes TCA points at 4-byte words -- TODO confirm) */
       if (((int)current_memptr) & 0x7) current_memptr++;
       (*(Reg64 *)current_memptr) = val;
       /* Load_Op_Immed takes a uint address, so the pointer must */
       /* survive truncation to uint */
       ASSERT((uint)current_memptr == (Reg64)current_memptr);
       Load_Op_Immed(ld_op, longConstTable[i].regno, (uint)current_memptr);
       /* Skip over the constant just stored */
       current_memptr += 2;
     }
  }
  memptr = current_memptr;

}
#endif

/* UGH! We will have to generate this on the fly... */

/* Function pointers defined elsewhere.  Em_HackedIPageMMUQC is the */
/* target of the jal emitted by the EMBRA_USE_QC64 code paths below. */
extern void (*Em_dynPQCdsh)(void);
extern void (*Em_dynPQCdex)(void);
extern void (*Em_HackedIPageMMUQC)(VA *);

#ifdef RECORD_TRANSLATIONS
/* File descriptor of "./trans", opened in Translator_Init */
static int trans;
#endif

/* File descriptor of "regs", opened in Translator_Init when check_regs */
/* is set */
int fildes;

void Translator_Init(void)
{
   int i;
   char* chainModeStr;
   static int initialized;

   /* I hope this is OK!! moved from below -BL */
   if( initialized )
     return;
   initialized = 1;

   /* Generate interface code */

   GenInterfaceCode();

   if( embra.MPinUP ) {
      dispatchChain = (void *)continue_run;
      dispatchNoChain = (void *)continue_run_without_chaining;
      periodicCallout = (void *)Embra_CX;
   } else {
      dispatchChain = (void *)continue_run;
      dispatchNoChain = (void *)continue_run_without_chaining;
      if( TOTAL_CPUS == 1 ) {
         periodicCallout = (void *)UPperiodic_callout;
      } else {
         periodicCallout = 0;
#ifdef obsolete
         periodicCallout = (void *)do_periodic_callout;
#endif
      }
   }

#if CHECKREGS
   if (check_regs) fildes = open("regs",O_RDONLY);
#endif

   for (i = 0; i < TOTAL_CPUS; i++) {
     /* XXXX evil! more hacks; init fpcr -BL */
     EMP[i].FCR[0] = 0x900;  /* for R10000???? */
     EMP[i].FCR[30] = 0x900; /* eir -> same as fcr0 on r10k */
   }

   /* XXX Hard-wire register allocation - fix this??? */
   for(i = 0; i < 32; i++ )
	  reg2alloc[i].reg_num = i;

#ifdef RECORD_TRANSLATIONS
   trans = open("./trans", (O_CREAT|O_RDWR|O_TRUNC), 0);
   if( trans == -1 )
	  perror("open trans");
#endif

   /* Make this a 1 to run without ll/sc */
   chainMode = BB_C; /* BB_C or NO_C */

}

/*
 * set_src2alloc - fill src2alloc[] with pointers to the most heavily used
 * simulated registers, ordered by decreasing num_src (index 0 is the most
 * used).
 *
 * Register 0 is never a candidate; it is given a use count of 1 for the
 * duration of the selection so that it acts as the threshold a register
 * must reach to get on the list.  Slots that no register qualifies for
 * keep pointing at reg2alloc[0].  On return reg2alloc[0].num_src is 0.
 *
 * When NUM_ALLOCATABLE_REGS is 0 (register allocation turned off) there
 * is nothing to select and src2alloc[] is left untouched; the selection
 * loop would otherwise index src2alloc[-1].
 */
static void set_src2alloc( void )
{
   const int last = NUM_ALLOCATABLE_REGS - 1;
   int i;
   int j;

   if( last < 0 ) {
      reg2alloc[0].num_src = 0;
      return;
   }

   /* Don't allocate register 0, and use it as a reference */
   /* Registers have to be used at least once to get on the list */
   reg2alloc[0].num_src = 1;
   for( i = 0; i <= last; i++ )
      src2alloc[i] = &reg2alloc[0];

   /* Exclude 0 from the running */
   for( i = 1; i < 32; i++ ) {
      j = last;
      if( reg2alloc[i].num_src >= src2alloc[j]->num_src ) {
         /* Replace the least used entry ... */
         src2alloc[j] = &reg2alloc[i];
         /* ... and bubble the new entry toward index 0 */
         while( j && src2alloc[j]->num_src >= src2alloc[j-1]->num_src ) {
            alloc_reg_t* tmp = src2alloc[j];
            src2alloc[j] = src2alloc[j-1];
            src2alloc[j-1] = tmp;
            j--;
         }
      }
   }

   /* Remember, don't allocate register zero, we can just use it directly */
   reg2alloc[0].num_src = 0;
}


/*
 * Load_Op_Immed - emit exactly two instructions that load from the 32-bit
 * address `addr` into `reg`: a lui of the upper half of the address,
 * then a load through `reg` with the lower half as its 16-bit offset.
 * loadOpCode selects the load: lb_op, lw_op or ld_op.
 *
 * The load sign-extends its offset.  When bit 15 of `addr` is set, the
 * lui value is therefore bumped by one and the offset made negative by
 * the same amount, so the two still add up to `addr`.
 *
 * Asserts that `addr` lies in [0x10000000, 0x80000000) and outside the
 * simulated machines' memory, and that two instructions were emitted.
 */
static void
Load_Op_Immed( int loadOpCode, int reg, uint addr )
{
   uint *first = memptr;
   uint upper = addr >> 16;
   uint low = addr & 0xffff;

   ASSERT (addr >= 0x10000000);
   ASSERT (addr <  0x80000000);
   ASSERT ((addr < MA_TO_UINT(SIM_MEM_ADDR(0)))
           || (addr >= MA_TO_UINT(SIM_MEM_ADDR(NUM_MACHINES-1) +
                                  MEM_SIZE(NUM_MACHINES-1))));

   if (addr & 0x8000) {
      /* Correct for the sign extension done by the load */
      upper = upper + 1;
      low = low - (1<<16);
   }
   ECi( lui_op_, reg, 0, upper );

   /* must be 16 bits for vcode !! */
   ASSERT(is16bit(low));

   if (loadOpCode == lb_op) {
      ECi( lb_op_, reg, reg, low );
   } else if (loadOpCode == lw_op) {
      ECi( lw_op_, reg, reg, low );
   } else if (loadOpCode == ld_op) {
      ECi( REG_LD_OP, reg, reg, low );
   }

   /* we're dead if we use more than 2 instructions */
   ASSERT( memptr - first == 2);

}

/*-----------------------------------------------------------------------------
 *
 *  This section provides support for chaining
 *
 *---------------------------------------------------------------------------*/

/*
 * Page_Prelude_Chain_Check - emit the prelude of a translated basic block.
 * (Presumably the EMBRA_PAGE-mode variant; Cache_Prelude_Chain_Check is
 * its sibling -- confirm against the caller.)
 *
 * The prelude consists of, in order:
 *   1. the bail-out stub (label cont_run: jump to dispatchChain) and the
 *      do_periodic stub (add `cycles` back to CLOCK_REG, call
 *      periodicCallout, then overwrite RA);
 *   2. the speculative entry: compare PC_REG with the block's virtual PC
 *      and bail out on mismatch;
 *   3. for mapped addresses only, the mapping entry: compare the MMU entry
 *      for the page with the block's own page and bail out on mismatch.
 *
 * The length of each part is stored in SPECULATIVE_ENTRY,
 * CHECK_MAPPING_ENTRY and SAME_PAGE_ENTRY the first time it is emitted
 * and asserted to be unchanged afterwards.
 *
 * This is a deceptively simple function.  Only embra gods may modify it.
 * XXX BL comment: Will this qualify me as an embra god?? soon!! ;-)
 */

static void
Page_Prelude_Chain_Check( InstrGrp *instrGrp,  int cycles )
{
   TCA start = memptr;

   /* This is the bail out code, called if we should not have chained */
   /* to here */
   /* It can be chained, because the old ra is still in the register */
   /* (and old & new pc are) correct -- except when we are coming from */
   /* our exception return path, continue_run_without_chaining -- see */
   /* main_run.s for details */

   SET_LABEL(cont_run );
   ECj(j_op_, dispatchChain);
   ECnop;

   /* do_periodic: we end up here when our clock goes to zero or below */
   SET_LABEL(do_periodic );
   ECi(addiu_op_, CLOCK_REG, CLOCK_REG, cycles);
   ECj( jal_op_, periodicCallout );
   ECnop;
   ECi(ori_op_, RA, G0, 8); /* Smash RA from callout to prevent bogus chain ;
			    * Note that chain will be to $ra-8 = 0, so no chain
			    */

   /* First translation measures the stub length; later ones verify it */
   if (!SPECULATIVE_ENTRY)
     SPECULATIVE_ENTRY = memptr - start;

   ASSERT (start+SPECULATIVE_ENTRY == memptr);

   start = memptr;

   /* SPECULATIVE_ENTRY: */
   /* Entry point number 1 is for jr (conditional chaining).  It checks */
   /* virtual address match */
   Load_Reg_Immed( SIM_T1, instrGrp->virt_pc );

   /* If the virtual address of the last BB's target is not our VA, */
   /* bail out to j continue_run;nop  */
   ECilab( bne_op_, PC_REG, SIM_T1, USE_LABEL( cont_run ) );
   ECnop;

   if (!CHECK_MAPPING_ENTRY)
     CHECK_MAPPING_ENTRY = memptr - start;

   ASSERT (start+CHECK_MAPPING_ENTRY == memptr);

   start = memptr;

   /* CHECK_MAPPING_ENTRY: */
   /* Entry point number 2 is for regular chaining.  It checks physical */
   /* address match */

   /* For kernel (unmapped) code, there is no mapping, so no check */
   if( !IS_UNMAPPED_ADDR( instrGrp->virt_pc ) ) {

#ifdef EMBRA_USE_QC64
     ECi(ori_op_,SHADOW0,RA, 0); /* Save RA for chaining purpose */
     ECi(ADDR_ADDI_OP, A0, PC_REG, 0); /* argument = PC */
     ECj(jal_op_, Em_HackedIPageMMUQC); /* Returns in SIM_T2 */
     ECi(ori_op_,RA, SHADOW0, 0); /* Restore RA for chaining purpose */
#else
     if( embra.MPinUP ) {
       /* This code uses the actual virtual address of the current */
       /* process because we can't do the faster version for MPinMP. */
       /* There is such strong incentive for all dynamically linked */
       /* libraries to be mapped at the same VA, that we can simply */
       /* say that each VA, PA pair has its own translated block.  This */
       /* allows the faster check done below. */
       ECsh( srl_op_, SIM_T2, PC_REG, NUM_OFFSET_BITS );
       /* Word Align */
       ECsh( sll_op_, SIM_T2, SIM_T2, 2 );
       ECs(addu_op_, SIM_T2, SIM_T2, MMU_REG );
       ECi(lw_op_, SIM_T2, SIM_T2, 0);
     } else {
       /* MMU entry address is known at translate time for this CPU */
       Load_Op_Immed( lw_op, SIM_T2,
		      (uint)&EMP[instrGrp->cpuNum].mmu[PAGE_NUMBER(instrGrp->virt_pc)] );
     }
#endif

     /* The block's own page: maPC with bit 31 and the low 12 bits cleared */
     Load_32_Bit_Immed( SIM_T1,  MA_TO_UINT(instrGrp->maPC)&0x7FFFF000 );

     /* If the physical page corresponding to the present virtual page is */
     /* not what we know ours to be, then bail out.  */
     /* Don't allow matches to pages which are set exclusive -- */
     /* this implies we are executing off a page which was just */
     /* written to */
     ECilab( bne_op_, SIM_T1, SIM_T2, USE_LABEL( cont_run ) );
     ECnop;

     if (!SAME_PAGE_ENTRY)
       SAME_PAGE_ENTRY = memptr - start;

     /* SAME_PAGE_ENTRY: no need to check anything */
     ASSERT (start + SAME_PAGE_ENTRY ==memptr);

   } /* if !IS_UNMAPPED_ADDR */

} /* Page_Prelude_Chain_Check */

/*
 * Cache_Prelude_Chain_Check - emit the prelude of a translated basic
 * block; the cache-mode counterpart of Page_Prelude_Chain_Check.
 *
 * The prelude consists of, in order:
 *   1. the bail-out stub (label cont_run: jump to continue_run) and the
 *      do_periodic stub (add `cycles` back to CLOCK_REG, call
 *      periodicCallout, then overwrite RA);
 *   2. for a delay-slot group only: code that clears the low bit of
 *      PC_REG and subtracts 4 from it, after which the prelude ends;
 *   3. the speculative entry: compare PC_REG with the block's virtual PC
 *      and bail out on mismatch;
 *   4. for mapped addresses only, the mapping entry: compare the MMU entry
 *      for the page with the block's own page and bail out on mismatch.
 *      No nop follows this last branch: its delay slot is whatever gets
 *      emitted next (the I-cache check in cache mode).
 *
 * The length of each part is stored in SPECULATIVE_ENTRY,
 * CHECK_MAPPING_ENTRY and SAME_PAGE_ENTRY the first time it is emitted
 * and asserted to be unchanged afterwards.  The delay-slot path returns
 * before any of them is measured.
 */
static void
Cache_Prelude_Chain_Check( InstrGrp *instrGrp, int cycles)
{
   TCA start = memptr;

   /*
    * Prelude to basic block
    */

   SET_LABEL(cont_run );
   ECj( j_op_, continue_run );
   ECnop;

   /* do_periodic: we end up here when our clock goes to zero or below */
   SET_LABEL(do_periodic );
   ECi( addiu_op_, CLOCK_REG, CLOCK_REG, cycles );
   ECj( jal_op_, periodicCallout );
   ECnop;
   ECi(ori_op_, RA, G0, 8);  /* Smash RA from callout to prevent bogus chain */

   /*ECi( REG_ST_OP, PC_REG, VSS_BASE, PC_OFF );*/

   /*
    * Main (first) entry point of translated BB
    */
   if( instrGrp->is_delay_slot_instr ) {
      /* Align and adjust PC */
#if defined(SIM_MIPS32)
      ECsh( srl_op_, PC_REG, PC_REG, 1 );
      ECsh( sll_op_, PC_REG, PC_REG, 1 );
#else
      ECsh( dsrl_op_, PC_REG, PC_REG, 1 );
      ECsh( dsll_op_, PC_REG, PC_REG, 1 );
#endif
      ECi(ADDR_ADDI_OP, PC_REG, PC_REG, -4);
      return;
   }

   /* First translation measures the stub length; later ones verify it */
   if (!SPECULATIVE_ENTRY)
      SPECULATIVE_ENTRY = memptr - start;

   ASSERT (start+SPECULATIVE_ENTRY == memptr);

   start = memptr;

   /* Entry point number 1 is for jr (conditional chaining).  It checks */
   /* virtual address match */
   Load_Reg_Immed( SIM_T1, instrGrp->virt_pc );

   /* If the virtual address of the last BB's target is not our VA, */
   /* bail out to j continue_run;nop  */
   ECilab( bne_op_, PC_REG, SIM_T1, USE_LABEL(cont_run) );
   ECnop;

   if (!CHECK_MAPPING_ENTRY)
      CHECK_MAPPING_ENTRY = memptr - start;

   ASSERT (start+CHECK_MAPPING_ENTRY == memptr);

   start = memptr;

   /* For kernel code, virtual/physical mapping is the identity map */
   if( !IS_UNMAPPED_ADDR( instrGrp->virt_pc ) ) {

#ifdef EMBRA_USE_QC64
      ECi(ori_op_,SHADOW0,RA, 0); /* Save RA for chaining purpose */
      ECi(ADDR_ADDI_OP, A0, PC_REG, 0);
      ECj(jal_op_, Em_HackedIPageMMUQC); /* Returns in SIM_T2 */
      ECi(ori_op_,RA, SHADOW0, 0); /* Restore RA for chaining purpose */
#else

      if( embra.MPinUP ) {
         /* This code uses the actual virtual address of the current */
         /* process because we can't do the faster version for MPinMP. */
         /* There is such strong incentive for all dynamically linked */
         /* libraries to be mapped at the same VA, that we can simply */
         /* say that each VA, PA pair has its own translated block.  This */
         /* allows the faster check done below. */
         ECsh( srl_op_, SIM_T2, PC_REG, NUM_OFFSET_BITS );
         /* Word Align */
         ECsh( sll_op_, SIM_T2, SIM_T2, 2 );
         ECs(addu_op_, SIM_T2, SIM_T2, MMU_REG );
         ECi(lw_op_, SIM_T2, SIM_T2, 0);
      } else {
         /* Entry point number 2 is for regular chaining.  It checks */
         /* physical address match */
         Load_Op_Immed( lw_op, SIM_T2,
                        (uint)&EMP[instrGrp->cpuNum].mmu[PAGE_NUMBER(instrGrp->virt_pc)] );
      }
#endif
      /* The block's own page: maPC with bit 31 and the low 12 bits cleared */
      Load_32_Bit_Immed( SIM_T1,  MA_TO_UINT(instrGrp->maPC)&0x7FFFF000 );
      /* If the physical page corresponding to the present virtual page is */
      /* not what we know ours to be, then bail out.  */
      /* Don't allow matches to pages which are set exclusive -- */
      /* this implies we are executing off a page which was just */
      /* written to */
      ECilab( bne_op_, SIM_T1, SIM_T2, USE_LABEL(cont_run));
      /* Branch delay instruction in user mode is Icache check for */
      /* cache mode */

      if (!SAME_PAGE_ENTRY)
         SAME_PAGE_ENTRY = memptr - start;

      /* SAME_PAGE_ENTRY: no need to check anything */
      ASSERT (start + SAME_PAGE_ENTRY == memptr);
   } /* if (!IS_UNMAPPED_ADDR) */
} /* Cache_Prelude_Chain_Check */


/*
 * I_Memory_Check - emit the instruction-fetch check for address `imm`.
 * Execution is equivalent to reading.
 *
 * Resets trans->fp_tested to 0, then emits, depending on embra.emode:
 *
 *   EMBRA_CACHE, embra.useVQC set:
 *      load the qc_v status byte for the line and, unless the branch on
 *      it skips ahead, call phys_mem_ref_wrapper.
 *   EMBRA_CACHE, embra.useVQC clear:
 *      load the MMU entry for the page, then the byte it leads to via
 *      PA_REG, and call pa_mem_ref_wrapper when either check fails.
 *   EMBRA_PAGE:
 *      load the MMU entry for the page (or call Em_HackedIPageMMUQC when
 *      EMBRA_USE_QC64 is defined) and, unless the branch on it skips
 *      ahead, call mem_ref_wrapper.
 *
 * Before each callout the composed PC is stored to PC_OFF, A1 is set to
 * MEM_I_SHARED and A3 to trans->cycle_correction.
 *
 * In cache mode the emitted length is stored in I_QC_LEN / I_PA_LEN on
 * first use and asserted to be unchanged afterwards.
 *
 * NOTE(review): the bgtz offsets (7, 6, 5) are hard-coded and appear to
 * correspond to the number of instructions emitted after each branch;
 * keep them in step with any change to those sequences.
 */
static void
I_Memory_Check( TransState *trans, int cpuNum, VA imm)
{

   /* Note: A0 is set in instr_read_wrapper by clearing the lower bit */
   /* of A2, and copying the result into A0 */
   /* Note: A1 is set in instr_read_wrapper to MEM_I_SHARED */
   /* Don't need an icache check in page mode because of prelude chain check*/

   TCA temp=memptr;
   trans->fp_tested = 0;

   switch( embra.emode ) {
   case EMBRA_CACHE:
#if defined(SIM_MIPS64)
      CPUError("Cache mode doesn't work with 64bit stuff\n");
#endif

      if (embra.useVQC){
         SET_LABEL(rewind_iqc );

         if( embra.MPinUP ) {
            /* In this case need to use the qc_v register */
            Load_32_Bit_Immed(SIM_T4, (uint)(ADDR2SLINE(imm)));
            ECs(addu_op_, V0, SIM_T4, QC_REG);
            ECi(lb_op_, SIM_T4, V0, 0);
         } else {
            /* Load Status byte into SIM_T4 */
            Load_Op_Immed( lb_op, SIM_T4,
                           (unsigned)&EMP[cpuNum].qc_v[ADDR2SLINE(imm)] );
         }

         /* ENSURE that this line is read-only.  That way we catch it if */
         /* someone writes code, jumps to it, and eventually changes the */
         /* code */
         ECi(bgtz_op_, G0, SIM_T4, 7);

         /* Put PC into A0 */
         ECi(ADDR_ADDI_OP, A0, PC_REG,
             COMPOSE_PC(trans) - trans->instrGrp->virt_pc);
         ECi(REG_ST_OP, A0, VSS_BASE, PC_OFF);
         ECi(addiu_op_, A3, G0, trans->cycle_correction);
         ECi(addiu_op_, A1, G0, MEM_I_SHARED);
         /* Because we are in the same segment as TC, and we are jumping to */
         /* an assembly routine, we can just use the bits directly */
         if( embra.sequential ) {
            /* Don't rewind */
            ECi(addiu_op_, SIM_T2, G0, 0);
         } else {
            ECilab(addiu_op_, SIM_T2, G0, USE_LABEL_VALUE(rewind_iqc));
         }
         ECj(jal_op_, phys_mem_ref_wrapper);
	 ECnop;

	 VCTARGET;

	 /* First use measures the check's length; later uses verify it */
	 if (!I_QC_LEN)
	   I_QC_LEN = memptr - temp;

	 ASSERT(temp ==memptr-I_QC_LEN);

      } else { /* !embra.useVQC */
	static v_label_t mmu_or_cache_miss, cache_hit;

         /* get offset into page at Translate Time */
         unsigned offset = imm & (DEFAULT_PAGESZ-1);

	 /* initialize labels */
	 mmu_or_cache_miss = v_genlabel();
	 cache_hit = v_genlabel();

         if( embra.MPinUP ) {
            /* In this case need to use the MMU_RELOC register */
            Load_32_Bit_Immed(SIM_T4, (uint)(PAGE_NUMBER(imm)*
                                             sizeof(EMP[cpuNum].mmu[0] )) );
            ECs(addu_op_, SIM_T4, SIM_T4, MMU_REG);
            ECi(lw_op_, SIM_T4, SIM_T4, 0);
         } else {
            /* Load Relocated page into SIM_T4 */

            Load_Op_Immed( lw_op, SIM_T4,
                           ((unsigned)&EMP[cpuNum].mmu[PAGE_NUMBER(imm)]));
         }
         /* ENSURE that this page is read-only.  That way we catch it if */
         /* someone writes */
         /* code, jumps to it, and eventually changes the code */

	 /* MMU hit -> check physarray */

         ECs(and_op_, SIM_T4, MMUMASK_REG, SIM_T4); /* clear prot bit of MMU entry */

	 v_bleii(VREGS[SIM_T4], 0, mmu_or_cache_miss);
	 ECnop;

         /* add offset to MA page # */
         ECi(ori_op_,SIM_T4,SIM_T4,offset);

         ECsh(srl_op_,SIM_T4,SIM_T4,log2SCACHE_LINE_SIZE);
         /* physical line number: SIM_T4 = PA_REG + SIM_T4 */
         ECs( addu_op_, SIM_T4, SIM_T4, PA_REG );

         ECi( lb_op_, SIM_T4, SIM_T4, 0 );/* Load PA entry byte into SIM_T4 */

	 v_bneii(VREGS[SIM_T4], 0, cache_hit);		/* branch on PA hit */

	 v_label(mmu_or_cache_miss);

         ECi(ADDR_ADDI_OP, A0, PC_REG,
             COMPOSE_PC(trans) - trans->instrGrp->virt_pc);
         ECi(REG_ST_OP, A0, VSS_BASE, PC_OFF);

         /* Because we are in the same segment as TC, and we are jumping to */
         /* an assembly routine, we can just use the bits directly */
         /* Correct the cycle count */
         ECi(addiu_op_, A3, G0, trans->cycle_correction);
         ECi(addiu_op_, A1, G0, MEM_I_SHARED);
         /* This instruction must be in the delay slot so the USE_LABEL will */
         /* work correctly */
         if(embra.sequential ) {
            /* In embra.MPinUP no need to rewind quick check */
            ECi(addiu_op_, SIM_T2, G0, 0);
         } else {
            /* NOTE(review): this is the I-side check, yet it uses the */
            /* rewind_dqc label, and this branch never sets rewind_iqc */
            /* (the useVQC branch above does) -- confirm this is intended */
            ECilab(addiu_op_, SIM_T2, G0, USE_LABEL_VALUE(rewind_dqc));
         }
         ECj(jal_op_, pa_mem_ref_wrapper);
	 ECnop;

	 v_label(cache_hit);

	 /* First use measures the check's length; later uses verify it */
	 if (!I_PA_LEN)
	   I_PA_LEN = memptr - temp;

         ASSERT(temp==(memptr-I_PA_LEN));

      }

      break;
   case EMBRA_PAGE:
#ifndef EMBRA_USE_QC64
     if( embra.MPinUP ) {
         /* In this case need to use the MMU_RELOC register */
         Load_32_Bit_Immed(SIM_T4, (uint)(PAGE_NUMBER(imm)*
                                          sizeof(EMP[cpuNum].mmu[0] )) );
         ECs(addu_op_, SIM_T4, SIM_T4, MMU_REG);
         ECi(lw_op_, SIM_T4, SIM_T4, 0);
      } else {
         /* Load Relocated page into SIM_T4 */
         Load_Op_Immed( lw_op, SIM_T4,
                        (unsigned)&EMP[cpuNum].mmu[PAGE_NUMBER(imm)]);
      }
      /* ENSURE that this page is read-only.  That way we catch it if */
      /* someone writes */
      /* code, jumps to it, and eventually changes the code */
      ECi(bgtz_op_, G0, SIM_T4, 6);
#else
     /* A0 (the address to look up) is set in the jal's delay slot */
     ECnop;
     ECj(jal_op_, Em_HackedIPageMMUQC); /* Returns in SIM_T2 */
     ECi(ADDR_ADDI_OP, A0, PC_REG, imm - trans->instrGrp->virt_pc);

     ECi(bgtz_op_, G0, SIM_T2, 5 );
#endif

      /* Callout sequence: store the composed PC, set up arguments, call */
      ECi(ADDR_ADDI_OP, A2, PC_REG,
          COMPOSE_PC(trans) - trans->instrGrp->virt_pc);
      ASSERT(COMPOSE_PC(trans)!=0);
      ECi(REG_ST_OP, A2, VSS_BASE, PC_OFF);
      ECi(addiu_op_, A1, G0, MEM_I_SHARED);
      ECi(addiu_op_, A3, G0, trans->cycle_correction);
      ECj(jal_op_, mem_ref_wrapper);
      ECnop;

      VCTARGET;
      break;
   }
}

/*
 * TC_CACHE(vAddr): choose the translation cache for a virtual address.
 *
 * Evaluates to TC_USER only when IS_USER(vAddr) holds AND the KSU field of
 * the status register (CP0[C0_SR] masked with SR_KSU_MSK) equals
 * SR_KSU_USR; every other combination evaluates to TC_KERN.
 *
 * The expansion reads EMP[cpuNum], so an identifier named cpuNum must be
 * in scope at every use site.
 */

#define TC_CACHE(vAddr) \
   ((IS_USER(vAddr) && \
     ((EMP[cpuNum].CP0[C0_SR] & SR_KSU_MSK) == SR_KSU_USR)) \
    ? TC_USER : TC_KERN)

/* **********************************************************************
 * FindTCA
 *
 * Look up the translation of (vAddr, mAddr) in the translation cache
 * selected by TC_CACHE(vAddr) and return whatever TC_PCLookup() yields.
 *
 * Called by ChainBB.  When mAddr is 0 the address is first run through
 * Em_TranslateVirtual(); that path might have side-effects and might not
 * return (mem_translate() and ReenterTC() are marked NOT REACHED below).
 * **********************************************************************/
TCA FindTCA(VA vAddr, MA mAddr, int cpuNum)
{
   /* Pick the cache up front, before any translation side effects. */
   int whichTC = TC_CACHE(vAddr);

   if( mAddr ) {
      /* Caller supplied a memory address: just sanity-check it. */
      ASSERT( EMBRA_IS_MEMADDR(M_FROM_CPU(cpuNum), mAddr));
   } else {
      /* Must be user code */
      PA physAddr;
      uint status = Em_TranslateVirtual( cpuNum, vAddr, &physAddr, ACT_IREAD);

      if( status == BACKDOOR_CODE ) {
         /* Let Translate emulate this call */
         STAT_INC( pc_tc_bdoor_misses );
         STAT_INC( pc_tc_lookup_misses );
         /* Do this here so we don't chain backdoor addresses */
         mem_translate( cpuNum, vAddr );
         /* NOT REACHED */
         ASSERT(0);
         return 0;
      }

      if( status == NORMAL_CODE ) {
         /* this might not be appropriate for cache mode */
         CPUError("EMBRA: pc_tc_lookup error: PC=0x%x \n",vAddr);
      } else if( status == EXCEPTION_CODE ) {
         ReenterTC(&EMP[cpuNum]);
         /* NOT REACHED */
      }

      mAddr = PHYS_TO_MEMADDR(M_FROM_CPU(cpuNum), physAddr);
      ASSERT( status == NORMAL_CODE );
   }

   ASSERT( IS_KSEG0(vAddr) || IS_KUSEG(vAddr) || IS_KSEG2(vAddr));

   return TC_PCLookup(whichTC,vAddr,mAddr);
}

/*
 * FindTCA_NoSE
 *
 * Variant of FindTCA that translates vAddr with non_excepting_tv().
 * Returns 0 when either the translation or the K0-to-memory-address
 * conversion yields 0; otherwise returns the TC_PCLookup() result for
 * the cache selected by TC_CACHE(vAddr).
 */
TCA FindTCA_NoSE(int cpuNum, VA vAddr)
{
   K0A kseg0Addr = non_excepting_tv(cpuNum, vAddr);
   MA  memAddr;

   if (!kseg0Addr) {
      return 0;
   }

   memAddr = K0_TO_MEMADDR(M_FROM_CPU(cpuNum),kseg0Addr);
   if (!memAddr) {
      return 0;
   }

   return TC_PCLookup(TC_CACHE(vAddr),vAddr,memAddr);
}


/* ***********************************************************************
 * FindEntryPoint
 *
 * Given the translation of a target block (targetTrans), return the entry
 * point inside it that a chain from the block [startBB, endBB] may use.
 * The entry always starts at targetTrans + CHECK_MAPPING_ENTRY; it is
 * advanced by SAME_PAGE_ENTRY and/or IREF_LEN when the checks below show
 * that the corresponding prologue can be bypassed.
 *
 * This is called on non-speculative chainings only, either at translation
 * time (direct-chaining) or at execution time (delayed chaining)
 *
 * NOTE(review): the original comment says the offsets are defined in
 * bytes, not instructions -- confirm against the pointee type of TCA.
 * **********************************************************************/

TCA FindEntryPoint(VA startBB, VA endBB, VA target, TCA targetTrans )
{
   TCA entry = targetTrans +  CHECK_MAPPING_ENTRY;
   int lineBypassAllowed = 1;

   /* SAME_PAGE_ENTRY only exists for user code (because its mapped) */
   if( !IS_UNMAPPED_ADDR( target ) ) {
      STAT_INC(chain_user_chains);
      if( PAGE_NUMBER(startBB) != PAGE_NUMBER(target) ||
          PAGE_NUMBER(endBB)   != PAGE_NUMBER(target) ) {
         /*
          * The page bypass failed, so the cache-line bypass below must
          * not be allowed to succeed either.
          */
         lineBypassAllowed = 0;
      } else {
         STAT_INC(chain_samepg_chains);
         entry += SAME_PAGE_ENTRY;
      }
   }

   /* Cache mode: skip the I-ref check when everything is on one line. */
   if( lineBypassAllowed &&
       embra.emode == EMBRA_CACHE &&
       ADDR2SLINE(startBB) == ADDR2SLINE(target) &&
       ADDR2SLINE(endBB)   == ADDR2SLINE(target) ) {
      STAT_INC(chain_sameln_chains);
      entry += IREF_LEN;
   }

#ifdef BROKENNOW_KEEP_FOR_LATER
   /*
    * this just checks that the entry point has
    * an "expected" opcode.  Unfortunately, this is now broken
    */
   ASSERT((*(unsigned*)entry & 0xffff0000) == 0x26940000 ||
          (*(unsigned*)entry & 0xfc000000) == 0x3c000000 ||
          (*(unsigned*)entry & 0xfc0007ff) == 0x00000302);
#endif

   return entry;
}


/* *******************************************************************
 * ChainBasicBlock
 *
 * This function is called from the TC through the continue_run and
 * continue_run_without_chaining wrappers.
 *
 * The purpose of this function is to determine the translation of the
 * target BB and to patch the last instruction of the chain-from BB
 * to bypass ChainBasicBlock
 * (one time self-modifying code)
 *
 * On entry, jump_addr is the address of the jump at the end of the
 * previous basic block, and new_pc is the (simulated) PC of the
 * target basic block that we are jumping to.
 *
 * *******************************************************************/

/* Chaining note: if the cost of cacheflushing is too high, the direct */
/* jal instruction can be replaced with a load (from an area which can */
/* be overwritten) and jump.  */
/* This function is called from continue_run, and that procedure does */
/* some work for it */

/* these macros are going to be machine-dependent */

/*
 * Every macro argument is parenthesized so that an expression argument
 * (e.g. "p + 1") is cast/dereferenced as a whole rather than having the
 * cast or dereference bind to its first operand only.
 */

/*
 * Overwrite the instruction word at jump_addr with the word produced by
 * ComposeJump( jal_op, new_target >> 2 ).
 */
#define MIPS_CHANGE_JUMP_TARGET(jump_addr, new_target) \
   (*(unsigned*)(jump_addr) = ComposeJump( jal_op, (uint)(new_target)>>2 ))

/*
 * Non-zero when the top six bits of the word at jump_addr are 000011
 * (0x0c000000), which is the MIPS jal major opcode.
 */
#define MIPS_IS_JUMP(jump_addr) \
     ((*(unsigned*)(jump_addr) & 0xfc000000) == 0x0c000000)

/* XXX-XXX XXX */
/* Note this check is totally dependent on what is done in */
/* Update_PC. Here we check to see if the upper 11 bits of the */
/* previous instruction are non-0. If they are, then we know */
/* that this can't be a register indirect jump because they */
/* put an OR with rs==0 in the delay slot */
/* jump_addr must be a pointer to instruction words: the macro */
/* dereferences the element just before it. */

#define MIPS_ISNT_REGIND(jump_addr) (*((jump_addr)-1) & 0xffe00000 )

TCA ChainBasicBlock( TCA jump_addr, VA new_pc)
/*
 * Returns the translation-cache address execution should continue at for
 * curEmp->PC (asserted below to equal new_pc).  If no translation is found
 * by FindTCA(), the block is translated first.  When none of the early
 * exits below applies, the instruction word at jump_addr is rewritten into
 * a jal to the chosen entry point and that cache line is flushed.
 *
 * Historical parameter list kept from the original comment:
 *        register TCA to,
 *         register VA old_pc,
 *         register VA new_pc )
 */
{
   /*
    * Work out the memory address backing the current PC (0 if unmapped in
    * the non-QC64 case) and look for an existing translation.
    */
#ifdef EMBRA_USE_QC64
   MA newMA = (MA) Em_QC64Reload(new_pc,QC64_READ);
   TCA targetTCA = FindTCA(curEmp->PC,newMA, curEmp->myNum);
   TCA chainTCA;
#else
   MA mapping    = curEmp->mmu[PAGE_NUMBER(curEmp->PC)];
   MA newMA      = (mapping? mapping  + PAGE_OFFSET(curEmp->PC):0);
   TCA targetTCA = FindTCA(curEmp->PC,MMU_PROT_READ(newMA),curEmp->myNum);
   TCA chainTCA;
#endif

   ASSERT( new_pc == curEmp->PC);

   ASSERT( new_pc > 0x100000); /* XXX for debugging */

   /* No translation yet: build one now. */
   if (!targetTCA) {
      int has_flushed;
      targetTCA = Translate(curEmp->myNum,curEmp->PC, &has_flushed);
      SyncInstr();

      /* Translate flushed the TC, so the jump we're supposed to patch up is
       * not a valid instruction anymore.
       */
      if (has_flushed) return targetTCA;
   }
   /*
    * make sure that the qc data structures are ok
    */
   if (embra.emode==EMBRA_CACHE && embra.useVQC) {
      ASSERT( !VQC_EXCL(curEmp->qc_v[ADDR2SLINE(curEmp->PC)]));
   } else {
#ifndef EMBRA_USE_QC64
         ASSERT( !IS_MMU_PROT_WRITE(QC_MMU_LOOKUP(curEmp,curEmp->PC)));
#endif
   }
#if defined(SIM_MIPS64)
   ASSERT( IS_KSEG0(curEmp->PC) || IS_KUSEG(curEmp->PC) || IS_KSEG2(curEmp->PC));
#endif

#if CHECKREGS
   if (check_regs) {
     CheckRegs((unsigned) targetTCA, (unsigned) new_pc, 0);
     /* don't chain */
     return targetTCA;
   }
#endif

   /* no chaining for now!! */
   if (!jump_addr) {
      /*
       * no chaining. simply return the targetTCA
       */
      return targetTCA;
   }

   /* The caller came through Embra_CX_nochain: count it and do not patch. */
   if( ((uint)jump_addr ) == (uint)Embra_CX_nochain ) {
      STAT_INC(chain_bounced);
      return targetTCA;
   }
   /* Refuse to patch anything that lies in neither translation cache. */
   if (!TC_InTC(TC_USER,jump_addr)  && !TC_InTC(TC_KERN,jump_addr)) {
      CPUWarning("EmbraChaining: %10lld cpu=%d PC=0x%llx chaining from 0x%x\n",
                   (uint64)EmbraCpuCycleCount(curEmp->myNum),
                 curEmp->myNum,(Reg64)curEmp->PC,
                 jump_addr);
      return targetTCA;
   }


#if CHECK_INTER_TC_CHAIN
   /* A chain from one translation cache into the other is reported as an error. */
   if ((TC_InTC(TC_USER,jump_addr)  && TC_InTC(TC_KERN,targetTCA)) ||
       (TC_InTC(TC_KERN,jump_addr) && TC_InTC(TC_USER,targetTCA))) {
      CPUError("EmbraChaining: inter-TC chain at %10lld cpu=%d PC=0x%llx chaining from 0x%x to 0x%x\n",
                   (uint64)EmbraCpuCycleCount(curEmp->myNum),
                 curEmp->myNum,(Reg64)curEmp->PC,
                 jump_addr, targetTCA);
      return targetTCA;
   }
#endif


   /* For regind we need to chain to speculative entry point; otherwise,
    * we can do better
    */
   if( MIPS_ISNT_REGIND(jump_addr) ) {
      /*
       * get the translation for the page of the previous pc.
       * we need to decode it once more
       */
#ifdef EMBRA_USE_QC64
      MA memAddr =  (MA) ((uint)Em_QC64Reload(curEmp->oldPC,QC64_READ) &
                          ~(DEFAULT_PAGESZ-1));
#else
      MA memAddr = MMU2ADDR(QC_MMU_LOOKUP(curEmp,curEmp->oldPC));
#endif
      /* End of the chain-from block; 0 when its page has no mapping. */
      VA endOldBB = (memAddr?EndOfBB(curEmp->oldPC,memAddr+(curEmp->oldPC & (DEFAULT_PAGESZ-1))):0);

      chainTCA = FindEntryPoint(curEmp->oldPC,endOldBB,new_pc,targetTCA);
   } else {
     /* chain to speculative entry point */
      chainTCA = targetTCA;
   }

   /* Actually overwrite old jump instr */
   ASSERT( MIPS_IS_JUMP(jump_addr) );
   /* ASSERT(!TC_In_TC( (void*)(((*(unsigned*)jump_addr) & 0x3ffffff) << 2) )); */

   MIPS_CHANGE_JUMP_TARGET(jump_addr, chainTCA);

   /* The patched word was written as data; flush its line. */
   FlushOneLine(jump_addr);

#if 0
   CPUWarning("Chain: (0x%x,0x%08x)  -> (0x%x,0x%08x) : +%d val=0x%08x\n",
	      old_pc, jump_addr, curEmp->PC,chainTCA,
	      (uint)chainTCA-(uint)targetTCA,
	      *(unsigned*)jump_addr);
#endif

   return chainTCA;
/* }*/
}


/*
 * Transfer_To
 *
 * Emit the jal that leaves the current translated block for simulated
 * address vPC (which must not carry the branch-delay marker).
 *
 *   NO_C : always jump to dispatchNoChain.
 *   BB_C : if a translation of vPC is already known (and register checking
 *          is not active when CHECKREGS is compiled in), jump straight to
 *          the entry point chosen by FindEntryPoint(); otherwise jump to
 *          dispatchChain.
 *
 * Any other chainMode emits nothing.  The caller supplies the instruction
 * that follows the jal.
 */
void
Transfer_To(TransState *trans, VA vPC )
{
  ASSERT( !IN_BD(vPC));

  if (chainMode == NO_C) {
     /* This jumps directly to dispatch.  Since NO_C is set, chaining */
     /* is not in effect */
     ECj(jal_op_, dispatchNoChain);
     return;
  }

  if (chainMode == BB_C) {
     TCA knownTarget = FindTCA_NoSE(trans->instrGrp->cpuNum, vPC );
     int chainDirectly;

#if CHECKREGS
     chainDirectly = (knownTarget && !check_regs);
#else
     chainDirectly = (knownTarget != 0);
#endif

     if (chainDirectly) {
        VA blockStart = trans->instrGrp->virt_pc;
        VA blockEnd   = trans->instrGrp->virt_pc +
           mem_size(trans->instrGrp->GrpLen);

        knownTarget = FindEntryPoint(blockStart,blockEnd,vPC,knownTarget);
        ECj(jal_op_, knownTarget);
     } else {
        ECj(jal_op_, dispatchChain);
     }
  }
}


/* ********************************************************
 * EmitBranch
 *
 * Emit a host branch that mirrors the simulated branch held in
 * trans->branch_instr and skips num_instr_to_skip instructions when
 * taken.
 *
 * Code immediately following the branch should
 * be for the branch NOT taken case, while
 * code for the branch taken case should
 * start after num_instr_to_skip
 *
 * When the block has a delay-slot register conflict the branch is emitted
 * as a bgtz on BRANCHREG instead of re-reading the source registers.
 * ********************************************************/

static void Emit_Branch( TransState *trans, int num_instr_to_skip )
{
   int rsReg, rtReg;

   if( trans->instrGrp->delay_slot_reg_conflict ) {
      ECi(bgtz_op_, G0, BRANCHREG, num_instr_to_skip);
      return;
   }

   /* No conflict: load the operands and re-emit the branch itself. */
   switch( MAJOR_OPCODE(trans->branch_instr ) ) {
   case beq_op:
   case bne_op:
      /* Load whichever operand matches prev_store.real first. */
      if( prev_store.real == rt(trans->branch_instr) ) {
         rtReg = Load( SIM_T2, rt(trans->branch_instr) );
         rsReg = Load( SIM_T1, rs(trans->branch_instr) );
      } else {
         rsReg = Load( SIM_T1, rs(trans->branch_instr) );
         rtReg = Load( SIM_T2, rt(trans->branch_instr) );
      }
      if( MAJOR_OPCODE(trans->branch_instr ) == beq_op ) {
         ECi(beq_op_, rtReg, rsReg, num_instr_to_skip);
      } else {
         ECi(bne_op_, rtReg, rsReg, num_instr_to_skip);
      }
      break;

   case cop1_op:
      /* Kinda hacky way of changing the offset of an fp branch */
      /*** XXX THIS WILL HAVE TO BE FIXED -BL ******/
      *memptr++ = (trans->branch_instr & 0xffff0000) | num_instr_to_skip;
      break;

   default:
      /* Single-operand compare-with-zero branches. */
      rsReg = Load(SIM_T1, rs(trans->branch_instr));

      if (MAJOR_OPCODE(trans->branch_instr) == bcond_op) {
         /* bcond group: the rt field selects the condition. */
         switch(rt(trans->branch_instr)) {
         case bltz_op:
         case bltzl_op:
            ECb(bltz_op_, rsReg, num_instr_to_skip);
            break;
         case bgez_op:
         case bgezl_op:
            ECb(bgez_op_, rsReg, num_instr_to_skip);
            break;
         default:
            fprintf(stderr,
                    "Emit_Branch- unimplemented branch code = %x\n",
                    rt(trans->branch_instr));
            ASSERT(0);
            break;
         }
      } else {
         switch (MAJOR_OPCODE(trans->branch_instr)) {
         case blez_op:
         case blezl_op:
            ECi(blez_op_, G0, rsReg, num_instr_to_skip);
            break;
         case bgtz_op:
         case bgtzl_op:
            ECi(bgtz_op_, G0, rsReg, num_instr_to_skip);
            break;
         default:
            fprintf(stderr,
                    "Emit_Branch - unimplemented branch opcode = %x\n",
                    MAJOR_OPCODE(trans->branch_instr));
            ASSERT(0);
            break;
         }
      }
      break;
   }
}

/* ***********************************************************************
 * Update_PC
 *
 * Emit the code that ends a translated block: store the current PC_REG
 * to the OLDPC slot, advance PC_REG to the next simulated PC, and jump
 * out (via Transfer_To() or a dispatch routine).
 *
 *   trans   - translation state of the block being closed
 *   flow    - how control leaves the block (see cases below)
 *   next_PC - simulated target address for jump/branch flows
 *
 * Fix relative to the previous version: the unconditional-branch test in
 * BRANCH_FLOW was written MAJOR_OPCODE( branch_instr == beq_op ), i.e. the
 * opcode was extracted from the result of the comparison instead of from
 * the instruction, which looks like it kept the fast path from ever being
 * taken.
 * ***********************************************************************/
static void Update_PC( TransState *trans,
          flow_t flow,VA next_PC )
{
   int no_chain =  trans->instrGrp->is_rfe_block; /* || is_delay_slot_instr */
   VA current_pc = trans->instrGrp->virt_pc;

   switch( flow ){
      case REGINDIRECT_FLOW:
         /* This is a register indirect jump, so we have to return to the */
         /* main simulator loop */
         ASSERT( trans->branch_instr );

         /* Save previous PC */
         ECi(REG_ST_OP, PC_REG, VSS_BASE, OLDPC_OFF);

         /* New PC comes from BRANCHREG.  MIPS_ISNT_REGIND relies on the */
         /* encoding of this instruction (an OR with rs==0). */
         ECs( or_op_, PC_REG, G0, BRANCHREG);

         /* If we are returning from an exception, don't chain.  Can't */
         /* chain from kernel to user space */
         if( no_chain ) {
            ECj(jal_op_, dispatchNoChain);
         } else {
            ECj(jal_op_, dispatchChain);
         }
         ECnop;
         break;

      case SEQ_FLOW:
         /* Straight-line fall through to the next block. */
         ECi(REG_ST_OP, PC_REG, VSS_BASE, OLDPC_OFF);
         ECi(ADDR_ADDI_OP, PC_REG, PC_REG, mem_size(trans->instrGrp->GrpLen) );
         Transfer_To( trans, current_pc + mem_size(trans->instrGrp->GrpLen) );
         ECnop;
         break;

      case JMP_FLOW:
         /* Absolute jump to next_PC. */
         ECi(REG_ST_OP, PC_REG, VSS_BASE, OLDPC_OFF);
#if defined(SIM_MIPS32)
         ECi( lui_op_, PC_REG, G0, next_PC>>16 );
         ECi( ori_op_, PC_REG, PC_REG, next_PC );
         Transfer_To( trans, next_PC );
         ECnop;
#else
         {
             int64 dist =  next_PC - trans->instrGrp->virt_pc;
             ASSERT(dist == (int)dist);
             if ((((int)dist & 0xffff8000) == 0) ||
                 (((int)dist & 0xffff8000) == 0xffff8000)) {
                /* Fits in immediate field */
                ECi( ADDR_ADDI_OP, PC_REG, PC_REG, (int)dist);
                Transfer_To( trans, next_PC );
             } else {
                Load_32_Bit_Immed(SIM_T1, dist);
                ECs( ADDR_ADD_OP, PC_REG, PC_REG, SIM_T1);
                Transfer_To( trans, next_PC );
             }
             /* NOTE(review): unlike every other path, nothing is emitted */
             /* after Transfer_To() here -- confirm whether an ECnop is */
             /* missing or supplied by the caller. */
         }
#endif
         break;

      case BRANCH_FLOW:
         /* Unconditional branches are implemented as beq $0 $0 */
         if( MAJOR_OPCODE( trans->branch_instr ) == beq_op &&
             rs( trans->branch_instr ) == G0 &&
             rt( trans->branch_instr ) == G0 ) {
            ECi(REG_ST_OP, PC_REG, VSS_BASE, OLDPC_OFF);
            if( ((int)(next_PC - current_pc) < (1<<15)) &&
                ((int)(next_PC - current_pc) > (-(1<<15))) ) {
               /* Displacement fits a 16-bit immediate. */
               ECi(ADDR_ADDI_OP, PC_REG, PC_REG, next_PC - current_pc);
               Transfer_To( trans, next_PC);
               ECnop;
            }else{
               Load_32_Bit_Immed(SIM_T1, next_PC - current_pc);
               ECs(ADDR_ADD_OP, PC_REG, PC_REG, SIM_T1);
               Transfer_To( trans, next_PC);
               ECnop;
            }
            break;
         }
         ASSERT( trans->branch_instr );
         /* Both branches need the pc moved to the old PC */
         ECi(REG_ST_OP, PC_REG, VSS_BASE, OLDPC_OFF);
         /* Taken: skip the three fall-through instructions emitted next. */
         Emit_Branch(trans, 3);
         ECnop;

         /* Fall Through Case */
         ECi(ADDR_ADDI_OP, PC_REG, PC_REG, mem_size(trans->instrGrp->GrpLen) );
         Transfer_To( trans, current_pc + mem_size(trans->instrGrp->GrpLen));

         ECnop;
         VCTARGET; /* branch target for vcode */


         /* Branch Taken */
         if( ((int)(next_PC - current_pc) < (1<<15)) &&
             ((int)(next_PC - current_pc) > (-(1<<15))) ) {
            ECi(ADDR_ADDI_OP, PC_REG, PC_REG, next_PC - current_pc);
            Transfer_To( trans, next_PC);
            ECnop;
         }else{
            Load_32_Bit_Immed(SIM_T1, next_PC - current_pc);
            ECs(ADDR_ADD_OP, PC_REG, PC_REG, SIM_T1);
            Transfer_To( trans, next_PC );
            ECnop;
         }
         break;

      case BRANCH_UNTAKEN:
         ASSERT( trans->branch_instr );
         /* Skip the chain if condition says, "Taken"*/
         ECi(REG_ST_OP, PC_REG, VSS_BASE, OLDPC_OFF);
         Emit_Branch(trans, 4); /* was 3 -BL */
         /* Fall Through (annulling) Case */
         ECi(ADDR_ADDI_OP, PC_REG, PC_REG, mem_size(trans->instrGrp->GrpLen));
         Transfer_To( trans, current_pc + mem_size(trans->instrGrp->GrpLen) );
         ECnop;
         VCTARGET; /* branch target for vcode */
         break;

      case BRANCH_TAKEN:
         /* Previous PC in OLD_PC in the BRANCH_UNTAKEN case */
         /* Branch Taken */
         if( ((int)(next_PC - current_pc) < (1<<15)) &&
             ((int)(next_PC - current_pc) > -(1<<15)) ) {
            ECi(ADDR_ADDI_OP, PC_REG, PC_REG, next_PC - current_pc);
            Transfer_To( trans, next_PC );
            ECnop;
         }else{
            Load_32_Bit_Immed( SIM_T1, next_PC - current_pc );
            ECs(ADDR_ADD_OP, PC_REG, PC_REG, SIM_T1);
            Transfer_To( trans, next_PC );
            ECnop;
         }
         break;
      }
}


/*-----------------------------------------------------------------------------
 *
 *  This section provides support for callout facilities
 *
 *---------------------------------------------------------------------------*/

/* XXX THIS IS WRONG!!! beware, was 7 */

#define DO_EXCEPTION_CALLOUT_LEN (bogus)

/*
 * Do_Exception_Callout
 *
 * Emit a call to `callout` that raises exception_code:
 *   A1     <- exception_code
 *   PC_OFF <- simulated PC of the current instruction (computed in SIM_T4)
 *   A3     <- trans->cycle_correction
 *   SIM_T2 <- CALLOUT_EXCEPTION (procedure number)
 *
 * The previously declared locals `start` and `pc` were never read and
 * have been removed; the emitted instruction sequence is unchanged.
 */
static void
Do_Exception_Callout( TransState *trans, int exception_code)
{
  /* Put exception code into A1 */
  Load_32_Bit_Immed( A1, exception_code );

  /* Compute the current pc in SIM_T4 and store it into the state */
  ECi(ADDR_ADDI_OP, SIM_T4, PC_REG,
             COMPOSE_PC(trans) - trans->instrGrp->virt_pc);
  ECi(REG_ST_OP, SIM_T4, VSS_BASE, PC_OFF);
  ECi(addiu_op_, A3, G0, trans->cycle_correction);

  /* Put procedure number in SIM_T2 */
  ECi( addiu_op_, SIM_T2, G0, CALLOUT_EXCEPTION );

  /* XXX - Note we do not set A2 because it is only checked on an */
  /* EXC_CPU, coprocessor unusable exception, and we do not raise */
  /* those from emitted code */
  /* Do the Callout */
  ECj( jal_op_, callout );
  ECnop;

}


/*
 * Do_Callout
 *
 * Emit a call to `callout` for procedure number callout_code:
 *   PC_OFF <- simulated PC of the current instruction (computed in SIM_T4)
 *   A3     <- trans->cycle_correction
 *   SIM_T2 <- callout_code
 *
 * The emitted sequence must be exactly DO_CALLOUT_LEN long (asserted),
 * because Check_C1_Usable branches over it by that distance.  Clears
 * trans->fp_tested.
 *
 * The unused local `pc` has been removed; the emitted code is unchanged.
 */
void Do_Callout( TransState *trans,  int callout_code )
{
  TCA start = memptr;

  /* Compute the current pc in SIM_T4 and store it into the state */
  ECi(ADDR_ADDI_OP, SIM_T4, PC_REG,
             COMPOSE_PC(trans) - trans->instrGrp->virt_pc);
  ECi(REG_ST_OP, SIM_T4, VSS_BASE, PC_OFF);
  ECi(addiu_op_, A3, G0, trans->cycle_correction);

  /* Put procedure number in SIM_T2 */
  ECi(addiu_op_, SIM_T2, G0, callout_code );
  /* Do the Callout */
  ECj( jal_op_, callout );
  ECnop;

  ASSERT( start + DO_CALLOUT_LEN == memptr);

  trans->fp_tested = 0;
}


/*-----------------------------------------------------------------------------
 *
 *  This section provides support for memory system simulation
 *
 *---------------------------------------------------------------------------*/


/*
 * Page_D_Cache_Check
 *
 * Emit the page-mode data quick check for an access of kind new_state to
 * the address held in init_reg.
 *
 * Input:  address in init_reg (D_Memory_Check passes A0)
 * Output: translated addr in SIM_T1 (inline variant; the out-of-line
 *         variant delegates to Em_dynPQCdsh / Em_dynPQCdex --
 *         presumably with the same result register, confirm there)
 */
static void
Page_D_Cache_Check( TransState *trans, char new_state, int init_reg )
{
   if (!embra.inlineQC) {
      /* Out-of-line quick check: hand cycle correction (A3) and the */
      /* current pc (V0) to the assembly routine for this access kind. */
      ECi(addiu_op_, A3, G0, trans->cycle_correction);
      ECi(ADDR_ADDI_OP, V0, PC_REG,
          COMPOSE_PC(trans) - trans->instrGrp->virt_pc);

      if( new_state == MEM_D_SHARED ) {
         ECj(jal_op_, Em_dynPQCdsh);
      } else if( new_state == MEM_D_EXCLUSIVE ) {
         ECj(jal_op_, Em_dynPQCdex);
      } else {
         /* Instruction-side (or unknown) states are not valid here. */
         CPUError("MemIExclusive\n");
         ASSERT(0);
      }
      ECnop;
      return;
   }

   /* ---- inline quick check ---- */

   /* Compute vpn (word aligned) into SIM_T1 */
   ECsh(srl_op_, SIM_T1, init_reg, 12);
   ECsh(sll_op_, SIM_T1, SIM_T1, 2 );

   /* Use vpn to index into TLB array */
   ECs(addu_op_, SIM_T1, MMU_REG, SIM_T1);

   /* Load the (shifted, K0 offset) physical page */
   ECi(lw_op_, SIM_T1, SIM_T1, 0);

   /* Get the offset from the VA */
   ECi(andi_op_, SIM_T4, init_reg, DEFAULT_PAGESZ-1 );

   /* Correct the cycle count */
   ECi(addiu_op_, A3, G0, trans->cycle_correction);

   /* Hit test: branch over the callout emitted below. */
   if( new_state == MEM_D_EXCLUSIVE ) {
      ECb(bltz_op_, SIM_T1, 9 ); /* was 7 */
   } else {
      /* MEM_SHARED */
      ECi(bne_op_, G0, SIM_T1, 9);
   }
   ECnop;

   /* Miss path: insert PC into state structure */
   ECi(ADDR_ADDI_OP, V0, PC_REG,
       COMPOSE_PC(trans) - trans->instrGrp->virt_pc);
   ECi(REG_ST_OP, V0, VSS_BASE, PC_OFF);
   ECi(addiu_op_, A1, G0, new_state);

   /* Because we are in the same segment as TC, and we are jumping to */
   /* an assembly routine, we can just use the bits directly */
   ECj(jal_op_, mem_ref_wrapper);
   ECnop;

   /* ugh - zero t4 for or (below) -BL */
   ECi(ori_op_, SIM_T4, 0, 0);

   VCTARGET; /* branch target for vcode */

   /* Note this instruction is needed if we DONT call out */
   /* Or the offset and physical page together */
   ECs(or_op_, SIM_T1, SIM_T1, SIM_T4);

   /* Clear upper bit  */
   ECs(and_op_, SIM_T1, MMUMASK_REG, SIM_T1);
}


/*
 * Cache_D_Cache_Check:
 *
 * This is the check for cache mode. We have two methods of simulating the
 * cache, the Virtual Quick Check (VQC) and the regular cache check.
 *
 * If we are using the VQC, we index (by cache line) into the VQC array
 * (pointed to by QC_REG), and get a status byte.
 *
 * We also do the virtual address translation here, indexing into the TLB
 * array (by VPN) and getting the physical page address.
 *
 * If we miss in the TLB or the cache, we call out to phys_mem_ref_wrapper
 * (VQC) or pa_mem_ref_wrapper (regular check). The emitted code passes a
 * rewind value in SIM_T2 (0 when embra.sequential is set); per the original
 * notes the wrapper uses it to re-run the quick check after the miss has
 * been handled.
 *
 * Input:  address in init_reg; new_state is MEM_D_EXCLUSIVE for writes,
 *         anything else is treated as a shared (read) access.
 * Output: offset OR'ed with the physical page in SIM_T1.
 *
 * Fix relative to the previous version: the cache_hit label was branched
 * to and placed in the non-VQC path without ever being allocated with
 * v_genlabel(); it is now allocated alongside the other two labels.
 */

static void
Cache_D_Cache_Check( TransState *trans,char new_state, int init_reg)
{
   static v_label_type vqc_hit, mmu_or_cache_miss,
   cache_hit;                                          /* labels for branches */

#if defined(SIM_MIPS64)
  CPUError("Cache mode doesn't work on 64bit\n");
#endif

  /* allocate labels */
  vqc_hit = v_genlabel();
  mmu_or_cache_miss = v_genlabel();
  cache_hit = v_genlabel();

  if (embra.useVQC){

    SET_LABEL(rewind_dqc );                 /* rewind_dqc => offset of first instruction */

    ECsh( srl_op_, SIM_T1, init_reg, log2SCACHE_LINE_SIZE );   /* Virtual Cache line number ->SIM_T1 */
    ECs( addu_op_, SIM_T2, SIM_T1, QC_REG );   /* (mem_state + (addr>>LOG2SCACHE_LINE_SIZE)) -> SIM_T2 */
    ECi( lb_op_, SIM_T2, SIM_T2, 0 );          /* Load Status Byte into SIM_T2 */
    ECsh( srl_op_, SIM_T1, init_reg, 12);      /* Compute VPN (word aligned) into SIM_T1 */
    ECsh( sll_op_, SIM_T1, SIM_T1, 2);
    ECs( addu_op_, SIM_T1, MMU_REG, SIM_T1 );  /* Use VPN to index into TLB array */
    ECi( lw_op_, SIM_T4, SIM_T1, 0 );          /* Load the (shifted, K0 offset) physical page */
    ECi( andi_op_, SIM_T1, init_reg, DEFAULT_PAGESZ-1);   /* Get the offset from the VA */

    ECs(or_op_, SIM_T1, SIM_T1, SIM_T4);       /* Or the offset and physical page together -
                                                * needed if we DON'T call out */

    if( new_state == MEM_D_EXCLUSIVE ) {
      v_bltii( VREGS[SIM_T2], 0, vqc_hit);     /* MEM_D_EXCLUSIVE: write, branch on negative  */
    } else {
      v_bneii( VREGS[SIM_T2], 0, vqc_hit);     /* MEM_SHARED: read, go if non-zero */
    }
    ECnop;

    /* Miss: store the PC (via V0) and call phys_mem_ref_wrapper */
    ECi(ADDR_ADDI_OP, V0, PC_REG,
        COMPOSE_PC(trans) - trans->instrGrp->virt_pc);
    ECi(REG_ST_OP, V0, VSS_BASE, PC_OFF);
    ECi(addiu_op_, A3, G0, trans->cycle_correction);   /* Correct the cycle count */
    ECi(addiu_op_, A1, G0, new_state);

    if( embra.sequential ) {
      ECi(addiu_op_, SIM_T2, G0, 0);           /* In MPinUP no need to rewind quick check */
    } else {
      ECilab(addiu_op_, SIM_T2, G0, USE_LABEL_VALUE(rewind_dqc));
    }

    ECj(jal_op_, phys_mem_ref_wrapper);        /* routine is within jump range */
    ECnop;

    v_label(vqc_hit);                          /* branch target for vcode */

  } else { /* !embra.useVQC */

    /* NOTE(review): rewind_dqc is only SET_LABEL'ed in the VQC path above, */
    /* yet USE_LABEL_VALUE(rewind_dqc) is emitted below -- confirm. */

    ECsh(srl_op_, SIM_T1, init_reg, 12);       /* Compute vpn (word aligned) into SIM_T1 */
    ECsh(sll_op_, SIM_T1, SIM_T1, 2 );
    ECs(addu_op_, SIM_T1, MMU_REG, SIM_T1);    /* Use vpn to index into TLB array */
    ECi(lw_op_, SIM_T1, SIM_T1, 0);            /* Load the (shifted, K0 offset) physical page */
    ECi(andi_op_, SIM_T4, init_reg, DEFAULT_PAGESZ-1 );   /* Get the offset from the VA */
    ECs(or_op_, SIM_T1, SIM_T1, SIM_T4);       /* Or the offset and physical page together
                                                * - needed if we hit in TLB */

    if( new_state == MEM_D_EXCLUSIVE ) {       /* Data_Write */
      v_bgeii( VREGS[SIM_T1], 0, mmu_or_cache_miss);   /* write protect -> neg mmu-entry */
    } else {                                   /* MEM_SHARED: Data_Read */
      v_beqii( VREGS[SIM_T1], 0, mmu_or_cache_miss);   /* branch on MMU miss */
    }
    ECnop;

    /* MMU hit -> check physarray ( phys addr. is in SIM_T1 )*/

    ECs( and_op_, SIM_T1, MMUMASK_REG, SIM_T1);             /* delete protection bit of MMU entry */
    ECsh( srl_op_, SIM_T2, SIM_T1, log2SCACHE_LINE_SIZE);   /* SIM_T2 = cache line index */
    ECs( addu_op_, SIM_T2, SIM_T2, PA_REG );                /* SIM_T2 = PA_REG + SIM_T2 */
    ECi( lb_op_, SIM_T2, SIM_T2, 0 );                       /* Load PA entry byte into SIM_T2 */
    /* NOTE(review): every other shift in this function is emitted with */
    /* ECsh; confirm ECi encodes sll correctly. */
    ECi( sll_op_, SIM_T2 , SIM_T2, 24);                     /* exclusive entry = 0x80 */

    /* update PC  - IS THIS NEEDED???? Maybe we need the PC on cache hits? */
    /* NOTE(review): the PC is computed into SIM_T4 but V0 is what gets */
    /* stored; the VQC path computes and stores V0 -- confirm intent. */
    ECi( ADDR_ADDI_OP, SIM_T4, PC_REG,
        COMPOSE_PC(trans) - trans->instrGrp->virt_pc);
    ECi( REG_ST_OP, V0, VSS_BASE, PC_OFF);

    if( new_state == MEM_D_EXCLUSIVE ) {
      v_bltii( VREGS[SIM_T2], 0, cache_hit );  /* MEM_D_EXCLUSIVE: branch on PA WRITE Hit */
    } else {
      v_bneii( VREGS[SIM_T2], 0, cache_hit);   /* MEM_SHARED: branch on PA READ Hit */
    }
    ECnop;

    v_label(mmu_or_cache_miss);                /* jump here on mmu/cache miss */

    /* Update PC ???? (same SIM_T4 / V0 mismatch as above) */
    ECi(ADDR_ADDI_OP, SIM_T4, PC_REG,
        COMPOSE_PC(trans) - trans->instrGrp->virt_pc);
    ECi(REG_ST_OP, V0, VSS_BASE, PC_OFF);
    ECi(addiu_op_, A3, G0, trans->cycle_correction);   /* Correct the cycle count */
    ECi(addiu_op_, A1, G0, new_state);

    if( embra.sequential ) {
      ECi(addiu_op_, SIM_T2, G0, 0);           /* In MPinUP no need to rewind quick check */
    } else {
      ECilab(addiu_op_, SIM_T2, G0, USE_LABEL_VALUE(rewind_dqc));
    }

    ECj(jal_op_, pa_mem_ref_wrapper);          /* In same segment */
    ECnop;

    v_label(cache_hit);                        /* jump here on cache hit */
  }
}


/*
 * D_Memory_Check
 *
 * Emit the address computation and data quick check for a load/store
 * `instr` whose access kind is new_state.  Assumes rs(instr) names the
 * base register (without the offset applied).  The effective address
 * (base + IMMED) always ends up in A0 before the mode-specific check is
 * emitted.  Clears trans->fp_tested.
 *
 * Returns SIM_T1, the register that holds the translated address.
 */
static int
D_Memory_Check( TransState *trans,
                unsigned instr,
                char new_state)
{
  int baseReg;

  trans->fp_tested = 0;

  baseReg = Load( A0, rs(instr) );

  /* So the deal is -- We need the Base+Offset value in A0 for the callout */
  /* if we don't do it here, then our QC rewind gets hassled */
  /* we could only do it in the callout part of the cache check, but then */
  /* different QC's would be different lengths. */
  /* The solution to that would be for the QC to rewind itself either */
  /* with a jump or passing the rewind offset to the wrapper function */
  if( IMMED(instr) ) {
     /* GPR[rs] + offset -> A0 */
     ECi(ADDR_ADDI_OP, A0, baseReg, IMMED(instr) );
  } else {
     /* Ensure that address gets loaded into A0 for the call to mem_ref */
     ECs(or_op_, A0, G0, baseReg);
  }

  /* Either way the effective address now lives in A0. */
  if( embra.emode == EMBRA_PAGE ) {
     Page_D_Cache_Check( trans,new_state, A0);
  } else if( embra.emode == EMBRA_CACHE ) {
     Cache_D_Cache_Check( trans,new_state, A0);
  } else {
     CPUError("Embra mode incorrect\n");
  }
  return SIM_T1;
}


/*
 * Check_Timer
 *
 * Emit the per-block timer check: subtract num_cycles from CLOCK_REG and
 * branch to the do_periodic label when the result is <= 0.
 *
 *   num_cycles - amount to subtract (emitted as a 16-bit immediate)
 *   pc_val, bd_slot - not used by the emitted code; kept for the callers'
 *                     sake.  The unused local built from them and the
 *                     unused `extern int clock_val` were removed.
 */
static void
Check_Timer( int num_cycles, uint pc_val, int bd_slot )
{
  (void)pc_val;
  (void)bd_slot;

  /* Decrement by number of instructions, allowing processor speed to */
  /* be controlled by timer value */
  /* can fit this in 16 bits.  As processor speeds up be careful */
  ECi(addiu_op_, CLOCK_REG, CLOCK_REG, -num_cycles);
  /* Store cycle count for interventions */
  if( embra.emode == EMBRA_CACHE &&embra.parallel ) {
     ECi(sw_op_, CLOCK_REG, VSS_BASE, CCD_OFF);
  } else if( num_cycles == 2 ) {
     /* Don't allow b in bdelay slot */
     ECnop;
  }
  ECilab(blez_op_, G0, CLOCK_REG, USE_LABEL( do_periodic ) );
  ECnop;

  /*XXXXX - DANGER - NO translation may begin with a store to the */
  /*simulated state.  If it did, it could be in this branch delay */
  /*slot, and that would cause incorrect behavior */

#ifdef CHAIN_BAR_SYNC
     {
        extern void embra_sync_barrier(void);
        int OK = 1;
        int i;
        for( i = 0; i < TOTAL_CPUS; i++ )
           OK = OK && EMP[i].outOfSlaveLoop;
        if( OK ) {
           ECi(ori_op_, SIM_T4, G0, 1);
           /* This thing is broken */
           ECj(jal_op_, embra_sync_barrier);
           ECi(sw_op_, SIM_T4, VSS_BASE, OUTTC_OFF);
           ECi(sw_op_, G0, VSS_BASE, OUTTC_OFF);
        }
     }
#endif
}

/*
 * Increment_Memory_Access_Count
 *
 * In cache mode only, emit additions of the given access counts to
 * DHIT_REG and IHIT_REG.  A zero count emits nothing for that register.
 */
static void
Increment_Memory_Access_Count( int num_D_accesses, int num_I_accesses )
{
   if( embra.emode != EMBRA_CACHE ) {
      return;
   }

   if( num_D_accesses ) {
      ECi(addiu_op_, DHIT_REG, DHIT_REG, num_D_accesses );
   }
   if( num_I_accesses ) {
      ECi(addiu_op_, IHIT_REG, IHIT_REG, num_I_accesses );
   }
}

/* The kernel doesn't want to spill fp registers on context switches, */
/* so it sets cp1 unusable, and relies on a coprocessor unusable */
/* exception to tell it that the process is using floating point */
/* */
/* Check_C1_Usable emits two guards ahead of a floating point operation: */
/*   1. if the word at FPLOADED_OFF is zero, set it to 1 and call RestoreFP; */
/*   2. if the SR_CU1 bit of the simulated status register is clear, */
/*      perform the CALLOUT_RAISE_C1_UNUSABLE callout. */
/* Both guards are skipped entirely when trans->fp_tested is already set. */
static void
Check_C1_Usable(TransState *trans)
{

  if (trans->fp_tested) {
    /*
     * Simple optimization. Do not check on consecutive
     * FP ops. fp_tested gets cleared on all callouts.
     */
    return;
  }
  /*
   * Determine whether the FP register file needs to be
   * loaded or not. This is the complementary operation
   * to SPILL_FP_ENABLED callout macro.
   */

  /* Branch distance 3 skips the sw / jal / nop below when the flag is */
  /* already non-zero.  The ori sits right after the branch (presumably */
  /* its delay slot -- confirm), so SIM_T1 is set to 1 on both paths. */
  ECi(lw_op_,SIM_T1, VSS_BASE,FPLOADED_OFF);
  ECi(bne_op_,G0,SIM_T1,3);
  ECi(ori_op_,SIM_T1,G0,1);
  ECi(sw_op_,SIM_T1,VSS_BASE,FPLOADED_OFF);
  ECj(jal_op_, RestoreFP);
  ECnop;

  VCTARGET;

   /*
    * Check if the OS has enabled the COP1. Generate
    * an exception callout otherwise. Note that the exception callout
    * never returns.
    */
   ECi(REG_LD_OP, SIM_T4, VSS_BASE, CP0_OFF + C0_SR*REG_SIZE);
   /* XXX - this assumes that SR_CU1 has its bit set in the upper 16 */
   ECi(lui_op_, SIM_T1, G0, SR_CU1>>16);
   ECs(and_op_, SIM_T4, SIM_T4, SIM_T1);
   /* XXX - this offset depends on implementation of Do_Callout */
   /* (Do_Callout asserts that it emits exactly DO_CALLOUT_LEN.) */
   ECi(bne_op_, G0, SIM_T4, DO_CALLOUT_LEN);
   Do_Callout( trans,CALLOUT_RAISE_C1_UNUSABLE);

   VCTARGET;

   /*
    * Need to do this after the callout since the callout
    * clears fp_tested. That's fine since the callout never
    * returns if it is taken.
    */
  trans->fp_tested = 1;
}

/*
 * Pipe_Time -- a (very simple) pipeline timing model.
 *
 * Straight-line code is assumed to run at 1 CPI, so a group normally
 * costs GrpLen.  When is_delay_slot_instr is non-zero the group is
 * asserted to hold exactly two instructions and GrpLen - 1 is returned
 * instead.  Branch and other stalls are accounted for elsewhere (???).
 */
uint
Pipe_Time(InstrGrp* thisGrp, int is_delay_slot_instr )
{
   /* Common case: the cost is simply the group length */
   if( !is_delay_slot_instr ) {
      return (thisGrp->GrpLen);
   }

   /* Delay-slot translation: the group must be exactly two long */
   ASSERT( thisGrp->GrpLen == 2 );
   return (thisGrp->GrpLen - 1);
}


/*----------------------------------------------------------------------------
 *
 *      Routine:    Translate
 *
 *  This is the main translation routine.  It calls DecodeInstr which
 * decodes a basic block of instructions, and then for each instruction
 * it switches on the opcode and writes the appropriate translation.

 * Other code emitted:
 * We emit I-cache checks at the start of a basic block and at
 * I-cache line boundaries.  We check the clock at the start to see if
 * we need to take a clock interrupt.  We decrement the clock by the
 * number of instructions in the block, and correct it if we call out
 * (like for an exception)
 *--------------------------------------------------------------------------*/
TCA Translate(int cpuNum, VA current_pc, int* has_flushed)
{

   InstrGrp instrGrp;
   TransState trans;
   TCA tc_start;
   int tc_offset;
   int i;
   VA next_PC = -1;
   register Instruction instr;
   VA ret_addr;
   int tmp1, tmp2, tmp3;
   int pipe_time = 0;
   int est_translen = 0;
   int is_delay_slot_instr= 0;

   int tcCache = TC_CACHE(current_pc);

   ASSERT( EMP[cpuNum].PC != 0xc0009f20);
#ifdef notdef
   CPUWarning("Translating CPU %d PC 0x%08x\n", cpuNum, current_pc);
#endif

#ifdef SOLO
   ASSERT(EMP[cpuNum].PC < 0x80000000);
#else
   ASSERT(PAGE_NUMBER(current_pc) != PAGE_NUMBER(__MAGIC_OSPC_BASE));
#endif
   ASSERT( embra.emode != EMBRA_PAGE || (current_pc & 0x3) == 0 );
   /* NOTE: TIMER_STOP is also called in EXCEPTION, mem_translate, */
   /* because we can take an */
   /* exception (and hence non-local return) during translation */
   STAT_TIMER_START( trans_timer );

   ASSERT( !EMP[cpuNum].stalled );
   ASSERT(current_pc);
   /* This is just debugging.  If the text segment gets larger than */
   /* 16MB, then this is no longer valid */
   /* In Hive there is text all over kseg0, so need to allow that. */
   VASSERT(current_pc < 0x68000000 || current_pc > 0x78000000,
           ("current_pc 0x%x\n", current_pc) );

#ifndef DEBUG_TRANSLATOR
   ASSERT(current_pc == EMP[cpuNum].PC);
#endif

   for( i = 0; i < 32; i++ ) {
      reg2alloc[i].num_src = 0;
      reg2alloc[i].alloc_reg = 0;
   }

   InitLongConst();

   /* The only bad thing about this is that I am more committed to */
   /* having the low bit set in the PC as a legitmate construct.  If I */
   /* want to detect unaligned accesses, I will have to junk this */
   /* scheme */
   if( IN_BD( current_pc ) ) {
      CPUError("Embra: got a IN_BD pc at 0x%x (Tell Ed, he wants to see it)\n",
                 current_pc);
      current_pc = CLEAR_BD(current_pc);
      current_pc -= INST_SIZE;
      /* Can't chain these because the oldPC will not be correct */
      /* Also the offsets for these translations are different from
         standard translations */
      is_delay_slot_instr = 1;
   } else {
      is_delay_slot_instr = 0;
   }
   ASSERT ((current_pc &0x3)==0);
   STAT_INC( translations );


   /*
    * Get an instruction group and decode them
    * into intermediate format.
    *
    * NOTE THAT THIS FUNCTION HAS SIDE-EFFECTS, NAMELY
    * CAN GENERATE EXCEPTIONS IN 2 OCCASIONS!
    */


   instrGrp.reg2alloc = reg2alloc;
   DecodeInstrs( &instrGrp, cpuNum, current_pc,is_delay_slot_instr);
   STAT_DEC_BB_SIZE( instrGrp.GrpLen );
   trans.instrGrp = &instrGrp;

   ASSERT( IS_KUSEG(current_pc) || IS_KSEG0(current_pc) ||
           IS_KSEG2(current_pc));


   /*
    * Check that this translation can fit in the TC
    * est_translen is in instructions
    */

   est_translen =  LONGEST_TRANS(instrGrp.GrpLen);

   est_translen += 10000; /* vcode adds 2 extra instns at end ;-/ */

   if ( !TC_Is_Room(  est_translen, tcCache ) ) {
      STAT_INC(tc_filled);
      /* was: tcCache; but if we flush one cache, don't we have
       * to flush both, because we allow chaining between user
       * and kernel addresses?? XXX -BL
       */
         Clear_Translation_State( tcCache );
      *has_flushed = 1;
   } else *has_flushed = 0;


   /*
    * get the pointer to store instructions in TC
    */
   tc_start = TC_GetTCPtr(  tcCache );

   /* for vcode, create a new leaf procedure */
   v_lambda("foo", "", 0, V_LEAF, tc_start, est_translen);

   /* we really need to know the address of the first instruction1! */
   tc_offset = memptr - tc_start;

   VC_Allocate_Regs(); /* allocate registers for vcode */

   pipe_time = Pipe_Time( &instrGrp, is_delay_slot_instr );
   ASSERT( pipe_time >= instrGrp.GrpLen);

   trans.flow = SEQ_FLOW;
   trans.curPC = current_pc;
   trans.cycle_correction = 0;
   trans.branch_instr = 0;
   trans.fp_tested = 0;

   /* XXX BL: hack to make code line up */

   /* Note: Prelude functions emit code that should not simply be */
   /* sequentially executed. */
   /* They emit code that allows other basic blocks to chain to this one */
   /* And they decrement the counter */
   switch( embra.emode ) {
   case EMBRA_CACHE:
      Cache_Prelude_Chain_Check( &instrGrp, pipe_time);

      /* If this is a delay slot instruction, don't check PC of branch */
      if( !is_delay_slot_instr ) {
         I_Memory_Check( &trans,cpuNum, current_pc);
      }
      break;
   case EMBRA_PAGE:
      Page_Prelude_Chain_Check( &instrGrp, pipe_time );
      /* In page mode, Prelude_Chain_Check ensures that the page we are */
      /* about to execute is mapped with the proper permissions */
      break;
   }

   /*
    * Increment time.
    * If timer has expired, call out
    */
   Check_Timer( pipe_time, current_pc, 0 /* not delay slot*/ );
   trans.cycle_correction = pipe_time;

   /*
    * Deal with register allocation
    * Rfe callout assigns to register set directly.  Can't allocate registers
    * Tns writes to register set  directly.  Can't allocate registers
    */
   {
      int alloc = 0;
      if( !instrGrp.is_rfe_block && !instrGrp.no_reg_allocate) {
         set_src2alloc();
         for(i = 0; i < NUM_ALLOCATABLE_REGS; i++)
            /* If there are fewer than NUM_ALLOCATABLE_REGS used, don't */
            /* load to unused registers */
            /* We may want to allocate BRANCHREG for $ra, that would be nice */
            if( src2alloc[i]->num_src)
               src2alloc[i]->alloc_reg = reg_alloc[i];
         alloc = Preload_Regs();
      }
      /*
       * Can't put incr memory access in delay slot or it will mess up
       * instruction counting and cause cache mode to hang
       *
       * (most probably a MPinMP cache mode comment, but it's a good
       * thing to remember! (bugnion)
       */
      if( !alloc ){
         ECnop;
      }
   }

   /* Add memory accesses, soak a delay slot */
   if( is_delay_slot_instr ) {
      Increment_Memory_Access_Count( instrGrp.numDMemoryAcesses,
                                     instrGrp.numIMemoryAcesses - 1 );
   } else {
      Increment_Memory_Access_Count( instrGrp.numDMemoryAcesses,
                                     instrGrp.numIMemoryAcesses );
   }


   /*
    * MAIN LOOP
    *
    * Emit translations for each instruction
    */

   for (i = 0; (i < instrGrp.GrpLen); i++)  {
      instr = instrGrp.instrs[i];
      trans.curPC = current_pc + INST_SIZE*i;
      trans.cycle_correction = pipe_time - i;
      /*
       * First, check text residency and presence in the cache
       */

      ASSERT (!is_delay_slot_instr);
      switch( embra.emode ) {
      case EMBRA_PAGE:
         /* Insert ICache check if this is a new page */
         if( i && ((trans.curPC & (DEFAULT_PAGESZ-1)) == 0 ) ) {
            ASSERT (instrGrp.isKseg0 || i == instrGrp.GrpLen-1);
#if 0
            /*
             * no longer need to check here. either kseg0 -> no check
             * or the branch delay slot has been checked earlier
             */
            I_Memory_Check( &trans,cpuNum,trans.curPC);
#endif
         }

         break;
      case EMBRA_CACHE:
         /* Insert ICache check if this is a new line */
         if( i && ((trans.curPC) & (SCACHE_LINE_SIZE-1)) == 0 ) {
            I_Memory_Check( &trans,cpuNum,trans.curPC);
         }
         break;
      }
      if (!instrGrp.isKseg0 &&
          i !=  instrGrp.GrpLen-1 &&
          PAGE_NUMBER(trans.curPC) != PAGE_NUMBER(trans.curPC+INST_SIZE)) {
         /*
          * straddling a page boundary. only if there is a branch. We force
          * an i-cache check before the branch. But first assert that this
          * is a branch.
          *
          */
         ASSERT (i== instrGrp.GrpLen-2);
         ASSERT (instrGrp.GrpLen ==2);
         ASSERT( IsCtlInstr(instr));
         I_Memory_Check( &trans,cpuNum, trans.curPC+INST_SIZE);
      }


      /*
       * Insert annotation callouts in translated code
       */

      if (instrGrp.pcAnn[i] & ANNFM_PRE_PC_TYPE) {
         uint *startCallout = memptr;
         Do_Callout(&trans, CALLOUT_PREPC_ANN);
         TC_IncrementSize(tcCache,(char*)memptr-(char*)startCallout);
      }

      /* if nop, do nothing */
      if (instr != _nop) {
      switch ( MAJOR_OPCODE(instr ) )
         {

/****************************************************************/
         case spec_op:
/****************************************************************/
            switch( FUNC( instr ) )
               {

                  /*************************************/
                  /* Shifts -- */
                  /*************************************/
               case sll_op:    /* [rt] shift by shamt -> [_rd] */
               case sra_op:
               case srl_op:
#if defined(SIM_MIPS64)
               case dsll_op:
               case dsll32_op:
               case dsrl_op:
               case dsrl32_op:
               case dsra_op:
               case dsra32_op:
#endif

                  /* Uses RT RD */

                  tmp1 = Load( SIM_T1, rt(instr) );

                  tmp3 = Set_Destination( SIM_T2, rd(instr) );

		  switch( FUNC( instr ) ) {
		  case sll_op:
		    ECsh(sll_op_, tmp3, tmp1, SHAMT(instr));
		    break;
		  case sra_op:
		    ECsh(sra_op_, tmp3, tmp1, SHAMT(instr));
		    break;
		  case srl_op:
		    ECsh(srl_op_, tmp3, tmp1, SHAMT(instr));
		    break;
#if defined(SIM_MIPS64)
               case dsll_op:
		    ECsh(dsll_op_, tmp3, tmp1, SHAMT(instr));
		    break;
               case dsll32_op:
		    ECsh(dsll32_op_, tmp3, tmp1, SHAMT(instr));
		    break;
               case dsrl_op:
		    ECsh(dsrl_op_, tmp3, tmp1, SHAMT(instr));
		    break;
               case dsrl32_op:
		    ECsh(dsrl32_op_, tmp3, tmp1, SHAMT(instr));
		    break;
               case dsra_op:
		    ECsh(dsra_op_, tmp3, tmp1, SHAMT(instr));
		    break;
               case dsra32_op:
		    ECsh(dsra32_op_, tmp3, tmp1, SHAMT(instr));
		    break;
#endif
		  default:
		    goto unknown_opcode;
		  }

                  Store( tmp3, rd(instr) );
                  break;


               case jalr_op:

                  /* Uses RS RD */

                  trans.flow = REGINDIRECT_FLOW;

                  /* This load MUST occur because the RS register value is */
                  /*modified by the  delay slot instruction */
                  /* Also, Update_PC needs to write an addu in the delay */
                  /* slot of a jal to signal the chaining function */
                  Load_Move(BRANCHREG, rs(instr));

                  /*MIPS manual specifies that the REG_RA gets written */
                  /* before the delay slot instruction */
                  tmp3 = Set_Destination( SIM_T4, rd(instr) );

                  /* Compute RA */
                  ECi(ADDR_ADDI_OP, tmp3, PC_REG, (i+2)*4);

                  Store( tmp3, rd(instr) );

                  trans.branch_instr = instr;
                  break;

               case jr_op:
                  /* Uses RS */
                  trans.flow = REGINDIRECT_FLOW;
                  /* This load MUST occur because the RS register value is */
                  /*modified by the  delay slot instruction */
                  /* Also, Update_PC needs to write an addu in the delay */
                  /* slot of a jal to signal the chaining function */
                  Load_Move(BRANCHREG, rs(instr));

                  trans.branch_instr = instr;
                  break;
                  /*************************************/
                  /* Syscall and Break.  -- */
                  /* These are handed by calling out to our C EXCEPTION */
                  /* function */
                  /*************************************/
               case syscall_op:
#ifndef TORNADO
                  if( instrGrp.isKseg0) {
                     CPUWarning("Embra encountered syscall at PC 0x%x.\n", trans.curPC );
                  }
#endif
                  Do_Exception_Callout( &trans, EXC_SYSCALL);

                  break;

               case break_op:
                  /*
                   * By convention, breakpints in the kernels
                   * are passed on to SimOS, while the breakpoints in
                   * user mode generate exceptions.
                   * MOOSE support. if this is a break 0x99, pass it on
                   * and generate exception
                   */
                  if(IS_KSEG0(current_pc) || (BREAK_CODE(instr) < 0x99)) {
                     if( BREAK_CODE( instr ) == BRK_KERNELBP )
                        Do_Callout( &trans, CALLOUT_KERN_DEBUGER_BREAK);
                     else
                        Do_Callout( &trans, CALLOUT_DEBUGER_BREAK);
                  } else {
                     Do_Exception_Callout( &trans, EXC_BREAK);
                  }
                  break;

                  /******************************************/
                  /* Sync
                   *   This doesn't do anything since no write buffer
                   */
               case sync_op:
                  break;

               /*************************************/
               /* DIV,  MULT, Hi & Lo Moves -- */
               /* DIV & MULT are like ALUops, but we need to move to */
               /* and from Hi and Lo */
               /* Note that readin from hi right after issuing the */
               /* instruction will cause an interlock, so this translation */
               /* has space for other instructions, but its too complicated */
               /* (and not worth the effort) to try to squeeze other */
               /* instructions into those slots */
               /*************************************/

               case div_op:  /* [rs] op [rt] -> HI & LO */
               case divu_op:
               case mult_op:
               case multu_op:
#if defined(SIM_MIPS64)
               case ddiv_op:
               case ddivu_op:
               case dmult_op:
               case dmultu_op:
#endif

               {
                  /* Uses RS RT */
                  /* load rs, load rt, store HI, store LO  */
                  if( prev_store.real == rt(instr) )
                     {
                        tmp2 = Load( SIM_T2, rt(instr) );
                        tmp1 = Load( SIM_T1, rs(instr) );
                     }
                  else
                     {
                        tmp1 = Load( SIM_T1, rs(instr) );
                        tmp2 = Load( SIM_T2, rt(instr) );
                     }

		  /* perform the operation using T1 & T2 */

		  switch(FUNC(instr)) {
		  case div_op:  /* [rs] op [rt] -> HI & LO */
		    ECs(div_op_, SIM_T4, tmp1, tmp2);
		    break;
		  case divu_op:
		    ECs(divu_op_, SIM_T4, tmp1, tmp2);
		    break;
		  case mult_op:
		    ECs(mult_op_, SIM_T4, tmp1, tmp2);
		    break;
		  case multu_op:
		    ECs(multu_op_, SIM_T4, tmp1, tmp2);
		    break;
#if defined(SIM_MIPS64)
               case ddiv_op:
		    ECs(ddiv_op_, SIM_T4, tmp1, tmp2);
		    break;
               case ddivu_op:
		    ECs(ddivu_op_, SIM_T4, tmp1, tmp2);
		    break;
               case dmult_op:
		    ECs(dmult_op_, SIM_T4, tmp1, tmp2);
		    break;
               case dmultu_op:
		    ECs(dmultu_op_, SIM_T4, tmp1, tmp2);
		    break;
#endif
		  default:
		    goto unknown_opcode;
		  }

                  break;

               }

               case mfhi_op: /* move HI to rd */
                  /* Uses RD */

                  tmp3 = Set_Destination( SIM_T2, rd(instr) );

                  ECi(REG_LD_OP, tmp3, VSS_BASE, HI_OFF);
                  TRAILING_LOAD( tmp3 );
                  Store( tmp3, rd(instr) );
                  break;


               case mflo_op: /* move LO to rd */

                  tmp3 = Set_Destination( SIM_T2, rd(instr) );

                  ECi(REG_LD_OP, tmp3, VSS_BASE, LO_OFF);
                  TRAILING_LOAD( tmp3 );
                  Store( tmp3, rd(instr) );

                  break;

               case mthi_op: /* move rs to HI */
                  /* Uses RS */
                  /* load rs to T1 */
                  tmp1 = Load( SIM_T1, rs(instr) );

                  /* store T1 to HI */
                  ECi( REG_ST_OP, tmp1, VSS_BASE, HI_OFF );

                  break;


               case mtlo_op: /* move rs to LO */
                  /* Uses RS */
                  /* load rs to T1 */
                  tmp1 = Load( SIM_T1, rs(instr) );

                  /* store T1 to LO */
                  ECi(REG_ST_OP, tmp1, VSS_BASE, LO_OFF);

                  break;

                  /*************************************/
                  /* Arithmetic specials */
                  /* rs == SIM_T1, rt == SIM_T2, rd == SIM_T2 */
                  /*************************************/

                  /* [rs] op [rt] -> [_rd] */
               case add_op:
               case addu_op:
               case and_op:
               case nor_op:
               case or_op:
               case sllv_op:
               case slt_op:
               case srav_op:
               case sltu_op:
               case srlv_op:
               case sub_op:
               case subu_op:
               case xor_op:
#if defined(SIM_MIPS64)
               case dadd_op:
               case daddu_op:
               case dsllv_op:
               case dsrlv_op:
               case dsrav_op:
               case dsub_op:
               case dsubu_op:
#endif /* 64bit support */

                  if (prev_store.real == rt(instr)) {
                     tmp2 = Load( SIM_T2, rt(instr) );
                     tmp1 = Load( SIM_T1, rs(instr) );
                  } else {
                     tmp1 = Load( SIM_T1, rs(instr) );
                     tmp2 = Load( SIM_T2, rt(instr) );
                  }


                  tmp3 = Set_Destination( SIM_T2, rd(instr) );
		  switch(FUNC(instr)) {
		  case add_op:
		    ECs(add_op_,tmp3, tmp1, tmp2 );
		    break;
		  case addu_op:
		    ECs(addu_op_,tmp3, tmp1, tmp2 );
		    break;
		  case and_op:
		    ECs(and_op_,tmp3, tmp1, tmp2 );
		    break;
		  case nor_op:
		    ECs(nor_op_,tmp3, tmp1, tmp2 );
		    break;
		  case or_op:
		    ECs(or_op_,tmp3, tmp1, tmp2 );
		    break;
		  case sllv_op:
		    ECs(sllv_op_,tmp3, tmp1, tmp2 );
		    break;
		  case slt_op:
		    ECs(slt_op_,tmp3, tmp1, tmp2 );
		    break;
		  case srav_op:
		    ECs(srav_op_,tmp3, tmp1, tmp2 );
		    break;
		  case sltu_op:
		    ECs(sltu_op_,tmp3, tmp1, tmp2 );
		    break;
		  case srlv_op:
		    ECs(srlv_op_,tmp3, tmp1, tmp2 );
		    break;
		  case sub_op:
		    ECs(sub_op_,tmp3, tmp1, tmp2 );
		    break;
		  case subu_op:
		    ECs(subu_op_,tmp3, tmp1, tmp2 );
		    break;
		  case xor_op:
		    ECs(xor_op_,tmp3, tmp1, tmp2 );
		    break;
#if defined(SIM_MIPS64)
               case dadd_op:
		    ECs(dadd_op_,tmp3, tmp1, tmp2 );
		    break;
               case daddu_op:
		    ECs(daddu_op_,tmp3, tmp1, tmp2 );
		    break;
               case dsllv_op:
		    ECs(dsllv_op_,tmp3, tmp2, tmp1 );
		    break;
               case dsrlv_op:
		    ECs(dsrlv_op_,tmp3, tmp2, tmp1 );
		    break;
               case dsrav_op:
		    ECs(dsrav_op_,tmp3, tmp2, tmp1 );
		    break;
               case dsub_op:
		    ECs(dsub_op_,tmp3, tmp1, tmp2 );
		    break;
               case dsubu_op:
		    ECs(dsub_op_,tmp3, tmp1, tmp2 );
		    break;
#endif /* 64bit support */
		  default:
		    goto unknown_opcode;
		  }

                  Store( tmp3, rd(instr) );

                  break;
#if defined(SIM_MIPS64)
	       case movc_op:
		 /* XXX This are MIPS4 ops: Should check to see if MIPS4 enabled */
		 /* get floating point condition */
		ECi( lw_op_, SIM_T1, VSS_BASE, FCR_OFF+mem_size(31) );
		Load_32_Bit_Immed(SIM_T2, FC_BIT);
		ECs( and_op_, SIM_T1, SIM_T1, SIM_T2);

		/* XXX ugly */
		if (instr & (1 << 16)) {
		  ECi( beq_op_, SIM_T1, G0, 1); /* only move if FPC is non-zero */
		}
		else {
		  ECi( bne_op_, SIM_T1, G0, 1); /* only move if FPC is zero */
		}

		tmp1 = Load( SIM_T1, rs(instr) );
		tmp3 = Set_Destination( SIM_T4, rd(instr) );
		ECi(ori_op_, tmp3, tmp1, 0);
		Store( tmp3, rd(instr) );

		VCTARGET;

		break;

               case movn_op:
               case movz_op:

                  /* XXX This are MIPS4 ops: Should check to see if MIPS4 enabled */

                  if (prev_store.real == rt(instr)) {
                     tmp2 = Load( SIM_T2, rt(instr) );
                     tmp1 = Load( SIM_T1, rs(instr) );
                  } else {
                     tmp1 = Load( SIM_T1, rs(instr) );
                     tmp2 = Load( SIM_T2, rt(instr) );
                  }


                  tmp3 = Set_Destination( SIM_T4, rd(instr) );

#if (_MIPS_ISA == _MIPS_ISA_MIPS4)
                  /* If mips4 supported, use it. */
                  ECi( REG_LD_OP, tmp3, VSS_BASE, REG_OFFSET(rd(instr)) );
                  ECs(FUNC(instr),tmp3, tmp1, tmp2 );
#else
                 ECi( REG_LD_OP, tmp3, VSS_BASE, REG_OFFSET(rd(instr)) );
		 if (FUNC(instr) == movn_op) {
                     ECi( beq_op_, tmp2, G0, 1); /* move if RT is NOT zero */
                     ECi(ori_op_, tmp3, tmp1, 0);
		     VCTARGET;
		 } else if (FUNC(instr) == movz_op) {
                     ECi( bne_op_, tmp2, G0, 1); /* move if RT is zero */
                     ECi(ori_op_, tmp3, tmp1, 0);
		     VCTARGET;
                  } else { ASSERT(0); }
#endif

                  Store( tmp3, rd(instr) );

                  break;
#endif

               case tge_op:
                  if( prev_store.real == rt(instr) ) {
                     tmp2 = Load( SIM_T2, rt(instr) );
                     tmp1 = Load( SIM_T1, rs(instr) );
                  } else {
                     tmp1 = Load( SIM_T1, rs(instr) );
                     tmp2 = Load( SIM_T2, rt(instr) );
                  }
                  ECs(slt_op_, A0, tmp1, tmp2 );
                  ECi(bne_op_, A0, G0, DO_EXCEPTION_CALLOUT_LEN);
                  Do_Exception_Callout( &trans, EXC_TRAP);
		  VCTARGET;
                  break;

               case tgeu_op:

                  if( prev_store.real == rt(instr) ) {
                     tmp2 = Load( SIM_T2, rt(instr) );
                     tmp1 = Load( SIM_T1, rs(instr) );
                  } else {
                     tmp1 = Load( SIM_T1, rs(instr) );
                     tmp2 = Load( SIM_T2, rt(instr) );
                  }
                  ECs(sltu_op_, A0, tmp1, tmp2 );
                  ECi(bne_op_, A0, G0, DO_EXCEPTION_CALLOUT_LEN);
                  Do_Exception_Callout( &trans, EXC_TRAP);
		  VCTARGET;
                  break;

               case tlt_op:
                  if( prev_store.real == rt(instr) ) {
                     tmp2 = Load( SIM_T2, rt(instr) );
                     tmp1 = Load( SIM_T1, rs(instr) );
                  } else {
                     tmp1 = Load( SIM_T1, rs(instr) );
                     tmp2 = Load( SIM_T2, rt(instr) );
                  }
                  ECs(slt_op_, A0, tmp1, tmp2 );
                  ECi(beq_op_, A0, G0, DO_EXCEPTION_CALLOUT_LEN);
                  Do_Exception_Callout( &trans, EXC_TRAP);
		  VCTARGET;
                  break;

               case tltu_op:

                  if( prev_store.real == rt(instr) ) {
                     tmp2 = Load( SIM_T2, rt(instr) );
                     tmp1 = Load( SIM_T1, rs(instr) );
                  } else {
                     tmp1 = Load( SIM_T1, rs(instr) );
                     tmp2 = Load( SIM_T2, rt(instr) );
                  }
                  ECs(sltu_op_, A0, tmp1, tmp2 );
                  ECi(beq_op_, A0, G0, DO_EXCEPTION_CALLOUT_LEN);
                  Do_Exception_Callout( &trans, EXC_TRAP);
		  VCTARGET;
                  break;


               case teq_op:

                  if( prev_store.real == rt(instr) ) {
                     tmp2 = Load( SIM_T2, rt(instr) );
                     tmp1 = Load( SIM_T1, rs(instr) );
                  } else {
                     tmp1 = Load( SIM_T1, rs(instr) );
                     tmp2 = Load( SIM_T2, rt(instr) );
                  }
                  ECi(bne_op_, tmp1, tmp2, DO_EXCEPTION_CALLOUT_LEN);
                  Do_Exception_Callout( &trans, EXC_TRAP);
		  VCTARGET;
                  break;


               case tne_op:

                  if( prev_store.real == rt(instr) ) {
                     tmp2 = Load( SIM_T2, rt(instr) );
                     tmp1 = Load( SIM_T1, rs(instr) );
                  } else {
                     tmp1 = Load( SIM_T1, rs(instr) );
                     tmp2 = Load( SIM_T2, rt(instr) );
                  }
                  ECi(beq_op_, tmp1, tmp2, DO_EXCEPTION_CALLOUT_LEN);
                  Do_Exception_Callout( &trans, EXC_TRAP);
		  VCTARGET;
                  break;

               default:
                  goto unknown_opcode;
               } /* FUNC field of SPECIAL instructions */

            break;
/****************************************************************/
         case bcond_op:
/****************************************************************/
            switch( rt(instr) )
               {
               case bgezal_op:
               case bgezall_op:

                  /* Link Code */
                  /*MIPS manual specifies that the REG_RA gets written */
                  /* before the delay slot instruction */
                  tmp3 = Set_Destination( SIM_T4, REG_RA );

                  /* Compute RA */
                  ECi(ADDR_ADDI_OP, tmp3, PC_REG, (i+2)*4);

                  Store( tmp3, REG_RA );

                  /* FALL THROUGH */

               case bgez_op:
               case bgezl_op:


                  if( instrGrp.delay_slot_reg_conflict ) {
                     tmp1 = Load( SIM_T1, rs(instr) );
                     ECs(slt_op_, BRANCHREG, tmp1, G0);
                     ECi(xori_op_, BRANCHREG, BRANCHREG, 1);
                     /*Leave branch condition in BRANCHREG */
                  }
                  next_PC =   trans.curPC + INST_SIZE  + (IMMED(instr)<<2);
		  trans.branch_instr = VC_ComposeBCOND( rs(instr), bgez_op, 0 );
                  if( rt(instr) == bgezall_op ||
                      rt(instr) == bgezl_op ) {
                     Update_PC( &trans, BRANCH_UNTAKEN, 0);
                     trans.flow = BRANCH_TAKEN;
                  } else {
                     trans.flow = BRANCH_FLOW;
                  }

                  break;

               case bltzal_op:
               case bltzall_op:

                  /* Link Code */

                  /*MIPS manual specifies that the REG_RA gets written */
                  /* before the delay slot instruction */
                  tmp3 = Set_Destination( SIM_T4, REG_RA );

                  /* Compute RA */
                  ECi(ADDR_ADDI_OP, tmp3, PC_REG, (i+2)*4);

                  Store( tmp3, REG_RA );

                  /* FALL THROUGH */
               case bltz_op:
               case bltzl_op:

                  /* USES RS */
                  if( instrGrp.delay_slot_reg_conflict ) {
                     tmp1 = Load( SIM_T1, rs(instr) );
                     ECs(slt_op_, BRANCHREG, tmp1, G0);
                  }

                  next_PC = trans.curPC + INST_SIZE + (IMMED(instr)<<2);
		  /**** THIS IS CHEESOID; YET, WE DEPEND ON IT -BL ****/
		  trans.branch_instr = VC_ComposeBCOND(rs(instr),bltz_op, 0);

                  if( rt(instr) == bltzall_op ||
                      rt(instr) == bltzl_op ) {
                     Update_PC( &trans, BRANCH_UNTAKEN, 0);
                     trans.flow = BRANCH_TAKEN;
                  } else {
                     trans.flow = BRANCH_FLOW;
                  }

                  break;

               default:
                  goto unknown_opcode;
               }

            break; /* BCOND instructions */
/****************************************************************/
         case cop0_op:
/****************************************************************/
            /*************************************/
            /* CP0 Instructions.  -- */
            /* These are handed by calling out to our C functions */
            /*************************************/

            if( IS_CP0_FUNC( instr ) )
               {
                  switch( FUNC( instr ) )
                     {
                     case tlbr_op:
                        Do_Callout(&trans,CALLOUT_TLBR);
                        break;

                     case tlbwi_op:
                        Do_Callout( &trans, CALLOUT_TLBWI);
                        break;

                     case tlbwr_op:
                        Do_Callout( &trans, CALLOUT_TLBWR);
                        break;

                     case tlbp_op:
                        Do_Callout(&trans, CALLOUT_TLBP);
                        break;

                     case rfe_op:
                     case eret_op:
                        /*
                         * see comment in decoder
                         */
                        ASSERT( instrGrp.is_rfe_block == 1);
                        Do_Callout(&trans, CALLOUT_ERET);
                        break;

                     default:
                        goto unknown_opcode;
                     }
               }
            else
               {
                  /* Operation contained in rs */
                  switch( rs( instr ) )
                     {
                     case dmfc_op:
                     case mfc_op:
                        /* Destination of mfc, dmtc is RT, not RD (kind of silly)*/
      			/* Handle r4k_timing by intercepting this read! */
                        Load_32_Bit_Immed(A1,instr);
                        Do_Callout(&trans, CALLOUT_MFC0 );

                        break;
                     case dmtc_op:
                     case mtc_op:

                        ASSERT( instrGrp.delay_slot_reg_conflict);
                        Load_32_Bit_Immed(A1,instr);

                        /*
                         * load the branch target if we are in a
                         * non sequential flow
                         */
                        if (trans.flow == SEQ_FLOW) {
                           ECi(ori_op_, A2, G0, G0);
                        } else {
                           /*
                            * pass the target of the branch
                            * as A2
                            */
                           if(trans.flow == REGINDIRECT_FLOW) {
                              ECi(ori_op_,A2,BRANCHREG,0);
                           } else {
                              /*branch taken case */
                              Load_Reg_Immed(A2,next_PC);
                              Emit_Branch(&trans,3);
                              /* branch not taken case */
                              ECnop;
                              Load_Reg_Immed(A2, current_pc + mem_size(instrGrp.GrpLen));
			      VCTARGET; /* branch target for vcode */
                           }
                        }

                        Do_Callout(&trans, CALLOUT_MTC0 );

                        break;

                     default:
                        /* cfc_op_, and ctc_op are not valid in cp0 */
                        goto unknown_opcode;
                     }

                  break; /* COP0 moves */
               }

            break;/* COP0  Instructions */
/****************************************************************/
         case cop1_op:
/****************************************************************/
            /*************************************/
            /* Floating Point Operations -- */
            /* The arithmetic  operations need to be decoded */
            /*************************************/

	    Check_C1_Usable( &trans );

	    switch( rs(instr) ) {

	    case ctc_op:
	      /* XXX hack: try only allowing writes to FCR[31] */
	      if (fs(instr) == 31) {
		/* No need to read control reg to drain pipeline as
		   that had to have been done in the original code, and
		   we only add instructions */
		tmp1 = Load( SIM_T1, rt(instr) );

		/* Then execute the move instruction to  move the */
		/* contents from T1 to FPU's fs  */
		/* This works for control registers and regular registers */
		ECCop1Move( ctc_op_, tmp1, fs(instr) );
		/* XXX a repulsive hack - this whole thing is bogus;
		* we need a platform-independent way of setting the rounding
		* mode; if we have the wrong rounding mode on IRIX 6.4 then
		* printf can't print floating-point numbers. -BL
		*/
		ctc1(tmp1, fs(instr));
		/* This instruction takes 2 cycles */
		ECnop;
	      }
	      break;

	    case mtc_op:

	      tmp1 = Load( SIM_T1, rt(instr) );

	      /* Then execute the move instruction to move the */
	      /* contents from T1 to FPU's fs  */

	      ECCop1Move( mtc_op_, tmp1, fs(instr) );

	      break;

	    case dmtc_op:

	      tmp1 = Load( SIM_T1, rt(instr) );

	      /* Then execute the move instruction to move the */
	      /* contents from T1 to FPU's fs  */

	      ECCop1Move( dmtc_op_, tmp1, fs(instr) );

	      break;

	    case mfc_op:
	    case dmfc_op:
	    case cfc_op:

	      tmp1 = Set_Destination( SIM_T2, rt(instr) );

	      switch ( rs(instr) ) {
	      case mfc_op:
		ECCop1Move( mfc_op_, tmp1, fs(instr) );
		break;
	      case dmfc_op:
		ECCop1Move( dmfc_op_, tmp1, fs(instr) );
		break;
	      case cfc_op:
		ECCop1Move( cfc_op_, tmp1, fs(instr) );
		break;
	      default:
		goto unknown_opcode;
	      }

	      /* This instruction takes 2 cycles */
	      ECnop;

	      Store( tmp1, rt(instr) );

	      break;

	    case bc_op:
	      /*
	       * Floating Point Branches
	       * The decoder is too lazy to detect FP register conflicts,
	       * therefore it errs on the conservative side.
	       */
	      ASSERT (instrGrp.delay_slot_reg_conflict);

	      if( instrGrp.delay_slot_reg_conflict ) {
		/* BRANCHREG=1 -> going to take branch */
		ECi(ori_op_, BRANCHREG, 0, 1);
		/* If the Branch is taken, skip the next */
		/* instruction so BRANCHREG will be 1. */

		/* get floating point condition */
		ECi( lw_op_, SIM_T1, VSS_BASE, FCR_OFF+mem_size(31) );
		Load_32_Bit_Immed(SIM_T2, FC_BIT);
		ECs( and_op_, SIM_T1, SIM_T1, SIM_T2);

		/* decode sense of FP branch */
		if ( instr & FBC_BIT ) {
		  /* branch on condition true */
		  ECi(bne_op_, 0, SIM_T1, 2);
		}
		else {
		  /* branch on condition false */
		  ECi(beq_op_, 0, SIM_T1, 2);
		}
		/* otherwise, zero branch register */
		ECi(ori_op_, BRANCHREG, 0, 0);
		VCTARGET; /* target for vcode branch */
	      }
	      /* Get the PC address for the delay slot (i+1) and add */
	      /* it to the offset. */
	      next_PC = trans.curPC + INST_SIZE + (IMMED(instr)<<2);

	      /* This clears the likely bit, so this is a non-annulled branch */
	      trans.branch_instr = instr & 0xfffdffff;

	      if (( rt(instr) == bcfl_op) || (rt(instr) == bctl_op)){
		Update_PC(  &trans, BRANCH_UNTAKEN, 0);
		trans.flow = BRANCH_TAKEN;
	      } else {
		trans.flow = BRANCH_FLOW;
	      }

	      break;

	    case cop_op:
	    case copd_op:
	    case copw_op:
	    case copl_op:

		switch( FUNC(instr) + (FORMAT(instr)<<8)) {

		case fabs_op+F_SSINGLE:
		  v_absf(FVREGS[fd(instr)],FVREGS[fs(instr)]);
		  break;
		case fabs_op+F_SDOUBLE:
		  v_absd(FVREGS[fd(instr)],FVREGS[fs(instr)]);
		  break;

		case fadd_op+F_SSINGLE:
		  v_addf(FVREGS[fd(instr)],FVREGS[fs(instr)],
			 FVREGS[ft(instr)]);
		  break;
		case fadd_op+F_SDOUBLE:
		  v_addd(FVREGS[fd(instr)],FVREGS[fs(instr)],
			 FVREGS[ft(instr)]);
		  break;

		case fcvtd_op+F_SSINGLE:
		  v_cvf2d(FVREGS[fd(instr)],FVREGS[fs(instr)]);
		  break;
		case fcvtd_op+F_W:
		  /* XXX this is a bogus inefficient hack -BL
		   * should really have a nicer way of doing this
		   * need to enhance vcode;
		   * right now, vcode doesn't do well putting integer
		   * values in floating registers, etc.;
		   * check out the way these macros are implemented.
		   */
		  v_stfi(FVREGS[fs(instr)],VREGS[VSS_BASE],
			 FP_OFF+mem_size(fs(instr)));
		  v_ldui(VREGS[SIM_T1], VREGS[VSS_BASE],
			 FP_OFF+mem_size(fs(instr)));
		  v_cvl2f(FVREGS[fd(instr)],VREGS[SIM_T1]);
		  v_cvf2d(FVREGS[fd(instr)],FVREGS[fd(instr)]);
		  break;
		case fcvtd_op+F_L:
		  v_stfi(FVREGS[fs(instr)],VREGS[VSS_BASE],
			 FP_OFF+mem_size(fs(instr)));
		  v_ldui(VREGS[SIM_T1], VREGS[VSS_BASE],
			 FP_OFF+mem_size(fs(instr)));
		  v_cvl2d(FVREGS[fd(instr)],VREGS[SIM_T1]);
		  CPUWarning("Translate: cvt.d.l unimplemented\n");
                  ASSERT(0);
		  break;

		case fcvts_op+F_SDOUBLE:
		  v_cvd2f(FVREGS[fd(instr)],FVREGS[fs(instr)]);
		  break;
		case fcvts_op+F_W:
		  v_stfi(FVREGS[fs(instr)],VREGS[VSS_BASE],
			 FP_OFF+mem_size(fs(instr)));
		  v_ldui(VREGS[SIM_T1], VREGS[VSS_BASE],
			 FP_OFF+mem_size(fs(instr)));
		  v_cvl2f(FVREGS[fd(instr)],VREGS[SIM_T1]);
		  break;
		case fcvts_op+F_L:
		  v_stdi(FVREGS[fs(instr)],VREGS[VSS_BASE],
			 FP_OFF + mem_size(fs(instr)));
		  v_lddi(VREGS[SIM_T1], VREGS[VSS_BASE],
			 FP_OFF + mem_size(fs(instr)));
		  v_cvl2f(FVREGS[fd(instr)],VREGS[SIM_T1]);
		  break;

		case fcvtl_op+F_SSINGLE:
		  v_cvf2l(FVREGS[fd(instr)],FVREGS[fs(instr)]);
		  break;
		case fcvtl_op+F_SDOUBLE:
		  v_cvd2l(FVREGS[fd(instr)],FVREGS[fs(instr)]);
		  break;

		case fcvtw_op+F_SSINGLE:
		  /* XXX this is bogus also - needs to be fixed! -BL
		   * probably modify vcode
		   */
		  v_cvf2i(VREGS[SIM_T1], FVREGS[fs(instr)]);
		  v_stui(VREGS[SIM_T1], VREGS[VSS_BASE],
			 FP_OFF + mem_size(fd(instr)));
		  v_ldfi(FVREGS[fd(instr)], VREGS[VSS_BASE],
			 FP_OFF + mem_size(fd(instr)));
		  break;
		case fcvtw_op+F_SDOUBLE:
		  v_cvd2i(VREGS[SIM_T1], FVREGS[fs(instr)]);
		  v_stui(VREGS[SIM_T1], VREGS[VSS_BASE],
			 FP_OFF + mem_size(fd(instr)));
                  /* SAH - I CHANGED THIS */
		  v_ldfi(FVREGS[fd(instr)], VREGS[VSS_BASE],
			 FP_OFF + mem_size(fd(instr)));
		  break;

		case fdiv_op+F_SSINGLE:
		  v_divf(FVREGS[fd(instr)],FVREGS[fs(instr)],
			 FVREGS[ft(instr)]);
		  break;
		case fdiv_op+F_SDOUBLE:
		  v_divd(FVREGS[fd(instr)],FVREGS[fs(instr)],
			 FVREGS[ft(instr)]);
		  break;

		case fmov_op+F_SSINGLE:
		  v_movf(FVREGS[fd(instr)],FVREGS[fs(instr)]);
		  break;
		case fmov_op+F_SDOUBLE:
		  v_movd(FVREGS[fd(instr)],FVREGS[fs(instr)]);
		  break;

		case fmovc_op+F_SSINGLE:
		case fmovc_op+F_SDOUBLE:
		case fmovn_op+F_SSINGLE:
		case fmovn_op+F_SDOUBLE:
		case fmovz_op+F_SSINGLE:
		case fmovz_op+F_SDOUBLE:
		  /* Move conditional on FP condition */
		  CPUWarning("Translate: floating conditional moves unimplemented\n");
                  ASSERT(0);
		case fmul_op+F_SSINGLE:
		  v_mulf(FVREGS[fd(instr)],FVREGS[fs(instr)],
			 FVREGS[ft(instr)]);
		  break;
		case fmul_op+F_SDOUBLE:
		  v_muld(FVREGS[fd(instr)],FVREGS[fs(instr)],
			 FVREGS[ft(instr)]);
		  break;

		case fneg_op+F_SSINGLE:
		  v_negf(FVREGS[fd(instr)],FVREGS[fs(instr)]);
		  break;
		case fneg_op+F_SDOUBLE:
		  v_negd(FVREGS[fd(instr)],FVREGS[fs(instr)]);
		  break;

		case fsub_op+F_SSINGLE:
		  v_subf(FVREGS[fd(instr)],FVREGS[fs(instr)],
			 FVREGS[ft(instr)]);
		  break;
		case fsub_op+F_SDOUBLE:
		  v_subd(FVREGS[fd(instr)],FVREGS[fs(instr)],
			 FVREGS[ft(instr)]);
		  break;

		case fsqrt_op+F_SSINGLE:
		  v_sqrtf(FVREGS[fd(instr)],FVREGS[fs(instr)]);
		  break;
		case fsqrt_op+F_SDOUBLE:
		  v_sqrtd(FVREGS[fd(instr)],FVREGS[fs(instr)]);
		  break;

		case ftrunc_op+F_SSINGLE:
		  v_floorf(FVREGS[fd(instr)],FVREGS[fs(instr)]);
		  break;
		case ftrunc_op+F_SDOUBLE:
		  truncwd(FVREGS[fd(instr)].reg,FVREGS[fs(instr)].reg);
		  /* was: v_floord(FVREGS[fd(instr)],FVREGS[fs(instr)]); */
		  break;

		case ftruncl_op+F_SSINGLE:
		  v_floorf(FVREGS[fd(instr)],FVREGS[fs(instr)]);
		  break;
		case ftruncl_op+F_SDOUBLE:
		  v_floord(FVREGS[fd(instr)],FVREGS[fs(instr)]);
		  break;

		case ffloorl_op+F_SSINGLE:
		  v_floorf(FVREGS[fd(instr)],FVREGS[fs(instr)]);
		  break;
		case ffloorl_op+F_SDOUBLE:
		  v_floord(FVREGS[fd(instr)],FVREGS[fs(instr)]);
		  break;

		case frecip_op+F_SSINGLE:
CPUWarning("Unimplemented FP at %#x\n", current_pc);
		  break;
		case frecip_op+F_SDOUBLE:
CPUWarning("Unimplemented FP at %#x\n", current_pc);
		  break;

		case frsqrt_op+F_SSINGLE:
CPUWarning("Unimplemented FP at %#x\n", current_pc);
		  break;
		case frsqrt_op+F_SDOUBLE:
CPUWarning("Unimplemented FP at %#x\n", current_pc);
		  break;

		case fround_op+F_SSINGLE:
CPUWarning("Unimplemented FP at %#x\n", current_pc);
		  break;
		case fround_op+F_SDOUBLE:
CPUWarning("Unimplemented FP at %#x\n", current_pc);
		  break;

		case froundl_op+F_SSINGLE:
CPUWarning("Unimplemented FP at %#x\n", current_pc);
		  break;

		case fceil_op+F_SSINGLE:
		  v_ceilf(FVREGS[fd(instr)],FVREGS[fs(instr)]);
		  break;
		case fceill_op+F_SSINGLE:
		  v_ceilf(FVREGS[fd(instr)],FVREGS[fs(instr)]);
		  break;

		default:

		  /* Decode FP Comparison Operations:  C.cond.fmt */
		  if ( (instr & FCMP_BITS) != 0) {

		    /* get FCR 31 and reset condition bit */
		    v_ldui(VREGS[SIM_T1],VREGS[VSS_BASE],FCR_OFF+mem_size(31));
		    Load_32_Bit_Immed(SIM_T2, 0xffffffff ^ FC_BIT);
		    v_andu(VREGS[SIM_T1],VREGS[SIM_T1],VREGS[SIM_T2]);
		    /* get condition bit for setting if desired */
		    Load_32_Bit_Immed(SIM_T2, FC_BIT);
		    if (F_FMT(instr) == s_fmt) {

		    /* check for less than */
		    if (instr & FL_BIT) {
		      label = v_genlabel();
		      v_bgef(FVREGS[fs(instr)],FVREGS[ft(instr)],label);
		      v_ori(VREGS[SIM_T1], VREGS[SIM_T1], VREGS[SIM_T2]);
		      v_label(label);
		    }

		    /* check for equal */
		    if (instr & FEQ_BIT) {
		      label = v_genlabel();
		      v_bnef(FVREGS[fs(instr)],FVREGS[ft(instr)],label);
		      v_ori(VREGS[SIM_T1], VREGS[SIM_T1], VREGS[SIM_T2]);
		      v_label(label);
		    }

		    /* check for unordered XXX unimplemented!! ugh, -BL*/
		    if (instr & FUN_BIT) {
		      /* we should probably check for exponent = 127
		       * and fraction != 0; but.... it's a pain to do
		       * this in the current vcode implementation,
		       * and mipsy doesn't do it, so for now we
		       * just reset the condition bit.
		       */
		      /* CPUWarning( "translate: c.un.f unimplimented\n"); */
		    }

		    }
		    else if (F_FMT(instr) == d_fmt) {
		      /* check for less than */
		      if (instr & FL_BIT) {
			label = v_genlabel();
			v_bged(FVREGS[fs(instr)],FVREGS[ft(instr)],label);
			v_ori(VREGS[SIM_T1], VREGS[SIM_T1], VREGS[SIM_T2]);
			v_label(label);
		      }

		      /* check for equal */
		      if (instr & FEQ_BIT) {
			label = v_genlabel();
			v_bned(FVREGS[fs(instr)],FVREGS[ft(instr)],label);
			v_ori(VREGS[SIM_T1], VREGS[SIM_T1], VREGS[SIM_T2]);
			v_label(label);
		      }

		      /* check for unordered XXX unimplemented!! ugh, -BL*/
		      if (instr & FUN_BIT) {
			/* we should probably check for exponent = 127
			 * and fraction != 0; but.... it's a pain to do
			 * this in the current vcode implementation, so
			 * for now we just do what mipsy does, which is
			 * just reset the condition bit.
			 */
			/* CPUWarning( "translate: c.un.d unimplimented\n"); */
		      }
		    }
		    else {
		      CPUWarning("translate: c.x.x unimplemented for format %d\n",
				 F_FMT(instr));
                      ASSERT(0);
		    }

		    /* update FCR 31 (condition bit) */
		    v_stui(VREGS[SIM_T1],VREGS[VSS_BASE],FCR_OFF+mem_size(31));

		 } /* if: floating point compares */

		  else {
		  CPUError("ERROR!!! FPU instruction 0x%x unimplemented at PC 0x%x\n",
			     instr, current_pc);
		  }

		} /* switch (FUNC(instr) + FORMAT(instr)<<8) */

		break; /* case cop_op, copd_op */

		default:
		  CPUError("ERROR!!! FPU instruction 0x%x unimplemented at PC 0x%x\n",
			     instr, current_pc);

	      } /* switch ( rs(instr) ) */

	    break; /* floating point (cop1) ops */


/****************************************************************/
/* All other opcodes						*/
/****************************************************************/
	    case addi_op:		/* rs op imm -> rt */
	    case addiu_op:
	    case andi_op:
	    case ori_op:
	    case slti_op:
	    case sltiu_op:
	    case xori_op:
#if defined(SIM_MIPS64)
         case daddi_op:		/* rs op imm -> rt */
         case daddiu_op:
#endif

	      tmp1 = Load( SIM_T1, rs(instr) );

	      tmp2 = Set_Destination( SIM_T2, rt(instr) );

	      switch (MAJOR_OPCODE(instr)) {
		/* rs op imm -> rt */
	      case addi_op:  ECi(addi_op_, tmp2, tmp1, IMMED(instr));
		break;
	      case addiu_op: ECi(addiu_op_, tmp2, tmp1, IMMED(instr));
		break;
	      case andi_op:  ECi(andi_op_, tmp2, tmp1, IMMED(instr));
		break;
	      case ori_op:   ECi(ori_op_, tmp2, tmp1, IMMED(instr));
		break;
	      case slti_op:  ECi(slti_op_, tmp2, tmp1, IMMED(instr));
		break;
	      case sltiu_op: ECi(sltiu_op_, tmp2, tmp1, IMMED(instr));
		break;
	      case xori_op:
		ECi(xori_op_, tmp2, tmp1, IMMED(instr));
		break;
#if defined(SIM_MIPS64)
         case daddi_op:		/* rs op imm -> rt */
		ECi(daddi_op_, tmp2, tmp1, IMMED(instr));
		break;
         case daddiu_op:
		ECi(daddiu_op_, tmp2, tmp1, IMMED(instr));
		break;
#endif
	      default:
		goto unknown_opcode;
	      }

	      Store( tmp2, rt(instr) );

	      break;


	    case lui_op:	/* immed<<16 -> rt */

	      tmp2 = Set_Destination( SIM_T2, rt(instr) );

	      ECi(lui_op_, tmp2, G0, IMMED(instr));

	      Store( tmp2, rt(instr) );

	      break;


	      /*************************************/
	      /* Branches -- Condition goes into BRANCHREG: 1 == Taken, 0 == Untaken*/
	      /* Link Address computed in SIM_T4 */
	      /* Annulled branches chain the untaken path here, and chain */
	      /* the taken path after the delay slot instruction */
	      /*************************************/
	    case blez_op:
	    case blezl_op:

	      if( instrGrp.delay_slot_reg_conflict ) {
		tmp1 = Load( SIM_T1, rs(instr) );

		ECs(slt_op_, BRANCHREG, G0, tmp1);
		ECi(xori_op_, BRANCHREG, BRANCHREG, 1);
	      }

	      next_PC = trans.curPC + INST_SIZE  + (IMMED(instr)<<2);

	      trans.branch_instr = VC_ComposeImmed(blez_op, rs(instr), 0, 0);
	      if( MAJOR_OPCODE(instr) == blezl_op ) {
		Update_PC(&trans,BRANCH_UNTAKEN, 0);
		trans.flow = BRANCH_TAKEN;
	      } else {
		trans.flow = BRANCH_FLOW;
	      }

	      break;

	    case bgtz_op:
	    case bgtzl_op:

	      if( instrGrp.delay_slot_reg_conflict ) {
		tmp1 = Load( SIM_T1, rs(instr) );
		ECs(slt_op_, BRANCHREG, G0, tmp1 );
	      }
	      next_PC = trans.curPC + INST_SIZE + (IMMED(instr)<<2);
	      trans.branch_instr = VC_ComposeImmed(bgtz_op, rs(instr), 0, 0);
	      if( MAJOR_OPCODE(instr) == bgtzl_op ) {
		Update_PC( &trans, BRANCH_UNTAKEN,  0);
		trans.flow = BRANCH_TAKEN;
	      } else {
		trans.flow = BRANCH_FLOW;
	      }

	      break;


	    case bne_op:
	    case bnel_op:

	      if( instrGrp.delay_slot_reg_conflict ) {
		if( prev_store.real == rt(instr) ) {
                  tmp2 = Load( SIM_T2, rt(instr) );
                  tmp1 = Load( SIM_T1, rs(instr) );
		} else {
                  tmp1 = Load( SIM_T1, rs(instr) );
                  tmp2 = Load( SIM_T2, rt(instr) );
		}

		ECs(xor_op_, BRANCHREG, tmp1, tmp2);
		ECs(sltu_op_, BRANCHREG, G0, BRANCHREG);
	      }

	      next_PC = trans.curPC + INST_SIZE + (IMMED(instr)<<2);
	      trans.branch_instr = VC_ComposeImmed(bne_op, rs(instr), rt(instr), 0);

	      if( MAJOR_OPCODE(instr) == bnel_op ) {
		Update_PC(&trans, BRANCH_UNTAKEN,  0);
		trans.flow = BRANCH_TAKEN;
	      } else {
		trans.flow = BRANCH_FLOW;
	      }
	      break;


	    case beq_op:
	    case beql_op:

	      /* USES RS RT SIM_TMP */
	      if( instrGrp.delay_slot_reg_conflict ) {
		if( prev_store.real == rt(instr) ) {
                  tmp2 = Load( SIM_T2, rt(instr) );
                  tmp1 = Load( SIM_T1, rs(instr) );
		} else {
                  tmp1 = Load( SIM_T1, rs(instr) );
                  tmp2 = Load( SIM_T2, rt(instr) );
		}

		/* Leave branch condition in BRANCHREG */
		ECs(xor_op_, BRANCHREG, tmp1, tmp2);
		ECi(sltiu_op_, BRANCHREG, BRANCHREG, 1);
	      }

	      next_PC = trans.curPC + INST_SIZE + (IMMED(instr)<<2);
	      trans.branch_instr = VC_ComposeImmed(beq_op, rs(instr), rt(instr), 0);

	      if( MAJOR_OPCODE(instr) == beql_op ) {
		Update_PC(&trans, BRANCH_UNTAKEN, 0);
		trans.flow = BRANCH_TAKEN;
	      } else {
		trans.flow = BRANCH_FLOW;
	      }

	      break;


	      /*************************************/
	      /* Jumps -- If regindirect,  address goes into BRANCHREG */
	      /* Link Address is computed in SIM_T4 */
	      /*************************************/
	    case j_op:	/* Jump to target */

	      trans.flow = JMP_FLOW;

            /* Use top 4 bits of delay slot & 28 (26shifted) bits of instr */
            next_PC =
               ( JMP_PC_MASK & ( trans.curPC + INST_SIZE ) ) |	(TARGET(instr)<<2);
            break;

	    case jal_op:
	      /* Load PC with the target address; load LINK with the */
	      /* address after the delay slot */
	      trans.flow = JMP_FLOW;

            ret_addr = trans.curPC + INST_SIZE;
            next_PC = ( JMP_PC_MASK & ret_addr ) | (TARGET(instr)<<2);

	      /*MIPS manual specifies that the REG_RA gets written */
	      /* before the delay slot instruction */
	      tmp3 = Set_Destination( SIM_T4, REG_RA);

	      /* Compute RA */
	      ECi(ADDR_ADDI_OP, tmp3, PC_REG, (i+2)*4);

	      Store(tmp3, REG_RA);

	      break;

#if defined(SIM_MIPS64)
         case lld_op:
#endif

	    case ll_op:

	      tmp1 = D_Memory_Check( &trans,instr, MEM_D_SHARED);

	      /* Store the memory address in the LLaddr register */
	      ECi(sw_op_, tmp1, VSS_BASE, LLADDR_OFF);

	      tmp2 = Set_Destination( SIM_T2, rt(instr) );

	      if (MAJOR_OPCODE(instr) == ll_op) {
		ECi(lw_op_, tmp2, tmp1, 0 );
	      } else {
               ECi(REG_LD_OP, tmp2, tmp1, 0 );
	      }
	      TRAILING_LOAD( tmp2 );

	      /* For page mode ll/sc, store contents of locked location */
	      if( embra.emode == EMBRA_PAGE )
		ECi(REG_ST_OP, tmp2, VSS_BASE, LLCONTENTS_OFF);

	      Store( tmp2, rt(instr) );

	      break;

	      /*************************************/
	      /* Loads -- */
	      /*************************************/
#if defined(SIM_MIPS64)
         case ld_op:
#endif
	    case lb_op:
	    case lbu_op:
	    case lw_op:
	    case lh_op:
	    case lhu_op:
#if defined(SIM_MIPS64)
         case lwu_op:
#endif

	      /* offset(_RS) -> rt */

	      tmp1 = D_Memory_Check( &trans,instr, MEM_D_SHARED);
	      tmp2 = Set_Destination( SIM_T2, rt(instr) );

	      switch(MAJOR_OPCODE(instr)) {
	      case lb_op:             ECi(lb_op_, tmp2, tmp1, 0 );
		break;
	      case lbu_op:            ECi(lbu_op_, tmp2, tmp1, 0 );
		break;
	      case lw_op:            ECi(lw_op_, tmp2, tmp1, 0 );
		break;
	      case ld_op:            ECi(REG_LD_OP, tmp2, tmp1, 0 );
		break;
	      case lh_op:            ECi(lh_op_, tmp2, tmp1, 0 );
		break;
	      case lhu_op:            ECi(lhu_op_, tmp2, tmp1, 0 );
		break;
	      case lwu_op:            ECi(lwu_op_, tmp2, tmp1, 0 );
		break;
	      default:
		goto unknown_opcode;
	      }

	      TRAILING_LOAD( tmp2 );

	      Store( tmp2, rt(instr) );

	      break;

	    case lwl_op:
	    case lwr_op:
#if defined(SIM_MIPS64)
         case ldl_op:
         case ldr_op:
#endif

	      tmp1 = D_Memory_Check( &trans,instr,MEM_D_SHARED);
	      tmp2 = Load( SIM_T2, rt(instr ) );

	      switch(MAJOR_OPCODE(instr)) {
	      case lwl_op:	ECi(lwl_op_, tmp2, tmp1, 0 );
		break;
	      case lwr_op:	ECi(lwr_op_, tmp2, tmp1, 0 );
		break;
	      case ldl_op:	ECi(ldl_op_, tmp2, tmp1, 0 );
		break;
	      case ldr_op:	ECi(ldr_op_, tmp2, tmp1, 0 );
		break;
	      default:
		goto unknown_opcode;
	      }

	      TRAILING_LOAD( tmp2 );

	      Store( tmp2, rt(instr) );

	      break;

	      /*************************************/
	      /* Stores -- */
	      /*************************************/

#if defined(SIM_MIPS64)
         case scd_op:
#endif
	 case sc_op:
	   if( embra.emode == EMBRA_PAGE ) {
	     ASSERT(instrGrp.no_reg_allocate);
	      }
	   /*FALLTHROUGH*/
#if defined(SIM_MIPS64)
         case sd_op:
#endif
	 case sb_op:		/* rt -> offset(base) */
	 case sh_op:
	 case sw_op:
	 case swl_op:
	 case swr_op:
#if defined(SIM_MIPS64)
         case sdl_op:
         case sdr_op:
#endif

	      tmp1 = D_Memory_Check( &trans,instr,MEM_D_EXCLUSIVE);

	      tmp2 = Load( SIM_T2, rt(instr) );
	      if( MAJOR_OPCODE(instr) == sc_op || (MAJOR_OPCODE(instr) == scd_op)) {
		if( embra.emode == EMBRA_CACHE ) {
		  /* XXX this is inefficient - before was only one
		   * branch using delay slot instruction. fix this,
		   * but be careful of register allocation -BL
		   */
		  static v_label_t fail_label, cont_label;
		  fail_label = v_genlabel();
		  cont_label = v_genlabel();
                  /* Check ll reg */
                  ECi( lw_op_, SIM_T4, VSS_BASE, LLADDR_OFF );
                  /* Get physical line number into A3 */
                  /*if the lladdr_reg doesn't match this address, then */
                  /*skip the store and return 0.  NOTE: that we are not */
                  /*being architecturally pure because an ll to a */
                  /*mapped address and an sc to the corresponding K0 */
                  /*address will succeed, while the MIPS ISA says it */
                  /*shouldn't */
		  v_bnei(VREGS[tmp1], VREGS[SIM_T4], fail_label);
                  /* Success  - Do store to memory */
                  ECi( ori_op_, SIM_T4, G0, 1); /* result = success */
                  if (MAJOR_OPCODE(instr) == sc_op) {
                     ECi(sw_op_, tmp2, tmp1, 0);
                  } else {
                     ECi(REG_ST_OP, tmp2, tmp1, 0);
                  }
		  v_jv(cont_label);
		  /* Failure - don't store & zero result */
		  v_label(fail_label);
		  ECi( ori_op_, SIM_T4, G0, 0); /* result = failure */
		  /* Store the return code */
		  v_label(cont_label);
                  Store_Move( SIM_T4, rt(instr) );

		} else {
                  /*
                   * The implementation of LL/SC in page mode has a
                   * value-based compare-and-swap semantic. It checks
                   * for matching addresses (cleared by exceptions) and
                   * for matching values. This means that if another
                   * processor increments and then decrements the value,
                   * the SC will still succeed, when a real LL/SC would
                   * have failed.
                   * We are also not being architecturally pure in that we
                   * expect matching virtual addresses for the LL/SC pair and
                   * not matching physical addresses.
                   * Bug report: embra/44
                   */
                  if (embra.sequential) {
		    static v_label_t sc_fails;
		    sc_fails = v_genlabel();
		    /*
		     * do it deterministically !!!
		     * *************************
		     * Check ll reg  to avoid spurious matches of
		     * contents i.e. this allows us to rely on
		     * exceptions making this sc fail because the reg
		     * is cleared, even if the contents were 0 (open lock)
		     */

                     /*
                      * LLADDR_OFF contains the Memory address of the ll
                      * (not kseg0 or physical address)
                      * A0   : actual value
                      * A1   : old value
                      * tmp1 : memory address
                      * tmp2 : new value
                      * V0   : 0(failure) 1(success) -> rt(instr)
                      */
                     ECi(lw_op_,  SIM_T4,VSS_BASE, LLADDR_OFF);
                     ECi(ori_op_, V0, G0, 0);
		     v_bnei(VREGS[tmp1], VREGS[SIM_T4], sc_fails);
                     if (MAJOR_OPCODE(instr) == sc_op) {
                        ECi(lw_op_,  A0, tmp1,0);    /* actual value */
		     }
                     else
                        ECi(REG_LD_OP,  A0, tmp1,0);    /* actual value */
                     ECi(REG_LD_OP,  A1, VSS_BASE, LLCONTENTS_OFF); /*old value*/
		     v_bnei(VREGS[A0], VREGS[A1], sc_fails);
                     ECnop;
                     /* success. store value and set 1 in V0 */
                     if (MAJOR_OPCODE(instr) == sc_op) {
                        ECi(sw_op_,  tmp2, tmp1,0);
		     }
                     else
                        ECi(REG_ST_OP,  tmp2, tmp1,0);
                     ECi(ori_op_, V0, G0,1);
		     /* XXX both jumps jump here; is this correct??? NO
		      * This BREAKS because we don't resolve the
		      * second label!! fix this !!  -BL */
                     /* common path */
		     v_label(sc_fails);
                     Store_Move(V0,rt(instr));
                  } else {

		    /* callout.s */
		    extern void compare_and_swap(unsigned, unsigned, unsigned);
#if defined(SIM_MIPS64)
                     CPUError("Embra 64bit parallel mode doesn't work\n"); /* Mendel */
#endif

		    /* Check ll reg  to avoid spurious matches of */
		    /* contents i.e. this allows us to rely on */
		    /* exceptions making this sc fail because the reg */
		    /* is cleared, even if the contents were 0 (open lock)*/
		    ECi( REG_LD_OP,SIM_T4, VSS_BASE, LLADDR_OFF );
		    /*if the lladdr_reg doesn't match this address, then */
		    /*skip the store and return 0.  NOTE: that we are not */
		    /*being architecturally pure because an ll to a */
		    /*mapped address and an sc to the corresponding K0 */
		    /*address will succeed, while the MIPS ISA says it */
		    /*shouldn't */
		    ECi(ori_op_, V0, G0, 0);
		    ECi(bne_op_, tmp1, SIM_T4, 4);

		    /* A0 - phys addr */
		    ECi(ori_op_, A0, tmp1, 0);
		    /* A1 - old_contents */
		    ECi( REG_LD_OP, A1, VSS_BASE, LLCONTENTS_OFF);
		    /* A2 - new_contents */
		    ECi(ori_op_, A2, tmp2, 0);
		    /*XXX - This should be done in emitted code */
		    ECj( jal_op_, compare_and_swap );
		    ECnop;
		    VCTARGET;
		    Store_Move( V0, rt(instr) );
                  }
		}
	      } else {

		switch (MAJOR_OPCODE(instr)) {
		case sb_op: ECi(sb_op_, tmp2, tmp1, 0);
		  break;
		case sh_op: ECi(sh_op_, tmp2, tmp1, 0);
		  break;
		case sw_op: ECi(sw_op_, tmp2, tmp1, 0);
		  break;
		case swl_op:ECi(swl_op_, tmp2, tmp1, 0);
		  break;
		case swr_op:ECi(swr_op_, tmp2, tmp1, 0);
		  break;
		case sdl_op:ECi(sdl_op_, tmp2, tmp1, 0);
		  break;
		case sdr_op:ECi(sdr_op_, tmp2, tmp1, 0);
		  break;
		case sd_op:ECi(REG_ST_OP, tmp2, tmp1, 0);
		  break;
		default:
		  {
		    void set_inst1(int);
		    int get_inst1(void);

		    fprintf(stderr,
			    "translate: undecoded instruction %x at %x\n",instr,
			    &instrGrp.instrs[i]);
		    set_inst1(instr);

		    /* disassembler(0x10000000, NULL, NULL, NULL,
				 get_inst1, NULL); */
		    ASSERT(0);
		  }
		}

	      }

	      break;

#if defined(SIM_MIPS32)
         case ld_op: /* XXX backward compat */
#endif
	    case ldc2_op:
	      tmp1 = D_Memory_Check( &trans, instr, MEM_D_SHARED);

               /* first word */
            tmp2 = Set_Destination( SIM_T2, rt(instr) );
            ECi( lw_op_, tmp2, tmp1, 0);
            TRAILING_LOAD( tmp2 );
            Store( tmp2, rt(instr) );
            /* second word */
            tmp2 = Set_Destination( SIM_T2, rt(instr)+1 );
            ECi( lw_op_, tmp2, tmp1, 4);
            TRAILING_LOAD( tmp2 );
            Store( tmp2, rt(instr)+1 );
            break;

#if defined(SIM_MIPS32)
         case sd_op: /* XXX backward compat */
#endif

	    case sdc2_op:
	      tmp1 = D_Memory_Check( &trans, instr, MEM_D_EXCLUSIVE);
	      /* first word */
	      tmp2 = Load( SIM_T2, rt(instr) );
	      ECi( sw_op_, tmp2, tmp1, 0);
	      /* second word */
	      tmp2 = Load( SIM_T2, rt(instr)+1 );
	      ECi( sw_op_, tmp2, tmp1, 4);
	      break;

	      /* Prefetch -- treat as a nop for now */
	    case pref_op:
	      break;

	      /*************************************/
	      /* Floating Point Loads -- */
	      /* These are only slightly different */
	      /* from regular loads in that the register values are stored */
	      /* in the registers, not in the virtual state structure */
	      /*************************************/

	    case lwc1_op:
	    case ldc1_op:
	      Check_C1_Usable( &trans );

	      tmp1 = D_Memory_Check( &trans,instr,MEM_D_SHARED);

	      switch(MAJOR_OPCODE(instr)) {
	      case lwc1_op: EC1i(lwc1_op_, rt(instr), tmp1, 0);
		break;
	      case ldc1_op: EC1i(ldc1_op_, rt(instr), tmp1, 0);
		break;
	      default:
		goto unknown_opcode;
	      }

	      TRAILING_LOAD( rt(instr) );

	      break;

	      /*************************************/
	      /* Floating Point Stores -- */
	      /* These, like loads are a little different from regular */
	      /* stores in that we use the floating */
	      /* registers directly */
	      /*************************************/
	    case swc1_op:
	    case sdc1_op:
	      Check_C1_Usable( &trans );
	      tmp1 = D_Memory_Check( &trans,instr, MEM_D_EXCLUSIVE);
	      switch (MAJOR_OPCODE(instr)) {
	      case swc1_op:  EC1i(swc1_op_, rt(instr), tmp1, 0);
		break;
	      case sdc1_op:  EC1i(sdc1_op_, rt(instr), tmp1, 0);
		break;
	      default:
		goto unknown_opcode;
	      }

	      break;

	      /* This is a pseudo opcode to help do MP via a special */
	      /* global test and set*/
	    case mendel_tns:
	      ASSERT(instrGrp.no_reg_allocate);
	      /* Put procedure number in SIM_T2 */
	      ECi(addiu_op_, SIM_T2, G0, CALLOUT_TNS);
	      ECj(jal_op_, callout);
	      ECnop;
	      break;

	    case cache_op:
#ifdef IRIX6_4
            Load_32_Bit_Immed(A1,instr);
            Do_Callout(&trans,CALLOUT_CACHEOP);
#else
	      /* Caveat: this is not entirely correct, since this
	       * should raise an exception in user mode.
	       * To be done.
	       */
	      /*
	       * Right now, this is a nop. It could turn out that
	       * we need to flush the TC on some cache ops, but
	       * the MMU/qc downgrade mechanism combined with the
	       * icache_coherence check should suffice.
	       */
#endif
	      break;

	    default:
	      /* this looks really silly! -BL */
	 unknown_opcode:
	      CPUWarning("Translator: unknown opCode %#x at pc %#x, i = %d\n", instr, current_pc, i );
	      Do_Exception_Callout( &trans, EXC_II);
	      /* The above is the correct thing to do, but if we assume */
	      /* no buggy programs the assert can be more helpful for */
	      /* debugging */
	      ASSERT(0);
	      break;

	    } /* MAJOR_OPCODE switch  */
       /* if (instr != nop ) */
	 }
      /*
       * (post)PC annotations
       */
      if (instrGrp.pcAnn[i] & ANNFM_PC_TYPE) {
         uint *startCallout = memptr;
         Do_Callout(&trans, CALLOUT_PC_ANN);
         TC_IncrementSize(tcCache,(char*)memptr-(char*)startCallout);
      }

      /* Get next instruction */
   } /* for loop */


   /* This maintains a bitmap  so we can detect writes to a */
   /* code page. */
   if( !instrGrp.next_maPC ) {
      TCcoherence_mark_code(PHYS_TO_MEMADDR(M_FROM_CPU(cpuNum), instrGrp.phys_pc),
                            PHYS_TO_MEMADDR(M_FROM_CPU(cpuNum), instrGrp.phys_pc + 4*instrGrp.GrpLen ));
      qc_downgrade_ifpresent(instrGrp.virt_pc);
   } else {
      int firstpart = ((int)NEXT_PAGE(instrGrp.maPC)-(int)instrGrp.maPC);
      int rest = 4 * instrGrp.GrpLen - firstpart;
      PA afterFirst = (PA) (NEXT_PAGE(instrGrp.phys_pc));
      ASSERT (rest>0);
      TCcoherence_mark_code(PHYS_TO_MEMADDR(M_FROM_CPU(cpuNum), instrGrp.phys_pc),
                            PHYS_TO_MEMADDR(M_FROM_CPU(cpuNum), afterFirst));
      TCcoherence_mark_code(PHYS_TO_MEMADDR(M_FROM_CPU(cpuNum), instrGrp.next_phys_pc),
                            PHYS_TO_MEMADDR(M_FROM_CPU(cpuNum), instrGrp.next_phys_pc+rest));

      qc_downgrade_ifpresent(instrGrp.virt_pc);
      qc_downgrade_ifpresent(instrGrp.virt_pc+firstpart);

   }


   if(embra.emode == EMBRA_CACHE){
      /* Update the PC, depending on the flow. */
      Update_PC(&trans, trans.flow,
                 next_PC );
      /* If block branches to self, connect to Icache check for accuracy */
      /* in face of conflicts see cache.c */
      /* Don't chain delay slot translations because their prelude is
         non-standard */
      if( !is_delay_slot_instr ) {
         TC_PCInsert(tcCache, tc_start+tc_offset+SPECULATIVE_ENTRY, instrGrp.virt_pc,instrGrp.maPC);

	 /* was: pc_tc_insert( cpuNum, instrGrp.virt_pc, instrGrp.maPC,
		       tc_start + tc_offset + SPECULATIVE_ENTRY,
		       memptr - tc_start - tc_offset - SPECULATIVE_ENTRY); */
      }
   } else {
      /* Insert this translation into the hash table so if this basic */
      /* block branches to itself, we catch that case */
      /* Don't chain delay slot translations because their prelude is
         non-standard */
      if( !is_delay_slot_instr ) {
         TC_PCInsert(tcCache,tc_start+tc_offset+SPECULATIVE_ENTRY, instrGrp.virt_pc,instrGrp.maPC);
         /* was: pc_tc_insert( cpuNum, instrGrp.virt_pc, instrGrp.maPC,
		       tc_start + tc_offset + SPECULATIVE_ENTRY,
		       memptr - tc_start-SPECULATIVE_ENTRY);  */
         /* Update the PC, depending on the flow. */
         Update_PC(&trans,trans.flow, next_PC);
      }
   }

   FillInLongConst();

   ASSERT (memptr - tc_start <  est_translen);

   /* Now, properly set the TC pointer */
   TC_SetTCNext( tcCache,tc_start, memptr );

#ifdef RECORD_TRANSLATIONS
   /*if( current_pc > 0x42c4bc && current_pc < 0x42c57c )*/
   {
      write( trans, (char*)"\000\000\000\000", 4);
      write( trans, (char*)pPC, instrGrp.GrpLen*sizeof(unsigned));
      write( trans, (char*)tc_start, ((int)memptr - (int)tc_start) );
   }
#endif


   STAT_INC_BY( trans_instrs, instrGrp.GrpLen );
   STAT_BB_SIZE(memptr - tc_start );
   STAT_TIMER_STOP( trans_timer );

#if 0
   CPUWarning("PC=0x%x TC=0x%08x \n", current_pc,tc_start);
   ASSERT( current_pc != 0x600ddb78);
#endif

   /* XXX sad; more hax; add a nop at the end for now */
   v_nop();

   v_end(0); /* important: link basic block */

   /* XXX another hack -nuke vcode header; fix this -BL */
   /* we don't really need this  *(tc_start-1) = 0; */

   /* doesn't work: CheckRegs(tc_start, current_pc, instrGrp.maPC); */

   /* Be conservative (and correct) by jumping to the speculative entry */
   return tc_start + tc_offset + SPECULATIVE_ENTRY;
}


/*
 * Bump a running tally each time this routine is entered and report
 * the new total through CPUPrint.  The tally lives in a function-local
 * static, so it persists across calls and starts at zero.
 */
void CountFP(void)
{
   static int fpCallTotal = 0;

   fpCallTotal += 1;
   CPUPrint("Num c1 instr %d\n", fpCallTotal);
}

/*
 * Stand-in definition of __eprintf, supplied here so the symbol
 * resolves without rebuilding the library that references it.
 * It only announces on stderr that it was reached.
 */
void __eprintf(void)
{
  fputs("eprintf called\n", stderr);
}


/* Globally visible integer slot, accessed through the two helpers below. */
int inst1;

/* Store a into inst1. */
void set_inst1(int a)
{
   inst1 = a;
}

/* Return the value most recently stored in inst1. */
int get_inst1(void)
{
   return inst1;
}

#if CHECKREGS

/*
 * PC and translation target passed to the previous CheckRegs call.
 * They are only printed when a register mismatch is reported.
 */
static int last_pc=0;
static int last_target=0;

void DumpRegs(Reg *R);

/*
 * Read exactly len bytes from fildes into buf.
 *
 * read() can fail (-1), hit end of file (0) or return fewer bytes than
 * requested.  In all of those cases buf does not hold a complete
 * record, so comparing it against the live state would only produce a
 * misleading "out of sync" message.  Report the real cause instead.
 *
 *   buf  - destination buffer
 *   len  - number of bytes that make up the record
 *   what - short description of the record, used in the diagnostic
 */
static void CheckRegsRead(void *buf, int len, const char *what)
{
   if (read(fildes, buf, len) != len) {
      fprintf(stderr,"short read of %s at block %x\n\r", what, nblocks);
      ASSERT(0);
   }
}

/*
 * Per-block consistency check against the records stored in fildes.
 *
 * Every call counts one block in nblocks.  Nothing else happens until
 * the CPU cycle count exceeds min_cycles.  After that:
 *   - if print_blocks is set, one line describing the block is printed;
 *   - if check_regs is set, the next record (block number, PC,
 *     blockCycleCountdown, register file) is read from fildes and
 *     compared with the current state in curEmp.  Block number, PC and
 *     cycle countdown mismatches assert immediately; a register
 *     mismatch dumps both register sets and asserts only if die_reg
 *     is set.
 * Finally the current pc/target are remembered for the next call.
 *
 *   current_target - value printed as "tc" / "current_target"
 *   current_pc     - value printed as "pc" / "current_pc"
 *   physaddr       - value printed as "ma"
 */
void CheckRegs(int current_target, int current_pc, int physaddr)
{
  int64 cycles = EmbraCpuCycleCount(curEmp-EMP);
   nblocks++;
   if (cycles <= min_cycles) return;

   if (print_blocks) fprintf(stderr,"cycles=%lld nblocks=%x tc=%x pc=%x ma=%x\n\r",
			     cycles, nblocks, current_target, current_pc,
			     physaddr);

   if (check_regs) {
     /* Number of registers compared and dumped below. */
     const int numRegs = 32;
     /* Kept static as in the original; EMP2 in particular may be large. */
     static int nblocks2;
     static EmbraState EMP2;
     static Reg PC;
     static int blockCycleCountdown;
     int i;

     CheckRegsRead(&nblocks2, (int)sizeof(nblocks2), "block number");
     if (nblocks2 != nblocks) {
       fprintf(stderr,"out of block sync at %x (got %x)\n", nblocks, nblocks2);
       ASSERT(0);
     }

     CheckRegsRead(&PC, (int)sizeof(PC), "PC");
     if (PC != curEmp->PC) {
       fprintf(stderr,"out of PC sync at %x\n\r",nblocks);
       ASSERT(0);
     }

     CheckRegsRead(&blockCycleCountdown, (int)sizeof(blockCycleCountdown),
		   "blockCycleCountdown");
     if (blockCycleCountdown != curEmp->blockCycleCountdown) {
       fprintf(stderr,"out of blockcycle sync at %x\n\r",nblocks);
       ASSERT(0);
     }


     CheckRegsRead(&EMP2.R, (int)sizeof(EMP2.R), "registers");
     /*
      * Size the comparison from the register type rather than assuming
      * 4-byte registers, so that all numRegs registers are covered
      * whatever the width of Reg is.
      */
     if (bcmp(curEmp->R, EMP2.R, numRegs*sizeof(EMP2.R[0])) != 0) {
       fprintf(stderr,"\rRegisters don't match up before block %x\n\r", nblocks);
       fprintf(stderr,"last_target = %x, current_target= %x\n\r",
	       last_target, current_target);
       fprintf(stderr, "last_pc= %x, current_pc= %x\n\r",
	       last_pc, current_pc);
       fprintf(stderr,"Registers:\n\r"); DumpRegs(curEmp->R);
       fprintf(stderr,"Should be:\n\r"); DumpRegs(EMP2.R);
       /* Point out each individual register that differs. */
       for (i=0; i < numRegs; i++)
	 if (curEmp->R[i] != EMP2.R[i]) fprintf(stderr, "Check R%d\n\r",i);
       if (die_reg) ASSERT(0);
     }
   }
   last_pc = current_pc;
   last_target = current_target;

}

/*
 * Print the 32 registers in R to stderr as eight rows of four columns.
 * Row n shows R[n], R[n+8], R[n+16] and R[n+24], each labelled with a
 * register name and number.  Each row ends in "\n\r".
 */
void DumpRegs(Reg *R)
{
  /* One format string per output row; labels are baked into the text. */
  static const char *const rowFormat[8] = {
    "zero(r0): %08x  t0(r8):  %08x  s0(r16): %08x  t8(r24): %08x\n\r",
    "  at(r1): %08x  t1(r9):  %08x  s1(r17): %08x  t9(r25): %08x\n\r",
    "  v0(r2): %08x  t2(r10): %08x  s2(r18): %08x  k0(r26): %08x\n\r",
    "  v1(r3): %08x  t3(r11): %08x  s3(r19): %08x  k1(r27): %08x\n\r",
    "  a0(r4): %08x  t4(r12): %08x  s4(r20): %08x  gp(r28): %08x\n\r",
    "  a1(r5): %08x  t5(r13): %08x  s5(r21): %08x  sp(r29): %08x\n\r",
    "  a2(r6): %08x  t6(r14): %08x  s6(r22): %08x  fp(r30): %08x\n\r",
    "  a3(r7): %08x  t7(r15): %08x  s7(r23): %08x  ra(r31): %08x\n\r"
  };
  int row;

  for (row = 0; row < 8; row++) {
    fprintf(stderr, rowFormat[row],
	    R[row], R[row + 8], R[row + 16], R[row + 24]);
  }
}

#endif


#ifndef DISASSEMBLER_EXISTS
/*
 * Placeholder used when no disassembler is available (n32 / n64
 * builds).  It does nothing and always returns 0.  A proper fix would
 * be made in vcode itself, or by linking a real disassembler.
 */
int disassembler(void)
{
  const int noResult = 0;

  return noResult;
}
#endif