# recovery.tcl
#
# Copyright (C) 1996-1998 by the Board of Trustees
#    of Leland Stanford Junior University.
# 
# This file is part of the SimOS distribution. 
# See LICENSE file for terms of the license. 
#
###
### Recovery statistics -- used for measuring recovery perf
###
### Dan Teodosiu, Apr 1996
### Revision history:
###   - brought up to date (Dan Teodosiu, Sep. 1996)
###   - small updates, dieset injection (DT, 02/97)
###
### Before sourcing this script, you should set the following variables:
###   recoveryWatchOnly  -- watch only, don't force recovery
###   recoveryKickFast   -- fast-kick recovery
###   recoveryDebugRc    -- add debugging annotations
###   recoverySwitch     -- switch modes:
###                           0 -> off
###                           1 -> when OS recovery starts
###                           2 -> after OS recovery complete
###   recoverySwitchTo   -- EMBRA_PAGE, ..., EXIT
###   recoveryDieSet     -- dieset (optional, defaults to 0)
###

annotation set simos enter {
    #
    # Keep recovery quiet: we do not want to waste simulated time in
    # printf.  CPU 0 looks up the address of verbose_recovery in each
    # cell's kernel and writes 0 to it.
    #
    if {$CPU == 0} {
        set i 0
        while {$i < $PARAM(HIVE.NumCells)} {
            set vrp [symbol read "kernel$i:hive_recovery.c:&verbose_recovery"]
            set MEMORY($vrp) 0
            incr i
        }
    }
}

# Global counters used by the annotations in this script:
#   rfn    -- incremented by the recovery_finished annotation
#   rtrigd -- set to 1 once forced recovery has been triggered
#   nsted  -- incremented each time a "cfe" exec is observed
set rfn    0
set rtrigd 0
set nsted  0

# CPUtoCell(cpu) maps a CPU number to its cell number, using integer
# division by the number of CPUs per cell.
set cpusPerCell [expr {$PARAM(CPU.Count) / $PARAM(HIVE.NumCells)}]
set i 0
while {$i < $PARAM(CPU.Count)} {
    set CPUtoCell($i) [expr {$i / $cpusPerCell}]
    incr i
}

# getLS n
#   Return the list of cell indices i (0 .. HIVE.NumCells-1) for which
#   bit 0 of kernel n's CPA->ls<i> is set.  Returns an empty string when
#   no entry has the bit set.  (Presumably "LS" is the live set as seen
#   by cell n -- confirm against hive.h.)
proc getLS {n} {
    global PARAM

    set members ""
    set idx 0
    while {$idx < $PARAM(HIVE.NumCells)} {
        set word [symbol read "kernel$n:hive.h:CPA->ls<$idx>"]
        if {$word & 0x1} {
            lappend members $idx
        }
        incr idx
    }
    return $members
}

# printStats i
#   Print the recovery timing summary for cell i on the console.  All
#   values are differences between the cycle stamps stored in the
#   global arrays rk/ls/lf/rs/rf/rb1/ra1/rb2/ra2/ps/pe, plus one
#   latency per flood round (rounds 1 .. HIVE.NumCells) taken from
#   frs/fre.
proc printStats {i} {
    global PARAM rk ls lf rs rf rb1 ra1 rb2 ra2 frs fre ps pe recoveryWatchOnly

    console "  RECOVERY stats for cell $i:\n"

    set kickToLset [expr {$ls($i) - $rk($i)}]
    console "\tLatency (kick->lset): $kickToLset\n"

    set lsetTime  [expr {$lf($i) - $ls($i)}]
    set recvTime  [expr {$rf($i) - $rs($i)}]
    set totalTime [expr {$rf($i) - $ls($i)}]
    console "\tLSET: $lsetTime RECV: $recvTime Total Latency: $totalTime\n"

    console "\t\tRound latencies: "
    set round 1
    while {$round <= $PARAM(HIVE.NumCells)} {
        set roundTime [expr {$fre($i,$round) - $frs($i,$round)}]
        console "$roundTime "
        incr round
    }
    set pingTime [expr {$pe($i) - $ps($i)}]
    console "\n\t\tFirst ping round latency: $pingTime\n"

    set b1Time [expr {$ra1($i) - $rb1($i)}]
    console "\tB1: Start: $rb1($i) Latency: $b1Time\n"
    set b2Time [expr {$ra2($i) - $rb2($i)}]
    console "\tB2: Start: $rb2($i) Latency: $b2Time\n"
}

# die mess
#   Report an unexpected condition on the console, tagged with the
#   current CPU, its cell and the current cycle count.  Despite the
#   name, this only prints a warning and returns.
proc die {mess} {
    global CPU CPUtoCell CYCLES

    set where "CPU=$CPU cell=$CPUtoCell($CPU) cyc=$CYCLES"
    console "RECV STATS $where: $mess\n"
    ### Note: due to a bug in annotations code, an annotation may fire more
    ###       than once on a given instruction, if the instruction takes an
    ###       exception (s.a. a TLB miss). Thus, we only print a warning
    ###       but do not exit here.
    ### exit
}

### why this kludge? because cpuEnter EMBRA_PAGE is currently broken if
### called from a PC annotation...
###
### enterSimlPending is 0 until a mode switch has been scheduled and 1
### afterwards, so the switch is scheduled at most once.
set enterSimlPending 0

### enterSiml
###   Schedule a switch to the simulator mode named by the global
###   recoverySwitchTo.  Instead of calling cpuEnter directly, this
###   installs a cycle annotation 10000 cycles from now that prints the
###   a0/pc/ra values once per CPU index and then calls cpuEnter.
###   Calls after the first one are no-ops.
proc enterSiml {} {
    global CYCLES enterSimlPending recoverySwitchTo PARAM a0 pc ra

    if {$enterSimlPending == 0} {
	# Print the actual target mode (the '$' used to be escaped, which
	# printed the literal text "$recoverySwitchTo").
	console "RECOVERY: preparing to switch back to $recoverySwitchTo...\n"
	# The braced body is not substituted here; it is evaluated when the
	# cycle annotation fires.
	annotation set cycle [expr {$CYCLES + 10000}] {
	    console "RECOVERY: switching back to $recoverySwitchTo...\n"
	    for {set i 0} {$i < $PARAM(CPU.Count)} {incr i} {
		console "CPU $i: a0=$a0, pc=$pc, ra=$ra\n"
	    }
	    cpuEnter $recoverySwitchTo
	}
	set enterSimlPending 1
    }
}

# Zero all per-cell cycle stamps and the per-cell round counter, then
# the flood-round start/end times for rounds 1 .. HIVE.NumCells.
for {set i 0} {$i < $PARAM(HIVE.NumCells)} {incr i} {
    foreach recvStatName {rk ls lf rs rf rb1 ra1 rb2 ra2 ps pe rnd} {
        set ${recvStatName}($i) 0
    }
    # initialize flood-round times
    set j 1
    while {$j <= $PARAM(HIVE.NumCells)} {
        set frs($i,$j) 0
        set fre($i,$j) 0
        incr j
    }
}

#
# Set the following procs MUSTRUN on CPU 0: PING, RECV, LSET
# (proc table slots 6, 8 and 7 respectively).
# This should save a lot of $ misses on Flash.
# Skipped entirely in watch-only mode; done once, from CPU 0.
#
annotation set simos enter {
    if {$recoveryWatchOnly == 0} {
        if {$CPU == 0} {
            set i 0
            while {$i < $PARAM(HIVE.NumCells)} {
                # PING
                symbol set "kernel$i:proc.h:proc<6>.p_mustrun" 0
                # LSET
                symbol set "kernel$i:proc.h:proc<7>.p_mustrun" 0
                # RECV
                symbol set "kernel$i:proc.h:proc<8>.p_mustrun" 0
                incr i
            }
        }
    }
}

# End of hive_kick_recovery_init: remember the first kick time seen on
# this CPU's cell and, when recoverySwitch is 1, exit or schedule the
# switch to recoverySwitchTo.
annotation set pc kernel:hive_recovery.c:hive_kick_recovery_init:END {
    log "RECV $CPU: $CYCLES hive_kick_recovery_init:END\n"
    set cell $CPUtoCell($CPU)
    # Only the first kick on a cell is recorded; later ones are ignored.
    if {$rk($cell) == 0} {
        set rk($cell) $CYCLES
    }
    if {[info exists recoverySwitch] && $recoverySwitch == 1} {
        if {$recoverySwitchTo == "EXIT"} {
            exit
        } elseif {$PARAM(CPU.Model) != $recoverySwitchTo} {
            # Not already running in the requested mode.
            enterSiml
        }
    }
}

# Cycle stamps for the start and end of LSET and the start of RECV.
# Each handler logs the event, warns through die if the stamp for this
# CPU's cell was already non-zero, and then stores the current cycle.
annotation set pc kernel:hive_recovery.c:hive_LSET:lset_started {
    log "RECV $CPU: $CYCLES hive_LSET:lset_started\n"
    set cell $CPUtoCell($CPU)
    if {$ls($cell) != 0} {
        die "ls already set"
    }
    set ls($cell) $CYCLES
}

annotation set pc kernel:hive_recovery.c:hive_LSET:lset_finished {
    log "RECV $CPU: $CYCLES hive_recovery.c:hive_LSET:lset_finished\n"
    set cell $CPUtoCell($CPU)
    if {$lf($cell) != 0} {
        die "lf already set"
    }
    set lf($cell) $CYCLES
}

annotation set pc kernel:hive_recovery.c:hive_RECV:recovery_started {
    log "RECV $CPU: $CYCLES hive_RECV:recovery_started\n"
    set cell $CPUtoCell($CPU)
    if {$rs($cell) != 0} {
        die "rs already set"
    }
    set rs($cell) $CYCLES
}

# Recovery finished on this CPU's cell: stamp rf, print the cell's
# statistics and, once every cell in this cell's live set has reported,
# handle recoverySwitch == 2 and (unless watch-only) exit.
annotation set pc kernel:hive_recovery.c:hive_RECV:recovery_finished {
    set cell $CPUtoCell($CPU)

    log "RECV $CPU: $CYCLES hive_RECV:recovery_finished\n"
    if {$rf($cell) != 0} {
	# An annotation may fire more than once on the same instruction
	# (see the note in die).  Warn, but do not count this cell again:
	# a double count would make the completion test below match too
	# early or never.
	die "rf already set"
    } else {
	incr rfn
    }
    set rf($cell) $CYCLES
    printStats $cell

    set lset [getLS $cell]
    console "  RECOVERY LS($cell) is $lset\n"
    # rfn counts the cells that have finished; compare it with the size
    # of the list returned by getLS for this cell.
    if {$rfn == [llength $lset]} {
	# recovery is complete
	console "  RECOVERY: recovery complete on all live cells.\n"
	if {[info exists recoverySwitch] != 0 && $recoverySwitch == 2} {
	    if {$recoverySwitchTo == "EXIT"} {
		exit
	    } elseif {$PARAM(CPU.Model) != $recoverySwitchTo} {
		enterSiml
	    }
	}

	if {$recoveryWatchOnly == 0} {
	    ### comment the following line out if you don't wish to exit
	    exit
	}
    }
}

# Cycle stamps taken at the recovery_before_N / recovery_after_N labels
# of hive_RECV (N = 1, 2); printStats reports them as "B1" and "B2".
# Each handler logs the event, warns through die if the stamp was
# already non-zero, and then stores the current cycle.
annotation set pc kernel:hive_recovery.c:hive_RECV:recovery_before_1 {
    log "RECV $CPU: $CYCLES hive_RECV:recovery_before_1\n"
    set cell $CPUtoCell($CPU)
    if {$rb1($cell) != 0} {
        die "rb1 already set"
    }
    set rb1($cell) $CYCLES
}

annotation set pc kernel:hive_recovery.c:hive_RECV:recovery_after_1 {
    log "RECV $CPU: $CYCLES hive_RECV:recovery_after_1\n"
    set cell $CPUtoCell($CPU)
    if {$ra1($cell) != 0} {
        die "ra1 already set"
    }
    set ra1($cell) $CYCLES
}

annotation set pc kernel:hive_recovery.c:hive_RECV:recovery_before_2 {
    log "RECV $CPU: $CYCLES hive_RECV:recovery_before_2\n"
    set cell $CPUtoCell($CPU)
    if {$rb2($cell) != 0} {
        die "rb2 already set"
    }
    set rb2($cell) $CYCLES
}

annotation set pc kernel:hive_recovery.c:hive_RECV:recovery_after_2 {
    log "RECV $CPU: $CYCLES hive_RECV:recovery_after_2\n"
    set cell $CPUtoCell($CPU)
    if {$ra2($cell) != 0} {
        die "ra2 already set"
    }
    set ra2($cell) $CYCLES
}

# Cycle stamps at entry to and exit from activate_ping; printStats
# reports their difference as the first ping round latency.
annotation set pc kernel:hive_recovery.c:activate_ping:START {
    log "RECV $CPU: $CYCLES activate_ping gen $a0\n"
    set cell $CPUtoCell($CPU)
    if {$ps($cell) != 0} {
        die "ps already set"
    }
    set ps($cell) $CYCLES
}

annotation set pc kernel:hive_recovery.c:activate_ping:END {
    log "RECV $CPU: $CYCLES activate_ping END\n"
    set cell $CPUtoCell($CPU)
    if {$pe($cell) != 0} {
        die "pe already set"
    }
    set pe($cell) $CYCLES
}

# Flood-round timing.  rnd(cell) is the current round number of a cell;
# frs/fre hold the first start and first completion cycle of each round.
# NOTE(review): frs/fre are initialised only for rounds 1..HIVE.NumCells
# while rnd starts at 0, so this presumably relies on flip_rounds firing
# before the first matching flood message -- confirm.
annotation set pc kernel:hive_recovery.c:flood_send_message:START {
    set cell $CPUtoCell($CPU)
    # Only a send with a0 == 0 starts the round, and only the first one
    # in a round is recorded.
    if {$a0 == 0} {
        if {$frs($cell,$rnd($cell)) == 0} {
            log "RECV $CPU: $CYCLES starting round $rnd($cell)\n"
            set frs($cell,$rnd($cell)) $CYCLES
        }
    }
}

annotation set pc kernel:hive_recovery.c:flood_round_complete:END {
    set cell $CPUtoCell($CPU)
    # A return value (v0) of 1 marks the round as done; record it once.
    if {$v0 == 1} {
        if {$fre($cell,$rnd($cell)) == 0} {
            log "RECV $CPU: $CYCLES done round $rnd($cell)\n"
            set fre($cell,$rnd($cell)) $CYCLES
        }
    }
}

annotation set pc kernel:hive_recovery.c:flip_rounds:START {
    # Advance this cell's round counter.
    set cell $CPUtoCell($CPU)
    set rnd($cell) [expr {$rnd($cell) + 1}]
}

if {$recoveryWatchOnly == 0} {
    if {$recoveryKickFast == 0} {
        ### the following triggers recovery on the next clock tick after
        ### all the compiles have started

        annotation set pc kernel::exece:END {
            # Count execs of "cfe" until the trigger has been fired.
            if {$rtrigd == 0} {
                if {"$PROCESS($CPU)" == "cfe"} {
                    console "RECV $CPU: cfe exec'd\n"
                    incr nsted
                    # One cfe exec per CPU has been seen: fire once.
                    if {$nsted >= $PARAM(CPU.Count)} {
                        set vrp [symbol read "kernel0:hive_recovery.c:&force_recovery"]
                        ### force recovery on next clock tick on cell 0
                        set MEMORY($vrp) 1
                        console "RECV TRIGGERED: $CYCLES\n"
                        set rtrigd 1
                    }
                }
            }
        }
    }
}

# At the end of SimosGetDieSet, override its return register when the
# user supplied recoveryDieSet; otherwise v0 is left untouched.
annotation set pc kernel:SIMMPasm.s:SimosGetDieSet:END {
    if {[info exists recoveryDieSet]} {
        # Smash v0 with dieset (bitmap of who's alive here).
        # This controls rebooting.
        set v0 $recoveryDieSet
    }
}

# Fast kick: when not in watch-only mode and recoveryKickFast is set,
# CPU 0 sets force_recovery in kernel 0 from the "simos enter"
# annotation.
if {$recoveryWatchOnly == 0} {
    if {$recoveryKickFast != 0} {
        annotation set simos enter {
            if {$CPU == 0} {
                ### force recovery on next clock tick on cell 0
                set vrp [symbol read "kernel0:hive_recovery.c:&force_recovery"]
                set MEMORY($vrp) 1
                console "RECV TRIGGERED: $CYCLES\n"
            }
        }
    }
}

# Optional debugging annotations, installed only when recoveryDebugRc is
# non-zero.  Each one writes a single log line (CPU, cycle count, event
# name and, for some, argument registers) at the entry of a function in
# hive_recovery.c.
if {$recoveryDebugRc != 0} {

    annotation set pc kernel:hive_recovery.c:hivePingStub:START {
	set cell $CPUtoCell($CPU)
	log "RECV $CPU: $CYCLES hivePingStub\n"
	# Disabled (condition is the constant 0): would decode the ping
	# message pointed to by a0 and log its fields together with the
	# receiving cell's p_serial / p_round for the sender.
	if {0} {
	    set pm [symbol read "kernel0:hrpc.h:&((HrpcMessageType*)$a0)->m<0>"]
	    set f  [symbol read "kernel0:hive_recovery.c:((PingMessage*)$pm)->from"]
	    set s  [symbol read "kernel0:hive_recovery.c:((PingMessage*)$pm)->serial"]
	    set r  [symbol read "kernel0:hive_recovery.c:((PingMessage*)$pm)->round"]
	    set g  [symbol read "kernel0:hive_recovery.c:((PingMessage*)$pm)->gen"]
	    set cp [symbol read "kernel$cell:hive_recovery.c:&(p.c<$f>)"]
	    set cs [symbol read "kernel0:hive_recovery.c:((PingCell*)$cp)->p_serial"]
	    set cr [symbol read "kernel0:hive_recovery.c:((PingCell*)$cp)->p_round"]
	    log "RECV $CPU: (f=$f s=$s r=$r g=$g) ($cs $cr)\n"
	}
    }
    
    # Logs the first two argument registers (a0, a1).
    annotation set pc kernel:hive_recovery.c:send_ping_message:START {
	log "RECV $CPU: $CYCLES send_ping_message to $a0 is_ping $a1\n"
    }
    
    # A handler for this same PC is also installed unconditionally earlier
    # in this file (flood-round start timing).
    annotation set pc kernel:hive_recovery.c:flood_send_message:START {
	log "RECV $CPU: $CYCLES flood_send_message to $a0\n"
    }
    
    annotation set pc kernel:hive_recovery.c:hiveFloodStub:START {
	log "RECV $CPU: $CYCLES hiveFloodStub\n"
    }
    
    annotation set pc kernel:hive_recovery.c:flood_heartbeat:START {
	log "RECV $CPU: $CYCLES flood_heartbeat\n"
    }
    
    # A handler for this same PC is also installed unconditionally earlier
    # in this file (round counter increment).
    annotation set pc kernel:hive_recovery.c:flip_rounds:START {
	log "RECV $CPU: $CYCLES flip_rounds\n"
    }
    
    # A handler for this same PC is also installed unconditionally earlier
    # in this file; it already logs the same line.
    annotation set pc kernel:hive_recovery.c:activate_ping:START {
	log "RECV $CPU: $CYCLES activate_ping gen $a0\n"
    }
    
    annotation set pc kernel:hive_recovery.c:initialize_flood:START {
	log "RECV $CPU: $CYCLES initialize_flood\n"
    }
    
    annotation set pc kernel:hive_recovery.c:hive_flood:START {
	log "RECV $CPU: $CYCLES hive_flood\n"
    }
    
    annotation set pc kernel:hive_recovery.c:deactivate_ping:START {
	log "RECV $CPU: $CYCLES deactivate_ping\n"
    }
    
    # a0 is logged through the hex command, a1 as-is.
    annotation set pc kernel:hive_recovery.c:ping_set_cell_state:START {
	log "RECV $CPU: $CYCLES ping_set_cell_state [hex $a0] $a1\n"
    }
    
}

# Report the configuration this script was sourced with.  The switch
# settings are printed only when recoverySwitch has been set.
console "  RECOVERY installed:\n"
console "    recoveryWatchOnly  = $recoveryWatchOnly\n"
console "    recoveryKickFast   = $recoveryKickFast\n"
console "    recoveryDebugRc    = $recoveryDebugRc\n"
if {[info exists recoverySwitch]} {
    console "    recoverySwitch     = $recoverySwitch\n"
    console "    recoverySwitchTo   = $recoverySwitchTo\n"
}