seqsuifstat.prl 9.32 KB
#!/usr/local/bin/perl5 -w
#
# Copyright (C) 1996-1998 by the Board of Trustees
#    of Leland Stanford Junior University.
# 
# This file is part of the SimOS distribution. 
# See LICENSE file for terms of the license. 
#

# seqsuifstat.prl
# Author: E.T.
# Date:  8 February 1996
# Purpose: Statistically analyze SUIFSPEC95 (ASPLOS96 paper)
#          data to "prove" validity of only looking at individual
#          steps to characterize a program
# Input:   Input should be the output of the seqsuifspec95.prl -r
#          (-r for raw, since we are interested in just absolute 
#          numbers for statistical analysis)
# Flags:  None = default => separate steps by name.
#         -s => accumulate ALL steps into a single bucket
#         -c => pass arbitrary criteria on the command line as the
#               subsequent argument.  Field names must be delimited
#               by the '#' symbol, e.g.
#          seqsuifstat.prl -c '#_cycles# < 1000' will print summary
#              info for all steps where _cycles was < 1000
#          As a special case, you can pass #stepName# to get the
#          name of the step, and #stepChronos# to get the number of
#          the particular step.  Note that the argument to -c is
#          an arbitrary expression, so you can do some nice things like
#            seqsuifstat.prl -c '#stepName# eq "sweep_END" && #_cycles# < 1000'
#          to get all steps named sweep_END whose cycles are < 1000.

# Global variables (package globals shared by all the subs below)

@g_data = ();			# 2-D array indexed [field][row]: the numbers
				# of the table output by seqsuifspec95.prl,
				# stored column major (one array per field)
@g_fieldNames = ();  # 1-D array of field names, in table column order
@g_bucketNames = ();   # Bucket name (first token) of every table row
@g_selections = ();    # Descriptions of each selection
$g_numSelections = 0;  # Number of selection ids handed out by SelectionId

# Run the program.
&do_main;

sub do_main {
    # Top-level driver:
    #   1. parse the command line and read the table from the input,
    #   2. translate the -c criteria (if any) into evaluable perl,
    #   3. hand every table row to SelectionId and append its values to
    #      the per-field lists of the selection it was assigned to,
    #   4. print one statistics table per selection.

    GetTheOptions();
    ReadInData();
    SetupCriteria() if ($g_criteria);

    my $countField = 0;   # Field whose list length gives the instance count

    my ($bucketName, $name, $chronos, $mode);
    $id = 0;

    # Bin every row according to its bucket name, "<name>-<chronos>..."
    for my $row (0 .. $#g_bucketNames) {
        $bucketName = $g_bucketNames[$row];

        if ($bucketName =~ /^(\w+)-(\d+)/) {
            ($name, $chronos) = ($1, $2);

            # The mode suffix is only looked for when the table header
            # carried -a; otherwise the mode is reported as -1.
            $mode = ($g_allModes && $bucketName =~ /.*-(USER|KERNEL|UTLB)/)
                ? $1
                : -1;

            # One value per field for this row, as input to the selection
            @datums = map { $g_data[$_][$row] } (0 .. $#g_fieldNames);

            $id = SelectionId($name, $chronos, $mode, \@datums);

            # A negative id means "do not bin this row"
            next if ($id < 0);

            # @bucket is indexed [selection id][field] -> list of values
            for my $field (0 .. $#g_fieldNames) {
                push(@{ $bucket[$id][$field] }, $g_data[$field][$row]);
            }
        }
        else {
            print STDERR "do_main: Error in one of the bucketnames = $b\n";
        }
    }

    # Report: one table per selection, one line per field
    for my $sel (0 .. $#g_selections) {
        # Assume all fields have same # of datums, so count field 0
        $numDatums = scalar(@{$bucket[$sel][$countField]});

        PrintStatHdr($g_selections[$sel] . " ($numDatums instances)");

        for my $field (0 .. $#g_fieldNames) {
            @stats = AnalyzeField($bucket[$sel][$field]);
            PrintStat($g_fieldNames[$field], @stats);
        }
    }
}


sub GetTheOptions {
    # Parses the command line (consuming the option words from @ARGV) and
    # sets exactly one of the global mode flags tested by SelectionId:
    #   -s        => $g_AllBuckets     : every step goes into one bucket
    #   -c <expr> => $g_matchCriteria  : only steps satisfying <expr> are
    #                binned; <expr> is stored in $g_criteria (rewritten
    #                later by SetupCriteria) and, untouched, in
    #                $g_criteriaString for display
    #   neither   => $g_individBuckets : one bucket per step name
    # If both -s and -c are given, -s wins and -c is ignored.
    #
    # Option parsing uses the core Getopt::Std module rather than the perl4
    # library getopts.pl, which current perl distributions no longer ship
    # (the require of "getopts.pl" fails there).  The module is loaded here
    # so that this sub stays self-contained.
    require Getopt::Std;

    my %opt;
    Getopt::Std::getopts('sc:', \%opt);

    if (defined($opt{'s'})) {
        $g_AllBuckets = $opt{'s'};
    }
    elsif (defined($opt{'c'})) {
        # Handle some funky criteria!!
        $g_criteria = $g_criteriaString = $opt{'c'};

        # Tells SelectionId to evaluate the criteria for each step
        $g_matchCriteria = 1;
    }
    else {
        $g_individBuckets = 1;
    }
}

sub PrintStatHdr {
    # Prints the heading of one statistics table on STDOUT: two blank
    # lines, the title passed as the only argument, the column captions
    # and a dashed rule as wide as the table.

    my ($title) = @_;
    my @captions = ("Min", "Max", "Avg", "Stddev", "dev/avg(%)");

    # 20 columns for the field name plus 13 (12 + separator) per statistic
    my $ruleWidth = 20 + 13 * scalar(@captions);

    print "\n\n$title\n";
    printf("%-20s ", "Field_Name");
    printf(join(" ", ("%12s") x scalar(@captions)) . "\n", @captions);
    print(("-" x $ruleWidth) . "\n");
}

sub PrintStat {
    # Prints one table line on STDOUT.
    # Arguments: the field name followed by the list of statistics
    # (min, max, avg, stddev as produced by AnalyzeField).
    # _cycles and _instr are shown with one decimal, every other field
    # with five.

    my ($name, @s) = @_;

    my $wholeCounts = ($name eq "_cycles" || $name eq "_instr");
    my $numFormat   = $wholeCounts ? "%12.1f " : "%12.5f ";

    my $row = sprintf("%-20s ", $name);
    foreach my $value (@s) {
        $row .= sprintf($numFormat, $value);
    }

    # The numbers are so big that stddev is hard to judge on its own, so
    # also show stddev/avg as a percentage (skipped when avg is 0).
    if ($s[2] != 0) {
        $row .= sprintf("%11.2f%% ", 100*$s[3]/$s[2]);
    }

    print $row . "\n";
}

sub ReadInData {
    # Reads the table produced by seqsuifspec95.prl from the input (<>)
    # into the globals:
    #   @g_fieldNames  - field names, in column order
    #   %g_fieldPos    - field name -> column index
    #   @g_bucketNames - first token of every table row
    #   @g_data        - values, indexed [field][row]
    #   $g_allModes    - set when the TABLE_START line carries -a
    #
    # Expected layout:
    #   TABLE_START <options>     (must include -r, i.e. raw numbers)
    #   <field names>
    #   <one line that is thrown away, should be ----->
    #   <bucket name> : <value> <value> ...     (one line per row)
    #   a line starting with at least 20 dashes ends the table
    #
    # Negative values are replaced by 0 (with a warning) and missing
    # trailing values are stored as 0.  Blank lines inside the table are
    # skipped.  Exits with a non-zero status if the table was not
    # generated with -r; a missing or truncated table is reported on
    # STDERR and leaves the globals without data.

    my @vals = ();
    my $line = 0;        # Row index of the next data line
    my $tableOpts;       # Text following TABLE_START on the header line

    while (<>) {
        if (/^TABLE_START(.*)$/) {
            $tableOpts = $1;
            last;
        }
    }
    if (! defined($tableOpts)) {
        # Hit end of input without ever seeing the table header
        print STDERR "ReadInData: No TABLE_START line found in the input!\n";
        return;
    }

    # Check to see whether the table was generated with -r (and -a)
    my $rawdataOK = 0;
    foreach my $o (split(" ", $tableOpts)) {
        $rawdataOK  = 1 if ($o eq "-r");
        $g_allModes = 1 if ($o eq "-a");
    }
    if (! $rawdataOK) {
        print STDERR "ReadInData: This output file wasn't run with -r.  You'll probably get rubbish!\n";
        exit 1;          # This is an error, so do not report success
    }

    # The next line holds the field names
    my $fieldLine = <>;
    if (! defined($fieldLine)) {
        print STDERR "ReadInData: Input ended before the field name line!\n";
        return;
    }
    @g_fieldNames = split(" ", $fieldLine);
    my $pos = 0;
    foreach my $fieldName (@g_fieldNames) {
        $g_fieldPos{$fieldName} = $pos++;
    }

    <>;   # Read in next line, throw it away ... should be -----

    while (<>) {  # Read in the data

        last if /^-{20,}/;   # Break out of here at end of table

        @vals = split(" ", $_);
        next if (! @vals);   # Blank line: not a table row

        # First val should be the bucket name, second a lone ':'
        push(@g_bucketNames, $vals[0]);
        if (! defined($vals[1]) || $vals[1] ne ":") {
            print STDERR "ReadInData: Table line doesn't match template! Probably garbage!\n";
        }

        for my $i (0 .. $#g_fieldNames) {
            my $value = $vals[$i+2];
            if (! defined($value)) {
                $value = 0;          # Short row: pad with 0
            }
            elsif ($value < 0) {
                print STDERR "ReadInData: value is negative! at line $line.Replacing with 0\n";
                $value = 0;
            }
            $g_data[$i][$line] = $value;
        }

        $line++;
    }
}

sub AnalyzeField {
    # Computes summary statistics for one list of numbers.
    # Input:   a ref to an array of values
    # Returns: the list (min, max, avg, stddev), where stddev is the
    #          population standard deviation (the sum of squared
    #          deviations is divided by N, not N-1).
    # If no array, or an empty array, is passed, a message is printed on
    # STDERR and (0, 0, 0, 0) is returned; an empty array used to abort
    # the program with a division by zero.

    my $vals = shift;
    my ($min,$max,$avg,$stddev) = (0,0,0,0);

    if (! defined($vals) || ! @$vals) {
        print STDERR "AnalyzeField: NO DATA PASSED!\n";
        return ($min, $max, $avg, $stddev);
    }

    my $count = scalar(@$vals);
    my $sum   = 0;

    $min = $max = $vals->[0];
    foreach my $v (@$vals) {
        if ($v > $max) {
            $max = $v;
        }
        elsif ($v < $min) {
            $min = $v;
        }
        $sum += $v;
    }
    $avg = $sum / $count;

    # Now compute the variance: mean of the squared deviations
    my $variance = 0;
    foreach my $v (@$vals) {
        $variance += ($v - $avg) * ($v - $avg);
    }
    $variance = $variance / $count;
    $stddev   = sqrt($variance);

    return ($min, $max, $avg, $stddev);
}


sub SelectionId {
    # Decides which selection (bin) one table row belongs to.
    # Arguments: step name, step chronos, step mode, and a ref to the
    #            array of the row's values (one per field).
    # RETURNS the index of the selection in @g_selections, or -1 if the
    # row is not to be binned, in which case the caller ignores it.
    #
    # NOTE: the criteria text is run through a string eval and refers to
    # $data, $g_name, $g_chronos and $g_mode by name, so those names must
    # not change.  The text comes straight from the command line.

    my ($lname, $lchronos, $lmode, $data) = @_;

    # Copies for the eval'ed criteria, which uses the globals
    ($g_name, $g_chronos, $g_mode) = ($lname, $lchronos, $lmode);

    # Single-bucket mode: ALWAYS the same "selection ID"
    if ($g_AllBuckets) {
        $g_selections[0] = "ALL_BUCKETS";
        return 0;
    }

    # Text appended to the bucket name to describe a new selection
    my $descrSuffix;

    if ($g_individBuckets) {
        $descrSuffix = "";
    }
    elsif ($g_matchCriteria) {
        $result = eval($g_criteria);
        if (! defined($result)) {
            print STDERR "Error in evaluation string: $@\n";
            return -1;
        }
        return -1 if (! $result);     # Row does not satisfy the criteria
        $descrSuffix = " : $g_criteriaString";
    }
    else {
        print STDERR "SelectionId:  How did you get here??\n";
        return -1;
    }

    # Rows are grouped by step name, qualified by the mode when the
    # table was generated with all modes.
    my $bName = $g_allModes ? $lname . "-" . $lmode : $lname;

    # Known name: reuse its id
    return $seenNames{$bName} if (defined($seenNames{$bName}));

    # New name: allocate the next id and record its description
    $seenNames{$bName} = $g_numSelections;
    $g_selections[$g_numSelections] = $bName . $descrSuffix;
    $g_numSelections++;
    return ($g_numSelections-1);
}

sub SetupCriteria {
    # Rewrites the global $g_criteria, the expression given with -c, into
    # perl that SelectionId can eval for every step to decide whether the
    # step is included in the "selection" or not.
    #
    # In the command line form, variables are delimited by '#'.  They are
    # replaced as follows:
    #   #stepName#, #stepChronos#, #stepMode#
    #       -> $g_name, $g_chronos, $g_mode
    #   #<field name>#
    #       -> $data->[<column of that field>], looked up in %g_fieldPos
    #   #<anything else>#
    #       -> 1, after complaining on STDERR

    my $vname;

    # First the special cases, which are not table fields
    foreach my $special (["stepName",    "g_name"],
                         ["stepChronos", "g_chronos"],
                         ["stepMode",    "g_mode"]) {
        my ($tag, $global) = @$special;
        $g_criteria =~ s/#$tag#/\$$global/g;
    }

    # Then every remaining #name#, one distinct name per iteration
    while ($g_criteria =~ /#(\w+)#/) {
        $vname = $1;

        my $replacement;
        if (defined($g_fieldPos{$vname})) {
            # Valid field: refer to its column in the row data
            $field_pos   = $g_fieldPos{$vname};
            $replacement = '$data->[' . $field_pos . ']';
        }
        else {
            print STDERR "Sorry, $vname is not a valid field name for this input\n";
            $replacement = "1";      # Make some dummy substitution
        }

        $g_criteria =~ s/#$vname#/$replacement/g;
    }
}