Created a copy of sequential code in head-parallel

Signed-off-by: Gaurav Kukreja <mailme.gaurav@gmail.com>

Created a copy of sequential code in head-parallel
Signed-off-by: Gaurav Kukreja <mailme.gaurav@gmail.com>
b7b0f5df · Gaurav Kukreja · a2c0ce6e · b7b0f5df · b7b0f5df · b7b0f5df
Commit b7b0f5df authored May 09, 2012 by Gaurav Kukreja
14 changed files
--- a/heat-parallel/Makefile
+++ b/heat-parallel/Makefile
+# Intel compiler
+CC =  icc
+CFLAGS = -g
+
+MPICC = mpicc
+
+all: heat
+
+heat : heat.o input.o misc.o timing.o relax_gauss.o relax_jacobi.o
+	$(CC) $(CFLAGS) -o $@ $+ -lm 
+
+%.o : %.c %.h
+	$(CC) $(CFLAGS) -c -o $@ $<
+
+clean:
+	rm -f *.o heat *~ *.ppm
+
+remake : clean all
--- a/heat-parallel/cg_annotate
+++ b/heat-parallel/cg_annotate
+#! /usr/bin/perl -w
+##--------------------------------------------------------------------##
+##--- The cache simulation framework: instrumentation, recording   ---##
+##--- and results printing.                                        ---##
+##---                                           callgrind_annotate ---##
+##--------------------------------------------------------------------##
+
+#  This file is part of Callgrind, a cache-simulator and call graph
+#  tracer built on Valgrind.
+#
+#  Copyright (C) 2003 Josef Weidendorfer
+#     Josef.Weidendorfer@gmx.de
+#
+#  This file is based heavily on vg_annotate, part of Valgrind.
+#  Copyright (C) 2002 Nicholas Nethercote
+#     njn25@cam.ac.uk
+#
+#  This program is free software; you can redistribute it and/or
+#  modify it under the terms of the GNU General Public License as
+#  published by the Free Software Foundation; either version 2 of the
+#  License, or (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful, but
+#  WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+#  General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with this program; if not, write to the Free Software
+#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+#  02111-1307, USA.
+#
+#  The GNU General Public License is contained in the file COPYING.
+
+#----------------------------------------------------------------------------
+# Annotator for cachegrind/callgrind. 
+#
+# File format is described in /docs/techdocs.html.
+#
+# Performance improvements record, using cachegrind.out for cacheprof, doing no
+# source annotation (irrelevant ones removed):
+#                                                               user time
+# 1. turned off warnings in add_hash_a_to_b()                   3.81 --> 3.48s
+#    [now add_array_a_to_b()]
+# 6. make line_to_CC() return a ref instead of a hash           3.01 --> 2.77s
+#
+#10. changed file format to avoid file/fn name repetition       2.40s
+#    (not sure why higher;  maybe due to new '.' entries?)
+#11. changed file format to drop unnecessary end-line "."s      2.36s
+#    (shrunk file by about 37%)
+#12. switched from hash CCs to array CCs                        1.61s
+#13. only adding b[i] to a[i] if b[i] defined (was doing it if
+#    either a[i] or b[i] was defined, but if b[i] was undefined
+#    it just added 0)                                           1.48s
+#14. Stopped converting "." entries to undef and then back      1.16s
+#15. Using foreach $i (x..y) instead of for ($i = 0...) in
+#    add_array_a_to_b()                                         1.11s
+#
+# Auto-annotating primes:
+#16. Finding count lengths by int((length-1)/3), not by
+#    commifying (halves the number of commify calls)            1.68s --> 1.47s
+
+use strict;
+
+#----------------------------------------------------------------------------
+# Overview: the running example in the comments is for:
+#   - events = A,B,C,D
+#   - --show=C,A,D
+#   - --sort=D,C
+#----------------------------------------------------------------------------
+
+#----------------------------------------------------------------------------
+# Global variables, main data structures
+#----------------------------------------------------------------------------
+# CCs are arrays, the counts corresponding to @events, with 'undef'
+# representing '.'.  This makes things fast (faster than using hashes for CCs)
+# but we have to use @sort_order and @show_order below to handle the --sort and
+# --show options, which is a bit tricky.
+#----------------------------------------------------------------------------
+
+# Total counts for summary (an array reference).
+my $summary_CC;
+
+# Totals for each function, for overall summary.
+# hash(filename:fn_name => CC array)
+my %fn_totals;
+
+# Individual CCs, organised by filename and line_num for easy annotation.
+# hash(filename => hash(line_num => CC array))
+my %all_ind_CCs;
+
+# Files chosen for annotation on the command line.  
+# key = basename (trimmed of any directory), value = full filename
+my %user_ann_files;
+
+# Generic description string.
+my $desc = "";
+
+# Command line of profiled program.
+my $cmd = "";
+
+# Info on the profiled process.
+my $creator = "";
+my $pid = "";
+my $part = "";
+my $thread = "";
+
+# Positions used for cost lines; default: line numbers
+my $has_line = 1;
+my $has_addr = 0;
+
+# Events in input file, eg. (A,B,C,D)
+my @events;
+my $events;
+
+# Events to show, from command line, eg. (C,A,D)
+my @show_events;
+
+# Map from @show_events indices to @events indices, eg. (2,0,3).  Gives the
+# order in which we must traverse @events in order to show the @show_events, 
+# eg. (@events[$show_order[1]], @events[$show_order[2]]...) = @show_events.
+# (Might help to think of it like a hash (0 => 2, 1 => 0, 2 => 3).)
+my @show_order;
+
+# Print out the function totals sorted by these events, eg. (D,C).
+my @sort_events;
+
+# Map from @sort_events indices to @events indices, eg. (3,2).  Same idea as
+# for @show_order.
+my @sort_order;
+
+# Thresholds, one for each sort event (or default to 1 if no sort events
+# specified).  We print out functions and do auto-annotations until we've
+# handled this proportion of all the events thresholded.
+my @thresholds;
+
+my $default_threshold = 99;
+
+my $single_threshold  = $default_threshold;
+
+# If on, automatically annotates all files that are involved in getting over
+# all the threshold counts.
+my $auto_annotate = 0;
+
+# Number of lines to show around each annotated line.
+my $context = 8;
+
+# Directories in which to look for annotation files.
+my @include_dirs = ("");
+
+# Verbose mode
+my $verbose = "1";
+
+# Inclusive statistics (with subroutine events)
+my $inclusive = 0;
+
+# Inclusive totals for each function, for overall summary.
+# hash(filename:fn_name => CC array)
+my %cfn_totals;
+
+# hash( file:func => [ called file:func ])
+my $called_funcs;
+
+# hash( file:func => [ calling file:func ])
+my $calling_funcs;
+
+# hash( file:func,line => [called file:func ])
+my $called_from_line;
+
+# hash( file:func,line => file:func
+my %func_of_line;
+
+# hash (file:func => object name)
+my %obj_name;
+
+# Print out the callers of a function
+my $tree_caller = 0;
+
+# Print out the called functions
+my $tree_calling = 0;
+
+# hash( file:func,cfile:cfunc => call CC[])
+my %call_CCs;
+
+# hash( file:func,cfile:cfunc => call counter)
+my %call_counter;
+
+# hash(context, index) => realname for compressed traces
+my %compressed;
+
+# Input file name, will be set in process_cmd_line
+my $input_file = "";
+
+# Version number
+my $version = "3.3.0.SVN";
+
+# Usage message.
+my $usage = <<END
+usage: callgrind_annotate [options] [data-file [source-files]]
+
+  options for the user, with defaults in [ ], are:
+    -h --help             show this message
+    -v --version          show version
+    --show=A,B,C          only show figures for events A,B,C [all]
+    --sort=A,B,C          sort columns by events A,B,C [event column order]
+    --threshold=<0--100>  percentage of counts (of primary sort event) we
+                          are interested in [$default_threshold%]
+    --auto=yes|no         annotate all source files containing functions
+                          that helped reach the event count threshold [no]
+    --context=N           print N lines of context before and after
+                          annotated lines [8]
+    --inclusive=yes|no    add subroutine costs to functions calls [no]
+    --tree=none|caller|   print for each function their callers,
+           calling|both   the called functions or both [none]
+    -I --include=<dir>    add <dir> to list of directories to search for 
+                          source files
+
+END
+;
+
+# Used in various places of output.
+my $fancy = '-' x 80 . "\n";
+
+#-----------------------------------------------------------------------------
+# Argument and option handling
+#-----------------------------------------------------------------------------
+sub process_cmd_line() 
+{
+    for my $arg (@ARGV) { 
+
+        # Option handling
+        if ($arg =~ /^-/) {
+
+            # --version
+            if ($arg =~ /^-v$|^--version$/) {
+                die("callgrind_annotate-$version\n");
+
+            # --show=A,B,C
+            } elsif ($arg =~ /^--show=(.*)$/) {
+                @show_events = split(/,/, $1);
+
+            # --sort=A,B,C
+            } elsif ($arg =~ /^--sort=(.*)$/) {
+                @sort_events = split(/,/, $1);
+                foreach my $i (0 .. scalar @sort_events - 1) {
+                    if ($sort_events[$i] =~#/.*:(\d+)$/) {
+                                            /.*:([\d\.]+)%?$/) {
+                        my $th = $1;
+                        ($th >= 0 && $th <= 100) or die($usage);
+                        $sort_events[$i] =~ s/:.*//;
+                        $thresholds[$i] = $th;
+                    } else {
+                        $thresholds[$i] = 0;
+                    }
+                }
+
+            # --threshold=X (tolerates a trailing '%')
+            } elsif ($arg =~ /^--threshold=([\d\.]+)%?$/) {
+                $single_threshold = $1;
+                ($1 >= 0 && $1 <= 100) or die($usage);
+
+            # --auto=yes|no
+            } elsif ($arg =~ /^--auto=(yes|no)$/) {
+                $auto_annotate = 1 if ($1 eq "yes");
+                $auto_annotate = 0 if ($1 eq "no");
+
+            # --context=N
+            } elsif ($arg =~ /^--context=([\d\.]+)$/) {
+                $context = $1;
+                if ($context < 0) {
+                    die($usage);
+                }
+
+            # --inclusive=yes|no
+            } elsif ($arg =~ /^--inclusive=(yes|no)$/) {
+                $inclusive = 1 if ($1 eq "yes");
+                $inclusive = 0 if ($1 eq "no");
+
+            # --tree=none|caller|calling|both
+            } elsif ($arg =~ /^--tree=(none|caller|calling|both)$/) {
+                $tree_caller  = 1 if ($1 eq "caller" || $1 eq "both");
+                $tree_calling = 1 if ($1 eq "calling" || $1 eq "both");
+
+            # --include=A,B,C
+            } elsif ($arg =~ /^(-I|--include)=(.*)$/) {
+                my $inc = $2;
+                $inc =~ s|/$||;         # trim trailing '/'
+                push(@include_dirs, "$inc/");
+
+            } else {            # -h and --help fall under this case
+                die($usage);
+            }
+
+        # Argument handling -- annotation file checking and selection.
+        # Stick filenames into a hash for quick 'n easy lookup throughout
+        } else {
+	  if ($input_file eq "") {
+	    $input_file = $arg;
+	  }
+	  else {
+            my $readable = 0;
+            foreach my $include_dir (@include_dirs) {
+                if (-r $include_dir . $arg) {
+                    $readable = 1;
+                }
+            }
+            $readable or die("File $arg not found in any of: @include_dirs\n");
+            $user_ann_files{$arg} = 1;
+        } 
+    }
+    }
+
+    if ($input_file eq "") {
+      $input_file = (<callgrind.out*>)[0];
+      if (!defined $input_file) {
+	  $input_file = (<cachegrind.out*>)[0];
+      }
+
+      (defined $input_file) or die($usage);
+      print "Reading data from '$input_file'...\n";
+    }
+}
+
+#-----------------------------------------------------------------------------
+# Reading of input file
+#-----------------------------------------------------------------------------
+sub max ($$) 
+{
+    my ($x, $y) = @_;
+    return ($x > $y ? $x : $y);
+}
+
+# Add the two arrays;  any '.' entries are ignored.  Two tricky things:
+# 1. If $a2->[$i] is undefined, it defaults to 0 which is what we want; we turn
+#    off warnings to allow this.  This makes things about 10% faster than
+#    checking for definedness ourselves.
+# 2. We don't add an undefined count or a ".", even though it's value is 0,
+#    because we don't want to make an $a2->[$i] that is undef become 0
+#    unnecessarily.
+sub add_array_a_to_b ($$) 
+{
+    my ($a1, $a2) = @_;
+
+    my $n = max(scalar @$a1, scalar @$a2);
+    $^W = 0;
+    foreach my $i (0 .. $n-1) {
+        $a2->[$i] += $a1->[$i] if (defined $a1->[$i] && "." ne $a1->[$i]);
+    }
+    $^W = 1;
+}
+
+# Add each event count to the CC array.  '.' counts become undef, as do
+# missing entries (implicitly).
+sub line_to_CC ($)
+{
+    my @CC = (split /\s+/, $_[0]);
+    (@CC <= @events) or die("Line $.: too many event counts\n");
+    return \@CC;
+}
+
+sub uncompressed_name($$)
+{
+   my ($context, $name) = @_;
+
+   if ($name =~ /^\((\d+)\)\s*(.*)$/) {
+     my $index = $1;
+     my $realname = $2;
+
+     if ($realname eq "") {
+       $realname = $compressed{$context,$index};
+     }
+     else {
+       $compressed{$context,$index} = $realname;
+     }
+     return $realname;
+   }
+   return $name;
+}
+
+sub read_input_file() 
+{
+    open(INPUTFILE, "< $input_file") || die "File $input_file not opened\n";
+
+    my $line;
+
+    # Read header
+    while(<INPUTFILE>) {
+
+      # remove comments
+      s/#.*$//;
+
+      if (/^$/) { ; }
+
+      elsif (/^version:\s*(\d+)/) {
+	# Can't read format with major version > 1
+	($1<2) or die("Can't read format with major version $1.\n");
+      }
+
+      elsif (/^pid:\s+(.*)$/) { $pid = $1;  }
+      elsif (/^thread:\s+(.*)$/) { $thread = $1;  }
+      elsif (/^part:\s+(.*)$/) { $part = $1;  }
+      elsif (/^desc:\s+(.*)$/) {
+	my $dline = $1;
+	# suppress profile options in description output
+	if ($dline =~ /^Option:/) {;}
+	else { $desc .= "$dline\n"; }
+      }
+      elsif (/^cmd:\s+(.*)$/)  { $cmd = $1; }
+      elsif (/^creator:\s+(.*)$/)  { $creator = $1; }
+      elsif (/^positions:\s+(.*)$/) {
+	my $positions = $1;
+	$has_line = ($positions =~ /line/);
+	$has_addr = ($positions =~ /(addr|instr)/);
+      }
+      elsif (/^events:\s+(.*)$/) {
+	$events = $1;
+	
+	# events line is last in header
+	last;
+      }
+      else {
+	warn("WARNING: header line $. malformed, ignoring\n");
+	if ($verbose) { chomp; warn("    line: '$_'\n"); }
+      }
+    }
+
+    # Check for needed header entries
+    ($cmd ne "") or die("Line $.: missing command line\n");
+
+    # Read "events:" line.  We make a temporary hash in which the Nth event's
+    # value is N, which is useful for handling --show/--sort options below.
+    ($events ne "") or die("Line $.: missing events line\n");
+    @events = split(/\s+/, $events);
+    my %events;
+    my $n = 0;
+    foreach my $event (@events) {
+        $events{$event} = $n;
+        $n++
+    }
+
+    # If no --show arg give, default to showing all events in the file.
+    # If --show option is used, check all specified events appeared in the
+    # "events:" line.  Then initialise @show_order.
+    if (@show_events) {
+        foreach my $show_event (@show_events) {
+            (defined $events{$show_event}) or 
+                die("--show event `$show_event' did not appear in input\n");
+        }
+    } else {
+        @show_events = @events;
+    }
+    foreach my $show_event (@show_events) {
+        push(@show_order, $events{$show_event});
+    }
+
+    # Do as for --show, but if no --sort arg given, default to sorting by
+    # column order (ie. first column event is primary sort key, 2nd column is
+    # 2ndary key, etc).
+    if (@sort_events) {
+        foreach my $sort_event (@sort_events) {
+            (defined $events{$sort_event}) or 
+                die("--sort event `$sort_event' did not appear in input\n");
+        }
+    } else {
+        @sort_events = @events;
+    }
+    foreach my $sort_event (@sort_events) {
+        push(@sort_order, $events{$sort_event});
+    }
+
+    # If multiple threshold args weren't given via --sort, stick in the single
+    # threshold (either from --threshold if used, or the default otherwise) for
+    # the primary sort event, and 0% for the rest.
+    if (not @thresholds) {
+        foreach my $e (@sort_order) {
+            push(@thresholds, 0);
+        }
+        $thresholds[0] = $single_threshold;
+    }
+
+    my $curr_obj = "";
+    my $curr_file;
+    my $curr_fn;
+    my $curr_name;
+    my $curr_line_num = 0;
+    my $prev_line_num = 0;
+
+    my $curr_cobj = "";
+    my $curr_cfile = "";
+    my $curr_cfunc = "";
+    my $curr_cname;
+    my $curr_call_counter = 0;
+    my $curr_cfn_CC = [];
+
+    my $curr_fn_CC = [];
+    my $curr_file_ind_CCs = {};     # hash(line_num => CC)
+
+    # Read body of input file.
+    while (<INPUTFILE>) {
+	$prev_line_num = $curr_line_num;
+
+        s/#.*$//;   # remove comments
+        s/^\+(\d+)/$prev_line_num+$1/e;
+        s/^\-(\d+)/$prev_line_num-$1/e;
+        s/^\*/$prev_line_num/e;
+        if (s/^(-?\d+|0x\w+)\s+//) {
+            $curr_line_num = $1;
+	    if ($has_addr) {
+	      if ($has_line) {
+                s/^\+(\d+)/$prev_line_num+$1/e;
+	        s/^\-(\d+)/$prev_line_num-$1/e;
+                s/^\*/$prev_line_num/e;
+
+	        if (s/^(\d+)\s+//) { $curr_line_num = $1; }
+	      }
+	      else { $curr_line_num = 0; }
+	    }
+            my $CC = line_to_CC($_);
+
+	    if ($curr_call_counter>0) {
+#	      print "Read ($curr_name => $curr_cname) $curr_call_counter\n";
+
+	      if (defined $call_CCs{$curr_name,$curr_cname}) {
+		add_array_a_to_b($CC, $call_CCs{$curr_name,$curr_cname});
+		$call_counter{$curr_name,$curr_cname} += $curr_call_counter;
+	      }
+	      else {
+		$call_CCs{$curr_name,$curr_cname} = $CC;
+		$call_counter{$curr_name,$curr_cname} = $curr_call_counter;
+	      }
+
+	      my $tmp = $called_from_line->{$curr_file,$curr_line_num};
+	      if (!defined $tmp) {
+		$func_of_line{$curr_file,$curr_line_num} = $curr_name;
+	      }
+	      $tmp = {} unless defined $tmp;
+	      $$tmp{$curr_cname} = 1;
+	      $called_from_line->{$curr_file,$curr_line_num} = $tmp;
+	      $call_CCs{$curr_name,$curr_cname,$curr_line_num} = $CC;
+	      $call_counter{$curr_name,$curr_cname,$curr_line_num} = $curr_call_counter;
+
+	      $curr_call_counter = 0;
+
+	      # inclusive costs
+	      $curr_cfn_CC = $cfn_totals{$curr_cname};
+	      $curr_cfn_CC = [] unless (defined $curr_cfn_CC);
+	      add_array_a_to_b($CC, $curr_cfn_CC);
+	      $cfn_totals{$curr_cname} = $curr_cfn_CC;
+
+	      if ($inclusive) {
+		add_array_a_to_b($CC, $curr_fn_CC);
+	      }
+	      next;
+	    }
+
+            add_array_a_to_b($CC, $curr_fn_CC);
+
+            # If curr_file is selected, add CC to curr_file list.  We look for
+            # full filename matches;  or, if auto-annotating, we have to
+            # remember everything -- we won't know until the end what's needed.
+            if ($auto_annotate || defined $user_ann_files{$curr_file}) {
+                my $tmp = $curr_file_ind_CCs->{$curr_line_num};
+                $tmp = [] unless defined $tmp;
+                add_array_a_to_b($CC, $tmp);
+                $curr_file_ind_CCs->{$curr_line_num} = $tmp;
+            }
+
+        } elsif (s/^fn=(.*)$//) {
+            # Commit result from previous function
+            $fn_totals{$curr_name} = $curr_fn_CC if (defined $curr_name);
+
+            # Setup new one
+            $curr_fn = uncompressed_name("fn",$1);
+            $curr_name = "$curr_file:$curr_fn";
+	    $obj_name{$curr_name} = $curr_obj;
+            $curr_fn_CC = $fn_totals{$curr_name};
+            $curr_fn_CC = [] unless (defined $curr_fn_CC);
+
+        } elsif (s/^ob=(.*)$//) {
+            $curr_obj = uncompressed_name("ob",$1);
+
+        } elsif (s/^fl=(.*)$//) {
+            $all_ind_CCs{$curr_file} = $curr_file_ind_CCs 
+                if (defined $curr_file);
+
+            $curr_file = uncompressed_name("fl",$1);
+            $curr_file_ind_CCs = $all_ind_CCs{$curr_file};
+            $curr_file_ind_CCs = {} unless (defined $curr_file_ind_CCs);
+
+        } elsif (s/^(fi|fe)=(.*)$//) {
+            (defined $curr_name) or die("Line $.: Unexpected fi/fe line\n");
+            $fn_totals{$curr_name} = $curr_fn_CC;
+            $all_ind_CCs{$curr_file} = $curr_file_ind_CCs;
+
+            $curr_file = uncompressed_name("fl",$2);
+            $curr_name = "$curr_file:$curr_fn";
+            $curr_file_ind_CCs = $all_ind_CCs{$curr_file};
+            $curr_file_ind_CCs = {} unless (defined $curr_file_ind_CCs);
+            $curr_fn_CC = $fn_totals{$curr_name};
+            $curr_fn_CC = [] unless (defined $curr_fn_CC);
+
+        } elsif (s/^\s*$//) {
+            # blank, do nothing
+
+        } elsif (s/^cob=(.*)$//) {
+	  $curr_cobj = uncompressed_name("ob",$1);
+
+	} elsif (s/^cfi=(.*)$//) {
+	  $curr_cfile = uncompressed_name("fl",$1);
+
+	} elsif (s/^cfn=(.*)$//) {
+	  $curr_cfunc = uncompressed_name("fn",$1);
+	  if ($curr_cfile eq "") {
+	    $curr_cname = "$curr_file:$curr_cfunc";
+	  }
+	  else {
+	    $curr_cname = "$curr_cfile:$curr_cfunc";
+	    $curr_cfile = "";
+	  }
+
+	  my $tmp = $calling_funcs->{$curr_cname};
+	  $tmp = {} unless defined $tmp;
+	  $$tmp{$curr_name} = 1;
+	  $calling_funcs->{$curr_cname} = $tmp;
+		
+	  my $tmp2 = $called_funcs->{$curr_name};
+	  $tmp2 = {} unless defined $tmp2;
+	  $$tmp2{$curr_cname} = 1;
+	  $called_funcs->{$curr_name} = $tmp2;
+
+	} elsif (s/^calls=(\d+)//) {
+	  $curr_call_counter = $1;
+
+        } elsif (s/^(jump|jcnd)=//) {
+	  #ignore jump information
+
+        } elsif (s/^jfi=(.*)$//) {
+          # side effect needed: possibly add compression mapping 
+          uncompressed_name("fl",$1);
+          # ignore jump information	
+
+        } elsif (s/^jfn=(.*)$//) {
+          # side effect needed: possibly add compression mapping
+          uncompressed_name("fn",$1);
+          # ignore jump information
+
+        } elsif (s/^totals:\s+//) {
+	  #ignore
+
+        } elsif (s/^summary:\s+//) {
+            $summary_CC = line_to_CC($_);
+
+        } else {
+            warn("WARNING: line $. malformed, ignoring\n");
+	    if ($verbose) { chomp; warn("    line: '$_'\n"); }
+        }
+    }
+
+    # Check if summary line was present
+    if (not defined $summary_CC) {
+        warn("WARNING: missing final summary line, no summary will be printed\n");
+    }
+    else {
+      # Finish up handling final filename/fn_name counts
+      $fn_totals{"$curr_file:$curr_fn"} = $curr_fn_CC 
+	if (defined $curr_file && defined $curr_fn);
+      $all_ind_CCs{$curr_file} = 
+	$curr_file_ind_CCs if (defined $curr_file);
+
+      (scalar(@$summary_CC) == @events) 
+	or die("Line $.: summary event and total event mismatch\n");
+    }
+
+    # Correct inclusive totals
+    if ($inclusive) {
+      foreach my $name (keys %cfn_totals) {
+	$fn_totals{$name} = $cfn_totals{$name};
+      }
+    }
+
+    close(INPUTFILE);
+}
+
+#-----------------------------------------------------------------------------
+# Print options used
+#-----------------------------------------------------------------------------
+sub print_options ()
+{
+    print($fancy);
+    print "Profile data file '$input_file'";
+    if ($creator ne "") { print " (creator: $creator)"; }
+    print "\n";
+
+    print($fancy);
+    print($desc);
+    my $target = $cmd;
+    if ($pid ne "") {
+      $target .= " (PID $pid";
+      if ($part ne "") { $target .= ", part $part"; }
+      if ($thread ne "") { $target .= ", thread $thread"; }
+      $target .= ")";
+    }
+    print("Profiled target:  $target\n");
+    print("Events recorded:  @events\n");
+    print("Events shown:     @show_events\n");
+    print("Event sort order: @sort_events\n");
+    print("Thresholds:       @thresholds\n");
+
+    my @include_dirs2 = @include_dirs;  # copy @include_dirs
+    shift(@include_dirs2);       # remove "" entry, which is always the first
+    unshift(@include_dirs2, "") if (0 == @include_dirs2); 
+    my $include_dir = shift(@include_dirs2);
+    print("Include dirs:     $include_dir\n");
+    foreach my $include_dir (@include_dirs2) {
+        print("                  $include_dir\n");
+    }
+
+    my @user_ann_files = keys %user_ann_files;
+    unshift(@user_ann_files, "") if (0 == @user_ann_files); 
+    my $user_ann_file = shift(@user_ann_files);
+    print("User annotated:   $user_ann_file\n");
+    foreach $user_ann_file (@user_ann_files) {
+        print("                  $user_ann_file\n");
+    }
+
+    my $is_on = ($auto_annotate ? "on" : "off");
+    print("Auto-annotation:  $is_on\n");
+    print("\n");
+}
+
+#-----------------------------------------------------------------------------
+# Print summary and sorted function totals
+#-----------------------------------------------------------------------------
+sub mycmp ($$) 
+{
+    my ($c, $d) = @_;
+
+    # Iterate through sort events (eg. 3,2); return result if two are different
+    foreach my $i (@sort_order) {
+        my ($x, $y);
+        $x = $c->[$i];
+        $y = $d->[$i];
+        $x = -1 unless defined $x;
+        $y = -1 unless defined $y;
+
+        my $cmp = $y <=> $x;        # reverse sort
+        if (0 != $cmp) {
+            return $cmp;
+        }
+    }
+    # Exhausted events, equal
+    return 0;
+}
+
+sub commify ($) {
+    my ($val) = @_;
+    1 while ($val =~ s/^(\d+)(\d{3})/$1,$2/);
+    return $val;
+}
+
+# Because the counts can get very big, and we don't want to waste screen space
+# and make lines too long, we compute exactly how wide each column needs to be
+# by finding the widest entry for each one.
+sub compute_CC_col_widths (@) 
+{
+    my @CCs = @_;
+    my $CC_col_widths = [];
+
+    # Initialise with minimum widths (from event names)
+    foreach my $event (@events) {
+        push(@$CC_col_widths, length($event));
+    }
+    
+    # Find maximum width count for each column.  @CC_col_width positions
+    # correspond to @CC positions.
+    foreach my $CC (@CCs) {
+        foreach my $i (0 .. scalar(@$CC)-1) {
+            if (defined $CC->[$i]) {
+                # Find length, accounting for commas that will be added
+                my $length = length $CC->[$i];
+                my $clength = $length + int(($length - 1) / 3);
+                $CC_col_widths->[$i] = max($CC_col_widths->[$i], $clength); 
+            }
+        }
+    }
+    return $CC_col_widths;
+}
+
+# Print the CC with each column's size dictated by $CC_col_widths.
+sub print_CC ($$) 
+{
+    my ($CC, $CC_col_widths) = @_;
+
+    foreach my $i (@show_order) {
+        my $count = (defined $CC->[$i] ? commify($CC->[$i]) : ".");
+        my $space = ' ' x ($CC_col_widths->[$i] - length($count));
+        print("$space$count ");
+    }
+}
+
+sub print_events ($)
+{
+    my ($CC_col_widths) = @_;
+
+    foreach my $i (@show_order) { 
+        my $event       = $events[$i];
+        my $event_width = length($event);
+        my $col_width   = $CC_col_widths->[$i];
+        my $space       = ' ' x ($col_width - $event_width);
+        print("$space$event ");
+    }
+}
+
+# Prints summary and function totals (with separate column widths, so that
+# function names aren't pushed over unnecessarily by huge summary figures).
+# Also returns a hash containing all the files that are involved in getting the
+# events count above the thresholds (ie. all the interesting ones).
+sub print_summary_and_fn_totals ()
+{
+    my @fn_fullnames = keys   %fn_totals;
+
+    # Work out the size of each column for printing (summary and functions
+    # separately).
+    my $summary_CC_col_widths = compute_CC_col_widths($summary_CC);
+    my      $fn_CC_col_widths = compute_CC_col_widths(values %fn_totals);
+
+    # Header and counts for summary
+    print($fancy);
+    print_events($summary_CC_col_widths);
+    print("\n");
+    print($fancy);
+    print_CC($summary_CC, $summary_CC_col_widths);
+    print(" PROGRAM TOTALS\n");
+    print("\n");
+
+    # Header for functions
+    print($fancy);
+    print_events($fn_CC_col_widths);
+    print(" file:function\n");
+    print($fancy);
+
+    # Sort function names into order dictated by --sort option.
+    @fn_fullnames = sort {
+        mycmp($fn_totals{$a}, $fn_totals{$b})
+    } @fn_fullnames;
+
+
+    # Assertion
+    (scalar @sort_order == scalar @thresholds) or 
+        die("sort_order length != thresholds length:\n",
+            "  @sort_order\n  @thresholds\n");
+
+    my $threshold_files       = {};
+    # @curr_totals has the same shape as @sort_order and @thresholds
+    my @curr_totals = ();
+    foreach my $e (@thresholds) {
+        push(@curr_totals, 0);
+    }
+
+    # Print functions, stopping when the threshold has been reached.
+    foreach my $fn_name (@fn_fullnames) {
+
+        # Stop when we've reached all the thresholds
+        my $reached_all_thresholds = 1;
+        foreach my $i (0 .. scalar @thresholds - 1) {
+            my $prop = $curr_totals[$i] * 100;
+	    if ($summary_CC->[$sort_order[$i]] >0) {
+	      $prop = $prop / $summary_CC->[$sort_order[$i]];
+	    }
+            $reached_all_thresholds &= ($prop >= $thresholds[$i]);
+        }
+        last if $reached_all_thresholds;
+
+	if ($tree_caller || $tree_calling) { print "\n"; }
+
+	if ($tree_caller && ($fn_name ne "???:???")) {
+	  # Print function callers
+	  my $tmp1 = $calling_funcs->{$fn_name};
+	  if (defined $tmp1) {
+	    foreach my $calling (keys %$tmp1) {
+	      if (defined $call_counter{$calling,$fn_name}) {
+		print_CC($call_CCs{$calling,$fn_name}, $fn_CC_col_widths);
+		print" < $calling (";
+		print $call_counter{$calling,$fn_name} . "x)";
+		if (defined $obj_name{$calling}) {
+		  print " [$obj_name{$calling}]";
+		}
+		print "\n";
+	      }
+	    }
+	  }
+	}
+
+        # Print function results
+        my $fn_CC = $fn_totals{$fn_name};
+        print_CC($fn_CC, $fn_CC_col_widths);
+	if ($tree_caller || $tree_calling) { print " * "; }
+        print(" $fn_name");
+	if (defined $obj_name{$fn_name}) {
+	  print " [$obj_name{$fn_name}]";
+	}
+	print "\n";
+
+	if ($tree_calling && ($fn_name ne "???:???")) {
+	  # Print called functions
+	  my $tmp2 = $called_funcs->{$fn_name};
+	  if (defined $tmp2) {
+	    foreach my $called (keys %$tmp2) {
+	      if (defined $call_counter{$fn_name,$called}) {
+		print_CC($call_CCs{$fn_name,$called}, $fn_CC_col_widths);
+		print" >   $called (";
+		print $call_counter{$fn_name,$called} . "x)";
+		if (defined $obj_name{$called}) {
+		  print " [$obj_name{$called}]";
+		}
+		print "\n";
+	      }
+	    }
+	  }
+	}
+
+        # Update the threshold counts
+        my $filename = $fn_name;
+        $filename =~ s/:.+$//;    # remove function name
+        $threshold_files->{$filename} = 1;
+        foreach my $i (0 .. scalar @sort_order - 1) {
+	  if ($inclusive) {
+	    $curr_totals[$i] = $summary_CC->[$sort_order[$i]] -
+                               $fn_CC->[$sort_order[$i]]
+	      if (defined $fn_CC->[$sort_order[$i]]);
+	  } else {
+            $curr_totals[$i] += $fn_CC->[$sort_order[$i]] 
+                if (defined $fn_CC->[$sort_order[$i]]);
+        }
+    }
+    }
+    print("\n");
+
+    return $threshold_files;
+}
+
+#-----------------------------------------------------------------------------
+# Annotate selected files
+#-----------------------------------------------------------------------------
+
+# Issue a warning that the source file is more recent than the input file. 
+sub warning_on_src_more_recent_than_inputfile ($)
+{
+    my $src_file = $_[0];
+
+    my $warning = <<END
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+@@ WARNING @@ WARNING @@ WARNING @@ WARNING @@ WARNING @@ WARNING @@ WARNING @@
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+@ Source file '$src_file' is more recent than input file '$input_file'.
+@ Annotations may not be correct.
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+END
+;
+    print($warning);
+}
+
+# If there is information about lines not in the file, issue a warning
+# explaining possible causes.
+sub warning_on_nonexistent_lines ($$$)
+{
+    my ($src_more_recent_than_inputfile, $src_file, $excess_line_nums) = @_;
+    my $cause_and_solution;
+
+    if ($src_more_recent_than_inputfile) {
+        $cause_and_solution = <<END
+@@ cause:    '$src_file' has changed since information was gathered.
+@@           If so, a warning will have already been issued about this.
+@@ solution: Recompile program and rerun under "valgrind --cachesim=yes" to 
+@@           gather new information.
+END
+    # We suppress warnings about .h files
+    } elsif ($src_file =~ /\.h$/) {
+        $cause_and_solution = <<END
+@@ cause:    bug in the Valgrind's debug info reader that screws up with .h
+@@           files sometimes
+@@ solution: none, sorry
+END
+    } else {
+        $cause_and_solution = <<END
+@@ cause:    not sure, sorry
+END
+    }
+
+    my $warning = <<END
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+@@ WARNING @@ WARNING @@ WARNING @@ WARNING @@ WARNING @@ WARNING @@ WARNING @@
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+@@
+@@ Information recorded about lines past the end of '$src_file'.
+@@
+@@ Probable cause and solution:
+$cause_and_solution@@
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+END
+;
+    print($warning);
+}
+
+sub annotate_ann_files($)
+{
+    my ($threshold_files) = @_; 
+
+    my %all_ann_files;
+    my @unfound_auto_annotate_files;
+    my $printed_totals_CC = [];
+
+    # If auto-annotating, add interesting files (but not "???")
+    if ($auto_annotate) {
+        delete $threshold_files->{"???"};
+        %all_ann_files = (%user_ann_files, %$threshold_files) 
+    } else {
+        %all_ann_files = %user_ann_files;
+    }
+
+    # Track if we did any annotations.
+    my $did_annotations = 0;
+
+    LOOP:
+    foreach my $src_file (keys %all_ann_files) {
+
+        my $opened_file = "";
+        my $full_file_name = "";
+        foreach my $include_dir (@include_dirs) {
+            my $try_name = $include_dir . $src_file;
+            if (open(INPUTFILE, "< $try_name")) {
+                $opened_file    = $try_name;
+                $full_file_name = ($include_dir eq "" 
+                                  ? $src_file 
+                                  : "$include_dir + $src_file"); 
+                last;
+            }
+        }
+        
+        if (not $opened_file) {
+            # Failed to open the file.  If chosen on the command line, die.
+            # If arose from auto-annotation, print a little message.
+            if (defined $user_ann_files{$src_file}) {
+                die("File $src_file not opened in any of: @include_dirs\n");
+
+            } else {
+                push(@unfound_auto_annotate_files, $src_file);
+            }
+
+        } else {
+            # File header (distinguish between user- and auto-selected files).
+            print("$fancy");
+            my $ann_type = 
+                (defined $user_ann_files{$src_file} ? "User" : "Auto");
+            print("-- $ann_type-annotated source: $full_file_name\n");
+            print("$fancy");
+
+            # Get file's CCs
+            my $src_file_CCs = $all_ind_CCs{$src_file};
+            if (!defined $src_file_CCs) {
+                print("  No information has been collected for $src_file\n\n");
+                next LOOP;
+            }
+        
+            $did_annotations = 1;
+            
+            # Numeric, not lexicographic sort!
+            my @line_nums = sort {$a <=> $b} keys %$src_file_CCs;  
+
+            # If $src_file more recent than cachegrind.out, issue warning
+            my $src_more_recent_than_inputfile = 0;
+            if ((stat $opened_file)[9] > (stat $input_file)[9]) {
+                $src_more_recent_than_inputfile = 1;
+                warning_on_src_more_recent_than_inputfile($src_file);
+            }
+
+            # Work out the size of each column for printing
+            my $CC_col_widths = compute_CC_col_widths(values %$src_file_CCs);
+
+            # Events header
+            print_events($CC_col_widths);
+            print("\n\n");
+
+            # Shift out 0 if it's in the line numbers (from unknown entries,
+            # likely due to bugs in Valgrind's stabs debug info reader)
+            shift(@line_nums) if (0 == $line_nums[0]);
+
+            # Finds interesting line ranges -- all lines with a CC, and all
+            # lines within $context lines of a line with a CC.
+            my $n = @line_nums;
+            my @pairs;
+            for (my $i = 0; $i < $n; $i++) {
+                push(@pairs, $line_nums[$i] - $context);   # lower marker
+                while ($i < $n-1 && 
+                       $line_nums[$i] + 2*$context >= $line_nums[$i+1]) {
+                    $i++;
+                }
+                push(@pairs, $line_nums[$i] + $context);   # upper marker
+            }
+
+            # Annotate chosen lines, tracking total counts of lines printed
+            $pairs[0] = 1 if ($pairs[0] < 1);
+            while (@pairs) {
+                my $low  = shift @pairs;
+                my $high = shift @pairs;
+                while ($. < $low-1) {
+                    my $tmp = <INPUTFILE>;
+                    last unless (defined $tmp);     # hack to detect EOF
+                }
+                my $src_line;
+                # Print line number, unless start of file
+                print("-- line $low " . '-' x 40 . "\n") if ($low != 1);
+                while (($. < $high) && ($src_line = <INPUTFILE>)) {
+                    if (defined $line_nums[0] && $. == $line_nums[0]) {
+                        print_CC($src_file_CCs->{$.}, $CC_col_widths);
+                        add_array_a_to_b($src_file_CCs->{$.}, 
+                                         $printed_totals_CC);
+                        shift(@line_nums);
+
+                    } else {
+                        print_CC( [], $CC_col_widths);
+                    }
+
+                    print(" $src_line");
+
+		    my $tmp  = $called_from_line->{$src_file,$.};
+		    my $func = $func_of_line{$src_file,$.};
+		    if (defined $tmp) {
+		      foreach my $called (keys %$tmp) {
+			if (defined $call_CCs{$func,$called,$.}) {
+			  print_CC($call_CCs{$func,$called,$.}, $CC_col_widths);
+			  print " => $called (";
+			  print $call_counter{$func,$called,$.} . "x)\n";
+			}
+		      }
+		    }
+                }
+                # Print line number, unless EOF
+                if ($src_line) {
+                    print("-- line $high " . '-' x 40 . "\n");
+                } else {
+                    last;
+                }
+            }
+
+            # If there was info on lines past the end of the file...
+            if (@line_nums) {
+                foreach my $line_num (@line_nums) {
+                    print_CC($src_file_CCs->{$line_num}, $CC_col_widths);
+                    print(" <bogus line $line_num>\n");
+                }
+                print("\n");
+                warning_on_nonexistent_lines($src_more_recent_than_inputfile,
+                                             $src_file, \@line_nums);
+            }
+            print("\n");
+
+            # Print summary of counts attributed to file but not to any
+            # particular line (due to incomplete debug info).
+            if ($src_file_CCs->{0}) {
+                print_CC($src_file_CCs->{0}, $CC_col_widths);
+                print(" <counts for unidentified lines in $src_file>\n\n");
+            }
+            
+            close(INPUTFILE);
+        }
+    }
+
+    # Print list of unfound auto-annotate selected files.
+    if (@unfound_auto_annotate_files) {
+        print("$fancy");
+        print("The following files chosen for auto-annotation could not be found:\n");
+        print($fancy);
+        foreach my $f (@unfound_auto_annotate_files) {
+            print("  $f\n");
+        }
+        print("\n");
+    }
+
+    # If we did any annotating, print what proportion of events were covered by
+    # annotated lines above.
+    if ($did_annotations) {
+        my $percent_printed_CC;
+        foreach (my $i = 0; $i < @$summary_CC; $i++) {
+            $percent_printed_CC->[$i] = 
+                sprintf("%.0f", 
+                        $printed_totals_CC->[$i] / $summary_CC->[$i] * 100);
+        }
+        my $pp_CC_col_widths = compute_CC_col_widths($percent_printed_CC);
+        print($fancy);
+        print_events($pp_CC_col_widths);
+        print("\n");
+        print($fancy);
+        print_CC($percent_printed_CC, $pp_CC_col_widths);
+        print(" percentage of events annotated\n\n");
+    }
+}
+
+#----------------------------------------------------------------------------
+# "main()"
+#----------------------------------------------------------------------------
+process_cmd_line();
+read_input_file();
+print_options();
+my $threshold_files = print_summary_and_fn_totals();
+annotate_ann_files($threshold_files);
+
+##--------------------------------------------------------------------##
+##--- end                                           vg_annotate.in ---##
+##--------------------------------------------------------------------##
+
+
--- a/heat-parallel/heat.c
+++ b/heat-parallel/heat.c
+/*
+ * heat.h
+ *
+ * Iterative solver for heat distribution
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "input.h"
+#include "heat.h"
+#include "timing.h"
+
+
+void usage( char *s )
+{
+    fprintf(stderr, 
+	    "Usage: %s <input file> [result file]\n\n", s);
+}
+
+
+int main( int argc, char *argv[] )
+{
+    unsigned iter;
+    FILE *infile, *resfile;
+    char *resfilename;
+
+	double *tmp;
+
+    // algorithmic parameters
+    algoparam_t param;
+    int np;
+
+    double runtime, flop;
+    double residual;
+
+    // check arguments
+    if( argc < 2 )
+    {
+	usage( argv[0] );
+	return 1;
+    }
+
+    // check input file
+    if( !(infile=fopen(argv[1], "r"))  ) 
+    {
+	fprintf(stderr, 
+		"\nError: Cannot open \"%s\" for reading.\n\n", argv[1]);
+      
+	usage(argv[0]);
+	return 1;
+    }
+  
+
+    // check result file
+    resfilename= (argc>=3) ? argv[2]:"heat.ppm";
+
+    if( !(resfile=fopen(resfilename, "w")) )
+    {
+	fprintf(stderr, 
+		"\nError: Cannot open \"%s\" for writing.\n\n", 
+		resfilename);
+      
+	usage(argv[0]);
+	return 1;
+    }
+
+    // check input
+    if( !read_input(infile, &param) )
+    {
+	fprintf(stderr, "\nError: Error parsing input file.\n\n");
+      
+	usage(argv[0]);
+	return 1;
+    }
+
+    print_params(&param);
+
+    // set the visualization resolution
+    param.visres = param.max_res;
+
+    param.u     = 0;
+    param.uhelp = 0;
+    param.uvis  = 0;
+
+    param.act_res = param.initial_res;
+
+    // loop over different resolutions
+    while(1) {
+
+	// free allocated memory of previous experiment
+	if (param.u != 0)
+	    finalize(&param);
+
+	if( !initialize(&param) )
+	{
+	    fprintf(stderr, "Error in Jacobi initialization.\n\n");
+
+	    usage(argv[0]);
+	}
+
+	fprintf(stderr, "Resolution: %5u\r", param.act_res);
+
+	// full size (param.act_res are only the inner points)
+	np = param.act_res + 2;
+    
+	// starting time
+	runtime = wtime();
+	residual = 999999999;
+
+	iter = 0;
+	while(1) {
+
+	    switch( param.algorithm ) {
+
+		case 0: // JACOBI
+      
+		    residual = relax_jacobi_return_residual(param.u, param.uhelp, np, np);
+			tmp = param.u;
+			param.u = param.uhelp;
+			param.uhelp = tmp;
+		    break;
+
+		case 1: // GAUSS
+
+		    relax_gauss(param.u, np, np);
+		    residual = residual_gauss( param.u, param.uhelp, np, np);
+		    break;
+	    }
+	    
+	    iter++;
+
+	    // solution good enough ?
+	    if (residual < 0.000005) break;
+
+	    // max. iteration reached ? (no limit with maxiter=0)
+	    if (param.maxiter>0 && iter>=param.maxiter) break;
+	    
+	    if (iter % 100 == 0)
+		fprintf(stderr, "residual %f, %d iterations\n", residual, iter);
+	}
+
+	// Flop count after <i> iterations
+	flop = iter * 11.0 * param.act_res * param.act_res;
+	// stopping time
+	runtime = wtime() - runtime;
+
+	fprintf(stderr, "Resolution: %5u, ", param.act_res);
+	fprintf(stderr, "Time: %04.3f ", runtime);
+	fprintf(stderr, "(%3.3f GFlop => %6.2f MFlop/s, ", 
+		flop/1000000000.0,
+		flop/runtime/1000000);
+	fprintf(stderr, "residual %f, %d iterations)\n", residual, iter);
+
+	// for plot...
+	printf("%5d %f\n", param.act_res, flop/runtime/1000000);
+
+	if (param.act_res + param.res_step_size > param.max_res) break;
+	param.act_res += param.res_step_size;
+    }
+
+    coarsen( param.u, np, np,
+	     param.uvis, param.visres+2, param.visres+2 );
+  
+    write_image( resfile, param.uvis,  
+		 param.visres+2, 
+		 param.visres+2 );
+
+    finalize( &param );
+
+    return 0;
+}
--- a/heat-parallel/heat.h
+++ b/heat-parallel/heat.h
+/*
+ * heat.h
+ *
+ * Global definitions for the iterative solver
+ */
+
+#ifndef JACOBI_H_INCLUDED
+#define JACOBI_H_INCLUDED
+
+#include <stdio.h>
+
+// configuration
+
+typedef struct
+{
+    float posx;
+    float posy;
+    float range;
+    float temp;
+}
+heatsrc_t;
+
+typedef struct
+{
+    unsigned maxiter;       // maximum number of iterations
+    unsigned act_res;
+    unsigned max_res;       // spatial resolution
+    unsigned initial_res;
+    unsigned res_step_size;
+    int algorithm;          // 0=>Jacobi, 1=>Gauss
+
+    unsigned visres;        // visualization resolution
+  
+    double *u, *uhelp;
+    double *uvis;
+
+    unsigned   numsrcs;     // number of heat sources
+    heatsrc_t *heatsrcs;
+}
+algoparam_t;
+
+
+// function declarations
+
+// misc.c
+int initialize( algoparam_t *param );
+int finalize( algoparam_t *param );
+void write_image( FILE * f, double *u,
+		  unsigned sizex, unsigned sizey );
+int coarsen(double *uold, unsigned oldx, unsigned oldy ,
+	    double *unew, unsigned newx, unsigned newy );
+
+// Gauss-Seidel: relax_gauss.c
+double residual_gauss( double *u, double *utmp,
+		       unsigned sizex, unsigned sizey );
+void relax_gauss( double *u, 
+		  unsigned sizex, unsigned sizey  );
+
+// Jacobi: relax_jacobi.c
+#if 0
+double residual_jacobi( double *u,
+			unsigned sizex, unsigned sizey );
+#endif
+double relax_jacobi_return_residual( double *u, double *utmp,
+		   unsigned sizex, unsigned sizey ); 
+
+
+#endif // JACOBI_H_INCLUDED
--- a/heat-parallel/input.c
+++ b/heat-parallel/input.c
+//
+// input.c
+// 
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "input.h"
+
+#define BUFSIZE 100
+
+int read_input( FILE *infile, algoparam_t *param )
+{
+  int i, n;
+  char buf[BUFSIZE];
+
+  fgets(buf, BUFSIZE, infile);
+  n = sscanf( buf, "%u", &(param->maxiter) );
+  if( n!=1 )
+    return 0;
+
+  fgets(buf, BUFSIZE, infile);
+  n = sscanf( buf, "%u", &(param->initial_res) );
+  if( n!=1 )
+    return 0;
+
+  fgets(buf, BUFSIZE, infile);
+  n = sscanf( buf, "%u", &(param->max_res) );
+  if( n!=1 )
+    return 0;
+
+  fgets(buf, BUFSIZE, infile);
+  n = sscanf( buf, "%u", &(param->res_step_size) );
+  if( n!=1 )
+    return 0;
+
+
+  fgets(buf, BUFSIZE, infile);
+  n = sscanf(buf, "%d", &(param->algorithm) );
+  if( n!=1 )
+    return 0;
+
+  fgets(buf, BUFSIZE, infile);
+  n = sscanf(buf, "%u", &(param->numsrcs) );
+  if( n!=1 )
+    return 0;
+
+  (param->heatsrcs) = 
+    (heatsrc_t*) malloc( sizeof(heatsrc_t) * (param->numsrcs) );
+  
+  for( i=0; i<param->numsrcs; i++ )
+    {
+      fgets(buf, BUFSIZE, infile);
+      n = sscanf( buf, "%f %f %f %f",
+		  &(param->heatsrcs[i].posx),
+		  &(param->heatsrcs[i].posy),
+		  &(param->heatsrcs[i].range),
+		  &(param->heatsrcs[i].temp) );
+
+      if( n!=4 )
+	return 0;
+    }
+
+  return 1;
+}
+
+
+void print_params( algoparam_t *param )
+{
+  int i;
+
+  fprintf(stderr, "Resolutions       : (%u, %u, ... %u)\n",
+	  param->initial_res,
+	  param->initial_res + param->res_step_size,
+	  param->max_res);
+  fprintf(stderr, "Iterations        : %u\n", param->maxiter);
+  fprintf(stderr, "Algorithm         : %d (%s)\n",
+	  param->algorithm,
+	  (param->algorithm == 0) ? "Jacobi":"Gauss-Jacobi" );
+  fprintf(stderr, "Num. Heat sources : %u\n", param->numsrcs);
+
+  for( i=0; i<param->numsrcs; i++ )
+    {
+      fprintf(stderr, "  %2d: (%2.2f, %2.2f) %2.2f %2.2f \n",
+	     i+1,
+	     param->heatsrcs[i].posx,
+	     param->heatsrcs[i].posy,
+	     param->heatsrcs[i].range,
+	     param->heatsrcs[i].temp );
+    }
+}
--- a/heat-parallel/input.h
+++ b/heat-parallel/input.h
+//
+// input.h
+//
+
+#ifndef INPUT_H_INCLUDED
+#define INPUT_H_INCLUDED
+
+#include <stdio.h>
+
+#include "heat.h"
+
+int read_input( FILE *infile, algoparam_t *param );
+void print_params( algoparam_t *param );
+
+
+#endif // INPUT_H_INCLUDED
--- a/heat-parallel/misc.c
+++ b/heat-parallel/misc.c
+/*
+ * misc.c
+ *
+ * Helper functions for
+ * - initialization
+ * - finalization,
+ * - writing out a picture
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <float.h>
+
+#include "heat.h"
+
+/*
+ * Initialize the iterative solver
+ * - allocate memory for matrices
+ * - set boundary conditions according to configuration
+ */
+int initialize( algoparam_t *param )
+{
+    int i, j;
+    double dist;
+
+    // total number of points (including border)
+    const int np = param->act_res + 2;
+  
+    //
+    // allocate memory
+    //
+    (param->u)     = (double*)calloc( sizeof(double),np*np );
+    (param->uhelp) = (double*)calloc( sizeof(double),np*np );
+    (param->uvis)  = (double*)calloc( sizeof(double),
+				      (param->visres+2) *
+				      (param->visres+2) );
+  
+
+    if( !(param->u) || !(param->uhelp) || !(param->uvis) )
+    {
+	fprintf(stderr, "Error: Cannot allocate memory\n");
+	return 0;
+    }
+
+    for( i=0; i<param->numsrcs; i++ )
+    {
+	/* top row */
+	for( j=0; j<np; j++ )
+	{
+	    dist = sqrt( pow((double)j/(double)(np-1) - 
+			     param->heatsrcs[i].posx, 2)+
+			 pow(param->heatsrcs[i].posy, 2));
+	  
+	    if( dist <= param->heatsrcs[i].range )
+	    {
+		(param->u)[j] +=
+		    (param->heatsrcs[i].range-dist) /
+		    param->heatsrcs[i].range *
+		    param->heatsrcs[i].temp;
+		(param->uhelp)[j] = (param->u)[j];
+	    }
+	}
+      
+	/* bottom row */
+	for( j=0; j<np; j++ )
+	{
+	    dist = sqrt( pow((double)j/(double)(np-1) - 
+			     param->heatsrcs[i].posx, 2)+
+			 pow(1-param->heatsrcs[i].posy, 2));
+	  
+	    if( dist <= param->heatsrcs[i].range )
+	    {
+		(param->u)[(np-1)*np+j]+=
+		    (param->heatsrcs[i].range-dist) / 
+		    param->heatsrcs[i].range * 
+		    param->heatsrcs[i].temp;
+		(param->uhelp)[(np-1)*np+j] = (param->u)[(np-1)*np+j];
+	    }
+	}
+      
+	/* leftmost column */
+	for( j=1; j<np-1; j++ )
+	{
+	    dist = sqrt( pow(param->heatsrcs[i].posx, 2)+
+			 pow((double)j/(double)(np-1) - 
+			     param->heatsrcs[i].posy, 2)); 
+	  
+	    if( dist <= param->heatsrcs[i].range )
+	    {
+		(param->u)[ j*np ]+=
+		    (param->heatsrcs[i].range-dist) / 
+		    param->heatsrcs[i].range *
+		    param->heatsrcs[i].temp;
+		(param->uhelp)[ j*np ] = (param->u)[ j*np ];
+	    }
+	}
+      
+	/* rightmost column */
+	for( j=1; j<np-1; j++ )
+	{
+	    dist = sqrt( pow(1-param->heatsrcs[i].posx, 2)+
+			 pow((double)j/(double)(np-1) - 
+			     param->heatsrcs[i].posy, 2)); 
+	  
+	    if( dist <= param->heatsrcs[i].range )
+	    {
+		(param->u)[ j*np+(np-1) ]+=
+		    (param->heatsrcs[i].range-dist) /
+		    param->heatsrcs[i].range *
+		    param->heatsrcs[i].temp;
+		(param->uhelp)[ j*np+(np-1) ] = (param->u)[ j*np+(np-1) ];
+	    }
+	}
+    }
+
+    return 1;
+}
+
+/*
+ * free used memory
+ */
+int finalize( algoparam_t *param )
+{
+    if( param->u ) {
+	free(param->u);
+	param->u = 0;
+    }
+
+    if( param->uhelp ) {
+	free(param->uhelp);
+	param->uhelp = 0;
+    }
+
+    if( param->uvis ) {
+	free(param->uvis);
+	param->uvis = 0;
+    }
+
+    return 1;
+}
+
+
+/*
+ * write the given temperature u matrix to rgb values
+ * and write the resulting image to file f
+ */
+void write_image( FILE * f, double *u,
+		  unsigned sizex, unsigned sizey ) 
+{
+    // RGB table
+    unsigned char r[1024], g[1024], b[1024];
+    int i, j, k;
+  
+    double min, max;
+
+    j=1023;
+
+    // prepare RGB table
+    for( i=0; i<256; i++ )
+    {
+	r[j]=255; g[j]=i; b[j]=0;
+	j--;
+    }
+    for( i=0; i<256; i++ )
+    {
+	r[j]=255-i; g[j]=255; b[j]=0;
+	j--;
+    }
+    for( i=0; i<256; i++ )
+    {
+	r[j]=0; g[j]=255; b[j]=i;
+	j--;
+    }
+    for( i=0; i<256; i++ )
+    {
+	r[j]=0; g[j]=255-i; b[j]=255;
+	j--;
+    }
+
+    min=DBL_MAX;
+    max=-DBL_MAX;
+
+    // find minimum and maximum 
+    for( i=0; i<sizey; i++ )
+    {
+	for( j=0; j<sizex; j++ )
+	{
+	    if( u[i*sizex+j]>max )
+		max=u[i*sizex+j];
+	    if( u[i*sizex+j]<min )
+		min=u[i*sizex+j];
+	}
+    }
+  
+
+    fprintf(f, "P3\n");
+    fprintf(f, "%u %u\n", sizex, sizey);
+    fprintf(f, "%u\n", 255);
+
+    for( i=0; i<sizey; i++ )
+    {
+	for( j=0; j<sizex; j++ )
+	{
+	    k=(int)(1024.0*(u[i*sizex+j]-min)/(max-min));
+	    fprintf(f, "%d %d %d  ", r[k], g[k], b[k]);
+	}
+	fprintf(f, "\n");
+    }
+}
+
+
+int coarsen( double *uold, unsigned oldx, unsigned oldy ,
+	     double *unew, unsigned newx, unsigned newy )
+{
+    int i, j;
+
+    int stepx;
+    int stepy;
+    int stopx = newx;
+    int stopy = newy;
+
+    if (oldx>newx)
+	stepx=oldx/newx;
+    else {
+	stepx=1;
+	stopx=oldx;
+    }
+    if (oldy>newy)
+	stepy=oldy/newy;
+    else {
+	stepy=1;
+	stopy=oldy;
+    }
+
+    // NOTE: this only takes the top-left corner,
+    // and doesnt' do any real coarsening 
+    for( i=0; i<stopy-1; i++ )
+    {
+	for( j=0; j<stopx-1; j++ )
+        {
+	    unew[i*newx+j]=uold[i*oldx*stepy+j*stepx];
+        }
+    }
+
+  return 1;
+}
+
--- a/heat-parallel/pfmon2cg
+++ b/heat-parallel/pfmon2cg
+#!/usr/bin/perl
+#
+# pfmon2cg, version 0.02
+# Converter for pfmon output to Cachegrind profile format
+#
+# 4/2007, Josef Weidendorfer
+#
+
+#
+# Read in pfmon data, e.g. written from the command
+#
+#  pfmon --with-header --smpl-outfile=pfmon.out \
+#        -e l2_misses --short-smpl-periods=100000 -- <command>
+#  (sampling points every 100000 L2 Misses)
+#
+# This gives a list of sampled instruction addresses ($ipps)
+ 
+while(<>) {
+	if (/^#/) {
+		if (/command line\s*:\s*(.*)$/) { $command = $1; }
+		if (/PMD\d+: (.*?)\s*(,|=)/) { $events .= "$1 "; }
+		next; 
+	}
+	if (/^entry/) {
+		($iip) = ($entry =~ /IIP:(\S+)/);
+		if ($iip ne "") { $iips .= "$iip\n"; }
+		$entry = "";
+	}
+	chomp;
+	$entry .= $_;
+}
+($iip) = ($entry =~ /IIP:(\S+)/);
+if ($iip ne "") { $iips .= "$iip\n"; }
+
+($pid) = ($entry =~ /PID:(\S+)/);
+
+open OUT, ">.p2cg";
+print OUT $iips;
+close OUT;
+
+
+# Let addr2line convert the instruction addresses into
+# source code references (needs debug info)
+#
+# This creates hashes with event sums
+
+# Strip command line arguments
+$onlycommand = $command;
+$onlycommand =~ s/\s.*//;
+
+$cmd = "addr2line -f -e $onlycommand < .p2cg";
+#print "#Run Cmd $cmd\n";
+open ATOL, "$cmd < .p2cg|";
+while(<ATOL>) {
+	chomp;
+	$func = $_;
+	$fileline = <ATOL>;
+	chomp $fileline;
+	($file,$line) = ($fileline =~ /(.*?):(.*)/);
+	#print "Func $func, file $file, line $line\n";
+
+	if ($file{$func} eq "") {
+		push @funcs, $func;
+		$file{$func} = $file;
+	}
+	if ($count{$func,$line} eq "") {
+		$tmp = $lines->{$func};
+		$tmp = [] unless defined $tmp;
+		push @$tmp, $line;
+		$lines->{$func} = $tmp;
+	}
+	$count{$func,$line}++;
+	$totals++;
+}
+close ATOL;
+
+#
+# Dump out Cachegrind format
+#
+
+# First header info...
+print "version: 1\n";
+print "creator: pf2mon 0.02\n";
+print "pid: $pid\n";
+print "cmd: $command\n\n";
+
+print "positions: line\n";
+print "events: $events\n";
+print "summary: $totals\n\n";
+
+
+# Now profiling data.
+# 
+# We only support static linked binaries for now, so all code
+# is in the binary
+print "ob=$onlycommand\n";
+
+foreach $f (@funcs) {
+	print "\nfl=$file{$f}\n";
+	print "fn=$f\n";
+	$tmp = $lines->{$f};
+	foreach $l (@$tmp) {
+		print "$l $count{$f,$l}\n";
+	}
+}
+
--- a/heat-parallel/relax_gauss.c
+++ b/heat-parallel/relax_gauss.c
+/*
+ * relax_gauss.c
+ *
+ * Gauss-Seidel Relaxation
+ *
+ */
+
+#include "heat.h"
+
+/*
+ * Residual (length of error vector)
+ * between current solution and next after a Gauss-Seidel step
+ *
+ * Temporary array utmp needed to not change current solution
+ *
+ * Flop count in inner body is 7
+ */
+
+double residual_gauss( double *u, double *utmp,
+		       unsigned sizex, unsigned sizey )
+{
+    unsigned i, j;
+    double unew, diff, sum=0.0;
+
+    // first row (boundary condition) into utmp
+    for( j=1; j<sizex-1; j++ )
+	utmp[0*sizex + j] = u[0*sizex + j];
+    // first column (boundary condition) into utmp
+    for( i=1; i<sizey-1; i++ )
+	utmp[i*sizex + 0] = u[i*sizex + 0];
+
+    for( i=1; i<sizey-1; i++ )
+    {
+	for( j=1; j<sizex-1; j++ )
+	{
+	    unew = 0.25 * (utmp[ i*sizex     + (j-1) ]+  // new left
+			   u   [ i*sizex     + (j+1) ]+  // right  
+			   utmp[ (i-1)*sizex + j     ]+  // new top
+			   u   [ (i+1)*sizex + j     ]); // bottom
+
+	    diff = unew - u[i*sizex + j];
+	    sum += diff * diff; 
+
+	    utmp[i*sizex + j] = unew;
+	}
+    }
+
+    return sum;
+}
+
+
+/*
+ * One Gauss-Seidel iteration step
+ *
+ * Flop count in inner body is 4
+ */
+void relax_gauss( double *u, 
+		  unsigned sizex, unsigned sizey )
+{
+    unsigned i, j;
+  
+
+    for( i=1; i<sizey-1; i++ )
+    {
+	for( j=1; j<sizex-1; j++ )
+	{
+	    u[i*sizex+j]= 0.25 * (u[ i*sizex     + (j-1) ]+
+				  u[ i*sizex     + (j+1) ]+
+				  u[ (i-1)*sizex + j     ]+
+				  u[ (i+1)*sizex + j     ]);
+	}
+    }
+}
--- a/heat-parallel/relax_jacobi.c
+++ b/heat-parallel/relax_jacobi.c
+/*
+ * relax_jacobi.c
+ *
+ * Jacobi Relaxation
+ *
+ */
+
+#include "heat.h"
+
+#if 0
+/*
+ * Residual (length of error vector)
+ * between current solution and next after a Jacobi step
+ */
+double residual_jacobi( double *u, 
+			unsigned sizex, unsigned sizey )
+{
+    unsigned i, j;
+    double unew, diff, sum=0.0;
+
+    for( i=1; i<sizey-1; i++ )
+    {
+	for( j=1; j<sizex-1; j++ )
+        {
+	    unew = 0.25 * (u[ i*sizex     + (j-1) ]+  // left
+			   u[ i*sizex     + (j+1) ]+  // right
+			   u[ (i-1)*sizex + j     ]+  // top
+			   u[ (i+1)*sizex + j     ]); // bottom
+
+	    diff = unew - u[i*sizex + j];
+	    sum += diff * diff; 
+	}
+    }
+
+    return sum;
+}
+#endif
+
+/*
+ * One Jacobi iteration step
+ */
+double relax_jacobi_return_residual( double *u, double *utmp,
+		unsigned sizex, unsigned sizey )
+{
+	int i, j;
+	double unew, diff, sum=0.0;
+
+	for( i=1; i<sizey-1; i++ )
+	{
+		for( j=1; j<sizex-1; j++ )
+		{
+			{
+				utmp[i*sizex + j]= 0.25 * (u[ i*sizex+j -1 ]+  // left
+						u[ i*sizex+j +1 ]+  // right
+						u[ (i-1)*sizex + j ]+  // top
+						u[ (i+1)*sizex + j ]); // bottom
+
+				diff = utmp[i*sizex + j] - u[i*sizex +j];
+				sum += diff * diff;
+			}
+		}
+	}
+	return sum;
+}
--- a/heat-parallel/test.dat
+++ b/heat-parallel/test.dat
+50     # iterations
+100    # initial resolution
+2900   # max resolution (spatial resolution)
+200    # resolution step size
+0      # Algorithm 0=Jacobi 1=Gauss 
+2                     # number of heat sources
+0.0  0.0  1.0  1.0    # (x,y), size temperature
+1.0  1.0  1.0  0.5 
--- a/heat-parallel/test2.dat
+++ b/heat-parallel/test2.dat
+0      # iterations
+100    # initial resolution
+100    # max resolution (spatial resolution)
+200    # resolution step size
+1      # Algorithm 0=Jacobi 1=Gauss 
+2                     # number of heat sources
+0.0  0.0  1.0  1.0    # (x,y), size temperature
+1.0  1.0  1.0  0.5 
--- a/heat-parallel/timing.c
+++ b/heat-parallel/timing.c
+//
+// timing.c 
+// 
+
+
+#include <sys/time.h>
+#include "timing.h"
+
+
+double wtime()
+{
+  struct timeval tv;
+  gettimeofday(&tv, 0);
+
+  return tv.tv_sec+1e-6*tv.tv_usec;
+}
+
--- a/heat-parallel/timing.h
+++ b/heat-parallel/timing.h
+//
+// timing.h
+//
+
+#ifndef TIMING_H_INCLUDED
+#define TIMING_H_INCLUDED
+
+double wtime();
+
+#endif // TIMING_H_IINCLUDED