%
% $HeadURL$
% $Id$
%
% Copyright ((c)) 2002-2022, Rice University
% All rights reserved.
% See file LICENSE for details.
%
% ***************************************************************************
%
% ***************************************************************************
\documentclass[11pt,twoside,letterpaper]{report}
\setcounter{topnumber}{4}
\setcounter{totalnumber}{4}
\renewcommand{\floatpagefraction}{0.7}
% ***************************************************************************
% Standard packages
% ***************************************************************************
\usepackage{fixltx2e}
%\usepackage{fixpdftex}
% ==========================================================
% formatting, graphics, tables, figures, etc.
% ==========================================================
\usepackage[toc]{appendix}
%\usepackage{geometry}
\usepackage{comment}
\usepackage{fullpage}
\usepackage{indentfirst}
\usepackage[bf,normalsize]{caption}
\usepackage{subcaption}
\usepackage{setspace} % setspace (better than doublespace)
\usepackage{cite}
\usepackage{verbatim,moreverb,fancyvrb}
\usepackage{listings}
\lstset{%
%basicstyle=\small, % print whole listing small
%keywordstyle=\color{black}\bfseries\underbar,
language=C++,
columns=fullflexible,
numbers=left, numberstyle=\scriptsize, stepnumber=1, numbersep=5pt, %\tiny
escapeinside={(@}{@)}
}
\usepackage[table]{xcolor}
\definecolor{clr:bluegrey1}{HTML}{F1F5FA}
\definecolor{clr:bluegrey2}{HTML}{ECF3FE}
% Generally load hyperref last, unless otherwise specified
\usepackage[breaklinks=true]{hyperref} %bookmarksopen,bookmarksnumbered
%\hypersetup{
% colorlinks=false,%
% pdfborder = 0 0 0
%}
% Cf. http://www.nersc.no/~knutal/latex_tips.html
% To use epstopdf: pdflatex --shell-escape <*.tex>
\usepackage{ifpdf}
\ifpdf
\usepackage[pdftex]{graphicx}
\usepackage{epstopdf}
\usepackage{pdfpages}
\else
\usepackage[dvips]{graphicx}
\usepackage{breakurl} % fix hyperref
\fi
% ==========================================================
% symbols, etc.
% ==========================================================
\usepackage{latexsym}
\usepackage{textcomp}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsthm}
% ***************************************************************************
% Customizations
% ***************************************************************************
\specialcomment{BookVDM}{}{}
\excludecomment{BookVDM}
\setlength{\textwidth}{6.0 in}
\setlength{\oddsidemargin}{0.5 in}
\setlength{\evensidemargin}{0.5 in} % 0.0 for twoside
\clubpenalty=10000
\widowpenalty=10000
% Sanitize placement of figures
\renewcommand{\topfraction}{0.85}
\renewcommand{\textfraction}{0.1}
\renewcommand{\floatpagefraction}{0.75}
\input{myconfig}
% ***************************************************************************
% Document
% ***************************************************************************
\begin{document}
% ***************************************************************************
% ***************************************************************************
\title{\HPCToolkit{} User's Manual\\[.5in]Version 2024.01}
%\subtitle{}
\author{John Mellor-Crummey,\\
Laksono Adhianto,
Jonathon Anderson,
Mike Fagan,
Dragana Grbic,
Marty Itzkowitz,\\
Mark Krentel,
Xiaozhu Meng,
Nathan Tallent,
Keren Zhou\\
\\
\\
Rice University\\
}
\date{March 2024}
\maketitle
% ***************************************************************************
% ***************************************************************************
\pagenumbering{roman}
\setcounter{page}{1}
%\cleardoublepage
%\chapter*{Preface}
% ***************************************************************************
% ***************************************************************************
%\chapter*{Acknowledgements}
% ***************************************************************************
% ***************************************************************************
\
\begin{singlespace}
% \newpage
\pagestyle{empty}
\thispagestyle{empty}
\tableofcontents
% \newpage
% \pagestyle{empty}
% \thispagestyle{empty}
% \listoffigures
% \newpage
% \pagestyle{empty}
% \thispagestyle{empty}
% \listofalgorithms
\end{singlespace}
% ***************************************************************************
% ***************************************************************************
%\newpage
% \cleardoublepage
\pagestyle{plain}
\pagenumbering{arabic}
\chapter{Introduction}
\setcounter{page}{1}
\HPCToolkit{}~\cite{Adhianto-etal:2010:CPE-hpctoolkit,hpctoolkit-www} is an integrated suite of tools for measurement and analysis of program performance on computers ranging from multicore desktop systems to the world's largest supercomputers.
\HPCToolkit{} provides accurate measurements of a program's work, resource consumption, and inefficiency, correlates these metrics with the program's source code, works with multilingual, fully optimized binaries, has low measurement overhead, and scales to large parallel systems.
\HPCToolkit{}'s measurements provide support for analyzing a program's execution cost, inefficiency, and scaling characteristics both within and across nodes of a parallel system.
\HPCToolkit{} principally monitors an execution of a multithreaded and/or multiprocess program using asynchronous sampling, unwinding thread call stacks, and attributing the metric value associated with a sample event in a thread to the calling context of the thread/process in which the event occurred. \HPCToolkit{}'s asynchronous sampling is typically triggered by the expiration of a Linux timer or a hardware performance monitoring unit event, such as reaching a threshold value for a hardware performance counter.
Sampling has several advantages over instrumentation for measuring program performance: it requires no modification of source code, it avoids potential blind spots (such as code available in only binary form), and it has lower overhead.
\HPCToolkit{} typically adds measurement overhead of only a few percent to an execution for reasonable sampling rates~\cite{Tallent-MC-Fagan:2009:PLDI-hpctoolkit-binary-analysis}.
Sampling enables fine-grain measurement and attribution of costs in both serial and parallel programs.
For parallel programs, one can use HPCToolkit to measure
the fraction of time threads are idle, working, or communicating.
To obtain detailed information about a program's computation
performance, one can collect samples using a processor's built-in performance monitoring
units to measure metrics such as
operation counts, pipeline stalls, cache misses, and data movement
between processor sockets. Such detailed measurements are essential
to understand the performance characteristics of applications
on modern multicore microprocessors that employ instruction-level
parallelism, out-of-order execution, and complex memory hierarchies.
With \HPCToolkit{}, one can also easily compute derived metrics such as cycles
per instruction, waste, and relative efficiency to provide insight
into a program's shortcomings.
\begin{figure}[t]
\centering{\includegraphics[width=.8\textwidth]{fig/hpctoolkit-code-centric}}
\caption{A code-centric view of an execution of the University of Chicago's FLASH code executing on 8192 cores of a Blue Gene/P. This bottom-up view shows that 16\% of the execution time was spent in IBM's DCMF messaging layer. By tracking these costs up the call chain, we can see that most of this time was spent on behalf of calls to {\tt pmpi\_allreduce} on line 419 of {\tt amr\_comm\_setup}.}
\label{fig:code-centric}
\end{figure}
A unique capability of \HPCToolkit{} is its ability to unwind the call stack of a thread executing highly optimized code to attribute time, hardware counter metrics, as well as software metrics (e.g., context switches) to a full calling context.
Call stack unwinding is often difficult for highly optimized code~\cite{Tallent-MC-Fagan:2009:PLDI-hpctoolkit-binary-analysis}. For accurate call stack unwinding, HPCToolkit employs two strategies:
interpreting compiler-recorded information in DWARF Frame Descriptor Entries (FDEs) and binary analysis
to compute unwind recipes directly from an application's machine instructions.
On ARM processors, HPCToolkit uses {\tt libunwind} exclusively. On Power processors, HPCToolkit uses
binary analysis exclusively.
On x86\_64 processors, HPCToolkit employs both strategies in an integrated fashion.
\begin{figure}[t]
\centering{\includegraphics[width=.8\textwidth]{fig/hpctoolkit-thread-centric}}
\caption{A thread-centric view of the performance of a parallel radix sort application executing on 960 cores of a Cray XE6. The bottom pane shows a calling context for {\tt usort} in the execution. The top pane shows a graph of how much time each thread spent executing calls to {\tt usort} from the highlighted context. On a Cray XE6, there is one MPI helper thread for each compute node in the system; these helper threads spent no time executing {\tt usort}. The graph shows that some of the MPI ranks spent twice as much time in {\tt usort} as others. This happens because the radix sort divides up the work into 1024 buckets. In an execution on 960 cores, 896 cores work on one bucket and 64 cores work on two. The middle pane shows an alternate view of the thread-centric data as a histogram.}
\label{fig:thread-centric}
\end{figure}
\HPCToolkit{} assembles performance measurements into a call path profile that associates the costs of each function call with its full calling context.
In addition, \HPCToolkit{} uses binary analysis to attribute program performance metrics with detailed precision -- full dynamic calling contexts augmented with information about call sites, inlined functions and templates, loops, and source lines.
Measurements can be analyzed in a variety of ways: top-down in a calling context tree, which associates costs with the full calling context in which they are incurred; bottom-up in a view that apportions costs associated with a function to each of the contexts in which the function is called; and in a flat view that aggregates all costs associated with a function independent of calling context.
This multiplicity of code-centric perspectives is essential to understanding a program's performance for tuning under various circumstances.
\HPCToolkit{} also supports a thread-centric perspective, which enables one to see how a performance metric for a calling context differs across threads, and a time-centric perspective, which enables a user to see how an execution unfolds over time. Figures~\ref{fig:code-centric}--\ref{fig:time-centric} show samples of HPCToolkit's code-centric, thread-centric, and time-centric views.
\begin{figure}[t]
\centering{\includegraphics[width=.8\textwidth]{fig/hpctoolkit-time-centric}}
\caption{A time-centric view of part of an execution of the University of Chicago's FLASH code on 256 cores of a Blue Gene/P. The figure shows a detail from the end of the initialization phase and part of the first iteration of the solve phase. The largest pane in the figure shows the activity of cores 2--95 in the execution during a time interval ranging from 69.376s to 85.58s. Time lines for threads are arranged from top to bottom and time flows from left to right. The color at any point in time for a thread indicates the procedure that the thread is executing at that time. The right pane shows the full call stack of thread 85 at 84.82s into the execution, corresponding to the selection shown by the white crosshair; the outermost procedure frame of the call stack is shown at the top of the pane and the innermost frame is shown at the bottom. This view highlights that even though FLASH is an SPMD program, the behavior of threads over time can be quite different. The purple region highlighted by the cursor, which represents a call by all processors to {\tt mpi\_allreduce}, shows that the time spent in this call varies across the processors. The variation in time spent waiting in {\tt mpi\_allreduce} is readily explained by an imbalance in the time processes spend in a prior prolongation step, shown in yellow. Further left in the figure, one can see differences among ranks executing on different cores in each node as they await the completion of an {\tt mpi\_allreduce}. A rank executing on one core of each node waits in {\tt DCMF\_Messager\_advance} (which appears as blue stripes) while ranks executing on other cores in each node wait in a helper function (shown in green). In this phase, ranks await the delayed arrival of a few of their peers who have extra work to do inside {\tt simulation\_initblock} before they call {\tt mpi\_allreduce}.}
\label{fig:time-centric}
\end{figure}
By working at the machine-code level, \HPCToolkit{} accurately measures and attributes costs in executions of multilingual programs, even if they are linked with libraries available only in binary form.
\HPCToolkit{} supports performance analysis of fully optimized code.
It measures and attributes performance metrics to shared libraries that are dynamically loaded at run time.
The low overhead of \HPCToolkit{}'s sampling-based measurement is particularly important
for parallel programs because measurement overhead can distort program behavior.
\HPCToolkit{} is also especially good at pinpointing scaling losses in parallel codes, both within multicore nodes and across the nodes in a parallel system.
Using differential analysis of call path profiles collected on different numbers of threads or processes enables one to quantify scalability losses and pinpoint their causes to individual lines of code executed in particular calling contexts~\cite{Coarfa-MC:2007:ICS-scalability}.
We have used this technique to quantify scaling losses in leading science applications across thousands of processor cores on Cray and IBM Blue Gene systems, associate them with individual lines of source code in full calling context~\cite{Tallent-MC-etal:2009:SC-hpctoolkit-petascale,Tallent-MC-etal:2010:SC-hpctoolkit-load-imbalance}, and quantify scaling losses in science applications within compute nodes at the loop nest level due to competition for memory bandwidth in multicore processors~\cite{Tallent-etal:2008:SciDAC-hpctoolkit}.
We have also developed techniques for efficiently attributing the idleness in one thread to its cause in another thread~\cite{Tallent-MC:2009:PPoPP-hpctoolkit-work-stealing,Tallent-MC-Porterfield:2010:PPoPP-hpctoolkit-lock-contention}.
\HPCToolkit{} is deployed on many DOE supercomputers, including
the Sierra supercomputer (IBM Power9 + NVIDIA V100 GPUs) at Lawrence Livermore National Laboratory;
Cray XC40 systems at Argonne's Leadership Computing Facility and the National Energy
Research Scientific Computing Center; and the Summit supercomputer (IBM Power9 + NVIDIA V100 GPUs) at the Oak Ridge Leadership Computing Facility,
as well as other clusters and supercomputers based on x86\_64, Power, and ARM processors.
% ***************************************************************************
% ***************************************************************************
\cleardoublepage
\chapter{\HPCToolkit{} Overview}
\begin{figure}[t]
\centering{\includegraphics[width=.8\textwidth]{fig/hpctoolkit-gpu-workflow}}
\caption{Overview of \HPCToolkit{}'s tool work flow.}
\label{fig:hpctoolkit-overview:a}
\end{figure}
\HPCToolkit{}'s work flow is organized around four principal capabilities, as shown in Figure~\ref{fig:hpctoolkit-overview:a}:
\begin{enumerate}
\item \emph{measurement} of context-sensitive performance metrics using call-stack unwinding
while an application executes;
\item \emph{binary analysis} to recover program structure from the application binary and the shared libraries
and GPU binaries used in the run;
\item \emph{attribution} of performance metrics by correlating dynamic performance metrics with static program structure; and
\item \emph{presentation} of performance metrics and associated source code.
\end{enumerate}
To use \HPCToolkit{} to measure and analyze an application's performance, one first compiles and links the application for a production run, using \emph{full} optimization and including debugging symbols.%
\footnote{%
For the most detailed attribution of application performance data using \HPCToolkit{}, one should ensure that the compiler includes line map information in the object code it generates. While \HPCToolkit{} does not need this information to function, it can be helpful to users trying to interpret the results. Since compilers can usually provide line map information for fully optimized code, satisfying this requirement need not involve a special build process. For instance, with the Intel compiler we recommend using \texttt{-g -debug inline\_debug\_info}.}
Second, one launches an application with \HPCToolkit{}'s measurement tool, \hpcrun{}, which uses statistical sampling to collect a performance profile.
Third, one invokes \hpcstruct{}, \HPCToolkit{}'s tool for analyzing an application binary and any shared objects and GPU binaries
it used in the data collection run, as stored in the measurements directory. It recovers
information about source files, procedures, loops, and inlined code.
Fourth, one uses \hpcprof{} to combine information about an application's structure with dynamic performance measurements to produce a performance database.
Finally, one explores a performance database with \HPCToolkit{}'s \hpcviewer{} and/or \hpctraceviewer{} graphical presentation tools.
The rest of this chapter briefly discusses unique aspects of \HPCToolkit{}'s measurement, analysis and presentation capabilities.
\section{Asynchronous Sampling and Call Path Profiling}
Without accurate measurement, performance analysis results may be of questionable value.
As a result, a principal focus of work on \HPCToolkit{} has been the design and implementation of techniques to provide accurate fine-grain measurements of production applications running at scale.
For tools to be useful on production applications on large-scale parallel systems, large measurement overhead is unacceptable.
For measurements to be accurate, performance tools must avoid introducing measurement error.
Both source-level and binary instrumentation can distort application performance through a variety of mechanisms~\cite{Mytkowicz:2009:PWD:2528521.1508275}.
Frequent calls to small instrumented procedures can lead to considerable measurement overhead.
Furthermore, source-level instrumentation can distort application performance by interfering with inlining and template optimization.
To avoid these effects, many instrumentation-based tools intentionally refrain from instrumenting certain procedures.
Ironically, the more this approach reduces overhead, the more it introduces \emph{blind spots}, \ie{}, intervals of unmonitored execution.
For example, a common selective instrumentation technique is to ignore small frequently executed procedures --- but these may be just the thread synchronization library routines that are critical.
Sometimes, a tool unintentionally introduces a blind spot.
A typical example is that source code instrumentation suffers from blind spots when source code is unavailable, a common condition for math and communication libraries.
To avoid these problems, \HPCToolkit{} eschews instrumentation and favors the use of \emph{asynchronous sampling} to measure and attribute performance metrics.
During a program execution, sample events are triggered by periodic interrupts induced by an interval timer or overflow of hardware performance counters.
One can sample metrics that reflect work (\eg{}, instructions, floating-point operations), consumption of resources (\eg{}, cycles, bandwidth consumed in the memory hierarchy by data transfers in response to cache misses), or inefficiency (\eg{}, stall cycles).
For reasonable sampling frequencies, the overhead and distortion introduced by sampling-based measurement is typically much lower than that introduced by instrumentation~\cite{Froyd-MC-Fo:2005:ICS-csprof}.
For all but the most trivially structured programs, it is important to associate the costs incurred by each procedure with the contexts in which the procedure is called.
Knowing the context in which each cost is incurred is essential for understanding why the code performs as it does.
This is particularly important for code based on application frameworks and libraries.
For instance, costs incurred for calls to communication primitives (\eg{}, \mytt{MPI_Wait}) or code that results from instantiating C++ templates for data structures can vary widely depending how they are used in a particular context.
Because there are often layered implementations within applications and libraries, it is insufficient either to insert instrumentation at any one level or to distinguish costs based only upon the immediate caller.
For this reason, \HPCToolkit{} uses call path profiling to attribute costs to the full calling contexts in which they are incurred.
\HPCToolkit{}'s \hpcrun{} call path profiler uses call stack unwinding to attribute execution costs of optimized executables to the full calling context in which they occur.
Unlike other tools, to support asynchronous call stack unwinding during execution of optimized code, \hpcrun{} uses on-line binary analysis to locate procedure bounds and compute an unwind recipe for each code range within each procedure~\cite{Tallent-MC-Fagan:2009:PLDI-hpctoolkit-binary-analysis}.
These analyses enable \hpcrun{} to unwind call stacks for optimized code with little or no information other than an application's machine code.
\begin{comment}
To attribute performance back to source code, \HPCToolkit{} combines a call path profile with information gleaned
through post-mortem analysis of an application's object code and its debugging sections.
This post-mortem analysis of an executable recovers its program structure and reconstructs a mapping from
instructions back to source lines, loops, inlined functions, and procedures.
\HPCToolkit{}'s ability to attribute costs to dynamic call paths, including loops and inlined functions,
for optimized code without a special-purpose compiler is unique.
\end{comment}
The output of a run with \hpcrun{} is a \emph{measurements directory} containing the data, and the information necessary
to recover the names of all shared libraries and GPU binaries.
\section{Recovering Static Program Structure}
To enable effective analysis, call path profiles for executions of optimized programs must be correlated
with important source code abstractions.
Since measurements refer only to instruction addresses within the running application,
it is necessary to map measurements back to the program source.
The mappings include those of the application and any shared libraries referenced during the
run, as well as those for any GPU binaries executed on GPUs during the run.
To associate measurement data with the static structure of fully-optimized executables,
we need a mapping between object code and its associated source code structure.\footnote{This object to source code mapping should be contrasted with the binary's line map, which
(if present) is typically fundamentally line based.}
\HPCToolkit{} constructs this mapping using binary analysis; we call this process
\emph{recovering program structure}~\cite{Tallent-MC-Fagan:2009:PLDI-hpctoolkit-binary-analysis}.
\HPCToolkit{} focuses its efforts on recovering source files, procedures, inlined functions and templates, as well as
loop nests as the most important elements of source code structure.
To recover program structure, \HPCToolkit's \hpcstruct{} utility parses a binary's machine instructions,
reconstructs a control flow graph, combines line map and DWARF information about inlining with interval
analysis on the control flow graph in a way that enables it to relate machine code after optimization
back to the original source.
One important benefit accrues from this approach.
\HPCToolkit{} can expose the structure of and assign metrics to the code that is actually executed, \emph{even if source code is unavailable}.
For example, \hpcstruct{}'s program structure naturally reveals transformations such as loop fusion and scalarization
loops that arise from compilation of Fortran 90 array notation.
Similarly, it exposes calls to compiler support routines and wait loops in communication libraries of which one would otherwise be unaware.
\section{Reducing Performance Measurements}
\HPCToolkit{} combines (post-mortem) the recovered static program structure with dynamic call paths to expose inlined frames and loop nests.
This enables us to attribute the performance of samples in their full static and dynamic context and correlate it with source code.
The data reduction is done by \HPCToolkit's \hpcprof{} utility, invoked on the \emph{measurements directory} recorded by \hpcrun{} and augmented with program structure information by \hpcstruct{}.
From the measurements and structure, \hpcprof{} generates a \emph{database directory} containing performance data presentable by \hpcviewer{}.
In most cases \hpcprof{} is able to complete the reduction in a matter of minutes; however, for especially large experiments (more than about 100,000 threads or GPU streams~\cite{10.1145/3524059.3532397}), its multi-node sibling \hpcprofmpi{} may be substantially faster.
\hpcprofmpi{} is an MPI application identical to \hpcprof{}, except that it additionally can exploit multiple compute nodes during the reduction.
In our experience, exploiting 8-10 compute nodes via \hpcprofmpi{} can be as much as $5\times$ faster than \hpcprof{} for sufficiently large experiments.
\section{Presenting Performance Measurements}
To enable an analyst to rapidly pinpoint and quantify performance bottlenecks, tools must present the performance measurements in a way that engages the analyst, focuses attention on what is important, and automates common analysis subtasks to reduce the mental effort and frustration of sifting through a sea of measurement details.
To enable rapid analysis of an execution's performance bottlenecks, we have carefully designed \hpcviewer{},
a code-centric presentation tool~\cite{Adhianto-MC-Ta:2010:PSTI-hpcviewer} that also includes a
time-centric tab~\cite{Tallent-MC-etal:2011:ICS-hpctoolkit-scalable-tracing}.
\hpcviewer{} combines a relatively small set of complementary presentation techniques that, taken together, rapidly focus an analyst's attention on performance bottlenecks rather than on unimportant information.
To facilitate the goal of rapidly focusing an analyst's attention on performance bottlenecks, \hpcviewer{}
extends several existing presentation techniques.
In particular, \hpcviewer{} (1) synthesizes and presents three complementary views of calling-context-sensitive metrics;
(2) treats a procedure's static structure as first-class information with respect to both performance metrics
and constructing views; (3) enables a large variety of user-defined metrics to describe performance inefficiency;
and (4) automatically expands hot paths based on arbitrary performance metrics --- through calling contexts and static structure --- to rapidly highlight important performance data.
The trace tab enables an application developer to visualize how a parallel execution unfolds over time.
This view facilitates identification of important inefficiencies such as serialization and load imbalance, among others.
% ***************************************************************************
% ***************************************************************************
\cleardoublepage
\chapter{Quick Start}
\label{chpt:quickstart}
This chapter provides a rapid overview of analyzing the performance of an application using \HPCToolkit{}.
It assumes an operational installation of \HPCToolkit{}.
% ===========================================================================
% ===========================================================================
\section{Guided Tour}
\label{chpt:quickstart:tour}
\begin{figure}[t]
\centering{\includegraphics[width=.8\textwidth]{fig/hpctoolkit-gpu-workflow}}
\caption{Overview of \HPCToolkit{}'s tool work flow.}
\label{fig:hpctoolkit-overview:b}
\end{figure}
\HPCToolkit{}'s work flow is summarized in Figure~\ref{fig:hpctoolkit-overview:b} (on page~\pageref{fig:hpctoolkit-overview:b}) and is organized around four principal capabilities:
\begin{enumerate}
\item \emph{measurement} of context-sensitive performance metrics while an application executes;
\item \emph{binary analysis} to recover program structure from CPU and GPU binaries;
\item \emph{attribution} of performance metrics by correlating dynamic performance metrics with static program structure; and
\item \emph{presentation} of performance metrics and associated source code.
\end{enumerate}
To use \HPCToolkit{} to measure and analyze an application's performance, one first compiles and links the application for a production run, using \emph{full} optimization.
Second, one launches an application with \HPCToolkit{}'s measurement tool, \hpcrun{}, which uses statistical sampling to collect a performance profile.
Third, one applies \hpcstruct{} to an application's measurement directory to recover program structure information from any CPU or GPU binary that was measured.
Program structure, which includes information about files, procedures, inlined code, and loops, is used to relate performance measurements to source code.
Fourth, one uses \hpcprof{} to combine information about an application's structure with dynamic performance measurements to produce a performance database.
Finally, one explores a performance database with \HPCToolkit{}'s graphical user interface: \hpcviewer{} which presents
both a code-centric analysis of performance metrics and a time-centric (trace-based) analysis of an execution.
The following subsections explain \HPCToolkit{}'s work flow in more detail.
% ==========================================================
% ==========================================================
\subsection{Compiling an Application}
For the most detailed attribution of application performance data using \HPCToolkit{}, one should compile so as to include line map information in the generated object code.
This usually means compiling with options similar to `\texttt{-g -O3}'. Check your compiler's documentation for information about the right set of options to have the compiler record information about inlining and the mapping of machine instructions to source lines. We advise picking options that indicate they will record information that relates machine instructions to source code without compromising optimization. For instance, with the Portland Group (PGI) compilers, use \texttt{-gopt} in place of \texttt{-g} to collect this information without interfering with optimization.
While \HPCToolkit{} does not need information about the mapping between machine instructions and source code to function,
having such information included in the binary code by the compiler can be helpful to users trying to interpret performance measurements.
Since compilers can usually provide information about line mappings and inlining for fully-optimized code,
this requirement usually involves a one-time trivial adjustment to an application's build scripts
to provide a better experience with tools. Such mapping information enables tools such as \HPCToolkit{},
race detectors, and memory analysis tools to attribute information more precisely.
For statically linked executables, such as those often used on Cray supercomputers, the final link step is done with \hpclink{}.
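As an illustrative sketch (the compiler wrapper and source file names here are hypothetical), the build commands for the dynamically and statically linked cases might look like:
\begin{quote}
\begin{verbatim}
mpicc -g -O3 -o app app.c util.c          # dynamically linked build
hpclink mpicc -g -O3 -o app app.o util.o  # statically linked: final link step
\end{verbatim}
\end{quote}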
% ==========================================================
% ==========================================================
\subsection{Measuring Application Performance}
\label{chpt:quickstart:tour:measurement}
Measurement of application performance takes two different forms depending on whether your application is dynamically or statically linked.
To monitor a dynamically linked application, simply use \hpcrun{} to launch the application.
To monitor a statically linked application, the data to be collected is specified by environment variables.
In either case, the application may be sequential, multithreaded or based on MPI.
The commands below give examples for an application named \texttt{app}.
%
\begin{itemize}
\item Dynamically linked applications:\hfill
Simply launch your application with \hpcrun{}:
\begin{quote}
\verb|[<mpi-launcher>] hpcrun [hpcrun-options] app [app-arguments]|
\end{quote}
Of course, \texttt{<mpi-launcher>} is only needed for MPI programs; it is typically a program such as \texttt{mpiexec} or \texttt{mpirun}, or a workload manager utility such as Slurm's {\tt srun} or IBM's Job Step Manager utility {\tt jsrun}.
\item Statically linked applications:\hfill
First, link \hpcrun{}'s monitoring code into \texttt{app}, using \hpclink{}:
\begin{quote}
\verb|hpclink <linker> -o app <linker-arguments>|
\end{quote}
Then monitor \texttt{app} by passing \hpcrun{} options through environment variables.
For instance:
\begin{quote}
\begin{verbatim}
export HPCRUN_EVENT_LIST="CYCLES"
[<mpi-launcher>] app [app-arguments]
\end{verbatim}
\end{quote}
\hpclink{}'s \mytt{--help} option gives a list of environment variables that affect monitoring.
See Chapter~\ref{chpt:statically-linked-apps} for more information.
\end{itemize}
%
Any of these commands will produce a measurements directory that contains separate measurement information for each MPI rank and thread in the application.
The directory is named according to the form:
\begin{quote}
\verb|hpctoolkit-app-measurements[-<jobid>]|
\end{quote}
If the application \texttt{app} is run under control of a recognized batch job scheduler (such as Slurm, Cobalt, or IBM's Job Manager), the name of the measurements directory will contain the corresponding job identifier \texttt{<jobid>}.
Currently, the measurements directory contains measurement files for each thread that are named using the following templates:
\begin{quote}
\verb|app-<mpi-rank>-<thread-id>-<host-id>-<process-id>.<generation-id>.hpcrun|
\verb|app-<mpi-rank>-<thread-id>-<host-id>-<process-id>.<generation-id>.hpctrace|
\end{quote}
\subsubsection{Specifying CPU Sample Sources}
\HPCToolkit{} primarily monitors an application using asynchronous sampling.
Consequently, the most common option to \hpcrun{} is a list of sample sources that define how samples are generated.
A sample source takes the form of an event name $e$ and a specifier \texttt{howoften}, written as \texttt{$e$@howoften}. The specifier \texttt{howoften} may
be a number indicating a period (\eg{} \mytt{CYCLES@4000001}), or \texttt{f} followed by a number (\eg{} \mytt{CYCLES@f200}) indicating a frequency in samples/second.
For a sample source with event $e$ and period $p$, after every $p$ instances of $e$, a sample is generated that causes \hpcrun{} to inspect and record information about the monitored application.
To configure \hpcrun{} with two sample sources, \texttt{$e_1$@howoften$_1$} and \texttt{$e_2$@howoften$_2$}, use the following options:
\begin{quote}
\texttt{--event $e_1$@howoften$_1$ --event $e_2$@howoften$_2$}
\end{quote}
To use the same sample sources with an \hpclink{}-ed application, use a command similar to:
\begin{quote}
\texttt{export HPCRUN\_EVENT\_LIST="$e_1$@howoften$_1$ $e_2$@howoften$_2$"}
\end{quote}
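For example, assuming the PAPI presets \mytt{PAPI_TOT_CYC} and \mytt{PAPI_TOT_INS} are available on the machine (these are the events used for the cycles-per-instruction example in Chapter~\ref{chpt:effective-performance-analysis}), a dynamically linked application could be sampled on both events as follows:
\begin{quote}
\begin{verbatim}
hpcrun --event PAPI_TOT_CYC@4000001 --event PAPI_TOT_INS@4000001 \
    app [app-arguments]
\end{verbatim}
\end{quote}
For an \hpclink{}-ed executable, the same two sample sources would instead be listed, separated by a space, in the \texttt{HPCRUN\_EVENT\_LIST} environment variable before launching the application.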
\subsubsection{Measuring GPU Computations}
One can simply profile and optionally trace computations offloaded onto AMD, Intel, and NVIDIA GPUs by using one of the following event specifiers:
\begin{itemize}
\item {\tt -e gpu=nvidia} is used with CUDA and OpenMP on NVIDIA GPUs
\item {\tt -e gpu=amd} is used with HIP and OpenMP on AMD GPUs
\item {\tt -e gpu=level0} is used with Intel's Level Zero runtime for Data Parallel C++ and OpenMP
\item {\tt -e gpu=opencl} can be used on any of the GPU platforms.
\end{itemize}
Adding a {\tt -t} to \hpcrun{}'s command line when profiling GPU computations will trace them as well.
For more information about how to use PC sampling (NVIDIA GPUs only) or binary instrumentation (Intel GPUs) for instruction-level performance measurement of GPU kernels, see Chapter~\ref{chpt:gpu}.
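For example, to both profile and trace a program that offloads computation onto NVIDIA GPUs, one might launch it as follows:
\begin{quote}
\verb|[<mpi-launcher>] hpcrun -e gpu=nvidia -t app [app-arguments]|
\end{quote}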
% ==========================================================
% ==========================================================
\subsection{Recovering Program Structure}
Typically, \hpcstruct{} is launched without any options, with an argument that is a \HPCToolkit{} \emph{measurements directory}.
\hpcstruct{} identifies the application as well as any shared libraries and GPU binaries it invokes.
It processes each of them and records information about its program structure in the \emph{measurements directory}.
Program structure for a binary includes information about its source files, procedures, inlined code, loop nests, and statements.
When applied to a measurements directory, \hpcstruct{} analyzes multiple binaries concurrently by default.
It analyzes each small binary using a few threads and each large binary using more threads.
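For example, to recover program structure for all CPU and GPU binaries used in the run recorded in a measurements directory named \texttt{hpctoolkit-app-measurements}, one would run:
\begin{quote}
\verb|hpcstruct hpctoolkit-app-measurements|
\end{quote}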
Although not usually necessary, one can apply \hpcstruct{} to recover program structure information for a single CPU or GPU binary.
To recover static program structure for a single binary \texttt{b}, use the command:
\begin{quote}
\verb|hpcstruct b|
\end{quote}
This command analyzes the binary and saves this information in a file named \texttt{b.hpcstruct}.
% ==========================================================
% ==========================================================
\subsection{Analyzing Measurements \& Attributing Them to Source Code}
To analyze \HPCToolkit{}'s measurements and attribute them to the application's source code, use \hpcprof{}, typically invoked as follows:
\begin{quote}
\begin{verbatim}
hpcprof hpctoolkit-app-measurements
\end{verbatim}
\end{quote}
This command will produce an \HPCToolkit{} performance database with the name \texttt{hpctoolkit-app-database}.
If this database directory already exists, \hpcprof{} will form a unique name by appending a random hexadecimal qualifier.
\hpcprof{} performs this analysis in parallel using multithreading.
By default all available threads are used.
If this is not wanted (\eg{}, when sharing a single machine), the thread count can be specified with \texttt{-j <threads>}.
\hpcprof{} usually completes this analysis in a matter of minutes.
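For example, to limit the analysis to 16 threads when sharing a login node with others (the thread count here is only illustrative), one could run:
\begin{quote}
\verb|hpcprof -j 16 hpctoolkit-app-measurements|
\end{quote}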
For especially large experiments (applications using thousands of threads and/or GPU streams), the sibling \hpcprofmpi{} may produce results faster by exploiting additional compute nodes\footnote{
We recommend running \hpcprofmpi{} across 8-10 compute nodes. More than this may not improve or may degrade the overall speed of the analysis.
}.
Typically \hpcprofmpi{} is invoked as follows, using 8 ranks, one per compute node:
\begin{quote}
\begin{verbatim}
<mpi-launcher> -n 8 hpcprof-mpi hpctoolkit-app-measurements
\end{verbatim}
\end{quote}
Note that additional options may be needed to grant \hpcprofmpi{} access to all threads on each node; check the documentation for your scheduler and MPI implementation for details.
If possible, \hpcprof{} will copy the sources for the application and any libraries into the resulting database.
If the source code has been moved since the application was compiled, or is mounted at a different location than at compile time, the resulting database may be missing some important source files.
In these cases, the \texttt{-R/--replace-path} option may be specified to provide substitute paths based on prefixes.
For example, if the application was compiled from source at \texttt{/home/joe/app/src/} but it is mounted at \texttt{/extern/homes/joe/app/src/} when running \hpcprof{}, the source files can be made available by invoking \hpcprof{} as follows:
\begin{quote}
\begin{verbatim}
hpcprof -R '/home/joe/app/src/=/extern/homes/joe/app/src/' \
    hpctoolkit-app-measurements
\end{verbatim}
\end{quote}
Note that on systems where MPI applications are restricted to a scratch file system, it is the user's responsibility to copy any wanted source files and make them available to \hpcprof{}.
% ==========================================================
% ==========================================================
\subsection{Presenting Performance Measurements for Interactive Analysis}
To interactively view and analyze an \HPCToolkit{} performance database, use \hpcviewer{}.
\hpcviewer{} may be launched from the command line or by double-clicking on its icon on MacOS or Windows.
The following is an example of launching from a command line:
\begin{quote}
\verb|hpcviewer hpctoolkit-app-database|
\end{quote}
Additional help for \hpcviewer{} can be found in a help pane available from \hpcviewer{}'s \emph{Help} menu.
% ==========================================================
% ==========================================================
\subsection{Effective Performance Analysis Techniques}
To effectively analyze application performance, consider using one of the following strategies, which are described in more detail in Chapter~\ref{chpt:effective-performance-analysis}.
\begin{itemize}
\item
A waste metric, which represents the difference between achieved performance and potential peak performance, is a good way of understanding the potential for tuning the node performance of codes (Section~\ref{sec:effective-performance-analysis:inefficiencies}).
\hpcviewer{} supports synthesis of derived metrics to aid analysis.
Derived metrics are specified within \hpcviewer{} using spreadsheet-like formulas.
See the \hpcviewer{} help pane for details about how to specify derived metrics.
\item
Scalability bottlenecks in parallel codes can be pinpointed by differential analysis of two profiles with different degrees of parallelism (Section~\ref{sec:effective-performance-analysis:scalability}).
\end{itemize}
% ===========================================================================
% ===========================================================================
\section{Additional Guidance}
For additional information, consult the rest of this manual and other documentation:
First, we summarize the available documentation and command-line help:
\begin{description}
\item[Command-line help.]\hfill
Each of \HPCToolkit{}'s command-line tools can generate a help message summarizing the tool's usage, arguments and options.
To generate this help message, invoke the tool with \mytt{-h} or \mytt{--help}.
\item[Man pages.]\hfill
Man pages are available either via the Internet (\url{http://hpctoolkit.org/documentation.html}) or from a local \HPCToolkit{} installation (\mytt{<hpctoolkit-installation>/share/man}).
\item[Manuals.]\hfill
Manuals are available either via the Internet (\url{http://hpctoolkit.org/documentation.html}) or from a local \HPCToolkit{} installation (\mytt{<hpctoolkit-installation>/share/doc/hpctoolkit/documentation.html}).
\item[Articles and Papers.]\hfill
There are a number of articles and papers that describe various aspects of \HPCToolkit{}'s measurement, analysis, attribution and presentation technology.
They can be found at \url{http://hpctoolkit.org/publications.html}.
\end{description}
% ***************************************************************************
% ***************************************************************************
\cleardoublepage
\chapter{Effective Strategies for Analyzing Program Performance}
\label{chpt:effective-performance-analysis}
This chapter describes some proven strategies for using performance measurements to identify performance bottlenecks in both serial and parallel codes.
% ===========================================================================
% ===========================================================================
\section{Monitoring High-Latency Penalty Events}
\label{sec:effective-performance-analysis:penalty-events}
A very simple and often effective methodology is to profile with respect to cycles and high-latency penalty events.
If \HPCToolkit{} attributes a large number of penalty events with a particular source-code statement, there is an extremely high likelihood of significant exposed stalling.
This is true even though (1) modern out-of-order processors can overlap the stall latency of one instruction with nearby independent instructions and (2) some penalty events ``over count''.%
\footnote{For example, performance monitoring units often categorize a prefetch as a cache miss.}
If a source-code statement incurs a large number of penalty events and it also consumes a non-trivial amount of cycles, then this region of code is an opportunity for optimization.
Examples of good penalty events are last-level cache misses and TLB misses.
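As a sketch, assuming the PAPI presets \mytt{PAPI_L3_TCM} (last-level cache misses) and \mytt{PAPI_TLB_DM} (data TLB misses) are available and can be counted together with cycles on the machine at hand, one might profile as follows and then look for statements that rank high on both the penalty events and cycles:
\begin{quote}
\begin{verbatim}
hpcrun -e PAPI_TOT_CYC@4000001 -e PAPI_L3_TCM@100003 \
    -e PAPI_TLB_DM@100003 app [app-arguments]
\end{verbatim}
\end{quote}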
% ===========================================================================
% ===========================================================================
\section{Computing Derived Metrics}
\label{sec:effective-performance-analysis:derived-metrics}
Modern computer systems provide access to a rich set of hardware performance counters that can directly measure various aspects of a program's performance.
Counters in the processor core and memory hierarchy enable one to collect measures of work (\eg, operations performed), resource consumption (\eg, cycles), and inefficiency (\eg, stall cycles).
One can also measure time using system timers.
Values of individual metrics are of limited use by themselves.
For instance, knowing the count of cache misses for a loop or routine is of little value by itself; only when combined with other information such as the number of instructions executed or the total number of cache accesses does the data become informative.
While a developer might not mind using mental arithmetic to evaluate the relationship between a pair of metrics for a particular program scope (\eg, a loop or a procedure), doing this for many program scopes is exhausting.
To address this problem, \hpcviewer{} supports calculation of derived metrics.
\hpcviewer{} provides an interface that enables a user to specify spreadsheet-like formulas that can be used to calculate a derived metric for every program scope.
% For instance, if one wants to compute the cache miss rate in a scope, one could divide the total number of cache misses in a scope by the sum of counts of loads and stores in the scope. On the other hand, if one wanted to compute the fraction of a program's cache misses that occurred in a particular scope, one could divide the number of misses in the scope by the total number of misses in the program.
\begin{figure}[t]
\center{\includegraphics[width=1.0\textwidth]{fig/cycles-per-inst.png}}
%Two possible representations for the call path fragment $\ldots s_1 \rightarrow s_2 \ldots$, where $s_1$ and $s_2$ are call sites and where $s_1$ represents a call from $p$ to $q$ and $s_2$ a call from $q'$ to $r$.
\caption{Computing a derived metric (cycles per instruction) in \hpcviewer{}.}
\label{fig:cycles-per-inst}
\end{figure}
Figure~\ref{fig:cycles-per-inst} shows how to use \hpcviewer{} to compute a \emph{cycles}/\emph{instruction} derived metric from measured metrics \mytt{PAPI_TOT_CYC} and \mytt{PAPI_TOT_INS}; these metrics correspond to {\em cycles} and {\em total instructions executed} measured with the PAPI hardware counter interface.
To compute a derived metric, one first depresses the button marked $f(x)$ above the metric pane; that will cause the pane for computing a derived metric to appear.
Next, one types in the formula for the metric of interest.
When specifying a formula, existing columns of metric data are referred to using a positional name \$$n$ to refer to the $n^{th}$ column, where the first column is written as \$0.
The metric pane shows the formula $\$1/\$3$.
Here, \$1 refers to the column of data representing the exclusive value for \mytt{PAPI_TOT_CYC} and \$3 refers to the column of data representing the exclusive value for \mytt{PAPI_TOT_INS}.%
\footnote{An {\em exclusive} metric for a scope refers to the quantity of the metric measured for that scope alone; an \emph{inclusive} metric for a scope represents the value measured for that scope as well as costs incurred by any functions it calls. In \hpcviewer{}, inclusive metric columns are marked with ``(I)'' and exclusive metric columns are marked with ``(E).''}
Positional names for metrics you use in your formula can be determined using the \emph{Metric} pull-down menu in the pane.
If you select your metric of choice using the pull-down, you can insert its positional name into the formula using the {\em insert metric} button, or you can simply type the positional name directly into the formula.
\begin{figure}[t]
\center{\includegraphics[width=1.0\textwidth]{fig/cycles-per-inst-2}}
%Two possible representations for the call path fragment $\ldots s_1 \rightarrow s_2 \ldots$, where $s_1$ and $s_2$ are call sites and where $s_1$ represents a call from $p$ to $q$ and $s_2$ a call from $q'$ to $r$.
\caption{Displaying the new {\em cycles/instruction} derived metric in \hpcviewer{}.}
\label{fig:cycles-per-inst-2}
\end{figure}
At the bottom of the derived metric pane, one can specify a name for the new metric.
One also has the option to indicate that the derived metric column should report for each scope what percent of the total its quantity represents; for a metric that is a ratio, computing a percent of the total is not meaningful, so we leave the box unchecked.
After clicking the OK button, the derived metric pane will disappear and the new metric will appear as the rightmost column in the metric pane.
If the metric pane is already filled with other columns of metric, you may need to scroll right in the pane to see the new metric.
Alternatively, you can use the metric check-box pane (selected by depressing the button to the right of $f(x)$ above the metric pane) to hide some of the existing metrics so that there will be enough room on the screen to display the new metric.
Figure~\ref{fig:cycles-per-inst-2} shows the resulting \hpcviewer{} display after clicking OK to add the derived metric.
The following sections describe several types of derived metrics that are of particular use to gain insight into performance bottlenecks and opportunities for tuning.
% ===========================================================================
% ===========================================================================
\section{Pinpointing and Quantifying Inefficiencies}
\label{sec:effective-performance-analysis:inefficiencies}
While knowing where a program spends most of its time or executes most of its floating point operations may be interesting, such information may not suffice to identify the biggest targets of opportunity for improving program performance.
For program tuning, it is less important to know how many resources (\eg, time, instructions) were consumed in each program context than to know where resources were consumed {\em inefficiently}.
To identify performance problems, it might initially seem appealing to compute ratios to see how many events per cycle occur in each program context.
For instance, one might compute ratios such as FLOPs/cycle, instructions/cycle, or cache miss ratios.
However, using such ratios as a sorting key to identify inefficient program contexts can misdirect a user's attention.
There may be program contexts (\eg, loops) in which computation is terribly inefficient (\eg, with low operation counts per cycle); however, some or all of the least efficient contexts may not account for a significant amount of execution time.
Just because a loop is inefficient doesn't mean that it is important for tuning.
The best opportunities for tuning are where the aggregate performance losses are greatest.
For instance, consider a program with two loops.
The first loop might account for 90\% of the execution time and run at 50\% of peak performance.
The second loop might account for 10\% of the execution time, but only achieve 12\% of peak performance.
In this case, the total performance loss in the first loop accounts for 50\% of the first loop's execution time, which corresponds to 45\% of the total program execution time.
The 88\% performance loss in the second loop would account for only 8.8\% of the program's execution time.
In this case, tuning the first loop has a greater potential for improving the program performance even though the second loop is less efficient.
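In equation form, the fractions of total execution time lost in the two loops are
\[
0.90 \times (1 - 0.50) = 0.45
\qquad\mbox{versus}\qquad
0.10 \times (1 - 0.12) = 0.088,
\]
so the first loop wastes roughly five times as much of the total execution time as the second.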
A good way to focus on inefficiency directly is with a derived {\em waste} metric.
Fortunately, it is easy to compute such useful metrics.
However, there is no one {\em right} measure of waste for all codes.
Depending upon what one expects as the rate-limiting resource (\eg, floating-point computation, memory bandwidth, etc.), one can define an appropriate waste metric (\eg, FLOP opportunities missed, bandwidth not consumed) and sort by that.
\begin{figure}[t]
\center{\includegraphics[width=1.0\textwidth]{fig/fpwaste.png}}
%Two possible representations for the call path fragment $\ldots s_1 \rightarrow s_2 \ldots$, where $s_1$ and $s_2$ are call sites and where $s_1$ represents a call from $p$ to $q$ and $s_2$ a call from $q'$ to $r$.
\caption{Computing a floating point waste metric in \hpcviewer{}.}
\label{fig:fpwaste}
\end{figure}
For instance, in a floating-point intensive code, one might consider keeping the floating point pipeline full as a metric of success.
One can directly quantify and pinpoint losses from failing to keep the floating point pipeline full {\em regardless of why this occurs}.
One can pinpoint and quantify losses of this nature by computing a {\em floating-point waste} metric, calculated as the difference between the number of calculations that could have been performed had the computation run at its peak rate and the number that actually were performed.
To compute the number of calculations that could have been completed in each scope, multiply the total number of cycles spent in the scope by the peak rate of operations per cycle.
Using \hpcviewer{}, one can specify a formula to compute such a derived metric and it will compute the value of the derived metric for every scope.
Figure~\ref{fig:fpwaste} shows the specification of this floating-point waste metric for a code.\footnote{Unfortunately, many recent processors have trouble counting floating-point operations accurately. If your processor can't accurately count floating-point operations, a floating-point waste metric will be less useful.}
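As a concrete sketch (the metric column numbers and the peak rate here are hypothetical; your database and processor will differ), if column 1 held total cycles, column 2 held measured floating-point operations, and the processor could complete 4 floating-point operations per cycle, the waste formula entered in \hpcviewer{} would take the form
\begin{quote}
{\tt 4 * \$1 - \$2}
\end{quote}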
Sorting by a waste metric will rank order scopes so that those with the greatest waste appear first.
Such scopes correspond directly to those that contain the greatest opportunities for improving overall program performance.
A waste metric will typically highlight
\begin{itemize}
\item loops in which a lot of time is spent computing relatively efficiently, yet the aggregate inefficiencies still accumulate into a significant loss,
\item loops in which less time is spent computing, but the computation is quite inefficient, and
\item scopes, such as copy loops, that contain no computation at all and therefore represent a complete waste according to a metric such as floating point waste.
\end{itemize}
\begin{figure}[t]
\center{\includegraphics[width=1.0\textwidth]{fig/fp-efficiency.png}}
%Two possible representations for the call path fragment $\ldots s_1 \rightarrow s_2 \ldots$, where $s_1$ and $s_2$ are call sites and where $s_1$ represents a call from $p$ to $q$ and $s_2$ a call from $q'$ to $r$.
\caption{Computing floating point efficiency in percent using \hpcviewer{}.}
\label{fig:fpefficiency}
\end{figure}
Beyond identifying and quantifying opportunities for tuning with a waste metric, one can compute a companion derived metric, {\em relative efficiency}, to help understand how easy it might be to improve performance.
A scope running at very high efficiency will typically be much harder to tune than one running at low efficiency.
For our floating-point waste metric, one can compute a companion floating point efficiency metric by dividing measured FLOPs by potential peak FLOPs and multiplying the quotient by 100.
Figure~\ref{fig:fpefficiency} shows the specification of this floating-point efficiency metric for a code.
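Continuing the hypothetical column assignments used above (column 1 for cycles, column 2 for measured FLOPs, and a peak rate of 4 FLOPs per cycle), the corresponding efficiency formula would take a form along the lines of
\begin{quote}
{\tt 100 * \$2 / (4 * \$1)}
\end{quote}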
\begin{figure}[t]
\center{\includegraphics[width=1.0\textwidth]{fig/fp-efficiency-loop.png}}
%Two possible representations for the call path fragment $\ldots s_1 \rightarrow s_2 \ldots$, where $s_1$ and $s_2$ are call sites and where $s_1$ represents a call from $p$ to $q$ and $s_2$ a call from $q'$ to $r$.
\caption{Using floating point waste and the percent of floating point efficiency to evaluate opportunities for optimization.}
\label{fig:fpefficiency-loop}
\end{figure}
Scopes that rank high according to a waste metric and low according to a companion relative efficiency metric often make the best targets for optimization.
Figure~\ref{fig:fpefficiency-loop} shows an \hpcviewer{} display of the top two routines, which collectively account for 32.2\% of the floating point waste in a reactive turbulent combustion code.
The second routine (\mytt{ratt}) is expanded to show the loops and statements within.
While the overall floating point efficiency for \mytt{ratt} is at 6.6\% of peak (shown in scientific notation in the \hpcviewer{} display), the most costly loop in \mytt{ratt} that accounts for 7.3\% of the floating point waste is executing at only 0.114\% efficiency.
Identifying such sources of inefficiency is the first step towards improving performance via tuning.
% ===========================================================================
% ===========================================================================
\section{Pinpointing and Quantifying Scalability Bottlenecks}
\label{sec:effective-performance-analysis:scalability}
On large-scale parallel systems, identifying impediments to scalability is of paramount importance.
On today's systems fashioned out of multicore processors, two kinds of scalability are of particular interest:
\begin{itemize}
\item scaling within nodes, and
\item scaling across the entire system.
\end{itemize}
\HPCToolkit{} can be used to readily pinpoint both kinds of bottlenecks.
Using call path profiles collected by \hpcrun{}, it is possible to quantify and pinpoint scalability bottlenecks of any kind, {\em regardless of cause}.
To pinpoint scalability bottlenecks in parallel programs, we use {\em differential profiling} --- mathematically combining corresponding buckets of two or more execution profiles.
Differential profiling was first described by McKenney~\cite{McKenney:1999:differential}; he used differential profiling to compare two {\em flat} execution profiles.
Differencing of flat profiles is useful for identifying what parts of a program incur different costs in two executions.
Building upon McKenney's idea of differential profiling, we compare call path profiles of parallel executions at different scales to pinpoint scalability bottlenecks.
Differential analysis of call path profiles pinpoints not only differences between two executions (in this case scalability losses), but the contexts in which those differences occur.
Associating changes in cost with full calling contexts is particularly important for pinpointing context-dependent behavior.
Context-dependent behavior is common in parallel programs.
For instance, in message passing programs, the time spent by a call to \mytt{MPI_Wait} depends upon the context in which it is called.
Similarly, how the performance of a communication event scales as the number of processors in a parallel execution increases depends upon a variety of factors such as whether the size of the data transferred increases and whether the communication is collective or not.
% ==========================================================
% ==========================================================
\subsection{Scalability Analysis Using Expectations}
Application developers have expectations about how the performance of their code should scale as the number of processors in a parallel execution increases.
Namely,
\begin{itemize}
\item when different numbers of
processors are used to solve the same problem (strong scaling), one
expects an execution's speedup to increase linearly with the number of processors employed;
\item when
different numbers of processors are used but the amount of computation
per processor is held constant (weak scaling), one expects the execution
time on a different number of processors to be the same.
\end{itemize}
In both of these situations, a code developer can express their expectations for how performance will scale as a formula that can be used to predict execution performance on a different number of processors.
One's expectations about how overall application performance should scale can be applied to each context in a program
to pinpoint and quantify deviations from expected scaling.
Specifically, one can scale and difference the performance of an application on different numbers of processors to pinpoint contexts that are not scaling ideally.
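Stated as formulas (using notation introduced here only for illustration), if $T_p$ and $T_q$ denote the time per processor in executions on $p$ and $q$ processors with $q > p$, these expectations are
\[
T_q = \frac{p}{q}\,T_p \quad \mbox{(strong scaling)} \qquad \mbox{and} \qquad T_q = T_p \quad \mbox{(weak scaling).}
\]
Measured costs that exceed these predictions represent scaling loss.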
To pinpoint and quantify scalability bottlenecks in a parallel application, we first use \hpcrun{} to collect a call path profile for the application on two different numbers of processors.
Let $E_p$ be an execution on $p$ processors and $E_q$ be an execution on $q$ processors.
Without loss of generality, assume that $q > p$.
In our analysis, we consider both {\it inclusive} and {\it exclusive} costs for CCT nodes.
The inclusive cost at $n$ represents the sum of all costs attributed to $n$ and any of its descendants in the CCT, and is denoted by $I(n)$.
The exclusive cost at $n$ represents the sum of all costs attributed strictly to $n$, and we denote it by $E(n)$.
If $n$ is an interior node in a CCT, it represents an invocation of a procedure.
If $n$ is a leaf in a CCT, it represents a statement inside some procedure. For leaves, their inclusive and exclusive costs are equal.
It is useful to perform scalability analysis for both inclusive and exclusive costs; if the loss of scalability attributed to the inclusive costs of a function invocation is roughly equal to the loss of scalability due to its exclusive costs, then we know that the computation in that function invocation does not scale.
However, if the loss of scalability attributed to a function invocation's inclusive costs outweighs the loss of scalability accounted for by exclusive costs, we need to explore the scalability of the function's callees.
Given CCTs for an ensemble of executions, the next step to analyzing the scalability of their performance is to clearly define our expectations.
Next, we describe performance expectations for weak scaling and intuitive metrics that represent how much performance deviates from our expectations.
More information about our scalability analysis technique can be found elsewhere~\cite{Coarfa-MC:2007:ICS-scalability,Tallent-MC-etal:2009:SC-hpctoolkit-petascale}.
\begin{figure}[t]
\center{\includegraphics[width=1.0\textwidth]{fig/flash-scalability.png}}
%Two possible representations for the call path fragment $\ldots s_1 \rightarrow s_2 \ldots$, where $s_1$ and $s_2$ are call sites and where $s_1$ represents a call from $p$ to $q$ and $s_2$ a call from $q'$ to $r$.
\caption{Computing the scaling loss when weak scaling a white dwarf detonation simulation with FLASH3 from 256 to 8192 cores. For weak scaling, the time on an MPI rank in each of the simulations will be the same. In the figure, column 0 represents the inclusive cost for one MPI rank in a 256-core simulation; column 2 represents the inclusive cost for one MPI rank in an 8192-core simulation. The difference between these two columns, computed as {\tt \$2-\$0},
represents the excess work present in the larger simulation for each unique program context in the calling context tree. Dividing that by the total time in the 8192-core execution {\tt @2} gives the fraction of wasted time. Multiplying through by 100 gives the percent of the time wasted in the 8192-core execution, which corresponds to the \%~scalability loss.}
\label{fig:scaling-loss}
\end{figure}
\begin{figure}[t]
\center{\includegraphics[width=1.0\textwidth]{fig/scaling-loss-2.png}}
%Two possible representations for the call path fragment $\ldots s_1 \rightarrow s_2 \ldots$, where $s_1$ and $s_2$ are call sites and where $s_1$ represents a call from $p$ to $q$ and $s_2$ a call from $q'$ to $r$.
\caption{Using the scalability loss metric of Figure~\ref{fig:scaling-loss}, expressed as a fraction, to rank order loop nests by their scaling loss.}
\label{fig:scaling-loss-2}
\end{figure}
\subsubsection*{Weak Scaling}
Consider two weak scaling experiments executed on $p$ and $q$ processors, respectively, $p<q$.
Figure~\ref{fig:scaling-loss} shows how we can use a derived metric to compute and attribute scalability losses.
Here, we compute the difference in inclusive cycles spent on one core of an 8192-core run and one core of a 256-core run in a weak scaling experiment.
If the code had perfect weak scaling, the time for an MPI rank in each of the executions would be identical. In this case, they are not.
We compute the excess work by computing, for each scope, the difference between the time on the 8192-core run and the time on the 256-core run.
We normalize the differences in time spent in the two runs by dividing them by the total time spent in the 8192-core run. This yields the fraction of wasted effort
for each scope when scaling from 256 to 8192 cores. Finally, we multiply these results by 100 to compute the \% scalability loss.
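Written symbolically, using the inclusive cost notation introduced earlier with subscripts identifying the execution (this is simply a restatement of the recipe above), the metric attributed to a scope $n$ is
\[
\mbox{\% scalability loss}(n) \;=\; 100 \times \frac{I_{8192}(n) - I_{256}(n)}{T_{8192}},
\]
where $I_{256}(n)$ and $I_{8192}(n)$ denote the inclusive costs of $n$ in the 256-core and 8192-core executions and $T_{8192}$ is the total time of the 8192-core execution.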
This example shows how one can compute a derived metric that pinpoints and quantifies scaling losses across different node counts of a Blue Gene/P system.
A similar analysis can be applied to compute scaling losses between jobs that use different numbers of cores on individual processors.
Figure~\ref{fig:scaling-loss-2} shows the result of computing the scaling loss for each loop nest when scaling from one to eight cores on a multicore node and rank ordering loop nests by their scaling loss metric. Here, we simply compute the scaling loss as the difference between the cycle counts of the eight-core and the one-core runs, divided by the aggregate cost of the process executing on eight cores. This figure shows the scaling loss written in scientific notation as a fraction rather than multiplied by 100 to yield a percent.
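In \hpcviewer{}'s derived-metric syntax, and assuming (hypothetically) that column 0 holds the one-core cycle count and column 1 holds the eight-core cycle count, this fractional scaling loss would be written along the lines of
\begin{quote}
{\tt (\$1 - \$0) / @1}
\end{quote}
where {\tt @1} denotes the aggregate eight-core cost, analogous to the use of {\tt @2} in Figure~\ref{fig:scaling-loss}.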
In this figure, we examine scaling losses in the flat view, showing them for each loop nest.
The source pane shows the loop nest responsible for the greatest scaling loss when scaling from one to eight cores.
Unsurprisingly, the loop with the worst scaling loss is very memory intensive.
Memory bandwidth is a precious commodity on multicore processors.
While we have shown how to compute and attribute the fraction of excess work in a weak scaling experiment, one can compute a similar quantity for experiments with strong scaling. When differencing the costs summed across all of the threads in a pair of strong-scaling experiments, one uses exactly the same approach as shown in Figure~\ref{fig:scaling-loss}. If comparing weak scaling costs summed across all ranks in $p$ and $q$ core executions, one can simply scale the aggregate costs by $1/p$ and $1/q$ respectively before differencing them.
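As a sketch of this last point, if $C_p(n)$ and $C_q(n)$ denote the costs for scope $n$ summed across all ranks of the $p$-core and $q$-core weak scaling executions, the quantity to difference is
\[
\frac{C_q(n)}{q} \;-\; \frac{C_p(n)}{p}.
\]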
\subsubsection{Exploring Scaling Losses}
Scaling losses can be explored in \hpcviewer{} using any of its three views.
\begin{itemize}
\item {\em Top-down view.} This view represents the dynamic calling contexts (call paths) in which costs were incurred.
\item {\em Bottom-up view.} This view enables one to look upward along call paths. This view is particularly useful for understanding the performance of software components or procedures that are used in more than one context, such as communication library routines.
\item {\em Flat view.} This view organizes performance measurement data according to the static structure of an application. All costs incurred in {\em any} calling context by a procedure are aggregated together in the flat view.
\end{itemize}
\hpcviewer{} enables developers to explore top-down, bottom-up, and flat views of CCTs annotated with costs, helping to quickly pinpoint performance bottlenecks.
Typically, one begins analyzing an application's scalability and performance using the top-down calling context tree view.
Using this view, one can readily see how costs and scalability losses are associated with different calling contexts.
If costs or scalability losses are associated with only a few calling contexts, then this view suffices for identifying the bottlenecks.
When scalability losses are spread among many calling contexts, \eg, among different invocations of \mytt{MPI_Wait}, it is often useful to switch to the bottom-up view of the data to see if many losses are due to the same underlying cause.
In the bottom-up view, one can sort routines by their exclusive scalability losses and then look upward to see how these losses accumulate from the different calling contexts in which the routine was invoked.
Scaling loss based on excess work is intuitive; perfect scaling corresponds to an excess work value of $0$, sublinear scaling yields positive values, and superlinear scaling yields negative values.
Typically, CCTs for SPMD programs have similar structure.
If CCTs for different executions diverge, using \hpcviewer{} to compute and report excess work will highlight these program regions.
Inclusive excess work and exclusive excess work serve as useful measures of scalability associated with nodes in a calling context tree (CCT).
By computing both metrics, one can determine whether the application scales well or not at a CCT node and also pinpoint the cause of any lack of scaling.
If a node for a function in the CCT has comparable positive values for both inclusive excess work and exclusive excess work, then the loss of scaling is due to computation in the function itself.
However, if the inclusive excess work for the function outweighs that accounted for by its exclusive costs, then one should explore the scalability of its callees.
To isolate code that is an impediment to scalable performance, one can use the {\em hot path} button in \hpcviewer{} to trace a path down through the CCT to see where the cost is incurred.
% ===========================================================================
% ===========================================================================
%\section{Identifying Load Imbalance}
%\label{sec:effective-performance-analysis:load-imbalance}
% ***************************************************************************
% ***************************************************************************
\cleardoublepage
\chapter{Monitoring Dynamically-linked Applications with \hpcrun{}}
\label{chpt:hpcrun}
\input{hpcrun}
% ***************************************************************************
% ***************************************************************************
\cleardoublepage
\chapter{Monitoring Statically Linked Applications with \hpclink{}}
\label{chpt:statically-linked-apps}
% ===========================================================================
% ===========================================================================
On modern Linux systems, dynamically linked executables are the default.