#######################################################################
# Example configuration file for the GNU Compilers
#
#      Defines: "model"   => "mpi", "acc", "omp", "tgt", "tgtnv"
#                  default "mpi"
#               "label"   => ext base label, default "gnu"
#               "tudprof" => "scorep" to build with Score-P
#                  instrumentation, unset by default
#
#      MPI-only Command:
#      runhpc -c Example_gnu --reportable -T base --define model=mpi --ranks=40 small
#
#      OpenACC Command:
#      runhpc -c Example_gnu --reportable -T base --define model=acc --ranks=4 small
#
#      OpenMP Command:
#      runhpc -c Example_gnu --reportable -T base --define model=omp --ranks=1 --threads=40 small
#
#      OpenMP Target Offload to Host Command:
#      runhpc -c Example_gnu --reportable -T base --define model=tgt --ranks=1 --threads=40 small
#
#      OpenMP Target Offload to NVIDIA GPU Command:
#      runhpc -c Example_gnu --reportable -T base --define model=tgtnv --ranks=4 small
#
#######################################################################

%ifndef %{label}         # IF label is not set use gnu
%   define label gnu
%endif

%ifndef %{model}         # IF model is not set use mpi
%   define model mpi
%endif

teeout = yes
makeflags = -j 24

# Tester Information
license_num  = 37A
tester       = Technische Universitaet Dresden
test_sponsor = Technische Universitaet Dresden

######################################################
# SUT Section
######################################################
#include: Example_SUT.inc
include: sut-taurus.inc

#[Software]
sw_compiler000 = C/C++/Fortran: Version 8.2.0 of
sw_compiler001 = GNU Compilers
sw_mpi_library = OpenMPI Version 3.1.3
sw_mpi_other   = None
sw_other       = None

#[General notes]
notes_000 = MPI startup command:
notes_005 =   Slurm's srun command was used to start MPI jobs.

#######################################################################
# End of SUT section
#######################################################################
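#######################################################################
# Optional Score-P profiling build (see the tudprof blocks below).
# Example invocation (a sketch; adjust model and ranks to your system):
# runhpc -c Example_gnu -T base --define model=mpi --define tudprof=scorep --ranks=40 small
#######################################################################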
#######################################################################
# The header section of the config file.  Must appear
# before any instances of "section markers" (see below).
#
# ext  = how the binaries you generated will be identified
# tune = specify "base" or "peak" or "all"
%ifndef %{tudprof}
label = %{label}_%{model}
%else
label = %{label}_%{model}_%{tudprof}
%endif
tune          = base
output_format = text
use_submit_for_speed = 1

# Compiler Settings
default:
CC  = mpicc
CXX = mpicxx
FC  = mpif90
%if %{tudprof} eq 'scorep'
CC  = scorep --mpp=mpi --instrument-filter=${SPEC}/scorep.filter mpicc
CXX = scorep --mpp=mpi --instrument-filter=${SPEC}/scorep.filter mpicxx
FC  = scorep --mpp=mpi --instrument-filter=${SPEC}/scorep.filter mpif90
%endif

# Compiler Version Flags
CC_VERSION_OPTION  = --version
CXX_VERSION_OPTION = --version
FC_VERSION_OPTION  = --version

# Enable non-official patches to this kit
#strict_rundir_verify = 0

# MPI options and binding environment, dependent upon the model being run.
# Adjust to match your system.

# OpenMP (CPU) Settings
%if %{model} eq 'omp'
preENV_OMP_PROC_BIND=true
preENV_OMP_PLACES=cores
%endif

# OpenMP Targeting Host Settings
%if %{model} eq 'tgt'
preENV_OMP_PROC_BIND=true
preENV_OMP_PLACES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
%endif

#MPIRUN_OPTS = --bind-to none -q
MPIRUN_OPTS =
submit = timeout 2h srun ${MPIRUN_OPTS} -n $ranks -c $threads $command

# MPI workaround for mca issues in sph_exa
#preOMPI_MCA_topo=basic

# Score-P performance profiling
%if %{tudprof} eq 'scorep'
## only record calls to main and MPI functions (runtime filtering);
## runtime filtering was replaced by compile-time filtering (see above)
# preENV_SCOREP_FILTERING_FILE=/home/brunst/ws-hpc2020/kit91/scorep.filter
## set buffer memory size for profile/trace
preENV_SCOREP_TOTAL_MEMORY=64MB
## enable profile recording
preENV_SCOREP_ENABLE_PROFILING=true
## set to 'true' to enable detailed trace file recording
preENV_SCOREP_ENABLE_TRACING=false
## collect memory consumption per node
preENV_SCOREP_METRIC_RUSAGE=ru_maxrss
## record the cycle counter for scheduling analysis (comment out to disable)
preENV_SCOREP_METRIC_PAPI=PAPI_TOT_CYC
%endif
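#######################################################################
# Illustrative expansion of the submit line above (a sketch; the
# harness substitutes the actual values): with --ranks=4 and the
# default of one thread per rank, each benchmark is launched roughly as
#   timeout 2h srun -n 4 -c 1 <benchmark command>
# With tudprof=scorep, Score-P writes a scorep-* experiment directory
# into each run directory; one assumed way to summarize its profile:
#   scorep-score -r <experiment dir>/profile.cubex
#######################################################################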
#######################################################################
# Optimization
#
# Note that SPEC baseline rules require that all uses of a given
# compiler use the same flags in the same order.  See the SPEChpc
# Run Rules for more details:
#      http://www.spec.org/hpc2021/Docs/runrules.html
#
# OPTIMIZE    = flags applicable to all compilers
# FOPTIMIZE   = flags applicable to the Fortran compiler
# COPTIMIZE   = flags applicable to the C compiler
# CXXOPTIMIZE = flags applicable to the C++ compiler
#
# See your compiler manual for information on the flags available
# for your compiler.

# Compiler flags applied to all models
default=base=default:
COPTIMIZE    = -Ofast -march=native -lm    # use -mcpu=native for ARM
CXXOPTIMIZE  = -Ofast -march=native -std=c++14
FOPTIMIZE    = -Ofast -march=native -fno-stack-protector
FPORTABILITY = -ffree-line-length-none

%if %{model} eq 'mpi'
  pmodel=MPI
%endif

# OpenACC flags
%if %{model} eq 'acc'
  pmodel=ACC
  OPTIMIZE += -fopenacc -foffload=-lm
%endif

# OpenMP (CPU) flags
%if %{model} eq 'omp'
  pmodel=OMP
  OPTIMIZE += -fopenmp
%endif

# OpenMP Targeting Host flags
%if %{model} eq 'tgt'
  pmodel=TGT
  OPTIMIZE += -fopenmp
%endif

# OpenMP Targeting NVIDIA GPU flags
%if %{model} eq 'tgtnv'
  pmodel=TGT
  # GCC selects offload targets with -foffload;
  # -fopenmp-targets is a Clang/LLVM flag and is not accepted by GCC
  OPTIMIZE += -fopenmp -foffload=nvptx-none
%endif

# No peak flags set, so make peak use the same flags as base
default=peak=default:
basepeak = 1

#######################################################################
# Portability
#######################################################################

519.clvleaf_t,619.clvleaf_s,719.clvleaf_m,819.clvleaf_l=default=default:
# Not needed anymore?
#PORTABILITY += -DSPEC_GNU_FLUSH
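#######################################################################
# Sketch of the effective C compile line these settings produce under
# model=omp (assumed; runhpc assembles the real command and flag order):
#   mpicc -Ofast -march=native -lm -fopenmp <source>.c
#######################################################################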