# Invocation command line:
# runhpc --config nvhpc_ppc --define pmodel=acc --action run --nobuild --ranks=6 --reportable tiny
# output_root was not used for this run
#######################################################################
# Configuration file for the NVIDIA HPC (NVHPC) compilers,
# adapted from the SPEChpc example configuration for the GNU compilers
#
# Defines: "pmodel" => "mpi", "acc", "omp", "tgt", "tgtnv"; default "mpi"
#          "label"  => ext base label; default "nv"
#
# MPI-only Command:
# runhpc -c nvhpc_ppc --reportable -T base --define pmodel=mpi --ranks=40 small
#
# OpenACC Command:
# runhpc -c nvhpc_ppc --reportable -T base --define pmodel=acc --ranks=4 small
#
# OpenMP Command:
# runhpc -c nvhpc_ppc --reportable -T base --define pmodel=omp --ranks=1 --threads=40 small
#
# OpenMP Target Offload to Host Command:
# runhpc -c nvhpc_ppc --reportable -T base --define pmodel=tgt --ranks=1 --threads=40 small
#
# OpenMP Target Offload to NVIDIA GPU Command:
# runhpc -c nvhpc_ppc --reportable -T base --define pmodel=tgtnv --ranks=4 small
#
#######################################################################

%ifndef %{label}          # IF label is not set, use default "nv"
% define label nv
%endif

%ifndef %{pmodel}         # IF pmodel is not set, use default "mpi"
% define pmodel mpi
%endif

teeout = yes
makeflags = -j 40

# Tester Information
license_num  = 37A
test_sponsor = TU Dresden
tester       = TU Dresden

#######################################################################
# SUT Section
#######################################################################
#include: Example_SUT.inc
#  ----- Begin inclusion of 'Example_SUT.inc'
#######################################################################

# General SUT info
system_vendor = IBM
system_name   = Taurus: IBM Power System AC922 (IBM Power9, Tesla V100-SXM2-32GB)

node_compute_sw_accel_driver = NVIDIA CUDA 440.64.00
node_compute_hw_adapter_ib_slot_type = None
node_compute_hw_adapter_ib_ports_used = 2
node_compute_hw_adapter_ib_model = Mellanox ConnectX-5
node_compute_hw_adapter_ib_interconnect = EDR InfiniBand
node_compute_hw_adapter_ib_firmware = 16.27.6008
node_compute_hw_adapter_ib_driver = mlx5_core
node_compute_hw_adapter_ib_data_rate = 100 Gb/s (4X EDR)
node_compute_hw_adapter_ib_count = 2

interconnect_ib_syslbl = Mellanox InfiniBand
interconnect_ib_purpose = MPI Traffic and GPFS access
interconnect_ib_order = 1
#interconnect_ib_hw_vendor = Mellanox
#interconnect_ib_hw_topo = Non-blocking Fat-tree
#interconnect_ib_hw_switch_ib_ports = 36
#interconnect_ib_hw_switch_ib_data_rate = 100 Gb/s
#interconnect_ib_hw_switch_ib_count = 1
#interconnect_ib_hw_model = Mellanox Switch IB-2

hw_avail = Nov-2018
sw_avail = Nov-2021
prepared_by = Noah Trumpik (Noah.Trumpik@tu-dresden.de)

#[Node_Description: Hardware]
node_compute_syslbl = IBM Power System AC922
node_compute_order = 1
node_compute_count = 30
node_compute_purpose = compute
node_compute_hw_vendor = IBM
node_compute_hw_model = IBM Power System AC922
node_compute_hw_cpu_name = IBM POWER9 2.2 (pvr 004e 1202)
node_compute_hw_ncpuorder = 2 chips
node_compute_hw_nchips = 2
node_compute_hw_ncores = 44
node_compute_hw_ncoresperchip = 22
node_compute_hw_nthreadspercore = 4
node_compute_hw_cpu_char = Up to 3.8 GHz
node_compute_hw_cpu_mhz = 2300
node_compute_hw_pcache = 32 KB I + 32 KB D on chip per core
node_compute_hw_scache = 512 KB I+D on chip per core
node_compute_hw_tcache000 = 10240 KB I+D on chip per chip
node_compute_hw_ocache = None
node_compute_hw_memory = 256 GB (16 x 16 GB RDIMM-DDR4-2666)
node_compute_hw_disk000 = 2 x 1 TB (ATA Rev BE35)
node_compute_hw_disk001 = NVMe SSD Controller 172Xa/172Xb
node_compute_hw_other = None

#[Node_Description: Accelerator]
node_compute_hw_accel_model = Tesla V100-SXM2-32GB
node_compute_hw_accel_count = 6
node_compute_hw_accel_vendor = NVIDIA Corporation
node_compute_hw_accel_type = GPU
node_compute_hw_accel_connect = NVLINK
node_compute_hw_accel_ecc = Yes
node_compute_hw_accel_desc = See Notes

#[Node_Description: Software]
node_compute_sw_os000 = Red Hat Enterprise Linux
node_compute_sw_os001 = 7.6
node_compute_sw_localfile = xfs
node_compute_sw_sharedfile = 4 PB Lustre parallel filesystem
node_compute_sw_state = Multi-user
node_compute_sw_other = None

#[Fileserver]

#[Interconnect]

#[Software]
sw_compiler000 = C/C++/Fortran: Version 21.5 of the
sw_compiler001 = NVHPC toolkit
sw_mpi_library = Open MPI Version 4.1.2
sw_mpi_other = None
system_class = Homogenous Cluster
sw_other = None

#[General notes]
notes_000 = MPI startup command:
notes_005 =   srun command was used to launch job using 1 GPU/rank.
notes_010 = Detailed information from nvaccelinfo
notes_015 =
notes_020 = CUDA Driver Version: 11000
notes_025 = NVRM version: NVIDIA UNIX ppc64le Kernel Module 440.64.00 Wed Feb 26 16:01:28 UTC 2020
notes_030 =
notes_035 = Device Number: 0
notes_040 = Device Name: Tesla V100-SXM2-32GB
notes_045 = Device Revision Number: 7.0
notes_050 = Global Memory Size: 33822867456
notes_055 = Number of Multiprocessors: 80
notes_060 = Concurrent Copy and Execution: Yes
notes_065 = Total Constant Memory: 65536
notes_070 = Total Shared Memory per Block: 49152
notes_075 = Registers per Block: 65536
notes_080 = Warp Size: 32
notes_085 = Maximum Threads per Block: 1024
notes_090 = Maximum Block Dimensions: 1024, 1024, 64
notes_095 = Maximum Grid Dimensions: 2147483647 x 65535 x 65535
notes_100 = Maximum Memory Pitch: 2147483647B
notes_105 = Texture Alignment: 512B
notes_110 = Max Clock Rate: 1530 MHz
notes_115 = Execution Timeout: No
notes_120 = Integrated Device: No
notes_125 = Can Map Host Memory: Yes
notes_130 = Compute Mode: default
notes_135 = Concurrent Kernels: Yes
notes_140 = ECC Enabled: Yes
notes_145 = Memory Clock Rate: 877 MHz
notes_150 = Memory Bus Width: 4096 bits
notes_155 = L2 Cache Size: 6291456 bytes
notes_160 = Max Threads Per SMP: 2048
notes_165 = Async Engines: 4
notes_170 = Unified Addressing: Yes
notes_175 = Managed Memory: Yes
notes_180 = Concurrent Managed Memory: Yes
notes_185 = Preemption Supported: Yes
notes_190 = Cooperative Launch: Yes
notes_195 = Multi-Device: Yes
notes_200 = Default Target: cc70
notes_205 =

#######################################################################
# End of SUT section
#######################################################################

#######################################################################
# The header section of the config file.  Must appear
# before any instances of "section markers" (see below)
#
# ext = how the binaries you generated will be identified
# tune = specify "base" or "peak" or "all"
label         = %{label}_%{pmodel}
tune          = base
output_format = text
use_submit_for_speed = 1
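
# Worked example of the macro expansion above (illustrative only): the
# invocation recorded at the top of this file passes "--define pmodel=acc"
# and leaves "label" at its default "nv", so the label line above expands to
# "label = nv_acc" and the OpenACC binaries are built and selected under
# that extension.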

# Compiler Settings
default:
CC  = mpicc
CXX = mpic++
FC  = mpif90

# Compiler Version Flags
CC_VERSION_OPTION  = --version
CXX_VERSION_OPTION = --version
FC_VERSION_OPTION  = --version

# MPI options and binding environment, dependent upon the model being run
# Adjust to match your system

# OpenMP (CPU) Settings
%if %{pmodel} eq 'omp'
preENV_OMP_PLACES=cores
#preENV_OMP_PROC_BIND=true
#preENV_OMP_PLACES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
#preENV_OMP_PLACES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24
%endif

# OpenMP Targeting Host Settings
%if %{pmodel} eq 'tgt'
#preENV_OMP_PROC_BIND=true
preENV_MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
preENV_MPICH_GPU_SUPPORT_ENABLED=1
preENV_MPICH_SMP_SINGLE_COPY_MODE=CMA
preENV_MPICH_GPU_EAGER_DEVICE_MEM=0
#preENV_OMP_PLACES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
#preENV_OMP_PLACES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24
%endif

%ifdef %{ucx}
# If using Open MPI with UCX support, these settings are needed for CUDA-aware MPI.
# Without these flags, LBM is known to hang when using OpenACC and OpenMP Target offload to GPUs.
preENV_UCX_MEMTYPE_CACHE=n
preENV_UCX_TLS=self,shm,cuda_copy
%endif

#MPIRUN_OPTS = --bind-to none -q
# 1 GPU per resource set (RS), 7 cores per RS, 1 MPI task per RS, 6 RS per host
submit = srun ${MPIRUN_OPTS} $command

#######################################################################
# Optimization
# Note that SPEC baseline rules require that all uses of a given compiler
# use the same flags in the same order. See the SPEChpc Run Rules
# for more details:
# http://www.spec.org/hpc2021/Docs/runrules.html
#
# OPTIMIZE    = flags applicable to all compilers
# FOPTIMIZE   = flags applicable to the Fortran compiler
# COPTIMIZE   = flags applicable to the C compiler
# CXXOPTIMIZE = flags applicable to the C++ compiler
#
# See your compiler manual for information on the flags available
# for your compiler.

# Compiler flags applied to all models
default=base=default:
OPTIMIZE    = -O3
COPTIMIZE   = -lm       # use -mcpu=native for ARM
CXXOPTIMIZE = -std=c++11
#FOPTIMIZE   = -ffree-line-length-none -fno-stack-protector
FOPTIMIZE   =

%if %{pmodel} eq 'mpi'
pmodel=MPI
%endif

# OpenACC flags
%if %{pmodel} eq 'acc'
# Use with the NVHPC (PGI) compilers only
# https://docs.nvidia.com/hpc-sdk/archive/21.5/
pmodel=ACC
#OPTIMIZE += -acc=gpu
#OPTIMIZE += -acc -ta=tesla
OPTIMIZE += -acc -ta=tesla -DSPEC_ACCEL_AWARE_MPI
#-Minfo=accel
%endif
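
# A commented-out sketch (assumption, not part of the flag set used for this
# result): with the newer NVHPC flag spelling referenced above, an equivalent
# OpenACC build targeting the V100 (compute capability cc70, see the notes
# section) could be requested as
#   OPTIMIZE += -acc=gpu -gpu=cc70 -DSPEC_ACCEL_AWARE_MPI -Minfo=accel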

# Note that NVHPC is in the process of adding OpenMP array
# reduction support, so this option may be removed in the future.
# Reduction is not supported on taurusml due to the old driver.
513.soma_t: PORTABILITY += -DSPEC_NO_VAR_ARRAY_REDUCE
513.soma_s: PORTABILITY += -DSPEC_NO_VAR_ARRAY_REDUCE

# OpenMP (CPU) flags
%if %{pmodel} eq 'omp'
pmodel=OMP
#OPTIMIZE += -qsmp=omp
OPTIMIZE += -fopenmp
#FOPTIMIZE +=
%endif

# OpenMP Targeting Host flags
%if %{pmodel} eq 'tgt'
pmodel=TGT
# PGI
OPTIMIZE += -mp -acc=multicore
# Intel??
# OPTIMIZE += -qsmp=omp -qoffload
# -fopen-simd
# GCC (doesn't recognize its own flags)
#OPTIMIZE += -fopenmp
#OPTIMIZE += -fopenmp -mgomp
#OPTIMIZE += -fopenmp -msoft-stack -muniform-simt
#FOPTIMIZE += -homp
%endif

# OpenMP Targeting NVIDIA GPU flags
%if %{pmodel} eq 'tgtnv'
pmodel=TGT
# PGI
OPTIMIZE += -mp=gpu -acc
#FOPTIMIZE += -homp
%endif

# No peak flags set, so make peak use the same flags as base
default=peak=default:
basepeak=1

#######################################################################
# Portability
#######################################################################

# The following section was added automatically, and contains settings that
# did not appear in the original configuration file, but were added to the
# raw file after the run.
default:
flagsurl000 = http://www.spec.org/hpc2021/flags/nv2021_flags.xml
interconnect_ib_hw_switch_ib_model000 = Mellanox IB EDR Switch IB-2
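
# Usage sketch (illustrative; it simply mirrors the invocation recorded at the
# top of this file): the OpenACC binaries can be rebuilt and the reportable
# run repeated with
#   runhpc --config nvhpc_ppc --define pmodel=acc --action build tiny
#   runhpc --config nvhpc_ppc --define pmodel=acc --action run --nobuild --ranks=6 --reportable tiny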