# Invocation command line:
# runhpc --config nvhpc_alpha.cfg --ranks 8 --rebuild --define pmodel=acc --noreportable --tune=base --iterations=1 small
#######################################################################
# Example configuration file for the NVHPC compilers
#
# Defines: "pmodel" => "mpi", "acc", "omp", "tgt", "tgtnv"  default "mpi"
#          "label"  => ext base label, default "nv"
#
# MPI-only Command:
# runhpc -c Example_gnu --reportable -T base --define pmodel=mpi --ranks=40 small
#
# OpenACC Command:
# runhpc -c Example_gnu --reportable -T base --define pmodel=acc --ranks=4 small
#
# OpenMP Command:
# runhpc -c Example_gnu --reportable -T base --define pmodel=omp --ranks=1 --threads=40 small
#
# OpenMP Target Offload to Host Command:
# runhpc -c Example_gnu --reportable -T base --define pmodel=tgt --ranks=1 --threads=40 small
#
# OpenMP Target Offload to NVIDIA GPU Command:
# runhpc -c Example_gnu --reportable -T base --define pmodel=tgtnv --ranks=4 small
#
#######################################################################

%ifndef %{label}          # IF label is not set, use nv
%   define label nv
%endif

%ifndef %{pmodel}         # IF pmodel is not set, use mpi
%   define pmodel mpi
%endif

teeout = yes
makeflags = -j 40

# Tester Information
license_num = 37A
test_sponsor = TU Dresden
tester = TU Dresden

#######################################################################
# SUT Section
#######################################################################
# General SUT info
system_vendor = AMD
system_name = Alpha Centauri: AMD EPYC 7352 (AMD x86_64, NVIDIA A100-SXM4-40GB)
hw_avail = Jan-2019
sw_avail = Aug-2022

#[Node_Description: Hardware]
node_compute_syslbl = AMD Rome
node_compute_order = 1
node_compute_count = 34
node_compute_purpose = compute
node_compute_hw_vendor = AMD
node_compute_hw_model = AMD K17 (Zen2)
node_compute_hw_cpu_name = AMD EPYC 7352
node_compute_hw_ncpuorder = 2 chips
node_compute_hw_nchips = 2
node_compute_hw_ncores = 96
node_compute_hw_ncoresperchip = 48
node_compute_hw_nthreadspercore = 2
node_compute_hw_cpu_char = Up to 2.3 GHz
node_compute_hw_cpu_mhz = 2100
node_compute_hw_pcache = 32 KB I + 32 KB D on chip per core
node_compute_hw_scache = 512 KB I+D on chip per core
node_compute_hw_tcache000 = 16384 KB I+D on chip per chip
node_compute_hw_ocache = None
node_compute_hw_memory = 1 TB
node_compute_hw_disk000 = 3.5 TB
node_compute_hw_disk001 = NVMe SSD Controller SM981/PM981/PM983
node_compute_hw_adapter_ib_model = Mellanox ConnectX-6
node_compute_hw_adapter_ib_interconnect = EDR InfiniBand
node_compute_hw_adapter_ib_firmware = 20.28.2006
node_compute_hw_adapter_ib_driver = mlx5_core
node_compute_hw_adapter_ib_data_rate = 200 Gb/s
node_compute_hw_adapter_ib_count = 2
node_compute_hw_adapter_ib_slot_type = PCIe
node_compute_hw_adapter_ib_ports_used = 2
node_compute_hw_other = None

#[Node_Description: Accelerator]
node_compute_hw_accel_model = Tesla A100-SXM4-40GB
node_compute_hw_accel_count = 8
node_compute_hw_accel_vendor = NVIDIA Corporation
node_compute_sw_accel_driver = NVIDIA CUDA 470.57.02
node_compute_hw_accel_type = GPU
node_compute_hw_accel_connect = ASPEED Technology, Inc. (rev 04)
node_compute_hw_accel_ecc = Yes
node_compute_hw_accel_desc = none
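# One way to cross-check the accelerator fields above on a compute node is the
# stock nvidia-smi query interface (shown here only as an illustrative sketch):
#   nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv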
#[Node_Description: Software]
node_compute_sw_os000 = CentOS Linux
node_compute_sw_os001 = 7
node_compute_sw_localfile = xfs
node_compute_sw_sharedfile000 = 4 PB Lustre parallel filesystem
node_compute_sw_sharedfile001 = over 4X EDR InfiniBand
node_compute_sw_state = Multi-user
node_compute_sw_other = None

#[Fileserver]

#[Interconnect]
interconnect_ib_syslbl = Mellanox InfiniBand
interconnect_ib_purpose = MPI Traffic and GPFS access
interconnect_ib_order = 1
interconnect_ib_hw_vendor = Mellanox
interconnect_ib_hw_topo = Non-blocking Fat-tree
#interconnect_ib_hw_switch_ib_count = 2
#interconnect_ib_hw_switch_ib_ports = 2
#interconnect_ib_hw_switch_ib_data_rate = 100 Gb/s
#interconnect_ib_hw_switch_ib_model = Mellanox Switch IB-2

#[Software]
sw_compiler000 = C/C++/Fortran: Version 21.7 of the
sw_compiler001 = NVHPC toolkit
sw_mpi_library = Open MPI Version 4.1.1
sw_mpi_other = None
system_class = Homogeneous Cluster
sw_other = CUDA Driver Version: 11.4.2

#######################################################################
# End of SUT section
#######################################################################

#######################################################################
# The header section of the config file. Must appear
# before any instances of "section markers" (see below)
#
# ext = how the binaries you generated will be identified
# tune = specify "base" or "peak" or "all"
label = %{label}_%{pmodel}
tune = base
output_format = text
use_submit_for_speed = 1

# Compiler Settings
default:

CC = mpicc
CXX = mpicxx
FC = mpif90

# Compiler Version Flags
CC_VERSION_OPTION = --version
CXX_VERSION_OPTION = --version
FC_VERSION_OPTION = --version

# MPI options and binding environment, dependent upon the model being run.
# Adjust to match your system.

# OpenMP (CPU) Settings
%if %{pmodel} eq 'omp'
preENV_OMP_PLACES=cores
#preENV_OMP_PROC_BIND=true
#preENV_OMP_PLACES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
#preENV_OMP_PLACES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24
%endif

# OpenMP Targeting Host Settings
%if %{pmodel} eq 'tgt'
#preENV_OMP_PROC_BIND=true
preENV_MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
preEnv_MPICH_GPU_SUPPORT_ENABLED=1
preEnv_MPICH_SMP_SINGLE_COPY_MODE=CMA
preEnv_MPICH_GPU_EAGER_DEVICE_MEM=0
#preENV_OMP_PLACES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
#preENV_OMP_PLACES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24
%endif

%ifdef %{ucx}
# If using Open MPI with UCX support, these settings are needed with CUDA-aware MPI.
# Without these flags, LBM is known to hang when using OpenACC and OpenMP Target to GPUs.
preENV_UCX_MEMTYPE_CACHE=n
preENV_UCX_TLS=self,shm,cuda_copy
%endif

#MPIRUN_OPTS = --bind-to none -q
#submit = mpirun ${MPIRUN_OPTS} -n $ranks $command
submit = srun $command
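# A minimal sketch (not used for this result) of an alternative submit that pins
# one GPU per rank under Slurm, assuming a small, hypothetical wrapper script
# named bind.sh in the run directory:
#   bind.sh:
#     #!/bin/bash
#     export CUDA_VISIBLE_DEVICES=$SLURM_LOCALID   # map local rank id to a device
#     exec "$@"
#   submit = srun ./bind.sh $command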
#######################################################################
# Optimization
# Note that SPEC baseline rules require that all uses of a given compiler
# use the same flags in the same order. See the SPEChpc Run Rules
# for more details:
#   http://www.spec.org/hpc2021/Docs/runrules.html
#
# OPTIMIZE    = flags applicable to all compilers
# FOPTIMIZE   = flags applicable to the Fortran compiler
# COPTIMIZE   = flags applicable to the C compiler
# CXXOPTIMIZE = flags applicable to the C++ compiler
#
# See your compiler manual for information on the flags available
# for your compiler

# Compiler flags applied to all models
default=base=default:
#OPTIMIZE = -w -Mfprelaxed -Mnouniform -Mstack_arrays -fast
OPTIMIZE = -w -O3 -Mfprelaxed -Mnouniform -Mstack_arrays
COPTIMIZE = -lm          # use -mcpu=native for ARM
CXXOPTIMIZE = -std=c++11
CXXPORTABILITY = --c++17

# ARM
%if %{armOn} eq 'arm'
COPTIMIZE += -mcpu=native
#OPTIMIZE += -mcpu=a64fx
%endif

# SVE
%if %{sveOn} eq 'sve'
COPTIMIZE += -march=armv8-a+sve
%endif

%if %{pmodel} eq 'mpi'
pmodel=MPI
%endif

# OpenACC flags
%if %{pmodel} eq 'acc'
pmodel=ACC
# Use with the PGI/NVHPC compiler only
# https://docs.nvidia.com/hpc-sdk/archive/21.7/
#OPTIMIZE += -acc=gpu
OPTIMIZE += -acc -ta=tesla -tp=zen   #-Minfo=accel
#-DSPEC_ACCEL_AWARE_MPI -> hangs it forever
# 513.soma_t:
#   PORTABILITY += -DSPEC_NO_VAR_ARRAY_REDUCE
%endif

# OpenMP (CPU) flags
%if %{pmodel} eq 'omp'
pmodel=OMP
#OPTIMIZE += -qsmp=omp
OPTIMIZE += -fopenmp
#FOPTIMIZE +=
%endif

# OpenMP Targeting Host flags
%if %{pmodel} eq 'tgt'
pmodel=TGT
# PGI
OPTIMIZE += -mp -acc=multicore
# Intel??
# OPTIMIZE += -qsmp=omp -qoffload
# -fopen-simd
# GCC (doesn't recognize its own flags)
#OPTIMIZE += -fopenmp
#OPTIMIZE += -fopenmp -mgomp
#OPTIMIZE += -fopenmp -msoft-stack -muniform-simt
#FOPTIMIZE += -homp
%endif

# OpenMP Targeting NVIDIA GPU flags
%if %{pmodel} eq 'tgtnv'
pmodel=TGT
# PGI
OPTIMIZE += -mp=gpu -acc
#FOPTIMIZE += -homp
# Note that NVHPC is in the process of adding OpenMP array
# reduction support, so this option may be removed in the future.
513.soma_t:
PORTABILITY += -DSPEC_NO_VAR_ARRAY_REDUCE
%endif

# No peak flags set, so make peak use the same flags as base
default=peak=default:
basepeak=1

#######################################################################
# Portability
#######################################################################

# The following section was added automatically, and contains settings that
# did not appear in the original configuration file, but were added to the
# raw file after the run.
default:
flagsurl000 = http://www.spec.org/hpc2021/flags/nv2021_flags.xml
interconnect_ib_hw_switch_ib_model000 = Mellanox IB EDR Switch IB-2
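# Example invocation that also enables the optional UCX workaround guarded by
# %ifdef %{ucx} above (illustrative only; adjust the rank count to the node):
#   runhpc --config nvhpc_alpha.cfg --define pmodel=acc --define ucx --ranks 8 --tune=base small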