# Invocation command line:
# runhpc --config nvhpc_alpha.cfg --ranks 8 --rebuild --define pmodel=acc --noreportable --tune=base --iterations=1 small
#######################################################################
# Example configuration file for the NVHPC compilers
#
# Defines: "pmodel" => "mpi", "acc", "omp", "tgt", "tgtnv"  default "mpi"
#          "label"  => ext base label, default "nv"
#
# MPI-only Command:
# runhpc -c Example_gnu --reportable -T base --define pmodel=mpi --ranks=40 small
#
# OpenACC Command:
# runhpc -c Example_gnu --reportable -T base --define pmodel=acc --ranks=4 small
#
# OpenMP Command:
# runhpc -c Example_gnu --reportable -T base --define pmodel=omp --ranks=1 --threads=40 small
#
# OpenMP Target Offload to Host Command:
# runhpc -c Example_gnu --reportable -T base --define pmodel=tgt --ranks=1 --threads=40 small
#
# OpenMP Target Offload to NVIDIA GPU Command:
# runhpc -c Example_gnu --reportable -T base --define pmodel=tgtnv --ranks=4 small
#
#######################################################################

%ifndef %{label}          # IF label is not set, use nv
%   define label nv
%endif

%ifndef %{pmodel}         # IF pmodel is not set, use mpi
%   define pmodel mpi
%endif

teeout = yes
makeflags = -j 40

# Tester Information
license_num = 37A
test_sponsor = TU Dresden
tester = TU Dresden

#######################################################################
# SUT Section
#######################################################################
# General SUT info
system_vendor = AMD
system_name = Alpha Centauri: AMD EPYC 7352 (AMD x86_64, NVIDIA A100-SXM4-40GB)
hw_avail = Jan-2019
sw_avail = Aug-2022

#[Node_Description: Hardware]
node_compute_syslbl = AMD Rome
node_compute_order = 1
node_compute_count = 34
node_compute_purpose = compute
node_compute_hw_vendor = AMD
node_compute_hw_model = AMD K17 (Zen2)
node_compute_hw_cpu_name = AMD EPYC 7352
node_compute_hw_ncpuorder = 2 chips
node_compute_hw_nchips = 2
node_compute_hw_ncores = 96
node_compute_hw_ncoresperchip = 48
node_compute_hw_nthreadspercore = 2
node_compute_hw_cpu_char = Up to 2.3 GHz
node_compute_hw_cpu_mhz = 2100
node_compute_hw_pcache = 32 KB I + 32 KB D on chip per core
node_compute_hw_scache = 512 KB I+D on chip per core
node_compute_hw_tcache000 = 16384 KB I+D on chip per chip
node_compute_hw_ocache = None
node_compute_hw_memory = 1 TB
node_compute_hw_disk000 = 3.5 TB
node_compute_hw_disk001 = NVMe SSD Controller SM981/PM981/PM983
node_compute_hw_adapter_ib_model = Mellanox ConnectX-6
node_compute_hw_adapter_ib_interconnect = EDR InfiniBand
node_compute_hw_adapter_ib_firmware = 20.28.2006
node_compute_hw_adapter_ib_driver = mlx5_core
node_compute_hw_adapter_ib_data_rate = 200 Gb/s
node_compute_hw_adapter_ib_count = 2
node_compute_hw_adapter_ib_slot_type = PCIe
node_compute_hw_adapter_ib_ports_used = 2
node_compute_hw_other = None

#[Node_Description: Accelerator]
node_compute_hw_accel_model = Tesla A100-SXM4-40GB
node_compute_hw_accel_count = 8
node_compute_hw_accel_vendor = NVIDIA Corporation
node_compute_sw_accel_driver = NVIDIA CUDA 470.57.02
node_compute_hw_accel_type = GPU
node_compute_hw_accel_connect = ASPEED Technology, Inc. (rev 04)
node_compute_hw_accel_ecc = Yes
node_compute_hw_accel_desc = none
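# One way to cross-check the accelerator fields above on a compute node is the
# stock nvidia-smi query interface (shown here only as an illustrative sketch):
#   nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv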
#[Node_Description: Software]
node_compute_sw_os000 = CentOS Linux
node_compute_sw_os001 = 7
node_compute_sw_localfile = xfs
node_compute_sw_sharedfile000 = 4 PB Lustre parallel filesystem
node_compute_sw_sharedfile001 = over 4X EDR InfiniBand
node_compute_sw_state = Multi-user
node_compute_sw_other = None

#[Fileserver]

#[Interconnect]
interconnect_ib_syslbl = Mellanox InfiniBand
interconnect_ib_purpose = MPI Traffic and GPFS access
interconnect_ib_order = 1
interconnect_ib_hw_vendor = Mellanox
interconnect_ib_hw_topo = Non-blocking Fat-tree
#interconnect_ib_hw_switch_ib_count = 2
#interconnect_ib_hw_switch_ib_ports = 2
#interconnect_ib_hw_switch_ib_data_rate = 100 Gb/s
#interconnect_ib_hw_switch_ib_model = Mellanox Switch IB-2

#[Software]
sw_compiler000 = C/C++/Fortran: Version 21.7 of the
sw_compiler001 = NVHPC toolkit
sw_mpi_library = Open MPI Version 4.1.1
sw_mpi_other = None
system_class = Homogeneous Cluster
sw_other = CUDA Driver Version: 11.4.2

#######################################################################
# End of SUT section
#######################################################################

#######################################################################
# The header section of the config file. Must appear
# before any instances of "section markers" (see below)
#
# ext = how the binaries you generated will be identified
# tune = specify "base" or "peak" or "all"
label = %{label}_%{pmodel}
tune = base
output_format = text
use_submit_for_speed = 1

# Compiler Settings
default:

CC = mpicc
CXX = mpicxx
FC = mpif90

# Compiler Version Flags
CC_VERSION_OPTION = --version
CXX_VERSION_OPTION = --version
FC_VERSION_OPTION = --version

# MPI options and binding environment, dependent upon the model being run.
# Adjust to match your system.

# OpenMP (CPU) Settings
%if %{pmodel} eq 'omp'
preENV_OMP_PLACES=cores
#preENV_OMP_PROC_BIND=true
#preENV_OMP_PLACES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
#preENV_OMP_PLACES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24
%endif

# OpenMP Targeting Host Settings
%if %{pmodel} eq 'tgt'
#preENV_OMP_PROC_BIND=true
preENV_MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
preEnv_MPICH_GPU_SUPPORT_ENABLED=1
preEnv_MPICH_SMP_SINGLE_COPY_MODE=CMA
preEnv_MPICH_GPU_EAGER_DEVICE_MEM=0
#preENV_OMP_PLACES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
#preENV_OMP_PLACES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24
%endif

%ifdef %{ucx}
# If using Open MPI with UCX support, these settings are needed with CUDA-aware MPI.
# Without these flags, LBM is known to hang when using OpenACC and OpenMP Target to GPUs.
preENV_UCX_MEMTYPE_CACHE=n
preENV_UCX_TLS=self,shm,cuda_copy
%endif

#MPIRUN_OPTS = --bind-to none -q
#submit = mpirun ${MPIRUN_OPTS} -n $ranks $command
submit = srun $command
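# A minimal sketch (not used for this result) of an alternative submit that pins
# one GPU per rank under Slurm, assuming a small, hypothetical wrapper script
# named bind.sh in the run directory:
#   bind.sh:
#     #!/bin/bash
#     export CUDA_VISIBLE_DEVICES=$SLURM_LOCALID   # map local rank id to a device
#     exec "$@"
#   submit = srun ./bind.sh $command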
#######################################################################
# Optimization
# Note that SPEC baseline rules require that all uses of a given compiler
# use the same flags in the same order. See the SPEChpc Run Rules
# for more details:
#   http://www.spec.org/hpc2021/Docs/runrules.html
#
# OPTIMIZE    = flags applicable to all compilers
# FOPTIMIZE   = flags applicable to the Fortran compiler
# COPTIMIZE   = flags applicable to the C compiler
# CXXOPTIMIZE = flags applicable to the C++ compiler
#
# See your compiler manual for information on the flags available
# for your compiler

# Compiler flags applied to all models
default=base=default:
#OPTIMIZE = -w -Mfprelaxed -Mnouniform -Mstack_arrays -fast
OPTIMIZE = -w -O3 -Mfprelaxed -Mnouniform -Mstack_arrays
COPTIMIZE = -lm          # use -mcpu=native for ARM
CXXOPTIMIZE = -std=c++11
CXXPORTABILITY = --c++17

# ARM
%if %{armOn} eq 'arm'
COPTIMIZE += -mcpu=native
#OPTIMIZE += -mcpu=a64fx
%endif

# SVE
%if %{sveOn} eq 'sve'
COPTIMIZE += -march=armv8-a+sve
%endif

%if %{pmodel} eq 'mpi'
pmodel=MPI
%endif

# OpenACC flags
%if %{pmodel} eq 'acc'
pmodel=ACC
# Use with the PGI/NVHPC compiler only
# https://docs.nvidia.com/hpc-sdk/archive/21.7/
#OPTIMIZE += -acc=gpu
OPTIMIZE += -acc -ta=tesla -tp=zen   #-Minfo=accel
#-DSPEC_ACCEL_AWARE_MPI -> hangs it forever
# 513.soma_t:
#   PORTABILITY += -DSPEC_NO_VAR_ARRAY_REDUCE
%endif

# OpenMP (CPU) flags
%if %{pmodel} eq 'omp'
pmodel=OMP
#OPTIMIZE += -qsmp=omp
OPTIMIZE += -fopenmp
#FOPTIMIZE +=
%endif

# OpenMP Targeting Host flags
%if %{pmodel} eq 'tgt'
pmodel=TGT
# PGI
OPTIMIZE += -mp -acc=multicore
# Intel??
# OPTIMIZE += -qsmp=omp -qoffload
# -fopen-simd
# GCC (doesn't recognize its own flags)
#OPTIMIZE += -fopenmp
#OPTIMIZE += -fopenmp -mgomp
#OPTIMIZE += -fopenmp -msoft-stack -muniform-simt
#FOPTIMIZE += -homp
%endif

# OpenMP Targeting NVIDIA GPU flags
%if %{pmodel} eq 'tgtnv'
pmodel=TGT
# PGI
OPTIMIZE += -mp=gpu -acc
#FOPTIMIZE += -homp
# Note that NVHPC is in the process of adding OpenMP array
# reduction support, so this option may be removed in the future.
513.soma_t:
PORTABILITY += -DSPEC_NO_VAR_ARRAY_REDUCE
%endif

# No peak flags set, so make peak use the same flags as base
default=peak=default:
basepeak=1

#######################################################################
# Portability
#######################################################################

# The following section was added automatically, and contains settings that
# did not appear in the original configuration file, but were added to the
# raw file after the run.
default:
flagsurl000 = http://www.spec.org/hpc2021/flags/nv2021_flags.xml
interconnect_ib_hw_switch_ib_model000 = Mellanox IB EDR Switch IB-2
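# Example invocation that also enables the optional UCX workaround guarded by
# %ifdef %{ucx} above (illustrative only; adjust the rank count to the node):
#   runhpc --config nvhpc_alpha.cfg --define pmodel=acc --define ucx --ranks 8 --tune=base small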