# Invocation command line:
# runhpc --config nvhpc_ppc --define pmodel=acc --action run --nobuild --ranks=6 --reportable tiny
# output_root was not used for this run
#######################################################################
# Configuration file for the NVIDIA HPC (NVHPC) compilers,
# adapted from the SPEChpc example configuration for the GNU compilers
#
# Defines: "pmodel" => "mpi", "acc", "omp", "tgt", "tgtnv"; default "mpi"
#          "label"  => ext base label; default "nv"
#
# MPI-only Command:
# runhpc -c nvhpc_ppc --reportable -T base --define pmodel=mpi --ranks=40 small
#
# OpenACC Command:
# runhpc -c nvhpc_ppc --reportable -T base --define pmodel=acc --ranks=4 small
#
# OpenMP Command:
# runhpc -c nvhpc_ppc --reportable -T base --define pmodel=omp --ranks=1 --threads=40 small
#
# OpenMP Target Offload to Host Command:
# runhpc -c nvhpc_ppc --reportable -T base --define pmodel=tgt --ranks=1 --threads=40 small
#
# OpenMP Target Offload to NVIDIA GPU Command:
# runhpc -c nvhpc_ppc --reportable -T base --define pmodel=tgtnv --ranks=4 small
#
#######################################################################

%ifndef %{label}          # IF label is not set, use default "nv"
% define label nv
%endif

%ifndef %{pmodel}         # IF pmodel is not set, use default "mpi"
% define pmodel mpi
%endif

teeout = yes
makeflags = -j 40

# Tester Information
license_num  = 37A
test_sponsor = TU Dresden
tester       = TU Dresden

#######################################################################
# SUT Section
#######################################################################
#include: Example_SUT.inc
#  ----- Begin inclusion of 'Example_SUT.inc'
#######################################################################

# General SUT info
system_vendor = IBM
system_name   = Taurus: IBM Power System AC922 (IBM Power9, Tesla V100-SXM2-32GB)

node_compute_sw_accel_driver = NVIDIA CUDA 440.64.00
node_compute_hw_adapter_ib_slot_type = None
node_compute_hw_adapter_ib_ports_used = 2
node_compute_hw_adapter_ib_model = Mellanox ConnectX-5
node_compute_hw_adapter_ib_interconnect = EDR InfiniBand
node_compute_hw_adapter_ib_firmware = 16.27.6008
node_compute_hw_adapter_ib_driver = mlx5_core
node_compute_hw_adapter_ib_data_rate = 100 Gb/s (4X EDR)
node_compute_hw_adapter_ib_count = 2

interconnect_ib_syslbl = Mellanox InfiniBand
interconnect_ib_purpose = MPI Traffic and GPFS access
interconnect_ib_order = 1
#interconnect_ib_hw_vendor = Mellanox
#interconnect_ib_hw_topo = Non-blocking Fat-tree
#interconnect_ib_hw_switch_ib_ports = 36
#interconnect_ib_hw_switch_ib_data_rate = 100 Gb/s
#interconnect_ib_hw_switch_ib_count = 1
#interconnect_ib_hw_model = Mellanox Switch IB-2

hw_avail = Nov-2018
sw_avail = Nov-2021
prepared_by = Noah Trumpik (Noah.Trumpik@tu-dresden.de)

#[Node_Description: Hardware]
node_compute_syslbl = IBM Power System AC922
node_compute_order = 1
node_compute_count = 30
node_compute_purpose = compute
node_compute_hw_vendor = IBM
node_compute_hw_model = IBM Power System AC922
node_compute_hw_cpu_name = IBM POWER9 2.2 (pvr 004e 1202)
node_compute_hw_ncpuorder = 2 chips
node_compute_hw_nchips = 2
node_compute_hw_ncores = 44
node_compute_hw_ncoresperchip = 22
node_compute_hw_nthreadspercore = 4
node_compute_hw_cpu_char = Up to 3.8 GHz
node_compute_hw_cpu_mhz = 2300
node_compute_hw_pcache = 32 KB I + 32 KB D on chip per core
node_compute_hw_scache = 512 KB I+D on chip per core
node_compute_hw_tcache000 = 10240 KB I+D on chip per chip
node_compute_hw_ocache = None
node_compute_hw_memory = 256 GB (16 x 16 GB RDIMM-DDR4-2666)
node_compute_hw_disk000 = 2 x 1 TB (ATA Rev BE35)
node_compute_hw_disk001 = NVMe SSD Controller 172Xa/172Xb
node_compute_hw_other = None

#[Node_Description: Accelerator]
node_compute_hw_accel_model = Tesla V100-SXM2-32GB
node_compute_hw_accel_count = 6
node_compute_hw_accel_vendor = NVIDIA Corporation
node_compute_hw_accel_type = GPU
node_compute_hw_accel_connect = NVLINK
node_compute_hw_accel_ecc = Yes
node_compute_hw_accel_desc = See Notes

#[Node_Description: Software]
node_compute_sw_os000 = Red Hat Enterprise Linux
node_compute_sw_os001 = 7.6
node_compute_sw_localfile = xfs
node_compute_sw_sharedfile = 4 PB Lustre parallel filesystem
node_compute_sw_state = Multi-user
node_compute_sw_other = None

#[Fileserver]

#[Interconnect]

#[Software]
sw_compiler000 = C/C++/Fortran: Version 21.5 of the
sw_compiler001 = NVHPC toolkit
sw_mpi_library = Open MPI Version 4.1.2
sw_mpi_other = None
system_class = Homogenous Cluster
sw_other = None

#[General notes]
notes_000 = MPI startup command:
notes_005 =   srun command was used to launch job using 1 GPU/rank.
notes_010 = Detailed information from nvaccelinfo
notes_015 =
notes_020 = CUDA Driver Version: 11000
notes_025 = NVRM version: NVIDIA UNIX ppc64le Kernel Module 440.64.00 Wed Feb 26 16:01:28 UTC 2020
notes_030 =
notes_035 = Device Number: 0
notes_040 = Device Name: Tesla V100-SXM2-32GB
notes_045 = Device Revision Number: 7.0
notes_050 = Global Memory Size: 33822867456
notes_055 = Number of Multiprocessors: 80
notes_060 = Concurrent Copy and Execution: Yes
notes_065 = Total Constant Memory: 65536
notes_070 = Total Shared Memory per Block: 49152
notes_075 = Registers per Block: 65536
notes_080 = Warp Size: 32
notes_085 = Maximum Threads per Block: 1024
notes_090 = Maximum Block Dimensions: 1024, 1024, 64
notes_095 = Maximum Grid Dimensions: 2147483647 x 65535 x 65535
notes_100 = Maximum Memory Pitch: 2147483647B
notes_105 = Texture Alignment: 512B
notes_110 = Max Clock Rate: 1530 MHz
notes_115 = Execution Timeout: No
notes_120 = Integrated Device: No
notes_125 = Can Map Host Memory: Yes
notes_130 = Compute Mode: default
notes_135 = Concurrent Kernels: Yes
notes_140 = ECC Enabled: Yes
notes_145 = Memory Clock Rate: 877 MHz
notes_150 = Memory Bus Width: 4096 bits
notes_155 = L2 Cache Size: 6291456 bytes
notes_160 = Max Threads Per SMP: 2048
notes_165 = Async Engines: 4
notes_170 = Unified Addressing: Yes
notes_175 = Managed Memory: Yes
notes_180 = Concurrent Managed Memory: Yes
notes_185 = Preemption Supported: Yes
notes_190 = Cooperative Launch: Yes
notes_195 = Multi-Device: Yes
notes_200 = Default Target: cc70
notes_205 =

#######################################################################
# End of SUT section
#######################################################################

#######################################################################
# The header section of the config file.  Must appear
# before any instances of "section markers" (see below)
#
# ext = how the binaries you generated will be identified
# tune = specify "base" or "peak" or "all"
label         = %{label}_%{pmodel}
tune          = base
output_format = text
use_submit_for_speed = 1
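
# Worked example of the macro expansion above (illustrative only): the
# invocation recorded at the top of this file passes "--define pmodel=acc"
# and leaves "label" at its default "nv", so the label line above expands to
# "label = nv_acc" and the OpenACC binaries are built and selected under
# that extension.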

# Compiler Settings
default:
CC  = mpicc
CXX = mpic++
FC  = mpif90

# Compiler Version Flags
CC_VERSION_OPTION  = --version
CXX_VERSION_OPTION = --version
FC_VERSION_OPTION  = --version

# MPI options and binding environment, dependent upon the model being run
# Adjust to match your system

# OpenMP (CPU) Settings
%if %{pmodel} eq 'omp'
preENV_OMP_PLACES=cores
#preENV_OMP_PROC_BIND=true
#preENV_OMP_PLACES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
#preENV_OMP_PLACES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24
%endif

# OpenMP Targeting Host Settings
%if %{pmodel} eq 'tgt'
#preENV_OMP_PROC_BIND=true
preENV_MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
preENV_MPICH_GPU_SUPPORT_ENABLED=1
preENV_MPICH_SMP_SINGLE_COPY_MODE=CMA
preENV_MPICH_GPU_EAGER_DEVICE_MEM=0
#preENV_OMP_PLACES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
#preENV_OMP_PLACES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24
%endif

%ifdef %{ucx}
# If using Open MPI with UCX support, these settings are needed for CUDA-aware MPI.
# Without these flags, LBM is known to hang when using OpenACC and OpenMP Target offload to GPUs.
preENV_UCX_MEMTYPE_CACHE=n
preENV_UCX_TLS=self,shm,cuda_copy
%endif

#MPIRUN_OPTS = --bind-to none -q
# 1 GPU per resource set (RS), 7 cores per RS, 1 MPI task per RS, 6 RS per host
submit = srun ${MPIRUN_OPTS} $command

#######################################################################
# Optimization
# Note that SPEC baseline rules require that all uses of a given compiler
# use the same flags in the same order. See the SPEChpc Run Rules
# for more details:
# http://www.spec.org/hpc2021/Docs/runrules.html
#
# OPTIMIZE    = flags applicable to all compilers
# FOPTIMIZE   = flags applicable to the Fortran compiler
# COPTIMIZE   = flags applicable to the C compiler
# CXXOPTIMIZE = flags applicable to the C++ compiler
#
# See your compiler manual for information on the flags available
# for your compiler.

# Compiler flags applied to all models
default=base=default:
OPTIMIZE    = -O3
COPTIMIZE   = -lm       # use -mcpu=native for ARM
CXXOPTIMIZE = -std=c++11
#FOPTIMIZE   = -ffree-line-length-none -fno-stack-protector
FOPTIMIZE   =

%if %{pmodel} eq 'mpi'
pmodel=MPI
%endif

# OpenACC flags
%if %{pmodel} eq 'acc'
# Use with the NVHPC (PGI) compilers only
# https://docs.nvidia.com/hpc-sdk/archive/21.5/
pmodel=ACC
#OPTIMIZE += -acc=gpu
#OPTIMIZE += -acc -ta=tesla
OPTIMIZE += -acc -ta=tesla -DSPEC_ACCEL_AWARE_MPI
#-Minfo=accel
%endif
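
# A commented-out sketch (assumption, not part of the flag set used for this
# result): with the newer NVHPC flag spelling referenced above, an equivalent
# OpenACC build targeting the V100 (compute capability cc70, see the notes
# section) could be requested as
#   OPTIMIZE += -acc=gpu -gpu=cc70 -DSPEC_ACCEL_AWARE_MPI -Minfo=accel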

# Note that NVHPC is in the process of adding OpenMP array
# reduction support, so this option may be removed in the future.
# Reduction is not supported on taurusml due to the old driver.
513.soma_t: PORTABILITY += -DSPEC_NO_VAR_ARRAY_REDUCE
513.soma_s: PORTABILITY += -DSPEC_NO_VAR_ARRAY_REDUCE

# OpenMP (CPU) flags
%if %{pmodel} eq 'omp'
pmodel=OMP
#OPTIMIZE += -qsmp=omp
OPTIMIZE += -fopenmp
#FOPTIMIZE +=
%endif

# OpenMP Targeting Host flags
%if %{pmodel} eq 'tgt'
pmodel=TGT
# PGI
OPTIMIZE += -mp -acc=multicore
# Intel??
# OPTIMIZE += -qsmp=omp -qoffload
# -fopen-simd
# GCC (doesn't recognize its own flags)
#OPTIMIZE += -fopenmp
#OPTIMIZE += -fopenmp -mgomp
#OPTIMIZE += -fopenmp -msoft-stack -muniform-simt
#FOPTIMIZE += -homp
%endif

# OpenMP Targeting NVIDIA GPU flags
%if %{pmodel} eq 'tgtnv'
pmodel=TGT
# PGI
OPTIMIZE += -mp=gpu -acc
#FOPTIMIZE += -homp
%endif

# No peak flags set, so make peak use the same flags as base
default=peak=default:
basepeak=1

#######################################################################
# Portability
#######################################################################

# The following section was added automatically, and contains settings that
# did not appear in the original configuration file, but were added to the
# raw file after the run.
default:
flagsurl000 = http://www.spec.org/hpc2021/flags/nv2021_flags.xml
interconnect_ib_hw_switch_ib_model000 = Mellanox IB EDR Switch IB-2
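
# Usage sketch (illustrative; it simply mirrors the invocation recorded at the
# top of this file): the OpenACC binaries can be rebuilt and the reportable
# run repeated with
#   runhpc --config nvhpc_ppc --define pmodel=acc --action build tiny
#   runhpc --config nvhpc_ppc --define pmodel=acc --action run --nobuild --ranks=6 --reportable tiny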