Peter Heywood, Research Software Engineer
The University of Sheffield
2024-03-27
Profile
gprof, perf, Kcachegrind, VTune, …
roctracer
rocsys
rocprofv2
The Celeritas project implements HEP detector physics on GPU accelerator hardware with the ultimate goal of supporting the massive computational requirements of the HL-LHC upgrade.
$ ctest --rerun-failed --output-on-failure
# ...
1/2 Test #158: celeritas/mat/Material ...........***Failed
Error regular expression found in output. Regex=[tests FAILED] 0.68 sec
# ...
2/2 Test #160: celeritas/phys/Particle .......... Passed 0.61 sec
50% tests passed, 1 tests failed out of 2
JSON Comparison | mass_radiation_coeff |
---|---|
Expected | 0.03605392839455309 |
Actual | 0.0360539283945531 |
-DCMAKE_BUILD_TYPE=Release
, -O3
-lineinfo
v0.4.2
with VecGeom v1.2.4
cms2018+field+msc
from celeritas-project/regression
celer-sim
16 events
1300 primaries per event
1048576 track slots (max threads)
{
"_exe": "celer-sim",
"_format": "celer-sim",
"_geometry": "vecgeom",
"_instance": 0,
"_name": [
"cms2018+field+msc",
"vecgeom",
"gpu"
],
"_outdir": "cms2018+field+msc-vecgeom-gpu",
"_timeout": 600.0,
"_use_celeritas": true,
"_version": "0.4.2",
"action_diagnostic": false,
"brem_combined": false,
"cuda_heap_size": null,
"cuda_stack_size": 8192,
"default_stream": false,
"environ": {},
"event_file": null,
"field": [
0.0,
0.0,
1.0
],
"field_options": {
"delta_chord": 0.025,
"delta_intersection": 1e-05,
"epsilon_rel_max": 0.001,
"epsilon_step": 1e-05,
"errcon": 0.0001,
"max_nsteps": 100,
"max_stepping_decrease": 0.1,
"max_stepping_increase": 5.0,
"minimum_step": 1.0000000000000002e-06,
"pgrow": -0.2,
"pshrink": -0.25,
"safety": 0.9
},
"geometry_file": "/path/to/cms2018.gdml",
"initializer_capacity": 67108864,
"max_events": 16,
"max_steps": 32768,
"mctruth_file": null,
"mctruth_filter": null,
"merge_events": true,
"num_track_slots": 1048576,
"physics_file": "",
"physics_options": {
"annihilation": true,
"apply_cuts": false,
"brems": "all",
"compton_scattering": true,
"coulomb_scattering": false,
"default_cutoff": 0.1,
"eloss_fluctuation": true,
"em_bins_per_decade": 56,
"gamma_conversion": true,
"gamma_general": false,
"integral_approach": true,
"ionization": true,
"linear_loss_limit": 0.01,
"lowest_electron_energy": [
0.001,
"MeV"
],
"lpm": true,
"max_energy": [
100000000.0,
"MeV"
],
"min_energy": [
0.0001,
"MeV"
],
"msc": "urban",
"msc_lambda_limit": 0.1,
"msc_range_factor": 0.04,
"msc_safety_factor": 0.6,
"photoelectric": true,
"rayleigh_scattering": false,
"relaxation": "none",
"verbose": false
},
"primary_options": {
"direction": {
"distribution": "isotropic",
"params": []
},
"energy": {
"distribution": "delta",
"params": [
10000.0
]
},
"num_events": 16,
"pdg": [
11
],
"position": {
"distribution": "delta",
"params": [
0.0,
0.0,
0.0
]
},
"primaries_per_event": 1300,
"seed": 0
},
"secondary_stack_factor": 3.0,
"seed": 20220904,
"simple_calo": [],
"step_diagnostic": false,
"step_diagnostic_bins": null,
"step_limiter": null,
"sync": false,
"track_order": "unsorted",
"use_device": true,
"warm_up": true,
"write_track_counts": true
}
nvml
to monitor GPU resource consumption
CELER_ENABLE_PROFILING=1
nsys: Timeline
nsys: Host-Device Communication
nsys: Host-Device Communication
242MB, 690μs @ 328GB/s
nsys: Longest Duration Kernel
-lineinfo
for line-level profiling
--set=full
for non-interactive profiling
-s, -c etc.
ncu: Summary
ncu: “Speed of Light”
ncu: Scheduler
ncu: Warp state
ncu: Occupancy
ncu: Performance Monitor Sampling
2023.3 (distributed with CUDA 12.3)
ncu: Memory Access Pattern
UKRI “Shaping the Future of UK large-scale compute” survey
closes 29 March 2024 (this Friday!)
https://engagementhub.ukri.org/ukri-infrastructure/shaping-the-future-of-uk-large-scale-compute/
-Wno-psabi
aarch64
include/VecGeom/base/Transformation3D.h: In member function
‘vecgeom::cxx::Vector3D<double>
vecgeom::cxx::Transformation3D::Translation() const’:
include/VecGeom/base/Transformation3D.h:213:3: note: parameter
passing for argument of type ‘vecgeom::cxx::Vector3D<double>’
when C++17 is enabled changed to match C++14 in GCC 10.1
213 | {
| ^
$ ctest
# ...
99% tests passed, 2 tests failed out of 203
Label Time Summary:
app = 108.80 sec*proc (11 tests)
gpu = 101.33 sec*proc (43 tests)
nomemcheck = 107.88 sec*proc (9 tests)
unit = 33.99 sec*proc (191 tests)
Total Test time (real) = 140.78 sec
The following tests FAILED:
158 - celeritas/mat/Material (Failed)
160 - celeritas/phys/Particle (SEGFAULT)
$ ctest --rerun-failed --output-on-failure
# ...
1/2 Test #158: celeritas/mat/Material ...........***Failed
Error regular expression found in output. Regex=[tests FAILED] 0.68 sec
# ...
2/2 Test #160: celeritas/phys/Particle .......... Passed 0.61 sec
50% tests passed, 1 tests failed out of 2
The following tests FAILED:
158 - celeritas/mat/Material (Failed)
JSON Comparison | mass_radiation_coeff |
---|---|
Expected | 0.03605392839455309 |
Actual | 0.0360539283945531 |
$ time ./bin/celer-sim cms2018+field+msc.json
status: Loading input and initializing problem data
status: Initializing Geant4 run manager
status: Initializing Geant4 geometry
info: Loading Geant4 geometry from GDML at /path/to/cms2018.gdml
status: Building Geant4 physics tables
status: Transferring data from Geant4
status: Loading external elemental data
status: Loading VecGeom geometry from GDML at /path/to/cms2018.gdml
status: Initializing tracking information
celeritas/src/celeritas/geo/GeoMaterialParams.cc:205: warning: Some geometry volumes do not have known material IDs: PixelForwardInnerDiskOuterRing_seg_1@0x7f4a9a837fc0,
# ...
real 1m13.997s
user 1m7.096s
sys 0m0.871s
nsys: Timeline
nsys: Host-Device Communication
ncu: ERR_NVGPUCTRPERM
ncu: Summary along-step
ncu: Compute
ncu: Instructions
ncu: Memory Access Pattern
ncu: Launch Statistics
ncu: Source
Profiling Celeritas - GridPP51 & SWIFT-HEP07