Skip to content

Commit 751a953

Browse files
authored
Merge pull request #100 from ecmwf-ifs/je-field-api-offload-v2
New Field API Loki variant with state types in driver
2 parents 82fdf4b + b2815a3 commit 751a953

File tree

5 files changed

+336
-1
lines changed

5 files changed

+336
-1
lines changed

bundle.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ projects :
3636
3737
- loki :
3838
git : https://github.com/ecmwf-ifs/loki
39-
version : v0.2.7
39+
version : v0.2.9
4040
require : ecbuild
4141
cmake : >
4242
LOKI_ENABLE_TESTS=OFF

src/cloudsc_loki/CMakeLists.txt

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -623,6 +623,76 @@ if( HAVE_CLOUDSC_LOKI )
623623
)
624624

625625

626+
####################################################
627+
## "Single Column Coalesced Field" (SCC-FIELD) ##
628+
## * Removes horizontal vector loops ##
629+
## * Invokes compute kernel as `!$acc vector` ##
630+
## * Uses Field API for device data offloading ##
631+
####################################################
632+
if ( HAVE_FIELD_API AND field_api_HAVE_ACC )
633+
loki_transform(
634+
COMMAND convert
635+
OUTPUT
636+
loki-scc-field/cloudsc.scc_field.F90
637+
loki-scc-field/cloudsc_driver_field_loki_mod.scc_field.F90
638+
BUILDDIR ${CMAKE_CURRENT_BINARY_DIR}/loki-scc-field
639+
DEPENDS
640+
cloudsc.F90
641+
cloudsc_driver_field_loki_mod.F90
642+
${_OMNI_DEPENDENCIES}
643+
MODE scc-field
644+
CONFIG ${CMAKE_CURRENT_SOURCE_DIR}/cloudsc_loki_field_offload.config
645+
CPP
646+
DEFINITIONS
647+
CLOUDSC_GPU_TIMING
648+
${CLOUDSC_DEFINE_STMT_FUNC}
649+
FRONTEND ${LOKI_FRONTEND}
650+
HEADERS
651+
${COMMON_MODULE}/yomcst.F90
652+
${COMMON_MODULE}/yomphyder.F90
653+
${COMMON_MODULE}/yoethf.F90
654+
${COMMON_MODULE}/yoecldp.F90
655+
${COMMON_MODULE}/cloudsc_field_state_mod.F90
656+
${COMMON_MODULE}/cloudsc_flux_type_mod.F90
657+
${COMMON_MODULE}/cloudsc_aux_type_mod.F90
658+
${COMMON_MODULE}/cloudsc_state_type_mod.F90
659+
SOURCES
660+
${CMAKE_CURRENT_SOURCE_DIR}
661+
${COMMON_MODULE}
662+
INCLUDES
663+
${COMMON_INCLUDE}
664+
XMOD
665+
${_TARGET_XMOD_DIR}
666+
${XMOD_DIR}
667+
)
668+
669+
ecbuild_add_executable( TARGET dwarf-cloudsc-loki-scc-field
670+
SOURCES
671+
dwarf_cloudsc.F90
672+
loki-scc-field/cloudsc.scc_field.F90
673+
loki-scc-field/cloudsc_driver_field_loki_mod.scc_field.F90
674+
LIBS
675+
cloudsc-common-lib
676+
DEFINITIONS ${CLOUDSC_DEFINITIONS} CLOUDSC_GPU_SCC_FIELD
677+
)
678+
# Set specific module directory to avoid aliasing of .mod files
679+
set_target_properties( dwarf-cloudsc-loki-scc-field
680+
PROPERTIES Fortran_MODULE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/loki-scc-field
681+
)
682+
683+
684+
ecbuild_add_test(
685+
TARGET dwarf-cloudsc-loki-scc-field
686+
COMMAND bin/dwarf-cloudsc-loki-scc-field
687+
ARGS 1 1280 128
688+
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/../../..
689+
OMP 1
690+
ENVIRONMENT "NVCOMPILER_ACC_CUDA_HEAPSIZE=128M"
691+
)
692+
693+
endif()
694+
695+
626696
####################################################
627697
## SCC CUF (CUDA Fortran) ##
628698
## * SCC with CUDA Fortran (CUF) ##
src/cloudsc_loki/cloudsc_driver_field_loki_mod.F90

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
! (C) Copyright 1988- ECMWF.
2+
!
3+
! This software is licensed under the terms of the Apache Licence Version 2.0
4+
! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
5+
!
6+
! In applying this licence, ECMWF does not waive the privileges and immunities
7+
! granted to it by virtue of its status as an intergovernmental organisation
8+
! nor does it submit to any jurisdiction.
9+
10+
MODULE CLOUDSC_DRIVER_FIELD_LOKI_MOD
11+
USE PARKIND1, ONLY: JPIM, JPIB, JPRB, JPRD
12+
USE YOMPHYDER, ONLY: STATE_TYPE
13+
USE YOECLDP, ONLY : NCLV, YRECLDP
14+
USE CLOUDSC_MPI_MOD, ONLY: NUMPROC, IRANK
15+
USE TIMER_MOD, ONLY : PERFORMANCE_TIMER, GET_THREAD_NUM
16+
USE EC_PMON_MOD, ONLY: EC_PMON
17+
USE CLOUDSC_FIELD_STATE_MOD, ONLY: CLOUDSC_AUX_TYPE, CLOUDSC_FLUX_TYPE, CLOUDSC_STATE_TYPE
18+
19+
USE CLOUDSC_MOD, ONLY : CLOUDSC
20+
21+
IMPLICIT NONE
22+
23+
CONTAINS
24+
25+
SUBROUTINE CLOUDSC_DRIVER_FIELD( &
26+
& NUMOMP, NPROMA, NLEV, NGPTOT, NGPTOTG, KFLDX, PTSPHY, PAUX, FLUX, &
27+
& TENDENCY_TMP, TENDENCY_LOC)
28+
! Driver routine that invokes the CLOUDSC kernel on Field API state types (entry point for the Loki SCC-FIELD transformation)
29+
30+
USE YOECLDP , ONLY : TECLDP
31+
USE YOMCST , ONLY : TOMCST
32+
USE YOETHF , ONLY : TOETHF
33+
34+
INTEGER(KIND=JPIM) ,INTENT(IN) :: NUMOMP, NPROMA, NLEV, NGPTOT, NGPTOTG
35+
INTEGER(KIND=JPIM) ,INTENT(IN) :: KFLDX
36+
REAL(KIND=JPRB) ,INTENT(IN) :: PTSPHY ! PHYSICS TIMESTEP
37+
TYPE(CLOUDSC_AUX_TYPE) ,INTENT(IN) :: PAUX
38+
TYPE(CLOUDSC_FLUX_TYPE) ,INTENT(IN) :: FLUX
39+
TYPE(CLOUDSC_STATE_TYPE) ,INTENT(IN) :: TENDENCY_TMP
40+
TYPE(CLOUDSC_STATE_TYPE) ,INTENT(INOUT) :: TENDENCY_LOC
41+
42+
INTEGER(KIND=JPIM) :: JKGLO,IBL,ICEND, NGPBLKS
43+
44+
TYPE(TECLDP) :: LOCAL_YRECLDP
45+
46+
TYPE(PERFORMANCE_TIMER) :: TIMER
47+
INTEGER(KIND=JPIM) :: TID ! thread id from 0 .. NUMOMP - 1
48+
49+
NGPBLKS = (NGPTOT / NPROMA) + MIN(MOD(NGPTOT,NPROMA), 1)
50+
1003 format(5x,'NUMPROC=',i0,', NUMOMP=',i0,', NGPTOTG=',i0,', NPROMA=',i0,', NGPBLKS=',i0)
51+
if (irank == 0) then
52+
write(0,1003) NUMPROC,NUMOMP,NGPTOTG,NPROMA,NGPBLKS
53+
end if
54+
55+
! Global timer for the parallel region
56+
CALL TIMER%START(NUMOMP)
57+
58+
! Workaround for PGI / OpenACC oddities:
59+
! Create a local copy of the parameter struct to ensure they get
60+
! moved to the device in the ``acc data`` clause below
61+
LOCAL_YRECLDP = YRECLDP
62+
63+
!$loki data
64+
65+
!$omp parallel default(shared) private(JKGLO,IBL,ICEND,TID) &
66+
!$omp& num_threads(NUMOMP) firstprivate(PAUX, FLUX, TENDENCY_TMP, TENDENCY_LOC)
67+
68+
! Local timer for each thread
69+
TID = GET_THREAD_NUM()
70+
CALL TIMER%THREAD_START(TID)
71+
72+
!$omp do schedule(runtime) reduction(+:power_total,power_count)
73+
DO JKGLO=1,NGPTOT,NPROMA
74+
IBL=(JKGLO-1)/NPROMA+1
75+
ICEND=MIN(NPROMA,NGPTOT-JKGLO+1)
76+
77+
CALL PAUX%UPDATE_VIEW(IBL)
78+
CALL FLUX%UPDATE_VIEW(IBL)
79+
CALL TENDENCY_LOC%UPDATE_VIEW(IBL)
80+
CALL TENDENCY_TMP%UPDATE_VIEW(IBL)
81+
82+
!-- These were uninitialized : meaningful only when we compare error differences
83+
PAUX%PCOVPTOT = 0.0_JPRB
84+
TENDENCY_LOC%CLD(:,:,NCLV) = 0.0_JPRB
85+
86+
87+
88+
CALL CLOUDSC( 1, ICEND, NPROMA, NLEV, & ! These could also be accessed through FIELD_STATE
89+
& PTSPHY,&
90+
& PAUX%PT, PAUX%PQ, &
91+
& TENDENCY_TMP%T, TENDENCY_TMP%Q, TENDENCY_TMP%A, TENDENCY_TMP%CLD, &
92+
& TENDENCY_LOC%T, TENDENCY_LOC%Q, TENDENCY_LOC%A, TENDENCY_LOC%CLD, &
93+
& PAUX%PVFA, PAUX%PVFL, PAUX%PVFI, PAUX%PDYNA, PAUX%PDYNL, PAUX%PDYNI, &
94+
& PAUX%PHRSW, PAUX%PHRLW,&
95+
& PAUX%PVERVEL, PAUX%PAP, PAUX%PAPH,&
96+
& PAUX%PLSM, PAUX%LDCUM, PAUX%KTYPE, &
97+
& PAUX%PLU, PAUX%PLUDE, PAUX%PSNDE, PAUX%PMFU, PAUX%PMFD,&
98+
!---prognostic fields
99+
& PAUX%PA,&
100+
& PAUX%PCLV, &
101+
& PAUX%PSUPSAT,&
102+
! -- arrays for aerosol-cloud interactions
103+
! !! & PQAER, KAER, &
104+
& PAUX%PLCRIT_AER,PAUX%PICRIT_AER,&
105+
& PAUX%PRE_ICE,&
106+
& PAUX%PCCN, PAUX%PNICE,&
107+
!---diagnostic output
108+
& PAUX%PCOVPTOT, PAUX%PRAINFRAC_TOPRFZ,&
109+
!---resulting fluxes
110+
& FLUX%PFSQLF, FLUX%PFSQIF , FLUX%PFCQNNG, FLUX%PFCQLNG,&
111+
& FLUX%PFSQRF, FLUX%PFSQSF , FLUX%PFCQRNG, FLUX%PFCQSNG,&
112+
& FLUX%PFSQLTUR, FLUX%PFSQITUR , &
113+
& FLUX%PFPLSL, FLUX%PFPLSN, FLUX%PFHPSL, FLUX%PFHPSN, &
114+
& LOCAL_YRECLDP)
115+
116+
#ifndef CLOUDSC_GPU_TIMING
117+
! Log number of columns processed by this thread
118+
CALL TIMER%THREAD_LOG(TID, IGPC=ICEND)
119+
#endif
120+
ENDDO
121+
122+
!-- The "nowait" is here to get correct local timings (tloc) per thread
123+
! i.e. we should not wait for slowest thread to finish before measuring tloc
124+
!$omp end do nowait
125+
126+
CALL TIMER%THREAD_END(TID)
127+
128+
!$omp end parallel
129+
130+
!$loki end data
131+
132+
CALL TIMER%END()
133+
134+
#ifdef CLOUDSC_GPU_TIMING
135+
! On GPUs, adding block-level column totals is cumbersome and
136+
! error prone, and of little value due to the large number of
137+
! processing "thread teams". Instead we register the total here.
138+
CALL TIMER % THREAD_LOG(TID=TID, IGPC=NGPTOT)
139+
#endif
140+
CALL TIMER%PRINT_PERFORMANCE(NPROMA, NGPBLKS, NGPTOT)
141+
142+
143+
END SUBROUTINE CLOUDSC_DRIVER_FIELD
144+
145+
END MODULE CLOUDSC_DRIVER_FIELD_LOKI_MOD
146+
src/cloudsc_loki/cloudsc_loki_field_offload.config

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
[default]
2+
# Specifies the behaviour of auto-expanded routines
3+
role = 'kernel'
4+
expand = true # Automatically expand subroutine calls
5+
strict = true # Throw exceptions during discovery
6+
enable_imports = true # Chase dependencies incurred via imports
7+
8+
# disable - not parsed and not transformed
9+
# block - contained as nodes but never parsed/transformed
10+
# ignore - are parsed but not transformed
11+
12+
disable = ['abor1', 'timer_mod', 'abort', 'file_io_mod', 'foe*', 'fokoop', 'get_environment_variable', '*%update_view', 'cloudsc_mpi_reduce_min', 'cloudsc_mpi_reduce_max','cloudsc_mpi_reduce_sum', 'EC_PMON', 'expand_l1', 'expand_i1', 'expand_r1', 'expand_r2', 'expand_r3', 'load_and_expand_l1', 'load_and_expand_i1', 'load_and_expand_r1', 'load_and_expand_r2', 'load_and_expand_r3', 'VALIDATE_L1', 'VALIDATE_I1', 'VALIDATE_R1', 'VALIDATE_R2', 'VALIDATE_R3', 'get_offsets', 'ERROR_PRINT', '*get_device_data_rdonly', '*get_device_data_rdwr', '*sync_host_rdwr']
13+
14+
ignore = ['parkind1', 'yomphyder', 'yoecldp', 'fc*_mod']
15+
16+
17+
# Define entry point for call-tree transformation
18+
[routines.cloudsc_driver_field]
19+
role = 'driver'
20+
expand = true
21+
22+
23+
# Define indices and bounds for array dimensions
24+
[dimensions.horizontal]
25+
size = 'KLON'
26+
index = 'JL'
27+
bounds = ['KIDIA', 'KFDIA']
28+
aliases = ['NPROMA', 'KDIM%KLON']
29+
30+
[dimensions.vertical]
31+
size = 'KLEV'
32+
index = 'JK'
33+
34+
[dimensions.block_dim]
35+
size = 'NGPBLKS'
36+
index = 'IBL'
37+
38+
39+
# Overwrite frontend for header modules that cannot be parsed via OMNI
40+
[frontend_args]
41+
42+
[frontend_args."yomphyder.F90"]
43+
frontend = 'FP'
44+
45+
[frontend_args."yomcst.F90"]
46+
frontend = 'FP'
47+
48+
[frontend_args."yoethf.F90"]
49+
frontend = 'FP'
50+
51+
[frontend_args."yoecldp.F90"]
52+
frontend = 'FP'
53+
54+
55+
[transformations.Idem]
56+
classname = 'IdemTransformation'
57+
module = 'loki.transformations'
58+
59+
60+
# Loki-SCC
61+
# -----------------------------------------
62+
[transformations.SCCVector]
63+
classname = 'SCCVectorPipeline'
64+
module = 'loki.transformations.single_column'
65+
[transformations.SCCVector.options]
66+
horizontal = '%dimensions.horizontal%'
67+
block_dim = '%dimensions.block_dim%'
68+
directive = 'openacc'
69+
70+
# Housekeeping and other transformations
71+
# -----------------------------------------
72+
[transformations.FieldOffload]
73+
classname = 'FieldOffloadTransformation'
74+
module = "loki.transformations"
75+
options = { field_group_types = ['CLOUDSC_STATE_TYPE', 'CLOUDSC_AUX_TYPE', 'CLOUDSC_FLUX_TYPE'] }
76+
77+
[transformations.DataOffload]
78+
classname = 'DataOffloadTransformation'
79+
module = 'loki.transformations'
80+
options = { remove_openmp = true, claw_data_offload = false, assume_deviceptr = false, present_on_device = true }
81+
82+
[transformations.ModuleWrap]
83+
classname = 'ModuleWrapTransformation'
84+
module = 'loki.transformations.build_system'
85+
options = { module_suffix = '_MOD' }
86+
87+
[transformations.Dependency]
88+
classname = 'DependencyTransformation'
89+
module = 'loki.transformations.build_system'
90+
options = { suffix = '_LOKI', module_suffix = '_MOD' }
91+
92+
93+
# Full transformation pipelines
94+
# -----------------------------------------
95+
[pipelines]
96+
97+
[pipelines.idem]
98+
transformations = ['Idem', 'ModuleWrap', 'Dependency']
99+
100+
[pipelines.scc-field]
101+
transformations = ['FieldOffload', 'DataOffload', 'SCCVector', 'ModuleWrap', 'Dependency']

src/cloudsc_loki/dwarf_cloudsc.F90

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,14 @@ PROGRAM DWARF_CLOUDSC
1414
USE CLOUDSC_GLOBAL_STATE_MOD, ONLY: CLOUDSC_GLOBAL_STATE
1515
#if CLOUDSC_GPU_SCC_CUF
1616
USE CUF_CLOUDSC_DRIVER_LOKI_MOD, ONLY: CUF_CLOUDSC_DRIVER
17+
#elif defined(CLOUDSC_GPU_SCC_FIELD)
18+
USE CLOUDSC_FIELD_STATE_MOD, ONLY: CLOUDSC_FIELD_STATE
19+
USE CLOUDSC_DRIVER_FIELD_LOKI_MOD, ONLY: CLOUDSC_DRIVER_FIELD
1720
#else
1821
USE CLOUDSC_DRIVER_LOKI_MOD, ONLY: CLOUDSC_DRIVER
1922
#endif
23+
24+
2025
USE EC_PMON_MOD, ONLY: EC_PMON
2126

2227
#ifdef _OPENMP
@@ -33,7 +38,11 @@ PROGRAM DWARF_CLOUDSC
3338
INTEGER(KIND=JPIM) :: NPROMA = 16384 ! NPROMA blocking factor (currently active)
3439
INTEGER(KIND=JPIM) :: NGPTOT ! Local number of grid points
3540

41+
#ifdef CLOUDSC_GPU_SCC_FIELD
42+
TYPE(CLOUDSC_FIELD_STATE) :: GLOBAL_STATE
43+
#else
3644
TYPE(CLOUDSC_GLOBAL_STATE) :: GLOBAL_STATE
45+
#endif
3746

3847
INTEGER(KIND=JPIB) :: ENERGY, POWER
3948
CHARACTER(LEN=1) :: CLEC_PMON
@@ -107,6 +116,12 @@ PROGRAM DWARF_CLOUDSC
107116
& GLOBAL_STATE%PFSQLTUR, GLOBAL_STATE%PFSQITUR, &
108117
& GLOBAL_STATE%PFPLSL, GLOBAL_STATE%PFPLSN, GLOBAL_STATE%PFHPSL, GLOBAL_STATE%PFHPSN &
109118
& )
119+
#elif defined(CLOUDSC_GPU_SCC_FIELD)
120+
CALL CLOUDSC_DRIVER_FIELD( &
121+
& NUMOMP, NPROMA, GLOBAL_STATE%KLEV, NGPTOT, NGPTOTG, &
122+
& GLOBAL_STATE%KFLDX, GLOBAL_STATE%PTSPHY, &
123+
& GLOBAL_STATE%AUX, GLOBAL_STATE%FLUX, &
124+
& GLOBAL_STATE%TENDENCY_TMP, GLOBAL_STATE%TENDENCY_LOC)
110125
#else
111126
CALL CLOUDSC_DRIVER(NUMOMP, NPROMA, GLOBAL_STATE%KLEV, NGPTOT, NGPTOTG, GLOBAL_STATE%NBLOCKS, &
112127
& GLOBAL_STATE%KFLDX, GLOBAL_STATE%PTSPHY, &
@@ -133,6 +148,9 @@ PROGRAM DWARF_CLOUDSC
133148

134149
! Validate the output against serialized reference data
135150
CALL GLOBAL_STATE%VALIDATE(NPROMA, NGPTOT, NGPTOTG)
151+
#ifdef CLOUDSC_GPU_SCC_FIELD
152+
CALL GLOBAL_STATE%FINALIZE()
153+
#endif
136154

137155
! Tear down MPI environment
138156
CALL CLOUDSC_MPI_END()

0 commit comments

Comments
 (0)