Skip to content

Commit

Permalink
Merge pull request #95 from ecmwf-ifs/naan-scc-ecwam
Browse files Browse the repository at this point in the history
SCC updates needed for ecWam
  • Loading branch information
mlange05 authored Jun 21, 2023
2 parents 8bf568a + f3b87a8 commit 5fe6af3
Show file tree
Hide file tree
Showing 11 changed files with 519 additions and 48 deletions.
13 changes: 11 additions & 2 deletions cmake/loki_transform.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,8 @@ endmacro()
# [DEFINITIONS <define1> [<define2> ...]]
# [OMNI_INCLUDE <omni-inc1> [<omni-inc2> ...]]
# [XMOD <xmod-dir1> [<xmod-dir2> ...]]
# [REMOVE_OPENMP] [DATA_OFFLOAD]
# [REMOVE_OPENMP] [DATA_OFFLOAD] [GLOBAL_VAR_OFFLOAD]
# [TRIM_VECTOR_SECTIONS]
# )
#
# Call ``loki-transform.py convert ...`` with the provided arguments.
Expand All @@ -198,7 +199,7 @@ endmacro()

function( loki_transform_convert )

set( options CPP DATA_OFFLOAD REMOVE_OPENMP )
set( options CPP DATA_OFFLOAD REMOVE_OPENMP GLOBAL_VAR_OFFLOAD TRIM_VECTOR_SECTIONS )
set( oneValueArgs MODE DIRECTIVE FRONTEND CONFIG PATH OUTPATH )
set( multiValueArgs OUTPUT DEPENDS INCLUDES INCLUDE HEADERS HEADER DEFINITIONS DEFINE OMNI_INCLUDE XMOD )

Expand Down Expand Up @@ -235,6 +236,14 @@ function( loki_transform_convert )
list( APPEND _ARGS --remove-openmp )
endif()

if( ${_PAR_GLOBAL_VAR_OFFLOAD} )
list( APPEND _ARGS --global-var-offload )
endif()

if( ${_PAR_TRIM_VECTOR_SECTIONS} )
list( APPEND _ARGS --trim-vector-sections )
endif()

_loki_transform_env_setup()

add_custom_command(
Expand Down
7 changes: 0 additions & 7 deletions loki/transform/dependency_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
from loki.expression import Variable, FindInlineCalls, SubstituteExpressions
from loki.backend import fgen
from loki.tools import as_tuple
from loki.bulk.item import GlobalVarImportItem


__all__ = ['DependencyTransformation']
Expand Down Expand Up @@ -121,12 +120,6 @@ def transform_module(self, module, **kwargs):
Rename kernel modules and re-point module-level imports.
"""
role = kwargs.get('role')
item = kwargs.get('item', None)

# bail if module contains global variables as these are potentially used
# in non-offloaded CPU code
if isinstance(item, GlobalVarImportItem):
return

if role == 'kernel':
# Change the name of kernel modules
Expand Down
29 changes: 24 additions & 5 deletions scripts/loki_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@

from loki import (
Sourcefile, Transformation, Scheduler, SchedulerConfig, SubroutineItem,
Frontend, as_tuple, set_excepthook, auto_post_mortem_debugger, flatten, info
Frontend, as_tuple, set_excepthook, auto_post_mortem_debugger, flatten, info,
GlobalVarImportItem
)

# Get generalized transformations provided by Loki
Expand All @@ -31,7 +32,7 @@
from transformations.argument_shape import (
ArgumentArrayShapeAnalysis, ExplicitArgumentArrayShapeTransformation
)
from transformations.data_offload import DataOffloadTransformation
from transformations.data_offload import DataOffloadTransformation, GlobalVarOffloadTransformation
from transformations.derived_types import DerivedTypeArgumentsTransformation
from transformations.utility_routines import DrHookTransformation, RemoveCallsTransformation
from transformations.pool_allocator import TemporariesPoolAllocatorTransformation
Expand Down Expand Up @@ -144,8 +145,12 @@ def cli(debug):
help='Frontend parser to use (default FP)')
@click.option('--config', default=None, type=click.Path(),
help='Path to custom scheduler configuration file')
@click.option('--trim-vector-sections', is_flag=True, default=False,
help='Trim vector loops in SCC transform to exclude scalar assignments.')
@click.option('--global-var-offload', is_flag=True, default=False,
help="Generate offload instructions for global vars imported via 'USE' statements.")
def convert(out_path, path, header, cpp, directive, include, define, omni_include, xmod,
data_offload, remove_openmp, mode, frontend, config):
data_offload, remove_openmp, mode, frontend, config, trim_vector_sections, global_var_offload):
"""
Single Column Abstraction (SCA): Convert kernel into single-column
format and adjust driver to apply it over in a horizontal loop.
Expand Down Expand Up @@ -189,6 +194,15 @@ def convert(out_path, path, header, cpp, directive, include, define, omni_includ
# First, remove all derived-type arguments; caller first!
scheduler.process(transformation=DerivedTypeArgumentsTransformation())

# Remove DR_HOOK and other utility calls first, so they don't interfere with SCC loop hoisting
if 'scc' in mode:
scheduler.process(transformation=RemoveCallsTransformation(
routines=config.default.get('utility_routines', None) or ['DR_HOOK', 'ABOR1', 'WRITE(NULOUT'],
include_intrinsics=True
))
else:
scheduler.process(transformation=DrHookTransformation(mode=mode, remove=False))

# Insert data offload regions for GPUs and remove OpenMP threading directives
use_claw_offload = True
if data_offload:
Expand Down Expand Up @@ -216,7 +230,7 @@ def convert(out_path, path, header, cpp, directive, include, define, omni_includ
vertical = scheduler.config.dimensions['vertical']
block_dim = scheduler.config.dimensions['block_dim']
transformation = (SCCBaseTransformation(horizontal=horizontal, directive=directive),)
transformation += (SCCDevectorTransformation(horizontal=horizontal),)
transformation += (SCCDevectorTransformation(horizontal=horizontal, trim_vector_sections=trim_vector_sections),)
transformation += (SCCDemoteTransformation(horizontal=horizontal),)
if not 'hoist' in mode:
transformation += (SCCRevectorTransformation(horizontal=horizontal),)
Expand All @@ -243,6 +257,10 @@ def convert(out_path, path, header, cpp, directive, include, define, omni_includ
else:
raise RuntimeError('[Loki] Convert could not find specified Transformation!')

if global_var_offload:
scheduler.process(transformation=GlobalVarOffloadTransformation(),
item_filter=(SubroutineItem, GlobalVarImportItem), reverse=True)

if mode in ['idem-stack', 'scc-stack']:
if frontend == Frontend.OMNI:
# To make the pool allocator size derivation work correctly, we need
Expand Down Expand Up @@ -280,7 +298,8 @@ def transform_subroutine(self, routine, **kwargs):
scheduler.process(transformation=dependency)

# Write out all modified source files into the build directory
scheduler.process(transformation=FileWriteTransformation(builddir=out_path, mode=mode, cuf='cuf' in mode))
scheduler.process(transformation=FileWriteTransformation(builddir=out_path, mode=mode, cuf='cuf' in mode),
use_file_graph=True)


@cli.command()
Expand Down
11 changes: 6 additions & 5 deletions transformations/tests/test_pool_allocator.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,15 +365,15 @@ def test_pool_allocator_temporaries(frontend, generate_driver_stack, block_dim,
@pytest.mark.parametrize('directive', [None, 'openmp', 'openacc'])
def test_pool_allocator_temporaries_kernel_sequence(frontend, block_dim, directive):
if directive == 'openmp':
driver_loop_pragma1 = '!$omp parallel default(shared) private(b)\n !$omp do'
driver_loop_pragma1 = '!$omp parallel default(shared) private(b) firstprivate(a)\n !$omp do'
driver_end_loop_pragma1 = '!$omp end do\n !$omp end parallel'
driver_loop_pragma2 = '!$omp parallel do'
driver_loop_pragma2 = '!$omp parallel do firstprivate(a)'
driver_end_loop_pragma2 = '!$omp end parallel do'
kernel_pragma = ''
elif directive == 'openacc':
driver_loop_pragma1 = '!$acc parallel loop gang private(b)'
driver_loop_pragma1 = '!$acc parallel loop gang private(b) firstprivate(a)'
driver_end_loop_pragma1 = '!$acc end parallel loop'
driver_loop_pragma2 = '!$acc parallel loop gang'
driver_loop_pragma2 = '!$acc parallel loop gang firstprivate(a)'
driver_end_loop_pragma2 = '!$acc end parallel loop'
kernel_pragma = '!$acc routine vector'
else:
Expand All @@ -400,7 +400,7 @@ def test_pool_allocator_temporaries_kernel_sequence(frontend, block_dim, directi
INTEGER, INTENT(IN) :: NLON, NZ, NB
real(kind=jprb), intent(inout) :: field1(nlon, nb)
real(kind=jprb), intent(inout) :: field2(nlon, nz, nb)
integer :: b
integer :: a,b
{driver_loop_pragma1}
do b=1,nb
Expand Down Expand Up @@ -563,6 +563,7 @@ def test_pool_allocator_temporaries_kernel_sequence(frontend, block_dim, directi
for pragma in pragmas:
parameters = get_pragma_parameters(pragma, starts_with='parallel', only_loki_pragmas=False)
assert 'private' in parameters and 'ylstack' in parameters['private'].lower()
assert not 'ylstack' in parameters['firstprivate'].lower()

# Are there data regions for the stack?
if directive == ['openacc']:
Expand Down
Loading

0 comments on commit 5fe6af3

Please sign in to comment.