From b84310b4a7ae7730df7c4fc5cc44c9fbc0d07eff Mon Sep 17 00:00:00 2001 From: Johan Ericsson Date: Tue, 27 Aug 2024 14:15:14 +0200 Subject: [PATCH] added tests and cleaned loop blocking logic --- loki/transformations/loop_blocking.py | 211 ++++++--- .../tests/test_loop_blocking.py | 419 +++++++++++++++++- 2 files changed, 547 insertions(+), 83 deletions(-) diff --git a/loki/transformations/loop_blocking.py b/loki/transformations/loop_blocking.py index 0909a28ca..a444ee438 100644 --- a/loki/transformations/loop_blocking.py +++ b/loki/transformations/loop_blocking.py @@ -4,76 +4,149 @@ # In applying this licence, ECMWF does not waive the privileges and immunities # granted to it by virtue of its status as an intergovernmental organisation # nor does it submit to any jurisdiction. - +from pymbolic.primitives import Expression from loki import Subroutine, ir, FindNodes, Assignment, Sum, Product, Loop, LoopRange, IntLiteral, \ Quotient, InlineCall, DeferredTypeSymbol, Transformer, Array, Variable, RangeIndex, Scalar, \ - SubstituteExpressions, FindVariables + SubstituteExpressions, FindVariables, parse_expr, fgen + + + +def ceil_division(iexpr1: Expression, iexpr2: Expression) -> Expression: + """ + Returns ceiled division expression of two integer expressions iexpr1/iexpr2. + """ + return Sum(children=(Quotient(numerator=Sum(children=(iexpr1, IntLiteral(-1))), + denominator=iexpr2), IntLiteral(1))) + + + +def negate(i: Expression) -> Expression: + return Product((IntLiteral(-1), i)) + + +def num_iterations(loop_range: LoopRange) -> Expression: + """ + Returns total number of iterations of a loop. + + Given a loop, this returns an expression that computes the total number of + iterations of the loop, i.e. + (start,stop,step) -> ceil(stop-start/step) + """ + start = loop_range.start + stop = loop_range.stop + step = loop_range.step + if step is None: + return stop if isinstance(start, IntLiteral) and start.value == 1 else Sum( + (stop, negate(start), IntLiteral(1))) + else: + return Sum((Quotient(Sum((stop, negate(start))), step), IntLiteral(1))) + + +def normalized_loop_range(loop_range: LoopRange) -> LoopRange: + """ + Returns the normalized LoopRange of a given LoopRange. + + Returns the normalized LoopRange which corresponds to a loop with the same + number of iterations but starts at 1 and has stride 1, i.e. + (start,stop,step) -> (1,num_iter,1) + """ + return LoopRange((1, num_iterations(loop_range))) + + +def iteration_number(iter_idx: Variable, loop_range: LoopRange) -> Expression: + """ + Returns the normalized iteration number of the iteration variable, i.e. + (iter_idx - start + step)/step = (iter_idx-start)/step + 1 + """ + if loop_range.step is None: + return Sum((Sum((iter_idx, negate(loop_range.start))), IntLiteral(1))) + else: + return Sum((Quotient(Sum((iter_idx, negate(loop_range.start))), loop_range.step), + IntLiteral(1))) + + +def iteration_index(iter_num: Variable, loop_range: LoopRange) -> Expression: + """ + Returns the iteration index of the loop based on the iteration number, i.e. + iter_idx = (iter_num-1)*step+start + """ + if loop_range.step is None: + return Sum((iter_num, loop_range.start)) + else: + return Sum( + (Product((Sum((iter_num, IntLiteral(-1))), loop_range.step)), loop_range.start)) def split_loop(routine: Subroutine, loop: ir.Loop, block_size: int): """ - Assumes that loop is contiguous in blocking variable - into a blocked loop in those variables. Does not block - the loop data + Blocks a loop by splitting it into an outer loop and inner loop of size `block_size`. + + Parameters + ---------- + routine: :any:`Subroutine` + Subroutine object containing the loop. New variables introduced in the + loop splitting will be declared in the body of routine. + loop: :any:`Loop` + Loop to be split. + block_size: int + inner loop size (size of blocking blocks) """ # blocking variable declarations - decls = FindNodes(ir.VariableDeclaration).visit(routine.spec) - block_size_var = loop.variable.clone(name=loop.variable.name + '_loop_block_size') - num_blocks_var = loop.variable.clone(name=loop.variable.name + '_loop_num_blocks') - outer_loop_var = loop.variable.clone(name=loop.variable.name + '_loop_outer_idx') - inner_loop_var = loop.variable.clone(name=loop.variable.name + '_loop_local') + block_size_var = loop.variable.clone(name=loop.variable.name + "_loop_block_size") + num_blocks_var = loop.variable.clone(name=loop.variable.name + "_loop_num_blocks") + block_idx = loop.variable.clone(name=loop.variable.name + "_loop_block_idx") + inner_loop_var = loop.variable.clone(name=loop.variable.name + "_loop_local") + iter_num_var = loop.variable.clone(name=loop.variable.name + "_loop_iter_num") global_loop_var = loop.variable - block_start = loop.variable.clone(name=loop.variable.name + '_loop_block_start') - block_end = loop.variable.clone(name=loop.variable.name + '_loop_block_end') + block_start = loop.variable.clone(name=loop.variable.name + "_loop_block_start") + block_end = loop.variable.clone(name=loop.variable.name + "_loop_block_end") - routine.variables += (block_size_var, num_blocks_var, outer_loop_var, block_start, block_end) - - # Inner loop - inner_body = Assignment(global_loop_var, Sum(children=( - Product(children=(outer_loop_var, block_size_var)), inner_loop_var))) - inner_loop = Loop(variable=inner_loop_var, body=(inner_body,) + loop.body, - bounds=LoopRange((IntLiteral(1), block_size_var))) - - # blocking variables outside loop initalization - block_loop_inits = [ir.Assignment(block_size_var, IntLiteral(block_size))] - q = Quotient(numerator=Sum(children=(loop.bounds.upper, block_size_var, IntLiteral(-1))), - denominator=block_size_var) - block_loop_inits.append(ir.Assignment(num_blocks_var, q)) - change_map = {loop: block_loop_inits} - # Outer loop bounds + body + routine.variables += ( + block_size_var, num_blocks_var, block_idx, block_start, block_end, iter_num_var) - outer_bounds = LoopRange((IntLiteral(1), num_blocks_var)) - outer_body = ( + # block index calculations + blocking_body = ( Assignment(block_start, - Sum(children=( - Product(children=( - Sum(children=( - outer_loop_var, IntLiteral(-1) - )), - block_size_var - )), - IntLiteral(1) - )) + parse_expr(f"({block_idx} - 1) * {block_size_var} + 1") ), Assignment(block_end, InlineCall(DeferredTypeSymbol('MIN', scope=routine), - parameters=(Product(children=(outer_loop_var, block_size_var)), + parameters=(Product(children=(block_idx, block_size_var)), loop.bounds.upper)) )) - outer_loop = Loop(variable=outer_loop_var, body=outer_body + (inner_loop,), - bounds=outer_bounds) - change_map[loop].append(outer_loop) + # Outer loop blocking variable assignments + loop_range = loop.bounds + block_loop_inits = (ir.Assignment(block_size_var, IntLiteral(block_size)), + ir.Assignment(num_blocks_var, + ceil_division(num_iterations(loop_range), block_size_var))) + + # Inner loop + iteration_nums = ( + Assignment(iter_num_var, parse_expr(f"{block_start}+{inner_loop_var}-1")), + Assignment(global_loop_var, iteration_index(iter_num_var, loop_range)) + ) + inner_loop = Loop(variable=inner_loop_var, body=iteration_nums + loop.body, + bounds=LoopRange((IntLiteral(1), block_size_var))) + + # Outer loop bounds + body + outer_loop = Loop(variable=block_idx, body=blocking_body + (inner_loop,), + bounds=LoopRange((IntLiteral(1), num_blocks_var))) + change_map = {loop: block_loop_inits + (outer_loop,)} Transformer(change_map, inplace=True).visit(routine.ir) return loop.variable, inner_loop, outer_loop -def blocked_shape(a: Array, blocking_var: Variable, loop: ir.Loop): - dims = tuple(RangeIndex( - (1, loop.bounds.upper)) if isinstance(dim, Scalar) and blocking_var in dim else dim for dim - in a.dimensions) +def blocked_shape(a: Array, blocking_vars: list[Variable], loop: ir.Loop): + """ + calculates the dimensions for a blocked version of the array. + """ + dims = tuple( + loop.bounds.upper if isinstance(dim, Scalar) and any( + blocking_var in dim for blocking_var in blocking_vars) else dim for dim + in a.dimensions) return dims @@ -81,32 +154,53 @@ def blocked_type(a: Array): return a.type.clone(intent=None) -def block_loop_dims(a: Array, blocking_var: Variable, loop: ir.Loop): +def block_loop_dims(a: Array, blocking_vars: list[Variable], loop: ir.Loop): + """ + calculates the dimension index of the blocked array inside the loop. + """ dims = tuple( - loop.variable if isinstance(dim, Scalar) and blocking_var in dim else dim for dim + loop.variable if isinstance(dim, Scalar) and any( + blocking_var in dim for blocking_var in blocking_vars) else dim for dim in a.dimensions) return dims -def insert_block_loop_arrays(routine: Subroutine, loop_var: Variable, inner_loop: ir.Loop, +def insert_block_loop_arrays(routine: Subroutine, blocking_indices: list[Variable], inner_loop: ir.Loop, outer_loop: ir.Loop, block_size: int): """ + Replaces arrays inside the inner loop with blocked counterparts. + This routine declares array variables to hold the blocks of the arrays used inside - the loop and replaces array variables inside the loop with their blocked counterparts + the loop and replaces array variables inside the loop with their blocked counterparts. + An array is blocked with the leading dimensions + + Parameters + ---------- + routine : Subroutine + + blocking_indices: list of :any:`Variable` + list of the index variables that arrays inside the loop should be blocked by. + inner_loop: :any:`Loop` + inner loop after loop splitting + outer_loop: :any:`Loop` + outer loop after loop splitting + block_size: int + size of blocked arrays """ # Declare Blocked arrays - arrays = [var for var in FindVariables().visit(inner_loop.body) if - isinstance(var, Array) and loop_var in var] + arrays = tuple(var for var in FindVariables().visit(inner_loop.body) if + isinstance(var, Array) and any( + bv in var for bv in blocking_indices)) name_map = {a.name: a.name + '_block' for a in arrays} block_arrays = tuple( - a.clone(name=name_map[a.name], dimensions=blocked_shape(a, loop_var, inner_loop), + a.clone(name=name_map[a.name], dimensions=blocked_shape(a, blocking_indices, inner_loop), type=blocked_type(a)) for a in arrays) routine.variables += block_arrays - + # Replace arrays in loop with blocked arrays and update idx block_array_expr = [ - a.clone(name=name_map[a.name], dimensions=block_loop_dims(a, loop_var, inner_loop)) for a in + a.clone(name=name_map[a.name], dimensions=block_loop_dims(a, blocking_indices, inner_loop)) for a + in arrays] - # Replace arrays in loop with blocked arrays and update idx body = SubstituteExpressions(dict(zip(arrays, block_array_expr))).visit(inner_loop.body) inner_loop._update(body=body) @@ -114,8 +208,3 @@ def insert_block_loop_arrays(routine: Subroutine, loop_var: Variable, inner_loop def handle_device_transfers(routine: Subroutine, loop_var: Variable, inner_loop: ir.Loop, outer_loop: ir.Loop, block_size: int): pass - - -def block_loop(routine: Subroutine, loop: ir.Loop, block_size: int): - loop_var, inner_loop, outer_loop = split_loop(routine, loop, block_size) - insert_block_loop_arrays(routine, loop_var, inner_loop, outer_loop, block_size) diff --git a/loki/transformations/tests/test_loop_blocking.py b/loki/transformations/tests/test_loop_blocking.py index a453d3522..fb82bda2d 100644 --- a/loki/transformations/tests/test_loop_blocking.py +++ b/loki/transformations/tests/test_loop_blocking.py @@ -7,17 +7,178 @@ import itertools +from math import floor + import pytest import numpy as np from fparser.two.pattern_tools import subroutine +from sympy import fcode from loki import available_frontends, Subroutine, pragmas_attached, find_driver_loops, Loop, fgen, \ - pprint -from loki.transformations.loop_blocking import block_loop + pprint, Module, ir, FindNodes, LoopRange, IntLiteral +from loki.expression.parser import LokiEvaluationMapper +from loki.transformations.loop_blocking import split_loop, insert_block_loop_arrays, \ + normalized_loop_range, iteration_number, iteration_index + +from pymbolic.compiler import CompiledExpression +from pymbolic.mapper.evaluator import EvaluationMapper as EM + + + +def get_pyrange(loop_range: LoopRange): + """ + Converts a LoopRange of IntLiterals to a python range. + """ + LEM = LokiEvaluationMapper() + if loop_range.step is None: + return range(LEM(loop_range.start), floor(LEM(loop_range.stop))+1) + else: + return range(LEM(loop_range.start), floor(LEM(loop_range.stop))+1, LEM(loop_range.step)) + + +@pytest.mark.parametrize('frontend', available_frontends()) +def test_normalized_loop_range(tmp_path, frontend): + """ + Tests the num_iterations and normalized_loop_range functions. + """ + for start in range(-10, 11): + for stop in range(start + 1, 50 + 1, 4): + for step in range(1, stop - start): + loop_range = LoopRange((start, stop, step)) + pyrange = range(start, stop + 1, step) + + normalized_range = normalized_loop_range(loop_range) + assert normalized_range.step is None, "LoopRange.step should be None in a normalized range" + + normalized_start = LokiEvaluationMapper()(normalized_range.start) + assert normalized_start == 1, "LoopRange.start should be equal to 1 in a normalized range" + + normalized_stop = floor(LokiEvaluationMapper()(normalized_range.stop)) + assert normalized_stop == len( + pyrange), "LoopRange.stop should be equal to the total number of iterations of the original LoopRange" + + +@pytest.mark.parametrize('frontend', available_frontends()) +def test_iteration_number(tmp_path, frontend): + for start in range(-10, 11): + for stop in range(start + 1, 50 + 1, 4): + for step in range(1, stop - start): + loop_range = LoopRange((start, stop, step)) + pyrange = range(start, stop + 1, step) + normalized_range = get_pyrange(normalized_loop_range(loop_range)) + assert len(normalized_range) == len( + pyrange), "Length of normalized loop range should equal length of python loop range" + + LEM = LokiEvaluationMapper() + assert all(n == LEM(iteration_number(IntLiteral(i), loop_range)) for i, n in + zip(pyrange, normalized_range)) + + +@pytest.mark.parametrize('frontend', available_frontends()) +def test_iteration_index(tmp_path, frontend): + for start in range(-10, 11): + for stop in range(start + 1, 50 + 1, 4): + for step in range(1, stop - start): + loop_range = LoopRange((start, stop, step)) + pyrange = range(start, stop + 1, step) + normalized_range = get_pyrange(normalized_loop_range(loop_range)) + assert len(normalized_range) == len( + pyrange), "Length of normalized loop range should equal length of python loop range" + + LEM = LokiEvaluationMapper() + assert all(i == LEM(iteration_index(IntLiteral(n), loop_range)) for i, n in + zip(pyrange, normalized_range)) + + + +""" +the total number of variables added by the loop splitting routine should be 6 +the loop blocking routine should create a new variable for each variable blocked +""" + + +@pytest.mark.parametrize('frontend', available_frontends()) +def test_1d_blocking(tmp_path, frontend): + """ + Apply loop blocking of simple loops into two loops + """ + fcode = """ +subroutine simple_loop(a, b, n) + integer, intent(in) :: n + real, intent(out) :: a(n) + real, intent(in) :: b(n) + real :: c(n) + integer :: i + !$loki driver-loop + do i=1,n + a(i) = real(i) + end do +end subroutine simple_loop + """ + routine = Subroutine.from_source(fcode, frontend=frontend) + loops = FindNodes(ir.Loop).visit(routine.ir) + num_loops = len(loops) + num_vars = len(routine.variable_map) + with pragmas_attached(routine, Loop): + loops = find_driver_loops(routine, + targets=None) + loop_var, inner_loop, outer_loop = split_loop(routine, loops[0], 10) + + loops = FindNodes(ir.Loop).visit(routine.ir) + assert len( + loops) == num_loops + 1, f"Total number of loops transformation is: {len(loops)} but expected {num_loops + 1}" + assert len( + routine.variable_map) == num_vars + 6, f"Total number of variables after loop splitting is: {len(routine.variable_map)} but expected {num_vars + 5}" + + loop_vars = [loop_var] + insert_block_loop_arrays(routine, loop_vars, inner_loop, outer_loop, 10) + + assert len( + routine.variable_map) == num_vars + 7, f"Total number of variables after loop blocking is: {len(routine.variable_map)} but expected {num_vars + 5}" + + +@pytest.mark.parametrize('frontend', available_frontends()) +def test_1d_blocking_multi_var(tmp_path, frontend): + """ + Apply loop blocking of simple loops into two loops + """ + fcode = """ +subroutine simple_loop(a, b, n) + integer, intent(in) :: n + real, intent(out) :: a(n) + real, intent(in) :: b(n) + real :: c(n) + integer :: i + !$loki driver-loop + do i=1,n + c(1) = c(1) + i + a(i) = real(i) + end do +end subroutine simple_loop + """ + routine = Subroutine.from_source(fcode, frontend=frontend) + loops = FindNodes(ir.Loop).visit(routine.ir) + num_loops = len(loops) + num_vars = len(routine.variable_map) + with pragmas_attached(routine, Loop): + loops = find_driver_loops(routine, + targets=None) + loop_var, inner_loop, outer_loop = split_loop(routine, loops[0], 10) + + loops = FindNodes(ir.Loop).visit(routine.ir) + assert len( + loops) == num_loops + 1, f"Total number of loops transformation is: {len(loops)} but expected {num_loops + 1}" + assert len( + routine.variable_map) == num_vars + 6, f"Total number of variables after loop splitting is: {len(routine.variable_map)} but expected {num_vars + 5}" + + loop_vars = [loop_var] + insert_block_loop_arrays(routine, loop_vars, inner_loop, outer_loop, 10) + assert len( + routine.variable_map) == num_vars + 7, f"Total number of variables after loop blocking is: {len(routine.variable_map)} but expected {num_vars + 5}" @pytest.mark.parametrize('frontend', available_frontends()) -def test_simple_blocking(tmp_path, frontend): +def test_2d_blocking(tmp_path, frontend): """ Apply loop blocking of simple loops into two loops """ @@ -38,21 +199,25 @@ def test_simple_blocking(tmp_path, frontend): end subroutine simple_loop """ routine = Subroutine.from_source(fcode, frontend=frontend) - - # loops = FindNodes(ir.Loop).visit(routine.ir) + loops = FindNodes(ir.Loop).visit(routine.ir) + num_loops = len(loops) + num_vars = len(routine.variable_map) with pragmas_attached(routine, Loop): loops = find_driver_loops(routine, targets=None) - for loop in loops: - block_loop(routine, loop, 10) + loop_var, inner_loop, outer_loop = split_loop(routine, loops[0], 10) + loops = FindNodes(ir.Loop).visit(routine.ir) + assert len( + loops) == num_loops + 1, f"Total number of loops transformation is: {len(loops)} but expected {num_loops + 1}" + assert len( + routine.variable_map) == num_vars + 6, f"Total number of variables after loop splitting is: {len(routine.variable_map)} but expected {num_vars + 5}" + + loop_vars = [loop_var] + insert_block_loop_arrays(routine, loop_vars, inner_loop, outer_loop, 10) - print() - print(30 * "-" + "IR after manipulations" + 30 * "-") - print(f"{pprint(routine.ir)}") # scheduler['transformation_module_hoist#driver'].ir.ir)}") - print() - print(30 * "-" + "Generated FORTRAN" + 30 * "-") - print(f"{fgen(routine)}") # scheduler['transformation_module_hoist#driver'].ir.ir)}") + assert len( + routine.variable_map) == num_vars + 8, f"Total number of variables after loop blocking is: {len(routine.variable_map)} but expected {num_vars + 5}" @pytest.mark.parametrize('frontend', available_frontends()) @@ -75,21 +240,231 @@ def test_4d_blocking(tmp_path, frontend): """ routine = Subroutine.from_source(fcode, frontend=frontend) + loops = FindNodes(ir.Loop).visit(routine.ir) + num_loops = len(loops) + num_vars = len(routine.variable_map) + with pragmas_attached(routine, Loop): + loops = find_driver_loops(routine, + targets=None) - # loops = FindNodes(ir.Loop).visit(routine.ir) + loop_var, inner_loop, outer_loop = split_loop(routine, loops[0], 10) + loops = FindNodes(ir.Loop).visit(routine.ir) + assert len( + loops) == num_loops + 1, f"Total number of loops transformation is: {len(loops)} but expected {num_loops + 1}" + assert len( + routine.variable_map) == num_vars + 6, f"Total number of variables after loop splitting is: {len(routine.variable_map)} but expected {num_vars + 5}" + + loop_vars = [loop_var] + insert_block_loop_arrays(routine, loop_vars, inner_loop, outer_loop, 10) + + assert len( + routine.variable_map) == num_vars + 8, f"Total number of variables after loop blocking is: {len(routine.variable_map)} but expected {num_vars + 5}" + routine = Subroutine.from_source(fcode, frontend=frontend) + + +@pytest.mark.parametrize('frontend', available_frontends()) +def test_4d_strided_blocking(tmp_path, frontend): + """ + Apply loop interchange for two loops without further arguments. + """ + fcode = """ +subroutine simple_loop(a, b, n) + integer, intent(in) :: n + real, intent(out) :: a(n,n) + real, intent(in) :: b(n,n) + real :: c(n) + integer :: i,k,strided + !$loki driver-loop + do i=1,k,strided + call some_kernel(a(:,i), b(:,i)) + c(1) = c(1) + a(i,i) + end do +end subroutine simple_loop + """ + + routine = Subroutine.from_source(fcode, frontend=frontend) + loops = FindNodes(ir.Loop).visit(routine.ir) + num_loops = len(loops) + num_vars = len(routine.variable_map) with pragmas_attached(routine, Loop): loops = find_driver_loops(routine, targets=None) - for loop in loops: - block_loop(routine, loop, 10) + loop_var, inner_loop, outer_loop = split_loop(routine, loops[0], 10) + loops = FindNodes(ir.Loop).visit(routine.ir) + assert len( + loops) == num_loops + 1, f"Total number of loops transformation is: {len(loops)} but expected {num_loops + 1}" + assert len( + routine.variable_map) == num_vars + 6, f"Total number of variables after loop splitting is: {len(routine.variable_map)} but expected {num_vars + 5}" - print() - print(30 * "-" + "IR after manipulations" + 30 * "-") - print(f"{pprint(routine.ir)}") # scheduler['transformation_module_hoist#driver'].ir.ir)}") - print() - print(30 * "-" + "Generated FORTRAN" + 30 * "-") - print(f"{fgen(routine)}") # scheduler['transformation_module_hoist#driver'].ir.ir)}") + loop_vars = [loop_var] + insert_block_loop_arrays(routine, loop_vars, inner_loop, outer_loop, 10) + assert len( + routine.variable_map) == num_vars + 8, f"Total number of variables after loop blocking is: {len(routine.variable_map)} but expected {num_vars + 5}" + routine = Subroutine.from_source(fcode, frontend=frontend) + +@pytest.mark.parametrize('frontend', available_frontends()) +def test_cloudsc_driver(tmp_path, frontend): + """ + Apply loop interchange for two loops without further arguments. + """ + fcode=""" + ! (C) Copyright 1988- ECMWF. + ! + ! This software is licensed under the terms of the Apache Licence Version 2.0 + ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. + ! + ! In applying this licence, ECMWF does not waive the privileges and immunities + ! granted to it by virtue of its status as an intergovernmental organisation + ! nor does it submit to any jurisdiction. + + MODULE CLOUDSC_DRIVER_GPU_SCC_MOD + + USE PARKIND1, ONLY: JPIM, JPRB + USE YOMPHYDER, ONLY: STATE_TYPE + USE YOECLDP, ONLY : NCLV, YRECLDP, TECLDP + USE CLOUDSC_MPI_MOD, ONLY: NUMPROC, IRANK + + USE CLOUDSC_GPU_SCC_MOD, ONLY: CLOUDSC_SCC + + IMPLICIT NONE + + CONTAINS + + SUBROUTINE CLOUDSC_DRIVER_GPU_SCC( & + & NUMOMP, NPROMA, NLEV, NGPTOT, NGPBLKS, NGPTOTG, KFLDX, PTSPHY, & + & PT, PQ, & + & BUFFER_CML, BUFFER_TMP, BUFFER_LOC, & + & PVFA, PVFL, PVFI, PDYNA, PDYNL, PDYNI, & + & PHRSW, PHRLW, & + & PVERVEL, PAP, PAPH, & + & PLSM, LDCUM, KTYPE, & + & PLU, PLUDE, PSNDE, PMFU, PMFD, & + & PA, & + & PCLV, PSUPSAT,& + & PLCRIT_AER,PICRIT_AER, PRE_ICE, & + & PCCN, PNICE,& + & PCOVPTOT, PRAINFRAC_TOPRFZ, & + & PFSQLF, PFSQIF , PFCQNNG, PFCQLNG, & + & PFSQRF, PFSQSF , PFCQRNG, PFCQSNG, & + & PFSQLTUR, PFSQITUR, & + & PFPLSL, PFPLSN, PFHPSL, PFHPSN & + & ) + ! Driver routine that invokes the optimized CLAW-based CLOUDSC GPU kernel + + INTEGER(KIND=JPIM) :: NUMOMP, NPROMA, NLEV, NGPTOT, NGPBLKS, NGPTOTG + INTEGER(KIND=JPIM) :: KFLDX + REAL(KIND=JPRB) :: PTSPHY ! Physics timestep + REAL(KIND=JPRB), INTENT(IN) :: PT(NPROMA, NLEV, NGPBLKS) ! T at start of callpar + REAL(KIND=JPRB), INTENT(IN) :: PQ(NPROMA, NLEV, NGPBLKS) ! Q at start of callpar + REAL(KIND=JPRB), INTENT(INOUT) :: BUFFER_CML(NPROMA,NLEV,3+NCLV,NGPBLKS) ! Storage buffer for TENDENCY_CML + REAL(KIND=JPRB), INTENT(INOUT) :: BUFFER_TMP(NPROMA,NLEV,3+NCLV,NGPBLKS) ! Storage buffer for TENDENCY_TMP + REAL(KIND=JPRB), INTENT(INOUT) :: BUFFER_LOC(NPROMA,NLEV,3+NCLV,NGPBLKS) ! Storage buffer for TENDENCY_LOC + REAL(KIND=JPRB), INTENT(IN) :: PVFA(NPROMA, NLEV, NGPBLKS) ! CC from VDF scheme + REAL(KIND=JPRB), INTENT(IN) :: PVFL(NPROMA, NLEV, NGPBLKS) ! Liq from VDF scheme + REAL(KIND=JPRB), INTENT(IN) :: PVFI(NPROMA, NLEV, NGPBLKS) ! Ice from VDF scheme + REAL(KIND=JPRB), INTENT(IN) :: PDYNA(NPROMA, NLEV, NGPBLKS) ! CC from Dynamics + REAL(KIND=JPRB), INTENT(IN) :: PDYNL(NPROMA, NLEV, NGPBLKS) ! Liq from Dynamics + REAL(KIND=JPRB), INTENT(IN) :: PDYNI(NPROMA, NLEV, NGPBLKS) ! Liq from Dynamics + REAL(KIND=JPRB), INTENT(IN) :: PHRSW(NPROMA, NLEV, NGPBLKS) ! Short-wave heating rate + REAL(KIND=JPRB), INTENT(IN) :: PHRLW(NPROMA, NLEV, NGPBLKS) ! Long-wave heating rate + REAL(KIND=JPRB), INTENT(IN) :: PVERVEL(NPROMA, NLEV, NGPBLKS) !Vertical velocity + REAL(KIND=JPRB), INTENT(IN) :: PAP(NPROMA, NLEV, NGPBLKS) ! Pressure on full levels + REAL(KIND=JPRB), INTENT(IN) :: PAPH(NPROMA, NLEV+1, NGPBLKS) ! Pressure on half levels + REAL(KIND=JPRB), INTENT(IN) :: PLSM(NPROMA, NGPBLKS) ! Land fraction (0-1) + LOGICAL, INTENT(IN) :: LDCUM(NPROMA, NGPBLKS) ! Convection active + INTEGER(KIND=JPIM), INTENT(IN) :: KTYPE(NPROMA, NGPBLKS) ! Convection type 0,1,2 + REAL(KIND=JPRB), INTENT(IN) :: PLU(NPROMA, NLEV, NGPBLKS) ! Conv. condensate + REAL(KIND=JPRB), INTENT(INOUT) :: PLUDE(NPROMA, NLEV, NGPBLKS) ! Conv. detrained water + REAL(KIND=JPRB), INTENT(IN) :: PSNDE(NPROMA, NLEV, NGPBLKS) ! Conv. detrained snow + REAL(KIND=JPRB), INTENT(IN) :: PMFU(NPROMA, NLEV, NGPBLKS) ! Conv. mass flux up + REAL(KIND=JPRB), INTENT(IN) :: PMFD(NPROMA, NLEV, NGPBLKS) ! Conv. mass flux down + REAL(KIND=JPRB), INTENT(IN) :: PA(NPROMA, NLEV, NGPBLKS) ! Original Cloud fraction (t) + REAL(KIND=JPRB), INTENT(IN) :: PCLV(NPROMA, NLEV, NCLV, NGPBLKS) + REAL(KIND=JPRB), INTENT(IN) :: PSUPSAT(NPROMA, NLEV, NGPBLKS) + REAL(KIND=JPRB), INTENT(IN) :: PLCRIT_AER(NPROMA, NLEV, NGPBLKS) + REAL(KIND=JPRB), INTENT(IN) :: PICRIT_AER(NPROMA, NLEV, NGPBLKS) + REAL(KIND=JPRB), INTENT(IN) :: PRE_ICE(NPROMA, NLEV, NGPBLKS) + REAL(KIND=JPRB), INTENT(IN) :: PCCN(NPROMA, NLEV, NGPBLKS) ! liquid cloud condensation nuclei + REAL(KIND=JPRB), INTENT(IN) :: PNICE(NPROMA, NLEV, NGPBLKS) ! ice number concentration (cf. CCN) + + REAL(KIND=JPRB), INTENT(INOUT) :: PCOVPTOT(NPROMA, NLEV, NGPBLKS) ! Precip fraction + REAL(KIND=JPRB), INTENT(OUT) :: PRAINFRAC_TOPRFZ(NPROMA, NGPBLKS) + ! Flux diagnostics for DDH budget + REAL(KIND=JPRB), INTENT(OUT) :: PFSQLF(NPROMA, NLEV+1, NGPBLKS) ! Flux of liquid + REAL(KIND=JPRB), INTENT(OUT) :: PFSQIF(NPROMA, NLEV+1, NGPBLKS) ! Flux of ice + REAL(KIND=JPRB), INTENT(OUT) :: PFCQLNG(NPROMA, NLEV+1, NGPBLKS) ! -ve corr for liq + REAL(KIND=JPRB), INTENT(OUT) :: PFCQNNG(NPROMA, NLEV+1, NGPBLKS) ! -ve corr for ice + REAL(KIND=JPRB), INTENT(OUT) :: PFSQRF(NPROMA, NLEV+1, NGPBLKS) ! Flux diagnostics + REAL(KIND=JPRB), INTENT(OUT) :: PFSQSF(NPROMA, NLEV+1, NGPBLKS) ! for DDH, generic + REAL(KIND=JPRB), INTENT(OUT) :: PFCQRNG(NPROMA, NLEV+1, NGPBLKS) ! rain + REAL(KIND=JPRB), INTENT(OUT) :: PFCQSNG(NPROMA, NLEV+1, NGPBLKS) ! snow + REAL(KIND=JPRB), INTENT(OUT) :: PFSQLTUR(NPROMA, NLEV+1, NGPBLKS) ! liquid flux due to VDF + REAL(KIND=JPRB), INTENT(OUT) :: PFSQITUR(NPROMA, NLEV+1, NGPBLKS) ! ice flux due to VDF + REAL(KIND=JPRB), INTENT(OUT) :: PFPLSL(NPROMA, NLEV+1, NGPBLKS) ! liq+rain sedim flux + REAL(KIND=JPRB), INTENT(OUT) :: PFPLSN(NPROMA, NLEV+1, NGPBLKS) ! ice+snow sedim flux + REAL(KIND=JPRB), INTENT(OUT) :: PFHPSL(NPROMA, NLEV+1, NGPBLKS) ! Enthalpy flux for liq + REAL(KIND=JPRB), INTENT(OUT) :: PFHPSN(NPROMA, NLEV+1, NGPBLKS) ! ice number concentration (cf. CCN) + + INTEGER(KIND=JPIM) :: JKGLO,IBL,ICEND + INTEGER(KIND=JPIM) :: TID ! thread id from 0 .. NUMOMP - 1 + + ! Local copy of cloud parameters for offload + TYPE(TECLDP) :: LOCAL_YRECLDP + + NGPBLKS = (NGPTOT / NPROMA) + MIN(MOD(NGPTOT,NPROMA), 1) + + DO JKGLO=1,NGPTOT,NPROMA + IBL=(JKGLO-1)/NPROMA+1 + ICEND=MIN(NPROMA,NGPTOT-JKGLO+1) + + CALL CLOUDSC_SCC & + & (1, ICEND, NPROMA, NLEV, PTSPHY,& + & PT(:,:,IBL), PQ(:,:,IBL), & + & BUFFER_TMP(:,:,1,IBL), BUFFER_TMP(:,:,3,IBL), BUFFER_TMP(:,:,2,IBL), BUFFER_TMP(:,:,4:8,IBL), & + & BUFFER_LOC(:,:,1,IBL), BUFFER_LOC(:,:,3,IBL), BUFFER_LOC(:,:,2,IBL), BUFFER_LOC(:,:,4:8,IBL), & + & PVFA(:,:,IBL), PVFL(:,:,IBL), PVFI(:,:,IBL), PDYNA(:,:,IBL), PDYNL(:,:,IBL), PDYNI(:,:,IBL), & + & PHRSW(:,:,IBL), PHRLW(:,:,IBL),& + & PVERVEL(:,:,IBL), PAP(:,:,IBL), PAPH(:,:,IBL),& + & PLSM(:,IBL), LDCUM(:,IBL), KTYPE(:,IBL), & + & PLU(:,:,IBL), PLUDE(:,:,IBL), PSNDE(:,:,IBL), PMFU(:,:,IBL), PMFD(:,:,IBL),& + !---prognostic fields + & PA(:,:,IBL), PCLV(:,:,:,IBL), PSUPSAT(:,:,IBL),& + !-- arrays for aerosol-cloud interactions + & PLCRIT_AER(:,:,IBL),PICRIT_AER(:,:,IBL),& + & PRE_ICE(:,:,IBL),& + & PCCN(:,:,IBL), PNICE(:,:,IBL),& + !---diagnostic output + & PCOVPTOT(:,:,IBL), PRAINFRAC_TOPRFZ(:,IBL),& + !---resulting fluxes + & PFSQLF(:,:,IBL), PFSQIF (:,:,IBL), PFCQNNG(:,:,IBL), PFCQLNG(:,:,IBL),& + & PFSQRF(:,:,IBL), PFSQSF (:,:,IBL), PFCQRNG(:,:,IBL), PFCQSNG(:,:,IBL),& + & PFSQLTUR(:,:,IBL), PFSQITUR (:,:,IBL), & + & PFPLSL(:,:,IBL), PFPLSN(:,:,IBL), PFHPSL(:,:,IBL), PFHPSN(:,:,IBL),& + & YRECLDP=LOCAL_YRECLDP) + + ENDDO + + END SUBROUTINE CLOUDSC_DRIVER_GPU_SCC + + END MODULE CLOUDSC_DRIVER_GPU_SCC_MOD + + """ + module = Module.from_source(fcode, frontend=frontend) + routine = module.routines[0] + block_size = 33 + + loops = FindNodes(ir.Loop).visit(routine.ir) + loop_var, inner_loop, outer_loop = split_loop(routine, loops[0], block_size) + blocking_indices = [loop_var, routine.variable_map['IBL']] + insert_block_loop_arrays(routine, blocking_indices, inner_loop, outer_loop, block_size) + + # TODO: fortran test? + # print(30 * "-" + "Loops after manipulations" + 30 * "-") + # loops = FindNodes(ir.Loop).visit(routine.ir) + # print(f"{pprint(loops[0])}") + # print(30 * "-" + "Generated FORTRAN" + 30 * "-") + # print(f"{fgen(module.ir)}")