Skip to content

Commit

Permalink
Merge pull request #98 from ecmwf-ifs/nabr-scc-vector-length
Browse files: browse the repository at this point in the history
SCC: Insert vector_length directive into the gang loop
  • Loading branch information
mlange05 authored Jun 21, 2023
2 parents 5fe6af3 + 80f81ac commit cffca77
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 19 deletions.
2 changes: 1 addition & 1 deletion transformations/tests/test_cloudsc.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def test_cloudsc(here, frontend):
# Raise stack limit
resource.setrlimit(resource.RLIMIT_STACK, (resource.RLIM_INFINITY, resource.RLIM_INFINITY))
env = os.environ.copy()
env.update({'OMP_STACKSIZE': '2G'})
env.update({'OMP_STACKSIZE': '2G', 'NVCOMPILER_ACC_CUDA_HEAPSIZE': '2G'})

# For some reason, the 'data' dir symlink is not created???
os.symlink(here/'data', here/'build/data')
Expand Down
27 changes: 14 additions & 13 deletions transformations/tests/test_single_column_coalesced.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

@pytest.fixture(scope='module', name='horizontal')
def fixture_horizontal():
return Dimension(name='horizontal', size='nlon', index='jl', bounds=('start', 'end'))
return Dimension(name='horizontal', size='nlon', index='jl', bounds=('start', 'end'), aliases=('nproma',))


@pytest.fixture(scope='module', name='vertical')
Expand Down Expand Up @@ -638,7 +638,7 @@ def test_scc_annotate_openacc(frontend, horizontal, vertical, blocking):
driver_loops = FindNodes(Loop).visit(driver.body)
assert len(driver_loops) == 1
assert driver_loops[0].pragma[0].keyword == 'acc'
assert driver_loops[0].pragma[0].content == 'parallel loop gang'
assert driver_loops[0].pragma[0].content == 'parallel loop gang vector_length(nlon)'


@pytest.mark.parametrize('frontend', available_frontends())
Expand Down Expand Up @@ -731,7 +731,7 @@ def test_single_column_coalesced_hoist_openacc(frontend, horizontal, vertical, b
driver_loops = FindNodes(Loop).visit(driver.body)
assert len(driver_loops) == 2
assert driver_loops[0].pragma[0].keyword == 'acc'
assert driver_loops[0].pragma[0].content == 'parallel loop gang'
assert driver_loops[0].pragma[0].content == 'parallel loop gang vector_length(nlon)'
assert driver_loops[1].pragma[0].keyword == 'acc'
assert driver_loops[1].pragma[0].content == 'loop vector'

Expand All @@ -744,23 +744,24 @@ def test_single_column_coalesced_hoist_openacc(frontend, horizontal, vertical, b
assert driver_pragmas[1].content == 'exit data delete(t)'


@pytest.mark.parametrize('block_size', ['nlon','nproma'])
@pytest.mark.parametrize('frontend', available_frontends())
def test_scc_wrapper_hoist_openacc(frontend, horizontal, vertical, blocking):
def test_scc_wrapper_hoist_openacc(frontend, horizontal, vertical, blocking, block_size):
"""
Test the correct addition of OpenACC pragmas to SCC format code
when hoisting column array temporaries to driver.
"""

fcode_driver = """
SUBROUTINE column_driver(nlon, nz, q, nb)
INTEGER, INTENT(IN) :: nlon, nz, nb ! Size of the horizontal and vertical
REAL, INTENT(INOUT) :: q(nlon,nz,nb)
fcode_driver = f"""
SUBROUTINE column_driver({block_size}, nz, q, nb)
INTEGER, INTENT(IN) :: {block_size}, nz, nb ! Size of the horizontal and vertical
REAL, INTENT(INOUT) :: q({block_size},nz,nb)
INTEGER :: b, start, end
start = 1
end = nlon
end = {block_size}
do b=1, nb
call compute_column(start, end, nlon, nz, q(:,:,b))
call compute_column(start, end, {block_size}, nz, q(:,:,b))
end do
END SUBROUTINE column_driver
"""
Expand Down Expand Up @@ -832,7 +833,7 @@ def test_scc_wrapper_hoist_openacc(frontend, horizontal, vertical, blocking):
driver_loops = FindNodes(Loop).visit(driver.body)
assert len(driver_loops) == 2
assert driver_loops[0].pragma[0].keyword == 'acc'
assert driver_loops[0].pragma[0].content == 'parallel loop gang'
assert driver_loops[0].pragma[0].content == f'parallel loop gang vector_length({block_size})'
assert driver_loops[1].pragma[0].keyword == 'acc'
assert driver_loops[1].pragma[0].content == 'loop vector'

Expand Down Expand Up @@ -1016,7 +1017,7 @@ def test_single_column_coalesced_nested(frontend, horizontal, vertical, blocking
assert driver_loops[0].variable == 'b'
assert driver_loops[0].bounds == '1:nb'
assert driver_loops[0].pragma[0].keyword == 'acc'
assert driver_loops[0].pragma[0].content == 'parallel loop gang'
assert driver_loops[0].pragma[0].content == 'parallel loop gang vector_length(nlon)'

# Ensure we have a kernel call in the driver loop
kernel_calls = FindNodes(CallStatement).visit(driver_loops[0])
Expand Down Expand Up @@ -1488,7 +1489,7 @@ def test_single_column_coalesced_multiple_acc_pragmas(frontend, horizontal, vert
assert 'data' in pragmas[0].content
assert 'copy' in pragmas[0].content
assert '(work)' in pragmas[0].content
assert pragmas[1].content == 'parallel loop gang'
assert pragmas[1].content == 'parallel loop gang vector_length(nlon)'
assert pragmas[2].content == 'end parallel loop'
assert pragmas[3].content == 'end data'

Expand Down
23 changes: 19 additions & 4 deletions transformations/transformations/single_column_coalesced.py
Original file line number Diff line number Diff line change
Expand Up @@ -469,6 +469,15 @@ def process_driver(self, routine, targets=None, item=None):
if self.directive == 'openacc':
self.device_alloc_column_locals(routine, column_locals)

# For the thread block size, find the horizontal size variable that is available in
# the driver
num_threads = None
symbol_map = routine.symbol_map
for size_expr in self.horizontal.size_expressions:
if size_expr in symbol_map:
num_threads = size_expr
break

with pragmas_attached(routine, ir.Loop, attach_pragma_post=True):
for call in FindNodes(ir.CallStatement).visit(routine.body):
if not call.name in targets:
Expand All @@ -488,7 +497,10 @@ def process_driver(self, routine, targets=None, item=None):
assert not driver_loop == kernel_loop

# Mark driver loop as "gang parallel".
self.annotate_driver(self.directive, driver_loop, kernel_loop, self.block_dim, column_locals)
self.annotate_driver(
self.directive, driver_loop, kernel_loop,
self.block_dim, column_locals, num_threads
)

@classmethod
def device_alloc_column_locals(cls, routine, column_locals):
Expand All @@ -512,7 +524,7 @@ def device_alloc_column_locals(cls, routine, column_locals):
routine.body.append((ir.Comment(''), pragma_post, ir.Comment('')))

@classmethod
def annotate_driver(cls, directive, driver_loop, kernel_loop, block_dim, column_locals):
def annotate_driver(cls, directive, driver_loop, kernel_loop, block_dim, column_locals, num_threads):
"""
Annotate driver block loop with ``'openacc'`` pragmas, and add offload directives
for hoisted column locals.
Expand All @@ -531,6 +543,8 @@ def annotate_driver(cls, directive, driver_loop, kernel_loop, block_dim, column_
to use for hoisted column arrays if hoisting is enabled.
column_locals : list
List of column locals to be hoisted to driver layer
num_threads : str
The size expression that determines the number of threads per thread block
"""

# Mark driver loop as "gang parallel".
Expand All @@ -546,21 +560,22 @@ def annotate_driver(cls, directive, driver_loop, kernel_loop, block_dim, column_
arrays = [v for v in arrays if not any(d in sizes for d in as_tuple(v.shape))]
private_arrays = ', '.join(set(v.name for v in arrays))
private_clause = '' if not private_arrays else f' private({private_arrays})'
vector_length_clause = '' if not num_threads else f' vector_length({num_threads})'

# Annotate vector loops with OpenACC pragmas
if kernel_loop:
kernel_loop._update(pragma=ir.Pragma(keyword='acc', content='loop vector'))

if driver_loop.pragma is None:
p_content = f'parallel loop gang{private_clause}'
p_content = f'parallel loop gang{private_clause}{vector_length_clause}'
driver_loop._update(pragma=(ir.Pragma(keyword='acc', content=p_content),))
driver_loop._update(pragma_post=(ir.Pragma(keyword='acc', content='end parallel loop'),))

# add acc parallel loop gang if the only existing pragma is acc data
elif len(driver_loop.pragma) == 1:
if (driver_loop.pragma[0].keyword == 'acc' and
driver_loop.pragma[0].content.lower().lstrip().startswith('data ')):
p_content = f'parallel loop gang{private_clause}'
p_content = f'parallel loop gang{private_clause}{vector_length_clause}'
driver_loop._update(pragma=(driver_loop.pragma[0], ir.Pragma(keyword='acc', content=p_content)))
driver_loop._update(pragma_post=(ir.Pragma(keyword='acc', content='end parallel loop'),
driver_loop.pragma_post[0]))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,15 @@ def process_driver(self, routine, targets=None, item=None):
if self.directive == 'openacc':
SCCAnnotateTransformation.device_alloc_column_locals(routine, column_locals)

# For the thread block size, find the horizontal size variable that is available in
# the driver
num_threads = None
symbol_map = routine.symbol_map
for size_expr in self.horizontal.size_expressions:
if size_expr in symbol_map:
num_threads = size_expr
break

with pragmas_attached(routine, ir.Loop, attach_pragma_post=True):

for call in FindNodes(ir.CallStatement).visit(routine.body):
Expand All @@ -254,4 +263,4 @@ def process_driver(self, routine, targets=None, item=None):

# Mark driver loop as "gang parallel".
SCCAnnotateTransformation.annotate_driver(self.directive, driver_loop, kernel_loop,
self.block_dim, column_locals)
self.block_dim, column_locals, num_threads)

0 comments on commit cffca77

Please sign in to comment.