Skip to content

Commit f1091a6

Browse files
authored
Merge the ND-Range Kernel lesson into Data Parallelism (#388)
The "Data Parallelism" and "ND-Range Kernel" lessons duplicated a lot of material, which became confusing when we tried presenting both in a workshop. The duplication is unnecessary and also adds to the maintenance cost. Merge the "ND-Range Kernel" lesson into "Data Parallelism" by moving all the unique slides from "ND-Range Kernel" into "Data Parallelism" and reordering them so the merged lesson has a reasonable flow. The exercises were also similar: "Data Parallelism" tested the sycl::range + sycl::id version, while "ND-Range Kernel" tested sycl::range + sycl::item as well as sycl::nd_range + sycl::nd_item. Merge the two by using most of the code from the "ND-Range Kernel" exercise, but with sycl::id instead of sycl::item. Merge the README instructions from both to retain full detail. The top-level README is adjusted to renumber all lessons after the removed one. Constant memory is deprecated in SYCL 2020, so remove it from the slides about the SYCL memory model.
1 parent bc79770 commit f1091a6

File tree

11 files changed

+435
-797
lines changed

11 files changed

+435
-797
lines changed

Code_Exercises/CMakeLists.txt

-1
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,6 @@ add_subdirectory(Data_and_Dependencies)
5858
add_subdirectory(In_Order_Queue)
5959
add_subdirectory(Advanced_Data_Flow)
6060
add_subdirectory(Multiple_Devices)
61-
add_subdirectory(ND_Range_Kernel)
6261
add_subdirectory(Image_Convolution)
6362
add_subdirectory(Coalesced_Global_Memory)
6463
add_subdirectory(Vectors)

Code_Exercises/Data_Parallelism/README.md

+10-3
Original file line numberDiff line numberDiff line change
@@ -27,13 +27,20 @@ Create `accessor`s to each of the `buffer`s within the command group function.
2727

2828
Now enqueue a parallel kernel function by calling `parallel_for` on the `handler`.
2929

30-
This function takes a `range` specifying the number of iterations of the kernel
31-
function to invoke and the kernel function itself must take an `id` which
32-
represents the current iteration.
30+
#### 4.1 ) Use the `range` and `id` variant
31+
This version of `parallel_for` takes a `range` specifying the number of
32+
iterations of the kernel function to invoke and the kernel function itself must
33+
take an `id` which represents the current iteration.
3334

3435
The `id` can be used in the `accessor` subscript operator to access or assign to
3536
the corresponding element of data that the accessor represents.
3637

38+
#### 4.2 ) Use the `nd_range` and `nd_item` variant
39+
This version of `parallel_for` takes an `nd_range` which is made up of two
40+
`range`s describing the global range and the local range (work-group size). The
41+
kernel function must take an `nd_item`, which cannot be passed directly to the
42+
subscript operator of an `accessor`. Instead, retrieve the `id` using the
43+
`get_global_id` member function.
3744

3845
#### Build And Execution Hints
3946

Code_Exercises/Data_Parallelism/solution.cpp

+62-17
Original file line numberDiff line numberDiff line change
@@ -12,16 +12,54 @@
1212

1313
#include "../helpers.hpp"
1414

15-
class vector_add;
15+
class vector_add_1;
16+
class vector_add_2;
1617

17-
int main() {
18+
void test_range() {
19+
constexpr size_t dataSize = 1024;
20+
21+
int a[dataSize], b[dataSize], r[dataSize];
22+
for (int i = 0; i < dataSize; ++i) {
23+
a[i] = i;
24+
b[i] = i;
25+
r[i] = 0;
26+
}
27+
28+
try {
29+
auto defaultQueue = sycl::queue{};
30+
31+
auto bufA = sycl::buffer{a, sycl::range{dataSize}};
32+
auto bufB = sycl::buffer{b, sycl::range{dataSize}};
33+
auto bufR = sycl::buffer{r, sycl::range{dataSize}};
34+
35+
defaultQueue.submit([&](sycl::handler& cgh) {
36+
sycl::accessor accA{bufA, cgh, sycl::read_only};
37+
sycl::accessor accB{bufB, cgh, sycl::read_only};
38+
sycl::accessor accR{bufR, cgh, sycl::write_only};
39+
40+
cgh.parallel_for<vector_add_1>(
41+
sycl::range{dataSize}, [=](sycl::id<1> globalId) {
42+
accR[globalId] = accA[globalId] + accB[globalId];
43+
});
44+
});
45+
46+
defaultQueue.throw_asynchronous();
47+
} catch (const sycl::exception& e) {
48+
std::cout << "Exception caught: " << e.what() << std::endl;
49+
}
50+
51+
SYCLACADEMY_ASSERT_EQUAL(r, [](size_t i) { return i * 2; });
52+
}
53+
54+
void test_nd_range() {
1855
constexpr size_t dataSize = 1024;
56+
constexpr size_t workGroupSize = 128;
1957

20-
float a[dataSize], b[dataSize], r[dataSize];
58+
int a[dataSize], b[dataSize], r[dataSize];
2159
for (int i = 0; i < dataSize; ++i) {
22-
a[i] = static_cast<float>(i);
23-
b[i] = static_cast<float>(i);
24-
r[i] = 0.0f;
60+
a[i] = i;
61+
b[i] = i;
62+
r[i] = 0;
2563
}
2664

2765
try {
@@ -31,22 +69,29 @@ int main() {
3169
auto bufB = sycl::buffer{b, sycl::range{dataSize}};
3270
auto bufR = sycl::buffer{r, sycl::range{dataSize}};
3371

34-
defaultQueue
35-
.submit([&](sycl::handler& cgh) {
36-
sycl::accessor accA{bufA, cgh, sycl::read_only};
37-
sycl::accessor accB{bufB, cgh, sycl::read_only};
38-
sycl::accessor accR{bufR, cgh, sycl::write_only};
72+
defaultQueue.submit([&](sycl::handler& cgh) {
73+
sycl::accessor accA{bufA, cgh, sycl::read_only};
74+
sycl::accessor accB{bufB, cgh, sycl::read_only};
75+
sycl::accessor accR{bufR, cgh, sycl::write_only};
76+
77+
auto ndRange =
78+
sycl::nd_range{sycl::range{dataSize}, sycl::range{workGroupSize}};
3979

40-
cgh.parallel_for<vector_add>(
41-
sycl::range{dataSize},
42-
[=](sycl::id<1> idx) { accR[idx] = accA[idx] + accB[idx]; });
43-
})
44-
.wait();
80+
cgh.parallel_for<vector_add_2>(ndRange, [=](sycl::nd_item<1> itm) {
81+
sycl::id globalId = itm.get_global_id();
82+
accR[globalId] = accA[globalId] + accB[globalId];
83+
});
84+
});
4585

4686
defaultQueue.throw_asynchronous();
4787
} catch (const sycl::exception& e) {
4888
std::cout << "Exception caught: " << e.what() << std::endl;
4989
}
5090

51-
SYCLACADEMY_ASSERT_EQUAL(r, [](size_t i) { return i * 2.0f; });
91+
SYCLACADEMY_ASSERT_EQUAL(r, [](size_t i) { return i * 2; });
92+
}
93+
94+
int main() {
95+
test_range();
96+
test_nd_range();
5297
}

Code_Exercises/Data_Parallelism/source.cpp

+5-5
Original file line numberDiff line numberDiff line change
@@ -44,17 +44,17 @@
4444
int main() {
4545
constexpr size_t dataSize = 1024;
4646

47-
float a[dataSize], b[dataSize], r[dataSize];
47+
int a[dataSize], b[dataSize], r[dataSize];
4848
for (int i = 0; i < dataSize; ++i) {
49-
a[i] = static_cast<float>(i);
50-
b[i] = static_cast<float>(i);
51-
r[i] = 0.0f;
49+
a[i] = i;
50+
b[i] = i;
51+
r[i] = 0;
5252
}
5353

5454
// Task: Compute r[i] = a[i] + b[i] in parallel on the SYCL device
5555
for (int i = 0; i < dataSize; ++i) {
5656
r[i] = a[i] + b[i];
5757
}
5858

59-
SYCLACADEMY_ASSERT_EQUAL(r, [](size_t i) { return i * 2.0f; });
59+
SYCLACADEMY_ASSERT_EQUAL(r, [](size_t i) { return i * 2; });
6060
}

Code_Exercises/ND_Range_Kernel/CMakeLists.txt

-14
This file was deleted.

Code_Exercises/ND_Range_Kernel/README.md

-48
This file was deleted.

Code_Exercises/ND_Range_Kernel/solution.cpp

-98
This file was deleted.

Code_Exercises/ND_Range_Kernel/source.cpp

-75
This file was deleted.

0 commit comments

Comments
 (0)