|
| 1 | +#include <stdio.h> |
| 2 | +#include <stdlib.h> |
| 3 | +#include <assert.h> |
| 4 | +#include <stdint.h> |
| 5 | +#include <sys/time.h> |
| 6 | + |
/* Benchmark configuration.
 *
 * The floating-point constants carry an `f` suffix so that using them in
 * float expressions does not promote the arithmetic to double, and the
 * negative literal is parenthesized so it expands safely inside any
 * expression. The numeric values are unchanged. */
#define ITERATIONS 2000               /* z <- z^2 + c steps per pixel */
#define WIDTH 3840                    /* image width in pixels */
#define HEIGHT 2160                   /* image height in pixels */
#define CENTER_X (-0.75f)             /* real part of the complex point at the image centre */
#define CENTER_Y (0.0f)               /* imaginary part of the complex point at the image centre */
#define ZOOM (float(HEIGHT) / 2.5f)   /* pixels per unit: the image height spans 2.5 units */
| 13 | + |
/**
 * Evaluates the Mandelbrot iteration z <- z^2 + c for every pixel of a
 * width x height image and stores the squared magnitude |z|^2 reached after
 * exactly ITERATIONS steps.
 *
 * @param dim     Device pointer to two unsigned values: dim[0] is the image
 *                width and dim[1] the image height, both in pixels.
 * @param output  Device pointer to at least width * height floats. Pixel
 *                (px, py) is written at index py * width + px.
 *
 * Launch layout: 1-D grid of 1-D blocks. Each thread starts at its global
 * index and advances by the total number of launched threads, so any grid
 * size covers every pixel. No shared memory is used.
 *
 * There is intentionally no early exit: every pixel performs the same
 * ITERATIONS steps. Orbits that diverge therefore overflow and the stored
 * value becomes +inf or NaN; the consumer must treat those as "outside".
 */
__global__ void mandelbrot(unsigned* dim, float* output) {
    const unsigned width = dim[0];
    const unsigned height = dim[1];

    /* 64-bit index math: width * height and blockDim.x * gridDim.x can each
     * exceed the range of a 32-bit unsigned for large images or grids. */
    const size_t pixel_count = (size_t)width * height;
    const size_t stride = (size_t)blockDim.x * gridDim.x;

    /* Pixels per unit of the complex plane: the image height spans 2.5
     * units. Derived from the runtime height so that the scale stays
     * consistent with the runtime centering below. */
    const float zoom = (float)height / 2.5f;

    for (size_t idx = (size_t)blockDim.x * blockIdx.x + threadIdx.x;
         idx < pixel_count;
         idx += stride) {
        /* Map the pixel to c = (re, im), centred on (CENTER_X, CENTER_Y). */
        float re = (float)(idx % width);
        float im = (float)(idx / width);
        re -= width / 2.0f;
        im -= height / 2.0f;
        re /= zoom;
        im /= zoom;
        re += (float)(CENTER_X);
        im += (float)(CENTER_Y);

        /* z = a + b*i, starting from 0. All literals are float so the hot
         * loop is not silently promoted to double precision. */
        float a = 0.0f;
        float b = 0.0f;
        for (unsigned step = 0; step < ITERATIONS; step++) {
            const float next_a = a * a - b * b + re;
            b = 2.0f * a * b + im;
            a = next_a;
        }

        output[idx] = a * a + b * b;
    }
}
| 36 | + |
/* Terminates the program with a diagnostic when a CUDA runtime call fails.
 * `what` names the failing operation in the message. */
static void check_cuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/**
 * Renders a WIDTH x HEIGHT Mandelbrot image on the GPU, writes it to
 * "out.ppm" as an ASCII graymap (magic number P2) and prints the wall-clock
 * time of the kernel (launch through device synchronization) in seconds.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if an allocation, a CUDA
 * call or the output file fails.
 */
int main() {
    const size_t pixel_count = (size_t)WIDTH * HEIGHT;
    const size_t output_bytes = pixel_count * sizeof(float);
    const unsigned threads_per_block = 256;
    /* Ceiling division: just enough blocks for one thread per pixel. */
    const unsigned block_count =
        (unsigned)((pixel_count + threads_per_block - 1) / threads_per_block);

    unsigned host_dim[2] = {WIDTH, HEIGHT};
    unsigned* device_dim = NULL;
    float* device_output = NULL;
    struct timespec start, end;

    /* Explicit check instead of assert(), which vanishes under NDEBUG. */
    float* host_output = (float*)malloc(output_bytes);
    if (host_output == NULL) {
        fprintf(stderr, "Failed to allocate %zu bytes of host memory\n", output_bytes);
        return EXIT_FAILURE;
    }

    check_cuda(cudaMalloc(&device_dim, sizeof(host_dim)), "cudaMalloc(device_dim)");
    check_cuda(cudaMalloc(&device_output, output_bytes), "cudaMalloc(device_output)");
    check_cuda(cudaMemcpy(device_dim, host_dim, sizeof(host_dim), cudaMemcpyHostToDevice),
               "cudaMemcpy(host -> device)");

    /* Time only the kernel: launch errors are reported by cudaGetLastError,
     * execution errors by the synchronizing call that follows. */
    clock_gettime(CLOCK_MONOTONIC_RAW, &start);
    mandelbrot<<<block_count, threads_per_block>>>(device_dim, device_output);
    check_cuda(cudaGetLastError(), "mandelbrot launch");
    check_cuda(cudaDeviceSynchronize(), "mandelbrot execution");
    clock_gettime(CLOCK_MONOTONIC_RAW, &end);

    check_cuda(cudaMemcpy(host_output, device_output, output_bytes, cudaMemcpyDeviceToHost),
               "cudaMemcpy(device -> host)");
    check_cuda(cudaFree(device_output), "cudaFree(device_output)");
    check_cuda(cudaFree(device_dim), "cudaFree(device_dim)");

    FILE* output = fopen("out.ppm", "w");
    if (output == NULL) {
        perror("out.ppm");
        free(host_output);
        return EXIT_FAILURE;
    }
    fprintf(output, "P2\n%u %u\n255\n", host_dim[0], host_dim[1]);
    for (size_t i = 0; i < pixel_count; i++) {
        /* The kernel stores |z|^2, so the bound |z| <= 2 is |z|^2 <= 4.
         * Diverged pixels hold +inf or NaN; both fail the comparison and
         * are written as 255 (outside the set). */
        fprintf(output, "%d\n", (host_output[i] <= 4.0f) ? (0) : (255));
    }
    if (fclose(output) != 0) {
        perror("out.ppm");
        free(host_output);
        return EXIT_FAILURE;
    }
    free(host_output);

    /* Signed arithmetic: the nanosecond difference may be negative when the
     * seconds field has advanced. */
    const double delta = (double)(end.tv_sec - start.tv_sec) +
                         (double)(end.tv_nsec - start.tv_nsec) / 1e9;
    printf("Time: %.3lf\n", delta);
    return 0;
}
0 commit comments