Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 36 additions & 17 deletions glfft.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -522,18 +522,23 @@ static inline unsigned type_to_input_components(Type type)

FFT::FFT(Context *context, unsigned Nx, unsigned Ny,
Type type, Direction direction, Target input_target, Target output_target,
std::shared_ptr<ProgramCache> program_cache, const FFTOptions &options, const FFTWisdom &wisdom)
std::shared_ptr<ProgramCache> program_cache, const FFTOptions &options, const FFTWisdom &wisdom,
std::string input_load_texture_code, std::unique_ptr<Buffer> reuse_preallocated_temporary_buffer0, std::unique_ptr<Buffer> reuse_preallocated_temporary_buffer1)
: context(context), cache(move(program_cache)), size_x(Nx), size_y(Ny)
{
set_texture_offset_scale(0.5f / Nx, 0.5f / Ny, 1.0f / Nx, 1.0f / Ny);

size_t temp_buffer_size = Nx * Ny * sizeof(float) * (type == ComplexToComplexDual ? 4 : 2);
temp_buffer_size >>= options.type.output_fp16;
temp_buffer_size >>= options.type.fp16;

temp_buffer = context->create_buffer(nullptr, temp_buffer_size, AccessStreamCopy);
if (output_target != SSBO)
temp_buffer = reuse_preallocated_temporary_buffer0 ?
std::move(reuse_preallocated_temporary_buffer0) :
context->create_buffer(nullptr, temp_buffer_size, AccessStreamCopy);
if (output_target != SSBO || (options.type.output_fp16 && !options.type.fp16)) // @HigherIntermediatePrecision We may need higher intermediate precision.
{
temp_buffer_image = context->create_buffer(nullptr, temp_buffer_size, AccessStreamCopy);
temp_buffer_image = reuse_preallocated_temporary_buffer1 ?
std::move(reuse_preallocated_temporary_buffer1) :
context->create_buffer(nullptr, temp_buffer_size, AccessStreamCopy);
}

bool expand = false;
Expand Down Expand Up @@ -624,7 +629,8 @@ FFT::FFT(Context *context, unsigned Nx, unsigned Ny,
// If this is the last pass and we're writing to an image, use a special shader variant.
bool last_pass = index == last_index && i == radix_direction.size() - 1;

bool input_fp16 = passes.empty() ? options.type.input_fp16 : options.type.output_fp16;
bool input_fp16 = passes.empty() ? options.type.input_fp16 : options.type.fp16;
bool output_fp16 = last_pass ? options.type.output_fp16 : options.type.fp16;
Target out_target = last_pass ? output_target : SSBO;
Target in_target = passes.empty() ? input_target : SSBO;
Direction dir = direction == InverseConvolve && !passes.empty() ? Inverse : direction;
Expand All @@ -642,8 +648,9 @@ FFT::FFT(Context *context, unsigned Nx, unsigned Ny,
out_target,
p == 1,
radix.shared_banked,
options.type.fp16, input_fp16, options.type.output_fp16,
options.type.fp16, input_fp16, output_fp16,
options.type.normalize,
input_load_texture_code
};

const Pass pass = {
Expand All @@ -664,7 +671,6 @@ FFT::FFT(Context *context, unsigned Nx, unsigned Ny,
// This way, we avoid having special purpose transforms for all FFT variants.
if (index == 0 && (type == ComplexToReal || type == RealToComplex))
{
bool input_fp16 = passes.empty() ? options.type.input_fp16 : options.type.output_fp16;
bool last_pass = radices[1].empty();
Direction dir = direction == InverseConvolve && !passes.empty() ? Inverse : direction;
Target in_target = passes.empty() ? input_target : SSBO;
Expand All @@ -673,7 +679,8 @@ FFT::FFT(Context *context, unsigned Nx, unsigned Ny,
unsigned uv_scale_x = 1;

auto base_opts = options;
base_opts.type.input_fp16 = input_fp16;
base_opts.type.input_fp16 = passes.empty() ? options.type.input_fp16 : options.type.fp16;
base_opts.type.output_fp16 = last_pass ? options.type.output_fp16 : options.type.fp16;

auto &opts = wisdom.find_optimal_options_or_default(Nx, Ny, 2, mode, in_target, out_target, base_opts);
auto res = build_resolve_radix(Nx, Ny, { opts.workgroup_size_x, opts.workgroup_size_y, 1 });
Expand Down Expand Up @@ -731,7 +738,7 @@ void FFT::store_shader_string(const char *path, const string &source)
unique_ptr<Program> FFT::build_program(const Parameters &params)
{
string str;
str.reserve(16 * 1024);
str.reserve(64 * 1024);

#if 0
context->log("Building program:\n");
Expand Down Expand Up @@ -794,6 +801,10 @@ unique_ptr<Program> FFT::build_program(const Parameters &params)
str += "#define FFT_CONVOLVE\n";
}

str += "#define FFT_LOAD_TEXTURE_CODE ";
str += params.input_load_texture_code.empty() ? input_load_texture_code_default : params.input_load_texture_code;
str += "\n";

str += params.shared_banked ? "#define FFT_SHARED_BANKED 1\n" : "#define FFT_SHARED_BANKED 0\n";

str += params.direction == Forward ? "#define FFT_FORWARD\n" : "#define FFT_INVERSE\n";
Expand Down Expand Up @@ -1001,9 +1012,7 @@ void FFT::process(CommandBuffer *cmd, Resource *output, Resource *input, Resourc

Resource *buffers[2] = {
input,
passes.size() & 1 ?
(passes.back().parameters.output_target != SSBO ? temp_buffer_image.get() : output) :
temp_buffer.get(),
(!temp_buffer_image && passes.size() & 1) ? output : temp_buffer.get() // If no 'temp_buffer_image' is available, we must be use the output buffer directly.
};

if (input_aux != 0)
Expand Down Expand Up @@ -1058,6 +1067,11 @@ void FFT::process(CommandBuffer *cmd, Resource *output, Resource *input, Resourc
constant_data.stride = pass.stride;
p *= pass.parameters.radix;

if (pass_index + 1 >= passes.size()) // In the last pass we need to inject our output buffer.
{
buffers[1] = output;
}

if (pass.parameters.input_target != SSBO)
{
cmd->bind_texture(BindingTexture0, static_cast<Texture*>(buffers[0]));
Expand Down Expand Up @@ -1117,7 +1131,7 @@ void FFT::process(CommandBuffer *cmd, Resource *output, Resource *input, Resourc
break;
}
}
cmd->bind_storage_texture(BindingImage, static_cast<Texture*>(output), format);
cmd->bind_storage_texture(BindingImage, static_cast<Texture*>(buffers[1]), format);
}
else
{
Expand All @@ -1144,9 +1158,14 @@ void FFT::process(CommandBuffer *cmd, Resource *output, Resource *input, Resourc

if (pass_index == 0)
{
buffers[0] = passes.size() & 1 ?
temp_buffer.get() :
(passes.back().parameters.output_target != SSBO ? temp_buffer_image.get() : output);
if (!temp_buffer_image) // If no 'temp_buffer_image' is available, we must be use the output buffer directly.
{
buffers[0] = passes.size() & 1 ? temp_buffer.get() : output;
}
else
{
buffers[0] = temp_buffer_image.get();
}
}

swap(buffers[0], buffers[1]);
Expand Down
23 changes: 19 additions & 4 deletions glfft.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,25 @@ class FFT
/// @param options FFT options such as performance related parameters and types.
/// @param wisdom GLFFT wisdom which can override performance related options
/// (options.performance is used as a fallback).
/// @param input_load_texture_code
/// Custom code for sampling the input texture can be inserted here.
/// This must only use a single line and must define a function with signature
/// "cfloat load_texture(uvec2 coord)" and can call "cfloat load_texture_inner(uvec2 coord)".
/// @param reuse_preallocated_temporary_buffer0
/// For large FFTs also a large internal temporary buffer is required. To reduce memory consumption
/// you can provide a preallocated buffer here that can be shared with other parts of the program.
/// The buffer must have size at least Nx * Ny * (type == ComplexToComplexDual ? 4 : 2) * (options.type.fp16 ? 2 : 4).
/// The provided buffer must not be used while the FFT is in progress and will contain unpredictable garbage data afterwards.
/// @param reuse_preallocated_temporary_buffer1
/// Same as reuse_preallocated_temporary_buffer0 and used only if the output is a texture.
/// May be aliased with the input if the input if the input is not needed again after processing.
FFT(Context *context, unsigned Nx, unsigned Ny,
Type type, Direction direction, Target input_target, Target output_target,
std::shared_ptr<ProgramCache> cache, const FFTOptions &options,
const FFTWisdom &wisdom = FFTWisdom());
const FFTWisdom &wisdom = FFTWisdom(),
std::string input_load_texture_code = input_load_texture_code_default,
std::unique_ptr<Buffer> reuse_preallocated_temporary_buffer0 = nullptr,
std::unique_ptr<Buffer> reuse_preallocated_temporary_buffer1 = nullptr);

/// @brief Creates a single stage FFT. Used mostly internally for benchmarking partial FFTs.
///
Expand Down Expand Up @@ -115,12 +130,12 @@ class FFT
double get_cost() const { return cost; }

/// @brief Returns number of passes (glDispatchCompute) in a process() call.
unsigned get_num_passes() const { return passes.size(); }
size_t get_num_passes() const { return passes.size(); }

/// @brief Returns Nx.
unsigned get_dimension_x() const { return size_x; }
size_t get_dimension_x() const { return size_x; }
/// @brief Returns Ny.
unsigned get_dimension_y() const { return size_y; }
size_t get_dimension_y() const { return size_y; }

/// @brief Sets offset and scale parameters for normalized texel coordinates when sampling textures.
///
Expand Down
20 changes: 13 additions & 7 deletions glfft_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
namespace GLFFT
{

enum Direction
enum Direction : char
{
/// Forward FFT transform.
Forward = -1,
Expand All @@ -44,7 +44,7 @@ enum Direction
Inverse = 1
};

enum Mode
enum Mode : char
{
Horizontal,
HorizontalDual,
Expand All @@ -55,7 +55,7 @@ enum Mode
ResolveComplexToReal,
};

enum Type
enum Type : char
{
/// Regular complex-to-complex transform.
ComplexToComplex,
Expand All @@ -68,7 +68,7 @@ enum Type
RealToComplex
};

enum Target
enum Target : char
{
/// GL_SHADER_STORAGE_BUFFER
SSBO,
Expand All @@ -81,6 +81,11 @@ enum Target
ImageReal
};

static constexpr char const input_load_texture_code_default[] =
"cfloat load_texture(uvec2 coord) {"
" return load_texture_inner(coord);"
"}";

struct Parameters
{
unsigned workgroup_size_x;
Expand All @@ -96,10 +101,11 @@ struct Parameters
bool shared_banked;
bool fft_fp16, input_fp16, output_fp16;
bool fft_normalize;

std::string input_load_texture_code; // If empty defaults to input_load_texture_code_default. Unfortunately we can't put it here because that breaks the initializer lists in C++11.
bool operator==(const Parameters &other) const
{
return std::memcmp(this, &other, sizeof(Parameters)) == 0;
return std::memcmp(this, &other, offsetof(Parameters, input_load_texture_code)) == 0
&& input_load_texture_code == other.input_load_texture_code;
}
};

Expand All @@ -126,7 +132,7 @@ struct FFTOptions

struct Type
{
/// Whether internal shader should be mediump float.
/// Whether internal shader and intermediate results should be mediump float.
bool fp16 = false;
/// Whether input SSBO is a packed 2xfp16 format. Otherwise, regular FP32.
bool input_fp16 = false;
Expand Down
2 changes: 1 addition & 1 deletion glfft_wisdom.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -394,7 +394,7 @@ std::pair<double, FFTOptions::Performance> FFTWisdom::study(Context *context, co
}

FFTOptions::Performance perf;
perf.shared_banked = shared_banked;
perf.shared_banked = !!shared_banked;
perf.vector_size = vector_size;
perf.workgroup_size_x = workgroup_size_x;
perf.workgroup_size_y = workgroup_size_y;
Expand Down
5 changes: 4 additions & 1 deletion glsl/fft_common.comp
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,7 @@ cfloat load_texture(sampler2D sampler, uvec2 coord)
#endif
}

cfloat load_texture(uvec2 coord)
cfloat load_texture_inner(uvec2 coord)
{
#ifdef FFT_CONVOLVE
// Convolution in frequency domain is multiplication.
Expand All @@ -326,6 +326,9 @@ cfloat load_texture(uvec2 coord)
#endif
}

// This must define a function with signature cfloat load_texture(uvec2 coord)
FFT_LOAD_TEXTURE_CODE

// Implement a dummy load_global, or we have to #ifdef out lots of dead code elsewhere.
#ifdef FFT_VEC8
cfloat load_global(uint offset)
Expand Down
5 changes: 3 additions & 2 deletions test/glfft_cli.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ struct BenchArguments
unsigned warmup = 2;
unsigned iterations = 20;
unsigned dispatches = 50;
unsigned timeout = 1.0;
unsigned timeout = 1;
Type type = ComplexToComplex;
unsigned size_for_type = 2;
const char *string_for_type = "C2C";
Expand Down Expand Up @@ -335,6 +335,7 @@ static int cli_test(Context *context, int argc, char *argv[])
cbs.add("--minimum-snr-fp32", [&args](CLIParser &parser) { args.min_snr_fp32 = parser.next_double(); });
cbs.add("--epsilon-fp16", [&args](CLIParser &parser) { args.epsilon_fp16 = parser.next_double(); });
cbs.add("--epsilon-fp32", [&args](CLIParser &parser) { args.epsilon_fp32 = parser.next_double(); });
cbs.add("--single-base-size", [&args](CLIParser &parser) { args.single_base_size = true; });

cbs.error_handler = [context]{ cli_test_help(context); };
CLIParser parser(move(cbs), argc, argv);
Expand Down Expand Up @@ -406,7 +407,7 @@ static int cli_bench(Context *context, int argc, char *argv[])
cbs.add("--warmup", [&args](CLIParser &parser) { args.warmup = parser.next_uint(); });
cbs.add("--iterations", [&args](CLIParser &parser) { args.iterations = parser.next_uint(); });
cbs.add("--dispatches", [&args](CLIParser &parser) { args.dispatches = parser.next_uint(); });
cbs.add("--timeout", [&args](CLIParser &parser) { args.timeout = parser.next_double(); });
cbs.add("--timeout", [&args](CLIParser &parser) { args.timeout = (unsigned int)parser.next_double(); });
cbs.add("--fp16", [&args](CLIParser&) { args.fp16 = true; });
cbs.add("--type", [&args](CLIParser &parser) { args.type = parse_type(parser.next_string(), args); });
cbs.add("--input-texture", [&args](CLIParser&) { args.input_texture = true; });
Expand Down
1 change: 1 addition & 0 deletions test/glfft_cli.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ namespace GLFFT
unsigned test_id_min = 0;
unsigned test_id_max = 0;
bool exhaustive = true;
bool single_base_size = true;
bool throw_on_fail = false;
double min_snr_fp16 = 50.0;
double min_snr_fp32 = 100.0;
Expand Down
Loading