Themaister · ActuallyaDeviloper · Mar 2, 2020 · Mar 2, 2020 · Mar 3, 2020 · Mar 6, 2020
diff --git a/glfft.cpp b/glfft.cpp
@@ -522,18 +522,23 @@ static inline unsigned type_to_input_components(Type type)
 
 FFT::FFT(Context *context, unsigned Nx, unsigned Ny,
         Type type, Direction direction, Target input_target, Target output_target,
-        std::shared_ptr<ProgramCache> program_cache, const FFTOptions &options, const FFTWisdom &wisdom)
+        std::shared_ptr<ProgramCache> program_cache, const FFTOptions &options, const FFTWisdom &wisdom,
+        std::string input_load_texture_code, std::unique_ptr<Buffer> reuse_preallocated_temporary_buffer0, std::unique_ptr<Buffer> reuse_preallocated_temporary_buffer1)
     : context(context), cache(move(program_cache)), size_x(Nx), size_y(Ny)
 {
     set_texture_offset_scale(0.5f / Nx, 0.5f / Ny, 1.0f / Nx, 1.0f / Ny);
 
     size_t temp_buffer_size = Nx * Ny * sizeof(float) * (type == ComplexToComplexDual ? 4 : 2);
-    temp_buffer_size >>= options.type.output_fp16;
+    temp_buffer_size >>= options.type.fp16;
 
-    temp_buffer = context->create_buffer(nullptr, temp_buffer_size, AccessStreamCopy);
-    if (output_target != SSBO)
+    temp_buffer = reuse_preallocated_temporary_buffer0 ? 
+        std::move(reuse_preallocated_temporary_buffer0) :
+        context->create_buffer(nullptr, temp_buffer_size, AccessStreamCopy);
+    if (output_target != SSBO || (options.type.output_fp16 && !options.type.fp16)) // @HigherIntermediatePrecision We may need higher intermediate precision.
     {
-        temp_buffer_image = context->create_buffer(nullptr, temp_buffer_size, AccessStreamCopy);
+        temp_buffer_image = reuse_preallocated_temporary_buffer1 ?
+            std::move(reuse_preallocated_temporary_buffer1) :
+            context->create_buffer(nullptr, temp_buffer_size, AccessStreamCopy);
     }
 
     bool expand = false;
@@ -624,7 +629,8 @@ FFT::FFT(Context *context, unsigned Nx, unsigned Ny,
             // If this is the last pass and we're writing to an image, use a special shader variant.
             bool last_pass = index == last_index && i == radix_direction.size() - 1;
 
-            bool input_fp16 = passes.empty() ? options.type.input_fp16 : options.type.output_fp16;
+            bool input_fp16 = passes.empty() ? options.type.input_fp16 : options.type.fp16;
+            bool output_fp16 = last_pass ? options.type.output_fp16 : options.type.fp16;
             Target out_target = last_pass ? output_target : SSBO;
             Target in_target = passes.empty() ? input_target : SSBO;
             Direction dir = direction == InverseConvolve && !passes.empty() ? Inverse : direction;
@@ -642,8 +648,9 @@ FFT::FFT(Context *context, unsigned Nx, unsigned Ny,
                 out_target,
                 p == 1,
                 radix.shared_banked,
-                options.type.fp16, input_fp16, options.type.output_fp16,
+                options.type.fp16, input_fp16, output_fp16,
                 options.type.normalize,
+                input_load_texture_code
             };
 
             const Pass pass = {
@@ -664,7 +671,6 @@ FFT::FFT(Context *context, unsigned Nx, unsigned Ny,
         // This way, we avoid having special purpose transforms for all FFT variants.
         if (index == 0 && (type == ComplexToReal || type == RealToComplex))
         {
-            bool input_fp16 = passes.empty() ? options.type.input_fp16 : options.type.output_fp16;
             bool last_pass = radices[1].empty();
             Direction dir = direction == InverseConvolve && !passes.empty() ? Inverse : direction;
             Target in_target = passes.empty() ? input_target : SSBO;
@@ -673,7 +679,8 @@ FFT::FFT(Context *context, unsigned Nx, unsigned Ny,
             unsigned uv_scale_x = 1;
 
             auto base_opts = options;
-            base_opts.type.input_fp16 = input_fp16;
+            base_opts.type.input_fp16 = passes.empty() ? options.type.input_fp16 : options.type.fp16;
+            base_opts.type.output_fp16 = last_pass ? options.type.output_fp16 : options.type.fp16;
 
             auto &opts = wisdom.find_optimal_options_or_default(Nx, Ny, 2, mode, in_target, out_target, base_opts);
             auto res = build_resolve_radix(Nx, Ny, { opts.workgroup_size_x, opts.workgroup_size_y, 1 });
@@ -731,7 +738,7 @@ void FFT::store_shader_string(const char *path, const string &source)
 unique_ptr<Program> FFT::build_program(const Parameters &params)
 {
     string str;
-    str.reserve(16 * 1024);
+    str.reserve(64 * 1024);
 
 #if 0
     context->log("Building program:\n");
@@ -794,6 +801,10 @@ unique_ptr<Program> FFT::build_program(const Parameters &params)
         str += "#define FFT_CONVOLVE\n";
     }
 
+    str += "#define FFT_LOAD_TEXTURE_CODE ";
+    str += params.input_load_texture_code.empty() ? input_load_texture_code_default : params.input_load_texture_code;
+    str += "\n";
+
     str += params.shared_banked ? "#define FFT_SHARED_BANKED 1\n" : "#define FFT_SHARED_BANKED 0\n";
 
     str += params.direction == Forward ? "#define FFT_FORWARD\n" : "#define FFT_INVERSE\n";
@@ -1001,9 +1012,7 @@ void FFT::process(CommandBuffer *cmd, Resource *output, Resource *input, Resourc
 
     Resource *buffers[2] = {
         input,
-        passes.size() & 1 ?
-            (passes.back().parameters.output_target != SSBO ? temp_buffer_image.get() : output) :
-            temp_buffer.get(),
+        (!temp_buffer_image && passes.size() & 1) ? output : temp_buffer.get() // If no 'temp_buffer_image' is available, we must be use the output buffer directly. 
     };
 
     if (input_aux != 0)
@@ -1058,6 +1067,11 @@ void FFT::process(CommandBuffer *cmd, Resource *output, Resource *input, Resourc
         constant_data.stride = pass.stride;
         p *= pass.parameters.radix;
 
+        if (pass_index + 1 >= passes.size()) // In the last pass we need to inject our output buffer.
+        {
+            buffers[1] = output; 
+        }
+
         if (pass.parameters.input_target != SSBO)
         {
             cmd->bind_texture(BindingTexture0, static_cast<Texture*>(buffers[0]));
@@ -1117,7 +1131,7 @@ void FFT::process(CommandBuffer *cmd, Resource *output, Resource *input, Resourc
                         break;
                 }
             }
-            cmd->bind_storage_texture(BindingImage, static_cast<Texture*>(output), format);
+            cmd->bind_storage_texture(BindingImage, static_cast<Texture*>(buffers[1]), format);
         }
         else
         {
@@ -1144,9 +1158,14 @@ void FFT::process(CommandBuffer *cmd, Resource *output, Resource *input, Resourc
 
         if (pass_index == 0)
         {
-            buffers[0] = passes.size() & 1 ?
-                temp_buffer.get() :
-                (passes.back().parameters.output_target != SSBO ? temp_buffer_image.get() : output);
+            if (!temp_buffer_image) // If no 'temp_buffer_image' is available, we must be use the output buffer directly. 
+            {
+                buffers[0] = passes.size() & 1 ? temp_buffer.get() : output;
+            }
+            else
+            {
+                buffers[0] = temp_buffer_image.get();
+            }
         }
 
         swap(buffers[0], buffers[1]);

diff --git a/glfft.hpp b/glfft.hpp
@@ -55,10 +55,25 @@ class FFT
         /// @param options       FFT options such as performance related parameters and types.
         /// @param wisdom        GLFFT wisdom which can override performance related options
         ///                      (options.performance is used as a fallback).
+        /// @param input_load_texture_code
+        ///                      Custom code for sampling the input texture can be inserted here.
+        ///                      This must only use a single line and must define a function with signature 
+        ///                      "cfloat load_texture(uvec2 coord)" and can call "cfloat load_texture_inner(uvec2 coord)".
+        /// @param reuse_preallocated_temporary_buffer0
+        ///                      For large FFTs also a large internal temporary buffer is required. To reduce memory consumption
+        ///                      you can provide a preallocated buffer here that can be shared with other parts of the program.
+        ///                      The buffer must have size at least Nx * Ny * (type == ComplexToComplexDual ? 4 : 2) * (options.type.fp16 ? 2 : 4).
+        ///                      The provided buffer must not be used while the FFT is in progress and will contain unpredictable garbage data afterwards.
+        /// @param reuse_preallocated_temporary_buffer1
+        ///                      Same as reuse_preallocated_temporary_buffer0 and used only if the output is a texture.
+        ///                      May be aliased with the input if the input if the input is not needed again after processing.
         FFT(Context *context, unsigned Nx, unsigned Ny,
                 Type type, Direction direction, Target input_target, Target output_target,
                 std::shared_ptr<ProgramCache> cache, const FFTOptions &options,
-                const FFTWisdom &wisdom = FFTWisdom());
+                const FFTWisdom &wisdom = FFTWisdom(), 
+                std::string input_load_texture_code = input_load_texture_code_default,
+                std::unique_ptr<Buffer> reuse_preallocated_temporary_buffer0 = nullptr,
+                std::unique_ptr<Buffer> reuse_preallocated_temporary_buffer1 = nullptr);
 
         /// @brief Creates a single stage FFT. Used mostly internally for benchmarking partial FFTs.
         ///
@@ -115,12 +130,12 @@ class FFT
         double get_cost() const { return cost; }
 
         /// @brief Returns number of passes (glDispatchCompute) in a process() call.
-        unsigned get_num_passes() const { return passes.size(); }
+        size_t get_num_passes() const { return passes.size(); }
 
         /// @brief Returns Nx.
-        unsigned get_dimension_x() const { return size_x; }
+        size_t get_dimension_x() const { return size_x; }
         /// @brief Returns Ny.
-        unsigned get_dimension_y() const { return size_y; }
+        size_t get_dimension_y() const { return size_y; }
 
         /// @brief Sets offset and scale parameters for normalized texel coordinates when sampling textures.
         ///

diff --git a/glfft_common.hpp b/glfft_common.hpp
@@ -33,7 +33,7 @@
 namespace GLFFT
 {
 
-enum Direction
+enum Direction : char
 {
     /// Forward FFT transform.
     Forward = -1,
@@ -44,7 +44,7 @@ enum Direction
     Inverse = 1
 };
 
-enum Mode
+enum Mode : char
 {
     Horizontal,
     HorizontalDual,
@@ -55,7 +55,7 @@ enum Mode
     ResolveComplexToReal,
 };
 
-enum Type
+enum Type : char
 {
     /// Regular complex-to-complex transform.
     ComplexToComplex,
@@ -68,7 +68,7 @@ enum Type
     RealToComplex
 };
 
-enum Target
+enum Target : char
 {
     /// GL_SHADER_STORAGE_BUFFER
     SSBO,
@@ -81,6 +81,11 @@ enum Target
     ImageReal
 };
 
+static constexpr char const input_load_texture_code_default[] = 
+    "cfloat load_texture(uvec2 coord) {"
+    "    return load_texture_inner(coord);"
+    "}";
+
 struct Parameters
 {
     unsigned workgroup_size_x;
@@ -96,10 +101,11 @@ struct Parameters
     bool shared_banked;
     bool fft_fp16, input_fp16, output_fp16;
     bool fft_normalize;
-
+    std::string input_load_texture_code; // If empty defaults to input_load_texture_code_default. Unfortunately we can't put it here because that breaks the initializer lists in C++11.
     bool operator==(const Parameters &other) const
     {
-        return std::memcmp(this, &other, sizeof(Parameters)) == 0;
+        return std::memcmp(this, &other, offsetof(Parameters, input_load_texture_code)) == 0
+            && input_load_texture_code == other.input_load_texture_code;
     }
 };
 
@@ -126,7 +132,7 @@ struct FFTOptions
 
     struct Type
     {
-        /// Whether internal shader should be mediump float.
+        /// Whether internal shader and intermediate results should be mediump float.
         bool fp16 = false;
         /// Whether input SSBO is a packed 2xfp16 format. Otherwise, regular FP32.
         bool input_fp16 = false;

diff --git a/glfft_wisdom.cpp b/glfft_wisdom.cpp
@@ -394,7 +394,7 @@ std::pair<double, FFTOptions::Performance> FFTWisdom::study(Context *context, co
                     }
 
                     FFTOptions::Performance perf;
-                    perf.shared_banked = shared_banked;
+                    perf.shared_banked = !!shared_banked;
                     perf.vector_size = vector_size;
                     perf.workgroup_size_x = workgroup_size_x;
                     perf.workgroup_size_y = workgroup_size_y;

diff --git a/glsl/fft_common.comp b/glsl/fft_common.comp
@@ -314,7 +314,7 @@ cfloat load_texture(sampler2D sampler, uvec2 coord)
 #endif
 }
 
-cfloat load_texture(uvec2 coord)
+cfloat load_texture_inner(uvec2 coord)
 {
 #ifdef FFT_CONVOLVE
     // Convolution in frequency domain is multiplication.
@@ -326,6 +326,9 @@ cfloat load_texture(uvec2 coord)
 #endif
 }
 
+// This must define a function with signature cfloat load_texture(uvec2 coord)
+FFT_LOAD_TEXTURE_CODE
+
 // Implement a dummy load_global, or we have to #ifdef out lots of dead code elsewhere.
 #ifdef FFT_VEC8
 cfloat load_global(uint offset)

diff --git a/test/glfft_cli.cpp b/test/glfft_cli.cpp
@@ -143,7 +143,7 @@ struct BenchArguments
     unsigned warmup = 2;
     unsigned iterations = 20;
     unsigned dispatches = 50;
-    unsigned timeout = 1.0;
+    unsigned timeout = 1;
     Type type = ComplexToComplex;
     unsigned size_for_type = 2;
     const char *string_for_type = "C2C";
@@ -335,6 +335,7 @@ static int cli_test(Context *context, int argc, char *argv[])
     cbs.add("--minimum-snr-fp32", [&args](CLIParser &parser) { args.min_snr_fp32 = parser.next_double(); });
     cbs.add("--epsilon-fp16",     [&args](CLIParser &parser) { args.epsilon_fp16 = parser.next_double(); });
     cbs.add("--epsilon-fp32",     [&args](CLIParser &parser) { args.epsilon_fp32 = parser.next_double(); });
+    cbs.add("--single-base-size",      [&args](CLIParser &parser) { args.single_base_size = true; });
 
     cbs.error_handler = [context]{ cli_test_help(context); };
     CLIParser parser(move(cbs), argc, argv);
@@ -406,7 +407,7 @@ static int cli_bench(Context *context, int argc, char *argv[])
     cbs.add("--warmup",         [&args](CLIParser &parser) { args.warmup = parser.next_uint(); });
     cbs.add("--iterations",     [&args](CLIParser &parser) { args.iterations = parser.next_uint(); });
     cbs.add("--dispatches",     [&args](CLIParser &parser) { args.dispatches = parser.next_uint(); });
-    cbs.add("--timeout",        [&args](CLIParser &parser) { args.timeout = parser.next_double(); });
+    cbs.add("--timeout",        [&args](CLIParser &parser) { args.timeout = (unsigned int)parser.next_double(); });
     cbs.add("--fp16",           [&args](CLIParser&)        { args.fp16 = true; });
     cbs.add("--type",           [&args](CLIParser &parser) { args.type = parse_type(parser.next_string(), args); });
     cbs.add("--input-texture",  [&args](CLIParser&)        { args.input_texture = true; });

diff --git a/test/glfft_cli.hpp b/test/glfft_cli.hpp
@@ -40,6 +40,7 @@ namespace GLFFT
             unsigned test_id_min = 0;
             unsigned test_id_max = 0;
             bool exhaustive = true;
+            bool single_base_size = true;
             bool throw_on_fail = false;
             double min_snr_fp16 = 50.0;
             double min_snr_fp32 = 100.0;