From e3fb37b8af44ba7d224833d84888a5cd99e72fcf Mon Sep 17 00:00:00 2001 From: Elliot Saba Date: Thu, 30 Jan 2020 17:28:53 -0800 Subject: [PATCH] Big interface overhaul Eliminate as much runtime checking as possible; create metaprogramming forwards from frontend to backend in a better (yet still imperfect) manner. We are reserving the right for the frontend names such as `conv()` to decide which backend to use at runtime, but for now, all `conv()` calls (but not all pooling calls) are dispatched at compile-time, which is nice. --- Manifest.toml | 53 +++ Project.toml | 1 + deps/build.jl | 50 --- src/NNlib.jl | 54 +-- src/{ => activation}/activation.jl | 2 + src/{ => activation}/softmax.jl | 0 src/conv.jl | 181 --------- .../PaddingEdges.jl} | 0 src/{ => dim_helpers}/dim_helpers.jl | 21 +- src/{impl => direct}/conv_direct.jl | 20 +- src/{impl => direct}/depthwiseconv_direct.jl | 3 +- src/direct/direct.jl | 33 ++ src/{impl => direct}/pooling_direct.jl | 3 + src/{impl => im2col}/conv_im2col.jl | 4 +- src/{impl => im2col}/depthwiseconv_im2col.jl | 4 +- src/{ => im2col}/gemm.jl | 0 src/im2col/im2col.jl | 27 ++ src/interface.jl | 66 ++++ src/interface_impl.jl | 361 ++++++++++++++++++ src/nnpack/NNPACK.jl | 52 ++- src/nnpack/impl.jl | 50 --- src/nnpack/interface.jl | 110 +++--- src/nnpack/libnnpack.jl | 2 - .../{performance.jl => multithreading.jl} | 10 +- src/pooling.jl | 155 -------- test/conv.jl | 7 +- test/inference.jl | 10 +- 27 files changed, 695 insertions(+), 584 deletions(-) delete mode 100644 deps/build.jl rename src/{ => activation}/activation.jl (99%) rename src/{ => activation}/softmax.jl (100%) delete mode 100644 src/conv.jl rename src/{impl/padding_edges.jl => dim_helpers/PaddingEdges.jl} (100%) rename src/{ => dim_helpers}/dim_helpers.jl (93%) rename src/{impl => direct}/conv_direct.jl (92%) rename src/{impl => direct}/depthwiseconv_direct.jl (98%) create mode 100644 src/direct/direct.jl rename src/{impl => direct}/pooling_direct.jl (98%) rename src/{impl => im2col}/conv_im2col.jl (99%) rename src/{impl => im2col}/depthwiseconv_im2col.jl (98%) rename src/{ => im2col}/gemm.jl (100%) create mode 100644 src/im2col/im2col.jl create mode 100644 src/interface.jl create mode 100644 src/interface_impl.jl delete mode 100644 src/nnpack/impl.jl rename src/nnpack/{performance.jl => multithreading.jl} (87%) delete mode 100644 src/pooling.jl diff --git a/Manifest.toml b/Manifest.toml index 260e456b6..f70d8a9ea 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -1,11 +1,29 @@ # This file is machine-generated - editing it directly is not advised +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + [[BinaryProvider]] deps = ["Libdl", "SHA"] git-tree-sha1 = "5b08ed6036d9d3f0ee6369410b830f8873d4024c" uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" version = "0.5.8" +[[Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[Distributed]] +deps = ["Random", "Serialization", "Sockets"] +uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" + +[[InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[LibGit2]] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + [[Libdl]] uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" @@ -13,6 +31,31 @@ uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" deps = ["Libdl"] uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[NNPACK_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = 
"c3d1a616362645754b18e12dbba96ec311b0867f" +uuid = "a6bfbf70-4841-5cb9-aa18-3a8ad3c413ee" +version = "2018.6.22+0" + +[[Pkg]] +deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Test", "UUIDs"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + [[Random]] deps = ["Serialization"] uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" @@ -29,6 +72,9 @@ uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" [[Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" +[[Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + [[SparseArrays]] deps = ["LinearAlgebra", "Random"] uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" @@ -37,6 +83,13 @@ uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" deps = ["LinearAlgebra", "SparseArrays"] uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +[[Test]] +deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + [[UUIDs]] deps = ["Random", "SHA"] uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" diff --git a/Project.toml b/Project.toml index 83b5223b8..41b3c0193 100644 --- a/Project.toml +++ b/Project.toml @@ -6,6 +6,7 @@ version = "0.6.4" BinaryProvider = "b99e7846-7c00-51b0-8f62-c81ae34c0232" Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +NNPACK_jll = "a6bfbf70-4841-5cb9-aa18-3a8ad3c413ee" Requires = "ae029012-a4dd-5104-9daa-d747884805df" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" diff --git a/deps/build.jl b/deps/build.jl deleted file mode 100644 index 3da8d53c2..000000000 --- a/deps/build.jl +++ /dev/null @@ -1,50 +0,0 @@ -using BinaryProvider - -# Parse some basic command-line arguments -const verbose = "--verbose" in ARGS -const prefix = Prefix(get([a for a in ARGS if a != "--verbose"], 1, joinpath(@__DIR__, "usr"))) -products = [ - LibraryProduct(prefix, ["libnnpack"], :libnnpack), -] - -# Download binaries from hosted location -bin_prefix = "https://github.com/JuliaPackaging/Yggdrasil/releases/download/NNPACK-v2018.06.22-0" - -# Listing of files generated by BinaryBuilder: -download_info = Dict( - Linux(:aarch64, libc=:glibc) => ("$bin_prefix/NNPACK.v2018.6.22.aarch64-linux-gnu.tar.gz", "e0c6e21ba4c47acfd5a3d3e3510e8786474080f654338f4583b88860296c1437"), - Linux(:i686, libc=:glibc) => ("$bin_prefix/NNPACK.v2018.6.22.i686-linux-gnu.tar.gz", "e9b6685001bc5a5d17acef15f3f6ffeb7beb6081926300f23ed4a442beac71ca"), - Linux(:i686, libc=:musl) => ("$bin_prefix/NNPACK.v2018.6.22.i686-linux-musl.tar.gz", "36c1d3c30b3bc3e0b34f215945bb46319f88e28f011fc758f21ba888b1fd9e25"), - MacOS(:x86_64) => ("$bin_prefix/NNPACK.v2018.6.22.x86_64-apple-darwin14.tar.gz", "b30046223a11470b15a2ceb0d0df6f7d8a43260fe52f4a2f8ebe5f0b2df822ca"), - Linux(:x86_64, libc=:glibc) => ("$bin_prefix/NNPACK.v2018.6.22.x86_64-linux-gnu.tar.gz", "150d5b6ca81fa72bfdc8bbda2428f0d3483fd11a5813724646c6d6c6a7ef969f"), - Linux(:x86_64, libc=:musl) => ("$bin_prefix/NNPACK.v2018.6.22.x86_64-linux-musl.tar.gz", "d961a104f814ec5b356519a82746a70a1df193ae37fc8130f38ffb61336def16"), -) - -# Install unsatisfied or updated dependencies: -unsatisfied = any(!satisfied(p; verbose=verbose) for p in products) -dl_info = choose_download(download_info, platform_key_abi()) -if dl_info === nothing && unsatisfied - # If we don't have a 
compatible .tar.gz to download, complain. - # Alternatively, you could attempt to install from a separate provider, - # build from source or something even more ambitious here. - @warn "Your platform (\"$(Sys.MACHINE)\", parsed as \"$(triplet(platform_key_abi()))\") is not supported by NNPACK! - You will only be able to use only the default NNlib backend." -end - -# If we have a download, and we are unsatisfied (or the version we're -# trying to install is not itself installed) then load it up! -# Download and install binaries -use_nnpack = get(ENV, "NNLIB_USE_NNPACK", "false") == "true" -os_support = Sys.islinux() || Sys.isapple() -if use_nnpack && os_support - if unsatisfied || !isinstalled(dl_info...; prefix=prefix) - install(dl_info...; prefix=prefix, force=true, verbose=verbose) - end - # Write out a deps.jl file that will contain mappings for our products - write_deps_file(joinpath(@__DIR__, "deps.jl"), products, verbose=verbose) -else - open(joinpath(@__DIR__, "deps.jl"), "w") do io - write(io, "check_deps() = false") - end -end - diff --git a/src/NNlib.jl b/src/NNlib.jl index aa08bc299..ed9fee187 100644 --- a/src/NNlib.jl +++ b/src/NNlib.jl @@ -1,34 +1,38 @@ module NNlib -using Requires -# Include APIs -include("dim_helpers.jl") +# Start with the simplest stuff in here; activation functions +include("activation/activation.jl") +include("activation/softmax.jl") -# NNPACK support -include(joinpath(@__DIR__, "..", "deps", "deps.jl")) -if check_deps() == nothing - include("nnpack/NNPACK.jl") -else - is_nnpack_available() = false -end +# Load dimensionality helpers for convolution dispatching +include("dim_helpers/dim_helpers.jl") + +# Define our convolution/pooling interface backend holders +include("interface.jl") -include("activation.jl") -include("softmax.jl") -include("gemm.jl") -include("conv.jl") -include("pooling.jl") +# Begin with straightforward direct implementations +include("direct/direct.jl") +# Next, im2col implementations +include("im2col/im2col.jl") -## Include implementations -include("impl/padding_edges.jl") +# Next, NNPACK implementations +using NNPACK_jll -# Direct implementations of convolutional and depthwise-convolutional algorithms -include("impl/conv_direct.jl") -include("impl/depthwiseconv_direct.jl") -# im2col implementations of convolutional and depthwise-convolutional algorithms -include("impl/conv_im2col.jl") -include("impl/depthwiseconv_im2col.jl") +# Check to see if NNPACK_jll is loadable +if isdefined(NNPACK_jll, :libnnpack) + include("nnpack/NNPACK.jl") +else + # Otherwise, signal to the rest of the world that this is unavailable + """ + is_nnpack_available() + + Checks if the current platform/hardware is supported by NNPACK. + Your platform sadly, is not supported by NNPACK. + """ + is_nnpack_available() = false +end -# Direct implementations of pooling -include("impl/pooling_direct.jl") +# Finally, generate all the goodies for conv() and maxpool() and friends! 
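# --- Illustrative sketch (not part of this patch) --------------------------------
# Whichever branch above is taken, downstream code can query NNPACK availability in
# a uniform way; the fallback definition simply always answers `false`.
# `pick_backend` is a made-up helper name used purely for illustration.
pick_backend() = NNlib.is_nnpack_available() ? :nnpack : :im2col
# ----------------------------------------------------------------------------------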
+include("interface_impl.jl") end # module NNlib diff --git a/src/activation.jl b/src/activation/activation.jl similarity index 99% rename from src/activation.jl rename to src/activation/activation.jl index 9dffa5b18..08f2b7ed0 100644 --- a/src/activation.jl +++ b/src/activation/activation.jl @@ -1,3 +1,5 @@ +using Requires + export σ, sigmoid, relu, leakyrelu, elu, gelu, swish, selu, softplus, softsign, logσ, logsigmoid, logcosh, mish diff --git a/src/softmax.jl b/src/activation/softmax.jl similarity index 100% rename from src/softmax.jl rename to src/activation/softmax.jl diff --git a/src/conv.jl b/src/conv.jl deleted file mode 100644 index b2997d8f2..000000000 --- a/src/conv.jl +++ /dev/null @@ -1,181 +0,0 @@ -export conv, conv!, ∇conv_data, ∇conv_data!, ∇conv_filter, ∇conv_filter!, depthwiseconv, - depthwiseconv!, ∇depthwiseconv_data, ∇depthwiseconv_data!, ∇depthwiseconv_filter, - ∇depthwiseconv_filter! - -## Convolution API -# -# We provide the following generic methods, for 3d, 4d, and 5d tensors, calculating 1d, -# 2d and 3d convolutions, based on the rank of the input tensors, in both mutating and -# non-mutating auto-allocating variants: -# - Convolution: -# - conv(x, w, cdims) -# - conv!(y, x, w, cdims) -# - Convolution data backpropagation -# - ∇conv_data(dy, w, cdims) -# - ∇conv_data!(dx, dy, w, cdims) -# - Convolution filter backpropagation -# - ∇conv_filter(x, dy, cdims) -# - ∇conv_filter!(dw, x, dy, cdims) -# -# All methods require a `ConvDims` object to define the dimensions and optional -# elements of the convolution (padding, stride, dilation, kernel-flipping, etc...), -# which is easily constructable through something like `DenseConvDims(x, w)`. All -# methods take in the `ConvDims` of the associated normal, forward-pass convolution, -# that is, the following is legal: -# -# cdims = ConvDims(x, w; stride=2, dilation=(3,2)) -# dx = ∇conv_data(conv(x, w, cdims), w, cdims) - - - -# First, we will define mappings from the generic API names to our accelerated backend -# implementations. For homogeneous-datatype 1, 2 and 3d convolutions, we default to using -# im2col + GEMM. Do so in a loop, here: -for (front_name, backend) in ( - # This maps from public, front-facing name, to internal backend name - :conv => :im2col, - :∇conv_data => :im2col, - :∇conv_filter => :im2col, - :depthwiseconv => :im2col, - :∇depthwiseconv_data => :im2col, - :∇depthwiseconv_filter => :im2col, - ) - - # These are the GEMM types we will accelerate with `im2col` - G = Union{[x[2] for x in gemm_datatype_mappings]...} - - # We only define 3d conv primitives, we reshape lower down to get 1d and 2d convolution - @eval begin - # im2col-accelerated function forwarding definition - function $(Symbol("$(front_name)!"))( - out::AbstractArray{T,5}, in1::AbstractArray{T,5}, - in2::AbstractArray{T,5}, cdims::ConvDims; kwargs...) where {T <: $G} - $(Symbol("$(front_name)_$(backend)!"))(out, in1, in2, cdims; kwargs...) - end - end -end - -# Our strategy for 1d and 2d convolution is to reshape to 3d convolutions, which -# makes things MUCH EASIER for us on the backend side, and is in general pretty fast, -# since we can specialize on sizes. -for front_name in (:conv, :∇conv_data, :∇conv_filter, - :depthwiseconv, :∇depthwiseconv_data, :∇depthwiseconv_filter) - for backend in (Symbol(), :_direct, :_im2col) - for N in (3, 4) - @eval begin - function $(Symbol("$(front_name)$(backend)!"))( - y::AbstractArray{yT,$N}, x::AbstractArray{xT,$N}, - w::AbstractArray{wT,$N}, cdims::ConvDims; - kwargs...) 
where {yT, xT, wT} - $(Symbol("$(front_name)$(backend)!"))( - insert_singleton_spatial_dimension(y, $(5 - N)), - insert_singleton_spatial_dimension(x, $(5 - N)), - insert_singleton_spatial_dimension(w, $(5 - N)), - insert_singleton_spatial_dimension(cdims, $(5 - N)); - kwargs... - ) - - # We explicitly return `y` here, because the backend call - # itself may return a reshaped view, which we don't want. - return y - end - end - end - end -end - -# We always support a fallback, non-accelerated path, where we use the direct, but -# slow, implementations. These should not typically be used, hence the `@debug`, -# but let's ggo ahead and define them first: -for front_name in (:conv, :∇conv_data, :∇conv_filter, - :depthwiseconv, :∇depthwiseconv_data, :∇depthwiseconv_filter) - @eval begin - function $(Symbol("$(front_name)!"))( - y::AbstractArray{yT,N}, in1::AbstractArray{T1,N}, - in2::AbstractArray{T2,N}, cdims::ConvDims; - kwargs...) where {yT, T1, T2, N} - @debug string("Slow fallback implementation invoked for ", $front_name, "! ", - "You probably don't want this; check your datatypes.") - $(Symbol("$(front_name)_direct!"))(y, in1, in2, cdims; kwargs...) - end - end -end - -# Finally, let's generate auto-allocating versions of all our functions, for all backends. -# We `@timeit` these methods separately, as we want to know how much time is spent in -# allocation. :P -for backend in (Symbol(), :_direct, :_im2col) - # First make auto-allocating versions of the conv()-like calls: - for name in (:conv, :depthwiseconv) - @eval begin - function $(Symbol("$(name)$(backend)"))( - x::AbstractArray{xT,N}, w::AbstractArray{wT,N}, - cdims::ConvDims; kwargs...) where {xT, wT, N} - y = similar(x, promote_type(xT, wT), output_size(cdims)..., - channels_out(cdims), size(x,N)) - return $(Symbol("$(name)$(backend)!"))(y, x, w, cdims; kwargs...) - end - end - end - - for name in (:∇conv_data, :∇depthwiseconv_data) - @eval begin - function $(Symbol("$(name)$(backend)"))( - dy::AbstractArray{yT,N}, w::AbstractArray{wT,N}, - cdims::ConvDims; kwargs...) where {yT, wT, N} - dx = similar(dy, input_size(cdims)..., channels_in(cdims), - size(dy, N)) - return $(Symbol("$(name)$(backend)!"))(dx, dy, w, cdims; kwargs...) - end - end - end - - # We do the conv/depthwiseconv filter backprops separately, as the shape calculation - # for `w` is slightly different for depthwise than for normal dense convolution. - @eval begin - function $(Symbol("∇conv_filter$(backend)"))( - x::AbstractArray{xT,N}, dy::AbstractArray{yT,N}, - cdims::ConvDims; kwargs...) where {xT, yT, N} - dw = similar(dy, kernel_size(cdims)..., channels_in(cdims), - channels_out(cdims)) - return $(Symbol("∇conv_filter$(backend)!"))(dw, x, dy, cdims; kwargs...) - end - end - - @eval begin - function $(Symbol("∇depthwiseconv_filter$(backend)"))( - x::AbstractArray{xT,N}, dy::AbstractArray{yT,N}, - cdims::ConvDims; kwargs...) where {xT, yT, N} - dw = similar(dy, kernel_size(cdims)..., channel_multiplier(cdims), - channels_in(cdims)) - return $(Symbol("∇depthwiseconv_filter$(backend)!"))(dw, x, dy, cdims; - kwargs...) - end - end -end - - -# Use NNPACK if it is available and the operation is supported -if is_nnpack_available() - function conv(x::Array{xT, 4}, w::Array{wT, 4}, - cdims::DenseConvDims{2, K, C_in, C_out, (1, 1), P, (1, 1), F}; - kwargs...) where {xT, wT, K, C_in, C_out, P, F} - return conv_nnpack(x, w, cdims; kwargs...) 
- end -end - -function conv(x, w::AbstractArray{T, N}; stride = 1, pad = 0, dilation = 1, flipped = false) where {T, N} - stride = expand(Val(N-2), stride) - pad = expand(Val(N-2), pad) - dilation = expand(Val(N-2), dilation) - cdims = DenseConvDims(x, w; stride = stride, padding = pad, dilation = dilation, flipkernel = flipped) - return conv(x, w, cdims) -end - -function depthwiseconv(x, w::AbstractArray{T, N}; stride = 1, pad = 0, dilation = 1, flipped = false) where {T, N} - stride = expand(Val(N-2), stride) - pad = expand(Val(N-2), pad) - dilation = expand(Val(N-2), dilation) - cdims = DepthwiseConvDims(x, w; stride = stride, padding = pad, dilation = dilation, flipkernel = flipped) - return depthwiseconv(x, w, cdims) -end diff --git a/src/impl/padding_edges.jl b/src/dim_helpers/PaddingEdges.jl similarity index 100% rename from src/impl/padding_edges.jl rename to src/dim_helpers/PaddingEdges.jl diff --git a/src/dim_helpers.jl b/src/dim_helpers/dim_helpers.jl similarity index 93% rename from src/dim_helpers.jl rename to src/dim_helpers/dim_helpers.jl index 22d5636a7..ade8b9d8d 100644 --- a/src/dim_helpers.jl +++ b/src/dim_helpers/dim_helpers.jl @@ -1,9 +1,9 @@ # Various helper functions to calculate dimensions for operations -include("dim_helpers/ConvDims.jl") -include("dim_helpers/DenseConvDims.jl") -include("dim_helpers/DepthwiseConvDims.jl") -include("dim_helpers/PoolDims.jl") - +include("ConvDims.jl") +include("DenseConvDims.jl") +include("DepthwiseConvDims.jl") +include("PoolDims.jl") +include("PaddingEdges.jl") """ transpose_swapbatch(x::AbstractArray) @@ -130,11 +130,8 @@ end Reorders the weight tensor for supporting both convolution and cross-correlation operations. """ - -# For any array with ndims <= 3 it makes no sense to flip the weights so simply return the -# original array -@inline flipweight(w::AbstractArray) = w - -@inline flipweight(w::AbstractArray{T, 4}) where {T} = w[end:-1:1, end:-1:1, :, :] - @inline flipweight(w::AbstractArray{T, 5}) where {T} = w[end:-1:1, end:-1:1, end:-1:1, :, :] +@inline flipweight(w::AbstractArray{T, 4}) where {T} = w[end:-1:1, end:-1:1, :, :] +@inline flipweight(w::AbstractArray{T, 3}) where {T} = w[end:-1:1, :, :] +# For ndims < 3 it makes no sense to flip the weights so simply return the original array +@inline flipweight(w::AbstractArray) = w diff --git a/src/impl/conv_direct.jl b/src/direct/conv_direct.jl similarity index 92% rename from src/impl/conv_direct.jl rename to src/direct/conv_direct.jl index 5f5b7c4c3..bf86eadec 100644 --- a/src/impl/conv_direct.jl +++ b/src/direct/conv_direct.jl @@ -1,5 +1,5 @@ ## This file contains direct Julia implementations of 2d and 3d convolutions -using Base.Threads +export conv_direct!, ∇conv_data_direct!, ∇conv_filter_direct! # Helper functions for restricting x/w overreach function clamp_lo(x, w) @@ -22,12 +22,12 @@ end Direct convolution implementation; used for debugging, tests, and mixing/matching of strange datatypes within a single convolution. Uses naive nested for loop implementation -and does not attempt to optimize performance. Rather, this implementation is intended to -be maximally understandable and debuggable, to aid in testing other, more performant -implementations. We also explicitly support mixing and matching of strange datatypes, -so that if the user really wants to convolve an image of `UInt8`'s with a `Float16` -kernel, storing the result in a `Float32` output, there is at least a function call -for that madness. 
+and does not attempt to optimize performance at the cost of readability. Rather, this +implementation is intended to be maximally understandable and debuggable, to aid in +testing other, more performant implementations. We also explicitly support mixing and +matching of strange datatypes, so that if the user really wants to convolve an image of +`UInt8`'s with a `Float16` kernel, storing the result in a `Float32` output, there is at +least one callable function for that madness. The keyword arguments `alpha` and `beta` control accumulation behavior; this function calculates `y = alpha * x * w + beta * y`, therefore by setting `beta` to a nonzero @@ -43,8 +43,6 @@ The basic implementation performs 3-dimensional convolution; 1-dimensional and 2 dimensional casesa are supported by simply reshaping `y`, `x` and `w`, for which wrapper methods are available. """ -conv_direct! - function conv_direct!(y::AbstractArray{yT,5}, x::AbstractArray{xT,5}, w::AbstractArray{wT,5}, cdims::DenseConvDims; alpha::yT = yT(1), beta = false) where {yT, xT, wT} @@ -150,8 +148,6 @@ end Calculate the gradient imposed upon `x` in the convolution `y = x * w`. """ -∇conv_data_direct! - function ∇conv_data_direct!(dx::AbstractArray{xT,5}, dy::AbstractArray{yT,5}, w::AbstractArray{wT,5}, cdims::DenseConvDims; alpha::xT=xT(1), beta=false) where {xT, yT, wT} @@ -169,8 +165,6 @@ end Calculate the gradient imposed upon `w` in the convolution `y = x * w`. """ -∇conv_filter_direct! - function ∇conv_filter_direct!(dw::AbstractArray{wT,5}, x::AbstractArray{xT,5}, dy::AbstractArray{yT,5}, cdims::DenseConvDims; alpha::wT=wT(1), beta=false) where {xT, yT, wT} diff --git a/src/impl/depthwiseconv_direct.jl b/src/direct/depthwiseconv_direct.jl similarity index 98% rename from src/impl/depthwiseconv_direct.jl rename to src/direct/depthwiseconv_direct.jl index b6822a488..d7ba96689 100644 --- a/src/impl/depthwiseconv_direct.jl +++ b/src/direct/depthwiseconv_direct.jl @@ -1,4 +1,5 @@ ## This file contains direct Julia implementations of depwthwise convolutions +export depthwiseconv_direct!, ∇depthwiseconv_data_direct!, ∇depthwiseconv_filter_direct! """ depthwiseconv_direct!(y, x, w, cdims; alpha=1, beta=0) @@ -130,8 +131,6 @@ get applied to it. The output of such a convolution is the gradient imposed upo particular channel of `x`, and so we simply walk through `x`, calculating the gradient for each batch and channel independently. """ -∇depthwiseconv_data_direct! - function ∇depthwiseconv_data_direct!( dx::AbstractArray{xT,5}, dy::AbstractArray{yT,5}, w::AbstractArray{wT,5}, cdims::DepthwiseConvDims; diff --git a/src/direct/direct.jl b/src/direct/direct.jl new file mode 100644 index 000000000..10a135f85 --- /dev/null +++ b/src/direct/direct.jl @@ -0,0 +1,33 @@ +""" +Direct implementations of convolution, pooling, etc... +""" +module Direct +using ..NNlib +using ..NNlib: output_size, input_size, kernel_size, channels_in, channels_out, check_dims, + spatial_dims, stride, padding, dilation, flipkernel, calc_padding_regions, + transpose_swapbatch, predilate, transpose_pad, channel_multiplier + +include("conv_direct.jl") +include("depthwiseconv_direct.jl") +include("pooling_direct.jl") + +# Here we register our convolution and pooling methods with the parent NNlib module. +# We have direct implementations of just about everything, so push them all! 
+import ..conv_backends, ..pooling_backends +push!(conv_backends[:conv], :direct) +push!(conv_backends[:∇conv_data], :direct) +push!(conv_backends[:∇conv_filter], :direct) +push!(conv_backends[:depthwiseconv], :direct) +push!(conv_backends[:∇depthwiseconv_data], :direct) +push!(conv_backends[:∇depthwiseconv_filter], :direct) + +push!(pooling_backends[:maxpool], :direct) +push!(pooling_backends[:meanpool], :direct) + +end # module Direct + +# Self-using? Yes. +using .Direct +import .Direct: conv_direct!, ∇conv_data_direct!, ∇conv_filter_direct!, + depthwiseconv_direct!, ∇depthwiseconv_data_direct!, ∇depthwiseconv_filter_direct!, + meanpool_direct!, maxpool_direct!, ∇meanpool_direct!, ∇maxpool_direct! \ No newline at end of file diff --git a/src/impl/pooling_direct.jl b/src/direct/pooling_direct.jl similarity index 98% rename from src/impl/pooling_direct.jl rename to src/direct/pooling_direct.jl index f95ab32f5..3a03be7c3 100644 --- a/src/impl/pooling_direct.jl +++ b/src/direct/pooling_direct.jl @@ -1,3 +1,6 @@ +## This file contains direct Julia implementations of pooling operations +export meanpool_direct!, maxpool_direct! + using Statistics # Pooling is so similar, we abstract over meanpooling and maxpooling, simply replacing diff --git a/src/impl/conv_im2col.jl b/src/im2col/conv_im2col.jl similarity index 99% rename from src/impl/conv_im2col.jl rename to src/im2col/conv_im2col.jl index e06231325..e80576f05 100644 --- a/src/impl/conv_im2col.jl +++ b/src/im2col/conv_im2col.jl @@ -1,6 +1,8 @@ ## This file contains im2col-backed implementations of convolution for 2d and 3d ## convolutions. Expect to see a lot of indexing. +export conv_im2col!, ∇conv_data_im2col!, ∇conv_filter_im2col! + # Helper functions for flipkernel-induced dyslexia @inline function kernel_index(w, h, d, cdims::ConvDims{N, S, P, D, false}) where {N, S, P, D} kernel_w, kernel_h, kernel_d = kernel_size(cdims) @@ -293,8 +295,6 @@ Note that this method has not been optimized in the same way as `im2col()` has, it is slightly more complicated due to the more chaotic data access patterns, and I'm not desperate enough yet. """ -col2im! - function col2im!(x::AbstractArray{T,4}, col::AbstractArray{T,2}, cdims::ConvDims) where T if spatial_dims(cdims) != 3 diff --git a/src/impl/depthwiseconv_im2col.jl b/src/im2col/depthwiseconv_im2col.jl similarity index 98% rename from src/impl/depthwiseconv_im2col.jl rename to src/im2col/depthwiseconv_im2col.jl index 145dc9961..c1a3de8a4 100644 --- a/src/impl/depthwiseconv_im2col.jl +++ b/src/im2col/depthwiseconv_im2col.jl @@ -1,5 +1,5 @@ ## This file contains adapter code for doing depthwise convolutions with im2col. - +export depthwiseconv_im2col!, ∇depthwiseconv_data_im2col!, ∇depthwiseconv_filter_im2col! """ depthwiseconv_im2col!(y, x, w, cdims, col=similar(x); alpha=1, beta=0) @@ -8,8 +8,6 @@ Perform a depthwise convolution using im2col and GEMM, store the result in `y`. See `conv_im2col!()` for an explanation of optional parameters. """ -depthwiseconv_im2col! 
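# --- Illustrative sketch (not part of this patch) --------------------------------
# The `push!` registrations above are how each backend module announces itself to
# the parent NNlib module.  Once NNlib is loaded, the registries can be inspected to
# see which backends made it in on the current machine; the values in the comments
# are what the include order in src/NNlib.jl would produce.
using NNlib
NNlib.conv_backends[:conv]        # Any[:direct, :im2col] (plus :nnpack when libnnpack loads)
NNlib.pooling_backends[:maxpool]  # Any[:direct] (plus :nnpack when libnnpack loads)
# Note that interface_impl.jl only generates forwarding methods for the three
# backend symbols it knows about (:direct, :im2col, :nnpack), so registering a new
# symbol alone is not enough to wire up an additional backend.
# ----------------------------------------------------------------------------------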
- function depthwiseconv_im2col!( y::AbstractArray{T,5}, x::AbstractArray{T,5}, w::AbstractArray{T,5}, cdims::DepthwiseConvDims; diff --git a/src/gemm.jl b/src/im2col/gemm.jl similarity index 100% rename from src/gemm.jl rename to src/im2col/gemm.jl diff --git a/src/im2col/im2col.jl b/src/im2col/im2col.jl new file mode 100644 index 000000000..283b59d5c --- /dev/null +++ b/src/im2col/im2col.jl @@ -0,0 +1,27 @@ +module Im2col +using ..NNlib +using ..NNlib: im2col_dims, output_size, input_size, kernel_size, channels_in, channels_out, check_dims, + spatial_dims, stride, padding, dilation, flipkernel, calc_padding_regions, channel_multiplier +using Base.Threads + +include("gemm.jl") +include("conv_im2col.jl") +include("depthwiseconv_im2col.jl") + + +# Here we register our convolution methods with the parent NNlib module. +# We only do convolution, no pooling. +import ..conv_backends +push!(conv_backends[:conv], :im2col) +push!(conv_backends[:∇conv_data], :im2col) +push!(conv_backends[:∇conv_filter], :im2col) +push!(conv_backends[:depthwiseconv], :im2col) +push!(conv_backends[:∇depthwiseconv_data], :im2col) +push!(conv_backends[:∇depthwiseconv_filter], :im2col) + +end # module Im2col + +using .Im2col +import .Im2col: conv_im2col!, ∇conv_data_im2col!, ∇conv_filter_im2col!, + depthwiseconv_im2col!, ∇depthwiseconv_data_im2col!, + ∇depthwiseconv_filter_im2col! \ No newline at end of file diff --git a/src/interface.jl b/src/interface.jl new file mode 100644 index 000000000..8e537aeb9 --- /dev/null +++ b/src/interface.jl @@ -0,0 +1,66 @@ +## Convolution and Pooling API +# +# We provide the following generic methods, for 3d, 4d, and 5d tensors, calculating 1d, +# 2d and 3d convolutions and pooling based on the rank of the input tensors, in both +# mutating and non-mutating/auto-allocating variants: +# - Convolution: +# - conv(x, w, cdims) +# - conv!(y, x, w, cdims) +# - Convolution data backpropagation +# - ∇conv_data(dy, w, cdims) +# - ∇conv_data!(dx, dy, w, cdims) +# - Convolution filter backpropagation +# - ∇conv_filter(x, dy, cdims) +# - ∇conv_filter!(dw, x, dy, cdims) +# - Pooling: +# - maxpool(x, pdims) +# - maxpool!(y, x, pdims) +# - meanpool(x, pdims) +# - meanpool!(y, x, pdims) +# - Pooling data backprop +# - ∇maxpool(dy, y, x, pdims) +# - ∇maxpool!(dx, dy, y, x, pdims) +# - ∇meanpool(dy, y, x, pdims) +# - ∇meanpool!(dx, dy, y, x pdims) +# +# All methods require a `ConvDims` or `PoolDims` object to define the dimensions and +# meta elements of the convolution (padding, stride, dilation, kernel-flipping, etc...) +# which is easily constructable through something like `DenseConvDims(x, w)`. 
All +# methods take in the `ConvDims` of the associated normal, forward-pass convolution, +# that is, the following is legal: +# +# cdims = DenseConvDims(x, w; stride=2, dilation=(3,2)) +# dx = ∇conv_data(conv(x, w, cdims), w, cdims) +# +# Note that we do provide a helper API in the case that you don't want to bother with +# DenseConvDims and friends: you can simply do the following, however it will be less +# performant if you run the same operation multiple times: +# +# y = conv(x, w; stride=2, dilation=(3,2)) + + +# We support a pluggable backend system, currently consisting of three possible backends: +# * `nnpack`: which uses the third-party NNPACK libraries for convolution and pooling +# * `im2col`: A Julia BLAS-based implementation of convolution +# * `direct`: A Julia-native direct implementation of convolution and pooling +# +# We store each within a module (in the case of NNPACK, it is included only if the +# NNPACK binaries are available for the host system) and each module pushes a value +# onto these `conv_backends` lists. Those lists are then read from in the file +# `interface_impl.jl` which generates the nice interface described above, using a mixture +# of dispatch and runtime checks to provide the convenient `conv()` -> `conv!()` -> +# `conv_nnpack!()` interface that we all know and love. +conv_backends = Dict( + :conv => [], + :∇conv_data => [], + :∇conv_filter => [], + :depthwiseconv => [], + :∇depthwiseconv_data => [], + :∇depthwiseconv_filter => [], +) + +# Same thing for pooling +pooling_backends = Dict( + :maxpool => [], + :meanpool => [], +) \ No newline at end of file diff --git a/src/interface_impl.jl b/src/interface_impl.jl new file mode 100644 index 000000000..169ce075c --- /dev/null +++ b/src/interface_impl.jl @@ -0,0 +1,361 @@ +## This file creates the mappings from `conv!()` -> `conv_nnpack!()`, as well as +# convenience functions such as `conv_nnpack()` -> `conv_nnpack!()`, the auto-allocating +# variants, the reshaping variants, etc... + + # convolution +export conv, conv!, ∇conv_data, ∇conv_data!, ∇conv_filter, ∇conv_filter!, + # depthwise convolution + depthwiseconv, depthwiseconv!, ∇depthwiseconv_data, ∇depthwiseconv_data!, + ∇depthwiseconv_filter, ∇depthwiseconv_filter!, + # pooling + maxpool, maxpool!, meanpool, meanpool!, ∇maxpool, ∇maxpool!, ∇meanpool, ∇meanpool! + + +# We're going to use some typenames an awful lot +const AA = AbstractArray + +# Luckily, because the dispatch signatures for each of these backends are distinct, +# we can just layer the applicable backends and multiple dispatch will take care of +# calling the correct version for us. +for (front_name, backends) in conv_backends + # _nnpack() methods are generally preferrable, but they're pretty strict + # in what they can do; they can't deal with anything other than Float32, + # they only do conv2d, and they can't deal with stride or dilation. + if :nnpack in backends + # Only conv2d, no conv1d or conv3d. Also no stride or dilation + nnpack_cdims = DenseConvDims{ + # only conv2d, no conv1d or conv3d + 2, + # Any kernel size, channels in or out, whatever, it's good. + K, C_in, C_out, + # No stride + (1, 1), + # Padding can be whatevs + P, + # No dilation + (1, 1), + # Flipping is fine + F, + } where {K, C_in, C_out, P, F} + + # Map from the front name to the back name + fname = Symbol("$(front_name)!") + bname = Symbol("$(front_name)_nnpack!") + @eval begin + function $(fname)(out::AA{Float32, 4}, in1::AA{Float32, 4}, + in2::AA{Float32, 4}, cdims::$(nnpack_cdims); + kwargs...) 
+ $(bname)(out, in1, in2, cdims; kwargs...) + return out + end + end + end + + # _im2col() methods are a little less strict. They can deal with any kind + # of DenseConvDims, but they are still limited to the basic types that BLAS + # can deal with, which is {Complex,}Float{32,64}. For BLAS to work, all + # types must also be the same: + if :im2col in backends + # These are the types that our BLAS can handle + BLAS_TYPES = Union{[x[2] for x in Im2col.gemm_datatype_mappings]...} + + # Map from the front name to the back name + fname = Symbol("$(front_name)!") + bname = Symbol("$(front_name)_im2col!") + @eval begin + function $(fname)(out::AA{T}, in1::AA{T}, + in2::AA{T}, cdims::ConvDims; + kwargs...) where {T <: $(BLAS_TYPES)} + $(bname)(out, in1, in2, cdims; kwargs...) + return out + end + + # We add here some "expanders" that convert conv1d/2d inputs up to + # the conv3d input shape that our backend is expecting: + function $(bname)(out::AA{T,3}, in1::AA{T,3}, in2::AA{T,3}, + cdims::ConvDims; kwargs...) where {T <: $(BLAS_TYPES)} + out, in1, in2, cdims = expand_dimensions(Val(5), out, in1, in2, cdims) + $(bname)(out, in1, in2, cdims; kwargs...) + return out + end + function $(bname)(out::AA{T,4}, in1::AA{T,4}, in2::AA{T,4}, + cdims::ConvDims; kwargs...) where {T <: $(BLAS_TYPES)} + out, in1, in2, cdims = expand_dimensions(Val(5), out, in1, in2, cdims) + $(bname)(out, in1, in2, cdims; kwargs...) + return out + end + end + end + + # _direct() can take in anything, but it can be much slower, so it's best + # to take advantage of the accelerated definitions above. We still do the + # expansion of dimensions here, to make sure this works for conv3d cases. + if :direct in backends + fname = Symbol("$(front_name)!") + bname = Symbol("$(front_name)_direct!") + @eval begin + function $(fname)(out::AA, in1::AA, + in2::AA, cdims::ConvDims; + kwargs...) + @debug string("Slow fallback implementation invoked for ", $front_name, "! ", + "You probably don't want this; check your datatypes.") + $(bname)(out, in1, in2, cdims; kwargs...) + return out + end + + # We add here some "expanders" that convert conv1d/2d inputs up to + # the conv3d input shape that our backend is expecting: + function $(bname)(out::AA{<:Any,3}, in1::AA{<:Any,3}, in2::AA{<:Any,3}, + cdims::ConvDims; kwargs...) + out, in1, in2, cdims = expand_dimensions(Val(5), out, in1, in2, cdims) + $(bname)(out, in1, in2, cdims; kwargs...) + return out + end + function $(bname)(out::AA{<:Any,4}, in1::AA{<:Any,4}, in2::AA{<:Any,4}, + cdims::ConvDims; kwargs...) + out, in1, in2, cdims = expand_dimensions(Val(5), out, in1, in2, cdims) + $(bname)(out, in1, in2, cdims; kwargs...) + return out + end + end + end +end + +""" + expand_dimensions(M, x, args...) + +Inserts singleton dimensions into `x`, and any further arguments until they +are `M`-dimensional. It is an error for the input tensors to be of greater +dimensionality than `M`. +""" +function expand_dimensions(::Val{M}, x::AbstractArray{<:Any, N}, args...) where {N, M} + if M == N + return (x, args...) + end + if N > M + error("Cannot expand_dimensions() to a smaller dimensionality!") + end + return ( + insert_singleton_spatial_dimension(x, M - N), + insert_singleton_spatial_dimension.(args, M - N)..., + ) +end + +# Finally, let's generate auto-allocating versions of all our functions. +# These are the ones that don't have the `!` at the end. Note that we do not +# type-specialize these like the non-allocating versions. 
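# --- Illustrative sketch (not part of this patch) --------------------------------
# What the `expand_dimensions()` helper above does for the conv2d case: a 4d
# (W, H, C, N) tensor picks up one singleton spatial dimension so that the 3d-only
# backend kernels can run on it unchanged.  Sizes are arbitrary example values and
# `expand_dimensions` is an internal (unexported) helper.
x = randn(Float32, 8, 8, 3, 2)            # a conv2d input: (W, H, C_in, N)
(x5,) = NNlib.expand_dimensions(Val(5), x)
@assert size(x5) == (8, 8, 1, 3, 2)       # singleton spatial dim inserted before C_in
# ----------------------------------------------------------------------------------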
+for backend in (Symbol(), :_direct, :_im2col, :_nnpack) + # First, conv() forward passes + for name in (:conv, :depthwiseconv) + fname = Symbol("$(name)$(backend)") + bname = Symbol("$(name)$(backend)!") + @eval begin + function $(fname)(x::AA, w::AA, cdims::ConvDims; kwargs...) + y = similar(x, + promote_type(eltype(x), eltype(w)), + output_size(cdims)..., + channels_out(cdims), + size(x, ndims(x)), + ) + $(bname)(y, x, w, cdims; kwargs...) + return y + end + end + end + + # Next, backward passes for `_data()` + for name in (:∇conv_data, :∇depthwiseconv_data) + fname = Symbol("$(name)$(backend)") + bname = Symbol("$(name)$(backend)!") + @eval begin + function $(fname)(dy::AA, w::AA, cdims::ConvDims; kwargs...) + dx = similar(dy, + input_size(cdims)..., + channels_in(cdims), + size(dy, ndims(dy)), + ) + $(bname)(dx, dy, w, cdims; kwargs...) + return dx + end + end + end + + # We do the filter backprops separately, as the shape calculation for `w` + # is slightly different for depthwise than for normal dense convolution. + fname = Symbol("∇conv_filter$(backend)") + bname = Symbol("∇conv_filter$(backend)!") + @eval begin + function $(fname)(x::AA, dy::AA, cdims::ConvDims; kwargs...) + dw = similar(dy, + kernel_size(cdims)..., + channels_in(cdims), + channels_out(cdims), + ) + $(bname)(dw, x, dy, cdims; kwargs...) + return dw + end + end + + fname = Symbol("∇depthwiseconv_filter$(backend)") + bname = Symbol("∇depthwiseconv_filter$(backend)!") + @eval begin + function $(fname)(x::AA, dy::AA, cdims::ConvDims; kwargs...) + dw = similar(dy, + kernel_size(cdims)..., + channel_multiplier(cdims), + channels_in(cdims), + ) + $(bname)(dw, x, dy, cdims; kwargs...) + return dw + end + end +end + +## Pooling +for (front_name, backends) in pooling_backends + if :nnpack in backends + nnpack_pdims = PoolDims{ + # only conv2d, no conv1d or conv3d + 2, + # Any kernel size, channels in or out, whatever, it's good. + K, + # No stride + S, + # Padding can be whatevs + P, + # No dilation + (1, 1), + } where {K, S, P} + # Map from the front name to the back name + fname = Symbol("$(front_name)!") + @eval begin + function $(fname)(out::AA{Float32, 4}, in::AA{Float32, 4}, + pdims::$(nnpack_pdims); kwargs...) + # Check to see if this is a supported operation + if nnpack_supported_pooling(pdims) + $(Symbol("$(front_name)_nnpack!"))(out, in, pdims; kwargs...) + else + # If it's not suported, then bail out to _direct() + $(Symbol("$(front_name)_direct!"))(out, in, pdims; kwargs...) + end + return out + end + end + end + + if :direct in backends + fname = Symbol("$(front_name)!") + bname = Symbol("$(front_name)_direct!") + @eval begin + function $(fname)(out::AA, in::AA, pdims::PoolDims; kwargs...) + $(bname)(out, in, pdims; kwargs...) + return out + end + + # Add reshapers + function $(bname)(out::AA{T,3}, in::AA{T,3}, + pdims::PoolDims; kwargs...) where {T} + outx, inx, pdimsx = expand_dimensions(Val(5), out, in, pdims) + $(bname)(outx, inx, pdimsx; kwargs...) + return out + end + function $(bname)(out::AA{T,4}, in::AA{T,4}, + pdims::PoolDims; kwargs...) where {T} + outx, inx, pdimsx = expand_dimensions(Val(5), out, in, pdims) + $(bname)(outx, inx, pdimsx; kwargs...) + return out + end + end + end +end + +# We only have direct backprop for pooling +for (front_name, backend) in ( + :∇maxpool => :direct, + :∇meanpool => :direct, + ) + fname = Symbol("$(front_name)!") + bname = Symbol("$(front_name)_direct!") + @eval begin + function $(fname)(dx::AA{T}, dy::AA{T}, + y::AA{T}, x::AA{T}, + pdims::PoolDims; kwargs...) 
where {T} + $(bname)(dx, dy, y, x, pdims; kwargs...) + return dx + end + function $(bname)(dx::AA{T,3}, dy::AA{T,3}, + y::AA{T,3}, x::AA{T,3}, + pdims::PoolDims; kwargs...) where {T} + dxx, dyx, yx, xx, pdimsx = expand_dimensions(Val(5), dx, dy, y, x, pdims) + $(bname)(dxx, dyx, yx, xx, pdimsx; kwargs...) + return dx + end + function $(bname)(dx::AA{T,4}, dy::AA{T,4}, + y::AA{T,4}, x::AA{T,4}, + pdims::PoolDims; kwargs...) where {T} + dxx, dyx, yx, xx, pdimsx = expand_dimensions(Val(5), dx, dy, y, x, pdims) + $(bname)(dxx, dyx, yx, xx, pdimsx; kwargs...) + return dx + end + end +end + +# Finally, let's generate auto-allocating versions of all our functions, for all backends: +for backend in (Symbol(), :_direct), + name in (:maxpool, :meanpool) + + fname = Symbol("$(name)$(backend)") + bname = Symbol("$(name)$(backend)!") + f_backname = Symbol("∇$(name)$(backend)") + b_backname = Symbol("∇$(name)$(backend)!") + @eval begin + function $(fname)(x::AA, pdims::PoolDims; kwargs...) + y = similar(x, output_size(pdims)..., channels_out(pdims), size(x, ndims(x))) + fill!(y, zero(eltype(x))) + $(bname)(y, x, pdims; kwargs...) + return y + end + + # Backprops too + function $(f_backname)(dy::AA, y::AA, x::AA, pdims::PoolDims; kwargs...) + dx = similar(x, input_size(pdims)..., channels_in(pdims), size(dy, ndims(dy))) + fill!(dx, zero(eltype(x))) + $(b_backname)(dx, dy, y, x, pdims; kwargs...) + return dx + end + end +end + +expand(N, i::Tuple) = i +expand(N, i::Integer) = ntuple(_ -> i, N) + +# Simplified conv() adapters that construct the `DenseConvDims` for you. +function conv(x, w::AbstractArray{T, N}; stride = 1, pad = 0, dilation = 1, flipped = false) where {T, N} + stride = expand(Val(N-2), stride) + pad = expand(Val(N-2), pad) + dilation = expand(Val(N-2), dilation) + cdims = DenseConvDims(x, w; stride = stride, padding = pad, dilation = dilation, flipkernel = flipped) + return conv(x, w, cdims) +end + +function depthwiseconv(x, w::AbstractArray{T, N}; stride = 1, pad = 0, dilation = 1, flipped = false) where {T, N} + stride = expand(Val(N-2), stride) + pad = expand(Val(N-2), pad) + dilation = expand(Val(N-2), dilation) + cdims = DepthwiseConvDims(x, w; stride = stride, padding = pad, dilation = dilation, flipkernel = flipped) + return depthwiseconv(x, w, cdims) +end + +function maxpool(x, k::NTuple{N, Integer}; pad = 0, stride = k) where N + pad = expand(Val(N), pad) + stride = expand(Val(N), stride) + pdims = PoolDims(x, k; padding = pad, stride = stride) + return maxpool(x, pdims) +end + +function meanpool(x, k::NTuple{N, Integer}; pad = 0, stride = k) where N + pad = expand(Val(N), pad) + stride = expand(Val(N), stride) + pdims = PoolDims(x, k; padding = pad, stride = stride) + return meanpool(x, pdims) +end diff --git a/src/nnpack/NNPACK.jl b/src/nnpack/NNPACK.jl index 1e420a9cc..46c10c228 100644 --- a/src/nnpack/NNPACK.jl +++ b/src/nnpack/NNPACK.jl @@ -1,31 +1,14 @@ +module NNPACK +using ..NNlib +using ..NNlib: check_dims, input_size, output_size, kernel_size, padding, stride, flipkernel, flipweight +using NNPACK_jll + include("libnnpack_types.jl") include("error.jl") include("libnnpack.jl") -include("performance.jl") +include("multithreading.jl") include("interface.jl") -const depsjl_path = joinpath(dirname(@__FILE__), "..", "..", "deps", "deps.jl") -if !isfile(depsjl_path) - error("NNPACK not installed properly, run Pkg.build(\"NNlib\"), restart Julia and try again") -end - -const shared_threadpool_dict = Dict{UInt64, Base.RefValue}() - -""" - is_nnpack_available() - -Checks if the 
current hardware is supported by NNPACK. -""" -function is_nnpack_available() - check_deps() isa Nothing || return false - status = nnp_initialize() - if status == nnp_status_unsupported_hardware - return false - else - return true - end -end - """ allocate_threadpool() @@ -41,12 +24,11 @@ function allocate_threadpool() end end -@init begin - check_deps() - status = nnp_initialize() - if status == nnp_status_unsupported_hardware - @warn "Hardware is unsupported by NNPACK so falling back to default NNlib" +function __init__() + if !is_nnpack_available() + @warn "Hardware unsupported by NNPACK, falling back to other NNlib backends" end + try global NNPACK_CPU_THREADS = parse(UInt64, ENV["NNPACK_CPU_THREADS"]) catch @@ -57,3 +39,17 @@ end end allocate_threadpool() end + +# Here we register our convolution and pooling methods with the parent NNlib module. +# We have implementations only for normal convolution and maxpooling: +import ..conv_backends, ..pooling_backends +push!(conv_backends[:conv], :nnpack) +push!(conv_backends[:∇conv_data], :nnpack) +push!(conv_backends[:∇conv_filter], :nnpack) + +push!(pooling_backends[:maxpool], :nnpack) +end # module NNPACK + +using .NNPACK +import .NNPACK: maxpool_nnpack!, nnpack_supported_operation, + conv_nnpack!, ∇conv_data_nnpack!, ∇conv_filter_nnpack! \ No newline at end of file diff --git a/src/nnpack/impl.jl b/src/nnpack/impl.jl deleted file mode 100644 index 5d3086583..000000000 --- a/src/nnpack/impl.jl +++ /dev/null @@ -1,50 +0,0 @@ -function maxpool_nnpack!(y::A, x::A, pdims::PoolDims) where {A<:Array{Float32, 4}} - check_dims(size(x), size(y), pdims) - threadpool = select_threadpool(pdims, size(y, 4)) - nnp_max_pooling_output(y, x, kernel_size(pdims), padding = padding(pdims), - stride = stride(pdims), threadpool = threadpool) -end - -function conv_nnpack!(y::A1, x::A1, w::A1, cdims::ConvDims; - b::A2 = zeros(Float32, size(x, 3)), - algo = UInt32(0)) where {A1<:Array{Float32, 4}, - A2<:Array{Float32, 1}} - check_dims(size(x), size(w), size(y), cdims) - threadpool = select_threadpool(cdims, size(y, 4)) - - if flipkernel(cdims) == 0 - w = flipweight(w) - end - - nnp_convolution_output(y, x, w, b, algo = algo, padding = padding(cdims), - stride = stride(cdims), threadpool = threadpool) -end - -function ∇conv_data_nnpack!(dx::A, dy::A, w::A, cdims::ConvDims; - algo = UInt32(0)) where{A<:Array{Float32, 4}} - check_dims(size(dx), size(w), size(dy), cdims) - threadpool = select_threadpool(cdims, size(y, 4)) - - if flipkernel(cdims) == 0 - w = flipweight(w) - end - - nnp_convolution_input_gradient(dx, dy, w, algo = algo, padding = padding(cdims), - stride = stride(cdims), threadpool = threadpool) -end - -function ∇conv_filter_nnpack!(dw::A, x::A, dy::A, cdims::ConvDims; - algo = UInt32(0)) where{A<:Array{Float32, 4}} - check_dims(size(x), size(dw), size(dy), cdims) - threadpool = select_threadpool(cdims, size(y, 4)) - - nnp_convolution_kernel_gradient(dw, x, dy, algo = algo, padding = padding(cdims), - stride = stride(cdims), threadpool = threadpool) - - if flipkernel(cdims) == 0 - dw .= flipweight(dw) - end - - dw -end - diff --git a/src/nnpack/interface.jl b/src/nnpack/interface.jl index 25ab93632..74f38c2fb 100644 --- a/src/nnpack/interface.jl +++ b/src/nnpack/interface.jl @@ -1,70 +1,90 @@ -include("impl.jl") - - -for (front_name, backend) in ( - :conv => :_nnpack, - :∇conv_data => :_nnpack, - :∇conv_filter => :_nnpack, - ) - @eval begin - function $(Symbol("$(front_name)$(backend)!"))( - out::Array{T1,4}, in1::Array{T2,4}, in2::Array{T3,4}, - 
cdims::ConvDims; kwargs...) where {T1, T2, T3} - @warn "Automatically converting input tensor to Float32. This will have performance implications" maxlog=1 - # Output must of the same type as in the function signature - T1.($(Symbol("$(front_name)$(backend)!"))(Float32.(out), Float32.(in1), - Float32.(in2), cdims; kwargs...)) - end - end -end +export is_nnpack_available, + # Pooling + maxpool_nnpack!, nnpack_supported_operation, + # Convolution + conv_nnpack!, ∇conv_data_nnpack!, ∇conv_filter_nnpack! + +""" + is_nnpack_available() +Checks if the current hardware is supported by NNPACK. -function conv_nnpack(x::Array{T1, 4}, w::Array{T2, 4}, cdims::ConvDims; kwargs...) where {T1, T2} - y = similar(x, output_size(cdims)..., channels_out(cdims), size(x, 4)) - return conv_nnpack!(y, x, w, cdims; kwargs...) +While the platform itself may be supported by NNPACK, certain hardware +configurations (such as processors lacking SSE) are not. +""" +function is_nnpack_available() + return nnp_initialize() != nnp_status_unsupported_hardware end +# Conv +function conv_nnpack!(y::A1, x::A1, w::A1, cdims::ConvDims; + b::A2 = zeros(Float32, size(x, 3)), + algo = UInt32(0)) where {A1<:Array{Float32, 4}, + A2<:Array{Float32, 1}} + check_dims(size(x), size(w), size(y), cdims) + threadpool = select_threadpool(cdims, size(y, 4)) + + if flipkernel(cdims) == 0 + w = flipweight(w) + end -function ∇conv_data(dy::Array{T1, 4}, w::Array{T2, 4}, cdims::ConvDims; kwargs...) where {T1, T2} - dx = similar(dy, input_size(cdims)..., channels_in(cdims), size(dy, 4)) - return ∇conv_data!(dx, dy, w, cdims; kwargs...) + nnp_convolution_output(y, x, w, b, algo = algo, padding = padding(cdims), + stride = stride(cdims), threadpool = threadpool) end +function ∇conv_data_nnpack!(dx::A, dy::A, w::A, cdims::ConvDims; + algo = UInt32(0)) where{A<:Array{Float32, 4}} + check_dims(size(dx), size(w), size(dy), cdims) + threadpool = select_threadpool(cdims, size(dy, 4)) + + if flipkernel(cdims) == 0 + w = flipweight(w) + end -function ∇conv_filter(x::Array{T1, 4}, dy::Array{T2, 4}, cdims::ConvDims; kwargs...) where {T1, T2} - dw = similar(x, kernel_size(cdims)..., channels_in(cdims), channels_out(cdims)) - return ∇conv_filter!(dw, x, dy, cdims; kwargs...) + nnp_convolution_input_gradient(dx, dy, w, algo = algo, padding = padding(cdims), + stride = stride(cdims), threadpool = threadpool) end +function ∇conv_filter_nnpack!(dw::A, x::A, dy::A, cdims::ConvDims; + algo = UInt32(0)) where{A<:Array{Float32, 4}} + check_dims(size(x), size(dw), size(dy), cdims) + threadpool = select_threadpool(cdims, size(dy, 4)) + + nnp_convolution_kernel_gradient(dw, x, dy, algo = algo, padding = padding(cdims), + stride = stride(cdims), threadpool = threadpool) + + if flipkernel(cdims) == 0 + dw .= flipweight(dw) + end -function maxpool_nnpack!(y::Array{T1, 4}, x::Array{T2, 4}, pdims::PoolDims; - kwargs...) where {T1, T2} - @warn "Automatically converting input tensor to Float32. This will have performance implications" maxlog=1 - # We want the output to be of the same type as desired - T1.(maxpool_nnpack!(Float32.(y), Float32.(x), pdims; kwargs...)) + dw end -function maxpool_nnpack(x::Array{T, 4}, pdims::PoolDims; kwargs...) where {T} - y = similar(x, output_size(pdims)..., channels_out(pdims), size(x, 4)) - return maxpool_nnpack!(y, x, pdims; kwargs...) 
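# --- Illustrative sketch (not part of this patch) --------------------------------
# Driving the NNPACK convolution backend defined above.  NNPACK only handles
# Float32 4d tensors with unit stride and unit dilation, so the inputs are chosen
# to stay inside that envelope; sizes are arbitrary example values.
x = rand(Float32, 16, 16, 3, 1)             # (W, H, C_in, N)
w = rand(Float32, 3, 3, 3, 8)               # (kW, kH, C_in, C_out)
cdims = DenseConvDims(x, w)                 # stride = 1, dilation = 1 by default
if NNlib.is_nnpack_available()
    # auto-allocating wrapper generated for the :_nnpack backend in interface_impl.jl
    y = NNlib.conv_nnpack(x, w, cdims)
end
# ----------------------------------------------------------------------------------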
+# Pooling +function maxpool_nnpack!(y::A, x::A, pdims::PoolDims) where {A<:Array{Float32, 4}} + check_dims(size(x), size(y), pdims) + threadpool = select_threadpool(pdims, size(y, 4)) + nnp_max_pooling_output(y, x, kernel_size(pdims), padding = padding(pdims), + stride = stride(pdims), threadpool = threadpool) end - """ nnpack_supported_operation(cdims::ConvDims) - nnpack_supported_operation(pdims::PoolDims) -Returns `true` if nnpack supports the convolution/pooling operation for the given parameters. +Returns `true` if nnpack supports the conv/pooling operation for the given +parameters. For convolution this can be known at compile-time, however for +pooling, we cannot describe the stride domain constraint purely with types, +so we must do it at runtime with this method. """ function nnpack_supported_operation(pdims::PoolDims{2, K, S, P, (1, 1)}) where {K, S, P} - val = input_size(pdims)[1:2] .+ (P[1] + P[2], P[3] + P[4]) .- K - return val .% S == (0, 0) ? true : false + # Ensure that the kernel striding perfectly covers the padded input size. + stride_domain = input_size(pdims)[1:2] .+ (P[1] + P[2], P[3] + P[4]) .- K + return stride_domain .% S == (0, 0) end -function nnpack_supported_operation(cdims::ConvDims{2, K, (1, 1), P, (1, 1)}) where {K, S, P} - return true -end +NNPACK_CDIMS = DenseConvDims{2,K,C_in,C_out,(1,1),P,(1,1),F} where {K,C_in,C_out,P,F} +nnpack_supported_operation(cdims::NNPACK_CDIMS) = true -# Return false for everything else +# Say false by default nnpack_supported_operation(dims) = false diff --git a/src/nnpack/libnnpack.jl b/src/nnpack/libnnpack.jl index 2f3996c32..4767a9545 100644 --- a/src/nnpack/libnnpack.jl +++ b/src/nnpack/libnnpack.jl @@ -90,8 +90,6 @@ function nnp_max_pooling_output(y::Array{Float32,4}, x::Array{Float32,4}, kernel y end -#TODO: Add wrapper for convolution inference - function nnp_convolution_input_gradient(algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, grad_output, kernel, grad_input, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, profile) @nnpack_check ccall((:nnp_convolution_input_gradient, libnnpack), nnp_status, (nnp_convolution_algorithm, Csize_t, Csize_t, Csize_t, nnp_size, nnp_padding, nnp_size, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cvoid}, Csize_t, nnp_activation, Ptr{Cvoid}, pthreadpool_t, Ptr{Cvoid}), algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, grad_output, kernel, grad_input, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, C_NULL) end diff --git a/src/nnpack/performance.jl b/src/nnpack/multithreading.jl similarity index 87% rename from src/nnpack/performance.jl rename to src/nnpack/multithreading.jl index 24abdb411..6f37ccfcc 100644 --- a/src/nnpack/performance.jl +++ b/src/nnpack/multithreading.jl @@ -1,5 +1,7 @@ +const shared_threadpool_dict = Dict{UInt64, Base.RefValue}() + function select_threadpool(cdims::DenseConvDims, batch_size::Int) - inp_size = input_size(cdims)[1] + inp_size = input_size(cdims)[1] if batch_size >= 32 return shared_threadpool_dict[Int(NNPACK_CPU_THREADS)][] elseif batch_size >= 16 && inp_size >= 64 @@ -10,12 +12,12 @@ function select_threadpool(cdims::DenseConvDims, batch_size::Int) return shared_threadpool_dict[Int(NNPACK_CPU_THREADS)][] elseif inp_size * batch_size >= 256 return shared_threadpool_dict[Int(NNPACK_CPU_THREADS)][] - end + end return C_NULL end function select_threadpool(pdims::PoolDims, batch_size::Int) - inp_size = 
input_size(pdims)[1] + inp_size = input_size(pdims)[1] if batch_size >= 32 return shared_threadpool_dict[Int(NNPACK_CPU_THREADS)][] elseif batch_size >= 16 && inp_size >= 64 @@ -26,6 +28,6 @@ function select_threadpool(pdims::PoolDims, batch_size::Int) return shared_threadpool_dict[Int(NNPACK_CPU_THREADS)][] elseif inp_size * batch_size >= 256 return shared_threadpool_dict[Int(NNPACK_CPU_THREADS)][] - end + end return C_NULL end diff --git a/src/pooling.jl b/src/pooling.jl deleted file mode 100644 index 13c605e97..000000000 --- a/src/pooling.jl +++ /dev/null @@ -1,155 +0,0 @@ -export maxpool, maxpool!, meanpool, meanpool!, ∇maxpool, ∇maxpool!, ∇meanpool, ∇meanpool! - -## Pooling API -# -# We provide the following generic methods, for 3d, 4d, and 5d tensors, calculating 1d, -# 2d and 3d pooling, based on the rank of the input tensors, in both mutating and -# non-mutating auto-allocating variants: -# - Pooling: -# - maxpool(x, pdims) -# - maxpool!(y, x, pdims) -# - meanpool(x, pdims) -# - meanpool!(y, x, pdims) -# - Pooling input backprop -# - ∇maxpool(dy, y, x, pdims) -# - ∇maxpool!(dx, dy, y, x, pdims) -# - ∇meanpool(dy, y, x, pdims) -# - ∇meanpool!(dx, dy, y, x pdims) -# -# All methods require a `PoolDims` object to define the dimensions and optional -# elements of the convolution (stride, dilation, etc...), which is easily constructable -# through something like `PoolDims(x, w)`. - - -# First, we will define mappings from the generic API names to our accelerated backend -# implementations. At the moment this is only the direct implementation, however this -# exists here so that other packages (NNPACK, MAGMA, etc...) can override this easily. -for (front_name, backend) in ( - # This maps from public, front-facing name, to internal backend name - :maxpool => :direct, - :meanpool => :direct, - ) - - # We only define 3d pooling primitives, we reshape lower down to get 1d and 2d pooling - @eval begin - function $(Symbol("$(front_name)!"))( - y::AbstractArray{T,5}, x::AbstractArray{T,5}, - pdims::PoolDims; kwargs...) where {T} - $(Symbol("$(front_name)_$(backend)!"))(y, x, pdims; kwargs...) - end - end -end - -# Do the same for backprops -for (front_name, backend) in ( - :∇maxpool => :direct, - :∇meanpool => :direct, - ) - @eval begin - function $(Symbol("$(front_name)!"))( - dx::AbstractArray{T,5}, dy::AbstractArray{T,5}, - y::AbstractArray{T,5}, x::AbstractArray{T,5}, - pdims::PoolDims; kwargs...) where {T} - $(Symbol("$(front_name)_$(backend)!"))(dx, dy, y, x, pdims; kwargs...) - end - end -end - - -# Our strategy for pooling is to reshape to an array with three spatial dimensions, which -# makes things MUCH EASIER for us on the backend side, and is in general pretty fast, -# since we can specialize on sizes. -for front_name in (:maxpool, :meanpool) - for backend in (Symbol(), :_direct) - for N in (3, 4) - @eval begin - function $(Symbol("$(front_name)$(backend)!"))( - y::AbstractArray{T,$N}, x::AbstractArray{T,$N}, - pdims::PoolDims; kwargs...) where {T} - $(Symbol("$(front_name)$(backend)!"))( - insert_singleton_spatial_dimension(y, $(5 - N)), - insert_singleton_spatial_dimension(x, $(5 - N)), - insert_singleton_spatial_dimension(pdims, $(5 - N)); - kwargs... - ) - - # We explicitly return `y` here, because the backend call - # itself may return a reshaped view, which we don't want. 
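# --- Illustrative worked example (not part of this patch) ------------------------
# The `nnpack_supported_operation(::PoolDims)` check defined in
# src/nnpack/interface.jl above requires the pooling windows to tile the padded
# input exactly.  With a 32x32 input and no padding:
x    = rand(Float32, 32, 32, 3, 1)
good = PoolDims(x, (2, 2); stride = (2, 2))   # (32 + 0 - 2) % 2 == 0  -> supported
bad  = PoolDims(x, (3, 3); stride = (2, 2))   # (32 + 0 - 3) % 2 == 1  -> unsupported
# When the NNPACK module is loaded:
#   NNlib.nnpack_supported_operation(good) == true
#   NNlib.nnpack_supported_operation(bad)  == false
# which is the runtime check maxpool!() is meant to use when deciding between the
# NNPACK and direct pooling paths.
# ----------------------------------------------------------------------------------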
- return y - end - - # backprops too - function $(Symbol("∇$(front_name)$(backend)!"))( - dx::AbstractArray{T,$N}, dy::AbstractArray{T,$N}, - y::AbstractArray{T,$N}, x::AbstractArray{T,$N}, - pdims::PoolDims; kwargs...) where {T} - $(Symbol("∇$(front_name)$(backend)!"))( - insert_singleton_spatial_dimension(dx, $(5 - N)), - insert_singleton_spatial_dimension(dy, $(5 - N)), - insert_singleton_spatial_dimension(y, $(5 - N)), - insert_singleton_spatial_dimension(x, $(5 - N)), - insert_singleton_spatial_dimension(pdims, $(5 - N)); - kwargs... - ) - - # We explicitly return `dx` here, because the backend call - # itself may return a reshaped view, which we don't want. - return dx - end - end - end - end -end - - -# Finally, let's generate auto-allocating versions of all our functions, for all backends: -for backend in (Symbol(), :_direct, :_im2col) - # First make auto-allocating versions of the basic pooling calls: - for name in (:maxpool, :meanpool) - @eval begin - function $(Symbol("$(name)$(backend)"))( - x::AbstractArray{xT,N}, - pdims::PoolDims; kwargs...) where {xT, N} - y = similar(x, output_size(pdims)..., channels_out(pdims), size(x, N)) - fill!(y, xT(0)) - return $(Symbol("$(name)$(backend)!"))(y, x, pdims; kwargs...) - end - - # Backprops too - function $(Symbol("∇$(name)$(backend)"))( - dy::AbstractArray{T,N}, y::AbstractArray{T,N}, - x::AbstractArray{T,N}, pdims::PoolDims; - kwargs...) where {T, N} - dx = similar(x, input_size(pdims)..., channels_in(pdims), size(dy, N)) - fill!(dx, T(0)) - return $(Symbol("∇$(name)$(backend)!"))(dx, dy, y, x, pdims; kwargs...) - end - end - end -end - - -# Use NNPACK if it is available and operation is supported -if is_nnpack_available() - function maxpool(x::Array{T, 4}, pdims::PoolDims{2, K, S, P, (1, 1)}; kwargs...) where {T, K, S, P} - func = nnpack_supported_operation(pdims) ? maxpool_nnpack : maxpool_direct - return func(x, pdims; kwargs...) 
- end -end - -expand(N, i::Tuple) = i -expand(N, i::Integer) = ntuple(_ -> i, N) - -function maxpool(x, k::NTuple{N, Integer}; pad = 0, stride = k) where N - pad = expand(Val(N), pad) - stride = expand(Val(N), stride) - pdims = PoolDims(x, k; padding = pad, stride = stride) - return maxpool(x, pdims) -end - -function meanpool(x, k::NTuple{N, Integer}; pad = 0, stride = k) where N - pad = expand(Val(N), pad) - stride = expand(Val(N), stride) - pdims = PoolDims(x, k; padding = pad, stride = stride) - return meanpool(x, pdims) -end diff --git a/test/conv.jl b/test/conv.jl index cf91a5361..03676377c 100644 --- a/test/conv.jl +++ b/test/conv.jl @@ -274,14 +274,9 @@ conv_answer_dict = Dict( # A "drop channels and batch dimension" helper ddims(x) = dropdims(x, dims=(rank+1, rank+2)) + # We don't directly test conv_nnpack() because it has so many holes in its support convs = [NNlib.conv, NNlib.conv_im2col, NNlib.conv_direct,] - NNlib.is_nnpack_available() && push!(convs, NNlib.conv_nnpack) for conv in convs - if NNlib.is_nnpack_available() - if conv == NNlib.conv_nnpack && !NNlib.nnpack_supported_operation(DenseConvDims(x, w)) - continue - end - end @testset "$(conv)" begin cdims = DenseConvDims(x, w) # First, your basic convolution with no parameters diff --git a/test/inference.jl b/test/inference.jl index 39b5108ce..778aeec49 100644 --- a/test/inference.jl +++ b/test/inference.jl @@ -1,13 +1,11 @@ -import NNlib: conv_direct, conv_im2col - @testset "Conv Inference" begin - x = rand(10, 10, 3, 2) - w = rand(3, 3, 3, 1) + x = rand(Float32, 10, 10, 3, 2) + w = rand(Float32, 3, 3, 3, 1) - impl = [conv, conv_direct, conv_im2col] + impl = [conv, NNlib.conv_direct, NNlib.conv_im2col] NNlib.is_nnpack_available() && push!(impl, NNlib.conv_nnpack) for T in impl - @test T(x, w, DenseConvDims(x, w)) isa AbstractArray{K,4} where K + @test T(x, w, DenseConvDims(x, w)) isa AbstractArray{eltype(x),4} end end
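# --- Illustrative usage sketch (not part of this patch) --------------------------
# The front-end API that the generated methods in src/interface_impl.jl provide,
# exercised end to end.  Sizes are arbitrary example values.
using NNlib

x = rand(Float32, 28, 28, 3, 2)        # (W, H, C_in, N)
w = rand(Float32, 5, 5, 3, 16)         # (kW, kH, C_in, C_out)

# One-shot keyword form: builds a DenseConvDims internally on every call.
y1 = conv(x, w; stride = 2)

# Reusable form: construct the DenseConvDims once and pass it to conv() and the
# backprop entry points.  The forward output is used as a stand-in gradient here
# just to show the shapes.
cdims = DenseConvDims(x, w; stride = 2)
y2 = conv(x, w, cdims)
dx = ∇conv_data(y2, w, cdims)          # gradient w.r.t. x, same size as x
dw = ∇conv_filter(x, y2, cdims)        # gradient w.r.t. w, same size as w

# Pooling goes through the same generated machinery.
p = maxpool(x, (2, 2))
# ----------------------------------------------------------------------------------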