From e3fb37b8af44ba7d224833d84888a5cd99e72fcf Mon Sep 17 00:00:00 2001 From: Elliot Saba Date: Thu, 30 Jan 2020 17:28:53 -0800 Subject: [PATCH] Big interface overhaul Eliminate as much runtime checking as possible; create metaprogramming forwards from frontend to backend in a better (yet still imperfect) manner. We are reserving the right for the frontend names such as `conv()` to decide which backend to use at runtime, but for now, all `conv()` calls (but not all pooling calls) are dispatched at compile-time, which is nice. --- Manifest.toml | 53 +++ Project.toml | 1 + deps/build.jl | 50 --- src/NNlib.jl | 54 +-- src/{ => activation}/activation.jl | 2 + src/{ => activation}/softmax.jl | 0 src/conv.jl | 181 --------- .../PaddingEdges.jl} | 0 src/{ => dim_helpers}/dim_helpers.jl | 21 +- src/{impl => direct}/conv_direct.jl | 20 +- src/{impl => direct}/depthwiseconv_direct.jl | 3 +- src/direct/direct.jl | 33 ++ src/{impl => direct}/pooling_direct.jl | 3 + src/{impl => im2col}/conv_im2col.jl | 4 +- src/{impl => im2col}/depthwiseconv_im2col.jl | 4 +- src/{ => im2col}/gemm.jl | 0 src/im2col/im2col.jl | 27 ++ src/interface.jl | 66 ++++ src/interface_impl.jl | 361 ++++++++++++++++++ src/nnpack/NNPACK.jl | 52 ++- src/nnpack/impl.jl | 50 --- src/nnpack/interface.jl | 110 +++--- src/nnpack/libnnpack.jl | 2 - .../{performance.jl => multithreading.jl} | 10 +- src/pooling.jl | 155 -------- test/conv.jl | 7 +- test/inference.jl | 10 +- 27 files changed, 695 insertions(+), 584 deletions(-) delete mode 100644 deps/build.jl rename src/{ => activation}/activation.jl (99%) rename src/{ => activation}/softmax.jl (100%) delete mode 100644 src/conv.jl rename src/{impl/padding_edges.jl => dim_helpers/PaddingEdges.jl} (100%) rename src/{ => dim_helpers}/dim_helpers.jl (93%) rename src/{impl => direct}/conv_direct.jl (92%) rename src/{impl => direct}/depthwiseconv_direct.jl (98%) create mode 100644 src/direct/direct.jl rename src/{impl => direct}/pooling_direct.jl (98%) rename src/{impl => im2col}/conv_im2col.jl (99%) rename src/{impl => im2col}/depthwiseconv_im2col.jl (98%) rename src/{ => im2col}/gemm.jl (100%) create mode 100644 src/im2col/im2col.jl create mode 100644 src/interface.jl create mode 100644 src/interface_impl.jl delete mode 100644 src/nnpack/impl.jl rename src/nnpack/{performance.jl => multithreading.jl} (87%) delete mode 100644 src/pooling.jl diff --git a/Manifest.toml b/Manifest.toml index 260e456b6..f70d8a9ea 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -1,11 +1,29 @@ # This file is machine-generated - editing it directly is not advised +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + [[BinaryProvider]] deps = ["Libdl", "SHA"] git-tree-sha1 = "5b08ed6036d9d3f0ee6369410b830f8873d4024c" uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" version = "0.5.8" +[[Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[Distributed]] +deps = ["Random", "Serialization", "Sockets"] +uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" + +[[InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[LibGit2]] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + [[Libdl]] uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" @@ -13,6 +31,31 @@ uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" deps = ["Libdl"] uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[NNPACK_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = 
"c3d1a616362645754b18e12dbba96ec311b0867f" +uuid = "a6bfbf70-4841-5cb9-aa18-3a8ad3c413ee" +version = "2018.6.22+0" + +[[Pkg]] +deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Test", "UUIDs"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + [[Random]] deps = ["Serialization"] uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" @@ -29,6 +72,9 @@ uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" [[Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" +[[Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + [[SparseArrays]] deps = ["LinearAlgebra", "Random"] uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" @@ -37,6 +83,13 @@ uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" deps = ["LinearAlgebra", "SparseArrays"] uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +[[Test]] +deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + [[UUIDs]] deps = ["Random", "SHA"] uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" diff --git a/Project.toml b/Project.toml index 83b5223b8..41b3c0193 100644 --- a/Project.toml +++ b/Project.toml @@ -6,6 +6,7 @@ version = "0.6.4" BinaryProvider = "b99e7846-7c00-51b0-8f62-c81ae34c0232" Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +NNPACK_jll = "a6bfbf70-4841-5cb9-aa18-3a8ad3c413ee" Requires = "ae029012-a4dd-5104-9daa-d747884805df" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" diff --git a/deps/build.jl b/deps/build.jl deleted file mode 100644 index 3da8d53c2..000000000 --- a/deps/build.jl +++ /dev/null @@ -1,50 +0,0 @@ -using BinaryProvider - -# Parse some basic command-line arguments -const verbose = "--verbose" in ARGS -const prefix = Prefix(get([a for a in ARGS if a != "--verbose"], 1, joinpath(@__DIR__, "usr"))) -products = [ - LibraryProduct(prefix, ["libnnpack"], :libnnpack), -] - -# Download binaries from hosted location -bin_prefix = "https://github.com/JuliaPackaging/Yggdrasil/releases/download/NNPACK-v2018.06.22-0" - -# Listing of files generated by BinaryBuilder: -download_info = Dict( - Linux(:aarch64, libc=:glibc) => ("$bin_prefix/NNPACK.v2018.6.22.aarch64-linux-gnu.tar.gz", "e0c6e21ba4c47acfd5a3d3e3510e8786474080f654338f4583b88860296c1437"), - Linux(:i686, libc=:glibc) => ("$bin_prefix/NNPACK.v2018.6.22.i686-linux-gnu.tar.gz", "e9b6685001bc5a5d17acef15f3f6ffeb7beb6081926300f23ed4a442beac71ca"), - Linux(:i686, libc=:musl) => ("$bin_prefix/NNPACK.v2018.6.22.i686-linux-musl.tar.gz", "36c1d3c30b3bc3e0b34f215945bb46319f88e28f011fc758f21ba888b1fd9e25"), - MacOS(:x86_64) => ("$bin_prefix/NNPACK.v2018.6.22.x86_64-apple-darwin14.tar.gz", "b30046223a11470b15a2ceb0d0df6f7d8a43260fe52f4a2f8ebe5f0b2df822ca"), - Linux(:x86_64, libc=:glibc) => ("$bin_prefix/NNPACK.v2018.6.22.x86_64-linux-gnu.tar.gz", "150d5b6ca81fa72bfdc8bbda2428f0d3483fd11a5813724646c6d6c6a7ef969f"), - Linux(:x86_64, libc=:musl) => ("$bin_prefix/NNPACK.v2018.6.22.x86_64-linux-musl.tar.gz", "d961a104f814ec5b356519a82746a70a1df193ae37fc8130f38ffb61336def16"), -) - -# Install unsatisfied or updated dependencies: -unsatisfied = any(!satisfied(p; verbose=verbose) for p in products) -dl_info = choose_download(download_info, platform_key_abi()) -if dl_info === nothing && unsatisfied - # If we don't have a 
compatible .tar.gz to download, complain. - # Alternatively, you could attempt to install from a separate provider, - # build from source or something even more ambitious here. - @warn "Your platform (\"$(Sys.MACHINE)\", parsed as \"$(triplet(platform_key_abi()))\") is not supported by NNPACK! - You will only be able to use only the default NNlib backend." -end - -# If we have a download, and we are unsatisfied (or the version we're -# trying to install is not itself installed) then load it up! -# Download and install binaries -use_nnpack = get(ENV, "NNLIB_USE_NNPACK", "false") == "true" -os_support = Sys.islinux() || Sys.isapple() -if use_nnpack && os_support - if unsatisfied || !isinstalled(dl_info...; prefix=prefix) - install(dl_info...; prefix=prefix, force=true, verbose=verbose) - end - # Write out a deps.jl file that will contain mappings for our products - write_deps_file(joinpath(@__DIR__, "deps.jl"), products, verbose=verbose) -else - open(joinpath(@__DIR__, "deps.jl"), "w") do io - write(io, "check_deps() = false") - end -end - diff --git a/src/NNlib.jl b/src/NNlib.jl index aa08bc299..ed9fee187 100644 --- a/src/NNlib.jl +++ b/src/NNlib.jl @@ -1,34 +1,38 @@ module NNlib -using Requires -# Include APIs -include("dim_helpers.jl") +# Start with the simplest stuff in here; activation functions +include("activation/activation.jl") +include("activation/softmax.jl") -# NNPACK support -include(joinpath(@__DIR__, "..", "deps", "deps.jl")) -if check_deps() == nothing - include("nnpack/NNPACK.jl") -else - is_nnpack_available() = false -end +# Load dimensionality helpers for convolution dispatching +include("dim_helpers/dim_helpers.jl") + +# Define our convolution/pooling interface backend holders +include("interface.jl") -include("activation.jl") -include("softmax.jl") -include("gemm.jl") -include("conv.jl") -include("pooling.jl") +# Begin with straightforward direct implementations +include("direct/direct.jl") +# Next, im2col implementations +include("im2col/im2col.jl") -## Include implementations -include("impl/padding_edges.jl") +# Next, NNPACK implementations +using NNPACK_jll -# Direct implementations of convolutional and depthwise-convolutional algorithms -include("impl/conv_direct.jl") -include("impl/depthwiseconv_direct.jl") -# im2col implementations of convolutional and depthwise-convolutional algorithms -include("impl/conv_im2col.jl") -include("impl/depthwiseconv_im2col.jl") +# Check to see if NNPACK_jll is loadable +if isdefined(NNPACK_jll, :libnnpack) + include("nnpack/NNPACK.jl") +else + # Otherwise, signal to the rest of the world that this is unavailable + """ + is_nnpack_available() + + Checks if the current platform/hardware is supported by NNPACK. + Your platform sadly, is not supported by NNPACK. + """ + is_nnpack_available() = false +end -# Direct implementations of pooling -include("impl/pooling_direct.jl") +# Finally, generate all the goodies for conv() and maxpool() and friends! 
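# --- Illustrative sketch (not part of this patch) --------------------------------
# Whichever branch above is taken, downstream code can query NNPACK availability in
# a uniform way; the fallback definition simply always answers `false`.
# `pick_backend` is a made-up helper name used purely for illustration.
pick_backend() = NNlib.is_nnpack_available() ? :nnpack : :im2col
# ----------------------------------------------------------------------------------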
+include("interface_impl.jl") end # module NNlib diff --git a/src/activation.jl b/src/activation/activation.jl similarity index 99% rename from src/activation.jl rename to src/activation/activation.jl index 9dffa5b18..08f2b7ed0 100644 --- a/src/activation.jl +++ b/src/activation/activation.jl @@ -1,3 +1,5 @@ +using Requires + export σ, sigmoid, relu, leakyrelu, elu, gelu, swish, selu, softplus, softsign, logσ, logsigmoid, logcosh, mish diff --git a/src/softmax.jl b/src/activation/softmax.jl similarity index 100% rename from src/softmax.jl rename to src/activation/softmax.jl diff --git a/src/conv.jl b/src/conv.jl deleted file mode 100644 index b2997d8f2..000000000 --- a/src/conv.jl +++ /dev/null @@ -1,181 +0,0 @@ -export conv, conv!, ∇conv_data, ∇conv_data!, ∇conv_filter, ∇conv_filter!, depthwiseconv, - depthwiseconv!, ∇depthwiseconv_data, ∇depthwiseconv_data!, ∇depthwiseconv_filter, - ∇depthwiseconv_filter! - -## Convolution API -# -# We provide the following generic methods, for 3d, 4d, and 5d tensors, calculating 1d, -# 2d and 3d convolutions, based on the rank of the input tensors, in both mutating and -# non-mutating auto-allocating variants: -# - Convolution: -# - conv(x, w, cdims) -# - conv!(y, x, w, cdims) -# - Convolution data backpropagation -# - ∇conv_data(dy, w, cdims) -# - ∇conv_data!(dx, dy, w, cdims) -# - Convolution filter backpropagation -# - ∇conv_filter(x, dy, cdims) -# - ∇conv_filter!(dw, x, dy, cdims) -# -# All methods require a `ConvDims` object to define the dimensions and optional -# elements of the convolution (padding, stride, dilation, kernel-flipping, etc...), -# which is easily constructable through something like `DenseConvDims(x, w)`. All -# methods take in the `ConvDims` of the associated normal, forward-pass convolution, -# that is, the following is legal: -# -# cdims = ConvDims(x, w; stride=2, dilation=(3,2)) -# dx = ∇conv_data(conv(x, w, cdims), w, cdims) - - - -# First, we will define mappings from the generic API names to our accelerated backend -# implementations. For homogeneous-datatype 1, 2 and 3d convolutions, we default to using -# im2col + GEMM. Do so in a loop, here: -for (front_name, backend) in ( - # This maps from public, front-facing name, to internal backend name - :conv => :im2col, - :∇conv_data => :im2col, - :∇conv_filter => :im2col, - :depthwiseconv => :im2col, - :∇depthwiseconv_data => :im2col, - :∇depthwiseconv_filter => :im2col, - ) - - # These are the GEMM types we will accelerate with `im2col` - G = Union{[x[2] for x in gemm_datatype_mappings]...} - - # We only define 3d conv primitives, we reshape lower down to get 1d and 2d convolution - @eval begin - # im2col-accelerated function forwarding definition - function $(Symbol("$(front_name)!"))( - out::AbstractArray{T,5}, in1::AbstractArray{T,5}, - in2::AbstractArray{T,5}, cdims::ConvDims; kwargs...) where {T <: $G} - $(Symbol("$(front_name)_$(backend)!"))(out, in1, in2, cdims; kwargs...) - end - end -end - -# Our strategy for 1d and 2d convolution is to reshape to 3d convolutions, which -# makes things MUCH EASIER for us on the backend side, and is in general pretty fast, -# since we can specialize on sizes. -for front_name in (:conv, :∇conv_data, :∇conv_filter, - :depthwiseconv, :∇depthwiseconv_data, :∇depthwiseconv_filter) - for backend in (Symbol(), :_direct, :_im2col) - for N in (3, 4) - @eval begin - function $(Symbol("$(front_name)$(backend)!"))( - y::AbstractArray{yT,$N}, x::AbstractArray{xT,$N}, - w::AbstractArray{wT,$N}, cdims::ConvDims; - kwargs...) 
where {yT, xT, wT} - $(Symbol("$(front_name)$(backend)!"))( - insert_singleton_spatial_dimension(y, $(5 - N)), - insert_singleton_spatial_dimension(x, $(5 - N)), - insert_singleton_spatial_dimension(w, $(5 - N)), - insert_singleton_spatial_dimension(cdims, $(5 - N)); - kwargs... - ) - - # We explicitly return `y` here, because the backend call - # itself may return a reshaped view, which we don't want. - return y - end - end - end - end -end - -# We always support a fallback, non-accelerated path, where we use the direct, but -# slow, implementations. These should not typically be used, hence the `@debug`, -# but let's ggo ahead and define them first: -for front_name in (:conv, :∇conv_data, :∇conv_filter, - :depthwiseconv, :∇depthwiseconv_data, :∇depthwiseconv_filter) - @eval begin - function $(Symbol("$(front_name)!"))( - y::AbstractArray{yT,N}, in1::AbstractArray{T1,N}, - in2::AbstractArray{T2,N}, cdims::ConvDims; - kwargs...) where {yT, T1, T2, N} - @debug string("Slow fallback implementation invoked for ", $front_name, "! ", - "You probably don't want this; check your datatypes.") - $(Symbol("$(front_name)_direct!"))(y, in1, in2, cdims; kwargs...) - end - end -end - -# Finally, let's generate auto-allocating versions of all our functions, for all backends. -# We `@timeit` these methods separately, as we want to know how much time is spent in -# allocation. :P -for backend in (Symbol(), :_direct, :_im2col) - # First make auto-allocating versions of the conv()-like calls: - for name in (:conv, :depthwiseconv) - @eval begin - function $(Symbol("$(name)$(backend)"))( - x::AbstractArray{xT,N}, w::AbstractArray{wT,N}, - cdims::ConvDims; kwargs...) where {xT, wT, N} - y = similar(x, promote_type(xT, wT), output_size(cdims)..., - channels_out(cdims), size(x,N)) - return $(Symbol("$(name)$(backend)!"))(y, x, w, cdims; kwargs...) - end - end - end - - for name in (:∇conv_data, :∇depthwiseconv_data) - @eval begin - function $(Symbol("$(name)$(backend)"))( - dy::AbstractArray{yT,N}, w::AbstractArray{wT,N}, - cdims::ConvDims; kwargs...) where {yT, wT, N} - dx = similar(dy, input_size(cdims)..., channels_in(cdims), - size(dy, N)) - return $(Symbol("$(name)$(backend)!"))(dx, dy, w, cdims; kwargs...) - end - end - end - - # We do the conv/depthwiseconv filter backprops separately, as the shape calculation - # for `w` is slightly different for depthwise than for normal dense convolution. - @eval begin - function $(Symbol("∇conv_filter$(backend)"))( - x::AbstractArray{xT,N}, dy::AbstractArray{yT,N}, - cdims::ConvDims; kwargs...) where {xT, yT, N} - dw = similar(dy, kernel_size(cdims)..., channels_in(cdims), - channels_out(cdims)) - return $(Symbol("∇conv_filter$(backend)!"))(dw, x, dy, cdims; kwargs...) - end - end - - @eval begin - function $(Symbol("∇depthwiseconv_filter$(backend)"))( - x::AbstractArray{xT,N}, dy::AbstractArray{yT,N}, - cdims::ConvDims; kwargs...) where {xT, yT, N} - dw = similar(dy, kernel_size(cdims)..., channel_multiplier(cdims), - channels_in(cdims)) - return $(Symbol("∇depthwiseconv_filter$(backend)!"))(dw, x, dy, cdims; - kwargs...) - end - end -end - - -# Use NNPACK if it is available and the operation is supported -if is_nnpack_available() - function conv(x::Array{xT, 4}, w::Array{wT, 4}, - cdims::DenseConvDims{2, K, C_in, C_out, (1, 1), P, (1, 1), F}; - kwargs...) where {xT, wT, K, C_in, C_out, P, F} - return conv_nnpack(x, w, cdims; kwargs...) 
- end -end - -function conv(x, w::AbstractArray{T, N}; stride = 1, pad = 0, dilation = 1, flipped = false) where {T, N} - stride = expand(Val(N-2), stride) - pad = expand(Val(N-2), pad) - dilation = expand(Val(N-2), dilation) - cdims = DenseConvDims(x, w; stride = stride, padding = pad, dilation = dilation, flipkernel = flipped) - return conv(x, w, cdims) -end - -function depthwiseconv(x, w::AbstractArray{T, N}; stride = 1, pad = 0, dilation = 1, flipped = false) where {T, N} - stride = expand(Val(N-2), stride) - pad = expand(Val(N-2), pad) - dilation = expand(Val(N-2), dilation) - cdims = DepthwiseConvDims(x, w; stride = stride, padding = pad, dilation = dilation, flipkernel = flipped) - return depthwiseconv(x, w, cdims) -end diff --git a/src/impl/padding_edges.jl b/src/dim_helpers/PaddingEdges.jl similarity index 100% rename from src/impl/padding_edges.jl rename to src/dim_helpers/PaddingEdges.jl diff --git a/src/dim_helpers.jl b/src/dim_helpers/dim_helpers.jl similarity index 93% rename from src/dim_helpers.jl rename to src/dim_helpers/dim_helpers.jl index 22d5636a7..ade8b9d8d 100644 --- a/src/dim_helpers.jl +++ b/src/dim_helpers/dim_helpers.jl @@ -1,9 +1,9 @@ # Various helper functions to calculate dimensions for operations -include("dim_helpers/ConvDims.jl") -include("dim_helpers/DenseConvDims.jl") -include("dim_helpers/DepthwiseConvDims.jl") -include("dim_helpers/PoolDims.jl") - +include("ConvDims.jl") +include("DenseConvDims.jl") +include("DepthwiseConvDims.jl") +include("PoolDims.jl") +include("PaddingEdges.jl") """ transpose_swapbatch(x::AbstractArray) @@ -130,11 +130,8 @@ end Reorders the weight tensor for supporting both convolution and cross-correlation operations. """ - -# For any array with ndims <= 3 it makes no sense to flip the weights so simply return the -# original array -@inline flipweight(w::AbstractArray) = w - -@inline flipweight(w::AbstractArray{T, 4}) where {T} = w[end:-1:1, end:-1:1, :, :] - @inline flipweight(w::AbstractArray{T, 5}) where {T} = w[end:-1:1, end:-1:1, end:-1:1, :, :] +@inline flipweight(w::AbstractArray{T, 4}) where {T} = w[end:-1:1, end:-1:1, :, :] +@inline flipweight(w::AbstractArray{T, 3}) where {T} = w[end:-1:1, :, :] +# For ndims < 3 it makes no sense to flip the weights so simply return the original array +@inline flipweight(w::AbstractArray) = w diff --git a/src/impl/conv_direct.jl b/src/direct/conv_direct.jl similarity index 92% rename from src/impl/conv_direct.jl rename to src/direct/conv_direct.jl index 5f5b7c4c3..bf86eadec 100644 --- a/src/impl/conv_direct.jl +++ b/src/direct/conv_direct.jl @@ -1,5 +1,5 @@ ## This file contains direct Julia implementations of 2d and 3d convolutions -using Base.Threads +export conv_direct!, ∇conv_data_direct!, ∇conv_filter_direct! # Helper functions for restricting x/w overreach function clamp_lo(x, w) @@ -22,12 +22,12 @@ end Direct convolution implementation; used for debugging, tests, and mixing/matching of strange datatypes within a single convolution. Uses naive nested for loop implementation -and does not attempt to optimize performance. Rather, this implementation is intended to -be maximally understandable and debuggable, to aid in testing other, more performant -implementations. We also explicitly support mixing and matching of strange datatypes, -so that if the user really wants to convolve an image of `UInt8`'s with a `Float16` -kernel, storing the result in a `Float32` output, there is at least a function call -for that madness. 
+and does not attempt to optimize performance at the cost of readability. Rather, this +implementation is intended to be maximally understandable and debuggable, to aid in +testing other, more performant implementations. We also explicitly support mixing and +matching of strange datatypes, so that if the user really wants to convolve an image of +`UInt8`'s with a `Float16` kernel, storing the result in a `Float32` output, there is at +least one callable function for that madness. The keyword arguments `alpha` and `beta` control accumulation behavior; this function calculates `y = alpha * x * w + beta * y`, therefore by setting `beta` to a nonzero @@ -43,8 +43,6 @@ The basic implementation performs 3-dimensional convolution; 1-dimensional and 2 dimensional casesa are supported by simply reshaping `y`, `x` and `w`, for which wrapper methods are available. """ -conv_direct! - function conv_direct!(y::AbstractArray{yT,5}, x::AbstractArray{xT,5}, w::AbstractArray{wT,5}, cdims::DenseConvDims; alpha::yT = yT(1), beta = false) where {yT, xT, wT} @@ -150,8 +148,6 @@ end Calculate the gradient imposed upon `x` in the convolution `y = x * w`. """ -∇conv_data_direct! - function ∇conv_data_direct!(dx::AbstractArray{xT,5}, dy::AbstractArray{yT,5}, w::AbstractArray{wT,5}, cdims::DenseConvDims; alpha::xT=xT(1), beta=false) where {xT, yT, wT} @@ -169,8 +165,6 @@ end Calculate the gradient imposed upon `w` in the convolution `y = x * w`. """ -∇conv_filter_direct! - function ∇conv_filter_direct!(dw::AbstractArray{wT,5}, x::AbstractArray{xT,5}, dy::AbstractArray{yT,5}, cdims::DenseConvDims; alpha::wT=wT(1), beta=false) where {xT, yT, wT} diff --git a/src/impl/depthwiseconv_direct.jl b/src/direct/depthwiseconv_direct.jl similarity index 98% rename from src/impl/depthwiseconv_direct.jl rename to src/direct/depthwiseconv_direct.jl index b6822a488..d7ba96689 100644 --- a/src/impl/depthwiseconv_direct.jl +++ b/src/direct/depthwiseconv_direct.jl @@ -1,4 +1,5 @@ ## This file contains direct Julia implementations of depwthwise convolutions +export depthwiseconv_direct!, ∇depthwiseconv_data_direct!, ∇depthwiseconv_filter_direct! """ depthwiseconv_direct!(y, x, w, cdims; alpha=1, beta=0) @@ -130,8 +131,6 @@ get applied to it. The output of such a convolution is the gradient imposed upo particular channel of `x`, and so we simply walk through `x`, calculating the gradient for each batch and channel independently. """ -∇depthwiseconv_data_direct! - function ∇depthwiseconv_data_direct!( dx::AbstractArray{xT,5}, dy::AbstractArray{yT,5}, w::AbstractArray{wT,5}, cdims::DepthwiseConvDims; diff --git a/src/direct/direct.jl b/src/direct/direct.jl new file mode 100644 index 000000000..10a135f85 --- /dev/null +++ b/src/direct/direct.jl @@ -0,0 +1,33 @@ +""" +Direct implementations of convolution, pooling, etc... +""" +module Direct +using ..NNlib +using ..NNlib: output_size, input_size, kernel_size, channels_in, channels_out, check_dims, + spatial_dims, stride, padding, dilation, flipkernel, calc_padding_regions, + transpose_swapbatch, predilate, transpose_pad, channel_multiplier + +include("conv_direct.jl") +include("depthwiseconv_direct.jl") +include("pooling_direct.jl") + +# Here we register our convolution and pooling methods with the parent NNlib module. +# We have direct implementations of just about everything, so push them all! 
+import ..conv_backends, ..pooling_backends +push!(conv_backends[:conv], :direct) +push!(conv_backends[:∇conv_data], :direct) +push!(conv_backends[:∇conv_filter], :direct) +push!(conv_backends[:depthwiseconv], :direct) +push!(conv_backends[:∇depthwiseconv_data], :direct) +push!(conv_backends[:∇depthwiseconv_filter], :direct) + +push!(pooling_backends[:maxpool], :direct) +push!(pooling_backends[:meanpool], :direct) + +end # module Direct + +# Self-using? Yes. +using .Direct +import .Direct: conv_direct!, ∇conv_data_direct!, ∇conv_filter_direct!, + depthwiseconv_direct!, ∇depthwiseconv_data_direct!, ∇depthwiseconv_filter_direct!, + meanpool_direct!, maxpool_direct!, ∇meanpool_direct!, ∇maxpool_direct! \ No newline at end of file diff --git a/src/impl/pooling_direct.jl b/src/direct/pooling_direct.jl similarity index 98% rename from src/impl/pooling_direct.jl rename to src/direct/pooling_direct.jl index f95ab32f5..3a03be7c3 100644 --- a/src/impl/pooling_direct.jl +++ b/src/direct/pooling_direct.jl @@ -1,3 +1,6 @@ +## This file contains direct Julia implementations of pooling operations +export meanpool_direct!, maxpool_direct! + using Statistics # Pooling is so similar, we abstract over meanpooling and maxpooling, simply replacing diff --git a/src/impl/conv_im2col.jl b/src/im2col/conv_im2col.jl similarity index 99% rename from src/impl/conv_im2col.jl rename to src/im2col/conv_im2col.jl index e06231325..e80576f05 100644 --- a/src/impl/conv_im2col.jl +++ b/src/im2col/conv_im2col.jl @@ -1,6 +1,8 @@ ## This file contains im2col-backed implementations of convolution for 2d and 3d ## convolutions. Expect to see a lot of indexing. +export conv_im2col!, ∇conv_data_im2col!, ∇conv_filter_im2col! + # Helper functions for flipkernel-induced dyslexia @inline function kernel_index(w, h, d, cdims::ConvDims{N, S, P, D, false}) where {N, S, P, D} kernel_w, kernel_h, kernel_d = kernel_size(cdims) @@ -293,8 +295,6 @@ Note that this method has not been optimized in the same way as `im2col()` has, it is slightly more complicated due to the more chaotic data access patterns, and I'm not desperate enough yet. """ -col2im! - function col2im!(x::AbstractArray{T,4}, col::AbstractArray{T,2}, cdims::ConvDims) where T if spatial_dims(cdims) != 3 diff --git a/src/impl/depthwiseconv_im2col.jl b/src/im2col/depthwiseconv_im2col.jl similarity index 98% rename from src/impl/depthwiseconv_im2col.jl rename to src/im2col/depthwiseconv_im2col.jl index 145dc9961..c1a3de8a4 100644 --- a/src/impl/depthwiseconv_im2col.jl +++ b/src/im2col/depthwiseconv_im2col.jl @@ -1,5 +1,5 @@ ## This file contains adapter code for doing depthwise convolutions with im2col. - +export depthwiseconv_im2col!, ∇depthwiseconv_data_im2col!, ∇depthwiseconv_filter_im2col! """ depthwiseconv_im2col!(y, x, w, cdims, col=similar(x); alpha=1, beta=0) @@ -8,8 +8,6 @@ Perform a depthwise convolution using im2col and GEMM, store the result in `y`. See `conv_im2col!()` for an explanation of optional parameters. """ -depthwiseconv_im2col! 
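# --- Illustrative sketch (not part of this patch) --------------------------------
# The `push!` registrations above are how each backend module announces itself to
# the parent NNlib module.  Once NNlib is loaded, the registries can be inspected to
# see which backends made it in on the current machine; the values in the comments
# are what the include order in src/NNlib.jl would produce.
using NNlib
NNlib.conv_backends[:conv]        # Any[:direct, :im2col] (plus :nnpack when libnnpack loads)
NNlib.pooling_backends[:maxpool]  # Any[:direct] (plus :nnpack when libnnpack loads)
# Note that interface_impl.jl only generates forwarding methods for the three
# backend symbols it knows about (:direct, :im2col, :nnpack), so registering a new
# symbol alone is not enough to wire up an additional backend.
# ----------------------------------------------------------------------------------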
- function depthwiseconv_im2col!( y::AbstractArray{T,5}, x::AbstractArray{T,5}, w::AbstractArray{T,5}, cdims::DepthwiseConvDims; diff --git a/src/gemm.jl b/src/im2col/gemm.jl similarity index 100% rename from src/gemm.jl rename to src/im2col/gemm.jl diff --git a/src/im2col/im2col.jl b/src/im2col/im2col.jl new file mode 100644 index 000000000..283b59d5c --- /dev/null +++ b/src/im2col/im2col.jl @@ -0,0 +1,27 @@ +module Im2col +using ..NNlib +using ..NNlib: im2col_dims, output_size, input_size, kernel_size, channels_in, channels_out, check_dims, + spatial_dims, stride, padding, dilation, flipkernel, calc_padding_regions, channel_multiplier +using Base.Threads + +include("gemm.jl") +include("conv_im2col.jl") +include("depthwiseconv_im2col.jl") + + +# Here we register our convolution methods with the parent NNlib module. +# We only do convolution, no pooling. +import ..conv_backends +push!(conv_backends[:conv], :im2col) +push!(conv_backends[:∇conv_data], :im2col) +push!(conv_backends[:∇conv_filter], :im2col) +push!(conv_backends[:depthwiseconv], :im2col) +push!(conv_backends[:∇depthwiseconv_data], :im2col) +push!(conv_backends[:∇depthwiseconv_filter], :im2col) + +end # module Im2col + +using .Im2col +import .Im2col: conv_im2col!, ∇conv_data_im2col!, ∇conv_filter_im2col!, + depthwiseconv_im2col!, ∇depthwiseconv_data_im2col!, + ∇depthwiseconv_filter_im2col! \ No newline at end of file diff --git a/src/interface.jl b/src/interface.jl new file mode 100644 index 000000000..8e537aeb9 --- /dev/null +++ b/src/interface.jl @@ -0,0 +1,66 @@ +## Convolution and Pooling API +# +# We provide the following generic methods, for 3d, 4d, and 5d tensors, calculating 1d, +# 2d and 3d convolutions and pooling based on the rank of the input tensors, in both +# mutating and non-mutating/auto-allocating variants: +# - Convolution: +# - conv(x, w, cdims) +# - conv!(y, x, w, cdims) +# - Convolution data backpropagation +# - ∇conv_data(dy, w, cdims) +# - ∇conv_data!(dx, dy, w, cdims) +# - Convolution filter backpropagation +# - ∇conv_filter(x, dy, cdims) +# - ∇conv_filter!(dw, x, dy, cdims) +# - Pooling: +# - maxpool(x, pdims) +# - maxpool!(y, x, pdims) +# - meanpool(x, pdims) +# - meanpool!(y, x, pdims) +# - Pooling data backprop +# - ∇maxpool(dy, y, x, pdims) +# - ∇maxpool!(dx, dy, y, x, pdims) +# - ∇meanpool(dy, y, x, pdims) +# - ∇meanpool!(dx, dy, y, x pdims) +# +# All methods require a `ConvDims` or `PoolDims` object to define the dimensions and +# meta elements of the convolution (padding, stride, dilation, kernel-flipping, etc...) +# which is easily constructable through something like `DenseConvDims(x, w)`. 
All +# methods take in the `ConvDims` of the associated normal, forward-pass convolution, +# that is, the following is legal: +# +# cdims = DenseConvDims(x, w; stride=2, dilation=(3,2)) +# dx = ∇conv_data(conv(x, w, cdims), w, cdims) +# +# Note that we do provide a helper API in the case that you don't want to bother with +# DenseConvDims and friends: you can simply do the following, however it will be less +# performant if you run the same operation multiple times: +# +# y = conv(x, w; stride=2, dilation=(3,2)) + + +# We support a pluggable backend system, currently consisting of three possible backends: +# * `nnpack`: which uses the third-party NNPACK libraries for convolution and pooling +# * `im2col`: A Julia BLAS-based implementation of convolution +# * `direct`: A Julia-native direct implementation of convolution and pooling +# +# We store each within a module (in the case of NNPACK, it is included only if the +# NNPACK binaries are available for the host system) and each module pushes a value +# onto these `conv_backends` lists. Those lists are then read from in the file +# `interface_impl.jl` which generates the nice interface described above, using a mixture +# of dispatch and runtime checks to provide the convenient `conv()` -> `conv!()` -> +# `conv_nnpack!()` interface that we all know and love. +conv_backends = Dict( + :conv => [], + :∇conv_data => [], + :∇conv_filter => [], + :depthwiseconv => [], + :∇depthwiseconv_data => [], + :∇depthwiseconv_filter => [], +) + +# Same thing for pooling +pooling_backends = Dict( + :maxpool => [], + :meanpool => [], +) \ No newline at end of file diff --git a/src/interface_impl.jl b/src/interface_impl.jl new file mode 100644 index 000000000..169ce075c --- /dev/null +++ b/src/interface_impl.jl @@ -0,0 +1,361 @@ +## This file creates the mappings from `conv!()` -> `conv_nnpack!()`, as well as +# convenience functions such as `conv_nnpack()` -> `conv_nnpack!()`, the auto-allocating +# variants, the reshaping variants, etc... + + # convolution +export conv, conv!, ∇conv_data, ∇conv_data!, ∇conv_filter, ∇conv_filter!, + # depthwise convolution + depthwiseconv, depthwiseconv!, ∇depthwiseconv_data, ∇depthwiseconv_data!, + ∇depthwiseconv_filter, ∇depthwiseconv_filter!, + # pooling + maxpool, maxpool!, meanpool, meanpool!, ∇maxpool, ∇maxpool!, ∇meanpool, ∇meanpool! + + +# We're going to use some typenames an awful lot +const AA = AbstractArray + +# Luckily, because the dispatch signatures for each of these backends are distinct, +# we can just layer the applicable backends and multiple dispatch will take care of +# calling the correct version for us. +for (front_name, backends) in conv_backends + # _nnpack() methods are generally preferrable, but they're pretty strict + # in what they can do; they can't deal with anything other than Float32, + # they only do conv2d, and they can't deal with stride or dilation. + if :nnpack in backends + # Only conv2d, no conv1d or conv3d. Also no stride or dilation + nnpack_cdims = DenseConvDims{ + # only conv2d, no conv1d or conv3d + 2, + # Any kernel size, channels in or out, whatever, it's good. + K, C_in, C_out, + # No stride + (1, 1), + # Padding can be whatevs + P, + # No dilation + (1, 1), + # Flipping is fine + F, + } where {K, C_in, C_out, P, F} + + # Map from the front name to the back name + fname = Symbol("$(front_name)!") + bname = Symbol("$(front_name)_nnpack!") + @eval begin + function $(fname)(out::AA{Float32, 4}, in1::AA{Float32, 4}, + in2::AA{Float32, 4}, cdims::$(nnpack_cdims); + kwargs...) 
+ $(bname)(out, in1, in2, cdims; kwargs...) + return out + end + end + end + + # _im2col() methods are a little less strict. They can deal with any kind + # of DenseConvDims, but they are still limited to the basic types that BLAS + # can deal with, which is {Complex,}Float{32,64}. For BLAS to work, all + # types must also be the same: + if :im2col in backends + # These are the types that our BLAS can handle + BLAS_TYPES = Union{[x[2] for x in Im2col.gemm_datatype_mappings]...} + + # Map from the front name to the back name + fname = Symbol("$(front_name)!") + bname = Symbol("$(front_name)_im2col!") + @eval begin + function $(fname)(out::AA{T}, in1::AA{T}, + in2::AA{T}, cdims::ConvDims; + kwargs...) where {T <: $(BLAS_TYPES)} + $(bname)(out, in1, in2, cdims; kwargs...) + return out + end + + # We add here some "expanders" that convert conv1d/2d inputs up to + # the conv3d input shape that our backend is expecting: + function $(bname)(out::AA{T,3}, in1::AA{T,3}, in2::AA{T,3}, + cdims::ConvDims; kwargs...) where {T <: $(BLAS_TYPES)} + out, in1, in2, cdims = expand_dimensions(Val(5), out, in1, in2, cdims) + $(bname)(out, in1, in2, cdims; kwargs...) + return out + end + function $(bname)(out::AA{T,4}, in1::AA{T,4}, in2::AA{T,4}, + cdims::ConvDims; kwargs...) where {T <: $(BLAS_TYPES)} + out, in1, in2, cdims = expand_dimensions(Val(5), out, in1, in2, cdims) + $(bname)(out, in1, in2, cdims; kwargs...) + return out + end + end + end + + # _direct() can take in anything, but it can be much slower, so it's best + # to take advantage of the accelerated definitions above. We still do the + # expansion of dimensions here, to make sure this works for conv3d cases. + if :direct in backends + fname = Symbol("$(front_name)!") + bname = Symbol("$(front_name)_direct!") + @eval begin + function $(fname)(out::AA, in1::AA, + in2::AA, cdims::ConvDims; + kwargs...) + @debug string("Slow fallback implementation invoked for ", $front_name, "! ", + "You probably don't want this; check your datatypes.") + $(bname)(out, in1, in2, cdims; kwargs...) + return out + end + + # We add here some "expanders" that convert conv1d/2d inputs up to + # the conv3d input shape that our backend is expecting: + function $(bname)(out::AA{<:Any,3}, in1::AA{<:Any,3}, in2::AA{<:Any,3}, + cdims::ConvDims; kwargs...) + out, in1, in2, cdims = expand_dimensions(Val(5), out, in1, in2, cdims) + $(bname)(out, in1, in2, cdims; kwargs...) + return out + end + function $(bname)(out::AA{<:Any,4}, in1::AA{<:Any,4}, in2::AA{<:Any,4}, + cdims::ConvDims; kwargs...) + out, in1, in2, cdims = expand_dimensions(Val(5), out, in1, in2, cdims) + $(bname)(out, in1, in2, cdims; kwargs...) + return out + end + end + end +end + +""" + expand_dimensions(M, x, args...) + +Inserts singleton dimensions into `x`, and any further arguments until they +are `M`-dimensional. It is an error for the input tensors to be of greater +dimensionality than `M`. +""" +function expand_dimensions(::Val{M}, x::AbstractArray{<:Any, N}, args...) where {N, M} + if M == N + return (x, args...) + end + if N > M + error("Cannot expand_dimensions() to a smaller dimensionality!") + end + return ( + insert_singleton_spatial_dimension(x, M - N), + insert_singleton_spatial_dimension.(args, M - N)..., + ) +end + +# Finally, let's generate auto-allocating versions of all our functions. +# These are the ones that don't have the `!` at the end. Note that we do not +# type-specialize these like the non-allocating versions. 
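# --- Illustrative sketch (not part of this patch) --------------------------------
# What the `expand_dimensions()` helper above does for the conv2d case: a 4d
# (W, H, C, N) tensor picks up one singleton spatial dimension so that the 3d-only
# backend kernels can run on it unchanged.  Sizes are arbitrary example values and
# `expand_dimensions` is an internal (unexported) helper.
x = randn(Float32, 8, 8, 3, 2)            # a conv2d input: (W, H, C_in, N)
(x5,) = NNlib.expand_dimensions(Val(5), x)
@assert size(x5) == (8, 8, 1, 3, 2)       # singleton spatial dim inserted before C_in
# ----------------------------------------------------------------------------------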
+for backend in (Symbol(), :_direct, :_im2col, :_nnpack) + # First, conv() forward passes + for name in (:conv, :depthwiseconv) + fname = Symbol("$(name)$(backend)") + bname = Symbol("$(name)$(backend)!") + @eval begin + function $(fname)(x::AA, w::AA, cdims::ConvDims; kwargs...) + y = similar(x, + promote_type(eltype(x), eltype(w)), + output_size(cdims)..., + channels_out(cdims), + size(x, ndims(x)), + ) + $(bname)(y, x, w, cdims; kwargs...) + return y + end + end + end + + # Next, backward passes for `_data()` + for name in (:∇conv_data, :∇depthwiseconv_data) + fname = Symbol("$(name)$(backend)") + bname = Symbol("$(name)$(backend)!") + @eval begin + function $(fname)(dy::AA, w::AA, cdims::ConvDims; kwargs...) + dx = similar(dy, + input_size(cdims)..., + channels_in(cdims), + size(dy, ndims(dy)), + ) + $(bname)(dx, dy, w, cdims; kwargs...) + return dx + end + end + end + + # We do the filter backprops separately, as the shape calculation for `w` + # is slightly different for depthwise than for normal dense convolution. + fname = Symbol("∇conv_filter$(backend)") + bname = Symbol("∇conv_filter$(backend)!") + @eval begin + function $(fname)(x::AA, dy::AA, cdims::ConvDims; kwargs...) + dw = similar(dy, + kernel_size(cdims)..., + channels_in(cdims), + channels_out(cdims), + ) + $(bname)(dw, x, dy, cdims; kwargs...) + return dw + end + end + + fname = Symbol("∇depthwiseconv_filter$(backend)") + bname = Symbol("∇depthwiseconv_filter$(backend)!") + @eval begin + function $(fname)(x::AA, dy::AA, cdims::ConvDims; kwargs...) + dw = similar(dy, + kernel_size(cdims)..., + channel_multiplier(cdims), + channels_in(cdims), + ) + $(bname)(dw, x, dy, cdims; kwargs...) + return dw + end + end +end + +## Pooling +for (front_name, backends) in pooling_backends + if :nnpack in backends + nnpack_pdims = PoolDims{ + # only conv2d, no conv1d or conv3d + 2, + # Any kernel size, channels in or out, whatever, it's good. + K, + # No stride + S, + # Padding can be whatevs + P, + # No dilation + (1, 1), + } where {K, S, P} + # Map from the front name to the back name + fname = Symbol("$(front_name)!") + @eval begin + function $(fname)(out::AA{Float32, 4}, in::AA{Float32, 4}, + pdims::$(nnpack_pdims); kwargs...) + # Check to see if this is a supported operation + if nnpack_supported_pooling(pdims) + $(Symbol("$(front_name)_nnpack!"))(out, in, pdims; kwargs...) + else + # If it's not suported, then bail out to _direct() + $(Symbol("$(front_name)_direct!"))(out, in, pdims; kwargs...) + end + return out + end + end + end + + if :direct in backends + fname = Symbol("$(front_name)!") + bname = Symbol("$(front_name)_direct!") + @eval begin + function $(fname)(out::AA, in::AA, pdims::PoolDims; kwargs...) + $(bname)(out, in, pdims; kwargs...) + return out + end + + # Add reshapers + function $(bname)(out::AA{T,3}, in::AA{T,3}, + pdims::PoolDims; kwargs...) where {T} + outx, inx, pdimsx = expand_dimensions(Val(5), out, in, pdims) + $(bname)(outx, inx, pdimsx; kwargs...) + return out + end + function $(bname)(out::AA{T,4}, in::AA{T,4}, + pdims::PoolDims; kwargs...) where {T} + outx, inx, pdimsx = expand_dimensions(Val(5), out, in, pdims) + $(bname)(outx, inx, pdimsx; kwargs...) + return out + end + end + end +end + +# We only have direct backprop for pooling +for (front_name, backend) in ( + :∇maxpool => :direct, + :∇meanpool => :direct, + ) + fname = Symbol("$(front_name)!") + bname = Symbol("$(front_name)_direct!") + @eval begin + function $(fname)(dx::AA{T}, dy::AA{T}, + y::AA{T}, x::AA{T}, + pdims::PoolDims; kwargs...) 
where {T} + $(bname)(dx, dy, y, x, pdims; kwargs...) + return dx + end + function $(bname)(dx::AA{T,3}, dy::AA{T,3}, + y::AA{T,3}, x::AA{T,3}, + pdims::PoolDims; kwargs...) where {T} + dxx, dyx, yx, xx, pdimsx = expand_dimensions(Val(5), dx, dy, y, x, pdims) + $(bname)(dxx, dyx, yx, xx, pdimsx; kwargs...) + return dx + end + function $(bname)(dx::AA{T,4}, dy::AA{T,4}, + y::AA{T,4}, x::AA{T,4}, + pdims::PoolDims; kwargs...) where {T} + dxx, dyx, yx, xx, pdimsx = expand_dimensions(Val(5), dx, dy, y, x, pdims) + $(bname)(dxx, dyx, yx, xx, pdimsx; kwargs...) + return dx + end + end +end + +# Finally, let's generate auto-allocating versions of all our functions, for all backends: +for backend in (Symbol(), :_direct), + name in (:maxpool, :meanpool) + + fname = Symbol("$(name)$(backend)") + bname = Symbol("$(name)$(backend)!") + f_backname = Symbol("∇$(name)$(backend)") + b_backname = Symbol("∇$(name)$(backend)!") + @eval begin + function $(fname)(x::AA, pdims::PoolDims; kwargs...) + y = similar(x, output_size(pdims)..., channels_out(pdims), size(x, ndims(x))) + fill!(y, zero(eltype(x))) + $(bname)(y, x, pdims; kwargs...) + return y + end + + # Backprops too + function $(f_backname)(dy::AA, y::AA, x::AA, pdims::PoolDims; kwargs...) + dx = similar(x, input_size(pdims)..., channels_in(pdims), size(dy, ndims(dy))) + fill!(dx, zero(eltype(x))) + $(b_backname)(dx, dy, y, x, pdims; kwargs...) + return dx + end + end +end + +expand(N, i::Tuple) = i +expand(N, i::Integer) = ntuple(_ -> i, N) + +# Simplified conv() adapters that construct the `DenseConvDims` for you. +function conv(x, w::AbstractArray{T, N}; stride = 1, pad = 0, dilation = 1, flipped = false) where {T, N} + stride = expand(Val(N-2), stride) + pad = expand(Val(N-2), pad) + dilation = expand(Val(N-2), dilation) + cdims = DenseConvDims(x, w; stride = stride, padding = pad, dilation = dilation, flipkernel = flipped) + return conv(x, w, cdims) +end + +function depthwiseconv(x, w::AbstractArray{T, N}; stride = 1, pad = 0, dilation = 1, flipped = false) where {T, N} + stride = expand(Val(N-2), stride) + pad = expand(Val(N-2), pad) + dilation = expand(Val(N-2), dilation) + cdims = DepthwiseConvDims(x, w; stride = stride, padding = pad, dilation = dilation, flipkernel = flipped) + return depthwiseconv(x, w, cdims) +end + +function maxpool(x, k::NTuple{N, Integer}; pad = 0, stride = k) where N + pad = expand(Val(N), pad) + stride = expand(Val(N), stride) + pdims = PoolDims(x, k; padding = pad, stride = stride) + return maxpool(x, pdims) +end + +function meanpool(x, k::NTuple{N, Integer}; pad = 0, stride = k) where N + pad = expand(Val(N), pad) + stride = expand(Val(N), stride) + pdims = PoolDims(x, k; padding = pad, stride = stride) + return meanpool(x, pdims) +end diff --git a/src/nnpack/NNPACK.jl b/src/nnpack/NNPACK.jl index 1e420a9cc..46c10c228 100644 --- a/src/nnpack/NNPACK.jl +++ b/src/nnpack/NNPACK.jl @@ -1,31 +1,14 @@ +module NNPACK +using ..NNlib +using ..NNlib: check_dims, input_size, output_size, kernel_size, padding, stride, flipkernel, flipweight +using NNPACK_jll + include("libnnpack_types.jl") include("error.jl") include("libnnpack.jl") -include("performance.jl") +include("multithreading.jl") include("interface.jl") -const depsjl_path = joinpath(dirname(@__FILE__), "..", "..", "deps", "deps.jl") -if !isfile(depsjl_path) - error("NNPACK not installed properly, run Pkg.build(\"NNlib\"), restart Julia and try again") -end - -const shared_threadpool_dict = Dict{UInt64, Base.RefValue}() - -""" - is_nnpack_available() - -Checks if the 
current hardware is supported by NNPACK. -""" -function is_nnpack_available() - check_deps() isa Nothing || return false - status = nnp_initialize() - if status == nnp_status_unsupported_hardware - return false - else - return true - end -end - """ allocate_threadpool() @@ -41,12 +24,11 @@ function allocate_threadpool() end end -@init begin - check_deps() - status = nnp_initialize() - if status == nnp_status_unsupported_hardware - @warn "Hardware is unsupported by NNPACK so falling back to default NNlib" +function __init__() + if !is_nnpack_available() + @warn "Hardware unsupported by NNPACK, falling back to other NNlib backends" end + try global NNPACK_CPU_THREADS = parse(UInt64, ENV["NNPACK_CPU_THREADS"]) catch @@ -57,3 +39,17 @@ end end allocate_threadpool() end + +# Here we register our convolution and pooling methods with the parent NNlib module. +# We have implementations only for normal convolution and maxpooling: +import ..conv_backends, ..pooling_backends +push!(conv_backends[:conv], :nnpack) +push!(conv_backends[:∇conv_data], :nnpack) +push!(conv_backends[:∇conv_filter], :nnpack) + +push!(pooling_backends[:maxpool], :nnpack) +end # module NNPACK + +using .NNPACK +import .NNPACK: maxpool_nnpack!, nnpack_supported_operation, + conv_nnpack!, ∇conv_data_nnpack!, ∇conv_filter_nnpack! \ No newline at end of file diff --git a/src/nnpack/impl.jl b/src/nnpack/impl.jl deleted file mode 100644 index 5d3086583..000000000 --- a/src/nnpack/impl.jl +++ /dev/null @@ -1,50 +0,0 @@ -function maxpool_nnpack!(y::A, x::A, pdims::PoolDims) where {A<:Array{Float32, 4}} - check_dims(size(x), size(y), pdims) - threadpool = select_threadpool(pdims, size(y, 4)) - nnp_max_pooling_output(y, x, kernel_size(pdims), padding = padding(pdims), - stride = stride(pdims), threadpool = threadpool) -end - -function conv_nnpack!(y::A1, x::A1, w::A1, cdims::ConvDims; - b::A2 = zeros(Float32, size(x, 3)), - algo = UInt32(0)) where {A1<:Array{Float32, 4}, - A2<:Array{Float32, 1}} - check_dims(size(x), size(w), size(y), cdims) - threadpool = select_threadpool(cdims, size(y, 4)) - - if flipkernel(cdims) == 0 - w = flipweight(w) - end - - nnp_convolution_output(y, x, w, b, algo = algo, padding = padding(cdims), - stride = stride(cdims), threadpool = threadpool) -end - -function ∇conv_data_nnpack!(dx::A, dy::A, w::A, cdims::ConvDims; - algo = UInt32(0)) where{A<:Array{Float32, 4}} - check_dims(size(dx), size(w), size(dy), cdims) - threadpool = select_threadpool(cdims, size(y, 4)) - - if flipkernel(cdims) == 0 - w = flipweight(w) - end - - nnp_convolution_input_gradient(dx, dy, w, algo = algo, padding = padding(cdims), - stride = stride(cdims), threadpool = threadpool) -end - -function ∇conv_filter_nnpack!(dw::A, x::A, dy::A, cdims::ConvDims; - algo = UInt32(0)) where{A<:Array{Float32, 4}} - check_dims(size(x), size(dw), size(dy), cdims) - threadpool = select_threadpool(cdims, size(y, 4)) - - nnp_convolution_kernel_gradient(dw, x, dy, algo = algo, padding = padding(cdims), - stride = stride(cdims), threadpool = threadpool) - - if flipkernel(cdims) == 0 - dw .= flipweight(dw) - end - - dw -end - diff --git a/src/nnpack/interface.jl b/src/nnpack/interface.jl index 25ab93632..74f38c2fb 100644 --- a/src/nnpack/interface.jl +++ b/src/nnpack/interface.jl @@ -1,70 +1,90 @@ -include("impl.jl") - - -for (front_name, backend) in ( - :conv => :_nnpack, - :∇conv_data => :_nnpack, - :∇conv_filter => :_nnpack, - ) - @eval begin - function $(Symbol("$(front_name)$(backend)!"))( - out::Array{T1,4}, in1::Array{T2,4}, in2::Array{T3,4}, - 
cdims::ConvDims; kwargs...) where {T1, T2, T3} - @warn "Automatically converting input tensor to Float32. This will have performance implications" maxlog=1 - # Output must of the same type as in the function signature - T1.($(Symbol("$(front_name)$(backend)!"))(Float32.(out), Float32.(in1), - Float32.(in2), cdims; kwargs...)) - end - end -end +export is_nnpack_available, + # Pooling + maxpool_nnpack!, nnpack_supported_operation, + # Convolution + conv_nnpack!, ∇conv_data_nnpack!, ∇conv_filter_nnpack! + +""" + is_nnpack_available() +Checks if the current hardware is supported by NNPACK. -function conv_nnpack(x::Array{T1, 4}, w::Array{T2, 4}, cdims::ConvDims; kwargs...) where {T1, T2} - y = similar(x, output_size(cdims)..., channels_out(cdims), size(x, 4)) - return conv_nnpack!(y, x, w, cdims; kwargs...) +While the platform itself may be supported by NNPACK, certain hardware +configurations (such as processors lacking SSE) are not. +""" +function is_nnpack_available() + return nnp_initialize() != nnp_status_unsupported_hardware end +# Conv +function conv_nnpack!(y::A1, x::A1, w::A1, cdims::ConvDims; + b::A2 = zeros(Float32, size(x, 3)), + algo = UInt32(0)) where {A1<:Array{Float32, 4}, + A2<:Array{Float32, 1}} + check_dims(size(x), size(w), size(y), cdims) + threadpool = select_threadpool(cdims, size(y, 4)) + + if flipkernel(cdims) == 0 + w = flipweight(w) + end -function ∇conv_data(dy::Array{T1, 4}, w::Array{T2, 4}, cdims::ConvDims; kwargs...) where {T1, T2} - dx = similar(dy, input_size(cdims)..., channels_in(cdims), size(dy, 4)) - return ∇conv_data!(dx, dy, w, cdims; kwargs...) + nnp_convolution_output(y, x, w, b, algo = algo, padding = padding(cdims), + stride = stride(cdims), threadpool = threadpool) end +function ∇conv_data_nnpack!(dx::A, dy::A, w::A, cdims::ConvDims; + algo = UInt32(0)) where{A<:Array{Float32, 4}} + check_dims(size(dx), size(w), size(dy), cdims) + threadpool = select_threadpool(cdims, size(dy, 4)) + + if flipkernel(cdims) == 0 + w = flipweight(w) + end -function ∇conv_filter(x::Array{T1, 4}, dy::Array{T2, 4}, cdims::ConvDims; kwargs...) where {T1, T2} - dw = similar(x, kernel_size(cdims)..., channels_in(cdims), channels_out(cdims)) - return ∇conv_filter!(dw, x, dy, cdims; kwargs...) + nnp_convolution_input_gradient(dx, dy, w, algo = algo, padding = padding(cdims), + stride = stride(cdims), threadpool = threadpool) end +function ∇conv_filter_nnpack!(dw::A, x::A, dy::A, cdims::ConvDims; + algo = UInt32(0)) where{A<:Array{Float32, 4}} + check_dims(size(x), size(dw), size(dy), cdims) + threadpool = select_threadpool(cdims, size(dy, 4)) + + nnp_convolution_kernel_gradient(dw, x, dy, algo = algo, padding = padding(cdims), + stride = stride(cdims), threadpool = threadpool) + + if flipkernel(cdims) == 0 + dw .= flipweight(dw) + end -function maxpool_nnpack!(y::Array{T1, 4}, x::Array{T2, 4}, pdims::PoolDims; - kwargs...) where {T1, T2} - @warn "Automatically converting input tensor to Float32. This will have performance implications" maxlog=1 - # We want the output to be of the same type as desired - T1.(maxpool_nnpack!(Float32.(y), Float32.(x), pdims; kwargs...)) + dw end -function maxpool_nnpack(x::Array{T, 4}, pdims::PoolDims; kwargs...) where {T} - y = similar(x, output_size(pdims)..., channels_out(pdims), size(x, 4)) - return maxpool_nnpack!(y, x, pdims; kwargs...) 
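# --- Illustrative sketch (not part of this patch) --------------------------------
# Driving the NNPACK convolution backend defined above.  NNPACK only handles
# Float32 4d tensors with unit stride and unit dilation, so the inputs are chosen
# to stay inside that envelope; sizes are arbitrary example values.
x = rand(Float32, 16, 16, 3, 1)             # (W, H, C_in, N)
w = rand(Float32, 3, 3, 3, 8)               # (kW, kH, C_in, C_out)
cdims = DenseConvDims(x, w)                 # stride = 1, dilation = 1 by default
if NNlib.is_nnpack_available()
    # auto-allocating wrapper generated for the :_nnpack backend in interface_impl.jl
    y = NNlib.conv_nnpack(x, w, cdims)
end
# ----------------------------------------------------------------------------------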
+# Pooling +function maxpool_nnpack!(y::A, x::A, pdims::PoolDims) where {A<:Array{Float32, 4}} + check_dims(size(x), size(y), pdims) + threadpool = select_threadpool(pdims, size(y, 4)) + nnp_max_pooling_output(y, x, kernel_size(pdims), padding = padding(pdims), + stride = stride(pdims), threadpool = threadpool) end - """ nnpack_supported_operation(cdims::ConvDims) - nnpack_supported_operation(pdims::PoolDims) -Returns `true` if nnpack supports the convolution/pooling operation for the given parameters. +Returns `true` if nnpack supports the conv/pooling operation for the given +parameters. For convolution this can be known at compile-time, however for +pooling, we cannot describe the stride domain constraint purely with types, +so we must do it at runtime with this method. """ function nnpack_supported_operation(pdims::PoolDims{2, K, S, P, (1, 1)}) where {K, S, P} - val = input_size(pdims)[1:2] .+ (P[1] + P[2], P[3] + P[4]) .- K - return val .% S == (0, 0) ? true : false + # Ensure that the kernel striding perfectly covers the padded input size. + stride_domain = input_size(pdims)[1:2] .+ (P[1] + P[2], P[3] + P[4]) .- K + return stride_domain .% S == (0, 0) end -function nnpack_supported_operation(cdims::ConvDims{2, K, (1, 1), P, (1, 1)}) where {K, S, P} - return true -end +NNPACK_CDIMS = DenseConvDims{2,K,C_in,C_out,(1,1),P,(1,1),F} where {K,C_in,C_out,P,F} +nnpack_supported_operation(cdims::NNPACK_CDIMS) = true -# Return false for everything else +# Say false by default nnpack_supported_operation(dims) = false diff --git a/src/nnpack/libnnpack.jl b/src/nnpack/libnnpack.jl index 2f3996c32..4767a9545 100644 --- a/src/nnpack/libnnpack.jl +++ b/src/nnpack/libnnpack.jl @@ -90,8 +90,6 @@ function nnp_max_pooling_output(y::Array{Float32,4}, x::Array{Float32,4}, kernel y end -#TODO: Add wrapper for convolution inference - function nnp_convolution_input_gradient(algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, grad_output, kernel, grad_input, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, profile) @nnpack_check ccall((:nnp_convolution_input_gradient, libnnpack), nnp_status, (nnp_convolution_algorithm, Csize_t, Csize_t, Csize_t, nnp_size, nnp_padding, nnp_size, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cvoid}, Csize_t, nnp_activation, Ptr{Cvoid}, pthreadpool_t, Ptr{Cvoid}), algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, grad_output, kernel, grad_input, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, C_NULL) end diff --git a/src/nnpack/performance.jl b/src/nnpack/multithreading.jl similarity index 87% rename from src/nnpack/performance.jl rename to src/nnpack/multithreading.jl index 24abdb411..6f37ccfcc 100644 --- a/src/nnpack/performance.jl +++ b/src/nnpack/multithreading.jl @@ -1,5 +1,7 @@ +const shared_threadpool_dict = Dict{UInt64, Base.RefValue}() + function select_threadpool(cdims::DenseConvDims, batch_size::Int) - inp_size = input_size(cdims)[1] + inp_size = input_size(cdims)[1] if batch_size >= 32 return shared_threadpool_dict[Int(NNPACK_CPU_THREADS)][] elseif batch_size >= 16 && inp_size >= 64 @@ -10,12 +12,12 @@ function select_threadpool(cdims::DenseConvDims, batch_size::Int) return shared_threadpool_dict[Int(NNPACK_CPU_THREADS)][] elseif inp_size * batch_size >= 256 return shared_threadpool_dict[Int(NNPACK_CPU_THREADS)][] - end + end return C_NULL end function select_threadpool(pdims::PoolDims, batch_size::Int) - inp_size = 
input_size(pdims)[1] + inp_size = input_size(pdims)[1] if batch_size >= 32 return shared_threadpool_dict[Int(NNPACK_CPU_THREADS)][] elseif batch_size >= 16 && inp_size >= 64 @@ -26,6 +28,6 @@ function select_threadpool(pdims::PoolDims, batch_size::Int) return shared_threadpool_dict[Int(NNPACK_CPU_THREADS)][] elseif inp_size * batch_size >= 256 return shared_threadpool_dict[Int(NNPACK_CPU_THREADS)][] - end + end return C_NULL end diff --git a/src/pooling.jl b/src/pooling.jl deleted file mode 100644 index 13c605e97..000000000 --- a/src/pooling.jl +++ /dev/null @@ -1,155 +0,0 @@ -export maxpool, maxpool!, meanpool, meanpool!, ∇maxpool, ∇maxpool!, ∇meanpool, ∇meanpool! - -## Pooling API -# -# We provide the following generic methods, for 3d, 4d, and 5d tensors, calculating 1d, -# 2d and 3d pooling, based on the rank of the input tensors, in both mutating and -# non-mutating auto-allocating variants: -# - Pooling: -# - maxpool(x, pdims) -# - maxpool!(y, x, pdims) -# - meanpool(x, pdims) -# - meanpool!(y, x, pdims) -# - Pooling input backprop -# - ∇maxpool(dy, y, x, pdims) -# - ∇maxpool!(dx, dy, y, x, pdims) -# - ∇meanpool(dy, y, x, pdims) -# - ∇meanpool!(dx, dy, y, x pdims) -# -# All methods require a `PoolDims` object to define the dimensions and optional -# elements of the convolution (stride, dilation, etc...), which is easily constructable -# through something like `PoolDims(x, w)`. - - -# First, we will define mappings from the generic API names to our accelerated backend -# implementations. At the moment this is only the direct implementation, however this -# exists here so that other packages (NNPACK, MAGMA, etc...) can override this easily. -for (front_name, backend) in ( - # This maps from public, front-facing name, to internal backend name - :maxpool => :direct, - :meanpool => :direct, - ) - - # We only define 3d pooling primitives, we reshape lower down to get 1d and 2d pooling - @eval begin - function $(Symbol("$(front_name)!"))( - y::AbstractArray{T,5}, x::AbstractArray{T,5}, - pdims::PoolDims; kwargs...) where {T} - $(Symbol("$(front_name)_$(backend)!"))(y, x, pdims; kwargs...) - end - end -end - -# Do the same for backprops -for (front_name, backend) in ( - :∇maxpool => :direct, - :∇meanpool => :direct, - ) - @eval begin - function $(Symbol("$(front_name)!"))( - dx::AbstractArray{T,5}, dy::AbstractArray{T,5}, - y::AbstractArray{T,5}, x::AbstractArray{T,5}, - pdims::PoolDims; kwargs...) where {T} - $(Symbol("$(front_name)_$(backend)!"))(dx, dy, y, x, pdims; kwargs...) - end - end -end - - -# Our strategy for pooling is to reshape to an array with three spatial dimensions, which -# makes things MUCH EASIER for us on the backend side, and is in general pretty fast, -# since we can specialize on sizes. -for front_name in (:maxpool, :meanpool) - for backend in (Symbol(), :_direct) - for N in (3, 4) - @eval begin - function $(Symbol("$(front_name)$(backend)!"))( - y::AbstractArray{T,$N}, x::AbstractArray{T,$N}, - pdims::PoolDims; kwargs...) where {T} - $(Symbol("$(front_name)$(backend)!"))( - insert_singleton_spatial_dimension(y, $(5 - N)), - insert_singleton_spatial_dimension(x, $(5 - N)), - insert_singleton_spatial_dimension(pdims, $(5 - N)); - kwargs... - ) - - # We explicitly return `y` here, because the backend call - # itself may return a reshaped view, which we don't want. 
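# --- Illustrative worked example (not part of this patch) ------------------------
# The `nnpack_supported_operation(::PoolDims)` check defined in
# src/nnpack/interface.jl above requires the pooling windows to tile the padded
# input exactly.  With a 32x32 input and no padding:
x    = rand(Float32, 32, 32, 3, 1)
good = PoolDims(x, (2, 2); stride = (2, 2))   # (32 + 0 - 2) % 2 == 0  -> supported
bad  = PoolDims(x, (3, 3); stride = (2, 2))   # (32 + 0 - 3) % 2 == 1  -> unsupported
# When the NNPACK module is loaded:
#   NNlib.nnpack_supported_operation(good) == true
#   NNlib.nnpack_supported_operation(bad)  == false
# which is the runtime check maxpool!() is meant to use when deciding between the
# NNPACK and direct pooling paths.
# ----------------------------------------------------------------------------------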
- return y - end - - # backprops too - function $(Symbol("∇$(front_name)$(backend)!"))( - dx::AbstractArray{T,$N}, dy::AbstractArray{T,$N}, - y::AbstractArray{T,$N}, x::AbstractArray{T,$N}, - pdims::PoolDims; kwargs...) where {T} - $(Symbol("∇$(front_name)$(backend)!"))( - insert_singleton_spatial_dimension(dx, $(5 - N)), - insert_singleton_spatial_dimension(dy, $(5 - N)), - insert_singleton_spatial_dimension(y, $(5 - N)), - insert_singleton_spatial_dimension(x, $(5 - N)), - insert_singleton_spatial_dimension(pdims, $(5 - N)); - kwargs... - ) - - # We explicitly return `dx` here, because the backend call - # itself may return a reshaped view, which we don't want. - return dx - end - end - end - end -end - - -# Finally, let's generate auto-allocating versions of all our functions, for all backends: -for backend in (Symbol(), :_direct, :_im2col) - # First make auto-allocating versions of the basic pooling calls: - for name in (:maxpool, :meanpool) - @eval begin - function $(Symbol("$(name)$(backend)"))( - x::AbstractArray{xT,N}, - pdims::PoolDims; kwargs...) where {xT, N} - y = similar(x, output_size(pdims)..., channels_out(pdims), size(x, N)) - fill!(y, xT(0)) - return $(Symbol("$(name)$(backend)!"))(y, x, pdims; kwargs...) - end - - # Backprops too - function $(Symbol("∇$(name)$(backend)"))( - dy::AbstractArray{T,N}, y::AbstractArray{T,N}, - x::AbstractArray{T,N}, pdims::PoolDims; - kwargs...) where {T, N} - dx = similar(x, input_size(pdims)..., channels_in(pdims), size(dy, N)) - fill!(dx, T(0)) - return $(Symbol("∇$(name)$(backend)!"))(dx, dy, y, x, pdims; kwargs...) - end - end - end -end - - -# Use NNPACK if it is available and operation is supported -if is_nnpack_available() - function maxpool(x::Array{T, 4}, pdims::PoolDims{2, K, S, P, (1, 1)}; kwargs...) where {T, K, S, P} - func = nnpack_supported_operation(pdims) ? maxpool_nnpack : maxpool_direct - return func(x, pdims; kwargs...) 
- end -end - -expand(N, i::Tuple) = i -expand(N, i::Integer) = ntuple(_ -> i, N) - -function maxpool(x, k::NTuple{N, Integer}; pad = 0, stride = k) where N - pad = expand(Val(N), pad) - stride = expand(Val(N), stride) - pdims = PoolDims(x, k; padding = pad, stride = stride) - return maxpool(x, pdims) -end - -function meanpool(x, k::NTuple{N, Integer}; pad = 0, stride = k) where N - pad = expand(Val(N), pad) - stride = expand(Val(N), stride) - pdims = PoolDims(x, k; padding = pad, stride = stride) - return meanpool(x, pdims) -end diff --git a/test/conv.jl b/test/conv.jl index cf91a5361..03676377c 100644 --- a/test/conv.jl +++ b/test/conv.jl @@ -274,14 +274,9 @@ conv_answer_dict = Dict( # A "drop channels and batch dimension" helper ddims(x) = dropdims(x, dims=(rank+1, rank+2)) + # We don't directly test conv_nnpack() because it has so many holes in its support convs = [NNlib.conv, NNlib.conv_im2col, NNlib.conv_direct,] - NNlib.is_nnpack_available() && push!(convs, NNlib.conv_nnpack) for conv in convs - if NNlib.is_nnpack_available() - if conv == NNlib.conv_nnpack && !NNlib.nnpack_supported_operation(DenseConvDims(x, w)) - continue - end - end @testset "$(conv)" begin cdims = DenseConvDims(x, w) # First, your basic convolution with no parameters diff --git a/test/inference.jl b/test/inference.jl index 39b5108ce..778aeec49 100644 --- a/test/inference.jl +++ b/test/inference.jl @@ -1,13 +1,11 @@ -import NNlib: conv_direct, conv_im2col - @testset "Conv Inference" begin - x = rand(10, 10, 3, 2) - w = rand(3, 3, 3, 1) + x = rand(Float32, 10, 10, 3, 2) + w = rand(Float32, 3, 3, 3, 1) - impl = [conv, conv_direct, conv_im2col] + impl = [conv, NNlib.conv_direct, NNlib.conv_im2col] NNlib.is_nnpack_available() && push!(impl, NNlib.conv_nnpack) for T in impl - @test T(x, w, DenseConvDims(x, w)) isa AbstractArray{K,4} where K + @test T(x, w, DenseConvDims(x, w)) isa AbstractArray{eltype(x),4} end end
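# --- Illustrative usage sketch (not part of this patch) --------------------------
# The front-end API that the generated methods in src/interface_impl.jl provide,
# exercised end to end.  Sizes are arbitrary example values.
using NNlib

x = rand(Float32, 28, 28, 3, 2)        # (W, H, C_in, N)
w = rand(Float32, 5, 5, 3, 16)         # (kW, kH, C_in, C_out)

# One-shot keyword form: builds a DenseConvDims internally on every call.
y1 = conv(x, w; stride = 2)

# Reusable form: construct the DenseConvDims once and pass it to conv() and the
# backprop entry points.  The forward output is used as a stand-in gradient here
# just to show the shapes.
cdims = DenseConvDims(x, w; stride = 2)
y2 = conv(x, w, cdims)
dx = ∇conv_data(y2, w, cdims)          # gradient w.r.t. x, same size as x
dw = ∇conv_filter(x, y2, cdims)        # gradient w.r.t. w, same size as w

# Pooling goes through the same generated machinery.
p = maxpool(x, (2, 2))
# ----------------------------------------------------------------------------------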