From b3be8569676f6c87a86139cb4fdc44c7436726fc Mon Sep 17 00:00:00 2001
From: Pietro Monticone <38562595+pitmonticone@users.noreply.github.com>
Date: Fri, 1 Jul 2022 07:45:30 +0200
Subject: [PATCH] Clean docs and docstrings

Fixed a few typos.
---
 docs/src/man/examples/airpassengers.md |  20 +-
 docs/src/man/examples/london.md        |   8 +-
 docs/src/man/examples/quakes.md        |   6 +-
 docs/src/man/examples/whatif.md        |   2 +-
 docs/src/man/quickstart.md             |   6 +-
 src/AR.jl                              | 404 +++++++++++++++++++------
 src/CCF.jl                             | 124 ++++++--
 src/FORECAST.jl                        | 151 ++++-----
 src/STL.jl                             | 223 +++++++++++++-
 src/SUMMARIZE.jl                       | 121 +++++++-
 src/ar.jl                              |   2 +-
 src/ccf.jl                             |   2 +-
 src/datasets.jl                        |   2 +-
 src/p.jl                               |   2 +-
 src/stl.jl                             |   8 +-
 15 files changed, 835 insertions(+), 246 deletions(-)

diff --git a/docs/src/man/examples/airpassengers.md b/docs/src/man/examples/airpassengers.md
index 285fccf..40630ef 100644
--- a/docs/src/man/examples/airpassengers.md
+++ b/docs/src/man/examples/airpassengers.md
@@ -8,7 +8,7 @@ default(size=(800,500))
 # Air Passengers (Seasonality)
 
 ## Introduction
-In this example we will predict the monthly totals in thounsads of international airline passengers with data from 1949 to 1960.
+In this example we will predict the monthly totals, in thousands, of international airline passengers with data from 1949 to 1960.
 
 ```@example examples
 plot(air()) #hide
@@ -16,7 +16,7 @@ plot(air()) #hide
 ## Descriptive Analysis & Transformations
 
 ### Heteroskedasticity
-A cople of obvious feature in the plot of the data is that it shows an non-stationary heteroskedastic time series. However, to facilitate the fitting of an AR model we want a Stationary Homoskedastic time series. With that goal in mind we could consider first a boxcox transformation to deal with the heteroskedasticity but in this case a simple log transformation works just fine.
+A couple of features are obvious in the plot of the data: the time series is non-stationary and heteroskedastic. However, to facilitate the fitting of an AR model we want a stationary homoskedastic time series. With that goal in mind we could first consider a Box-Cox transformation to deal with the heteroskedasticity, but in this case a simple log transformation works just fine.
 
 ```@example examples
 log_ap = air()
 plot(log_ap) #hide
 ```
 
 ### Stationary Behavior
-We still have a clear non-stationary time series displaying a marked trend and montnly seasonality.
+We still have a clearly non-stationary time series displaying a marked trend and monthly seasonality.
 
 ```@example examples
 stl_ap = stl(log_ap.LogPassengers,12,robust=true)
 plot(stl_ap)
 ```
-To faciliate the fitting of an AR model we want an stationary time series, and in order to have one we will be using a differentation of order 12 for the trend and seasonality.
+To facilitate the fitting of an AR model we want a stationary time series, and in order to have one we will be using a differencing of order 12 for the trend and seasonality.
 
 ```@example examples
 d12_log_ap = d(log_ap,1,12)
 plot(d12_log_ap) # hide
 ```
-There is a case for second differentation of oder one since it furthers reduces the variance of the resulting time series indicating a potential remaining trend, however this potential improvement might not be enough to justify this new transformation since the existing one offers now a good enough stationary time series to consider an AR model with a constant coefficient.
+There is a case for a second differencing of order one, since it further reduces the variance of the resulting time series, indicating a potential remaining trend; however, this potential improvement might not be enough to justify the new transformation, since the existing one already offers a stationary time series good enough to consider an AR model with a constant coefficient.
 
-### Evaluating Seasonal Differentation
+### Evaluating Seasonal Differencing
 
 ```@example examples
 splot(d12_log_ap)
 ```
-After a differentation of order 12 we can also see that the seasonality has mostly disappeared and we can continue our analysis with a reasonalble stationary dataset.
+After a differencing of order 12 we can also see that the seasonality has mostly disappeared, and we can continue our analysis with a reasonably stationary dataset.
 
 ### Autoregressive Behavior
 ```@example examples
 plot(pacf(d12_log_ap[:,2]))
 ```
-We can see clear auto-correlations with values of the previous two months and with values of the same months in the previous year. Beyond this there seems to be not enough correlation to justify a more complex model, but we can always check this hypothesis lookig at the Information Criteria.
+We can see clear auto-correlations with the values of the previous two months and with the values of the same months in the previous year. Beyond this there seems to be not enough correlation to justify a more complex model, but we can always check this hypothesis by looking at the Information Criteria.
 
 ## Fitting an AR model
 
 ```@example examples
 ar_tap = ar(d12_log_ap,13)
 show(ar_tap) # hide
 ```
 ### Fixing Coefficients
-As expected we see that ``\Phi1,\Phi2,\Phi12,`` and ``\Phi13`` have highly significant coefficients, also we can see some significance in ``\Phi0``. This confirms the case for a further differentation of order one however, since doing so decreases the normality profile of the residuals we will keep it as it is.
+As expected we see that ``\Phi_1, \Phi_2, \Phi_{12},`` and ``\Phi_{13}`` have highly significant coefficients, and we can also see some significance in ``\Phi_0``. This confirms the case for a further differencing of order one; however, since doing so decreases the normality profile of the residuals, we will keep it as it is.
 
-Since we want to know the falues of these four parameters without the influcence of the rest we will now fit again the model fixing all coefficients except those five.
+Since we want to know the values of these four parameters without the influence of the rest, we will now fit the model again fixing all coefficients except those five.
 
 ```@example examples
 Φ = ar_tap.Φ
diff --git a/docs/src/man/examples/london.md b/docs/src/man/examples/london.md
index 1ebf253..ab66381 100644
--- a/docs/src/man/examples/london.md
+++ b/docs/src/man/examples/london.md
@@ -33,7 +33,7 @@ splot(x[:,["Date","Violence"]], title="Seasonal Violent Crimes")
 ```@example examples
 splot(x[:,["Date","Sexual"]], title="Seasonal Sexual Crimes")
 ```
-Sesonality patterns for `MaxTemp` are obvious and noticeable for `Violence`, however there's barely any to be seen in `Sexual`.
+Seasonality patterns for `MaxTemp` are obvious and noticeable for `Violence`; however, there's barely any to be seen in `Sexual`.
 
 ### Normalization
 To compare time series with different positive magnitudes and to facilitate stationary forecasting we will normalize the data by dividing each variable by its maximum value.
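A minimal sketch of the normalization and differencing steps just described, assuming the `london()` dataset with the `Date`, `MaxTemp`, `Violence` and `Sexual` columns used in this example (illustrative only; the column loop is not the patch author's code):

```julia
using Forecast, DataFrames

x = london()                                   # Date, MaxTemp, Violence, Sexual
tx = copy(x)
for c in ["MaxTemp", "Violence", "Sexual"]
    tx[!, c] = tx[:, c] ./ maximum(tx[:, c])   # divide each variable by its maximum
end
dtx = d(tx, 1, 12)                             # differences of order 1 with lag 12
```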
@@ -50,7 +50,7 @@ dtx = d(tx,1,12)
 plot(dtx)
 ```
 
-An extra differentation might be required for `Violence` and `Sexual` but this seems to be one to many for `MaxTemp` indicated by an increase in its variance when doing so. Also, the apparent need for a differentation in `Violence` and `Sexual` seems to be a temporary effect rather than a permanent trend or seasonality therefore we will continue with just one differentation of order 12.
+An extra differencing might be required for `Violence` and `Sexual`, but this seems to be one too many for `MaxTemp`, as indicated by an increase in its variance when doing so. Also, the apparent need for a differencing in `Violence` and `Sexual` seems to be a temporary effect rather than a permanent trend or seasonality; therefore we will continue with just one differencing of order 12.
 
 ## Fitting an AR(1) model
 Given the scarcity of data for a multivariate model we cannot fit too many parameters while keeping their significance. We will therefore fit an AR of order one.
 
 ```@example examples
 show(xar) # hide
 ```
 The first thing we can do is to use Φ1 to infer directional causality. In this case we see how `MaxTemp` is not influenced at all (with statistical significance) by `Violence` and `Sexual`, which could not possibly be otherwise in this case. On the other hand we have `Violence` and `Sexual` being affected significantly by `MaxTemp`.
 
-The relathionship between `Violence` and `Sexual` is both ways, however, the effect `Sexual` has on `Violence` is small and barely significant which might prompt us to consider to remove it altogether, the opposite though is not true, `Violence` has a strong and significant influence in `Sexual`.
+The relationship between `Violence` and `Sexual` goes both ways; however, the effect `Sexual` has on `Violence` is small and barely significant, which might prompt us to consider removing it altogether. The opposite, though, is not true: `Violence` has a strong and significant influence on `Sexual`.
 
 ## Fixing Coefficients
 
-Let's now fix to 0 the influcence of `Violence` and `Sexual` on `MaxTemp`, remove the influence from `MaxTemp` on `Sexual` since it is accounted for via `Violence` and the non-significant influence of `Sexual` on `Violence`. If now we fit again the model we have
+Let's now fix to 0 the influence of `Violence` and `Sexual` on `MaxTemp`, remove the influence of `MaxTemp` on `Sexual` (since it is accounted for via `Violence`), and remove the non-significant influence of `Sexual` on `Violence`. If we now fit the model again we have
 
 ```@example examples
 dϕ1 = xar.Φ
diff --git a/docs/src/man/examples/quakes.md b/docs/src/man/examples/quakes.md
index 9496a3e..793884b 100644
--- a/docs/src/man/examples/quakes.md
+++ b/docs/src/man/examples/quakes.md
@@ -27,7 +27,7 @@ Data's behavior seems to follow a Normal distribution with no strong indications
 ```@example examples
 plot(pacf(qk.quakes))
 ```
-The partial autoregression function shows us that there seems to be a significant correlation to the number of earthquakes taking place last year. If we were looking for seasonality we could check on periods of 11 or 15 years since they show a nearly significant correlations but since they're most likely spureous (...or are they?) we will ignore them in this analysis.
+The partial autocorrelation function shows us that there seems to be a significant correlation with the number of earthquakes that took place last year. If we were looking for seasonality we could check on periods of 11 or 15 years, since they show nearly significant correlations, but since they're most likely spurious (...or are they?) we will ignore them in this analysis.
 
 ## Fitting an AR Model
 ```@example examples
 ar_qk = ar(qk)
 show(ar_qk) # hide
 ```
 In the AR model of order one we have highly significant coefficients, and increasing its order does not provide important changes in the Information Criteria; however, the residuals show a barely significant normality behavior and we may consider transforming our data to improve on that.
-Given tha large noise in the model, tranformations to improve results will not be dramatic and therefore we will continue with a simple AR model of order one for our forecasting.
+Given the large noise in the model, transformations to improve results will not be dramatic, and therefore we will continue with a simple AR model of order one for our forecasting.
 
 ## Forecasting Earthquakes
 
 ```@example examples
 fc_qk = forecast(ar_qk,10);
 plot(fc_qk)
 ```
-The plot shows us the forecast for the next ten years and, as we see, the large noise in the model does not allow us to be very accurate in our forecasting, but at least we can confidently say that there is a resonable chance to have a larger number of big earthquakes in 2021 and 2022 than the number we had in 2020.
+The plot shows us the forecast for the next ten years and, as we see, the large noise in the model does not allow us to be very accurate in our forecasting, but at least we can confidently say that there is a reasonable chance of having a larger number of big earthquakes in 2021 and 2022 than the number we had in 2020.
 
 ```@example examples
 show(fc_qk) # hide
diff --git a/docs/src/man/examples/whatif.md b/docs/src/man/examples/whatif.md
index bf86051..fe2fdf6 100644
--- a/docs/src/man/examples/whatif.md
+++ b/docs/src/man/examples/whatif.md
@@ -18,7 +18,7 @@
 To answer these questions and similar ones we need a way to instruct our forecast
 
 ## London Crime/Weather Scenarios
 
-Let's show how to use the Great London Crime/Weather example. In this example we see how weather temperature affects/explains criminality in Greater London. However, the prediction of temperature is not as accurate as it would be if we use forecast coming from (metoffice)[metoffice.gov.uk] instead those predicted by an AR(1) model. Therefore in order to have more accurate predictions about crime we want to overrule the temperature forecast of the AR(1) model with the forecast comming from `metoffice`.
+Let's show how to use the Greater London Crime/Weather example. In this example we see how weather temperature affects/explains criminality in Greater London. However, the prediction of temperature is not as accurate as it would be if we used forecasts coming from [metoffice](metoffice.gov.uk) instead of those predicted by an AR(1) model. Therefore, in order to have more accurate predictions about crime, we want to overrule the temperature forecast of the AR(1) model with the forecast coming from `metoffice`.
 
 These metoffice predictions would not be much different than those from the AR(1) model; therefore, in order to better visualize impacts on the forecast, we will play with a hypothetical scenario in which temperatures increase one degree per month for two years straight.
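To make the scenario discussion concrete, here is a minimal sketch of the baseline AR(1) forecast that such a scenario would overrule. The scenario-injection mechanism itself is not shown in this patch, so the sketch only uses `ar`, `forecast` and `london` as they appear elsewhere in these docs:

```julia
using Forecast

x   = london()
xar = ar(x, 1)             # AR(1) on the London crime/weather data
fc  = forecast(xar, 24)    # two years of monthly predictions
fc.mean                    # the MaxTemp column is what metoffice forecasts would replace
```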
diff --git a/docs/src/man/quickstart.md b/docs/src/man/quickstart.md
index 41be493..092d5ec 100644
--- a/docs/src/man/quickstart.md
+++ b/docs/src/man/quickstart.md
@@ -44,7 +44,7 @@ plot!(x,sma(y,100,true), linewidth = 2, label= "Moving Avg 100", color = :orange
 ## STL on CO2 dataset
 
 For this example we will be using the co2 data used by the creators of STL to
-demostrate its funcitonality, below we can see such time series.
+demonstrate its functionality; below we can see such a time series.
 
 ```@example quickstart
 using Plots
@@ -53,7 +53,7 @@ using Forecast
 
 plot(co2(), legend=:bottomright)
 ```
-The parameters used for STL they're also from the orginal paper, a period of
+The parameters used for STL are also from the original paper: a period of
 365 days is used (removing leap years' extra day), a robust fit is required and
 seasonality post-smoothing is applied.
 
 The image below comes from the original paper for comparison purposes.
 ```
 
-## Cross-Correlation on shifted dasaset
+## Cross-Correlation on shifted dataset
 
 Here we cross-correlate two identical series shifted by six positions; the
 plot shows how the peak correlation is at position six.
diff --git a/src/AR.jl b/src/AR.jl
index 376d2ce..4de1447 100644
--- a/src/AR.jl
+++ b/src/AR.jl
@@ -1,112 +1,338 @@
 """
 Package: Forecast
 
-Store results from the function `ar`
+    ar(x::DataFrame, or, constant = true;
+       alpha = 1.0, dΦ0 = nothing, dΦ = nothing)
+
+    ar(x::AbstractArray, or, constant = true;
+       alpha = 1.0, dΦ0 = nothing, dΦ = nothing, varnames = nothing)
+
+Fit a multivariate autoregressive series model.
+
+The fitting is done via Ordinary Least Squares and implements the following model:
+
+```math
+X_t = \\Phi_0 + \\sum_{i=1}^p \\Phi_i \\cdot X_{t-i} + \\mathcal{N}(\\vec{0},\\Sigma)
+```
 
 # Arguments
-`varnames` List of variable names
-`order` Order of Autoregressive Model
-`ndims` Number of dimensions
-`Φ` Collection of d by d matrices of coefficients
-`coefficients` Alias for Φ
-`Φ0` Constant
-`constant` Alias for Φ0
-`Σ2` ML variance/covariance Matrix
-`variance` Alias for Σ2
-`Σ` Variables Standard deviation
-`stdev` Alias for Σ
-`x` Original dataset
-`fitted` Fitted values
-`residuals` Prediction Error
-`ic::Dict` Collection of Information Criteria
-`stats::Dict` Collection of Statistics
-`Φse` Parameters Standard Error
-`pse` Alias for Φse
-`Φ0se` Constant Standard Error
-`p0se` Alias for Φ0se
-`Φpv` p-value for Parameters
-`ppv` Alias for Φpv
-`Φ0pv` p-value for Constant
-`p0pv` Alias for Φ0pv
-`call::String` Method called to generate AR
+- `x`: Multivariate series, each column containing a dimension, with rows ordered ascending in time.
+- `or`: Number of parameters Φ to be estimated.
+- `constant`: If `true`, `ar` estimates Φ0; otherwise it is assumed to be zero.
+- `alpha`: Fixes to zero all coefficients whose significance is below its value. It defaults to one.
+- `dΦ0`: Tuple containing two Φ0 objects; the first one acts as a reference for the second one, and the values that differ between them are fixed in the fitting process to the values in the second Φ0.
+- `dΦ`: Equivalent to dΦ0 but for Φ.
+- `varnames`: Names of the dimensions (by default `xi` where `i` is an integer).
+
+# Returns
+An AR object containing the model coefficients, the error sigma matrix, the residuals and a collection of information criteria.
+
+# Examples
+```julia-repl
+julia> ar(rand(100,2),2)
+AR([...])
+```
 """
-mutable struct AR{T}
-    varnames::Vector{String}
-    order::Integer
-    ndims::Integer
-    Φ::Union{T,Array{T}}
-    Φ0::Union{T,Vector{T}}
-    Σ2::Union{T,Matrix{T}}
-    Σ::Union{T,Vector{T}}
-    x::Union{Array{T},DataFrame}
-    fitted::Union{T,Array{T}}
-    residuals::Union{T,Array{T}}
-    ic::Dict
-    stats::Dict
-    Φse::Union{T,Array{T}}
-    Φ0se::Union{T,Vector{T}}
-    Φpv::Union{T,Array{T}}
-    Φ0pv::Union{T,Vector{T}}
-    call::String
+function ar(df::DataFrame,
+            order::Integer = 1,
+            constant::Bool = true;
+            alpha::Real = 1.0,
+            dΦ0::Tuple = d_ar_dΦ0(size(df,2)-1,constant,Float64),
+            dΦ::Tuple = d_ar_dΦ(size(df,2)-1,order,Float64))
+
+    #TODO promote dΦ0 and dΦ types based on df and control for Integer values on x
+    ttype = [Date,DateTime,Day,Month,Week]
+    ar_df = eltype(df[:,1]) in ttype ? Array{Float64}(df[:,2:end]) : Array{Float64}(df)
+    names_df = eltype(df[:,1]) in ttype ? names(df[:,2:end]) : names(df)
+    xar = ar(ar_df, order, constant; alpha, dΦ0, dΦ, varnames = names_df)
+    xar.x = df
+
+    return xar
+
 end
 
-function Base.show(io::IO, xar::AR)
-    m,p = xar.ndims, xar.order
+# Default ar values functions
+d_ar_dΦ0(m::Integer,constant::Bool,T::Type) = (repeat([T(1)],m), repeat([T(constant ? 1 : 0)],m))
+d_ar_dΦ(m::Integer,order::Integer,T::Type) =
+    (r11=reshape(repeat([T(1)],m*m*order),m,m,order); (r11,r11))
+d_ar_varnames(m::Integer)::Vector{String} = ["x"*string(i) for i in 1:m]
+
+function ar(x::AbstractArray{T},
+            order::Integer = 1,
+            constant::Bool = true;
+            alpha::Real = 1.0,
+            dΦ::Tuple = d_ar_dΦ(size(x,2),order,T),
+            dΦ0::Tuple = d_ar_dΦ0(size(x,2),constant,T),
+            varnames::AbstractVector{String} = d_ar_varnames(size(x,2))) where T<:Real
 
-    Φ = expand(xar.Φ,(m,m,p))
+    @assert 0.0 < alpha <= 1.0
+
+    m = size(x,2)
+    np = order
+
+    dΦ = ( expand(dΦ[1],(m,m,np)), expand(dΦ[2],(m,m,np)) )
+    dΦ0 = ( expand(dΦ0[1],(m,)), expand(dΦ0[2],(m,)) )
+
+    xar = ar_ols(x, order, constant; dΦ0, dΦ, varnames)
+
+    alpha == 1.0 && return xar
+
+    m, np = xar.ndims, xar.order
+
+    Φ = expand(xar.Φ,(m,m,np))
     Φ0 = expand(xar.Φ0,(m,))
-    Φpv = expand(xar.Φpv,(m,m,p))
+    Φpv = expand(xar.Φpv,(m,m,np))
     Φ0pv = expand(xar.Φ0pv,(m,))
-    Σ2 = expand(xar.Σ2,(m,m))
-
-    printstyled("Multivariate Autoregressive Model\n",bold=true,color=:underline)
-    print("\n ",xar.call,"\n")
-
-    printstyled("\nResiduals Summary\n",bold=true,color=:underline)
-    xar_sum = summarize(xar.residuals; varnames = xar.varnames)
-    pretty_table(xar_sum.quantiles, nosubheader = true, show_row_number=false)
-    pretty_table(xar_sum.moments, nosubheader = true, show_row_number=false)
-
-    printstyled("\nCoefficients\n\n",bold=true,color=:underline)
-    printstyled("Φ0\n",bold=true,color=:underline)
-    pretty_table(string.(round.(Φ0,digits=3)) .* sigf.(Φ0pv),
-                 tf = tf_matrix, noheader=true)
-
-    for i in 1:p
-        printstyled("Φ",i,"\n",bold=true,color=:underline)
-        Φi = Φ[:,:,i]
-        Φpvi = Φpv[:,:,i]
-        pretty_table(string.(round.(Φi,digits = 3)) .* sigf.(Φpvi), tf = tf_matrix, noheader=true)
+
+    dΦs = (Φpv .<= alpha)
+    dΦ0s = (Φ0pv .<= alpha)
+
+
+    all(dΦs) && all(dΦ0s) && return xar
+
+    if isnothing(dΦ)
+        dΦ = (Φ, Φ .* dΦs)
+        dΦ0 = (Φ0, Φ0 .* dΦ0s)
+    else
+        dΦ = (dΦ[1], dΦ[2] .* dΦs)
+        dΦ0 = (dΦ0[1], dΦ0[2] .* dΦ0s)
+    end
+
+    return ar_ols(x, order, constant; dΦ0, dΦ, varnames)
+
+end
+
+
+function ar_ols(x::AbstractArray{T},
+                or::Integer,
+                constant::Bool;
+                dΦ::Tuple,
+                dΦ0::Tuple,
+                varnames::AbstractVector{String}) where T<:Real
+
+    @assert 1 <= ndims(x) <= 2
+    @assert 1 <= or < size(x,1)-1
+
+    n = size(x,1)
+    m = size(x,2)
+    np = m*m*or + m
+
+    x = x[end:-1:1,:]
+    nx = x
+    for i in 1:or
+        nx = hcat(x,nx[vcat(2:end,1),:])
     end
 
-    print("Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘^’ 0.1 ‘ ’ 1 and ‘+’ if fixed\n")
+    nx = nx[1:end-or,:]
 
-    # printstyled("\nCoefficients' Std. Error\n",bold=true,color=:underline)
-    # printstyled("\nΦ0se\n",bold=true,color=:underline)
-    # pretty_table(xar.Φ0se, tf = tf_matrix, noheader=true)
-    # for i in 1:p
-    #     printstyled("Φ",i,"se\n",bold=true,color=:underline)
-    #     pretty_table(xar.Φse[:,:,i], tf = tf_matrix, noheader=true)
-    # end
+    Y = nx[:,1:m]
+    X = nx[:,m+1:end]
+    X = hcat(repeat([1.0],size(X,1)),X)
 
-    printstyled("\nΣ2 Variance/Covariance Matrix\n",bold=true,color=:underline)
-    pretty_table(Σ2, tf = tf_matrix, noheader=true)
+    # Fixing parameters
+    W = Array{T,2}(undef,(or*m+1,m))
+    for i in 1:m
+        Xi,Yi = fixΦ(X,Y,i,dΦ0,dΦ)
+        dW = Xi\Yi
+        W[:,i] = fixW(dW,i,dΦ0,dΦ)
+    end
+
+    # Fitted values
+    fitted = X*W
 
-    printstyled("\nInformation Criteria\n",bold=true,color=:underline)
-    pretty_table(DataFrame(xar.ic), nosubheader = true, show_row_number=false)
+    # Prediction error
+    e = Y - fitted
+
+    Φ0 = W[1,:]
+    Φ = W[2:end,:]
+    Φ = Array(reshape(Φ',m,m,or))
 
-    printstyled("\nStatistics\n",bold=true,color=:underline)
-    pretty_table(DataFrame(xar.stats), nosubheader = true, show_row_number=false)
+    # Maximum Likelihood noise covariance
+    k = or*m*m
+    @assert n-k > 0 "Insufficient data for the model"
+    Σ2 = 1/(n-k)*e'*e
+
+    # ML parameters std. error.
+    Φse = sqrt.(abs.(diag(kron(Σ2, (X'*X)^-1))))
+    Φse = fixΦse(Φse,dΦ0,dΦ)
+
+    rΦse = reshape(Φse,:,m)'
+    Φ0se = rΦse[:,1]
+    Φse = reshape(rΦse[:,2:end],m,m,:)
+
+    #fix p
+    np = fixnp(dΦ0,dΦ)
+
+    # TODO check whether to use 'or' or 'np' and if 'or' then a fixor function is needed
+    # Information Criteria
+    lΣ2 = log(norm(Σ2))
+    ic = Dict([("AIC", lΣ2 + 2*np*m^2/n),
+               ("AICC", lΣ2 + 2*(np*m^2+1)/(n-(np*m^2+2))),
+               ("BIC", n*log(norm(Σ2)+m)+(m^2*np+m*(m+1)/2)*log(n)),
+               ("H&Q", lΣ2 + 2*log(log(n))*np*m^2/n)])
+
+    # Statistics
+    SStot = sum((Y .- mean(Y,dims=1)).^2,dims=1)
+    SSres = sum(e .^ 2,dims=1)
+    R2 = vec(1 .- (SSres ./ SStot))
+
+    pvf(mu,se) = se == 0 ? 1.0 : cdf(Normal(abs(mu),se),0) # fixed parameters (se == 0) get p-value 1 so that log(1) = 0 in Fisher's test
+    Φpv = pvf.(Φ,Φse)
+    Φ0pv = pvf.(Φ0,Φ0se)
+    slpv = vec(reshape(sum(log.(vcat(reshape(Φ0pv,:,m),reshape(Φpv,:,m))),dims=1),:,1))
+    stats = Dict([(" Variable", varnames),
+                  ("R2", R2),
+                  ("R2adj", 1 .- (1 .- R2) * (n-1)/(n-(np-1)/m-1)),
+                  ("Fisher's p-test", 1 .- cdf(Chisq(2*np/m),-2*slpv))])
+
+    Σ = sqrt.(diag(Σ2))
+
+    call = "ar(X, order="*string(or)*
+           ", constant="*string(constant)*")"
+
+    AR(varnames,
+       or, m,
+       compact.([Φ,Φ0,Σ2,Σ,x[end:-1:1,:],fitted,e])...,
+       ic,stats,
+       compact.([Φse,Φ0se,Φpv,Φ0pv])...,
+       call)
 end
 
-function sigf(pv::Real)::String
+"""
+Package: Forecast
+
+    fixΦ(X,Y,dΦ0,dΦ)
+
+For given X and Y OLS matrices, returns the X and Y resulting from fixing parameters according to dΦ0 and dΦ
+"""
+function fixΦ(X::AbstractMatrix{T},
+              Y::AbstractMatrix{T},
+              i::Integer,
+              dΦ0::Tuple{AbstractArray{T},AbstractArray{T}},
+              dΦ::Tuple{AbstractArray{T},AbstractArray{T}}) where T<:Real
+
+    Φ0, fΦ0 = dΦ0
+    Φ, fΦ = dΦ
     (Φ0 == fΦ0) & (Φ == fΦ) && return((X,Y[:,i]))
 
-    if pv < 0.001 return " ***" end
-    if pv < 0.01 return " ** " end
-    if pv < 0.05 return " * " end
-    if pv < 0.1 return " ^ " end
-    if pv < 1 return " " end
-    if pv == 1 return " + " end
+    rΦ0 = reshape(Φ0,:,1)
+    rΦ = reshape(Φ,size(Φ,1),:)
+    rΦ = hcat(rΦ0,rΦ)
+
+    rfΦ0 = reshape(fΦ0,:,1)
+    rfΦ = reshape(fΦ,size(fΦ,1),:)
+    rfΦ = hcat(rfΦ0,rfΦ)
+
+    dc = findall(rΦ[i,:] .!== rfΦ[i,:])
+
+    @assert length(rΦ) > length(dc) "No degrees of freedom"
+
+    # Create new Yi
+    Yi = Y[:,i]
+    for c in dc
+        Yi = Yi .- rfΦ[i,c]*X[:,c]
+    end
+
+    # Create new X
+    Xi = drop(X,c=dc)
+
+    (Xi,Yi)
+
+end
+
+"""
+Package: Forecast
+
+    fixW(W,dΦ0,dΦ)
+
+For a given weight vector returns a version with fixed values based on dΦ0 and dΦ
+"""
+function fixW(dWi::AbstractVector{T},
+              i::Integer,
+              dΦ0::Tuple{AbstractArray{T},AbstractArray{T}},
+              dΦ::Tuple{AbstractArray{T},AbstractArray{T}}) where T<:Real
+
+    Φ0, fΦ0 = dΦ0
+    Φ, fΦ = dΦ
+    (Φ0 == fΦ0) & (Φ == fΦ) && return(dWi)
+
+    rΦ0 = reshape(Φ0,:,1)
+    rΦ = reshape(Φ,size(Φ,1),:)
+    rΦ = hcat(rΦ0,rΦ)
+
+    rfΦ0 = reshape(fΦ0,:,1)
+    rfΦ = reshape(fΦ,size(fΦ,1),:)
+    rfΦ = hcat(rfΦ0,rfΦ)
+
+    dc = findall(rΦ[i,:] .!== rfΦ[i,:])
+
+    for c in dc
+        insert!(dWi,c,rfΦ[i,c])
+    end
+
+    dWi
+
+end
+
+
+"""
+Package: Forecast
+
+    fixnp(dΦ0,dΦ)
+
+Return the number of free parameters.
+"""
+function fixnp(dΦ0::Tuple{AbstractArray{T},AbstractArray{T}},
+               dΦ::Tuple{AbstractArray{T},AbstractArray{T}}) where T<:Real
+
+    Φ0, fΦ0 = dΦ0
+    Φ, fΦ = dΦ
+    (Φ0 == fΦ0) & (Φ == fΦ) && return(length(Φ) + length(Φ0))
+
+    rΦ0 = reshape(Φ0,:,1)
+    rΦ = reshape(Φ,size(Φ,1),:)
+    rΦ = hcat(rΦ0,rΦ)
+
+    rfΦ0 = reshape(fΦ0,:,1)
+    rfΦ = reshape(fΦ,size(fΦ,1),:)
+    rfΦ = hcat(rfΦ0,rfΦ)
+
+    dc = findall(rΦ .!== rfΦ)
+
+    length(Φ) + length(Φ0) - length(dc)
+end
+
+
+"""
+Package: Forecast
+
+    fixΦse(M,dΦ0,dΦ)
+
+For a given `se` vector returns a version with zeroes based on dΦ0 and dΦ
+"""
+function fixΦse(Φse::AbstractVector{T},
+                dΦ0::Tuple{AbstractArray{T},AbstractArray{T}},
+                dΦ::Tuple{AbstractArray{T},AbstractArray{T}}) where T<:Real
+
+    Φ0, fΦ0 = dΦ0
+    Φ, fΦ = dΦ
+    (Φ0 == fΦ0) & (Φ == fΦ) && return(Φse)
+
+    _,m,np = size(Φ)
+
+    rΦ = reshape(hcat(Φ0 ,reshape(Φ ,m,m*np))',m*(m*np+1),1)
+    rfΦ = reshape(hcat(fΦ0,reshape(fΦ,m,m*np))',m*(m*np+1),1)
+
+    dc = findall(rΦ .!== rfΦ)
+
+    fΦse = Vector{T}(undef, length(Φse))
+    fΦse[:] = Φse
+    for ci in dc
+        c = Tuple(ci)
+        fΦse[c[1]] = 0.0
+    end
+
+    fΦse
+
+end
diff --git a/src/CCF.jl b/src/CCF.jl
index 0a5a890..f08a5d7 100644
--- a/src/CCF.jl
+++ b/src/CCF.jl
@@ -1,25 +1,113 @@
 """
 Package: Forecast
 
-Store results from the functions `acf`, `ccf` and `pacf`
+    ccf(x1::{AbstractVector,DataFrame},
+        x2::{AbstractVector,DataFrame};
+        type = "cor",
+        lag = Integer(ceil(10*log10(length(x1)))),
+        alpha = (0.95,0.99))
+
+Compute the cross-correlation or cross-covariance of two univariate series.
+
+The results are normalized to preserve homoscedasticity. The distribution used to normalize the data is an approximation of a Fisher Transformation via a Normal Distribution. There is a plot recipe for the returned object; if the type is `cor` the plot will also show confidence intervals for the given alpha values.
+
+If, for a given integer `k`, `x2` repeats `x1` values such that x1[t] = x2[t+k] for all `t`, then a high correlation value will be placed *to the right of the center* in the results. That is, this convention will be represented in the plots as `x1_t = x2_{t+k} -> _____0__k__`, meaning that `x2` behavior can be predicted by `x1` `k` units ahead.
 
 # Arguments
-`ccf` An array with results from ccf, acf and pacf
-`N` Length of ccf
-`type Type of CCF
-`lag` Maximum number of lags
-`alph` CI thresholds
-`ci` CI for the alpha
-`auto` Auto-correlation
-`call` Method called to generate ccf
+- `x1`: Vector or uni-dimensional DataFrame of data.
+- `x2`: Vector or uni-dimensional DataFrame of data.
+- `type`: Valid values are "cor" for correlation (default) and "cov" for covariance.
+- `lag`: Maximum number of lags.
+- `alpha`: A tuple with two thresholds (t1,t2) with t1 <= t2 to plot confidence intervals. The default values are 0.95 and 0.99.
+
+# Returns
+A CCF object.
+
+# Examples
+```julia-repl
+julia> x1 = rand(100);
+x2 = circshift(x1,6);
+res = ccf(x1, x2; type="cor");
+plot(res)
+```
 """
-mutable struct CCF
-    ccf::AbstractArray{<:Real}
-    N::Integer
-    type::String
-    lag::Integer
-    alpha::Tuple{Real,Real}
-    ci::Tuple{Real,Real}
-    auto::Bool
-    call::String
+function ccf(df1::DataFrame,
+             df2::DataFrame;
+             type::String = "cor",
+             lag::Integer = Integer(ceil(10*log10(size(df1,1)))),
+             alpha::Tuple{Real,Real} = (0.95,0.99))
+
+    x1 = Array(df1[:,eltype.(eachcol(df1)) .<: Real])
+    x2 = Array(df2[:,eltype.(eachcol(df2)) .<: Real])
+
+    ccf(x1, x2; type = type, lag = lag, alpha = alpha)
+
 end
+
+function ccf(x1::AbstractVector{<:Real},
+             x2::AbstractVector{<:Real};
+             type::String = "cor",
+             lag::Integer = Integer(ceil(10*log10(length(x1)))),
+             alpha::Tuple{Real,Real} = (0.95,0.99))
+
+    N = length(x1)
+    @assert N == length(x2) "Vectors should be of equal size"
+    @assert N >= 4 "Vectors should have at least four values"
+    @assert isnothing(findfirst(isnan,x1)) "No missing values allowed"
+    @assert isnothing(findfirst(isnan,x2)) "No missing values allowed"
+    @assert isnothing(findfirst(ismissing,x1)) "No missing values allowed"
+    @assert isnothing(findfirst(ismissing,x2)) "No missing values allowed"
+    @assert type in ["cor","cov"] "The options for `type` are `cor` and `cov`"
+    @assert 1 <= lag <= N-4
+    @assert length(alpha) == 2
+    @assert 0.0 < alpha[1] < alpha[2] < 1.0
+
+    # auto-ccf
+    auto = x1 == x2
+
+    ft = (type == "cor" ? 
Statistics.cor : Statistics.cov) + + x = [] + + for i in 1:lag + push!(x,ft(x1[i+1:N],x2[1:(N-i)])) + end + + if auto + kx = collect(N-1:-1:N-lag) + else + x = vcat(ft(x1,x2),x) # adding central value (lag=0) + x = reverse(x) + for i in 1:lag + push!(x,ft(x2[i+1:N],x1[1:(N-i)])) + end + kx = vcat(collect(N-lag:N-1), N, collect(N-1:-1:N-lag)) + end + + # ccf normalized with the standard error for the + # Normal distribution of an approximation for + # the Fisher transformation + function fse(v::Real)::AbstractFloat + 1/sqrt(v-3) + end + k = fse.(kx)/fse(N) + ccf_res = x ./ k + + call = "ccf("* (auto ? "x" : "x1, x2") * + "; type=\""*type* + "\", lag="*string(lag)* + ", alpha="*string(alpha)*")" + + # Confidence Intervals + a1 = alpha[1] + a2 = alpha[2] + z1 = Distributions.quantile(Normal(), a1 + (1-a1)/2) + ci1 = z1*fse(N) + z2 = Distributions.quantile(Normal(), a2 + (1-a2)/2) + ci2 = z2*fse(N) + ci = (ci1,ci2) + + CCF(ccf_res, N, type, lag, alpha, ci, auto, call) + +end + diff --git a/src/FORECAST.jl b/src/FORECAST.jl index 4432e34..679ef2a 100644 --- a/src/FORECAST.jl +++ b/src/FORECAST.jl @@ -1,100 +1,77 @@ -""" -Package: Forecast +module Forecast -Store results from the function `forecast` +using CSV, Distributions, ColorSchemes, DataFrames, DataFramesMeta, Dates, CodecZlib, + HypothesisTests, LinearAlgebra, Optim, Plots, PrettyTables, RecipesBase, Statistics, StatsBase -# Arguments - model: Model object containing information about the fitted model. - x: Original time series. - alpha: The confidence levels associated with the prediction intervals. - mean: Point forecasts. - lower: Lower limits for prediction intervals. - upper: Upper limits for prediction intervals. - se: Standard Error. -""" -mutable struct FORECAST - model - alpha::Tuple - mean::DataFrame - upper::DataFrame - lower::DataFrame - se::DataFrame - call::String -end +# types +export AR, CCF, STL -function Base.show(io::IO, fc::FORECAST) - ts = eltype(fc.mean[:,1]) in [Date, DateTime, Time, - Year, Month, Quarter, Week, Day, - Hour, Second, Millisecond, - Microsecond, Nanosecond] - printstyled("Forecast Information\n",bold=true,color=:underline) - print("\n",fc.call, "\n") - printstyled("\nMean Forecasting\n",bold=true,color=:underline) - pretty_table(fc.mean, noheader = false, nosubheader = true, show_row_number=false) - printstyled("\nPrediction Intervals alpha at: "*string(fc.alpha)*"\n",bold=true,color=:underline) - printstyled("\nUpper:\n",color=:underline) - pretty_table(fc.upper, noheader = false, nosubheader = true, show_row_number=false, - vlines = [0,1,div(size(fc.upper,2),2)+1,size(fc.upper,2)]) - printstyled("\nLower:\n",color=:underline) - pretty_table(fc.lower, noheader = false, nosubheader = true, show_row_number=false, - vlines = [0,1,div(size(fc.lower,2),2)+1,size(fc.lower,2)]) -end +# methods +export acf, ar, arsim, boxcox, ccf, d, hma, iboxcox, loess, p, pacf, forecast, + setnames!, sma, stl, transform, summarize -""" -Rename data in FORECAST object -""" -function setnames!(fc::FORECAST,new_names::AbstractVector{String}) - n_mean = names(fc.mean) - n_mean[2:end] = new_names - rename!(fc.mean,n_mean) +# datasets +export air, co2, london, quakes, seaborne - nn_upper = [] - for n in new_names - push!(nn_upper,"upper1_"*n) - push!(nn_upper,"upper2_"*n) - end - n_upper = names(fc.upper) - n_upper[2:end] = nn_upper - rename!(fc.upper,n_upper) - - nn_lower = [] - for n in new_names - push!(nn_lower,"lower1_"*n) - push!(nn_lower,"lower2_"*n) - end - n_lower = names(fc.lower) - n_lower[2:end] = nn_lower - 
rename!(fc.lower,n_lower)
+# types
+include("AR.jl")
+include("CCF.jl")
+include("FORECAST.jl")
+include("STL.jl")
+include("SUMMARIZE.jl")
 
-    if fc.model.x isa DataFrame
-        n_model = names(fc.model.x)
-        n_model[2:end] = new_names
-        rename!(fc.model.x,n_model)
-        fc.model.varnames=new_names
-    end
-end
+# source files
+include("acf.jl")
+include("ar.jl")
+include("arsim.jl")
+include("boxcox.jl")
+include("ccf.jl")
+include("d.jl")
+include("datasets.jl")
+include("hma.jl")
+include("loess.jl")
+include("p.jl")
+include("pacf.jl")
+include("forecast_ar.jl")
+include("sma.jl")
+include("stl.jl")
+include("summarize.jl")
+include("utils.jl")
+include("utils_datetime.jl")
 
-"""
-Transform a FORECAST object value with a given function
-"""
-function transform(fc::FORECAST, f::Function, vari::Union{Integer,UnitRange} = 1:size(fc.mean,2)-1)
+# plot recipes
+## type
+include("plot_CCF.jl")
+include("plot_DataFrame.jl")
+include("plot_FORECAST.jl")
+include("plot_STL.jl")
 
-    vari = collect(vari)
-    # Renaming
-    tfc = deepcopy(fc)
-    fnames = tfc.model.varnames
-    fnames[vari] .= "$(string(f))_" .* fnames[vari]
-    setnames!(tfc, fnames)
+## user
+include("splot.jl")
 
-    # Transform
-    vari2 = vcat(vari, vari .+ size(fc.mean,2) .- 1)
-    tfc.model.x[:,vari .+ 1] .= f.(Array(tfc.model.x[:,vari .+ 1]))
-    tfc.mean[:,vari .+ 1] .= f.(Array(tfc.mean[:,vari .+ 1]))
-    tfc.lower[:,vari2 .+ 1] .= f.(Array(tfc.lower[:,vari2 .+ 1]))
-    tfc.upper[:,vari2 .+ 1] .= f.(Array(tfc.upper[:,vari2 .+ 1]))
+"""
+Collection of methods for Time Series analysis
+
+Featured Methods:
 
-    tfc.call = fc.call * "\nData transformed with function: $(f)"
+   acf:        Auto-correlation or auto-covariance of univariate series.
+   ar:         Multivariate Autoregressive Model.
+   arsim:      Simulated Multivariate Autoregressive Model.
+   boxcox:     Box-Cox Transformations.
+   ccf:        Cross-correlation or cross-covariance of two univariate series.
+   d:          Lagged differences of a given order for Vector and Array.
+   forecast:   Forecast values of fitted time series models.
+   hma:        Henderson moving average filter.
+   iboxcox:    Inverse Box-Cox Transformations.
+   loess:      Locally estimated scatterplot smoothing.
+   p:          Reverse lagged differences of a given order for types Vector and Array.
+   pacf:       Partial Auto-correlation function.
+   sma:        Simple moving average.
+   splot:      Plot a seasonal plot for types Vector and TimeArray.
+   stl:        Seasonal and Trend decomposition using loess.
+   summarize:  Statistical summary.
+"""
+Forecast
 
-    return(tfc)
 end
diff --git a/src/STL.jl b/src/STL.jl
index 6e498de..9e5d508 100644
--- a/src/STL.jl
+++ b/src/STL.jl
@@ -1,20 +1,221 @@
 """
 Package: Forecast
 
-Store results from the function stl
+    stl(Yv, np; robust=false,
+        nl=nextodd(np),
+        ns=10*length(Yv)+1,
+        nt=nextodd(1.5*np/(1-1.5/ns)),
+        ni=robust ? 1 : 2,
+        no=0,
+        spm=false,
+        qsmp=max(div(np,7),2),
+        cth = 0.01,
+        timestamp = nothing,
+        verbose=false)
+
+Decompose a time series into trend, seasonal, and remainder components.
+
+"STL has a simple design that consists of a sequence of applications of the loess smoother; the simplicity allows analysis of the properties of the procedure and allows fast computation, even for very long time series and large amounts of trend and seasonal smoothing. Other features of STL are specification of amounts of seasonal and trend smoothing that range, in a nearly continuous way, from a very small amount of smoothing to a very large amount; robust estimates of the trend and seasonal components that are not distorted by aberrant behavior in the data; specification of the period of the seasonal component to any integer multiple of the time sampling interval greater than one; and the ability to decompose time series with missing values."*
+
+All default values are chosen following the recommendations of the original paper when such recommendations were given. `ns` is recommended to be chosen on the basis of knowledge of the time series and on the basis of diagnostic methods; it must nonetheless always be odd and at least 7. A default value is not advised in the original paper; instead, the same default value used in the stl implementation in R is used here.
+
+For `no` the authors advise 5 ("safe value") or 10 ("near certainty of convergence") cycles, or a convergence criterion when robustness is required; in this case, when `robust` is true, computations stop once convergence is achieved in trend and seasonality.
+
+For `qsmp` the authors do not advise a default, but they use a value close to div(`np`,7).
 
 # Arguments
-`decomposition::DataFrame` A time array with three time series from a fitted STL model
-`call::String` method called to generate ta
+- `Yv`: Univariate series to be decomposed, as a Vector or as a DataFrame whose first numeric column is used.
+- `np`: Seasonality.
+- `robust`: Robust stl.
+- `nl`: Smoothing parameter of the low-pass filter.
+- `ns`: Smoothing parameter for the seasonal component.
+- `nt`: Smoothing parameter for the trend decomposition.
+- `ni`: Number of inner loop cycles.
+- `no`: Number of outer loop cycles.
+- `spm`: Seasonal post-smoothing.
+- `qsmp`: Loess q window for Seasonal post-smoothing.
+- `cth`: Convergence threshold for Seasonal and Trend.
+- `timestamp`: Timestamp to be used other than the default.
+- `verbose`: If `true`, shows updates for the Seasonal and Trend convergence.
+
+# Returns
+An `STL` object with the seasonal, trend and remainder components.
+
+* "STL: A Seasonal, Trend Decomposition Procedure Based on Loess", Robert B. Cleveland, William S. Cleveland, Jean E. McRae, and Irma Terpenning. Journal of Official Statistics Vol. 6, No. 1, 1990, pp. 3-73 (c) Statistics Sweden.
+
+
+# Examples
+```julia-repl
+julia> stl_co2 = stl(co2(),365; robust=true, spm=true)
+[ Info: Dataset used in Cleveland et al. paper
+[ Info: Convergence achieved (< 0.01); Stopping computation...
+STL Object: stl(Yn, np=365; nl=365, ns=46091, nt=549, ni=1, no=0, spm=true, qsmp=52)
+
+julia> plot(stl_co2)
+```
 """
-mutable struct STL{T<:DataFrame}
-    decomposition::T # A DataFrame with three time series from a fitted STL model
-    call::String # method called
-end
+function stl(Yv::DataFrame,np::Integer;
+             robust=false,
+             nl=nextodd(np),
+             ns=10*nrow(Yv)+1,
+             nt=nextodd(1.5*np/(1-1.5/ns)),
+             ni=robust ? 1 : 2,
+             no=0,
+             spm=false,
+             qsmp=max(div(np,7),2),
+             cth = 0.01,
+             timestamp = nothing,
+             verbose=false)
+
+    x = Vector(Yv[:,eltype.(eachcol(Yv)) .<: Union{Missing,Real}][:,1])
+    timestamp = eltype(Yv[:,1]) == Date ? Yv[:,1] : timestamp
+
+    stl(x,np;
+        robust=robust, nl=nl, ns=ns, nt=nt,
+        ni=ni, no=no, spm=spm, qsmp=qsmp, verbose=verbose, cth, timestamp)
 
-function Base.show(io::IO, stlx::STL)
-    printstyled("STL Object:",bold=true,color=:underline)
-    print(" ", stlx.call)
 end
+
+function stl(Yv,
+             np::Integer;
+             robust=false,
+             nl=nextodd(np),
+             ns=10*length(Yv)+1,
+             nt=nextodd(1.5*np/(1-1.5/ns)),
+             ni=robust ? 1 : 2,
+             no=0,
+             spm=false,
+             qsmp=max(div(np,7),2),
+             cth = 0.01,
+             timestamp = nothing,
+             verbose=false) #where T<:Union{Missing, Number}
+
+    @assert (mod(ns,2)==1) & (ns>=7) "`ns` is chosen on the basis of knowledge of the time series and on the basis of diagnostic methods; it must always be odd and at least 7"
+
+    function B(u)
+        (u < 1) & !ismissing(u) ? (1.0-u^2)^2 : 0.0
+    end
+
+    N = length(Yv)
+    # initial robustness weights
+    rhov = ones(N)
+    # initial trend
+    Tv = Tv0 = zeros(N)
+    Sv = Sv0 = zeros(N)
+    Rv = Array{Float64,1}(undef,N)
+    Cv = Array{Float64,1}(undef,N+2*np)
+    scnv = false # seasonal convergence flag
+    tcnv = false # trend convergence flag
+    #for o in 0:no
+    o = 0
+    while robust | (o <= no)
+        for k in 1:ni
+            # Updating seasonal and trend components
+            ## 1. Detrending (Yv = Tv + Sv)
+            Sv = Yv - Tv
+            ### Seasonal convergence criterion
+            Md = maximum(abs.(skipmissing(Sv-Sv0)))
+            M0 = maximum(skipmissing(Sv0))
+            m0 = minimum(skipmissing(Sv0))
+            scnv = (Md/(M0-m0) < cth)
+            if verbose
+                println("Outer loop: " * string(o) * " - " * "Inner loop: " * string(k))
+                println("Seasonal Convergence: " * string(Md/(M0-m0)))
+            end
+            Sv0 = Sv
+
+            # Seasonal Smoothing 2,3,4
+            ## 2. Cycle-subseries Smoothing
+            for csi in 1:np
+                Cv[csi:np:N+2*np] = loess(csi:np:N,
+                                          Sv[csi:np:N];
+                                          q=ns,d=1,rho=rhov[csi:np:N],
+                                          predict=(1.0*csi-np):np:(N+np))
+            end
+            ## 3. Low-Pass Filtering of Smoothed Cycle-Subseries
+            ### centered support instead 1:N to balance out machine error
+            Lv = loess(1-ceil(N/2):N-ceil(N/2),
+                       sma(sma(sma(Cv,np),np),3),
+                       d=1,q=nl,rho=rhov)
+            ## 4. Detrending of Smoothed Cycle-Subseries
+            ### Lv is subtracted to prevent low-frequency power
+            ### from entering the seasonal component.
+            Sv = Cv[np+1:end-np] - Lv
+
+            ## 5. Deseasonalizing
+            Dv = Yv - Sv
+
+            # Trend Smoothing
+            ## 6. Trend Smoothing
+            ### centered support instead 1:N to balance out machine error
+            ### (floor instead of ceil like in Lv to balance out even lengths)
+            Tv = loess(1-floor(N/2):N-floor(N/2),Dv,q=nt,d=1,rho=rhov)
+
+            ### Trend convergence criterion
+            Md = maximum(abs.(skipmissing(Tv-Tv0)))
+            M0 = maximum(skipmissing(Tv0))
+            m0 = minimum(skipmissing(Tv0))
+            tcnv = (Md/(M0-m0) < cth)
+            if verbose
+                println("Trend Convergence: " * string(Md/(M0-m0)) * "\n")
+            end
+            Tv0 = Tv
+
+            scnv & tcnv ? break : nothing
+        end
+        # Computation of robustness weights
+        ## These new weights will reduce the influence of transient behaviour
+        ## on the trend and seasonal components.
+
+        # Tv and Sv defined everywhere but Rv is not defined
+        # where Yv has missing values
+
+        Rv = Yv - Tv - Sv
+
+        if scnv & tcnv
+            @info "Convergence achieved (< " * string(cth) * "); Stopping computation..."
+            break
+        end
+
+        if 0 < o <= no
+            smRv = skipmissing(Rv)
+            h = 6*median(abs.(smRv))
+            rhov = B.(abs.(Rv)/h)
+        end
+        o += 1
+    end
+
+    if spm
+        # Using div(np,7) as default to approximate
+        # the q=51 for a np=365 chosen in the original paper
+        Sv = loess(1-ceil(N/2):N-ceil(N/2),Sv,q=qsmp)
+        Rv = Yv - Tv - Sv
+    end
+
+    if !scnv
+        @warn "Seasonal convergence not achieved (>= " * string(cth) * ")
+               Consider a robust estimation"
+    end
+
+    if !tcnv
+        @warn "Trend convergence not achieved (>= " * string(cth) * ")
+               Consider a robust estimation"
+    end
+
+    call = "stl(Yn, np="*string(np)*
+           "; nl="*string(nl)*
+           ", ns="*string(ns)*
+           ", nt="*string(nt)*
+           ", ni="*string(ni)*
+           ", no="*string(no)*
+           ", spm="*string(spm)*
+           ", qsmp="*string(qsmp)*")"
+
+    # Dates.datetime2unix(DateTime(1970,1,1,0,0,1)) = 1.0
+    if isnothing(timestamp)
+        timestamp = DateTime(1970,1,1,0,0,1):Second(1):DateTime(1970,1,1,0,0,1)+Second(N-1)
+    end
+
+    STL(DataFrame([timestamp, Sv, Tv, Rv], [:Timestamp, :Seasonal, :Trend, :Remainder]), call)
+
+end
diff --git a/src/SUMMARIZE.jl b/src/SUMMARIZE.jl
index b704338..4de0a64 100644
--- a/src/SUMMARIZE.jl
+++ b/src/SUMMARIZE.jl
@@ -1,22 +1,119 @@
 """
 Package: Forecast
 
-Store results from the function `summarize`
+    summarize(x; varnames = nothing)
+
+Return a statistical summary for `x`.
+
+The values returned are divided into three sections: the first one shows Minimum, 1st Quartile, Median, Mean, 3rd Quartile, Maximum and the p-value for the Jarque-Bera Normality Test. The second one shows the first four moments: Mean, Variance, Skewness and Kurtosis; and finally there is a summary with the different types contained in the Array.
 
 # Arguments
-`quantiles::DataFrame` DataFrame with the data quantiles for each column
-`moment::DataFrame` DataFrame with the first four moments for each column
-`format::DataFrame` DataFrame with number of types per column
+- `x`: Array or DataFrame of data.
+- `varnames`: Names for the columns to be summarized; it defaults to automatic naming
+   or to the existing names when given a DataFrame.
+ +# Returns +A SUMMARIZE struct + +# Examples +```julia-repl +julia> summarize(rand(100,3); varnames = ["a","b","c"]) +┌──────────┬────────────┬──────────┬──────────┬──────────┬──────────┬──────────┬───────────┐ +│ Variable │ Min │ 1Q │ Median │ Mean │ 3Q │ Max │ H0 Normal │ +├──────────┼────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┤ +│ a │ 0.00520465 │ 0.205712 │ 0.462199 │ 0.465784 │ 0.6913 │ 0.97946 │ 0.0593599 │ +│ b │ 0.00218787 │ 0.247344 │ 0.485465 │ 0.498587 │ 0.723371 │ 0.985226 │ 0.0562301 │ +│ c │ 0.0244256 │ 0.247598 │ 0.530821 │ 0.498689 │ 0.722731 │ 0.967952 │ 0.0356495 │ +└──────────┴────────────┴──────────┴──────────┴──────────┴──────────┴──────────┴───────────┘ +┌──────────┬──────────┬───────────┬───────────┬───────────┐ +│ Variable │ Mean │ Variance │ Skewness │ Kurtosis │ +├──────────┼──────────┼───────────┼───────────┼───────────┤ +│ a │ 0.465784 │ 0.0823949 │ 0.0823949 │ 0.0823949 │ +│ b │ 0.498587 │ 0.0854883 │ 0.0854883 │ 0.0854883 │ +│ c │ 0.498689 │ 0.0790597 │ 0.0790597 │ 0.0790597 │ +└──────────┴──────────┴───────────┴───────────┴───────────┘ +┌──────────┬─────────┐ +│ Variable │ Float64 │ +├──────────┼─────────┤ +│ a │ 100 │ +│ b │ 100 │ +│ c │ 100 │ +└──────────┴─────────┘ +``` """ -mutable struct SUMMARIZE - quantiles::DataFrame - moments::DataFrame - format::DataFrame +function summarize(x::DataFrame) + summarize(Array(tots(x)[:,2:end]), varnames = names(x)[2:end]) +end + +function summarize(x::AbstractVector{<:Real}; varnames = nothing) + summarize(reshape(x,length(x),1), varnames = varnames) end -function Base.show(io::IO, s::SUMMARIZE) - pretty_table(s.quantiles, nosubheader = true, show_row_number=false, crop = :none) - pretty_table(s.moments, nosubheader = true, show_row_number=false, crop = :none) - pretty_table(s.format, nosubheader = true, show_row_number=false, crop = :none) +function summarize(x::Real; varnames = nothing) + summarize([x,x], varnames = varnames) +end + +function summarize(x::AbstractArray{T}; varnames = nothing) where T<:Real + + x = compact(x) + @assert (nd = ndims(x)) <= 2 "Data should have two dimensions at most." + n,m = nd == 1 ? (length(x),1) : size(x) + + varnames = isnothing(varnames) ? ["x"*string(i) for i in 1:m] : varnames + + # Quantiles and Mean + sk = ["Variable", "Min","1Q","Median","Mean","3Q","Max","H0 Normal"] + sv = zeros(m,7) + s = Array{Any,2}(undef,m,7) + + minimum_Number(x) = minimum(filter(x -> x isa Number, x)) + maximum_Number(x) = maximum(filter(x -> x isa Number, x)) + mean_Number(x) = mean(filter(x -> x isa Number, x)) + quantile_Number(x,q) = quantile(filter(x -> x isa Number, x),q) + normality(x) = pvalue(JarqueBeraTest(x)) + + sv[nd <= 1 ? 1 : 1:end,7] = nd <= 1 ? normality(x) : mapslices(normality,x,dims=[1]) + sv[nd <= 1 ? 1 : 1:end,1] = nd <= 1 ? minimum(x) : mapslices(minimum_Number,x,dims=[1]) + sv[nd <= 1 ? 1 : 1:end,6] = nd <= 1 ? maximum(x) : mapslices(maximum_Number,x,dims=[1]) + sv[nd <= 1 ? 1 : 1:end,4] = nd <= 1 ? mean(x) : mapslices(mean_Number,x,dims=[1]) + sv[nd <= 1 ? 1 : 1:end,[2,3,5]] = nd <= 1 ? 
quantile_Number(x,[.25,.5,.75]) :
+                                     mapslices(x -> quantile_Number(x,[.25,.5,.75]),x,dims=[1])'
+    qua = DataFrame(hcat(varnames,sv), :auto)
+    DataFrames.rename!(qua, sk)
+
+    # Moments
+    sk = ["Variable", "Mean", "Variance", "Skewness", "Kurtosis"]
+    sv = zeros(m,4)
+    s = Array{Any,2}(undef,m,4)
+
+    var_Number(x) = var(filter(x -> x isa Number, x))
+    skewness_Number(x) = skewness(filter(x -> x isa Number, x))
+    kurtosis_Number(x) = kurtosis(filter(x -> x isa Number, x))
+
+    sv[nd <= 1 ? 1 : 1:end,1] = nd <= 1 ? mean(x) : mapslices(mean_Number,x,dims=[1])
+    sv[nd <= 1 ? 1 : 1:end,2] = nd <= 1 ? var(x) : mapslices(var_Number,x,dims=[1])
+    sv[nd <= 1 ? 1 : 1:end,3] = nd <= 1 ? skewness(x) : mapslices(skewness_Number,x,dims=[1])
+    sv[nd <= 1 ? 1 : 1:end,4] = nd <= 1 ? kurtosis(x) : mapslices(kurtosis_Number,x,dims=[1])
+
+    mo = DataFrame(hcat(varnames,sv), :auto)
+    DataFrames.rename!(mo, sk)
+
+    # Format Check
+    tox = string.(typeof.(x))
+    tx = sort(unique(tox))
+    nt = length(tx)
+    mv = length(varnames)
+    vt = Array{Any}(undef,mv,nt)
+    vt[:] .= 0
+
+    for (i,ty) in enumerate(tx)
+        vt[:,i] = count(==(ty), tox, dims=1)
+    end
+
+    tdf = DataFrame(hcat(varnames,vt),:auto)
+    fc = DataFrames.rename!(tdf,vcat("Variable", tx))
+
+    SUMMARIZE(qua,mo,fc)
+
+end
diff --git a/src/ar.jl b/src/ar.jl
index 83d7226..4de1447 100644
--- a/src/ar.jl
+++ b/src/ar.jl
@@ -162,7 +162,7 @@ function ar_ols(x::AbstractArray{T},
     #fix p
     np = fixnp(dΦ0,dΦ)
 
-    # TODO check weher to use 'or' or 'np' and if 'or' then a fixor function is needed
+    # TODO check whether to use 'or' or 'np' and if 'or' then a fixor function is needed
     # Information Criteria
     lΣ2 = log(norm(Σ2))
     ic = Dict([("AIC", lΣ2 + 2*np*m^2/n),
diff --git a/src/ccf.jl b/src/ccf.jl
index e4a5166..f08a5d7 100644
--- a/src/ccf.jl
+++ b/src/ccf.jl
@@ -85,7 +85,7 @@ function ccf(x1::AbstractVector{<:Real},
     end
 
     # ccf normalized with the standard error for the
-    # Normal distsribution of an approximation for
+    # Normal distribution of an approximation for
     # the Fisher transformation
     function fse(v::Real)::AbstractFloat
         1/sqrt(v-3)
diff --git a/src/datasets.jl b/src/datasets.jl
index 372eb37..36010a8 100644
--- a/src/datasets.jl
+++ b/src/datasets.jl
@@ -54,7 +54,7 @@ function co2(full::Bool = false)
     # since there seems to be no data for april in 1974
    dates_co2_stl = Dates.Date(1974,5,17):Dates.Day(1):Dates.Date(1986,12,31)
 
-    # revome leap year day
+    # remove leap year day
    dates_co2_stl = filter(dates_co2_stl) do x
        (Dates.isleapyear(x) & (Dates.month(x) == 2)) ? Dates.day(x) != 29 : true
    end
diff --git a/src/p.jl b/src/p.jl
index e6724ed..25f0e62 100644
--- a/src/p.jl
+++ b/src/p.jl
@@ -99,7 +99,7 @@ function p(fc::FORECAST,
     fnames[vari] = "p$(size(x0))_" .* fnames[vari]
     pfc = deepcopy(fc)
 
-    # Inverse Differentation
+    # Inverse Differencing
     xts = fc.model.x[:,1]
     fx = Array(fc.model.x[:,2:end])
     fmean = Array(fc.mean[:,2:end])
diff --git a/src/stl.jl b/src/stl.jl
index e4e46f7..9e5d508 100644
--- a/src/stl.jl
+++ b/src/stl.jl
@@ -98,7 +98,7 @@ function stl(Yv,
     N = length(Yv)
     # initial robustness weights
     rhov = ones(N)
-    # intial trend
+    # initial trend
     Tv = Tv0 = zeros(N)
     Sv = Sv0 = zeros(N)
     Rv = Array{Float64,1}(undef,N)
@@ -109,7 +109,7 @@ function stl(Yv,
     o = 0
     while robust | (o <= no)
         for k in 1:ni
-            # Updating sesonal and trend components
+            # Updating seasonal and trend components
             ## 1. Detrending (Yv = Tv + Sv)
             Sv = Yv - Tv
 
@@ -148,7 +148,7 @@ function stl(Yv,
             # Trend Smoothing
             ## 6. Trend Smoothing
             ### centered support instead 1:N to balance out machine error
-            ### (floor isntead ceil like in Lv to balance out even lengths)
+            ### (floor instead of ceil like in Lv to balance out even lengths)
             Tv = loess(1-floor(N/2):N-floor(N/2),Dv,q=nt,d=1,rho=rhov)
 
             ### Trend convergence criterion
@@ -173,7 +173,7 @@ function stl(Yv,
     Rv = Yv - Tv - Sv
 
     if scnv & tcnv
-        @info "Corvengence achieved (< " * string(cth) * "); Stopping computation..."
+        @info "Convergence achieved (< " * string(cth) * "); Stopping computation..."
         break
     end
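A note on the convergence test used in both stl loops above: a component is considered converged when its maximum change between iterations, scaled by the component's range, falls below `cth`. A self-contained sketch of that criterion (plain Julia, mirroring the `Md/(M0-m0) < cth` expression in the code):

```julia
# Convergence criterion from the stl code above: max|new - old| / range(old) < cth
cth = 0.01
converged(new, old) =
    maximum(abs.(new .- old)) / (maximum(old) - minimum(old)) < cth

old = sin.(0.1 .* (1:100))            # a stand-in for a trend/seasonal component
new = old .+ 0.0005 .* cos.(1:100)    # a small perturbation between iterations
converged(new, old)                   # true: the relative change is far below cth
```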