From a31e4ee90f307bc4175ffeceb53738b3697acf73 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Wed, 25 Sep 2019 00:25:43 +0200 Subject: [PATCH 1/7] missing values for continuous don't lead to categorical --- src/schema.jl | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/schema.jl b/src/schema.jl index 314d9c95..bdbd8f3b 100644 --- a/src/schema.jl +++ b/src/schema.jl @@ -60,7 +60,7 @@ Base.haskey(schema::Schema, key) = haskey(schema.schema, key) Compute all the invariants necessary to fit a model with `terms`. A schema is a dict that maps `Term`s to their concrete instantiations (either `CategoricalTerm`s or `ContinuousTerm`s. "Hints" may optionally be supplied in the form of a `Dict` mapping term -names (as `Symbol`s) to term or contrast types. If a hint is not provided for a variable, +names (as `Symbol`s) to term or contrast types. If a hint is not provided for a variable, the appropriate term type will be guessed based on the data type from the data column: any numeric data is assumed to be continuous, and any non-numeric data is assumed to be categorical. @@ -91,7 +91,7 @@ StatsModels.Schema with 1 entry: y => y ``` -Note that concrete `ContinuousTerm` and `CategoricalTerm` and un-typed `Term`s print the +Note that concrete `ContinuousTerm` and `CategoricalTerm` and un-typed `Term`s print the same in a container, but when printed alone are different: ```jldoctest 1 @@ -176,6 +176,8 @@ concrete_term(t::Term, d) = concrete_term(t, d, nothing) concrete_term(t, d, hint) = t concrete_term(t::Term, xs::AbstractVector{<:Number}, ::Nothing) = concrete_term(t, xs, ContinuousTerm) +# and for missing values +concrete_term(t::Term, xs::AbstractVector{Union{Missing,T}} where T<:Number, ::Nothing) = concrete_term(t, xs, ContinuousTerm) function concrete_term(t::Term, xs::AbstractVector, ::Type{ContinuousTerm}) μ, σ2 = StatsBase.mean_and_var(xs) min, max = extrema(xs) @@ -196,9 +198,9 @@ end Return a new term that is the result of applying `schema` to term `t` with destination model (type) `Mod`. If `Mod` is omitted, `Nothing` will be used. -When `t` is a `ContinuousTerm` or `CategoricalTerm` already, the term will be returned -unchanged _unless_ a matching term is found in the schema. This allows -selective re-setting of a schema to change the contrast coding or levels of a +When `t` is a `ContinuousTerm` or `CategoricalTerm` already, the term will be returned +unchanged _unless_ a matching term is found in the schema. This allows +selective re-setting of a schema to change the contrast coding or levels of a categorical term, or to change a continuous term to categorical or vice versa. When defining behavior for custom term types, it's best to dispatch on @@ -277,7 +279,7 @@ function apply_schema(t::FormulaTerm, schema::Schema, Mod::Type{<:StatisticalMod end # strategy is: apply schema, then "repair" if necessary (promote to full rank -# contrasts). +# contrasts). # # to know whether to repair, need to know context a term appears in. main # effects occur in "own" context. From 1f64feea84d0506a05b41d9494f7a58e0288ff4c Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Wed, 25 Sep 2019 10:33:30 +0200 Subject: [PATCH 2/7] local copy() that handles Missing but otherwise defaults to Base.copy() --- src/terms.jl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/terms.jl b/src/terms.jl index acee022b..c882d9a4 100644 --- a/src/terms.jl +++ b/src/terms.jl @@ -483,6 +483,13 @@ modelcols(t::Term, d::NamedTuple) = modelcols(ft::FunctionTerm{Fo,Fa,Names}, d::NamedTuple) where {Fo,Fa,Names} = ft.fanon.(getfield.(Ref(d), Names)...) +# this is weird, but using import Base: copy leads to exporting type piracy +# for non missing values, the compiler should hopefully optimize down the extra +# layer of indirection +function copy end +copy(x::Any) = Base.copy(x) +copy(m::Missing) = deepcopy(m) + modelcols(t::ContinuousTerm, d::NamedTuple) = copy.(d[t.sym]) modelcols(t::CategoricalTerm, d::NamedTuple) = t.contrasts[d[t.sym], :] From 2c22fd81a631685a140a9099bb2219f92755cffd Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Wed, 25 Sep 2019 10:36:29 +0200 Subject: [PATCH 3/7] don't bother with deepcopy for missing, just return the singleton --- src/terms.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/terms.jl b/src/terms.jl index c882d9a4..0a8af997 100644 --- a/src/terms.jl +++ b/src/terms.jl @@ -488,7 +488,7 @@ modelcols(ft::FunctionTerm{Fo,Fa,Names}, d::NamedTuple) where {Fo,Fa,Names} = # layer of indirection function copy end copy(x::Any) = Base.copy(x) -copy(m::Missing) = deepcopy(m) +copy(m::Missing) = m modelcols(t::ContinuousTerm, d::NamedTuple) = copy.(d[t.sym]) From 3a2e2557058db1fb14d2b2c1ce06d5dca05293a9 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Wed, 25 Sep 2019 10:49:53 +0200 Subject: [PATCH 4/7] tests for continuous terms with missing values --- test/terms.jl | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/test/terms.jl b/test/terms.jl index f25fe76f..ae2fb015 100644 --- a/test/terms.jl +++ b/test/terms.jl @@ -22,6 +22,17 @@ mimestring(x) = mimestring(MIME"text/plain", x) @test t0.min == 1.0 @test t0.max == 3.0 + vals0m = [3, missing, 1] + t0m = concrete_term(t, vals0m) + @test string(t0m) == "aaa" + @test mimestring(t0m) == "aaa(continuous)" + # compute all these values to make sure the behavior of terms matches + # the behavior of other relevant packages + @test isequal(t0m.mean, mean(vals0m)) + @test isequal(t0m.var, var(vals0m)) + @test isequal(t0m.min, min(vals0m...)) + @test isqual(t0m.max, max(vals0m...)) + t1 = concrete_term(t, [:a, :b, :c]) @test t1.contrasts isa StatsModels.ContrastsMatrix{DummyCoding} @test string(t1) == "aaa" From aae789973ab9c953d97303860602121d74cfb03d Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Wed, 25 Sep 2019 10:57:13 +0200 Subject: [PATCH 5/7] typo --- test/terms.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/terms.jl b/test/terms.jl index ae2fb015..e1ec6158 100644 --- a/test/terms.jl +++ b/test/terms.jl @@ -31,7 +31,7 @@ mimestring(x) = mimestring(MIME"text/plain", x) @test isequal(t0m.mean, mean(vals0m)) @test isequal(t0m.var, var(vals0m)) @test isequal(t0m.min, min(vals0m...)) - @test isqual(t0m.max, max(vals0m...)) + @test isequal(t0m.max, max(vals0m...)) t1 = concrete_term(t, [:a, :b, :c]) @test t1.contrasts isa StatsModels.ContrastsMatrix{DummyCoding} From 12f387469f65c5f9a16c94ca102483ef45dd5b54 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Wed, 25 Sep 2019 15:47:30 +0200 Subject: [PATCH 6/7] rework getindex for contrast matrices to handle missing values --- src/contrasts.jl | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/contrasts.jl b/src/contrasts.jl index d6a96627..61a81a90 100644 --- a/src/contrasts.jl +++ b/src/contrasts.jl @@ -207,8 +207,13 @@ function termnames(C::AbstractContrasts, levels::AbstractVector, baseind::Intege levels[not_base] end -Base.getindex(contrasts::ContrastsMatrix{C,T}, rowinds, colinds) where {C,T} = - getindex(contrasts.matrix, getindex.(Ref(contrasts.invindex), rowinds), colinds) +function Base.getindex(contrasts::ContrastsMatrix{C,T}, rowinds, colinds) where {C,T} + # allow rows to be missing + rows = get.(Ref(contrasts.invindex), rowinds, missing) + # create a row of nothing but missings for missing values + mrow = reduce(vcat, [missing for c in getindex(contrasts.matrix, 1, colinds)]) + vcat([r === missing ? mrow : getindex(contrasts.matrix, r, colinds) for r in rows]) +end # Making a contrast type T only requires that there be a method for # contrasts_matrix(T, baseind, n) and optionally termnames(T, levels, baseind) From a799919ce790b356047dad0522c3826ffdaf8ac5 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Wed, 25 Sep 2019 15:48:23 +0200 Subject: [PATCH 7/7] omit missing data after schema application --- src/modelframe.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/modelframe.jl b/src/modelframe.jl index 8500265f..64d7a194 100644 --- a/src/modelframe.jl +++ b/src/modelframe.jl @@ -79,11 +79,11 @@ missing_omit(data::T, formula::AbstractTerm) where T<:ColumnTable = function ModelFrame(f::FormulaTerm, data::ColumnTable; model::Type{M}=StatisticalModel, contrasts=Dict{Symbol,Any}()) where M - data, _ = missing_omit(data, f) - sch = schema(f, data, contrasts) f = apply_schema(f, sch, M) - + + data, _ = missing_omit(data, f) + ModelFrame(f, sch, data, model) end