Skip to content

Commit

Permalink
Use Tables.allocatecolumn to simplify and improve code (#1535)
Browse files Browse the repository at this point in the history
This new interface makes it possible to remove the special cases for CategoricalArray
and makes the code more general. It also makes it easier to stop calling
promote_col_type(data...), which triggers a stack overflow when
the number of input arguments is high.

This relies on promotion rules which have been fixed in CategoricalArrays 0.3.14,
and which give slightly different behavior: combining a CategoricalArray with
an Array now gives an Array, since the number of unique levels could be very high.
  • Loading branch information
nalimilan authored Sep 25, 2018
1 parent 0aeb283 commit b02ad3c
Show file tree
Hide file tree
Showing 5 changed files with 22 additions and 26 deletions.
2 changes: 1 addition & 1 deletion REQUIRE
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
julia 0.7.0
Missings 0.2.3
CategoricalArrays 0.3.11
CategoricalArrays 0.3.14
StatsBase 0.11.0
SortingAlgorithms
Reexport
Expand Down
3 changes: 2 additions & 1 deletion src/abstractdataframe/abstractdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -931,7 +931,8 @@ function _vcat(dfs::AbstractVector{<:AbstractDataFrame})
for (i, name) in enumerate(header)
data = [df[name] for df in dfs]
lens = map(length, data)
cols[i] = promote_col_type(data...)(undef, sum(lens))
T = mapreduce(eltype, promote_type, data)
cols[i] = Tables.allocatecolumn(T, sum(lens))
offset = 1
for j in 1:length(data)
copyto!(cols[i], offset, data[j])
Expand Down
29 changes: 10 additions & 19 deletions src/dataframe/dataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,8 @@ mutable struct DataFrame <: AbstractDataFrame
# recycle scalars
for i in 1:length(columns)
isa(columns[i], AbstractArray) && continue
columns[i] = fill(columns[i], maxlen)
columns[i] = fill!(Tables.allocatecolumn(typeof(columns[i]), maxlen),
columns[i])
lengths[i] = maxlen
end
uls = unique(lengths)
Expand Down Expand Up @@ -179,22 +180,10 @@ DataFrame(columns::AbstractMatrix, cnames::AbstractVector{Symbol} = gennames(siz
# Initialize an empty DataFrame with specific eltypes and names
function DataFrame(column_eltypes::AbstractVector{T}, cnames::AbstractVector{Symbol},
nrows::Integer; makeunique::Bool=false)::DataFrame where T<:Type
columns = Vector{AbstractVector}(undef, length(column_eltypes))
for (j, elty) in enumerate(column_eltypes)
if elty >: Missing
if Missings.T(elty) <: CategoricalArrays.CatValue
columns[j] = CategoricalArray{elty}(undef, nrows)
else
columns[j] = missings(elty, nrows)
end
else
if elty <: CategoricalArrays.CatValue
columns[j] = CategoricalVector{elty}(undef, nrows)
else
columns[j] = Vector{elty}(undef, nrows)
end
end
end
columns = AbstractVector[elty >: Missing ?
fill!(Tables.allocatecolumn(elty, nrows), missing) :
Tables.allocatecolumn(elty, nrows)
for elty in column_eltypes]
return DataFrame(columns, Index(convert(Vector{Symbol}, cnames), makeunique=makeunique))
end

Expand All @@ -211,10 +200,12 @@ function DataFrame(column_eltypes::AbstractVector{T}, cnames::AbstractVector{Sym
end
for i in eachindex(categorical)
categorical[i] || continue
elty = CategoricalArrays.catvaluetype(Missings.T(updated_types[i]),
CategoricalArrays.DefaultRefType)
if updated_types[i] >: Missing
updated_types[i] = Union{CategoricalValue{Missings.T(updated_types[i])}, Missing}
updated_types[i] = Union{elty, Missing}
else
updated_types[i] = CategoricalValue{updated_types[i]}
updated_types[i] = elty
end
end
return DataFrame(updated_types, cnames, nrows, makeunique=makeunique)
Expand Down
8 changes: 4 additions & 4 deletions test/cat.jl
Original file line number Diff line number Diff line change
Expand Up @@ -163,23 +163,23 @@ module TestCat
@test typeof.(columns(df)) == [Vector{Union{Missing, Int}}]
df = vcat(DataFrame([CategoricalArray([1])], [:x]), DataFrame([[1]], [:x]))
@test df == DataFrame([[1, 1]], [:x])
@test typeof(df[:x]) <: CategoricalVector{Int}
@test df[:x] isa Vector{Int}
df = vcat(DataFrame([CategoricalArray([1])], [:x]),
DataFrame([Union{Missing, Int}[1]], [:x]))
@test df == DataFrame([[1, 1]], [:x])
@test typeof(df[:x]) <: CategoricalVector{Union{Int, Missing}}
@test df[:x] isa Vector{Union{Int, Missing}}
df = vcat(DataFrame([CategoricalArray([1])], [:x]),
DataFrame([CategoricalArray{Union{Int, Missing}}([1])], [:x]))
@test df == DataFrame([[1, 1]], [:x])
@test typeof(df[:x]) <: CategoricalVector{Union{Int, Missing}}
@test df[:x] isa CategoricalVector{Union{Int, Missing}}
df = vcat(DataFrame([Union{Int, Missing}[1]], [:x]),
DataFrame([["1"]], [:x]))
@test df == DataFrame([[1, "1"]], [:x])
@test typeof.(columns(df)) == [Vector{Any}]
df = vcat(DataFrame([CategoricalArray([1])], [:x]),
DataFrame([CategoricalArray(["1"])], [:x]))
@test df == DataFrame([[1, "1"]], [:x])
@test typeof(df[:x]) <: CategoricalVector{Any}
@test df[:x] isa CategoricalVector{Any}
df = vcat(DataFrame([trues(1)], [:x]), DataFrame([[false]], [:x]))
@test df == DataFrame([[true, false]], [:x])
@test typeof.(columns(df)) == [Vector{Bool}]
Expand Down
6 changes: 5 additions & 1 deletion test/dataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ module TestDataFrame
@test df == DataFrame(a=[1, 2], b=["a", "b"], c=[:c, :d])
end

@testset "Empty DataFrame constructors" begin
@testset "DataFrame constructors" begin
df = DataFrame(Union{Int, Missing}, 10, 3)
@test size(df, 1) == 10
@test size(df, 2) == 3
Expand Down Expand Up @@ -199,6 +199,10 @@ module TestDataFrame
sdf = view(df, df[:x] .== 4)
@test size(sdf, 1) == 0

# Test that vector type is correctly determined from scalar type
df = DataFrame(x=categorical(["a"])[1])
@test df.x isa CategoricalVector{String}

@test hash(convert(DataFrame, [1 2; 3 4])) == hash(convert(DataFrame, [1 2; 3 4]))
@test hash(convert(DataFrame, [1 2; 3 4])) != hash(convert(DataFrame, [1 3; 2 4]))
@test hash(convert(DataFrame, [1 2; 3 4])) == hash(convert(DataFrame, [1 2; 3 4]), zero(UInt))
Expand Down

0 comments on commit b02ad3c

Please sign in to comment.