diff --git a/NEWS.md b/NEWS.md index 5e73643ba..4497e5bcd 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +MixedModels v4.20.0 Release Notes +============================== +* The `.tbl` property of a `MixedModelBootstrap` now includes the correlation parameters for lower triangular elements of the `λ` field. [#702] + MixedModels v4.19.0 Release Notes ============================== * New method `StatsAPI.coefnames(::ReMat)` returns the coefficient names associated with each grouping factor. [#709] @@ -463,6 +467,7 @@ Package dependencies [#682]: https://github.com/JuliaStats/MixedModels.jl/issues/682 [#694]: https://github.com/JuliaStats/MixedModels.jl/issues/694 [#698]: https://github.com/JuliaStats/MixedModels.jl/issues/698 +[#702]: https://github.com/JuliaStats/MixedModels.jl/issues/702 [#703]: https://github.com/JuliaStats/MixedModels.jl/issues/703 [#707]: https://github.com/JuliaStats/MixedModels.jl/issues/707 [#709]: https://github.com/JuliaStats/MixedModels.jl/issues/709 diff --git a/Project.toml b/Project.toml index 03e2a20a7..199bec9df 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "MixedModels" uuid = "ff71e718-51f3-5ec2-a782-8ffcbfa3c316" author = ["Phillip Alday ", "Douglas Bates ", "Jose Bayoan Santiago Calderon "] -version = "4.19.0" +version = "4.20.0" [deps] Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" diff --git a/docs/src/bootstrap.md b/docs/src/bootstrap.md index 018a024d8..dfb80b2b8 100644 --- a/docs/src/bootstrap.md +++ b/docs/src/bootstrap.md @@ -24,10 +24,6 @@ parameter, `θ`, that defines the variance-covariance matrices of the random eff For example, a simple linear mixed-effects model for the `Dyestuff` data in the [`lme4`](http://github.com/lme4/lme4) package for [`R`](https://www.r-project.org) is fit by -```@setup Main -using DisplayAs -``` - ```@example Main using DataFrames using Gadfly # plotting package @@ -38,41 +34,29 @@ using Random ```@example Main dyestuff = MixedModels.dataset(:dyestuff) m1 = fit(MixedModel, @formula(yield ~ 1 + (1 | batch)), dyestuff) -DisplayAs.Text(ans) # hide ``` -To bootstrap the model parameters, first initialize a random number generator then create a bootstrap sample +To bootstrap the model parameters, first initialize a random number generator then create a bootstrap sample and extract the `tbl` property, which is a `Table` - a lightweight dataframe-like object. ```@example Main const rng = MersenneTwister(1234321); samp = parametricbootstrap(rng, 10_000, m1); -df = DataFrame(samp.allpars); -first(df, 10) +tbl = samp.tbl ``` -Especially for those with a background in [`R`](https://www.R-project.org/) or [`pandas`](https://pandas.pydata.org), -the simplest way of accessing the parameter estimates in the parametric bootstrap object is to create a `DataFrame` from the `allpars` property as shown above. - -We can use `filter` to filter out relevant rows of a dataframe. A density plot of the estimates of `σ`, the residual standard deviation, can be created as ```@example Main -σres = filter(df) do row # create a thunk that operates on rows - row.type == "σ" && row.group == "residual" # our filtering rule -end - -plot(x = σres.value, Geom.density, Guide.xlabel("Parametric bootstrap estimates of σ")) +plot(x = tbl.σ, Geom.density, Guide.xlabel("Parametric bootstrap estimates of σ")) ``` -For the estimates of the intercept parameter, the `getproperty` extractor must be used + +or, for the intercept parameter ```@example Main -plot(filter(:type => ==("β"), df), x = :value, Geom.density, Guide.xlabel("Parametric bootstrap estimates of β₁")) +plot(x = tbl.β1, Geom.density, Guide.xlabel("Parametric bootstrap estimates of β₁")) ``` A density plot of the estimates of the standard deviation of the random effects is obtained as ```@example Main -σbatch = filter(df) do row # create a thunk that operates on rows - row.type == "σ" && row.group == "batch" # our filtering rule -end -plot(x = σbatch.value, Geom.density, +plot(x = tbl.σ1, Geom.density, Guide.xlabel("Parametric bootstrap estimates of σ₁")) ``` @@ -81,7 +65,7 @@ Although this mode appears to be diffuse, this is an artifact of the way that de In fact, it is a pulse, as can be seen from a histogram. ```@example Main -plot(x = σbatch.value, Geom.histogram, +plot(x = tbl.σ1, Geom.histogram, Guide.xlabel("Parametric bootstrap estimates of σ₁")) ``` @@ -93,14 +77,9 @@ The shortest such intervals, obtained with the `shortestcovint` extractor, corre shortestcovint ``` -We generate these for all random and fixed effects: +We generate these directly from the original bootstrap object: ```@example Main -combine(groupby(df, [:type, :group, :names]), :value => shortestcovint => :interval) -``` - -We can also generate this directly from the original bootstrap object: -```@example Main -DataFrame(shortestcovint(samp)) +Table(shortestcovint(samp)) ``` A value of zero for the standard deviation of the random effects is an example of a *singular* covariance. @@ -110,48 +89,39 @@ However, it is not as straightforward to detect singularity in vector-valued ran For example, if we bootstrap a model fit to the `sleepstudy` data ```@example Main sleepstudy = MixedModels.dataset(:sleepstudy) -m2 = fit( - MixedModel, - @formula(reaction ~ 1+days+(1+days|subj)), - sleepstudy, -) -DisplayAs.Text(ans) # hide +contrasts = Dict(:subj => Grouping()) +m2 = let f = @formula reaction ~ 1+days+(1+days|subj) + fit(MixedModel, f, sleepstudy; contrasts) +end ``` ```@example Main samp2 = parametricbootstrap(rng, 10_000, m2); -df2 = DataFrame(samp2.allpars); -first(df2, 10) +tbl2 = samp2.tbl ``` the singularity can be exhibited as a standard deviation of zero or as a correlation of $\pm1$. ```@example Main -DataFrame(shortestcovint(samp2)) +shortestcovint(samp2) ``` A histogram of the estimated correlations from the bootstrap sample has a spike at `+1`. ```@example Main -ρs = filter(df2) do row - row.type == "ρ" && row.group == "subj" -end -plot(x = ρs.value, Geom.histogram, +plot(x = tbl2.ρ1, Geom.histogram, Guide.xlabel("Parametric bootstrap samples of correlation of random effects")) ``` or, as a count, ```@example Main -count(ρs.value .≈ 1) +count(tbl2.ρ1 .≈ 1) ``` Close examination of the histogram shows a few values of `-1`. ```@example Main -count(ρs.value .≈ -1) +count(tbl2.ρ1 .≈ -1) ``` Furthermore there are even a few cases where the estimate of the standard deviation of the random effect for the intercept is zero. ```@example Main -σs = filter(df2) do row - row.type == "σ" && row.group == "subj" && row.names == "(Intercept)" -end -count(σs.value .≈ 0) +count(tbl2.σ1 .≈ 0) ``` There is a general condition to check for singularity of an estimated covariance matrix or matrices in a bootstrap sample. diff --git a/src/bootstrap.jl b/src/bootstrap.jl index 5161a6468..786036ae8 100644 --- a/src/bootstrap.jl +++ b/src/bootstrap.jl @@ -253,7 +253,7 @@ of `iter`, `type`, `group`, `name` and `value`. a breaking change. """ function allpars(bsamp::MixedModelFitCollection{T}) where {T} - fits, λ, fcnames = bsamp.fits, bsamp.λ, bsamp.fcnames + (; fits, λ, fcnames) = bsamp npars = 2 + length(first(fits).β) + sum(map(k -> (k * (k + 1)) >> 1, size.(bsamp.λ, 2))) nresrow = length(fits) * npars cols = ( @@ -392,12 +392,12 @@ function Base.propertynames(bsamp::MixedModelFitCollection) end """ + setθ!(bsamp::MixedModelFitCollection, θ::AbstractVector) setθ!(bsamp::MixedModelFitCollection, i::Integer) Install the values of the i'th θ value of `bsamp.fits` in `bsamp.λ` """ -function setθ!(bsamp::MixedModelFitCollection, i::Integer) - θ = bsamp.fits[i].θ +function setθ!(bsamp::MixedModelFitCollection{T}, θ::AbstractVector{T}) where {T} offset = 0 for (λ, inds) in zip(bsamp.λ, bsamp.inds) λdat = _getdata(λ) @@ -410,6 +410,10 @@ function setθ!(bsamp::MixedModelFitCollection, i::Integer) return bsamp end +function setθ!(bsamp::MixedModelFitCollection, i::Integer) + return setθ!(bsamp, bsamp.θ[i]) +end + _getdata(x::Diagonal) = x _getdata(x::LowerTriangular) = x.data @@ -444,7 +448,7 @@ Return the shortest interval containing `level` proportion for each parameter fr a breaking change. """ function shortestcovint(bsamp::MixedModelFitCollection{T}, level=0.95) where {T} - allpars = bsamp.allpars + allpars = bsamp.allpars # TODO probably simpler to use .tbl instead of .allpars pars = unique(zip(allpars.type, allpars.group, allpars.names)) colnms = (:type, :group, :names, :lower, :upper) @@ -521,6 +525,7 @@ end """ tidyσs(bsamp::MixedModelFitCollection) + Return a tidy (row)table with the estimates of the variance components (on the standard deviation scale) spread into columns of `iter`, `group`, `column` and `σ`. """ @@ -544,40 +549,90 @@ function tidyσs(bsamp::MixedModelFitCollection{T}) where {T} return result end +_nρ(d::Diagonal) = 0 +_nρ(t::LowerTriangular) = kchoose2(size(t.data, 1)) + +function σρnms(λ) + σsyms = _generatesyms('σ', sum(first ∘ size, λ)) + ρsyms = _generatesyms('ρ', sum(_nρ, λ)) + val = sizehint!(Symbol[], length(σsyms) + length(ρsyms)) + for l in λ + for _ in axes(l, 1) + push!(val, popfirst!(σsyms)) + end + for _ in 1:_nρ(l) + push!(val, popfirst!(ρsyms)) + end + end + return val +end + +function _syms(bsamp::MixedModelBootstrap) + (; fits, λ) = bsamp + (; β, θ) = first(fits) + syms = [:obj] + append!(syms, _generatesyms('β', length(β))) + push!(syms, :σ) + append!(syms, σρnms(λ)) + return append!(syms, _generatesyms('θ', length(θ))) +end + +function σρ!(v::AbstractVector, d::Diagonal, σ) + return append!(v, σ .* d.diag) +end + +function σρ!(v::AbstractVector{T}, t::LowerTriangular{T}, σ::T) where {T} + """ + σρ!(v, t, σ) + push! `σ` times the row lengths (σs) and the inner products of normalized rows (ρs) of `t` onto `v` + """ + dat = t.data + for i in axes(dat, 1) + ssqr = zero(T) + for j in 1:i + ssqr += abs2(dat[i, j]) + end + len = sqrt(ssqr) + push!(v, σ * len) + if len > 0 + for j in 1:i + dat[i, j] /= len + end + end + end + for i in axes(dat, 1) + for j in 1:(i - 1) + s = zero(T) + for k in 1:i + s += dat[i, k] * dat[j, k] + end + push!(v, s) + end + end + return v +end + function pbstrtbl(bsamp::MixedModelFitCollection{T}) where {T} - (; fits, λ, inds) = bsamp - row1 = first(fits) - cnms = [:obj, :σ] - pos = Dict{Symbol,UnitRange{Int}}(:obj => 1:1, :σ => 2:2) - βsz = length(row1.β) - append!(cnms, _generatesyms('β', βsz)) - lastpos = 2 + βsz - pos[:β] = 3:lastpos - σsz = sum(m -> size(m, 1), bsamp.λ) - append!(cnms, _generatesyms('σ', σsz)) - pos[:σs] = (lastpos + 1):(lastpos + σsz) - lastpos += σsz - θsz = length(row1.θ) - append!(cnms, _generatesyms('θ', θsz)) - pos[:θ] = (lastpos + 1):(lastpos + θsz) - tblrowtyp = NamedTuple{(cnms...,),NTuple{length(cnms),T}} - val = sizehint!(tblrowtyp[], length(bsamp.fits)) - v = Vector{T}(undef, length(cnms)) - for (i, r) in enumerate(bsamp.fits) - v[1] = r.objective - v[2] = coalesce(r.σ, one(T)) - copyto!(view(v, pos[:β]), r.β) - fill!(view(v, pos[:σs]), zero(T)) - copyto!(view(v, pos[:θ]), r.θ) - setθ!(bsamp, i) - vpos = first(pos[:σs]) + (; fits, λ) = bsamp + λcp = copy.(λ) + syms = _syms(bsamp) + m = length(syms) + n = length(fits) + v = sizehint!(T[], m * n) + for f in fits + (; β, θ, σ) = f + push!(v, f.objective) + append!(v, β) + push!(v, σ) + setθ!(bsamp, θ) for l in λ - for λr in eachrow(l) - v[vpos] = r.σ * norm(λr) - vpos += 1 - end + σρ!(v, l, σ) end - push!(val, tblrowtyp(v)) + append!(v, θ) end - return val + m = permutedims(reshape(v, (m, n)), (2, 1)) # equivalent to collect(transpose(...)) + for k in eachindex(λ, λcp) # restore original contents of λ + copyto!(λ[k], λcp[k]) + end + return Table(Tables.table(m; header=syms)) end diff --git a/src/profile/utilities.jl b/src/profile/utilities.jl index 27f63c4a0..f5a6963ac 100644 --- a/src/profile/utilities.jl +++ b/src/profile/utilities.jl @@ -24,10 +24,12 @@ Utility to generate a vector of Symbols of the form : from a tag and The indices are left-padded with zeros to allow lexicographic sorting. """ -function _generatesyms(tag::Char, len::Integer) +function _generatesyms(tag::AbstractString, len::Integer) return Symbol.(string.(tag, lpad.(Base.OneTo(len), ndigits(len), '0'))) end +_generatesyms(tag::Char, len::Integer) = _generatesyms(string(tag), len) + function TableColumns(m::LinearMixedModel{T}) where {T} nmvec = [:ζ] positions = Dict(:ζ => 1:1) diff --git a/test/bootstrap.jl b/test/bootstrap.jl index f787a5d0c..9797a89ee 100644 --- a/test/bootstrap.jl +++ b/test/bootstrap.jl @@ -156,7 +156,11 @@ end pb0 = quickboot(m0) pb1 = quickboot(m1) savereplicates(io, pb1) - # wrong model`` + @test isa(pb0.tbl, Table) + @test isa(pb1.tbl, Table) # create tbl here to check it doesn't modify pb1 + @test ncol(DataFrame(pb1.β)) == 3 + + # wrong model @test_throws ArgumentError restorereplicates(seekstart(io), m0) # need to specify an eltype! @test_throws MethodError restorereplicates(seekstart(io), m1, MixedModelBootstrap) @@ -184,7 +188,7 @@ end @test pb1 == restorereplicates(seekstart(io), MixedModels.unfit!(deepcopy(m1))) @test pb1 ≈ restorereplicates(seekstart(io), m1, Float16) rtol=1 - end +end @testset "Bernoulli simulate! and GLMM bootstrap" begin contra = dataset(:contra)