Finite Mixture Distribution, an S4 framework

Author

Tingting Zhan

Introduction

This vignette of package fmx (CRAN, Github) documents ..

Prerequisite

New features are first implemented on Github.

remotes::install_github('tingtingzhan/fmx')

And eventually make their way to CRAN.

utils::install.packages('fmx')

Note to Users

Examples in this vignette require that the search path has

library(fmx)
library(TukeyGH77)
library(sn)
#> Loading required package: stats4
#> 
#> Attaching package: 'sn'
#> The following object is masked from 'package:stats':
#> 
#>     sd
library(ggplot2)
#library(geomtextpath)

S4 Class 'fmx'

Definition

The S4 class stores the parameters of a mixture of distribution. Currently the normal distribution and Tukey’s GH distribution is supported.

Example below shows a mixture of two normal distributions N(0,1) and N(3,1.3) with (50\%, 50\%) mixing probability (i.e., the mixing probability w is normalize to so that the summation of w is 1).

(e1 = fmx('norm', mean = c(0,3), sd = c(1, .7), w = c(1, 1)))
#>          Parameters
#> norm      mean sd   w    
#>   1-comp. 0.00 1.00 50.0%
#>   2-comp. 3.00 0.70 50.0%
class(e1)
#> [1] "fmx"
#> attr(,"package")
#> [1] "fmx"
isS4(e1)
#> [1] TRUE

Example below shows a mixture of two Tukey’s g-and-h distributions (A,B,g,h)=(0,1,.2,.2) and (3,1,.3,.1) (i.e., by default the location parameter A=0 and the scale parameter B=1) with (20\%,30\%) mixing probability.

(e2 = fmx('GH', A = c(0,3), g = c(.2, .3), h = c(.2, .1), w = c(2, 3)))
#>          Parameters
#> GH        A    B    g    h    w    
#>   1-comp. 0.00 1.00 0.20 0.20 40.0%
#>   2-comp. 3.00 1.00 0.30 0.10 60.0%

The fmx framework also accepts a single-component distribution.

(e3 = fmx('GH', g = .2, h = .2))
#>          Parameters
#> GH        A    B    g    h   
#>   1-comp. 0.00 1.00 0.20 0.20

Density

Function dfmx()

par(mar = c(4, 5, 1, 0))
curve(dfmx(x, dist = e1), xlim = c(-3,7), ylab = 'Density')

ggplot() + 
  geom_function(fun = dfmx, args = list(dist = e1), xlim = c(-3,7))

Probability

par(mar = c(4, 5, 1, 0))
curve(pfmx(x, dist = e1), xlim = c(-3,7), ylab = 'Probability')

ggplot() + 
  geom_function(fun = pfmx, args = list(dist = e1), xlim = c(-3,7)) + 
  scale_y_continuous(labels = scales::label_percent())

Quantile

par(mar = c(4, 5, 1, 0))
curve(qfmx(x, dist = e1), xlim = c(.001,.999), ylab = 'Quantile')

ggplot() + 
  geom_function(fun = qfmx, args = list(dist = e1), n = 501L, xlim = c(.001, .999)) +
  scale_x_continuous(labels = scales::label_percent())

Simulation

par(mar = c(4, 5, 2, 0))
r1 = e1 |> rfmx(n = 1e3L) 
r1 |> hist(main = '1000 obs from e1')

new(Class = 'fmx', e1, data = r1)
#>          Parameters
#> norm      mean sd   w    
#>   1-comp. 0.00 1.00 50.0%
#>   2-comp. 3.00 0.70 50.0%
#> 
#> Number of Observations: 1000
#> 
#> 'log Lik.' -1841.197 (df=5)
#> 
#> Kolmogorov-Smirnov Distance: 0.03080
#> Cramer-Von Mises Distance: 0.12372
#> Kullback-Leibler divergence: 0.00640

Extract Subset

(d = fmx('norm', mean = c(1, 4, 7), w = c(1, 1, 1)))
#>          Parameters
#> norm      mean sd   w    
#>   1-comp. 1.00 1.00 33.3%
#>   2-comp. 4.00 1.00 33.3%
#>   3-comp. 7.00 1.00 33.3%
d[1:2]
#>          Parameters
#> norm      mean sd   w    
#>   1-comp. 1.00 1.00 50.0%
#>   2-comp. 4.00 1.00 50.0%

Multiple 'fmx' Objects

ggplot() + 
 geom_function(fun = qfmx, args = list(dist=e1), mapping = aes(color='Normal')) + 
 geom_function(fun = qfmx, args = list(dist=e2), mapping = aes(color='Tukey gh')) + 
 scale_x_continuous(labels = scales::label_percent(), limits = c(.001, .999)) +
 labs(x = NULL, y = 'Quantile', color = 'Mixture\nModels')

Tukey’s g-&-h Mixture

Consider three Tukey’s g-&-h mixture distributions.

(d0 = fmx('GH', A = c(1,4), g = c(.2,.1), h = c(.05,.1), w = c(1,1)))
#>          Parameters
#> GH        A    B    g    h    w    
#>   1-comp. 1.00 1.00 0.20 0.05 50.0%
#>   2-comp. 4.00 1.00 0.10 0.10 50.0%
(d1 = fmx('GH', A = c(1,4), g = c(.2,0), h = c(0,.1), w = c(1,1)))
#>          Parameters
#> GH        A    B    g    h    w    
#>   1-comp. 1.00 1.00 0.20 .    50.0%
#>   2-comp. 4.00 1.00 .    0.10 50.0%
(d2 = fmx('GH', A = c(1,4), g = c(.2,0), h = c(.15,.1), w = c(1,1)))
#>          Parameters
#> GH        A    B    g    h    w    
#>   1-comp. 1.00 1.00 0.20 0.15 50.0%
#>   2-comp. 4.00 1.00 .    0.10 50.0%

gh-Constraints

d0 |> fmx_constraint()
#> integer(0)
d1 |> fmx_constraint()
#> [1] 6 7
#> attr(,"user")
#> [1] "g2" "h1"
#> attr(,"tex")
#> [1] "g_{2}" "h_{1}"
#> attr(,"gid")
#> [1] 2
#> attr(,"hid")
#> [1] 1
d2 |> fmx_constraint()
#> [1] 6
#> attr(,"user")
#> [1] "g2"
#> attr(,"tex")
#> [1] "g_{2}"
#> attr(,"gid")
#> [1] 2
#> attr(,"hid")
#> integer(0)
getTeX(d0)
#> [1] "Full GH2"
getTeX(d1)
#> [1] "$g_{2}=h_{1}=0$"
getTeX(d2)
#> [1] "$g_{2}=0$"

Moment per Component

From 'fmx' to moments

Function moment_fmx()

d2 |> moment_fmx()
#> Mean: 1.129, 4.000
#> Standard Deviation: 1.367, 1.182
#> Skewness: 1.601, 0.000
#> (Excess) Kurtosis: 17.736, 2.508

From moments to 'fmx'

Function moment2fmx()

m = c(-1.5, 1.5)
s = c(.9, 1.1)
sk = c(.2, -.3)
kt = c(.5, .75)
w = c(2, 3)
(m1 = moment2fmx(distname = 'GH', w = w, mean = m, sd = s, skewness = sk, kurtosis = kt))
#>          Parameters
#> GH        A     B    g     h    w    
#>   1-comp. -1.53 0.86 0.06  0.03 40.0%
#>   2-comp. 1.55  1.03 -0.08 0.04 60.0%
moment_fmx(m1)
#> Mean: -1.500, 1.500
#> Standard Deviation: 0.900, 1.100
#> Skewness: 0.200, -0.300
#> (Excess) Kurtosis: 0.500, 0.750
(m2 = moment2fmx(distname = 'st', w = w, mean = m, sd = s, skewness = sk, kurtosis = kt))
#>          Parameters
#> st        xi    omega alpha nu    w    
#>   1-comp. -2.01 0.98  0.81  18.12 40.0%
#>   2-comp. 2.20  1.21  -0.95 14.04 60.0%
moment_fmx(m2)
#> Distribution: st
#> Mean: -1.500, 1.500
#> Standard Deviation: 0.900, 1.100
#> Skewness: 0.200, -0.300
#> (Excess) Kurtosis: 0.500, 0.750
ggplot() + 
 geom_function(aes(color = 'GH'), fun = dfmx, args = list(dist=m1), n = 1001L) + 
 geom_function(aes(color = 'st'), fun = dfmx, args = list(dist=m2), n = 1001L) +
 xlim(-5, 6)

Two curves looks really close, but actually not identical.

x = seq.int(from = -5L, to = 6L, length.out = 1001L)
all.equal.numeric(dfmx(x, dist = m1), dfmx(x, dist = m2))
#> [1] "Attributes: < Component \"posterior\": Mean relative difference: 0.002626751 >"
#> [2] "Mean relative difference: 0.00224298"

Appendix

Terms and Abbreviations

Term / Abbreviation Description
|> Forward pipe operator introduced in R 4.1.0
CRAN, R The Comprehensive R Archive Network
curve Function plots
fitdist Mixture of distribution from package fitdistrplus