Maximize Your Expectations
Source: https://www.r-bloggers.com/maximize-your-expectations/

Libraries

Libraries <- c("doMC")

# Install if not present
for(p in Libraries){
    if(!require(p, character.only = TRUE)) { install.packages(p) }
    library(p, character.only = TRUE)
}

# registerDoMC(cores = 3) # Start multi-processor mode
# registerDoSEQ()         # Or register the sequential backend instead

Import Data & Data Handling

nSample <- 2000  # this is equal to the size of each class (0.5*total)

# x1a and x2a are features x1 and x2 for z=0 (x11 just seemed a tad much)
x1a <- rnorm(nSample, 10, 3)
x2a <- rnorm(nSample, 10, 3)
# x1b and x2b are features x1 and x2 for z=1
x1b <- rnorm(nSample, 16, 3)
x2b <- rnorm(nSample, 16, 3)
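
As a quick sanity check (my addition, not in the original post), the sample means should land near the true means:

# sample means should sit near 10 for the z = 0 draws and 16 for the z = 1 draws
colMeans(cbind(x1a, x2a))  # roughly 10, 10
colMeans(cbind(x1b, x2b))  # roughly 16, 16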

Machine Model

library(ggplot2)
# put the data into a dataframe
clusterData <- data.frame(x1 = c(x1a, x1b), 
                          x2 = c(x2a, x2b), 
                          class = c(rep("z = 0", nSample), 
                                    rep("z = 1", nSample)))
# base plot object; each layer below supplies its own aes() mapping
clusterData.gg <- ggplot(clusterData)

# plot the combined density in grey and then overlay with colored density
# for each class
clusterData.gg + 
    stat_density(aes(x = x1), alpha = 0.5) + 
    geom_density(aes(x = x1, color = class, fill = class), alpha = 0.5) +
    theme(legend.position = "none") + 
    scale_x_continuous("Value of x1") + 
    scale_y_continuous("Density")


clusterData.gg + 
    stat_density(aes(x = x2), alpha = 0.5) + 
    geom_density(aes(x = x2, color = class, fill = class), alpha = 0.5) +
    theme(legend.position = "none") + 
    scale_x_continuous("Value of x2") + 
    scale_y_continuous("Density")

library(mclust)
## Package 'mclust' version 5.4.3
## Type 'citation("mclust")' for citing this R package in publications.

# put the data into a matrix just to make using Mclust a bit easier
clusterMatrix <- matrix(NA, nrow = nSample * 2, ncol = 2)
clusterMatrix[, 1] <- c(x1a, x1b)
clusterMatrix[, 2] <- c(x2a, x2b)
# fits the EM model to the data
model <- Mclust(clusterMatrix)
model
## 'Mclust' model object: (EII,2) 
## 
## Available components: 
##  [1] "call"           "data"           "modelName"      "n"             
##  [5] "d"              "G"              "BIC"            "bic"           
##  [9] "loglik"         "df"             "hypvol"         "parameters"    
## [13] "z"              "classification" "uncertainty"
summary(model)
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust EII (spherical, equal volume) model with 2 components: 
## 
##  log-likelihood    n df       BIC       ICL
##       -22189.33 4000  6 -44428.43 -45151.51
## 
## Clustering table:
##    1    2 
## 2036 1964
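
It can also be worth pulling the fitted mixture parameters out directly; the component names below follow mclust's documented object structure:

# estimated component means, shared spherical variance, and mixing weights
model$parameters$mean              # one column of means per component
model$parameters$variance$sigmasq  # common variance under the EII model
model$parameters$pro               # mixing proportions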
clusterPlots <- data.frame(x1 = clusterData$x1, 
                           x2 = clusterData$x2, 
                           cluster = factor(model$classification, 
                                            levels = c(1, 2), 
                                            labels = c("z = 0", "z = 1")), 
                           uncertainty = model$uncertainty, 
                           logUncertainty = log(model$uncertainty))

clusterPlots.gg <- ggplot(clusterPlots)
clusterPlots.gg + geom_point(aes(x = x1, y = x2, color = cluster))

clusterPlots.gg + 
    geom_point(aes(x = x1, y = x2, color = logUncertainty))

clusterPlots$uncertainGroups <- 
    ifelse(clusterPlots$uncertainty <= 0.2, "Highly certain", 
    ifelse(clusterPlots$uncertainty <= 0.4, "Moderately certain", 
    ifelse(clusterPlots$uncertainty <= 0.5, "Weakly certain", 
    ifelse(clusterPlots$uncertainty <= 0.6, "Weakly uncertain", 
    ifelse(clusterPlots$uncertainty <= 0.8, "Moderately uncertain", 
                                            "Extremely uncertain")))))
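
The nested ifelse() chain can be written more compactly with base R's cut(), using the same breaks; note that with two components the uncertainty (one minus the largest posterior probability) can never exceed 0.5, so the bins above 0.5 stay empty here. A sketch:

# equivalent binning with cut(); same breaks and labels as above
clusterPlots$uncertainGroups <- cut(clusterPlots$uncertainty, 
    breaks = c(-Inf, 0.2, 0.4, 0.5, 0.6, 0.8, Inf), 
    labels = c("Highly certain", "Moderately certain", "Weakly certain", 
               "Weakly uncertain", "Moderately uncertain", "Extremely uncertain"))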

clusterPlots.gg <- ggplot(clusterPlots)
clusterPlots.gg + geom_point(aes(x = x1, y = x2, color = uncertainGroups))

\[ y_i = \beta_0 + \beta_{1,z} x_{1,i} + \beta_{2,z} x_{2,i} + \epsilon_i \]
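
In the simulation below the intercept is centered at 3, the slopes at 1 for z = 0 and 1.3 for z = 1 (each observation gets its own slightly noisy draw of the coefficients), and the error term is drawn from N(0, 0.05).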

# betas for z = 1 are 0.3 larger than for z = 0
b1a <- rnorm(nSample, 1, 0.05)  # specific beta1 for z = 0
b1b <- rnorm(nSample, 1.3, 0.05)  # specific beta1 for z = 1
b2a <- rnorm(nSample, 1, 0.05)  # specific beta2 for z = 0
b2b <- rnorm(nSample, 1.3, 0.05)  # specific beta2 for z = 1
intercept <- rnorm(2 * nSample, 3, 0.075)  # intercept
e <- rnorm(2 * nSample, 0, 0.05)  # error term

x1 <- c(x1a, x1b)
x2 <- c(x2a, x2b)
beta1 <- c(b1a, b1b)
beta2 <- c(b2a, b2b)

y <- intercept + beta1 * x1 + beta2 * x2 + e
naiveModel <- lm(y ~ x1 + x2)
summary(naiveModel)
## 
## Call:
## lm(formula = y ~ x1 + x2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.7254  -2.2026   0.3328   2.4801   9.0478 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -6.64771    0.33002  -20.14   <2e-16 ***
## x1           1.87196    0.01248  150.01   <2e-16 ***
## x2           1.19503    0.01758   67.97   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.338 on 3997 degrees of freedom
## Multiple R-squared:  0.8711, Adjusted R-squared:  0.8711 
## F-statistic: 1.351e+04 on 2 and 3997 DF,  p-value: < 2.2e-16
AIC(naiveModel)
## [1] 21000.39
classOne <- model$classification == 1  # cluster 1 turns out to correspond to the true z = 1

emModel <- lm(y ~ x1 + I(x1 * classOne) + x2 + I(x2 * classOne))

summary(emModel)
## 
## Call:
## lm(formula = y ~ x1 + I(x1 * classOne) + x2 + I(x2 * classOne))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.1256  -0.7832   0.1265   1.0706   9.0211 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       2.70659    0.27259   9.929  < 2e-16 ***
## x1                1.20144    0.01697  70.789  < 2e-16 ***
## I(x1 * classOne)  0.14926    0.01890   7.897 3.66e-15 ***
## x2                0.91679    0.01474  62.187  < 2e-16 ***
## I(x2 * classOne)  0.31091    0.01537  20.231  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.335 on 3995 degrees of freedom
## Multiple R-squared:  0.937,  Adjusted R-squared:  0.9369 
## F-statistic: 1.486e+04 on 4 and 3995 DF,  p-value: < 2.2e-16
AIC(emModel)
## [1] 18141.62
anova(naiveModel, emModel)
## Analysis of Variance Table
## 
## Model 1: y ~ x1 + x2
## Model 2: y ~ x1 + I(x1 * classOne) + x2 + I(x2 * classOne)
##   Res.Df   RSS Df Sum of Sq      F    Pr(>F)    
## 1   3997 44546                                  
## 2   3995 21776  2     22769 2088.6 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
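
As an aside (my sketch, not from the original post), the I() terms can equivalently be written with R's : interaction operator on a numeric 0/1 indicator, which reproduces the same coefficients:

# same model as emModel, written with an explicit numeric indicator
z1 <- as.numeric(classOne)  # 0/1 version of the cluster indicator
emModelAlt <- lm(y ~ x1 + x2 + x1:z1 + x2:z1)  # identical fit to emModel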
uncertainty <- model$uncertainty

emModelWeighted <- lm(y ~ x1 + I(x1 * classOne) + x2 + I(x2 * classOne), 
                      weights = -log(uncertainty))
summary(emModelWeighted)
## 
## Call:
## lm(formula = y ~ x1 + I(x1 * classOne) + x2 + I(x2 * classOne), 
##     weights = -log(uncertainty))
## 
## Weighted Residuals:
##      Min       1Q   Median       3Q      Max 
## -23.4399  -1.5720  -0.0338   1.5339  11.8261 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       2.55578    0.20117   12.71   <2e-16 ***
## x1                1.11214    0.01295   85.91   <2e-16 ***
## I(x1 * classOne)  0.20129    0.01443   13.95   <2e-16 ***
## x2                0.97391    0.01112   87.59   <2e-16 ***
## I(x2 * classOne)  0.32472    0.01134   28.64   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.543 on 3995 degrees of freedom
## Multiple R-squared:  0.9757, Adjusted R-squared:  0.9756 
## F-statistic: 4.003e+04 on 4 and 3995 DF,  p-value: < 2.2e-16
AIC(emModelWeighted)
## [1] 16409.87
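
One caveat with the -log(uncertainty) weights: if a point is classified with numerical certainty, its uncertainty can underflow to 0 and its weight becomes infinite, which lm() will choke on. A defensive variant (my assumption, not in the original post):

# clamp uncertainties away from zero so no weight is infinite
safeWeights <- -log(pmax(uncertainty, .Machine$double.eps))
emModelWeightedSafe <- lm(y ~ x1 + I(x1 * classOne) + x2 + I(x2 * classOne), 
                          weights = safeWeights)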
# benchmark: the "cheating" model that uses the true class labels
trueClass <- c(rep(0, nSample), rep(1, nSample))  # true z for each observation
trueClassOne <- trueClass == 1  # z = 1
cheatingModel <- lm(y ~ x1 + I(x1 * trueClassOne) + x2 + I(x2 * trueClassOne))
summary(cheatingModel)