Title: https://www.r-bloggers.com/maximize-your-expectations/
Libraries
Libraries = c("doMC")
# Install if not present
for(p in Libraries){
if(!require(p, character.only = TRUE)) { install.packages(p) }
library(p, character.only = TRUE)
}
# registerDoMC(cores = 3) # Start multi-processor mode
# registerDOSEQ()
Import Data & Data Handling
nSample <- 2000 # this is equal to the size of each class (0.5*total)
# x1a and x2a are features x1 and x2 for z=0 (x11 just seemed a tad much)
x1a <- rnorm(nSample, 10, 3)
x2a <- rnorm(nSample, 10, 3)
# x1b and x2b are features x1 and x2 for z=1
x1b <- rnorm(nSample, 16, 3)
x2b <- rnorm(nSample, 16, 3)
Machine Model
library(ggplot2)
# put the data into a dataframe
clusterData <- data.frame(x1 = c(x1a, x1b),
x2 = c(x2a, x2b),
class = c(rep("z = 0", nSample),
rep("z = 1", nSample)))
clusterData.gg <- ggplot(clusterData,
aes(x1 = x1, x2 = x2, type = class))
# plot the combined density in grey and then overlay with colored density
# for each class
clusterData.gg +
stat_density(aes(x = x1), alpha = 0.5) +
geom_density(aes(x = x1, color = class, fill = class), alpha = 0.5) +
theme(legend.position = "none") +
scale_x_continuous("Value of x1") +
scale_y_continuous("Density")
EOF
clusterData.gg +
stat_density(aes(x = x2), alpha = 0.5) +
geom_density(aes(x = x2, color = class, fill = class), alpha = 0.5) +
theme(legend.position = "none") +
scale_x_continuous("Value of x2") +
scale_y_continuous("Density")
library(mclust)
## Package 'mclust' version 5.4.3
## Type 'citation("mclust")' for citing this R package in publications.
## Package 'mclust' version 4.0
# put the data into a matrix just to make using Mclust a bit easier
clusterMatrix <- matrix(NA, nrow = nSample * 2, ncol = 2)
clusterMatrix[, 1] <- c(x1a, x1b)
clusterMatrix[, 2] <- c(x2a, x2b)
# fits the EM model to the data
model <- Mclust(clusterMatrix)
model
## 'Mclust' model object: (EII,2)
##
## Available components:
## [1] "call" "data" "modelName" "n"
## [5] "d" "G" "BIC" "bic"
## [9] "loglik" "df" "hypvol" "parameters"
## [13] "z" "classification" "uncertainty"
summary(model)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust EII (spherical, equal volume) model with 2 components:
##
## log-likelihood n df BIC ICL
## -22189.33 4000 6 -44428.43 -45151.51
##
## Clustering table:
## 1 2
## 2036 1964
clusterPlots <- data.frame(x1 = clusterData$x1,
x2 = clusterData$x2,
cluster = factor(model$classification,
levels = c(1, 2), labels = c("z = 0", "z = 1")),
uncertainity = model$uncertainty,
logUncertainity = log(model$uncertainty))
clusterPlots.gg <- ggplot(clusterPlots)
clusterPlots.gg + geom_point(aes(x = x1, y = x2, color = cluster))
clusterPlots.gg +
geom_point(aes(x = x1, y = x2, color = logUncertainity))
clusterPlots$uncertainGroups <- ifelse(clusterPlots$uncertainity <= 0.2, "Highly certain",
ifelse(clusterPlots$uncertainity <= 0.4, "Moderately certain", ifelse(clusterPlots$uncertainity <=
0.5, "Weakly certain", ifelse(clusterPlots$uncertainity <= 0.6, "Weakly uncertain",
ifelse(clusterPlots$uncertainity <= 0.8, "Moderately uncertain", "Extremely uncertain")))))
clusterPlots.gg <- ggplot(clusterPlots)
clusterPlots.gg + geom_point(aes(x = x1, y = x2, color = uncertainGroups))
\[[ y_i = \beta_0 + \beta_{1, z}x_{1, i} + \beta_{2, z}x_{2, i} + \epsilon_i ]\]
# beta estimates for z = 1 are 0.3
b1a <- rnorm(nSample, 1, 0.05) # specific beta1 for z = 0
b1b <- rnorm(nSample, 1.3, 0.05) # specific beta1 for z = 1
b2a <- rnorm(nSample, 1, 0.05) # specific beta2 for z = 0
b2b <- rnorm(nSample, 1.3, 0.05) # specific beta2 for z = 1
intercept <- rnorm(2 * nSample, 3, 0.075) # intercept
e <- rnorm(2 * nSample, 0, 0.05) # error term
x1 <- c(x1a, x1b)
x2 <- c(x1b, x2b)
beta1 <- c(b1a, b1b)
beta2 <- c(b2a, b2b)
y <- intercept + beta1 * x1 + beta2 * x2 + e
naiveModel <- lm(y ~ x1 + x2)
summary(naiveModel)
##
## Call:
## lm(formula = y ~ x1 + x2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.7254 -2.2026 0.3328 2.4801 9.0478
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.64771 0.33002 -20.14 <2e-16 ***
## x1 1.87196 0.01248 150.01 <2e-16 ***
## x2 1.19503 0.01758 67.97 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.338 on 3997 degrees of freedom
## Multiple R-squared: 0.8711, Adjusted R-squared: 0.8711
## F-statistic: 1.351e+04 on 2 and 3997 DF, p-value: < 2.2e-16
AIC(naiveModel)
## [1] 21000.39
classOne <- model$classification == 1 # actually z = 1
emModel <- lm(y ~ x1 + I(x1 * classOne) + x2 + I(x2 * classOne))
summary(emModel)
##
## Call:
## lm(formula = y ~ x1 + I(x1 * classOne) + x2 + I(x2 * classOne))
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.1256 -0.7832 0.1265 1.0706 9.0211
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.70659 0.27259 9.929 < 2e-16 ***
## x1 1.20144 0.01697 70.789 < 2e-16 ***
## I(x1 * classOne) 0.14926 0.01890 7.897 3.66e-15 ***
## x2 0.91679 0.01474 62.187 < 2e-16 ***
## I(x2 * classOne) 0.31091 0.01537 20.231 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.335 on 3995 degrees of freedom
## Multiple R-squared: 0.937, Adjusted R-squared: 0.9369
## F-statistic: 1.486e+04 on 4 and 3995 DF, p-value: < 2.2e-16
AIC(emModel)
## [1] 18141.62
anova(naiveModel, emModel)
## Analysis of Variance Table
##
## Model 1: y ~ x1 + x2
## Model 2: y ~ x1 + I(x1 * classOne) + x2 + I(x2 * classOne)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 3997 44546
## 2 3995 21776 2 22769 2088.6 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
uncertainity <- model$uncertainty
emModelWeighted <- lm(y ~ x1 + I(x1 * classOne) + x2 + I(x2 * classOne),
weights = -log(uncertainity))
summary(emModelWeighted)
##
## Call:
## lm(formula = y ~ x1 + I(x1 * classOne) + x2 + I(x2 * classOne),
## weights = -log(uncertainity))
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -23.4399 -1.5720 -0.0338 1.5339 11.8261
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.55578 0.20117 12.71 <2e-16 ***
## x1 1.11214 0.01295 85.91 <2e-16 ***
## I(x1 * classOne) 0.20129 0.01443 13.95 <2e-16 ***
## x2 0.97391 0.01112 87.59 <2e-16 ***
## I(x2 * classOne) 0.32472 0.01134 28.64 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.543 on 3995 degrees of freedom
## Multiple R-squared: 0.9757, Adjusted R-squared: 0.9756
## F-statistic: 4.003e+04 on 4 and 3995 DF, p-value: < 2.2e-16
AIC(emModelWeighted)
## [1] 16409.87
#trueClassOne <- trueClass == 1 # z = 1
#cheatingModel <- lm(y ~ x1 + I(x1 * trueClassOne) + x2 + I(x2 * trueClassOne))
#summary(cheatingModel)