errGLM <- function(data.df, kfolds = 10){
folds <- sample(1:kfolds, nrow(data.df), rep = T)
err <- rep(0, kfolds)
for(k in 1:kfolds){
train.df <- data.df[folds != k,]
test.df <- data.df[folds == k,]
mod <- glm(unhappy ~ . - unhappy,
data = train.df,
family = binomial)
vals <- predict(mod, newdata = test.df,
type = "response")
thresh.pred <- .5
test.df <- test.df %>%
mutate(prob.logs = vals,
pred.log = ifelse(prob.logs < thresh.pred, 0, 1))
err[k] <- with(test.df, mean(unhappy != pred.log))
}
mean(err)
}
(err.glm.fm <- errGLM(happy.df))## [1] 0.1236428
(null.fm <- nrow(happy.df %>%
filter(unhappy == 1)) / nrow(happy.df))## [1] 0.1253195
lambda.grid <- 10^seq(-3, 1, length = 50)
err.ridge <- function(data.df, M = 10){
err <- rep(0, M)
samps <- sample(1:nrow(data.df),
nrow(data.df),
rep = T)
train.df <- data.df[samps,]
test.df <- data.df[-samps,]
x.train <- model.matrix(~. -1 -unhappy, train.df)
y.train <- train.df$unhappy
x.test <- model.matrix(~. -1 -unhappy, test.df)
y.test <- test.df$unhappy
cv.ridge <- cv.glmnet(x.train, y.train,
alpha = 0, intercept = F,
family = "binomial",
type.measure = "class",
lambda = lambda.grid)
good.lam <- cv.ridge$lambda.min
for (k in 1:M){
samps <- sample(1:nrow(data.df),
nrow(data.df),
rep = T)
train.df <- data.df[samps,]
test.df <- data.df[-samps,]
x.train <- model.matrix(~. -1 -unhappy, train.df)
y.train <- train.df$unhappy
x.test <- model.matrix(~. -1 -unhappy, test.df)
y.test <- test.df$unhappy
mod.ridge <- glmnet(x.train, y.train,
family = "binomial",
alpha = 0, intercept = F,
lambda = good.lam)
pred <- as.numeric(
predict(mod.ridge, newx = x.test, type = "class"))
err[k] <- mean(y.test != pred)
}
mean(err)
}
(err.ridge.fm <- err.ridge(happy.df))## [1] 0.1230974
err.lasso <- function(data.df, M = 10){
err <- rep(0, M)
samps <- sample(1:nrow(data.df),
nrow(data.df),
rep = T)
train.df <- data.df[samps,]
test.df <- data.df[-samps,]
x.train <- model.matrix(~ . -1 -unhappy, train.df)
y.train <- train.df$unhappy
x.test <- model.matrix(~ . -1 -unhappy, test.df)
y.test <- test.df$unhappy
cv.lasso <- cv.glmnet(x.train, y.train,
alpha = 1, intercept = F,
family = "binomial",
type.measure = "class",
lambda = lambda.grid)
good.lam <- cv.lasso$lambda.min
for(k in 1:M){
samps <- sample(1:nrow(data.df),
nrow(data.df),
rep = T)
train.df <- data.df[samps,]
test.df <- data.df[-samps,]
x.train <- model.matrix(~ . -1 -unhappy, train.df)
y.train <- train.df$unhappy
x.test <- model.matrix(~ . -1 -unhappy, test.df)
y.test <- test.df$unhappy
mod.lasso <- glmnet(x.train, y.train,
family = "binomial",
alpha = 1, intercept = F,
lambda = good.lam)
pred <- as.numeric(predict(mod.lasso, newx = x.test, type = "class"))
err[k] <- mean(y.test != pred)
}
mean(err)
}
(err.lasso.fm <- err.lasso(happy.df))## [1] 0.1239529
p <- ncol(happy.df) - 1
errRF <- function(data.df, kfolds = 8){
folds <- sample(1:kfolds, nrow(data.df), rep = T)
err <- rep(0, kfolds)
for (k in 1:kfolds){
train.df <- data.df[folds != k,] %>%
mutate(unhappy = as.factor(unhappy))
test.df <- data.df[folds == k,] %>%
mutate(unhappy = as.factor(unhappy))
mod <- randomForest(unhappy ~.,
data = train.df,
mtry = p,
ntree = 100)
preds <- predict(mod, newdata = test.df, type = "response")
err[k] <- with(test.df, mean((unhappy != preds)))
}
min(err)
}
(err.rf.fm <- errRF(happy.df))## [1] 0.1399093
errNET <- function(data.df, M = 10){
err <- numeric(2)
for (k in 1:M){
samps <- sample(1:nrow(data.df), nrow(data.df), rep = T)
train.df <- data.df[samps,] %>%
mutate(unhappy = as.factor(unhappy))
test.df <- data.df[-samps,] %>%
mutate(unhappy = as.factor(unhappy))
mod <- nnet(unhappy ~ . - unhappy, data = train.df, size = 7, trace = F)
preds <- as.factor(predict(mod, newdata = test.df, type = "class"))
err[k] <- with(test.df %>%
mutate(preds = preds), mean(as.numeric(preds) != as.numeric(unhappy)))
}
err <- err[err < max(err)]
err <- err[err < max(err)]
mean(err)
}
(err.net.fm <- errNET(happy.df))## [1] 0.1241871
errs.df## err type diff.null
## 1 0.1236428 Logistic Reg -0.001676680
## 2 0.1230974 Ridge Reg -0.002222027
## 3 0.1239529 Lasso -0.001366564
## 4 0.1399093 Random Forest 0.014589824
## 5 0.1241871 Neural Net -0.001132392
## 6 0.1253195 Null Rate 0.000000000
Our Random Forest Model does not beat the null rate but the other models do (barely). It seems that this data set does not perform well when attempting to predict whether or not someone reports to be unhappy, but we can still continue with our analysis and see what interesting insight we can gain within this model.
cv.lasso <- cv.glmnet(x.train, y.train,
alpha = 1, intercept = F,
family = "binomial",
type.measure = "class",
lambda = lambda.grid)
coef.lasso <- coef(cv.lasso)[-1,1]
coef.lasso## year age sexmale
## -1.540407e-04 -8.474142e-03 2.399391e-01
## sexfemale maritalnever married maritaldivorced
## 0.000000e+00 5.170270e-01 8.248469e-01
## maritalwidowed maritalseparated degreehigh school
## 8.600762e-01 1.121584e+00 -7.820152e-02
## degreejunior college degreebachelor degreegraduate
## -2.064193e-01 -2.672141e-01 -2.118292e-01
## finrelabelow average finrelaaverage finrelaabove average
## -3.571603e-01 -8.779958e-01 -1.036610e+00
## finrelafar above average healthfair healthgood
## -5.761086e-01 -3.973487e-01 -1.172534e+00
## healthexcellent wtssall
## -1.528277e+00 -7.540278e-05
We see that all variables from the cross validated lasso model have nonzero coefficients besides sexfemale, which would lead us to think we should use all variables in the model except maybe sex. We can also look to see the part that the variables play in the variable importance plot from our random forest model.
mod.rf <- randomForest(unhappy ~.,
data = train.df,
mtry = p,
ntree = 100,
importance = T)grid.arrange(p1, p2, ncol = 2,
top = "Variable Importance Plot")In both metrics of the variable importance plot for our random forest model, we see that sex falls last. In the Mean Decrease Accuracy plot sex is very far behind the next lowest variable.
## [1] 1
## [1] 0.1252583
## [1] 1 5
## [1] 0.1253157
## [1] 1 5 7
## [1] 0.125271
## [1] 1 5 7 6
## [1] 0.1244052
## [1] 1 5 7 6 4
## [1] 0.1233379
## [1] 1 5 7 6 4 8
## [1] 0.1234015
## [1] 1 5 7 6 4 8 3
## [1] 0.1235501
## [1] 1 5 7 6 4 8 3 2
## [1] 0.1238131
This time we see that the second to last variable added was variable 3 (sex), and between step 6 and step 7, when sex was added to the model, our error rate very marginally increased. We are starting to see a pattern across a few models that sex tends to play a small role in the prediction of happiness. We will run one last test to determine if sex really contributes to the accuracty of the prediction across our different models.
sex for each modelhappy.df1 <- happy.df %>%
dplyr::select(-sex)err.df## errs model with.sex perc.improvement
## 1 0.1236428 Logistic Reg Yes 0.168
## 2 0.1230974 Ridge Reg Yes 0.222
## 3 0.1239529 Lasso Yes 0.137
## 4 0.1399093 Random Forest Yes -1.459
## 5 0.1241871 Neural Net Yes 0.113
## 6 0.1253195 Null Rate Null 0.000
## 7 0.1237333 Logistic Reg No 0.159
## 8 0.1230132 Ridge Reg No 0.231
## 9 0.1243874 Lasso No 0.093
## 10 0.1373941 Random Forest No -1.207
## 11 0.1239617 Neural Net No 0.136
## 12 0.1253195 Null Rate Null 0.000
plot.wo.sexIt seems the removal of sex marginally improved our models but the improvement is virtually unnoticable. It did take us down to the lowest error rate we’ve seen in ridge regression but still only a difference of fractions of a percent below the Null Rate.
happy.df.new <- happy %>%
na.omit()lambda.grid <- 10^seq(-3, 1, length = 20)
err.ridge1 <- function(data.df, M = 10){
err <- rep(0, M)
samps <- sample(1:nrow(data.df),
nrow(data.df),
rep = T)
train.df <- data.df[samps,]
test.df <- data.df[-samps,]
x.train <- model.matrix(~. -1 -happy, train.df)
y.train <- train.df$happy
x.test <- model.matrix(~. -1 -happy, test.df)
y.test <- test.df$happy
cv.ridge <- cv.glmnet(x.train, y.train,
alpha = 0, intercept = F,
family = "multinomial",
lambda = lambda.grid)
good.lam <- cv.ridge$lambda.min
for (k in 1:M){
samps <- sample(1:nrow(data.df),
nrow(data.df),
rep = T)
train.df <- data.df[samps,]
test.df <- data.df[-samps,]
x.train <- model.matrix(~. -1 -happy, train.df)
y.train <- train.df$happy
x.test <- model.matrix(~. -1 -happy, test.df)
y.test <- test.df$happy
mod.ridge <- glmnet(x.train, y.train,
family = "multinomial",
alpha = 0, intercept = F,
lambda = good.lam)
pred <- predict(mod.ridge, newx = x.test, type = "class")
err[k] <- mean(y.test != pred)
}
mean(err)
}
(err.ridge.class <- err.ridge1(happy.df.new))## [1] 0.4164482
err.lasso1 <- function(data.df, M = 10){
err <- rep(0, M)
samps <- sample(1:nrow(data.df),
nrow(data.df),
rep = T)
train.df <- data.df[samps,]
test.df <- data.df[-samps,]
x.train <- model.matrix(~ . -1 -happy, train.df)
y.train <- train.df$happy
x.test <- model.matrix(~ . -1 -happy, test.df)
y.test <- test.df$happy
cv.lasso <- cv.glmnet(x.train, y.train,
alpha = 1, intercept = F,
family = "multinomial",
lambda = lambda.grid)
good.lam <- cv.lasso$lambda.min
for(k in 1:M){
samps <- sample(1:nrow(data.df),
nrow(data.df),
rep = T)
train.df <- data.df[samps,]
test.df <- data.df[-samps,]
x.train <- model.matrix(~ . -1 -happy, train.df)
y.train <- train.df$happy
x.test <- model.matrix(~ . -1 -happy, test.df)
y.test <- test.df$happy
mod.lasso <- glmnet(x.train, y.train,
family = "multinomial",
alpha = 1, intercept = F,
lambda = good.lam)
pred <- predict(mod.lasso, newx = x.test, type = "class")
err[k] <- mean(y.test != pred)
}
mean(err)
}
(err.lasso.class <- err.lasso1(happy.df.new))## [1] 0.4165599
p <- ncol(happy.df.new) - 1
errRF1 <- function(data.df, M = 10){
err <- rep(0, M)
for (k in 1:M){
samps <- sample(1:nrow(data.df),
nrow(data.df)/2,
rep = F)
train.df <- data.df[samps,] %>%
mutate(happy = as.factor(happy))
test.df <- data.df[-samps,] %>%
mutate(happy = as.factor(happy))
mod <- randomForest(happy ~.,
data = train.df,
mtry = p,
ntree = 100)
preds <- predict(mod, newdata = test.df, type = "class")
err[k] <- with(test.df, mean((happy != preds)))
}
min(err)
}
(err.rf.happy <- errRF1(happy.df.new))## [1] 0.4499196
(null.class <- nrow(happy.df.new %>%
filter(happy == "pretty happy")
)/ nrow(happy.df.new))## [1] 0.5542027
set.seed(2334)
errNET <- function(data.df, M = 10){
err <- numeric(2)
for (k in 1:M){
samps <- sample(1:nrow(data.df), nrow(data.df), rep = T)
train.df <- data.df[samps,]
test.df <- data.df[-samps,]
mod <- nnet(happy ~ . - happy, data = train.df, size = 7, trace = F)
preds <- predict(mod, newdata = test.df, type = "class")
err[k] <- with(test.df %>%
mutate(preds = preds), mean(preds != happy))
}
err <- err[err < max(err)]
err <- err[err < max(err)]
mean(err)
}
(err.net.class <- errNET(happy.df.new))## [1] 0.439922
errs.df## err model perc.improve
## 1 0.4164482 Ridge Reg 13.775
## 2 0.4165599 Lasso 13.764
## 3 0.4499196 Random Forest 10.428
## 4 0.4399220 Neural Net 11.428
## 5 0.5542027 Null Rate 0.000
Now that we are predicting using 3 level classification which is the intention of the data, we see an increase in the percent improvement of the model classification error rate from the null rate. If we were doing a project that needed to use these predictions to make some sort of decision in the real world, we would probably want to use a cost function to assess each type of model by the misclassification cost instead of just misclassification rate.
Suppose we were using this data to predict a person’s perceived happiness level in order to offer treatment for increased risk of depression. If this were the case, our biggest cost would come from predicting Very Happy when in fact the subject is Not Too Happy, there would also be a cost of predicting Pretty Happy when the actual is Not Too Happy, and there might be a reward when correctly identifying someone as Not Too Happy.
Here is an arbitrary cost function table.
## pred actual cost
## 1 very happy pretty happy 5
## 2 very happy not too happy 50
## 3 very happy very happy 0
## 4 pretty happy very happy 0
## 5 pretty happy not too happy 20
## 6 pretty happy pretty happy -2
## 7 not too happy very happy 0
## 8 not too happy pretty happy 0
## 9 not too happy not too happy -10
calcCost <- function(df){
with(df %>%
inner_join(cost.df,
by = c("pred" = "pred",
"actual" = "actual")),
sum(cost))
}pred <- rep(as.factor("pretty happy"), nrow(happy.df.new))
actual <- happy.df.new$happy
null.cost <- calcCost(data.frame(pred = pred,
actual = actual))lambda.grid <- 10^seq(-3, 1, length = 20)
cost.ridge <- function(data.df, M = 10){
cost <- rep(0, M)
samps <- sample(1:nrow(data.df),
nrow(data.df),
rep = T)
train.df <- data.df[samps,]
test.df <- data.df[-samps,]
x.train <- model.matrix(~. -1 -happy, train.df)
y.train <- train.df$happy
x.test <- model.matrix(~. -1 -happy, test.df)
y.test <- test.df$happy
cv.ridge <- cv.glmnet(x.train, y.train,
alpha = 0, intercept = F,
family = "multinomial",
lambda = lambda.grid)
good.lam <- cv.ridge$lambda.min
for (k in 1:M){
samps <- sample(1:nrow(data.df),
nrow(data.df),
rep = T)
train.df <- data.df[samps,]
test.df <- data.df[-samps,]
x.train <- model.matrix(~. -1 -happy, train.df)
y.train <- train.df$happy
x.test <- model.matrix(~. -1 -happy, test.df)
y.test <- test.df$happy
mod.ridge <- glmnet(x.train, y.train,
family = "multinomial",
alpha = 0, intercept = F,
lambda = good.lam)
preds <- predict(mod.ridge, newx = x.test, type = "class")
coster.df <- data.frame(pred = preds,
actual = y.test) %>%
rename(pred = s0)
cost[k] <- calcCost(coster.df)
}
mean(cost)
}
(cost.ridge.fm <- cost.ridge(happy.df.new))## [1] 23223.7
cost.lasso <- function(data.df, M = 10){
cost <- rep(0, M)
samps <- sample(1:nrow(data.df),
nrow(data.df),
rep = T)
train.df <- data.df[samps,]
test.df <- data.df[-samps,]
x.train <- model.matrix(~ . -1 -happy, train.df)
y.train <- train.df$happy
x.test <- model.matrix(~ . -1 -happy, test.df)
y.test <- test.df$happy
cv.lasso <- cv.glmnet(x.train, y.train,
alpha = 1, intercept = F,
family = "multinomial",
lambda = lambda.grid)
good.lam <- cv.lasso$lambda.min
for(k in 1:M){
samps <- sample(1:nrow(data.df),
nrow(data.df),
rep = T)
train.df <- data.df[samps,]
test.df <- data.df[-samps,]
x.train <- model.matrix(~ . -1 -happy, train.df)
y.train <- train.df$happy
x.test <- model.matrix(~ . -1 -happy, test.df)
y.test <- test.df$happy
mod.lasso <- glmnet(x.train, y.train,
family = "multinomial",
alpha = 1, intercept = F,
lambda = good.lam)
preds <- predict(mod.lasso, newx = x.test, type = "class")
coster.df <- data.frame(pred = preds,
actual = y.test) %>%
rename(pred = s0)
cost[k] <- calcCost(coster.df)
}
mean(cost)
}
(cost.lasso.fm <- cost.lasso(happy.df.new))## [1] 25169.1
costRF <- function(data.df, M = 10){
cost <- rep(0, M)
for (k in 1:M){
samps <- sample(1:nrow(data.df),
nrow(data.df)/2,
rep = F)
train.df <- data.df[samps,] %>%
mutate(happy = as.factor(happy))
test.df <- data.df[-samps,] %>%
mutate(happy = as.factor(happy))
mod <- randomForest(happy ~.,
data = train.df,
mtry = p,
ntree = 100)
preds <- predict(mod, newdata = test.df, type = "class")
coster.df <- data.frame(pred = preds,
actual = test.df$happy)
cost[k] <- calcCost(coster.df)
}
min(cost)
}
(cost.rf.fm <- costRF(happy.df.new))## [1] 36834
costNET <- function(data.df, M = 10){
cost <- numeric(2)
for (k in 1:M){
samps <- sample(1:nrow(data.df), nrow(data.df), rep = T)
train.df <- data.df[samps,]
test.df <- data.df[-samps,]
mod <- nnet(happy ~ . - happy, data = train.df, size = 7, trace = F)
preds <- predict(mod, newdata = test.df, type = "class")
coster.df <- data.frame(pred = as.factor(preds),
actual = as.factor(test.df$happy))
cost[k] <- calcCost(coster.df)
}
mean(cost)
}
(cost.net.fm <- costNET(happy.df.new))## [1] 19723.9
costs.df## cost model perc.improve
## 1 23223.7 Ridge Reg 52.29510
## 2 25169.1 Lasso 48.29896
## 3 36834.0 Random Forest 24.33754
## 4 19723.9 Neural Net 59.48420
## 5 48682.0 Null Rate 0.00000
After including a cost factor to the analysis of our models, we start to gain confidence in the effects of our models as compared to the Null Rate. We see here that our models can lower the cost of misclassification by half of the null rate and that could help better target subjects for depression treatment.