ADM Project

Abraham Eyman Casey

09 April, 2019

Happiness Data from the General Social Survey annually from 1972-2006
Question 1: Which model provides the smallest classification error when predicting unhappiness as a binomial variable?
Question 2: How well can we predict the classifation of happpiness at the 3 class level given to us in the original dataset?
Including an arbitrary Cost Function

Happiness Data from the General Social Survey annually from 1972-2006

head(happy)

##   id         happy year age    sex       marital         degree
## 1  1 not too happy 1972  23 female never married       bachelor
## 2  2 not too happy 1972  70   male       married lt high school
## 3  3  pretty happy 1972  48 female       married    high school
## 4  4 not too happy 1972  27 female       married       bachelor
## 5  5  pretty happy 1972  61 female       married    high school
## 6  6  pretty happy 1972  26   male never married    high school
##         finrela    health wtssall
## 1       average      good  0.4446
## 2 above average      fair  0.8893
## 3       average excellent  0.8893
## 4       average      good  0.8893
## 5 above average      good  0.8893
## 6 above average      good  0.4446

Question 1: Which model provides the smallest classification error when predicting unhappiness as a binomial variable?

Logistic Regression (and cross validation)

errGLM <- function(data.df, kfolds = 10){
  folds <- sample(1:kfolds, nrow(data.df), rep = T)
  err <- rep(0, kfolds)
  for(k in 1:kfolds){
    train.df <- data.df[folds != k,]
    test.df <- data.df[folds == k,]
    mod <- glm(unhappy ~ . - unhappy,
               data = train.df, 
               family = binomial)
    vals <- predict(mod, newdata = test.df, 
                        type = "response")
    thresh.pred <- .5
    test.df <- test.df %>%
    mutate(prob.logs = vals,
           pred.log = ifelse(prob.logs < thresh.pred, 0, 1))
    err[k] <- with(test.df, mean(unhappy != pred.log))
  }
  mean(err)
}

(err.glm.fm <- errGLM(happy.df))

## [1] 0.1236428

(null.fm <- nrow(happy.df %>%
                             filter(unhappy == 1)) / nrow(happy.df))

## [1] 0.1253195

Ridge with glmnet (and bootstrapping)

lambda.grid <- 10^seq(-3, 1, length = 50)

err.ridge <- function(data.df, M = 10){
  err <- rep(0, M)
  samps <- sample(1:nrow(data.df),
                       nrow(data.df), 
                    rep = T)
    train.df <- data.df[samps,]
    test.df <- data.df[-samps,]
    x.train <- model.matrix(~. -1 -unhappy, train.df)
    y.train <- train.df$unhappy
    x.test <- model.matrix(~. -1 -unhappy, test.df)
    y.test <- test.df$unhappy
  cv.ridge <- cv.glmnet(x.train, y.train,
                          alpha = 0, intercept = F,
                          family = "binomial",
                          type.measure = "class",
                          lambda = lambda.grid)
    
  good.lam <- cv.ridge$lambda.min
  for (k in 1:M){
    samps <- sample(1:nrow(data.df),
                       nrow(data.df), 
                    rep = T)
    train.df <- data.df[samps,]
    test.df <- data.df[-samps,]
    x.train <- model.matrix(~. -1 -unhappy, train.df)
    y.train <- train.df$unhappy
    x.test <- model.matrix(~. -1 -unhappy, test.df)
    y.test <- test.df$unhappy
    
    mod.ridge <- glmnet(x.train, y.train, 
                        family = "binomial",
                    alpha = 0, intercept = F, 
                    lambda = good.lam)
    pred <- as.numeric(
      predict(mod.ridge, newx = x.test, type = "class"))
    err[k] <- mean(y.test != pred)
  }
  mean(err)
}

(err.ridge.fm <- err.ridge(happy.df))

## [1] 0.1230974

Lasso with glmnet (and bootstrapping)

err.lasso <- function(data.df, M = 10){
  err <- rep(0, M)
  samps <- sample(1:nrow(data.df),
                    nrow(data.df),
                    rep = T)
  train.df <- data.df[samps,]
  test.df <- data.df[-samps,]
  x.train <- model.matrix(~ . -1 -unhappy, train.df)
  y.train <- train.df$unhappy
  x.test <- model.matrix(~ . -1 -unhappy, test.df)
  y.test <- test.df$unhappy
  cv.lasso <- cv.glmnet(x.train, y.train,
                          alpha = 1, intercept = F,
                          family = "binomial",
                          type.measure = "class",
                          lambda = lambda.grid)
  good.lam <- cv.lasso$lambda.min
  
  for(k in 1:M){
    samps <- sample(1:nrow(data.df),
                    nrow(data.df),
                    rep = T)
    train.df <- data.df[samps,]
    test.df <- data.df[-samps,]
    x.train <- model.matrix(~ . -1 -unhappy, train.df)
    y.train <- train.df$unhappy
    x.test <- model.matrix(~ . -1 -unhappy, test.df)
    y.test <- test.df$unhappy
    
    mod.lasso <- glmnet(x.train, y.train,
                        family = "binomial",
                        alpha = 1, intercept = F,
                        lambda = good.lam)
    pred <- as.numeric(predict(mod.lasso, newx = x.test, type = "class"))
    err[k] <- mean(y.test != pred)
  }
  mean(err)
}

(err.lasso.fm <- err.lasso(happy.df))

## [1] 0.1239529

Random Forest

p <- ncol(happy.df) - 1

errRF <- function(data.df, kfolds = 8){
  folds <- sample(1:kfolds, nrow(data.df), rep = T)
  err <- rep(0, kfolds)
  for (k in 1:kfolds){
    train.df <- data.df[folds != k,] %>%
      mutate(unhappy = as.factor(unhappy))
    test.df <- data.df[folds == k,] %>%
      mutate(unhappy = as.factor(unhappy))
    mod <- randomForest(unhappy ~.,
                        data = train.df,
                        mtry = p,
                        ntree = 100)
    preds <- predict(mod, newdata = test.df, type = "response")
    err[k] <- with(test.df, mean((unhappy != preds)))
  }
  min(err)
}

(err.rf.fm <- errRF(happy.df))

## [1] 0.1399093

Feed-Forward Neural Networks

errNET <- function(data.df, M = 10){
  err <- numeric(2)
  for (k in 1:M){
    samps <- sample(1:nrow(data.df), nrow(data.df), rep = T)
    train.df <- data.df[samps,] %>%
      mutate(unhappy = as.factor(unhappy))
    test.df <- data.df[-samps,] %>%
      mutate(unhappy = as.factor(unhappy))
    mod <- nnet(unhappy ~ . - unhappy, data = train.df, size = 7, trace = F)
    preds <- as.factor(predict(mod, newdata = test.df, type = "class"))
    err[k] <- with(test.df %>%
       mutate(preds = preds), mean(as.numeric(preds) != as.numeric(unhappy)))
    }
   err <- err[err < max(err)]
   err <- err[err < max(err)]
   mean(err)
}

(err.net.fm <- errNET(happy.df))

## [1] 0.1241871

Comparing classification error from each model

errs.df

##         err          type    diff.null
## 1 0.1236428  Logistic Reg -0.001676680
## 2 0.1230974     Ridge Reg -0.002222027
## 3 0.1239529         Lasso -0.001366564
## 4 0.1399093 Random Forest  0.014589824
## 5 0.1241871    Neural Net -0.001132392
## 6 0.1253195     Null Rate  0.000000000

Our Random Forest Model does not beat the null rate but the other models do (barely). It seems that this data set does not perform well when attempting to predict whether or not someone reports to be unhappy, but we can still continue with our analysis and see what interesting insight we can gain within this model.

Variable Selection With lasso

cv.lasso <- cv.glmnet(x.train, y.train,
                          alpha = 1, intercept = F,
                          family = "binomial",
                          type.measure = "class",
                          lambda = lambda.grid)

coef.lasso <- coef(cv.lasso)[-1,1]
coef.lasso

##                     year                      age                  sexmale 
##            -1.540407e-04            -8.474142e-03             2.399391e-01 
##                sexfemale     maritalnever married          maritaldivorced 
##             0.000000e+00             5.170270e-01             8.248469e-01 
##           maritalwidowed         maritalseparated        degreehigh school 
##             8.600762e-01             1.121584e+00            -7.820152e-02 
##     degreejunior college           degreebachelor           degreegraduate 
##            -2.064193e-01            -2.672141e-01            -2.118292e-01 
##     finrelabelow average           finrelaaverage     finrelaabove average 
##            -3.571603e-01            -8.779958e-01            -1.036610e+00 
## finrelafar above average               healthfair               healthgood 
##            -5.761086e-01            -3.973487e-01            -1.172534e+00 
##          healthexcellent                  wtssall 
##            -1.528277e+00            -7.540278e-05

We see that all variables from the cross validated lasso model have nonzero coefficients besides sexfemale, which would lead us to think we should use all variables in the model except maybe sex. We can also look to see the part that the variables play in the variable importance plot from our random forest model.

Variable Importance from Random Forest

mod.rf <- randomForest(unhappy ~.,
                        data = train.df,
                        mtry = p,
                        ntree = 100,
                        importance = T)

grid.arrange(p1, p2, ncol = 2, 
             top = "Variable Importance Plot")

In both metrics of the variable importance plot for our random forest model, we see that sex falls last. In the Mean Decrease Accuracy plot sex is very far behind the next lowest variable.

Stepwise variable selection using logistic regression

## [1] 1
## [1] 0.1252583
## [1] 1 5
## [1] 0.1253157
## [1] 1 5 7
## [1] 0.125271
## [1] 1 5 7 6
## [1] 0.1244052
## [1] 1 5 7 6 4
## [1] 0.1233379
## [1] 1 5 7 6 4 8
## [1] 0.1234015
## [1] 1 5 7 6 4 8 3
## [1] 0.1235501
## [1] 1 5 7 6 4 8 3 2
## [1] 0.1238131

This time we see that the second to last variable added was variable 3 (sex), and between step 6 and step 7, when sex was added to the model, our error rate very marginally increased. We are starting to see a pattern across a few models that sex tends to play a small role in the prediction of happiness. We will run one last test to determine if sex really contributes to the accuracty of the prediction across our different models.

Comparison of error with and without `sex` for each model

happy.df1 <- happy.df %>%
  dplyr::select(-sex)

err.df

##         errs         model with.sex perc.improvement
## 1  0.1236428  Logistic Reg      Yes            0.168
## 2  0.1230974     Ridge Reg      Yes            0.222
## 3  0.1239529         Lasso      Yes            0.137
## 4  0.1399093 Random Forest      Yes           -1.459
## 5  0.1241871    Neural Net      Yes            0.113
## 6  0.1253195     Null Rate     Null            0.000
## 7  0.1237333  Logistic Reg       No            0.159
## 8  0.1230132     Ridge Reg       No            0.231
## 9  0.1243874         Lasso       No            0.093
## 10 0.1373941 Random Forest       No           -1.207
## 11 0.1239617    Neural Net       No            0.136
## 12 0.1253195     Null Rate     Null            0.000

plot.wo.sex

It seems the removal of sex marginally improved our models but the improvement is virtually unnoticable. It did take us down to the lowest error rate we’ve seen in ridge regression but still only a difference of fractions of a percent below the Null Rate.

Question 2: How well can we predict the classifation of happpiness at the 3 class level given to us in the original dataset?

happy.df.new <- happy %>%
  na.omit()

Ridge with glmnet (and bootstrapping)

lambda.grid <- 10^seq(-3, 1, length = 20)

err.ridge1 <- function(data.df, M = 10){
  err <- rep(0, M)
  samps <- sample(1:nrow(data.df),
                       nrow(data.df), 
                    rep = T)
  train.df <- data.df[samps,]
  test.df <- data.df[-samps,]
  x.train <- model.matrix(~. -1 -happy, train.df)
  y.train <- train.df$happy
  x.test <- model.matrix(~. -1 -happy, test.df)
  y.test <- test.df$happy
  cv.ridge <- cv.glmnet(x.train, y.train,
                          alpha = 0, intercept = F,
                          family = "multinomial",
                          lambda = lambda.grid)
    
  good.lam <- cv.ridge$lambda.min
  for (k in 1:M){
    samps <- sample(1:nrow(data.df),
                       nrow(data.df), 
                    rep = T)
    train.df <- data.df[samps,]
    test.df <- data.df[-samps,]
    x.train <- model.matrix(~. -1 -happy, train.df)
    y.train <- train.df$happy
    x.test <- model.matrix(~. -1 -happy, test.df)
    y.test <- test.df$happy
    
    mod.ridge <- glmnet(x.train, y.train, 
                        family = "multinomial",
                    alpha = 0, intercept = F, 
                    lambda = good.lam)
    pred <- predict(mod.ridge, newx = x.test, type = "class")
    err[k] <- mean(y.test != pred)
  }
  mean(err)
}

(err.ridge.class <- err.ridge1(happy.df.new))

## [1] 0.4164482

Lasso with glmnet (and bootstrapping)

err.lasso1 <- function(data.df, M = 10){
  err <- rep(0, M)
  samps <- sample(1:nrow(data.df),
                    nrow(data.df),
                    rep = T)
  train.df <- data.df[samps,]
  test.df <- data.df[-samps,]
  x.train <- model.matrix(~ . -1 -happy, train.df)
  y.train <- train.df$happy
  x.test <- model.matrix(~ . -1 -happy, test.df)
  y.test <- test.df$happy
    
  cv.lasso <- cv.glmnet(x.train, y.train,
                        alpha = 1, intercept = F,
                        family = "multinomial",
                        lambda = lambda.grid)
    
  good.lam <- cv.lasso$lambda.min
  
  for(k in 1:M){
    samps <- sample(1:nrow(data.df),
                    nrow(data.df),
                    rep = T)
    train.df <- data.df[samps,]
    test.df <- data.df[-samps,]
    x.train <- model.matrix(~ . -1 -happy, train.df)
    y.train <- train.df$happy
    x.test <- model.matrix(~ . -1 -happy, test.df)
    y.test <- test.df$happy
    
    
    mod.lasso <- glmnet(x.train, y.train,
                        family = "multinomial",
                        alpha = 1, intercept = F,
                        lambda = good.lam)
    pred <- predict(mod.lasso, newx = x.test, type = "class")
    err[k] <- mean(y.test != pred)
  }
  mean(err)
}

(err.lasso.class <- err.lasso1(happy.df.new))

## [1] 0.4165599

Random Forest

p <- ncol(happy.df.new) - 1

errRF1 <- function(data.df, M = 10){
  err <- rep(0, M)
  for (k in 1:M){
    samps <- sample(1:nrow(data.df),
                    nrow(data.df)/2, 
                    rep = F)
    train.df <- data.df[samps,] %>%
      mutate(happy = as.factor(happy))
    test.df <- data.df[-samps,] %>%
      mutate(happy = as.factor(happy))
    mod <- randomForest(happy ~.,
                        data = train.df,
                        mtry = p,
                        ntree = 100)
    preds <- predict(mod, newdata = test.df, type = "class")
    err[k] <- with(test.df, mean((happy != preds)))
  }
  min(err)
}

(err.rf.happy <- errRF1(happy.df.new))

## [1] 0.4499196

(null.class <- nrow(happy.df.new %>%
                     filter(happy == "pretty happy")
                   )/ nrow(happy.df.new))

## [1] 0.5542027

Feed-Forward Neural Networks

set.seed(2334)
errNET <- function(data.df, M = 10){
  err <- numeric(2)
  for (k in 1:M){
    samps <- sample(1:nrow(data.df), nrow(data.df), rep = T)
    train.df <- data.df[samps,]
    test.df <- data.df[-samps,]
    mod <- nnet(happy ~ . - happy, data = train.df, size = 7, trace = F)
    preds <- predict(mod, newdata = test.df, type = "class")
    err[k] <- with(test.df %>%
       mutate(preds = preds), mean(preds != happy))
    }
   err <- err[err < max(err)]
   err <- err[err < max(err)]
   mean(err)
}

(err.net.class <- errNET(happy.df.new))

## [1] 0.439922

Comparing classification error from each model

errs.df

##         err         model perc.improve
## 1 0.4164482     Ridge Reg       13.775
## 2 0.4165599         Lasso       13.764
## 3 0.4499196 Random Forest       10.428
## 4 0.4399220    Neural Net       11.428
## 5 0.5542027     Null Rate        0.000

Now that we are predicting using 3 level classification which is the intention of the data, we see an increase in the percent improvement of the model classification error rate from the null rate. If we were doing a project that needed to use these predictions to make some sort of decision in the real world, we would probably want to use a cost function to assess each type of model by the misclassification cost instead of just misclassification rate.

Including an arbitrary Cost Function

Suppose we were using this data to predict a person’s perceived happiness level in order to offer treatment for increased risk of depression. If this were the case, our biggest cost would come from predicting Very Happy when in fact the subject is Not Too Happy, there would also be a cost of predicting Pretty Happy when the actual is Not Too Happy, and there might be a reward when correctly identifying someone as Not Too Happy.

Here is an arbitrary cost function table.

##            pred        actual cost
## 1    very happy  pretty happy    5
## 2    very happy not too happy   50
## 3    very happy    very happy    0
## 4  pretty happy    very happy    0
## 5  pretty happy not too happy   20
## 6  pretty happy  pretty happy   -2
## 7 not too happy    very happy    0
## 8 not too happy  pretty happy    0
## 9 not too happy not too happy  -10

Cost Function

calcCost <- function(df){
  with(df %>%
    inner_join(cost.df, 
               by = c("pred" = "pred",
                      "actual" = "actual")),
    sum(cost))
}

Null Cost

pred <- rep(as.factor("pretty happy"), nrow(happy.df.new))
actual <- happy.df.new$happy
null.cost <- calcCost(data.frame(pred = pred,
                    actual = actual))

Ridge Cost

lambda.grid <- 10^seq(-3, 1, length = 20)

cost.ridge <- function(data.df, M = 10){
  cost <- rep(0, M)
  samps <- sample(1:nrow(data.df),
                       nrow(data.df), 
                    rep = T)
  train.df <- data.df[samps,]
  test.df <- data.df[-samps,]
  x.train <- model.matrix(~. -1 -happy, train.df)
  y.train <- train.df$happy
  x.test <- model.matrix(~. -1 -happy, test.df)
  y.test <- test.df$happy
  cv.ridge <- cv.glmnet(x.train, y.train,
                          alpha = 0, intercept = F,
                          family = "multinomial",
                          lambda = lambda.grid)
    
  good.lam <- cv.ridge$lambda.min
  for (k in 1:M){
    samps <- sample(1:nrow(data.df),
                       nrow(data.df), 
                    rep = T)
    train.df <- data.df[samps,]
    test.df <- data.df[-samps,]
    x.train <- model.matrix(~. -1 -happy, train.df)
    y.train <- train.df$happy
    x.test <- model.matrix(~. -1 -happy, test.df)
    y.test <- test.df$happy
    
    mod.ridge <- glmnet(x.train, y.train, 
                        family = "multinomial",
                    alpha = 0, intercept = F, 
                    lambda = good.lam)
    preds <- predict(mod.ridge, newx = x.test, type = "class")
    coster.df <- data.frame(pred = preds,
                            actual = y.test) %>%
      rename(pred = s0)
    cost[k] <- calcCost(coster.df)
  }
  mean(cost)
}

(cost.ridge.fm <- cost.ridge(happy.df.new))

## [1] 23223.7

Lasso Cost

cost.lasso <- function(data.df, M = 10){
  cost <- rep(0, M)
  samps <- sample(1:nrow(data.df),
                    nrow(data.df),
                    rep = T)
  train.df <- data.df[samps,]
  test.df <- data.df[-samps,]
  x.train <- model.matrix(~ . -1 -happy, train.df)
  y.train <- train.df$happy
  x.test <- model.matrix(~ . -1 -happy, test.df)
  y.test <- test.df$happy
    
  cv.lasso <- cv.glmnet(x.train, y.train,
                        alpha = 1, intercept = F,
                        family = "multinomial",
                        lambda = lambda.grid)
    
  good.lam <- cv.lasso$lambda.min
  
  for(k in 1:M){
    samps <- sample(1:nrow(data.df),
                    nrow(data.df),
                    rep = T)
    train.df <- data.df[samps,]
    test.df <- data.df[-samps,]
    x.train <- model.matrix(~ . -1 -happy, train.df)
    y.train <- train.df$happy
    x.test <- model.matrix(~ . -1 -happy, test.df)
    y.test <- test.df$happy
    
    
    mod.lasso <- glmnet(x.train, y.train,
                        family = "multinomial",
                        alpha = 1, intercept = F,
                        lambda = good.lam)
    preds <- predict(mod.lasso, newx = x.test, type = "class")
    coster.df <- data.frame(pred = preds,
                            actual = y.test) %>%
      rename(pred = s0)
    cost[k] <- calcCost(coster.df)
  }
  mean(cost)
}

(cost.lasso.fm <- cost.lasso(happy.df.new))

## [1] 25169.1

Random Forest Cost

costRF <- function(data.df, M = 10){
  cost <- rep(0, M)
  for (k in 1:M){
    samps <- sample(1:nrow(data.df),
                    nrow(data.df)/2, 
                    rep = F)
    train.df <- data.df[samps,] %>%
      mutate(happy = as.factor(happy))
    test.df <- data.df[-samps,] %>%
      mutate(happy = as.factor(happy))
    mod <- randomForest(happy ~.,
                        data = train.df,
                        mtry = p,
                        ntree = 100)
    preds <- predict(mod, newdata = test.df, type = "class")
    coster.df <- data.frame(pred = preds,
                            actual = test.df$happy)
    cost[k] <- calcCost(coster.df)
  }
  min(cost)
}

(cost.rf.fm <- costRF(happy.df.new))

## [1] 36834

Neural Net Cost

costNET <- function(data.df, M = 10){
  cost <- numeric(2)
  for (k in 1:M){
    samps <- sample(1:nrow(data.df), nrow(data.df), rep = T)
    train.df <- data.df[samps,]
    test.df <- data.df[-samps,]
    mod <- nnet(happy ~ . - happy, data = train.df, size = 7, trace = F)
    preds <- predict(mod, newdata = test.df, type = "class")
    coster.df <- data.frame(pred = as.factor(preds),
                            actual = as.factor(test.df$happy))
    cost[k] <- calcCost(coster.df)
    }
   mean(cost)
}

(cost.net.fm <- costNET(happy.df.new))

## [1] 19723.9

Comparing Cost from each model

costs.df

##      cost         model perc.improve
## 1 23223.7     Ridge Reg     52.29510
## 2 25169.1         Lasso     48.29896
## 3 36834.0 Random Forest     24.33754
## 4 19723.9    Neural Net     59.48420
## 5 48682.0     Null Rate      0.00000

After including a cost factor to the analysis of our models, we start to gain confidence in the effects of our models as compared to the Null Rate. We see here that our models can lower the cost of misclassification by half of the null rate and that could help better target subjects for depression treatment.

Happiness Data from the General Social Survey annually from 1972-2006

Question 1: Which model provides the smallest classification error when predicting unhappiness as a binomial variable?

Logistic Regression (and cross validation)

Ridge with glmnet (and bootstrapping)

Lasso with glmnet (and bootstrapping)

Random Forest

Feed-Forward Neural Networks

Comparing classification error from each model

Variable Selection With lasso

Variable Importance from Random Forest

Stepwise variable selection using logistic regression

Comparison of error with and without sex for each model

Question 2: How well can we predict the classifation of happpiness at the 3 class level given to us in the original dataset?

Ridge with glmnet (and bootstrapping)

Lasso with glmnet (and bootstrapping)

Random Forest

Feed-Forward Neural Networks

Comparing classification error from each model

Including an arbitrary Cost Function

Cost Function

Null Cost

Ridge Cost

Lasso Cost

Random Forest Cost

Neural Net Cost

Comparing Cost from each model

Comparison of error with and without `sex` for each model