This is an R HTML document. When you click the Knit HTML button, a web page will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(ISLR2)
library(glmnet)
library(dplyr)
library(tree)
library(boot)

#load data
data(Default)
#response: default -> categorical: yes/no, defaulted on credit card debt
#predictors: student -> categorical: yes/no
#            balance -> average credit card balance in $
#            income  -> annual income of individual
set.seed(808)

#function to split data into train and test samples
#parameters: data  -> data frame of all data in sample
#            ratio -> proportion of data to be randomly selected for the training sample
#return: list containing [training_data, test_data]
trainTest <- function(data, ratio){
  smp_size <- floor(ratio * nrow(data))
  train_ind <- sample(seq_len(nrow(data)), size = smp_size)
  train <- data[train_ind, ]
  test <- data[-train_ind, ]
  train <- as.data.frame(train)
  test <- as.data.frame(test)
  return(list(train, test))
}

#recode the response as 0/1 for logistic regression
Default$default <- ifelse(Default$default == "Yes", 1, 0)
data <- trainTest(Default, 0.80)
train <- data[[1]]
test <- data[[2]]

#fit logistic regression, estimate misclassification rate with 10-fold CV
glm.fit <- glm(default ~ ., family = 'binomial', data = train)
cv.glm(train, glm.fit, K = 10)$delta[1] #0.02126864 estimated error rate
## [1] 0.02126864
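One caveat: cv.glm's default cost is squared error, so the delta above is closer to a Brier score than a literal misclassification rate. A minimal sketch of supplying a 0/1 cost to estimate the misclassification rate directly, reusing glm.fit and train from above (cost01 is an illustrative helper name, and the exact value depends on the random folds):

#sketch: 10-fold CV with an explicit 0/1 misclassification cost
#cost01 receives the observed responses and the CV-predicted probabilities
cost01 <- function(y, prob) mean(abs(y - prob) > 0.5)
cv.glm(train, glm.fit, cost = cost01, K = 10)$delta[1]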
summary(glm.fit)
## 
## Call:
## glm(formula = default ~ ., family = "binomial", data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3234  -0.1457  -0.0588  -0.0227   3.7133  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.043e+01  5.543e-01 -18.821   <2e-16 ***
## studentYes  -5.329e-01  2.691e-01  -1.981   0.0476 *  
## balance      5.479e-03  2.515e-04  21.781   <2e-16 ***
## income      -7.454e-07  9.604e-06  -0.078   0.9381    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2259.2  on 7999  degrees of freedom
## Residual deviance: 1258.9  on 7996  degrees of freedom
## AIC: 1266.9
## 
## Number of Fisher Scoring iterations: 8
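income is far from significant (p = 0.94), so a hypothetical follow-up, not part of the original analysis, is to refit without it and compare the nested models; glm.fit2 is an illustrative name:

#sketch: drop income and compare nested models with a likelihood-ratio test
glm.fit2 <- glm(default ~ student + balance, family = 'binomial', data = train)
anova(glm.fit2, glm.fit, test = "Chisq") #a small deviance change would suggest income adds little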
#apply model to test sample
probabilities <- predict.glm(glm.fit, newdata = test, type = 'response')
predicted.classes <- ifelse(probabilities > 0.5, 1, 0)
table(predicted.classes, test$default)
##                  
## predicted.classes    0    1
##                 0 1917   54
##                 1    5   24
1-mean(predicted.classes == test$default) #0.0295 error rate on test sample
## [1] 0.0295
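Because defaults are rare, the overall error rate says little about how well actual defaults are detected. A quick sketch of class-specific rates using the objects already in scope (conf is an illustrative name; the numbers depend on the random split):

#sketch: class-specific performance of the logistic model on the test sample
conf <- table(predicted.classes, test$default)
sensitivity <- conf["1", "1"] / sum(conf[, "1"]) #proportion of actual defaults identified
specificity <- conf["0", "0"] / sum(conf[, "0"]) #proportion of non-defaults identified
c(sensitivity = sensitivity, specificity = specificity)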
#The model has a very low test error rate, but this is largely due to correctly
#predicting non-defaults. We correctly identified only 24/78 defaults
#in our test sample

#prepare data for lasso regression
x <- model.matrix(default ~ ., train)[, -1]
y <- train$default
x.test <- model.matrix(default ~ ., test)[, -1]
y.test <- test$default

#use the training data to find the optimal tuning parameter for lasso regression
cv.lasso <- cv.glmnet(x, y, alpha = 1, family = "binomial")
plot(cv.lasso)
cv.lasso$lambda.min # 0.0002709107
## [1] 0.0002709107
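cv.glmnet also reports lambda.1se, the largest lambda whose CV error is within one standard error of the minimum. As a sketch, not used in the analysis below, the coefficients at that more conservative value can be read directly off the cv.glmnet object:

#sketch: a more heavily penalized alternative to lambda.min
cv.lasso$lambda.1se
coef(cv.lasso, s = "lambda.1se")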
#fit lasso model with the optimal lambda based on the training sample
lasso.fit <- glmnet(x, y, alpha = 1, family = "binomial", lambda = cv.lasso$lambda.min)
lasso.fit$beta #income coefficient set to zero
## 3 x 1 sparse Matrix of class "dgCMatrix"
##                      s0
## studentYes -0.479437127
## balance     0.005399128
## income      .
#apply lasso model to the test data to evaluate the error rate
probabilities <- lasso.fit %>% predict(newx = x.test, type = 'response')
predicted.classes <- ifelse(probabilities > 0.5, 1, 0)
table(predicted.classes, test$default)
##                  
## predicted.classes    0    1
##                 0 1917   56
##                 1    5   22
1-mean(predicted.classes == test$default) #0.0305 error rate
## [1] 0.0305
#performs slightly worse than logistic regression; predicts only 22/78 defaults

#use the training data to find the optimal tuning parameter for ridge regression
cv.ridge <- cv.glmnet(x, y, alpha = 0, family = "binomial")
plot(cv.ridge)
cv.ridge$lambda.min #0.005973936
## [1] 0.005973936
#fit ridge model with the optimal lambda based on the training sample
ridge.fit <- glmnet(x, y, alpha = 0, family = "binomial", lambda = cv.ridge$lambda.min)
ridge.fit$beta
## 3 x 1 sparse Matrix of class "dgCMatrix"
##                       s0
## studentYes -1.487443e-01
## balance     3.669385e-03
## income      1.691863e-06
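For reference, a hypothetical side-by-side view of the fitted coefficients from the three models above (the layout is illustrative): ridge shrinks all three predictors toward zero, while the lasso drops income entirely.

#sketch: compare coefficients (including intercepts) across the three models
cbind(logistic = coef(glm.fit),
      lasso    = as.numeric(as.matrix(coef(lasso.fit))),
      ridge    = as.numeric(as.matrix(coef(ridge.fit))))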
#apply ridge model to the test data to estimate the error rate
probabilities <- ridge.fit %>% predict(newx = x.test, type = 'response')
predicted.classes <- ifelse(probabilities > 0.5, 1, 0)
table(predicted.classes, test$default)
##                  
## predicted.classes    0    1
##                 0 1921   69
##                 1    1    9
1-mean(predicted.classes == test$default) #0.035 error rate
## [1] 0.035
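At a 0.5 cutoff the ridge model flags almost no defaults (see the confusion matrix above). As a hypothetical illustration of the threshold trade-off, not part of the original analysis, lowering the cutoff on the logistic model catches more defaults at the cost of more false positives; prob.logit and the 0.2 cutoff are illustrative choices:

#sketch: lower the classification threshold for the logistic model
prob.logit <- predict(glm.fit, newdata = test, type = 'response')
table(ifelse(prob.logit > 0.2, 1, 0), test$default)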
#predicts only 9/78 defaults

#classification tree
tree <- tree(as.factor(default) ~ ., train)
plot(tree)
summary(tree) #training error: 0.02712
## 
## Classification tree:
## tree(formula = as.factor(default) ~ ., data = train)
## Variables actually used in tree construction:
## [1] "balance"
## Number of terminal nodes:  5 
## Residual mean deviance:  0.1601 = 1280 / 7995 
## Misclassification error rate: 0.02712 = 217 / 8000
text(tree, pretty = 0)
tree.pred <- predict(tree, test, type = "class")
table(tree.pred, test$default)
##          
## tree.pred    0    1
##         0 1908   45
##         1   14   33
1-mean(tree.pred == test$default) #.0295
## [1] 0.0295
#correctly identifies 33/78 defaults using only the balance predictor

#cross-validate the classification tree
cv.tree <- cv.tree(tree, FUN = prune.misclass)
names(cv.tree)
## [1] "size" "dev" "k" "method"
cv.tree
## $size
## [1] 5 3 1
## 
## $dev
## [1] 223 223 256
## 
## $k
## [1] -Inf    0   19
## 
## $method
## [1] "misclass"
## 
## attr(,"class")
## [1] "prune"         "tree.sequence"
plot(cv.tree$size, cv.tree$dev, type = "b")
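Rather than reading the best size off the plot, the smallest tree attaining the minimum CV deviance can be picked programmatically; a sketch using the cv.tree object above (best.size is an illustrative name). Here sizes 5 and 3 tie at deviance 223, so the smaller tree is preferred:

#sketch: smallest tree size that attains the minimum CV error
best.size <- min(cv.tree$size[cv.tree$dev == min(cv.tree$dev)])
best.size #3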
prune.tree <- prune.misclass(tree, best = 3)
plot(prune.tree)
text(prune.tree, pretty = 0)
summary(prune.tree) #training error: 0.02712
## 
## Classification tree:
## snip.tree(tree = tree, nodes = 2L)
## Variables actually used in tree construction:
## [1] "balance"
## Number of terminal nodes:  3 
## Residual mean deviance:  0.1757 = 1405 / 7997 
## Misclassification error rate: 0.02712 = 217 / 8000
tree.pred <- predict(prune.tree, test, type = "class")
table(tree.pred, test$default) #0.0295 test error rate
##          
## tree.pred    0    1
##         0 1908   45
##         1   14   33
#the pruned tree performs the same as the unpruned tree but uses two fewer terminal nodes
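To wrap up, a sketch that collects the test-sample results reported above into one table (values copied from the output above; they will shift with a different seed or split):

#sketch: summary of test error rates and defaults identified
data.frame(model = c("logistic", "lasso", "ridge", "tree", "pruned tree"),
           test.error = c(0.0295, 0.0305, 0.0350, 0.0295, 0.0295),
           defaults.caught = c("24/78", "22/78", "9/78", "33/78", "33/78"))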