This is an R HTML document. When you click the Knit HTML button, a web page will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(ISLR2)
library(glmnet)
library(dplyr)
library(tree)
library(boot)

#load data
data(Default)
#response: default -> categorical: yes/no, defaulted on credit card debt
#predictors: student -> categorical: yes/no
#            balance -> average credit card balance in $
#            income  -> annual income of individual
set.seed(808)

#function to split data into train and test samples
#parameters: data  -> data frame of all data in sample
#            ratio -> proportion of data to be randomly selected for the training sample
#return: list containing [training_data, test_data]
trainTest <- function(data, ratio){
  smp_size <- floor(ratio * nrow(data))
  train_ind <- sample(seq_len(nrow(data)), size = smp_size)
  train <- data[train_ind, ]
  test <- data[-train_ind, ]
  train <- as.data.frame(train)
  test <- as.data.frame(test)
  return(list(train, test))
}

#recode the response as 0/1 for logistic regression
Default$default <- ifelse(Default$default == "Yes", 1, 0)
data <- trainTest(Default, 0.80)
train <- data[[1]]
test <- data[[2]]

#fit logistic regression, estimate misclassification rate with 10-fold CV
glm.fit <- glm(default ~ ., family = 'binomial', data = train)
cv.glm(train, glm.fit, K = 10)$delta[1] #0.02126864 estimated error rate
## [1] 0.02126864
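One caveat: cv.glm's default cost is squared error, so the delta above is closer to a Brier score than a literal misclassification rate. A minimal sketch of supplying a 0/1 cost to estimate the misclassification rate directly, reusing glm.fit and train from above (cost01 is an illustrative helper name, and the exact value depends on the random folds):

#sketch: 10-fold CV with an explicit 0/1 misclassification cost
#cost01 receives the observed responses and the CV-predicted probabilities
cost01 <- function(y, prob) mean(abs(y - prob) > 0.5)
cv.glm(train, glm.fit, cost = cost01, K = 10)$delta[1]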
summary(glm.fit)
## 
## Call:
## glm(formula = default ~ ., family = "binomial", data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3234  -0.1457  -0.0588  -0.0227   3.7133  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.043e+01  5.543e-01 -18.821   <2e-16 ***
## studentYes  -5.329e-01  2.691e-01  -1.981   0.0476 *  
## balance      5.479e-03  2.515e-04  21.781   <2e-16 ***
## income      -7.454e-07  9.604e-06  -0.078   0.9381    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2259.2  on 7999  degrees of freedom
## Residual deviance: 1258.9  on 7996  degrees of freedom
## AIC: 1266.9
## 
## Number of Fisher Scoring iterations: 8
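income is far from significant (p = 0.94), so a hypothetical follow-up, not part of the original analysis, is to refit without it and compare the nested models; glm.fit2 is an illustrative name:

#sketch: drop income and compare nested models with a likelihood-ratio test
glm.fit2 <- glm(default ~ student + balance, family = 'binomial', data = train)
anova(glm.fit2, glm.fit, test = "Chisq") #a small deviance change would suggest income adds little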
#apply model to test sample
probabilities <- predict.glm(glm.fit, newdata = test, type = 'response')
predicted.classes <- ifelse(probabilities > 0.5, 1, 0)
table(predicted.classes, test$default)
##                  
## predicted.classes    0    1
##                 0 1917   54
##                 1    5   24
1-mean(predicted.classes == test$default) #0.0295 error rate on test sample
## [1] 0.0295
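Because defaults are rare, the overall error rate says little about how well actual defaults are detected. A quick sketch of class-specific rates using the objects already in scope (conf is an illustrative name; the numbers depend on the random split):

#sketch: class-specific performance of the logistic model on the test sample
conf <- table(predicted.classes, test$default)
sensitivity <- conf["1", "1"] / sum(conf[, "1"]) #proportion of actual defaults identified
specificity <- conf["0", "0"] / sum(conf[, "0"]) #proportion of non-defaults identified
c(sensitivity = sensitivity, specificity = specificity)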
#The model has a very low test error rate, but this is largely due to correctly
#predicting non-defaults. We correctly identified only 24/78 defaults
#in our test sample

#prepare data for lasso regression
x <- model.matrix(default ~ ., train)[, -1]
y <- train$default
x.test <- model.matrix(default ~ ., test)[, -1]
y.test <- test$default

#use the training data to find the optimal tuning parameter for lasso regression
cv.lasso <- cv.glmnet(x, y, alpha = 1, family = "binomial")
plot(cv.lasso)
cv.lasso$lambda.min # 0.0002709107
## [1] 0.0002709107
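cv.glmnet also reports lambda.1se, the largest lambda whose CV error is within one standard error of the minimum. As a sketch, not used in the analysis below, the coefficients at that more conservative value can be read directly off the cv.glmnet object:

#sketch: a more heavily penalized alternative to lambda.min
cv.lasso$lambda.1se
coef(cv.lasso, s = "lambda.1se")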
#fit lasso model with the optimal lambda based on the training sample
lasso.fit <- glmnet(x, y, alpha = 1, family = "binomial", lambda = cv.lasso$lambda.min)
lasso.fit$beta #income coefficient set to zero
## 3 x 1 sparse Matrix of class "dgCMatrix"
##                      s0
## studentYes -0.479437127
## balance     0.005399128
## income      .
#apply lasso model to the test data to evaluate the error rate
probabilities <- lasso.fit %>% predict(newx = x.test, type = 'response')
predicted.classes <- ifelse(probabilities > 0.5, 1, 0)
table(predicted.classes, test$default)
##                  
## predicted.classes    0    1
##                 0 1917   56
##                 1    5   22
1-mean(predicted.classes == test$default) #0.0305 error rate
## [1] 0.0305
#performs slightly worse than logistic regression; predicts only 22/78 defaults

#use the training data to find the optimal tuning parameter for ridge regression
cv.ridge <- cv.glmnet(x, y, alpha = 0, family = "binomial")
plot(cv.ridge)
cv.ridge$lambda.min #0.005973936
## [1] 0.005973936
#fit ridge model with the optimal lambda based on the training sample
ridge.fit <- glmnet(x, y, alpha = 0, family = "binomial", lambda = cv.ridge$lambda.min)
ridge.fit$beta
## 3 x 1 sparse Matrix of class "dgCMatrix"
##                       s0
## studentYes -1.487443e-01
## balance     3.669385e-03
## income      1.691863e-06
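For reference, a hypothetical side-by-side view of the fitted coefficients from the three models above (the layout is illustrative): ridge shrinks all three predictors toward zero, while the lasso drops income entirely.

#sketch: compare coefficients (including intercepts) across the three models
cbind(logistic = coef(glm.fit),
      lasso    = as.numeric(as.matrix(coef(lasso.fit))),
      ridge    = as.numeric(as.matrix(coef(ridge.fit))))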
#apply ridge model to the test data to estimate the error rate
probabilities <- ridge.fit %>% predict(newx = x.test, type = 'response')
predicted.classes <- ifelse(probabilities > 0.5, 1, 0)
table(predicted.classes, test$default)
##                  
## predicted.classes    0    1
##                 0 1921   69
##                 1    1    9
1-mean(predicted.classes == test$default) #0.035 error rate
## [1] 0.035
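At a 0.5 cutoff the ridge model flags almost no defaults (see the confusion matrix above). As a hypothetical illustration of the threshold trade-off, not part of the original analysis, lowering the cutoff on the logistic model catches more defaults at the cost of more false positives; prob.logit and the 0.2 cutoff are illustrative choices:

#sketch: lower the classification threshold for the logistic model
prob.logit <- predict(glm.fit, newdata = test, type = 'response')
table(ifelse(prob.logit > 0.2, 1, 0), test$default)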
#predicts only 9/78 defaults

#classification tree
tree <- tree(as.factor(default) ~ ., train)
plot(tree)
summary(tree) #training error: 0.02712
## 
## Classification tree:
## tree(formula = as.factor(default) ~ ., data = train)
## Variables actually used in tree construction:
## [1] "balance"
## Number of terminal nodes:  5 
## Residual mean deviance:  0.1601 = 1280 / 7995 
## Misclassification error rate: 0.02712 = 217 / 8000
text(tree, pretty = 0)
tree.pred <- predict(tree, test, type = "class")
table(tree.pred, test$default)
##          
## tree.pred    0    1
##         0 1908   45
##         1   14   33
1-mean(tree.pred == test$default) #.0295
## [1] 0.0295
#correctly identifies 33/78 defaults using only the balance predictor

#cross-validate the classification tree
cv.tree <- cv.tree(tree, FUN = prune.misclass)
names(cv.tree)
## [1] "size" "dev" "k" "method"
cv.tree
## $size
## [1] 5 3 1
## 
## $dev
## [1] 223 223 256
## 
## $k
## [1] -Inf    0   19
## 
## $method
## [1] "misclass"
## 
## attr(,"class")
## [1] "prune"         "tree.sequence"
plot(cv.tree$size, cv.tree$dev, type = "b")
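Rather than reading the best size off the plot, the smallest tree attaining the minimum CV deviance can be picked programmatically; a sketch using the cv.tree object above (best.size is an illustrative name). Here sizes 5 and 3 tie at deviance 223, so the smaller tree is preferred:

#sketch: smallest tree size that attains the minimum CV error
best.size <- min(cv.tree$size[cv.tree$dev == min(cv.tree$dev)])
best.size #3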
prune.tree <- prune.misclass(tree, best = 3)
plot(prune.tree)
text(prune.tree, pretty = 0)
summary(prune.tree) #training error: 0.02712
## 
## Classification tree:
## snip.tree(tree = tree, nodes = 2L)
## Variables actually used in tree construction:
## [1] "balance"
## Number of terminal nodes:  3 
## Residual mean deviance:  0.1757 = 1405 / 7997 
## Misclassification error rate: 0.02712 = 217 / 8000
tree.pred <- predict(prune.tree, test, type = "class")
table(tree.pred, test$default) #0.0295 test error rate
##          
## tree.pred    0    1
##         0 1908   45
##         1   14   33
#the pruned tree performs the same as the unpruned tree but uses two fewer terminal nodes
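To wrap up, a sketch that collects the test-sample results reported above into one table (values copied from the output above; they will shift with a different seed or split):

#sketch: summary of test error rates and defaults identified
data.frame(model = c("logistic", "lasso", "ridge", "tree", "pruned tree"),
           test.error = c(0.0295, 0.0305, 0.0350, 0.0295, 0.0295),
           defaults.caught = c("24/78", "22/78", "9/78", "33/78", "33/78"))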