#Kevin Kuipers (Completed by myself) #04/02/2019
#1. Question 8.4.4 pg 332
##Question 4 a) Sketch the tree corresponding to the partition of the predictor space illustrated in the left-hand panel of Figure 8.12. The numbers inside the boxes indicate the mean of Y within each region.
The corresponding tree, written as a chain of splits (region means at the leaves):

if \(X_{1} \ge 0\) then 5
if \(X_{2} \ge 0\) then 15
if \(0 \gt X_{1}\) then 0
if \(0 \gt X_{2}\) then 10
else 0
##Question 4 b) Create a diagram similar to the left-hand panel of Figure 8.12, using the tree illustrated in the right-hand panel of the same figure. You should divide up the predictor space into the correct regions, and indicate the mean for each region.
#install.packages('tree')
library(tree)
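For part b), the right-hand tree of Figure 8.12 can be translated into a partition diagram with base graphics. Below is a minimal sketch of the technique only; the split locations and region means are illustrative placeholders, not values read from the figure.
#Sketch: draw a two-dimensional partition for a tree that splits on X1 and X2.
#Split locations and region means below are placeholders.
plot(NA, xlim=c(-2,2), ylim=c(-3,3), xlab=expression(X[1]), ylab=expression(X[2]),
     main='Partition of the predictor space')
abline(h=1)              #root split on X2
segments(1, -3, 1, 1)    #split on X1 within the lower region
abline(h=2)              #second split on X2 in the upper region
segments(0, 2, 0, 3)     #split on X1 in the topmost region
text(x=c(-0.5, 1.5, 0, -1, 1), y=c(-1, -1, 1.5, 2.5, 2.5),
     labels=c('mean 1','mean 2','mean 3','mean 4','mean 5'))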
#2. Question 8.4.8 pg 332 In the lab, a classification tree was applied to the Carseats data set after converting Sales into a qualitative response variable. Now we will seek to predict Sales using regression trees and related approaches, treating the response as a quantitative variable.
##Question 8 a) Split the data set into a training set and a test set.
I will load the Carseats dataset found in the ISLR library. After that I will split the data set: 70% for training and the remaining 30% for testing. Before the split I will set the seed to 702.
#Loading Carseats dataset
library(ISLR)
data(Carseats)
#Setting seed.
set.seed(702)
#Training and testing data split
index <- sample(x=nrow(Carseats), size=0.70*nrow(Carseats))
training <- Carseats[index,]
testing <- Carseats[-index,]
#Question 8 b) I will fit a regression tree on the training data. I will also output the tree plot and the test MSE. The tree() function is found in the tree library. The regression tree will predict Sales using all the other variables in the data set.
##Tree Model Plots
#install.packages('tree')
library(tree)
#fitting tree regression model
tree.mod <- tree(Sales ~ ., data=training)
#Plotting tree regression model
plot(tree.mod)
text(tree.mod, pretty=0)
library('dplyr')
library(tidyverse)
library(ggdendro)
#ggplot of the tree diagram
ggdendrogram(tree.mod, segments=TRUE, labels=TRUE, leaf_labels = TRUE, rotate=FALSE, theme_dendro = TRUE) + labs(title="Tree Model of Sales Explained by All Variables")
##Test MSE
pred <- predict(tree.mod, newdata = testing)
test.mse <- mean((pred - testing$Sales)^2)
cat('The testing MSE is :', test.mse)
## The testing MSE is : 5.337681
#Question 8 c) Use cross-validation in order to determine the optimal level of tree complexity. Does pruning the tree improve the test MSE?
Now I will cross-validate the tree model and plot deviance against tree size in order to determine whether pruning the tree helps.
set.seed(702)
cv.tree.mod <- cv.tree(tree.mod)
plot(cv.tree.mod, type='b', main='Tree Size vs Deviance')
cv.tree.mod.df <- data.frame(
size = cv.tree.mod$size,
deviance = cv.tree.mod$dev
)
ggplot(cv.tree.mod.df, aes(x=size, y=deviance)) +
geom_point() +
geom_line(colour='blue') +
labs(title='Tree Size vs Deviance')
It appears that the optimal number of terminal nodes is 6. Therefore, I will prune the tree to 6 nodes and output the test MSE.
##Graph of the 6-node tree model
prune.tree.mod <- prune.tree(tree.mod, best=6)
plot(prune.tree.mod)
title(main='6-node Tree of Sales Explained by All Other Variables')
text(prune.tree.mod, pretty=0)
ggdendrogram(prune.tree.mod, segments=TRUE, labels=TRUE, leaf_labels = TRUE, rotate=FALSE, theme_dendro = TRUE) + labs(title="6-node Tree of Sales Explained by All Other Variables")
##Test MSE Now I will use the 6-node pruned tree, fit on the training data, to predict the testing data and compute the test MSE.
pred <- predict(prune.tree.mod, newdata = testing)
test.mse <- mean((pred - testing$Sales)^2)
cat('The testing MSE is :', test.mse)
With the squared error in place, the pruned tree's test MSE can be compared directly against the unpruned tree's 5.337681 to judge whether pruning helped; pruning trades a little bias for lower variance, so the test MSE may improve modestly.
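Because the squared term in the MSE is easy to drop by accident, a small helper keeps the computation in one place; a minimal sketch:
#Helper: mean squared error, so the ^2 cannot be forgotten at call sites
mse <- function(pred, obs) mean((obs - pred)^2)
#Example: mse(predict(prune.tree.mod, newdata=testing), testing$Sales)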
#Question 8 d) Use the bagging approach in order to analyze this data. What test MSE do you obtain? Use the importance() function to determine which variables are most important.
In order to use the bagging approach I will use the randomForest library. All variables will be used to explain Sales, and setting importance=TRUE when fitting lets the importance() function report which variables are most relevant. After fitting the model on the training data I will run it against the testing data and output the test MSE.
##Fitting randomForest Model using bagging
library(randomForest)
bag.mod <- randomForest(Sales ~ ., data=training, importance=TRUE, ntree=500, mtry=(dim(training)[2]-1))
bag.mod$importance
## %IncMSE IncNodePurity
## CompPrice 1.543092428 264.147866
## Income 0.272898611 121.662252
## Advertising 0.464209423 141.501956
## Population -0.036295200 67.489211
## Price 5.529682749 711.840683
## ShelveLoc 5.287692160 720.092741
## Age 0.492163356 178.088791
## Education 0.072684298 65.275216
## Urban 0.009675504 9.438239
## US 0.013178465 13.668325
importance(bag.mod)
## %IncMSE IncNodePurity
## CompPrice 35.639762 264.147866
## Income 12.448004 121.662252
## Advertising 18.654590 141.501956
## Population -2.247620 67.489211
## Price 73.629348 711.840683
## ShelveLoc 81.227898 720.092741
## Age 17.170070 178.088791
## Education 4.469004 65.275216
## Urban 1.323662 9.438239
## US 1.468959 13.668325
It appears ShelveLoc and Price are the two most important variables, followed by CompPrice and Age.
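The randomForest package also provides varImpPlot() for a quick visual ranking of the same two importance measures:
#Dot charts of %IncMSE and IncNodePurity for the bagged model
varImpPlot(bag.mod, main='Variable Importance (Bagging)')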
#Test MSE Now I will run the model fit on the training data against the testing data to output the test MSE.
pred <- predict(bag.mod, newdata=testing)
test.mse <- mean((pred - testing$Sales)^2)
cat('The test MSE of the model is: ', test.mse)
## The test MSE of the model is: 2.417689
The test MSE has decreased: bagging's 2.417689 is well below the 5.337681 of the very first tree model that was fit.
#Question 8 e) Use random forests to analyze this data. What test MSE do you obtain? Use the importance() function to determine which variables are most important. Describe the effect of m, the number of variables considered at each split, on the error rate obtained.
Now I will fit two random forest models, one with mtry=3 and one with mtry=5. mtry is the number of variables randomly sampled as candidates at each split. Then I will output and compare the test MSE of each model. I will also use the importance function on each model.
##Fitting Models
rf.3 <- randomForest(Sales ~ ., data=training, mtry=3, importance=TRUE)
rf.5 <- randomForest(Sales ~ ., data=training, mtry=5, importance=TRUE)
##Importance of mtry=3
importance(rf.3)
## %IncMSE IncNodePurity
## CompPrice 18.2988632 219.82797
## Income 6.8583398 176.29329
## Advertising 16.5701937 177.70190
## Population -2.8230850 130.08904
## Price 43.8568679 548.27217
## ShelveLoc 49.4300699 564.81388
## Age 16.2610178 255.79659
## Education 6.6956771 109.08170
## Urban -0.6381438 20.85426
## US 2.1510191 26.78344
##Importance of mtry=5
importance(rf.5)
## %IncMSE IncNodePurity
## CompPrice 25.255937 235.08251
## Income 9.545963 144.02416
## Advertising 14.946647 173.19953
## Population -1.107274 95.92273
## Price 54.687753 625.37520
## ShelveLoc 68.389934 657.12104
## Age 16.972406 227.56609
## Education 5.807438 84.54217
## Urban -1.776028 12.50194
## US 2.023125 17.38570
##Test MSE of mtry=3
pred <- predict(rf.3, newdata=testing)
test.mse <- mean((pred - testing$Sales)^2)
cat('The test MSE for mtry=3 is :', test.mse)
## The test MSE for mtry=3 is : 2.499195
##Test MSE of mtry=5
pred <- predict(rf.5, newdata=testing)
test.mse <- mean((pred - testing$Sales)^2)
cat('The test MSE for mtry=5 is :', test.mse)
## The test MSE for mtry=5 is : 2.319711
It appears mtry = 5 gives a very slightly lower test MSE (2.319711) than mtry = 3 (2.499195).
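To describe the effect of m more fully, the test MSE can be traced across the full range of mtry values. A sketch, assuming the same Carseats training/testing split as above (mtry=10 corresponds to bagging):
#Sweep mtry over all predictor counts and record the test MSE
set.seed(702)
n.pred <- ncol(training) - 1
mtry.mse <- rep(NA, n.pred)
for (m in 1:n.pred) {
  rf.m <- randomForest(Sales ~ ., data=training, mtry=m)
  mtry.mse[m] <- mean((predict(rf.m, newdata=testing) - testing$Sales)^2)
}
plot(1:n.pred, mtry.mse, type='b', xlab='mtry', ylab='Test MSE', main='Effect of mtry on Test MSE')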
#Question 8 Conclusion
The model with the lowest test MSE was the random forest with mtry=5 (2.319711), followed closely by bagging (2.417689). The single unpruned tree had the highest test MSE (5.337681), and pruning alone cannot be expected to close that gap entirely. So the ensemble approaches, bagging and random forests, were the best performers on the Carseats data.
#3. Question 8.4.9 pg 334 This problem involves the OJ data set which is part of the ISLR package.
##Question 9 a) Create a training set containing a random sample of 800 observations, and a test set containing the remaining observations.
##Training test split I will load the OJ data set found in the ISLR library. I will then split the data set into training and testing. The training set will have 800 observations and the testing set will contain the remaining observations.
library(ISLR)
data(OJ)
set.seed(702)
index <- sample(1:nrow(OJ), 800)
training <- OJ[index,]
testing <- OJ[-index,]
##Question 9 b) Fit a tree to the training data, with Purchase as the response and the other variables as predictors. Use the summary() function to produce summary statistics about the tree, and describe the results obtained. What is the training error rate? How many terminal nodes does the tree have?
##Model fit
Now I will fit a tree model on the training data with Purchase explained by all the other variables in the data set. I will then output the summary of the model.
#install.packages('tree')
tree.mod <- tree(Purchase ~ ., data=training)
summary(tree.mod)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = training)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "ListPriceDiff"
## Number of terminal nodes: 9
## Residual mean deviance: 0.7005 = 554.1 / 791
## Misclassification error rate: 0.1462 = 117 / 800
The misclassification error rate is 0.1462 (117/800). The number of terminal nodes is 9.
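The training error rate from summary() can also be checked directly by comparing the tree's fitted classes against the training labels; a quick sketch:
#Direct check of the training misclassification rate
train.pred <- predict(tree.mod, type='class')
mean(train.pred != training$Purchase)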
##Question 9 c) Type in the name of the tree object in order to get a detailed text output. Pick one of the terminal nodes, and interpret the information displayed.
#For whatever reason I kept getting an error when knitting to PDF. Please uncomment to reproduce the output below.
#install.packages('tree')
#library('tree')
#tree(Purchase ~ ., data=training)
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
## (full node listing omitted)
The terminal node I will interpret is node 8. Its split criterion is LoyalCH < 0.051325. The number of observations in the node is 60 and the deviance is 10.17. Of those 60 observations, 1.667% take the CH category for Purchase and 98.333% take the MM category. This matches terminal node 8 of the original model fit.
#Question 9 d) Create a plot of the tree, and interpret the results.
##Plot
Now I will plot the tree using both the plot() command and the ggplot()-based ggdendrogram() from ggdendro.
plot(tree.mod)
text(tree.mod, pretty=0)
ggdendrogram(tree.mod, segments=TRUE, labels=TRUE, leaf_labels = TRUE, rotate=FALSE, theme_dendro = TRUE) + labs(title="Tree Model of Purchase Explained by All Variables")
It appears the most important variable is LoyalCH. If its value is less than 0.50395 then most likely the purchase is going to be MM; a PriceDiff of less than 0.05 can also tip the purchase toward MM.
If LoyalCH is between 0.50395 and 0.764572 it seems that CH will most likely be purchased, although when PriceDiff is less than 0.265, and especially below -0.165, there is some possibility of MM being purchased.
#Question 9 e) Predict the response on the test data, and produce a confusion matrix comparing the test labels to the predicted test labels. What is the test error rate?
pred <- predict(tree.mod, testing, type = 'class')
cm <- table(pred, testing$Purchase)
error.rate <- 1 - ( (cm[1,1] + cm[2,2]) / sum(cm))
cm
##
## pred CH MM
## CH 144 38
## MM 22 66
cat('The error rate is :', error.rate)
## The error rate is : 0.2222222
It appears the error rate when testing the model is 22.22%.
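As with the MSE helper earlier, a small function avoids retyping the confusion-matrix arithmetic for each of the models below; a minimal sketch:
#Helper: misclassification error rate from a confusion matrix
class.error <- function(cm) 1 - sum(diag(cm)) / sum(cm)
#Example: class.error(cm) reproduces the error rate above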
#Question 9 f) I will now use the cv.tree() command on the training data to determine the optimal tree size.
cv.tree.t <- cv.tree(tree.mod, FUN=prune.misclass)
cv.tree.t
## $size
## [1] 9 5 2 1
##
## $dev
## [1] 145 141 142 313
##
## $k
## [1] -Inf 1.250000 6.333333 172.000000
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
#Question 9 g) Produce a plot with tree size on the x-axis and cross-validated classification error rate on the y-axis.
I will plot this using both the plot() command and ggplot() command
plot(cv.tree.t, type='b', main='CV Tree Model Performance')
cv.df <- data.frame (
Size = c(cv.tree.t$size),
Misclass = c(cv.tree.t$dev)
)
ggplot(data=cv.df, aes(x=Size, y=Misclass)) +
geom_point() +
geom_line(colour='blue') +
labs(title= 'CV Tree Model Performance')
#Question 9 h) Which tree size corresponds to the lowest cross-validated classification error rate?
According to the CV output and the plots, a 5-node tree produced the lowest cross-validated classification error (deviance 141 at size 5).
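Rather than reading the value off the plot, the optimal size can be pulled straight from the cv.tree object:
#Tree size with the lowest cross-validated misclassification count
cv.tree.t$size[which.min(cv.tree.t$dev)]
## [1] 5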
#Question 9 i) Produce a pruned tree corresponding to the optimal tree size obtained using cross-validation. If cross-validation does not lead to selection of a pruned tree, then create a pruned tree with five terminal nodes.
The CV results select 5 terminal nodes, so I will prune with best=5. For comparison I will also request best=4; as the summaries below show, this returns the same 5-node tree, because a 4-leaf subtree is not in the pruning sequence.
##Summary with best=4
prune.4 <- prune.misclass(tree.mod, best=4)
summary(prune.4)
##
## Classification tree:
## snip.tree(tree = tree.mod, nodes = 2L)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "ListPriceDiff"
## Number of terminal nodes: 5
## Residual mean deviance: 0.7887 = 627 / 795
## Misclassification error rate: 0.1525 = 122 / 800
##Summary with best=5
prune.5 <- prune.misclass(tree.mod, best=5)
summary(prune.5)
##
## Classification tree:
## snip.tree(tree = tree.mod, nodes = 2L)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "ListPriceDiff"
## Number of terminal nodes: 5
## Residual mean deviance: 0.7887 = 627 / 795
## Misclassification error rate: 0.1525 = 122 / 800
##Unpruned tree summary
summary(tree.mod)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = training)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "ListPriceDiff"
## Number of terminal nodes: 9
## Residual mean deviance: 0.7005 = 554.1 / 791
## Misclassification error rate: 0.1462 = 117 / 800
#Question 9 j)
Compare the training error rates between the pruned and unpruned trees. Which is higher?
According to the summaries of the models, the training misclassification error rate is lower for the unpruned tree (0.1462) than for the pruned tree (0.1525).
#Question 9 k)
Compare the test error rates between the pruned and unpruned trees. Which is higher?
Now I will compare the models' test error rates by running each model fit on the training data against the testing data.
#Pruned tree with 4-node error rate
pred.p4 <- predict(prune.4, testing, type='class')
cm.p4 <- table(pred.p4, testing$Purchase)
error.rate.p4 <- 1 - ( (cm.p4[1,1] + cm.p4[2,2]) / sum(cm.p4) )
#Pruned tree with 5-node error rate
pred.p5 <- predict(prune.5, testing, type='class')
cm.p5 <- table(pred.p5, testing$Purchase)
error.rate.p5 <- 1 - ( (cm.p5[1,1] + cm.p5[2,2]) / sum(cm.p5) )
#unpruned tree model error rate was already computed above and saved as error.rate
data.frame (
Pruned.tree.n4 = error.rate.p4,
Pruned.tree.n5 = error.rate.p5,
Unpruned.tree = error.rate)
It appears that the unpruned tree has the lowest error rate when testing it against the test data.
#4. Question 8.4.10 pg 334
#Question 10 a) Remove the observations for whom the salary information is unknown, and then log-transform the salaries.
First I will load the Hitters data set found in the ISLR library. Then I will remove the observations where Salary is unknown (NA) and log-transform the remaining salaries.
library(ISLR)
data(Hitters)
#install.packages('sqldf')
#install.packages('RSQLite')
#install.packages('digest')
library('RSQLite')
library('sqldf')
#Removing missing salary observations
Hitters <- sqldf(
"
SELECT
*
FROM
Hitters
WHERE
Salary != 'N/A'
AND
Salary IS NOT NULL"
)
Hitters$Salary <- log(Hitters$Salary)
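For reference, the same filtering can be done in base R without SQL; the equivalent lines are shown commented out since Salary has already been transformed above:
#Equivalent base-R approach (commented out; already applied above):
#Hitters <- Hitters[!is.na(Hitters$Salary), ]
#Hitters$Salary <- log(Hitters$Salary)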
#Question 10 b) Now I will create a training and testing set. The training data will contain the first 200 observations, and the remaining data will be the testing data.
training <- Hitters[1:200,]
testing <- Hitters[201:(nrow(Hitters)),]
#Question 10 c) Now I will perform boosting on the training set with 1,000 trees for a range of values of the shrinkage parameter \(\lambda\). Then I will produce a plot with the different shrinkage values on the x-axis and the corresponding training set MSE on the y-axis.
##Boosting with 1,000 trees over a grid of shrinkage values
set.seed(702)
#install.packages('gbm')
library('gbm')
p <- seq(from=-10, to=0, by=0.05)
lam <- 10^p
train.mse <- rep(NA, length(lam))
for (i in 1:length(lam)) {
boost.mod <- gbm(Salary ~ ., data=training, distribution = 'gaussian', n.trees=1000, shrinkage=lam[i])
pred <- predict(boost.mod, training, n.trees=1000)
train.mse[i] <- mean((pred - training$Salary)^2)
}
boost.df <- data.frame(
lambdas = lam,
train.mse = train.mse
)
plot(x=lam, y=train.mse, type='b', ylab='Training MSE', xlab='Shrinkage Values', main='Training MSE vs Shrinkage Values')
library(tidyverse)
ggplot(data=boost.df, aes(x=lambdas,y=train.mse)) +
geom_point() +
labs(y='Training MSE', x='Shrinkage Values', title='Training MSE vs Shrinkage Values')
It appears that as the shrinkage value increases, the training MSE decreases.
#Question 10 d) Now I will perform the same procedure as above, but run the training-data models against the testing data set in order to obtain the test MSE vs shrinkage values.
set.seed(702)
test.mse <- rep(NA, length(lam))
for (i in 1:length(lam)) {
boost.mod <- gbm(Salary ~ ., data=training, distribution = 'gaussian', n.trees=1000, shrinkage=lam[i])
pred <- predict(boost.mod, newdata=testing, n.trees=1000)
test.mse[i] <- mean((pred - testing$Salary)^2)
}
boost.df.test <- data.frame(
lambdas = lam,
test.mse = test.mse
)
plot(x=lam,y=test.mse, type='b', ylab='Test MSE', xlab='Shrinkage Values', main='Test MSE vs Shrinkage Values')
ggplot(data=boost.df.test, aes(x=lambdas,y=test.mse)) +
geom_point() +
geom_line(colour='blue') +
labs(y='Test MSE', x='Shrinkage Values', title='Test MSE vs Shrinkage Values')
Performing the same experiment on the test set gives a different result. For very small shrinkage values the test MSE stays high; as the shrinkage value increases the test MSE drops quickly, but once the shrinkage gets a little larger the test MSE increases again. There is a sweet spot between the shrinkage value and the test MSE.
#Question 10 e) Compare the test MSE of boosting to the test MSE that results from applying two of the regression approaches seen in Chapters 3 and 6.
I will fit a regular linear regression model (Chapter 3) and produce its test MSE. I will also fit a principal components regression model (Chapter 6) and produce its test MSE.
##Linear Regression Model
lm.fit <- lm(Salary ~ ., data=training)
pred.lm <- predict(lm.fit, testing)
lm.mse <- mean((pred.lm - testing$Salary)^2)
lm.mse
## [1] 0.4917959
##Principal components regression (PCR)
library(pls)
set.seed(702)
pcr.fit <- pcr(Salary ~., data=training, scale=TRUE, validation='CV')
validationplot(pcr.fit, val.type='MSEP')
It appears that around 7 components gives the best MSEP.
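The pls package can also choose the component count programmatically; a sketch using its selectNcomp() with the one-sigma heuristic, which may pick a slightly different value than eyeballing the validation plot:
#Programmatic choice of the number of components from the CV results
selectNcomp(pcr.fit, method='onesigma', plot=TRUE)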
##PCR test MSE
pred.pcr <- predict(pcr.fit, newdata=testing,ncomp=7)
pcr.mse <- mean((pred.pcr - testing$Salary)^2)
pcr.mse
##Boosting test MSE
boost.mse <- min(test.mse)
boost.mse
## [1] 0.2576145
It appears boosting's test MSE (0.2576145) is well below that of the regular linear regression model (0.4917959); the PCR test MSE computed above can be compared on the same footing.
#Question 10 f) Which variables appear to be the most important predictors in the boosted model?
Now I will fit a boosted model with the optimal value of lambda found above and n.trees=1000. I will output the summary to determine which variables are most significant in predicting Salary.
best.lam <- lam[which.min(test.mse)]
optimal.boost.mod <- gbm(Salary ~ ., data=training, distribution = 'gaussian', n.trees=1000, shrinkage = best.lam)
summary(optimal.boost.mod)
The summary lists the variables in order from most important to least important in predicting Salary. CAtBat is the most important variable.
#Question 10 g) Now apply bagging to the training set. What is the test set MSE for this approach?
set.seed(702)
bagging <- randomForest(Salary ~ ., data=training, mtry=(dim(training)[2]-1), ntree=500)
pred <- predict(bagging, newdata=testing)
test.mse <- mean((pred - testing$Salary)^2)
cat('The bagging test MSE is: ', test.mse)
## The bagging test MSE is: 0.2306229
It appears the bagging approach has a slightly lower test MSE (0.2306229) than the boosting method (0.2576145).
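For a quick side-by-side of the Hitters test MSEs computed so far (assuming the fitted objects above are still in the workspace; test.mse currently holds the bagging value):
#Collect the Hitters test MSEs into one data frame for comparison
data.frame(Linear=lm.mse, PCR=pcr.mse, Boosting=boost.mse, Bagging=test.mse)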
#Question 5. In the past couple of homework assignments you have used different classification methods to analyze the dataset you chose. For this homework, use tree-based classification methods (tree,bagging, randomforest, boosting) to model your data. Find the test error using any/all of methods (VSA, K-fold CV). Compare the results you obtained with the result from previous homework. Did the results improve? (Use the table you previously made to compare results)
##Data set split I will split the data set: 75% will be for training and the other 25% will be used for testing. The categorical variables will be recoded numerically as follows:
invnodes will be: 0-2 = 0 3-5 = 1 6-8 = 2 9-11 = 3 12-14 = 4 15-17 = 5 24-26 = 6 missing=99
Age: 10-19 = 1 20-29 = 2 30-39 = 3 40-49 = 4 50-59 = 5 60-69 = 6 70-79 = 7 80-89 = 8 90-99 = 9 missing = 10
menopause: lt40 = 0 ge40 = 1 premeno = 2 missing = 3
tumorsize: 0-4 = 0 5-9 = 1 10-14 = 2 15-19 = 3 20-24 = 4 25-29 = 5 30-34 = 6 35-39 = 7 40-44 = 8 45-49 = 9 50-54 = 10 55-59 = 11 missing = 12
node-caps: no = 0 yes = 1 missing = 3
breast: left = 0 right = 1 missing= 3
breast-quad: left-up = 0 left-low = 1 right-up = 2 right-low = 3 central = 4 missing = 5
irradiat: yes = 1 no = 0 missing = 2
#install.packages('data.table')
library(data.table)
library(tidyverse)
bc.dat <- fread('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer/breast-cancer.data', col.names=c('class','age','menopause','tumorsize','invnodes','nodecaps','degmalig','breast','breastquad','irradiant'))
Class <- ifelse(bc.dat$class=="no-recurrence-events",'No','Yes')
Class2 <- ifelse(bc.dat$class=="no-recurrence-events", 0,1)
bc.dat <- data.frame(bc.dat, Class,Class2)
bc.dat$invnodes <- ifelse(bc.dat$invnodes=='0-2',0,(ifelse(bc.dat$invnodes=='3-5',1,
(ifelse(bc.dat$invnodes=='6-8',2,(ifelse(bc.dat$invnodes=='9-11',3,(ifelse(bc.dat$invnodes=='12-14',4,(ifelse(bc.dat$invnodes=='15-17',5,(ifelse(bc.dat$invnodes=='24-26',6,99)))))))))))))
bc.dat$age <- ifelse(bc.dat$age=='10-19',1,(ifelse(bc.dat$age=='20-29',2,(ifelse(bc.dat$age=='30-39',3,(ifelse(bc.dat$age=='40-49',4,(ifelse(bc.dat$age=='50-59',5,(ifelse(bc.dat$age=='60-69',6,(ifelse(bc.dat$age=='70-79',7,(ifelse(bc.dat$age=='80-89',8,(ifelse(bc.dat$age=='90-99',9,10)))))))))))))))))
bc.dat$menopause <- ifelse(bc.dat$menopause=='lt40',0,(ifelse(bc.dat$menopause=='ge40',1,(ifelse(bc.dat$menopause=='premeno',2,3)))))
bc.dat$tumorsize <- ifelse(bc.dat$tumorsize=='0-4',0,ifelse(bc.dat$tumorsize=='5-9',1,ifelse(bc.dat$tumorsize=='10-14',2,ifelse(bc.dat$tumorsize=='15-19',3,ifelse(bc.dat$tumorsize=='20-24',4,ifelse(bc.dat$tumorsize=='25-29',5,ifelse(bc.dat$tumorsize=='30-34',6,ifelse(bc.dat$tumorsize=='35-39',7,ifelse(bc.dat$tumorsize=='40-44',8,ifelse(bc.dat$tumorsize=='45-49',9,ifelse(bc.dat$tumorsize=='50-54',10,ifelse(bc.dat$tumorsize=='55-59',11,12))))))))))))
bc.dat$nodecaps <- ifelse(bc.dat$nodecaps=='no',0,(ifelse(bc.dat$nodecaps=='yes',1,2)))
bc.dat$breast <- ifelse(bc.dat$breast=='left',0,(ifelse(bc.dat$breast=='right',1,2)))
bc.dat$breastquad <- ifelse(bc.dat$breastquad=='left_up',0,(ifelse(bc.dat$breastquad=='left_low',1,(ifelse(bc.dat$breastquad=='right_up',2,(ifelse(bc.dat$breastquad=='right_low',3,(ifelse(bc.dat$breastquad=='central',4,5)))))))))
bc.dat$irradiant <- ifelse(bc.dat$irradiant == 'yes',1,ifelse(bc.dat$irradiant=='no',0,2))
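The nested ifelse() chains above work but are hard to audit. An equivalent, more compact pattern uses a named lookup vector, shown for age as an illustration (commented out, since the columns above have already been converted to numeric codes):
#Alternative recode via a named lookup vector (illustration only):
#age.map <- c('10-19'=1,'20-29'=2,'30-39'=3,'40-49'=4,'50-59'=5,
#             '60-69'=6,'70-79'=7,'80-89'=8,'90-99'=9)
#bc.dat$age <- unname(age.map[bc.dat$age])
#bc.dat$age[is.na(bc.dat$age)] <- 10   #missing = 10, per the coding scheme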
set.seed(702)
index <- sample(x=nrow(bc.dat), size=0.75*nrow(bc.dat))
training <- bc.dat[index,]
testing <- bc.dat[-index,]
Class.test <- Class[-index]
##Tree Classification Model
library(tree)
set.seed(702)
tree.bc <- tree(Class ~ .-class-Class2,data=training)
tree.pred <- predict(tree.bc,testing,type="class")
cm.tree <- table(tree.pred,testing$Class)
cm.tree
##
## tree.pred No Yes
## No 49 17
## Yes 2 4
tree.error <- 1 - ( (cm.tree[1,1] + cm.tree[2,2]) / sum(cm.tree))
tree.error
## [1] 0.2638889
#Plotting the classification tree
plot(tree.bc)
text(tree.bc, pretty=0)
cv.tree.bc <- cv.tree(tree.bc)
plot(cv.tree.bc, type='b', main='Tree Size vs Deviance')
##Bagging Method and error rate
With 9 predictors in the formula, pure bagging would use mtry=9; mtry=6 is used here, which strictly makes this a random forest with a larger m.
library(randomForest)
set.seed(702)
bag.bc <- randomForest(Class ~ .-class-Class2, data=training, importance=TRUE, ntree=1000, mtry=6)
bag.pred <- predict(bag.bc, testing, type='class')
cm.bag <- table(bag.pred, testing$Class)
cm.bag
##
## bag.pred No Yes
## No 45 15
## Yes 6 6
bag.error <- 1 - ( (cm.bag[1,1] + cm.bag[2,2]) / sum(cm.bag))
bag.error
## [1] 0.2916667
#Random Forest Model and error rate
set.seed(702)
rf.bc <- randomForest(Class ~ .-class-Class2, data=training, importance=TRUE, mtry=6)
rf.pred <- predict(rf.bc, testing, type='class')
cm.rf <- table(rf.pred, testing$Class)
cm.rf
##
## rf.pred No Yes
## No 44 15
## Yes 7 6
rf.error <- 1 - ( (cm.rf[1,1] + cm.rf[2,2]) / sum(cm.rf))
rf.error
## [1] 0.3055556
df <- data.frame(
tree.error = tree.error,
bag.error = bag.error,
rf.error = rf.error,
GLM = 0.319,
KNN = 0.292,
LDA = 0.319,
MclustDA = 0.278,
MclustDA_EDDA = 0.306,
SVM = 0.292,
GLM_LOOCV = 0.259,
GLM_CV5 = 0.266
)
library(knitr)
kable(df, caption = "Model Error Rates")
| tree.error | bag.error | rf.error | GLM | KNN | LDA | MclustDA | MclustDA_EDDA | SVM | GLM_LOOCV | GLM_CV5 |
|---|---|---|---|---|---|---|---|---|---|---|
| 0.2638889 | 0.2916667 | 0.3055556 | 0.319 | 0.292 | 0.319 | 0.278 | 0.306 | 0.292 | 0.259 | 0.266 |
It appears GLM with LOOCV still produces the lowest error rate (0.259), but the single classification tree is now a close second (0.2639), beating MclustDA and the other earlier methods; bagging (0.2917) and random forest (0.3056) did not improve on it.