#Kevin Kuipers (Completed by myself) #04/02/2019
#1. Question 8.4.4 pg 332
##Question 4 a) Sketch the tree corresponding to the partition of the predictor space illustrated in the left-hand panel of Figure 8.12. The numbers inside the boxes indicate the mean of Y within each region.
The corresponding tree, written as a chain of splits (region means at the leaves):

if \(X_{1} \ge 0\) then 5
if \(X_{2} \ge 0\) then 15
if \(0 \gt X_{1}\) then 0
if \(0 \gt X_{2}\) then 10
else 0
##Question 4 b) Create a diagram similar to the left-hand panel of Figure 8.12, using the tree illustrated in the right-hand panel of the same figure. You should divide up the predictor space into the correct regions, and indicate the mean for each region.
#install.packages('tree')
library(tree)
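For part b), the right-hand tree of Figure 8.12 can be translated into a partition diagram with base graphics. Below is a minimal sketch of the technique only; the split locations and region means are illustrative placeholders, not values read from the figure.
#Sketch: draw a two-dimensional partition for a tree that splits on X1 and X2.
#Split locations and region means below are placeholders.
plot(NA, xlim=c(-2,2), ylim=c(-3,3), xlab=expression(X[1]), ylab=expression(X[2]),
     main='Partition of the predictor space')
abline(h=1)              #root split on X2
segments(1, -3, 1, 1)    #split on X1 within the lower region
abline(h=2)              #second split on X2 in the upper region
segments(0, 2, 0, 3)     #split on X1 in the topmost region
text(x=c(-0.5, 1.5, 0, -1, 1), y=c(-1, -1, 1.5, 2.5, 2.5),
     labels=c('mean 1','mean 2','mean 3','mean 4','mean 5'))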
#2. Question 8.4.8 pg 332 In the lab, a classification tree was applied to the Carseats data set after converting Sales into a qualitative response variable. Now we will seek to predict Sales using regression trees and related approaches, treating the response as a quantitative variable.
##Question 8 a) Split the data set into a training set and a test set.
I will load the Carseats dataset found in the ISLR library. After that I will split the data set: 70% for training and the remaining 30% for testing. Before the split I will set the seed to 702.
#Loading Carseats dataset
library(ISLR)
data(Carseats)
#Setting seed.
set.seed(702)
#Training and testing data split
index <- sample(x=nrow(Carseats), size=0.70*nrow(Carseats))
training <- Carseats[index,]
testing <- Carseats[-index,]
#Question 8 b) I will fit a regression tree on the training data. I will also output the tree plot and the test MSE. The tree() function is found in the tree library. The regression tree will predict Sales using all the other variables in the data set.
##Tree Model Plots
#install.packages('tree')
library(tree)
#fitting tree regression model
tree.mod <- tree(Sales ~ ., data=training)
#Plotting tree regression model
plot(tree.mod)
text(tree.mod, pretty=0)
library('dplyr')
library(tidyverse)
library(ggdendro)
#ggplot of the tree diagram
ggdendrogram(tree.mod, segments=TRUE, labels=TRUE, leaf_labels = TRUE, rotate=FALSE, theme_dendro = TRUE) + labs(title="Tree Model of Sales Explained by All Variables")
##Test MSE
pred <- predict(tree.mod, newdata = testing)
test.mse <- mean((pred - testing$Sales)^2)
cat('The testing MSE is :', test.mse)
## The testing MSE is : 5.337681
#Question 8 c) Use cross-validation in order to determine the optimal level of tree complexity. Does pruning the tree improve the test MSE?
Now I will cross-validate the tree model and plot deviance against tree size in order to determine whether pruning the tree helps.
set.seed(702)
cv.tree.mod <- cv.tree(tree.mod)
plot(cv.tree.mod, type='b', main='Tree Size vs Deviance')
cv.tree.mod.df <- data.frame(
size = cv.tree.mod$size,
deviance = cv.tree.mod$dev
)
ggplot(cv.tree.mod.df, aes(x=size, y=deviance)) +
geom_point() +
geom_line(colour='blue') +
labs(title='Tree Size vs Deviance')
It appears that the optimal number of terminal nodes is 6. Therefore, I will prune the tree to 6 nodes and output the test MSE.
##Graph of the 6-node tree model
prune.tree.mod <- prune.tree(tree.mod, best=6)
plot(prune.tree.mod)
title(main='6-node Tree of Sales Explained by All Other Variables')
text(prune.tree.mod, pretty=0)
ggdendrogram(prune.tree.mod, segments=TRUE, labels=TRUE, leaf_labels = TRUE, rotate=FALSE, theme_dendro = TRUE) + labs(title="6-node Tree of Sales Explained by All Other Variables")
##Test MSE Now I will use the 6-node pruned tree, fit on the training data, to predict the testing data and compute the test MSE.
pred <- predict(prune.tree.mod, newdata = testing)
test.mse <- mean((pred - testing$Sales)^2)
cat('The testing MSE is :', test.mse)
With the squared error in place, the pruned tree's test MSE can be compared directly against the unpruned tree's 5.337681 to judge whether pruning helped; pruning trades a little bias for lower variance, so the test MSE may improve modestly.
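Because the squared term in the MSE is easy to drop by accident, a small helper keeps the computation in one place; a minimal sketch:
#Helper: mean squared error, so the ^2 cannot be forgotten at call sites
mse <- function(pred, obs) mean((obs - pred)^2)
#Example: mse(predict(prune.tree.mod, newdata=testing), testing$Sales)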
#Question 8 d) Use the bagging approach in order to analyze this data. What test MSE do you obtain? Use the importance() function to determine which variables are most important.
In order to use the bagging approach I will use the randomForest library. All variables will be used to explain Sales, and setting importance=TRUE when fitting lets the importance() function report which variables are most relevant. After fitting the model on the training data I will run it against the testing data and output the test MSE.
##Fitting randomForest Model using bagging
library(randomForest)
bag.mod <- randomForest(Sales ~ ., data=training, importance=TRUE, ntree=500, mtry=(dim(training)[2]-1))
bag.mod$importance
## %IncMSE IncNodePurity
## CompPrice 1.543092428 264.147866
## Income 0.272898611 121.662252
## Advertising 0.464209423 141.501956
## Population -0.036295200 67.489211
## Price 5.529682749 711.840683
## ShelveLoc 5.287692160 720.092741
## Age 0.492163356 178.088791
## Education 0.072684298 65.275216
## Urban 0.009675504 9.438239
## US 0.013178465 13.668325
importance(bag.mod)
## %IncMSE IncNodePurity
## CompPrice 35.639762 264.147866
## Income 12.448004 121.662252
## Advertising 18.654590 141.501956
## Population -2.247620 67.489211
## Price 73.629348 711.840683
## ShelveLoc 81.227898 720.092741
## Age 17.170070 178.088791
## Education 4.469004 65.275216
## Urban 1.323662 9.438239
## US 1.468959 13.668325
It appears ShelveLoc and Price are the two most important variables, followed by CompPrice and Age.
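The randomForest package also provides varImpPlot() for a quick visual ranking of the same two importance measures:
#Dot charts of %IncMSE and IncNodePurity for the bagged model
varImpPlot(bag.mod, main='Variable Importance (Bagging)')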
#Test MSE Now I will run the model fit on the training data against the testing data to output the test MSE.
pred <- predict(bag.mod, newdata=testing)
test.mse <- mean((pred - testing$Sales)^2)
cat('The test MSE of the model is: ', test.mse)
## The test MSE of the model is: 2.417689
The test MSE has decreased: bagging's 2.417689 is well below the 5.337681 of the very first tree model that was fit.
#Question 8 e) Use random forests to analyze this data. What test MSE do you obtain? Use the importance() function to determine which variables are most important. Describe the effect of m, the number of variables considered at each split, on the error rate obtained.
Now I will fit two random forest models, one with mtry=3 and one with mtry=5. mtry is the number of variables randomly sampled as candidates at each split. Then I will output and compare the test MSE of each model. I will also use the importance function on each model.
##Fitting Models
rf.3 <- randomForest(Sales ~ ., data=training, mtry=3, importance=TRUE)
rf.5 <- randomForest(Sales ~ ., data=training, mtry=5, importance=TRUE)
##Importance of mtry=3
importance(rf.3)
## %IncMSE IncNodePurity
## CompPrice 18.2988632 219.82797
## Income 6.8583398 176.29329
## Advertising 16.5701937 177.70190
## Population -2.8230850 130.08904
## Price 43.8568679 548.27217
## ShelveLoc 49.4300699 564.81388
## Age 16.2610178 255.79659
## Education 6.6956771 109.08170
## Urban -0.6381438 20.85426
## US 2.1510191 26.78344
##Importance of mtry=5
importance(rf.5)
## %IncMSE IncNodePurity
## CompPrice 25.255937 235.08251
## Income 9.545963 144.02416
## Advertising 14.946647 173.19953
## Population -1.107274 95.92273
## Price 54.687753 625.37520
## ShelveLoc 68.389934 657.12104
## Age 16.972406 227.56609
## Education 5.807438 84.54217
## Urban -1.776028 12.50194
## US 2.023125 17.38570
##Test MSE of mtry=3
pred <- predict(rf.3, newdata=testing)
test.mse <- mean((pred - testing$Sales)^2)
cat('The test MSE for mtry=3 is :', test.mse)
## The test MSE for mtry=3 is : 2.499195
##Test MSE of mtry=5
pred <- predict(rf.5, newdata=testing)
test.mse <- mean((pred - testing$Sales)^2)
cat('The test MSE for mtry=5 is :', test.mse)
## The test MSE for mtry=5 is : 2.319711
It appears mtry = 5 gives a very slightly lower test MSE (2.319711) than mtry = 3 (2.499195).
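To describe the effect of m more fully, the test MSE can be traced across the full range of mtry values. A sketch, assuming the same Carseats training/testing split as above (mtry=10 corresponds to bagging):
#Sweep mtry over all predictor counts and record the test MSE
set.seed(702)
n.pred <- ncol(training) - 1
mtry.mse <- rep(NA, n.pred)
for (m in 1:n.pred) {
  rf.m <- randomForest(Sales ~ ., data=training, mtry=m)
  mtry.mse[m] <- mean((predict(rf.m, newdata=testing) - testing$Sales)^2)
}
plot(1:n.pred, mtry.mse, type='b', xlab='mtry', ylab='Test MSE', main='Effect of mtry on Test MSE')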
#Question 8 Conclusion
The model with the lowest test MSE was the random forest with mtry=5 (2.319711), followed closely by bagging (2.417689). The single unpruned tree had the highest test MSE (5.337681), and pruning alone cannot be expected to close that gap entirely. So the ensemble approaches, bagging and random forests, were the best performers on the Carseats data.
#3. Question 8.4.9 pg 334 This problem involves the OJ data set which is part of the ISLR package.
##Question 9 a) Create a training set containing a random sample of 800 observations, and a test set containing the remaining observations.
##Training test split I will load the OJ data set found in the ISLR library. I will then split the data set into training and testing. The training set will have 800 observations and the testing set will contain the remaining observations.
library(ISLR)
data(OJ)
set.seed(702)
index <- sample(1:nrow(OJ), 800)
training <- OJ[index,]
testing <- OJ[-index,]
##Question 9 b) Fit a tree to the training data, with Purchase as the response and the other variables as predictors. Use the summary() function to produce summary statistics about the tree, and describe the results obtained. What is the training error rate? How many terminal nodes does the tree have?
##Model fit
Now I will fit a tree model on the training data with Purchase explained by all the other variables in the data set. I will then output the summary of the model.
#install.packages('tree')
tree.mod <- tree(Purchase ~ ., data=training)
summary(tree.mod)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = training)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "ListPriceDiff"
## Number of terminal nodes: 9
## Residual mean deviance: 0.7005 = 554.1 / 791
## Misclassification error rate: 0.1462 = 117 / 800
The misclassification error rate is 0.1462 (117/800). The number of terminal nodes is 9.
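The training error rate from summary() can also be checked directly by comparing the tree's fitted classes against the training labels; a quick sketch:
#Direct check of the training misclassification rate
train.pred <- predict(tree.mod, type='class')
mean(train.pred != training$Purchase)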
##Question 9 c) Type in the name of the tree object in order to get a detailed text output. Pick one of the terminal nodes, and interpret the information displayed.
#For whatever reason I kept getting an error when knitting to PDF. Please uncomment to reproduce the output below.
#install.packages('tree')
#library('tree')
#tree(Purchase ~ ., data=training)
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
## (full node listing omitted)
The terminal node I will interpret is node 8. Its split criterion is LoyalCH < 0.051325. The number of observations in the node is 60 and the deviance is 10.17. Of those 60 observations, 1.667% take the CH category for Purchase and 98.333% take the MM category. This matches terminal node 8 of the original model fit.
#Question 9 d) Create a plot of the tree, and interpret the results.
##Plot
Now I will plot the tree using both the plot() command and the ggplot()-based ggdendrogram() from ggdendro.
plot(tree.mod)
text(tree.mod, pretty=0)
ggdendrogram(tree.mod, segments=TRUE, labels=TRUE, leaf_labels = TRUE, rotate=FALSE, theme_dendro = TRUE) + labs(title="Tree Model of Purchase Explained by All Variables")
It appears the most important variable is LoyalCH. If its value is less than 0.50395 then most likely the purchase is going to be MM; a PriceDiff of less than 0.05 can also tip the purchase toward MM.
If LoyalCH is between 0.50395 and 0.764572 it seems that CH will most likely be purchased, although when PriceDiff is less than 0.265, and especially below -0.165, there is some possibility of MM being purchased.
#Question 9 e) Predict the response on the test data, and produce a confusion matrix comparing the test labels to the predicted test labels. What is the test error rate?
pred <- predict(tree.mod, testing, type = 'class')
cm <- table(pred, testing$Purchase)
error.rate <- 1 - ( (cm[1,1] + cm[2,2]) / sum(cm))
cm
##
## pred CH MM
## CH 144 38
## MM 22 66
cat('The error rate is :', error.rate)
## The error rate is : 0.2222222
It appears the error rate when testing the model is 22.22%.
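As with the MSE helper earlier, a small function avoids retyping the confusion-matrix arithmetic for each of the models below; a minimal sketch:
#Helper: misclassification error rate from a confusion matrix
class.error <- function(cm) 1 - sum(diag(cm)) / sum(cm)
#Example: class.error(cm) reproduces the error rate above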
#Question 9 f) I will now use the cv.tree() command on the training data to determine the optimal tree size.
cv.tree.t <- cv.tree(tree.mod, FUN=prune.misclass)
cv.tree.t
## $size
## [1] 9 5 2 1
##
## $dev
## [1] 145 141 142 313
##
## $k
## [1] -Inf 1.250000 6.333333 172.000000
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
#Question 9 g) Produce a plot with tree size on the x-axis and cross-validated classification error rate on the y-axis.
I will plot this using both the plot() command and ggplot() command
plot(cv.tree.t, type='b', main='CV Tree Model Performance')
cv.df <- data.frame (
Size = c(cv.tree.t$size),
Misclass = c(cv.tree.t$dev)
)
ggplot(data=cv.df, aes(x=Size, y=Misclass)) +
geom_point() +
geom_line(colour='blue') +
labs(title= 'CV Tree Model Performance')
#Question 9 h) Which tree size corresponds to the lowest cross-validated classification error rate?
According to the CV output and the plots, a 5-node tree produced the lowest cross-validated classification error (deviance 141 at size 5).
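Rather than reading the value off the plot, the optimal size can be pulled straight from the cv.tree object:
#Tree size with the lowest cross-validated misclassification count
cv.tree.t$size[which.min(cv.tree.t$dev)]
## [1] 5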
#Question 9 i) Produce a pruned tree corresponding to the optimal tree size obtained using cross-validation. If cross-validation does not lead to selection of a pruned tree, then create a pruned tree with five terminal nodes.
The CV results select 5 terminal nodes, so I will prune with best=5. For comparison I will also request best=4; as the summaries below show, this returns the same 5-node tree, because a 4-leaf subtree is not in the pruning sequence.
##Summary with best=4
prune.4 <- prune.misclass(tree.mod, best=4)
summary(prune.4)
##
## Classification tree:
## snip.tree(tree = tree.mod, nodes = 2L)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "ListPriceDiff"
## Number of terminal nodes: 5
## Residual mean deviance: 0.7887 = 627 / 795
## Misclassification error rate: 0.1525 = 122 / 800
##Summary with best=5
prune.5 <- prune.misclass(tree.mod, best=5)
summary(prune.5)
##
## Classification tree:
## snip.tree(tree = tree.mod, nodes = 2L)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "ListPriceDiff"
## Number of terminal nodes: 5
## Residual mean deviance: 0.7887 = 627 / 795
## Misclassification error rate: 0.1525 = 122 / 800
##Unpruned tree summary
summary(tree.mod)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = training)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "ListPriceDiff"
## Number of terminal nodes: 9
## Residual mean deviance: 0.7005 = 554.1 / 791
## Misclassification error rate: 0.1462 = 117 / 800
#Question 9 j)
Compare the training error rates between the pruned and unpruned trees. Which is higher?
According to the summaries of the models, the training misclassification error rate is lower for the unpruned tree (0.1462) than for the pruned tree (0.1525).
#Question 9 k)
Compare the test error rates between the pruned and unpruned trees. Which is higher?
Now I will compare the models' test error rates by running each model fit on the training data against the testing data.
#Pruned tree with 4-node error rate
pred.p4 <- predict(prune.4, testing, type='class')
cm.p4 <- table(pred.p4, testing$Purchase)
error.rate.p4 <- 1 - ( (cm.p4[1,1] + cm.p4[2,2]) / sum(cm.p4) )
#Pruned tree with 5-node error rate
pred.p5 <- predict(prune.5, testing, type='class')
cm.p5 <- table(pred.p5, testing$Purchase)
error.rate.p5 <- 1 - ( (cm.p5[1,1] + cm.p5[2,2]) / sum(cm.p5) )
#unpruned tree model error rate was already computed above and saved as error.rate
data.frame (
Pruned.tree.n4 = error.rate.p4,
Pruned.tree.n5 = error.rate.p5,
Unpruned.tree = error.rate)
It appears that the unpruned tree has the lowest error rate when testing it against the test data.
#4. Question 8.4.10 pg 334
#Question 10 a) Remove the observations for whom the salary information is unknown, and then log-transform the salaries.
First I will load the Hitters data set found in the ISLR library. Then I will remove the observations where Salary is unknown (NA) and log-transform the remaining salaries.
library(ISLR)
data(Hitters)
#install.packages('sqldf')
#install.packages('RSQLite')
#install.packages('digest')
library('RSQLite')
library('sqldf')
#Removing missing salary observations
Hitters <- sqldf(
"
SELECT
*
FROM
Hitters
WHERE
Salary != 'N/A'
AND
Salary IS NOT NULL"
)
Hitters$Salary <- log(Hitters$Salary)
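For reference, the same filtering can be done in base R without SQL; the equivalent lines are shown commented out since Salary has already been transformed above:
#Equivalent base-R approach (commented out; already applied above):
#Hitters <- Hitters[!is.na(Hitters$Salary), ]
#Hitters$Salary <- log(Hitters$Salary)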
#Question 10 b) Now I will create a training and testing set. The training data will contain the first 200 observations, and the remaining data will be the testing data.
training <- Hitters[1:200,]
testing <- Hitters[201:(nrow(Hitters)),]
#Question 10 c) Now I will perform boosting on the training set with 1,000 trees for a range of values of the shrinkage parameter \(\lambda\). Then I will produce a plot with the different shrinkage values on the x-axis and the corresponding training set MSE on the y-axis.
##Boosting with 1,000 trees over a grid of shrinkage values
set.seed(702)
#install.packages('gbm')
library('gbm')
p <- seq(from=-10, to=0, by=0.05)
lam <- 10^p
train.mse <- rep(NA, length(lam))
for (i in 1:length(lam)) {
boost.mod <- gbm(Salary ~ ., data=training, distribution = 'gaussian', n.trees=1000, shrinkage=lam[i])
pred <- predict(boost.mod, training, n.trees=1000)
train.mse[i] <- mean((pred - training$Salary)^2)
}
boost.df <- data.frame(
lambdas = lam,
train.mse = train.mse
)
plot(x=lam, y=train.mse, type='b', ylab='Training MSE', xlab='Shrinkage Values', main='Training MSE vs Shrinkage Values')
library(tidyverse)
ggplot(data=boost.df, aes(x=lambdas,y=train.mse)) +
geom_point() +
labs(y='Training MSE', x='Shrinkage Values', title='Training MSE vs Shrinkage Values')
It appears that as the shrinkage value increases, the training MSE decreases.
#Question 10 d) Now I will perform the same procedure as above, but run the training-data models against the testing data set in order to obtain the test MSE vs shrinkage values.
set.seed(702)
test.mse <- rep(NA, length(lam))
for (i in 1:length(lam)) {
boost.mod <- gbm(Salary ~ ., data=training, distribution = 'gaussian', n.trees=1000, shrinkage=lam[i])
pred <- predict(boost.mod, newdata=testing, n.trees=1000)
test.mse[i] <- mean((pred - testing$Salary)^2)
}
boost.df.test <- data.frame(
lambdas = lam,
test.mse = test.mse
)
plot(x=lam,y=test.mse, type='b', ylab='Test MSE', xlab='Shrinkage Values', main='Test MSE vs Shrinkage Values')
ggplot(data=boost.df.test, aes(x=lambdas,y=test.mse)) +
geom_point() +
geom_line(colour='blue') +
labs(y='Test MSE', x='Shrinkage Values', title='Test MSE vs Shrinkage Values')
Performing the same experiment on the test set gives a different result. For very small shrinkage values the test MSE stays high; as the shrinkage value increases the test MSE drops quickly, but once the shrinkage gets a little larger the test MSE increases again. There is a sweet spot between the shrinkage value and the test MSE.
#Question 10 e) Compare the test MSE of boosting to the test MSE that results from applying two of the regression approaches seen in Chapters 3 and 6.
I will fit a regular linear regression model (Chapter 3) and produce its test MSE. I will also fit a principal components regression model (Chapter 6) and produce its test MSE.
##Linear Regression Model
lm.fit <- lm(Salary ~ ., data=training)
pred.lm <- predict(lm.fit, testing)
lm.mse <- mean((pred.lm - testing$Salary)^2)
lm.mse
## [1] 0.4917959
##Principal components regression (PCR)
library(pls)
set.seed(702)
pcr.fit <- pcr(Salary ~., data=training, scale=TRUE, validation='CV')
validationplot(pcr.fit, val.type='MSEP')
It appears that around 7 components gives the best MSEP.
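The pls package can also choose the component count programmatically; a sketch using its selectNcomp() with the one-sigma heuristic, which may pick a slightly different value than eyeballing the validation plot:
#Programmatic choice of the number of components from the CV results
selectNcomp(pcr.fit, method='onesigma', plot=TRUE)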
##PCR test MSE
pred.pcr <- predict(pcr.fit, newdata=testing,ncomp=7)
pcr.mse <- mean((pred.pcr - testing$Salary)^2)
pcr.mse
##Boosting test MSE
boost.mse <- min(test.mse)
boost.mse
## [1] 0.2576145
It appears boosting's test MSE (0.2576145) is well below that of the regular linear regression model (0.4917959); the PCR test MSE computed above can be compared on the same footing.
#Question 10 f) Which variables appear to be the most important predictors in the boosted model?
Now I will fit a boosted model with the optimal value of lambda found above and n.trees=1000. I will output the summary to determine which variables are most significant in predicting Salary.
best.lam <- lam[which.min(test.mse)]
optimal.boost.mod <- gbm(Salary ~ ., data=training, distribution = 'gaussian', n.trees=1000, shrinkage = best.lam)
summary(optimal.boost.mod)
The summary lists the variables in order from most important to least important in predicting Salary. CAtBat is the most important variable.
#Question 10 g) Now apply bagging to the training set. What is the test set MSE for this approach?
set.seed(702)
bagging <- randomForest(Salary ~ ., data=training, mtry=(dim(training)[2]-1), ntree=500)
pred <- predict(bagging, newdata=testing)
test.mse <- mean((pred - testing$Salary)^2)
cat('The bagging test MSE is: ', test.mse)
## The bagging test MSE is: 0.2306229
It appears the bagging approach has a slightly lower test MSE (0.2306229) than the boosting method (0.2576145).
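For a quick side-by-side of the Hitters test MSEs computed so far (assuming the fitted objects above are still in the workspace; test.mse currently holds the bagging value):
#Collect the Hitters test MSEs into one data frame for comparison
data.frame(Linear=lm.mse, PCR=pcr.mse, Boosting=boost.mse, Bagging=test.mse)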
#Question 5. In the past couple of homework assignments you have used different classification methods to analyze the dataset you chose. For this homework, use tree-based classification methods (tree,bagging, randomforest, boosting) to model your data. Find the test error using any/all of methods (VSA, K-fold CV). Compare the results you obtained with the result from previous homework. Did the results improve? (Use the table you previously made to compare results)
##Data set split I will split the data set: 75% will be for training and the other 25% will be used for testing. The categorical variables will be recoded numerically as follows:
invnodes will be: 0-2 = 0 3-5 = 1 6-8 = 2 9-11 = 3 12-14 = 4 15-17 = 5 24-26 = 6 missing=99
Age: 10-19 = 1 20-29 = 2 30-39 = 3 40-49 = 4 50-59 = 5 60-69 = 6 70-79 = 7 80-89 = 8 90-99 = 9 missing = 10
menopause: lt40 = 0 ge40 = 1 premeno = 2 missing = 3
tumorsize: 0-4 = 0 5-9 = 1 10-14 = 2 15-19 = 3 20-24 = 4 25-29 = 5 30-34 = 6 35-39 = 7 40-44 = 8 45-49 = 9 50-54 = 10 55-59 = 11 missing = 12
node-caps: no = 0 yes = 1 missing = 3
breast: left = 0 right = 1 missing= 3
breast-quad: left-up = 0 left-low = 1 right-up = 2 right-low = 3 central = 4 missing = 5
irradiat: yes = 1 no = 0 missing = 2
#install.packages('data.table')
library(data.table)
library(tidyverse)
bc.dat <- fread('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer/breast-cancer.data', col.names=c('class','age','menopause','tumorsize','invnodes','nodecaps','degmalig','breast','breastquad','irradiant'))
Class <- ifelse(bc.dat$class=="no-recurrence-events",'No','Yes')
Class2 <- ifelse(bc.dat$class=="no-recurrence-events", 0,1)
bc.dat <- data.frame(bc.dat, Class,Class2)
bc.dat$invnodes <- ifelse(bc.dat$invnodes=='0-2',0,(ifelse(bc.dat$invnodes=='3-5',1,
(ifelse(bc.dat$invnodes=='6-8',2,(ifelse(bc.dat$invnodes=='9-11',3,(ifelse(bc.dat$invnodes=='12-14',4,(ifelse(bc.dat$invnodes=='15-17',5,(ifelse(bc.dat$invnodes=='24-26',6,99)))))))))))))
bc.dat$age <- ifelse(bc.dat$age=='10-19',1,(ifelse(bc.dat$age=='20-29',2,(ifelse(bc.dat$age=='30-39',3,(ifelse(bc.dat$age=='40-49',4,(ifelse(bc.dat$age=='50-59',5,(ifelse(bc.dat$age=='60-69',6,(ifelse(bc.dat$age=='70-79',7,(ifelse(bc.dat$age=='80-89',8,(ifelse(bc.dat$age=='90-99',9,10)))))))))))))))))
bc.dat$menopause <- ifelse(bc.dat$menopause=='lt40',0,(ifelse(bc.dat$menopause=='ge40',1,(ifelse(bc.dat$menopause=='premeno',2,3)))))
bc.dat$tumorsize <- ifelse(bc.dat$tumorsize=='0-4',0,ifelse(bc.dat$tumorsize=='5-9',1,ifelse(bc.dat$tumorsize=='10-14',2,ifelse(bc.dat$tumorsize=='15-19',3,ifelse(bc.dat$tumorsize=='20-24',4,ifelse(bc.dat$tumorsize=='25-29',5,ifelse(bc.dat$tumorsize=='30-34',6,ifelse(bc.dat$tumorsize=='35-39',7,ifelse(bc.dat$tumorsize=='40-44',8,ifelse(bc.dat$tumorsize=='45-49',9,ifelse(bc.dat$tumorsize=='50-54',10,ifelse(bc.dat$tumorsize=='55-59',11,12))))))))))))
bc.dat$nodecaps <- ifelse(bc.dat$nodecaps=='no',0,(ifelse(bc.dat$nodecaps=='yes',1,2)))
bc.dat$breast <- ifelse(bc.dat$breast=='left',0,(ifelse(bc.dat$breast=='right',1,2)))
bc.dat$breastquad <- ifelse(bc.dat$breastquad=='left_up',0,(ifelse(bc.dat$breastquad=='left_low',1,(ifelse(bc.dat$breastquad=='right_up',2,(ifelse(bc.dat$breastquad=='right_low',3,(ifelse(bc.dat$breastquad=='central',4,5)))))))))
bc.dat$irradiant <- ifelse(bc.dat$irradiant == 'yes',1,ifelse(bc.dat$irradiant=='no',0,2))
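The nested ifelse() chains above work but are hard to audit. An equivalent, more compact pattern uses a named lookup vector, shown for age as an illustration (commented out, since the columns above have already been converted to numeric codes):
#Alternative recode via a named lookup vector (illustration only):
#age.map <- c('10-19'=1,'20-29'=2,'30-39'=3,'40-49'=4,'50-59'=5,
#             '60-69'=6,'70-79'=7,'80-89'=8,'90-99'=9)
#bc.dat$age <- unname(age.map[bc.dat$age])
#bc.dat$age[is.na(bc.dat$age)] <- 10   #missing = 10, per the coding scheme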
set.seed(702)
index <- sample(x=nrow(bc.dat), size=0.75*nrow(bc.dat))
training <- bc.dat[index,]
testing <- bc.dat[-index,]
Class.test <- Class[-index]
##Tree Classification Model
library(tree)
set.seed(702)
tree.bc <- tree(Class ~ .-class-Class2,data=training)
tree.pred <- predict(tree.bc,testing,type="class")
cm.tree <- table(tree.pred,testing$Class)
cm.tree
##
## tree.pred No Yes
## No 49 17
## Yes 2 4
tree.error <- 1 - ( (cm.tree[1,1] + cm.tree[2,2]) / sum(cm.tree))
tree.error
## [1] 0.2638889
#Plotting the classification tree
plot(tree.bc)
text(tree.bc, pretty=0)
cv.tree.bc <- cv.tree(tree.bc)
plot(cv.tree.bc, type='b', main='Tree Size vs Deviance')
##Bagging Method and error rate
With 9 predictors in the formula, pure bagging would use mtry=9; mtry=6 is used here, which strictly makes this a random forest with a larger m.
library(randomForest)
set.seed(702)
bag.bc <- randomForest(Class ~ .-class-Class2, data=training, importance=TRUE, ntree=1000, mtry=6)
bag.pred <- predict(bag.bc, testing, type='class')
cm.bag <- table(bag.pred, testing$Class)
cm.bag
##
## bag.pred No Yes
## No 45 15
## Yes 6 6
bag.error <- 1 - ( (cm.bag[1,1] + cm.bag[2,2]) / sum(cm.bag))
bag.error
## [1] 0.2916667
#Random Forest Model and error rate
set.seed(702)
rf.bc <- randomForest(Class ~ .-class-Class2, data=training, importance=TRUE, mtry=6)
rf.pred <- predict(rf.bc, testing, type='class')
cm.rf <- table(rf.pred, testing$Class)
cm.rf
##
## rf.pred No Yes
## No 44 15
## Yes 7 6
rf.error <- 1 - ( (cm.rf[1,1] + cm.rf[2,2]) / sum(cm.rf))
rf.error
## [1] 0.3055556
df <- data.frame(
tree.error = tree.error,
bag.error = bag.error,
rf.error = rf.error,
GLM = 0.319,
KNN = 0.292,
LDA = 0.319,
MclustDA = 0.278,
MclustDA_EDDA = 0.306,
SVM = 0.292,
GLM_LOOCV = 0.259,
GLM_CV5 = 0.266
)
library(knitr)
kable(df, caption = "Model Error Rates")
| tree.error | bag.error | rf.error | GLM | KNN | LDA | MclustDA | MclustDA_EDDA | SVM | GLM_LOOCV | GLM_CV5 |
|---|---|---|---|---|---|---|---|---|---|---|
| 0.2638889 | 0.2916667 | 0.3055556 | 0.319 | 0.292 | 0.319 | 0.278 | 0.306 | 0.292 | 0.259 | 0.266 |
It appears GLM with LOOCV still produces the lowest error rate (0.259), but the single classification tree is now a close second (0.2639), beating MclustDA and the other earlier methods; bagging (0.2917) and random forest (0.3056) did not improve on it.