Assignment 7

Question 3

# Compare three node-impurity measures for a two-class problem as a
# function of p, the proportion of observations in one class.
p <- seq(0, 1, by = 0.01)

# x * log(x) with its limiting value 0 at x = 0. Plain 0 * log(0) evaluates
# to NaN in R, which would drop both endpoints of the cross-entropy curve.
xlogx <- function(x) ifelse(x > 0, x * log(x), 0)

gini <- 2 * p * (1 - p)
classerror <- 1 - pmax(p, 1 - p)
crossentropy <- -(xlogx(p) + xlogx(1 - p))

# Draw an empty frame first so that all three curves share the same axes.
plot(NA, NA, xlim = c(0, 1), ylim = c(0, 1), xlab = "p", ylab = "f")

lines(p, gini, type = "l")
lines(p, classerror, col = "blue")
lines(p, crossentropy, col = "red")

legend(
  x = "top",
  legend = c("gini", "class error", "cross entropy"),
  col = c("black", "blue", "red"),
  lty = 1,
  text.width = 0.22
)

Question 8

# Random half of the Carseats rows is used for fitting; `train` holds
# the selected row indices and the remaining rows form the test set.
set.seed(42)
train <- sample(seq_len(nrow(Carseats)), size = nrow(Carseats) / 2)

# Regression tree for Sales on all other columns, fitted on the training rows.
tree.carseats <- tree(Sales ~ ., data = Carseats, subset = train)
tree.pred <- predict(tree.carseats, newdata = Carseats[-train, ])

# Mean squared error on the held-out rows.
mean((Carseats[-train, "Sales"] - tree.pred)^2)

## [1] 5.686401

# Draw the unpruned regression tree together with its split labels.
plot(tree.carseats)
text(tree.carseats)

# Cross-validate over subtree sizes. cv.tree() assigns observations to folds
# at random, so the seed is fixed first (as is done before the OJ
# cross-validation) to make the resulting plot reproducible.
set.seed(42)
tree.carseats.cv <- cv.tree(tree.carseats)
plot(tree.carseats.cv)

# Cut the tree back to five terminal nodes and redraw it.
prune.carseats <- prune.tree(tree = tree.carseats, best = 5)

plot(prune.carseats)
text(prune.carseats)

# Held-out mean squared error of the pruned tree.
tree.pred <- predict(prune.carseats, newdata = Carseats[-train, ])
mean((Carseats[-train, "Sales"] - tree.pred)^2)

## [1] 4.866831

# Predicted versus observed Sales on the held-out rows; the reference line
# has intercept 0 and slope 1, so points on it are predicted exactly.
plot(
  x = tree.pred,
  y = Carseats[-train, "Sales"],
  xlab = "prediction",
  ylab = "actual"
)
abline(a = 0, b = 1)

require(randomForest)

## Loading required package: randomForest

## randomForest 4.7-1

## Type rfNews() to see new features/changes/bug fixes.

# Number of columns other than the response; passing this as mtry makes
# every predictor a split candidate at each node.
d <- ncol(Carseats) - 1

set.seed(42)
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
carseats.rf <- randomForest(
  Sales ~ .,
  data = Carseats,
  subset = train,
  mtry = d,
  importance = TRUE,
  ntree = 100
)

# Mean squared error on the held-out rows.
tree.pred <- predict(carseats.rf, newdata = Carseats[-train, ])
mean((tree.pred - Carseats[-train, "Sales"])^2)

## [1] 2.416614

# Diagnostic plot of the fitted forest.
plot(x = carseats.rf)

# Variable importance, first as a plot and then as a table.
varImpPlot(x = carseats.rf)

kable(x = importance(x = carseats.rf))

	%IncMSE	IncNodePurity
CompPrice	12.7572658	194.553633
Income	3.6925644	114.061269
Advertising	8.4583279	99.943015
Population	1.7756989	62.016441
Price	24.8149447	428.177561
ShelveLoc	24.9933231	428.448690
Age	5.7722652	127.112993
Education	1.2674696	43.842548
Urban	-0.2388463	10.176602
US	-0.0532919	7.770429

# Held-out MSE of a 100-tree random forest for each candidate mtry value.
mtry.values <- 3:10
mse <- numeric(length(mtry.values))  # preallocated, one slot per mtry value

set.seed(42)

for (i in seq_along(mtry.values)) {
  # mtry must follow the loop value; with a fixed mtry every fit is the same
  # model and the plotted curve shows only random-number noise.
  carseats.rf <- randomForest(
    Sales ~ .,
    data = Carseats,
    subset = train,
    mtry = mtry.values[i],
    importance = TRUE,
    ntree = 100
  )

  tree.pred <- predict(carseats.rf, newdata = Carseats[-train, ])
  mse[i] <- mean((tree.pred - Carseats[-train, "Sales"])^2)
}
plot(mtry.values, mse, type = "b", xlab = "mtry", ylab = "test MSE")

require(randomForest)

set.seed(42)
# Random forest with nine candidate predictors per split. `TRUE` is spelled
# out because `T` is an ordinary variable that can be reassigned.
carseats.rf <- randomForest(
  Sales ~ .,
  data = Carseats,
  subset = train,
  mtry = 9,
  importance = TRUE,
  ntree = 100
)

# Diagnostic plot of the fitted forest.
plot(carseats.rf)

# Variable importance, first as a plot and then as a table.
varImpPlot(carseats.rf)

kable(importance(carseats.rf))

	%IncMSE	IncNodePurity
CompPrice	12.9744625	197.97151
Income	5.1316050	103.17128
Advertising	5.4144641	110.59823
Population	-0.4748975	63.45821
Price	23.8930328	416.10410
ShelveLoc	25.0436020	417.41569
Age	4.1412728	133.36952
Education	0.5282113	46.89271
Urban	2.0743624	11.77462
US	-0.5563391	6.75921

From the table above, ShelveLoc is the most important predictor in terms of %IncMSE, closely followed by Price.

Question 9

# Draw 800 row indices at random for training; the rest become the test set.
set.seed(42)
train <- sample(seq_len(nrow(OJ)), size = 800)

OJ.train <- OJ[train, ]
OJ.test <- OJ[-train, ]

# There is no predictor `Buy`, so every other column is offered as a predictor.
OJ.tree <- tree(formula = Purchase ~ ., data = OJ.train)
summary(OJ.tree)

## 
## Classification tree:
## tree(formula = Purchase ~ ., data = OJ.train)
## Variables actually used in tree construction:
## [1] "LoyalCH"     "SalePriceMM" "PriceDiff"  
## Number of terminal nodes:  8 
## Residual mean deviance:  0.7392 = 585.5 / 792 
## Misclassification error rate: 0.1638 = 131 / 800

# Text listing of every node in the fitted tree.
print(OJ.tree)

## node), split, n, deviance, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 800 1066.00 CH ( 0.61500 0.38500 )  
##    2) LoyalCH < 0.48285 285  296.00 MM ( 0.21404 0.78596 )  
##      4) LoyalCH < 0.064156 64    0.00 MM ( 0.00000 1.00000 ) *
##      5) LoyalCH > 0.064156 221  260.40 MM ( 0.27602 0.72398 )  
##       10) SalePriceMM < 2.04 128  123.50 MM ( 0.18750 0.81250 ) *
##       11) SalePriceMM > 2.04 93  125.00 MM ( 0.39785 0.60215 ) *
##    3) LoyalCH > 0.48285 515  458.10 CH ( 0.83689 0.16311 )  
##      6) LoyalCH < 0.753545 230  282.70 CH ( 0.69565 0.30435 )  
##       12) PriceDiff < 0.265 149  203.00 CH ( 0.57718 0.42282 )  
##         24) PriceDiff < -0.165 32   38.02 MM ( 0.28125 0.71875 ) *
##         25) PriceDiff > -0.165 117  150.30 CH ( 0.65812 0.34188 )  
##           50) LoyalCH < 0.703993 105  139.60 CH ( 0.61905 0.38095 ) *
##           51) LoyalCH > 0.703993 12    0.00 CH ( 1.00000 0.00000 ) *
##       13) PriceDiff > 0.265 81   47.66 CH ( 0.91358 0.08642 ) *
##      7) LoyalCH > 0.753545 285  111.70 CH ( 0.95088 0.04912 ) *

# Draw the classification tree with its split labels.
plot(x = OJ.tree)
text(x = OJ.tree)

# Training-set confusion counts: rows are the observed Purchase classes,
# columns are the predicted classes.
OJ.pred.train <- predict(OJ.tree, newdata = OJ.train, type = "class")
kable(table(OJ.train[["Purchase"]], OJ.pred.train))

	CH	MM
CH	422	70
MM	61	247

# Same training table, with each count divided by the number of training rows.
kable(table(OJ.train[["Purchase"]], OJ.pred.train) / nrow(OJ.train))

	CH	MM
CH	0.52750	0.08750
MM	0.07625	0.30875

# Test-set confusion counts: rows are the observed Purchase classes,
# columns are the predicted classes.
OJ.pred.test <- predict(OJ.tree, newdata = OJ.test, type = "class")
kable(table(OJ.test[["Purchase"]], OJ.pred.test))

	CH	MM
CH	125	36
MM	15	94

# Same test table, with each count divided by the number of test rows.
kable(table(OJ.test[["Purchase"]], OJ.pred.test) / nrow(OJ.test))

	CH	MM
CH	0.4629630	0.1333333
MM	0.0555556	0.3481481

# Ten-fold cross-validation of the tree, pruning by misclassification.
set.seed(42)
OJ.tree.cv <- cv.tree(OJ.tree, FUN = prune.misclass, K = 10)
plot(OJ.tree.cv)

# Keep the subtree with two terminal nodes; this overwrites the full tree
# previously stored in OJ.tree.
OJ.tree <- prune.misclass(tree = OJ.tree, best = 2)

# Training-set confusion counts for the pruned tree (rows observed,
# columns predicted).
OJ.pred.train <- predict(OJ.tree, newdata = OJ.train, type = "class")
kable(table(OJ.train[["Purchase"]], OJ.pred.train))

	CH	MM
CH	431	61
MM	84	224

# Same training table, with each count divided by the number of training rows.
kable(table(OJ.train[["Purchase"]], OJ.pred.train) / nrow(OJ.train))

	CH	MM
CH	0.53875	0.07625
MM	0.10500	0.28000

# Test-set confusion counts for the tree currently stored in OJ.tree
# (rows observed, columns predicted).
OJ.pred.test <- predict(OJ.tree, newdata = OJ.test, type = "class")
kable(table(OJ.test[["Purchase"]], OJ.pred.test))

	CH	MM
CH	128	33
MM	26	83

# Same test table, with each count divided by the number of test rows.
kable(table(OJ.test[["Purchase"]], OJ.pred.test) / nrow(OJ.test))

	CH	MM
CH	0.4740741	0.1222222
MM	0.0962963	0.3074074

# Draw the tree currently stored in OJ.tree with its split labels.
plot(x = OJ.tree)
text(x = OJ.tree)

Assignment 7

Gitanjali Mule

4/20/2022

Question 3

Question 9