### The Carseats data set is available in the ISLR package
# ISLR: Introduction to Statistical Learning (book)
library(ISLR)
data(package="ISLR")
carseats<-Carseats
View(carseats)
library(tree)   ## for decision trees
library(caret)  ## for partitioning the data into train and test sets
## Loading required package: lattice
## Loading required package: ggplot2
## The following commands split the data into a
# training set and a test set.
# createDataPartition() returns row indices, stored in
# 'split'; the rows at those indices form the training
# data and all remaining rows form the test data.
# p = .7 means 70% of the entire data set becomes the
# training data and the remaining 30% the test data.
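# Optional: createDataPartition() samples rows at random,
# so the split and all numbers below change between runs.
# Setting a seed beforehand makes the split reproducible
# (123 is just an arbitrary example value).
set.seed(123)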
split <- createDataPartition(carseats$Urban, p = .7, list = FALSE)
train<-carseats[split,]
test<-carseats[-split,]
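# A quick sanity check: confirm the partition sizes and
# that the Urban class proportions are roughly preserved
# in both partitions (exact counts depend on the split).
nrow(train); nrow(test)
prop.table(table(train$Urban))
prop.table(table(test$Urban))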
# Construction of the decision tree.
# y ~ x: y is the response (predicted) variable and x is
# one or more independent variable(s). If x is '.', all
# remaining variables except y are used to predict y.
dtree <- tree(Urban ~ ., data = train)
plot(dtree)
text(dtree)
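# Optionally, summary() reports which predictors the tree
# actually uses, the number of terminal nodes and the
# training misclassification rate (values vary with the
# random split).
summary(dtree)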

## Cross-validating the decision tree; the resulting plot
# shows how the misclassification error changes with tree
# size (number of terminal nodes), i.e. where accuracy
# improves and where it degrades.
cv.dtree<-cv.tree(dtree,FUN = prune.misclass)
plot(cv.dtree)
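# The best size can also be picked programmatically rather
# than read off the plot: cv.dtree$size holds the candidate
# tree sizes and cv.dtree$dev the corresponding
# cross-validated misclassification counts. (A sketch; the
# selected size will vary with the split.)
best.size <- cv.dtree$size[which.min(cv.dtree$dev)]
best.size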

## Based on the plot above we can decide to what size
# (number of terminal nodes) the tree should be pruned.
prune.dtree<-prune.misclass(dtree,best = 6)
plot(prune.dtree)
text(prune.dtree)
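# Optional check: summary() on the pruned tree should show
# about the requested number of terminal nodes and a
# somewhat higher training error than the full tree.
summary(prune.dtree)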

# Predicting the test-data labels from the pruned
# decision tree.
dtree.pred<-predict(prune.dtree,test,type='class')
# We create a confusion matrix to determine false
# positives, false negatives, accuracy, sensitivity and
# specificity.
confusionMatrix(dtree.pred,test$Urban)
## Confusion Matrix and Statistics
##
##           Reference
## Prediction No Yes
##        No   6   1
##        Yes 29  83
##
##                Accuracy : 0.7479
##                  95% CI : (0.6601, 0.823)
##     No Information Rate : 0.7059
##     P-Value [Acc > NIR] : 0.1833
##
##                   Kappa : 0.2081
##  Mcnemar's Test P-Value : 8.244e-07
##
##             Sensitivity : 0.17143
##             Specificity : 0.98810
##          Pos Pred Value : 0.85714
##          Neg Pred Value : 0.74107
##              Prevalence : 0.29412
##          Detection Rate : 0.05042
##    Detection Prevalence : 0.05882
##       Balanced Accuracy : 0.57976
##
##        'Positive' Class : No
##
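# The headline numbers can also be reproduced by hand
# (illustrative; exact values depend on the random split):
table(Predicted = dtree.pred, Actual = test$Urban)
mean(dtree.pred == test$Urban)   ## overall accuracy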