library(rpart)
library(rattle)

## Loading required package: RGtk2
## Rattle: A free graphical interface for data mining with R.
## Version 3.5.0 Copyright (c) 2006-2015 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.

library(rpart.plot)
library(RColorBrewer)
library(crossval)
library(gplots)

## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:stats':
## 
##     lowess

library(vcd)

## Loading required package: grid

library(Metrics)

Read the Caravan.csv file

# Load the Caravan data into `d1`.
# Previously the file was read twice: once from a hard-coded, user-specific
# path (an error on any other machine) and then again through an interactive
# chooser that overwrote the first result. Read it once instead: use the
# known location when it exists, otherwise ask the user to pick the file.
caravan_path <- "C:/Users/vananga/Downloads/Caravan2.csv"
if (!file.exists(caravan_path)) {
  caravan_path <- file.choose()
}
d1 <- read.csv(caravan_path, header = TRUE)

# Keep an untouched copy of the loaded data.
d1.ori <- d1

# Fixed seed so the random split below is reproducible.
set.seed(99)

# 50/50 split by row name: draw half of the row names for training and
# send every row that was not drawn to the test set.
train_rows <- sample(row.names(d1.ori), size = round(nrow(d1.ori) * 0.5))
tr <- d1.ori[train_rows, ]
te <- d1.ori[!(row.names(d1.ori) %in% row.names(tr)), ]

Reset the original training and test data - just to be sure

# Working copies of the split: one training copy and two test copies, so
# that `tr` and `te` themselves are never modified further down.
te2 <- te1 <- te
tr1 <- tr

ZeroR strategy: predict that no one will purchase

# Baseline: mark every test row as "no purchase" (0).
te2[["Purchase"]] <- numeric(nrow(te2))

Building the tree

# A factor response makes rpart fit a classification tree.
tr1$Purchase <- as.factor(tr1$Purchase)

# Fit Purchase against all other columns of the training copy.
fit1 <- rpart(
  Purchase ~ .,
  data = tr1,
  control = rpart.control(minsplit = 20, minbucket = 1, cp = 0.008)
)

NAMES	INFORMATION	VALUES 1	labels
PPERSAUT	`car policy`	1-8 values	-
MOSTYPE	`Customer subtype`	1-41	FYE,12(affluent young)
PBRAND	`fire policy`	(0-7) values	-
MBERHOOG	`High status`	(0-9)values	-
MBERMIDD	`Middle management`	(0-9)values	-

# Auto-print the fitted tree: one line per node with split, n, loss,
# predicted class and class probabilities.
fit1

## n= 2911 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 2911 181 0 (0.93782205 0.06217795)  
##    2) PPERSAUT< 5.5 1755  53 0 (0.96980057 0.03019943) *
##    3) PPERSAUT>=5.5 1156 128 0 (0.88927336 0.11072664)  
##      6) MOSTYPE>=12.5 785  56 0 (0.92866242 0.07133758) *
##      7) MOSTYPE< 12.5 371  72 0 (0.80592992 0.19407008)  
##       14) PBRAND< 3.5 213  24 0 (0.88732394 0.11267606)  
##         28) MBERHOOG< 5.5 181  15 0 (0.91712707 0.08287293) *
##         29) MBERHOOG>=5.5 32   9 0 (0.71875000 0.28125000)  
##           58) MBERMIDD< 1.5 23   3 0 (0.86956522 0.13043478) *
##           59) MBERMIDD>=1.5 9   3 1 (0.33333333 0.66666667) *
##       15) PBRAND>=3.5 158  48 0 (0.69620253 0.30379747)  
##         30) MBERMIDD< 6.5 142  37 0 (0.73943662 0.26056338) *
##         31) MBERMIDD>=6.5 16   5 1 (0.31250000 0.68750000) *

# Run garbage collection; at top level the memory-usage table it returns
# is printed.
gc()

##           used (Mb) gc trigger (Mb) max used (Mb)
## Ncells  593337 31.7    1168576 62.5   750400 40.1
## Vcells 1302014 10.0    2658172 20.3  2657611 20.3

# Draw the fitted classification tree with fancyRpartPlot().
fancyRpartPlot(fit1)

# Show the complexity-parameter table of the fit: CP, number of splits,
# relative error, cross-validated error (xerror) and its standard error.
printcp(fit1)

## 
## Classification tree:
## rpart(formula = Purchase ~ ., data = tr1, control = rpart.control(minsplit = 20, 
##     minbucket = 1, cp = 0.008))
## 
## Variables actually used in tree construction:
## [1] MBERHOOG MBERMIDD MOSTYPE  PBRAND   PPERSAUT
## 
## Root node error: 181/2911 = 0.062178
## 
## n= 2911 
## 
##          CP nsplit rel error xerror     xstd
## 1 0.0082873      0   1.00000  1.000 0.071982
## 2 0.0080000      6   0.95028  1.105 0.075402

# Print the node listing of the fitted tree explicitly (same listing as
# the bare `fit1` shown earlier).
print(fit1)

## n= 2911 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 2911 181 0 (0.93782205 0.06217795)  
##    2) PPERSAUT< 5.5 1755  53 0 (0.96980057 0.03019943) *
##    3) PPERSAUT>=5.5 1156 128 0 (0.88927336 0.11072664)  
##      6) MOSTYPE>=12.5 785  56 0 (0.92866242 0.07133758) *
##      7) MOSTYPE< 12.5 371  72 0 (0.80592992 0.19407008)  
##       14) PBRAND< 3.5 213  24 0 (0.88732394 0.11267606)  
##         28) MBERHOOG< 5.5 181  15 0 (0.91712707 0.08287293) *
##         29) MBERHOOG>=5.5 32   9 0 (0.71875000 0.28125000)  
##           58) MBERMIDD< 1.5 23   3 0 (0.86956522 0.13043478) *
##           59) MBERMIDD>=1.5 9   3 1 (0.33333333 0.66666667) *
##       15) PBRAND>=3.5 158  48 0 (0.69620253 0.30379747)  
##         30) MBERMIDD< 6.5 142  37 0 (0.73943662 0.26056338) *
##         31) MBERMIDD>=6.5 16   5 1 (0.31250000 0.68750000) *

# Base-graphics rendering of the tree: plot() draws the branches and
# text() then adds the split labels onto that plot.
plot(fit1)
text(fit1)

# Report the CP value of the row with the lowest cross-validated error.
# local() keeps the helper binding out of the global workspace.
local({
  cp_table <- fit1$cptable
  cp_table[which.min(cp_table[, "xerror"]), "CP"]
})

## [1] 0.008287293

# Predicted class label for every row of the test copy.
Prediction <- predict(fit1, newdata = te1, type = "class")

Compare with base model

Update the prediction

# Replace the all-zero baseline in te2 with the tree's predictions.
te2[["Purchase"]] <- Prediction

# Recode predicted (te2) and observed (te1) outcomes from 0/1 to readable
# labels, using the same level order for both.
Pred <- factor(
  te2$Purchase,
  levels = c(0, 1),
  labels = c("Not purchased", "Purchased")
)
Actual <- factor(
  te1$Purchase,
  levels = c(0, 1),
  labels = c("Not purchased", "Purchased")
)

# Confusion-matrix counts (FP, TP, TN, FN), treating "Not purchased" as the
# negative class; the trailing bare name prints the result.
cm4 <- confusionMatrix(Actual, Pred, negative = "Not purchased")
cm4

##   FP   TP   TN   FN 
##   23    6 2721  161 
## attr(,"negative")
## [1] "Not purchased"

# Summary metrics derived from the confusion-matrix counts in cm4:
# accuracy, sensitivity, specificity, PPV, NPV and log odds ratio.
diagnosticErrors(cm4)

##        acc       sens       spec        ppv        npv        lor 
## 0.93679148 0.03592814 0.99161808 0.20689655 0.94413602 1.48361563 
## attr(,"negative")
## [1] "Not purchased"

# Classification error: share of test rows whose predicted label differs
# from the observed one.
ce(actual = Actual, predicted = Pred)

## [1] 0.06320852