library(rpart)
library(rattle)
## Loading required package: RGtk2
## Rattle: A free graphical interface for data mining with R.
## Version 3.5.0 Copyright (c) 2006-2015 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(rpart.plot)
library(RColorBrewer)
library(crossval)
library(gplots)
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
library(vcd)
## Loading required package: grid
library(Metrics)
Read the Caravan.csv file
# Load the Caravan data from a CSV file picked interactively.
# (An earlier read from a hard-coded, user-specific Windows path was removed:
# its result was immediately overwritten by this read, and it fails on any
# machine where that path does not exist.)
d1 <- read.csv(file.choose(), header = TRUE)

# Keep the loaded data under a second name; the split below reads from it.
d1.ori <- d1

# Fixed seed so the random train/test split is reproducible.
set.seed(99)

# 50/50 split: sample half of the row names for training; every remaining
# row goes to the test set.
train_rows <- sample(row.names(d1.ori), size = round(nrow(d1.ori) * 0.5))
tr <- d1.ori[train_rows, ]
te <- d1.ori[!(row.names(d1.ori) %in% train_rows), ]
Reset the original training and test data - just to be sure
# Working copies of the split: tr1 is used to fit the tree, te1 supplies the
# test rows and observed outcomes, te2 receives the predicted labels.
te2 <- te
te1 <- te
tr1 <- tr
Zero-R strategy: predict that no one will purchase
# Zero-R baseline: label every test row in te2 as a non-purchase (0).
te2$Purchase <- numeric(nrow(te2))

# Convert the response to a factor so that rpart fits a classification tree.
tr1$Purchase <- as.factor(tr1$Purchase)

# Fit the tree on all remaining columns as predictors.
fit1 <- rpart(
  formula = Purchase ~ .,
  data = tr1,
  control = rpart.control(minsplit = 20, minbucket = 1, cp = 0.008)
)
| NAMES | INFORMATION | VALUES | labels |
|---|---|---|---|
| PPERSAUT | car policy | 1-8 values | - |
| MOSTYPE | Customer subtype | 1-41 | FYE,12(affluent young) |
| PBRAND | fire policy | **(0-7) values** | - |
| MBERHOOG | High status | (0-9) values | - |
| MBERMIDD | Middle management | (0-9) values | - |
# Auto-print the fitted tree. Per the legend in the output, each node lists
# its split rule, row count (n), loss, predicted class (yval) and class
# probabilities (yprob); `*` marks terminal nodes.
# In the recorded run below only terminal nodes 59 and 31 predict class 1.
fit1
## n= 2911
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 2911 181 0 (0.93782205 0.06217795)
## 2) PPERSAUT< 5.5 1755 53 0 (0.96980057 0.03019943) *
## 3) PPERSAUT>=5.5 1156 128 0 (0.88927336 0.11072664)
## 6) MOSTYPE>=12.5 785 56 0 (0.92866242 0.07133758) *
## 7) MOSTYPE< 12.5 371 72 0 (0.80592992 0.19407008)
## 14) PBRAND< 3.5 213 24 0 (0.88732394 0.11267606)
## 28) MBERHOOG< 5.5 181 15 0 (0.91712707 0.08287293) *
## 29) MBERHOOG>=5.5 32 9 0 (0.71875000 0.28125000)
## 58) MBERMIDD< 1.5 23 3 0 (0.86956522 0.13043478) *
## 59) MBERMIDD>=1.5 9 3 1 (0.33333333 0.66666667) *
## 15) PBRAND>=3.5 158 48 0 (0.69620253 0.30379747)
## 30) MBERMIDD< 6.5 142 37 0 (0.73943662 0.26056338) *
## 31) MBERMIDD>=6.5 16 5 1 (0.31250000 0.68750000) *
# Trigger garbage collection; the call also prints the memory-usage table
# shown below.
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 593337 31.7 1168576 62.5 750400 40.1
## Vcells 1302014 10.0 2658172 20.3 2657611 20.3
# Draw the fitted tree with fancyRpartPlot (graphical output only).
fancyRpartPlot(fit1)
# Print the complexity-parameter (CP) table for the tree.
# In the recorded run below, the 6-split tree has xerror 1.105 against 1.000
# for the root-only tree, so in cross-validation the splits do not improve on
# simply predicting the majority class.
printcp(fit1)
##
## Classification tree:
## rpart(formula = Purchase ~ ., data = tr1, control = rpart.control(minsplit = 20,
## minbucket = 1, cp = 0.008))
##
## Variables actually used in tree construction:
## [1] MBERHOOG MBERMIDD MOSTYPE PBRAND PPERSAUT
##
## Root node error: 181/2911 = 0.062178
##
## n= 2911
##
## CP nsplit rel error xerror xstd
## 1 0.0082873 0 1.00000 1.000 0.071982
## 2 0.0080000 6 0.95028 1.105 0.075402
# Explicitly print the fitted tree's node listing (split rule, n, loss,
# predicted class and class probabilities per node; `*` = terminal node).
print(fit1)
## n= 2911
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 2911 181 0 (0.93782205 0.06217795)
## 2) PPERSAUT< 5.5 1755 53 0 (0.96980057 0.03019943) *
## 3) PPERSAUT>=5.5 1156 128 0 (0.88927336 0.11072664)
## 6) MOSTYPE>=12.5 785 56 0 (0.92866242 0.07133758) *
## 7) MOSTYPE< 12.5 371 72 0 (0.80592992 0.19407008)
## 14) PBRAND< 3.5 213 24 0 (0.88732394 0.11267606)
## 28) MBERHOOG< 5.5 181 15 0 (0.91712707 0.08287293) *
## 29) MBERHOOG>=5.5 32 9 0 (0.71875000 0.28125000)
## 58) MBERMIDD< 1.5 23 3 0 (0.86956522 0.13043478) *
## 59) MBERMIDD>=1.5 9 3 1 (0.33333333 0.66666667) *
## 15) PBRAND>=3.5 158 48 0 (0.69620253 0.30379747)
## 30) MBERMIDD< 6.5 142 37 0 (0.73943662 0.26056338) *
## 31) MBERMIDD>=6.5 16 5 1 (0.31250000 0.68750000) *
# Base-graphics rendering of the tree: draw the branches, then add the labels.
plot(fit1)
text(fit1)

# CP value of the cptable row with the smallest cross-validated error (xerror).
cp_table <- fit1$cptable
cp_table[which.min(cp_table[, "xerror"]), "CP"]
## [1] 0.008287293

# Predict a class label for every row of the held-out test set.
Prediction <- predict(fit1, newdata = te1, type = "class")
Compare with base model
Update the prediction
# Store the tree's class predictions in the te2 copy of the test set
# (this replaces whatever te2$Purchase held before).
te2$Purchase <- Prediction

# Recode observed (te1) and predicted (te2) 0/1 outcomes with readable labels.
outcome_labels <- c("Not purchased", "Purchased")
Actual <- factor(te1$Purchase, levels = c(0, 1), labels = outcome_labels)
Pred <- factor(te2$Purchase, levels = c(0, 1), labels = outcome_labels)

# Confusion counts, with "Not purchased" treated as the negative class.
# In the recorded run: TP = 6 and FN = 161, i.e. only 6 of the 167 actual
# purchasers in the test set are identified.
cm4 <- confusionMatrix(Actual, Pred, negative = "Not purchased")
print(cm4)
## FP TP TN FN
## 23 6 2721 161
## attr(,"negative")
## [1] "Not purchased"
# Accuracy, sensitivity, specificity and related measures derived from cm4.
diagnosticErrors(cm4)
## acc sens spec ppv npv lor
## 0.93679148 0.03592814 0.99161808 0.20689655 0.94413602 1.48361563
## attr(,"negative")
## [1] "Not purchased"
# Classification error: share of test rows whose prediction differs from the
# observed outcome.
ce(Actual, Pred)
## [1] 0.06320852