Reading the file
setwd("E:/Great Lakes/Predictive modeling/Assignment")
paul <- read.csv("PaulBooks1.csv",header=TRUE)
paul1 <- read.csv("PaulBooks2.csv",header=TRUE)
head(paul)
## id Months NoBought Purchase
## 1 1001 24 0 0
## 2 1002 16 0 0
## 3 1003 15 0 0
## 4 1004 22 0 0
## 5 1005 15 0 1
## 6 1006 6 2 0
str(paul)
## 'data.frame': 1000 obs. of 4 variables:
## $ id : int 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 ...
## $ Months : int 24 16 15 22 15 6 11 11 26 14 ...
## $ NoBought: int 0 0 0 0 0 2 0 1 2 1 ...
## $ Purchase: int 0 0 0 0 1 0 0 0 0 0 ...
univariate analysis
summary(paul)
## id Months NoBought Purchase
## Min. :1001 Min. : 1.00 Min. :0.000 Min. :0.000
## 1st Qu.:1251 1st Qu.: 7.00 1st Qu.:0.000 1st Qu.:0.000
## Median :1500 Median :12.00 Median :0.000 Median :0.000
## Mean :1500 Mean :12.46 Mean :0.389 Mean :0.083
## 3rd Qu.:1750 3rd Qu.:15.00 3rd Qu.:1.000 3rd Qu.:0.000
## Max. :2000 Max. :35.00 Max. :5.000 Max. :1.000
library(psych)
describe(paul)
## vars n mean sd median trimmed mad min max range
## id 1 1000 1500.50 288.82 1500.5 1500.50 370.65 1001 2000 999
## Months 2 1000 12.46 8.00 12.0 11.55 5.93 1 35 34
## NoBought 3 1000 0.39 0.68 0.0 0.25 0.00 0 5 5
## Purchase 4 1000 0.08 0.28 0.0 0.00 0.00 0 1 1
## skew kurtosis se
## id 0.00 -1.20 9.13
## Months 0.97 0.67 0.25
## NoBought 2.05 5.21 0.02
## Purchase 3.02 7.12 0.01
Missing value Imputation
library(VIM)
## Warning: package 'VIM' was built under R version 3.4.3
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
## Warning: package 'data.table' was built under R version 3.4.3
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
aggr(paul,prop = F,cex.axis = 0.7,numbers=T)
Univariate analysis
par(mfrow=c(2, 2), oma=c(0,0,3,0))
boxplot(paul$Months,main ="MONTHS")
boxplot(paul$NoBought,main ="NoBought")
hist(paul$Months,main ="MONTHS", xlab=NA, ylab=NA)
hist(paul$NoBought,main ="NoBought", xlab=NA, ylab=NA)
Bivariate analysis
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.4.2
## corrplot 0.84 loaded
corx <- cor(paul)
corrplot(corx)
boxplot(paul$Months ~ paul$Purchase,horizontal=TRUE)
boxplot(paul$NoBought ~ paul$Purchase,horizontal=TRUE)
Response rate
resp <- sum(paul$Purchase/nrow(paul))
resp
## [1] 0.083
Model Development
library(MASS)
attach(paul)
model1 <- lda(Purchase ~ Months + NoBought,data=paul)
model1
## Call:
## lda(Purchase ~ Months + NoBought, data = paul)
##
## Prior probabilities of groups:
## 0 1
## 0.917 0.083
##
## Group means:
## Months NoBought
## 0 12.731734 0.3336968
## 1 9.409639 1.0000000
##
## Coefficients of linear discriminants:
## LD1
## Months -0.05098078
## NoBought 1.41242601
library(DiscriMiner)
## Warning: package 'DiscriMiner' was built under R version 3.4.3
x <- paul[,2:3]
y <- paul[,4]
fisher <- desDA(x,y)
fisher
##
## Descriptive Discriminant Analysis
## ---------------------------------
## $power discriminant power
## $values table of eigenvalues
## $discrivar discriminant variables
## $discor correlations
## $scores discriminant scores
## ---------------------------------
##
## $power
## cor_ratio wilks_lamb F_statistic p_values
## Months 0.0131350 0.9868650 13.2832222 0.0002816
## NoBought 0.0728742 0.9271258 78.4450638 0.0000000
##
##
## $values
## value proportion accumulated
## DF1 0.0868 100.0000 100.0000
##
##
## $discrivar
## DF1
## constant 0.08558
## Months -0.05098
## NoBought 1.41243
##
##
## $discor
## DF1
## Months -0.3909
## NoBought 0.9207
##
##
## $scores
## z1
## 1 -1.1380
## 2 -0.7301
## 3 -0.6791
## 4 -1.0360
## 5 -0.6791
## 6 2.6046
## ...
mahalanaob <- linDA(x,y)
mahalanaob
##
## Linear Discriminant Analysis
## -------------------------------------------
## $functions discrimination functions
## $confusion confusion matrix
## $scores discriminant scores
## $classification assigned class
## $error_rate error rate
## -------------------------------------------
##
## $functions
## 0 1
## constant -1.473 -4.295
## Months 0.200 0.143
## NoBought 0.699 2.267
##
##
## $confusion
## predicted
## original 0 1
## 0 907 10
## 1 72 11
##
##
## $error_rate
## [1] 0.082
##
##
## $scores
## 0 1
## 1 3.315145 -0.8650023
## 2 1.718994 -2.0082531
## 3 1.519476 -2.1511595
## 4 2.916107 -1.1508150
## 5 1.519476 -2.1511595
## 6 1.120868 1.0966496
## ...
##
## $classification
## [1] 0 0 0 0 0 0
## Levels: 0 1
## ...
Testing on Holdout sample
class <- list(0)
pred0 <- (mahalanaob$functions[1,1] + (paul1$Months*mahalanaob$functions[2,1])+(paul1$NoBought*mahalanaob$functions[3,1]))
pred1 <- (mahalanaob$functions[1,2] + (paul1$Months*mahalanaob$functions[2,2])+(paul1$NoBought*mahalanaob$functions[3,2]))
for(i in 1:1000){
if(pred0[i] > pred1[i]){ class[i] <- 0}else{class[i] <- 1}
}
df <- data.frame(matrix(unlist(class), nrow=1000, byrow=T))
Confusion matrix
table(actual=paul1$Purchase,predict=df$matrix.unlist.class...nrow...1000..byrow...T.)
## predict
## actual 0 1
## 0 908 11
## 1 67 14
Adding paractical value, changing the cutoff value to (1/7)
From scores getting the probablity value
Prob_0 <- 1
Prob_1 <- 1
Prob_test <- 0
class_train <-0
class_test <- 0
for(i in 1:1000){
Prob_0[i] <- exp(mahalanaob$scores[i,1])/((exp(mahalanaob$scores[i,1]))+(exp(mahalanaob$scores[i,2])))
}
for(i in 1:1000){
Prob_1[i] <- exp(mahalanaob$scores[i,2])/((exp(mahalanaob$scores[i,1]))+(exp(mahalanaob$scores[i,2])))
}
for(i in 1:1000){
if (Prob_1[i] > (1/7)){class_train[i] <- 1} else {class_train[i] <- 0}
}
table(actual=paul$Purchase,predict=class_train)
## predict
## actual 0 1
## 0 791 126
## 1 49 34
Probablity for hold out sample
for(i in 1:1000){
Prob_test[i] <- exp(pred1[i])/((exp(pred1[i]))+(exp(pred0[i])))
}
for(i in 1:1000){
if (Prob_test[i] > (1/7)){class_test[i] <- 1} else {class_test[i] <- 0}
}
table(actual=paul1$Purchase,predict=class_test)
## predict
## actual 0 1
## 0 805 114
## 1 39 42