Reading the file

setwd("E:/Great Lakes/Predictive modeling/Assignment")
paul <- read.csv("PaulBooks1.csv",header=TRUE)
paul1 <- read.csv("PaulBooks2.csv",header=TRUE)
head(paul)
##     id Months NoBought Purchase
## 1 1001     24        0        0
## 2 1002     16        0        0
## 3 1003     15        0        0
## 4 1004     22        0        0
## 5 1005     15        0        1
## 6 1006      6        2        0
str(paul)
## 'data.frame':    1000 obs. of  4 variables:
##  $ id      : int  1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 ...
##  $ Months  : int  24 16 15 22 15 6 11 11 26 14 ...
##  $ NoBought: int  0 0 0 0 0 2 0 1 2 1 ...
##  $ Purchase: int  0 0 0 0 1 0 0 0 0 0 ...

univariate analysis

summary(paul)
##        id           Months         NoBought        Purchase    
##  Min.   :1001   Min.   : 1.00   Min.   :0.000   Min.   :0.000  
##  1st Qu.:1251   1st Qu.: 7.00   1st Qu.:0.000   1st Qu.:0.000  
##  Median :1500   Median :12.00   Median :0.000   Median :0.000  
##  Mean   :1500   Mean   :12.46   Mean   :0.389   Mean   :0.083  
##  3rd Qu.:1750   3rd Qu.:15.00   3rd Qu.:1.000   3rd Qu.:0.000  
##  Max.   :2000   Max.   :35.00   Max.   :5.000   Max.   :1.000
library(psych)
describe(paul)
##          vars    n    mean     sd median trimmed    mad  min  max range
## id          1 1000 1500.50 288.82 1500.5 1500.50 370.65 1001 2000   999
## Months      2 1000   12.46   8.00   12.0   11.55   5.93    1   35    34
## NoBought    3 1000    0.39   0.68    0.0    0.25   0.00    0    5     5
## Purchase    4 1000    0.08   0.28    0.0    0.00   0.00    0    1     1
##          skew kurtosis   se
## id       0.00    -1.20 9.13
## Months   0.97     0.67 0.25
## NoBought 2.05     5.21 0.02
## Purchase 3.02     7.12 0.01

Missing value Imputation

library(VIM)
## Warning: package 'VIM' was built under R version 3.4.3
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
## Warning: package 'data.table' was built under R version 3.4.3
## VIM is ready to use. 
##  Since version 4.0.0 the GUI is in its own package VIMGUI.
## 
##           Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
aggr(paul,prop = F,cex.axis = 0.7,numbers=T)

Univariate analysis

par(mfrow=c(2, 2), oma=c(0,0,3,0))
boxplot(paul$Months,main ="MONTHS")
boxplot(paul$NoBought,main ="NoBought")
hist(paul$Months,main ="MONTHS", xlab=NA, ylab=NA)
hist(paul$NoBought,main ="NoBought", xlab=NA, ylab=NA)

Bivariate analysis

library(corrplot)
## Warning: package 'corrplot' was built under R version 3.4.2
## corrplot 0.84 loaded
corx <- cor(paul)
corrplot(corx)

boxplot(paul$Months ~ paul$Purchase,horizontal=TRUE)

boxplot(paul$NoBought ~ paul$Purchase,horizontal=TRUE)

Response rate

resp <- sum(paul$Purchase/nrow(paul))
resp
## [1] 0.083

Model Development

library(MASS)
attach(paul)

model1 <- lda(Purchase ~ Months + NoBought,data=paul)
model1
## Call:
## lda(Purchase ~ Months + NoBought, data = paul)
## 
## Prior probabilities of groups:
##     0     1 
## 0.917 0.083 
## 
## Group means:
##      Months  NoBought
## 0 12.731734 0.3336968
## 1  9.409639 1.0000000
## 
## Coefficients of linear discriminants:
##                  LD1
## Months   -0.05098078
## NoBought  1.41242601
library(DiscriMiner)
## Warning: package 'DiscriMiner' was built under R version 3.4.3
x <- paul[,2:3]
y <- paul[,4]

fisher <- desDA(x,y)
fisher
## 
## Descriptive Discriminant Analysis
## ---------------------------------
## $power      discriminant power
## $values     table of eigenvalues
## $discrivar  discriminant variables
## $discor     correlations
## $scores     discriminant scores
## ---------------------------------
## 
## $power
##           cor_ratio   wilks_lamb  F_statistic  p_values  
## Months     0.0131350   0.9868650  13.2832222    0.0002816
## NoBought   0.0728742   0.9271258  78.4450638    0.0000000
## 
## 
## $values
##      value     proportion  accumulated
## DF1    0.0868  100.0000    100.0000   
## 
## 
## $discrivar
##                DF1
## constant   0.08558
## Months    -0.05098
## NoBought   1.41243
## 
## 
## $discor
##               DF1
## Months    -0.3909
## NoBought   0.9207
## 
## 
## $scores
##         z1
## 1  -1.1380
## 2  -0.7301
## 3  -0.6791
## 4  -1.0360
## 5  -0.6791
## 6   2.6046
## ...
mahalanaob <- linDA(x,y)
mahalanaob
## 
## Linear Discriminant Analysis
## -------------------------------------------
## $functions        discrimination functions
## $confusion        confusion matrix
## $scores           discriminant scores
## $classification   assigned class
## $error_rate       error rate
## -------------------------------------------
## 
## $functions
##           0       1     
## constant  -1.473  -4.295
## Months     0.200   0.143
## NoBought   0.699   2.267
## 
## 
## $confusion
##         predicted
## original    0    1
##        0  907   10
##        1   72   11
## 
## 
## $error_rate
## [1] 0.082
## 
## 
## $scores
##           0           1
## 1  3.315145  -0.8650023
## 2  1.718994  -2.0082531
## 3  1.519476  -2.1511595
## 4  2.916107  -1.1508150
## 5  1.519476  -2.1511595
## 6  1.120868   1.0966496
## ...
## 
## $classification
## [1] 0 0 0 0 0 0
## Levels: 0 1
## ...

Testing on Holdout sample

class <- list(0)
pred0 <- (mahalanaob$functions[1,1] + (paul1$Months*mahalanaob$functions[2,1])+(paul1$NoBought*mahalanaob$functions[3,1]))
pred1 <- (mahalanaob$functions[1,2] + (paul1$Months*mahalanaob$functions[2,2])+(paul1$NoBought*mahalanaob$functions[3,2]))
for(i in 1:1000){
if(pred0[i] > pred1[i]){ class[i] <- 0}else{class[i] <- 1}
}

df <- data.frame(matrix(unlist(class), nrow=1000, byrow=T))

Confusion matrix

table(actual=paul1$Purchase,predict=df$matrix.unlist.class...nrow...1000..byrow...T.)
##       predict
## actual   0   1
##      0 908  11
##      1  67  14

Adding paractical value, changing the cutoff value to (1/7)

From scores getting the probablity value

Prob_0 <- 1
Prob_1 <- 1
Prob_test <- 0
class_train <-0
class_test <- 0

for(i in 1:1000){
Prob_0[i] <- exp(mahalanaob$scores[i,1])/((exp(mahalanaob$scores[i,1]))+(exp(mahalanaob$scores[i,2])))
}

for(i in 1:1000){
  Prob_1[i] <- exp(mahalanaob$scores[i,2])/((exp(mahalanaob$scores[i,1]))+(exp(mahalanaob$scores[i,2])))
}

for(i in 1:1000){
if (Prob_1[i] > (1/7)){class_train[i] <- 1} else {class_train[i] <- 0}
}

table(actual=paul$Purchase,predict=class_train)
##       predict
## actual   0   1
##      0 791 126
##      1  49  34

Probablity for hold out sample

for(i in 1:1000){
  Prob_test[i] <- exp(pred1[i])/((exp(pred1[i]))+(exp(pred0[i])))
}
for(i in 1:1000){
  if (Prob_test[i] > (1/7)){class_test[i] <- 1} else {class_test[i] <- 0}
}

table(actual=paul1$Purchase,predict=class_test)
##       predict
## actual   0   1
##      0 805 114
##      1  39  42