book_company_marketing_response

# Load packages for analysis and this section will have all the required libraries mentioned for better clarity
library('ggplot2') # visualization

## Warning: package 'ggplot2' was built under R version 3.4.1

library('ggthemes') # visualization

## Warning: package 'ggthemes' was built under R version 3.4.1

library('scales') # visualization

## Warning: package 'scales' was built under R version 3.4.1

library('dplyr') # data manipulation

## Warning: package 'dplyr' was built under R version 3.4.2

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library('mice') # imputation

## Warning: package 'mice' was built under R version 3.4.2

## Loading required package: lattice

library('randomForest') # classification algorithm

## Warning: package 'randomForest' was built under R version 3.4.1

## randomForest 4.6-12

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

library('rpart') # for decision tree

## Warning: package 'rpart' was built under R version 3.4.3

library('ROCR')

## Warning: package 'ROCR' was built under R version 3.4.1

## Loading required package: gplots

## Warning: package 'gplots' was built under R version 3.4.1

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

library('ROCR')
library('randomForest')
library('corrr')

## Warning: package 'corrr' was built under R version 3.4.1

library('corrplot')

## Warning: package 'corrplot' was built under R version 3.4.2

## corrplot 0.84 loaded

library('glue')

## Warning: package 'glue' was built under R version 3.4.2

## 
## Attaching package: 'glue'

## The following object is masked from 'package:dplyr':
## 
##     collapse

library('caTools')

## Warning: package 'caTools' was built under R version 3.4.1

library('data.table')

## Warning: package 'data.table' was built under R version 3.4.2

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

require("GGally")

## Loading required package: GGally

## Warning: package 'GGally' was built under R version 3.4.3

## 
## Attaching package: 'GGally'

## The following object is masked from 'package:dplyr':
## 
##     nasa

require("geosphere")

## Loading required package: geosphere

## Warning: package 'geosphere' was built under R version 3.4.2

require("gmapsdistance")

## Loading required package: gmapsdistance

## Warning: package 'gmapsdistance' was built under R version 3.4.2

require("tidyr")

## Loading required package: tidyr

## Warning: package 'tidyr' was built under R version 3.4.2

## 
## Attaching package: 'tidyr'

## The following object is masked from 'package:mice':
## 
##     complete

library('corrplot')
#source("distance.R")
library('car')

## Warning: package 'car' was built under R version 3.4.2

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

library('caret')

## Warning: package 'caret' was built under R version 3.4.3

library('gclus')

## Warning: package 'gclus' was built under R version 3.4.1

## Loading required package: cluster

## Warning: package 'cluster' was built under R version 3.4.2

library('visdat')

## Warning: package 'visdat' was built under R version 3.4.1

library('psych')

## Warning: package 'psych' was built under R version 3.4.2

## 
## Attaching package: 'psych'

## The following object is masked from 'package:car':
## 
##     logit

## The following object is masked from 'package:randomForest':
## 
##     outlier

## The following objects are masked from 'package:scales':
## 
##     alpha, rescale

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

library('leaflet')

## Warning: package 'leaflet' was built under R version 3.4.1

library('leaflet.extras')

## Warning: package 'leaflet.extras' was built under R version 3.4.1

# library("PerformanceAnalytics")
library('GPArotation')

## Warning: package 'GPArotation' was built under R version 3.4.1

library('MVN')

## Warning: package 'MVN' was built under R version 3.4.2

## sROC 0.1-2 loaded

## 
## Attaching package: 'MVN'

## The following object is masked from 'package:psych':
## 
##     mardia

library('psych')
library('MASS')

## Warning: package 'MASS' was built under R version 3.4.3

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

library('psy')

## Warning: package 'psy' was built under R version 3.4.1

## 
## Attaching package: 'psy'

## The following object is masked from 'package:psych':
## 
##     wkappa

library('corpcor')

## Warning: package 'corpcor' was built under R version 3.4.1

library('fastmatch')

## Warning: package 'fastmatch' was built under R version 3.4.1

## 
## Attaching package: 'fastmatch'

## The following object is masked from 'package:dplyr':
## 
##     coalesce

library('plyr')

## Warning: package 'plyr' was built under R version 3.4.1

## -------------------------------------------------------------------------

## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)

## -------------------------------------------------------------------------

## 
## Attaching package: 'plyr'

## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

library('car')
library('ggcorrplot')

## Warning: package 'ggcorrplot' was built under R version 3.4.2

library('cluster')
library('caTools')
library('rpart')
library('rpart.plot')

## Warning: package 'rpart.plot' was built under R version 3.4.3

library('rattle')

## Warning: package 'rattle' was built under R version 3.4.2

## Rattle: A free graphical interface for data science with R.
## Version 5.1.0 Copyright (c) 2006-2017 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.

## 
## Attaching package: 'rattle'

## The following object is masked from 'package:randomForest':
## 
##     importance

library('RColorBrewer')

## Warning: package 'RColorBrewer' was built under R version 3.4.1

library('data.table')
library('ROCR')
library('maptree')

## Warning: package 'maptree' was built under R version 3.4.2

library('tree')

## Warning: package 'tree' was built under R version 3.4.3

library('dummies') # for converting categorical into dummy one

## Warning: package 'dummies' was built under R version 3.4.1

## dummies-1.5.6 provided by Decision Patterns

library('caret')
library('pscl') ## for  McFadden R2

## Warning: package 'pscl' was built under R version 3.4.3

## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis

library('randomForest')
library('StatMeasures')

## Warning: package 'StatMeasures' was built under R version 3.4.3

library('sqldf')

## Warning: package 'sqldf' was built under R version 3.4.3

## Loading required package: gsubfn

## Warning: package 'gsubfn' was built under R version 3.4.1

## Loading required package: proto

## Warning: package 'proto' was built under R version 3.4.1

## Loading required package: RSQLite

## Warning: package 'RSQLite' was built under R version 3.4.1

library('purrr')

## Warning: package 'purrr' was built under R version 3.4.3

## 
## Attaching package: 'purrr'

## The following object is masked from 'package:plyr':
## 
##     compact

## The following object is masked from 'package:caret':
## 
##     lift

## The following object is masked from 'package:car':
## 
##     some

## The following object is masked from 'package:data.table':
## 
##     transpose

## The following object is masked from 'package:scales':
## 
##     discard

library('tidyr')
library('ggplot2')
library('gains')

## Warning: package 'gains' was built under R version 3.4.1

library('car')
library('MASS')
library('DiscriMiner')

## Warning: package 'DiscriMiner' was built under R version 3.4.3

library('klaR')

## Warning: package 'klaR' was built under R version 3.4.3

library('caret')

Including Plots

Load the data and do the basis analysis

paul_book_training_data <- read.csv('PaulBooks1.csv')
paul_book_testing_data <- read.csv('PaulBooks2.csv')

table(paul_book_training_data$Purchase)

## 
##   0   1 
## 917  83

summary(paul_book_training_data)

##        ID           Months         NoBought        Purchase    
##  Min.   :1001   Min.   : 1.00   Min.   :0.000   Min.   :0.000  
##  1st Qu.:1251   1st Qu.: 7.00   1st Qu.:0.000   1st Qu.:0.000  
##  Median :1500   Median :12.00   Median :0.000   Median :0.000  
##  Mean   :1500   Mean   :12.46   Mean   :0.389   Mean   :0.083  
##  3rd Qu.:1750   3rd Qu.:15.00   3rd Qu.:1.000   3rd Qu.:0.000  
##  Max.   :2000   Max.   :35.00   Max.   :5.000   Max.   :1.000

hist(paul_book_training_data$Months)

# scatterplot(paul_book_training_data$Months, paul_book_training_data$Purchase , data=paul_book_training_data, xlab="Months", ylab="Purchased", groups=paul_book_training_data$NoBought, by.groups=TRUE)

hist(paul_book_training_data$NoBought)

paul_book_test_data <- read.csv('PaulBooks2.csv')
table(paul_book_test_data$Purchase)

## 
##   0   1 
## 919  81

boxplot(paul_book_training_data$NoBought ~ paul_book_training_data$Purchase, horizontal=TRUE)

boxplot(paul_book_training_data$Months ~ paul_book_training_data$Purchase, horizontal = TRUE)

Model developments with MASS Package

paul_book_lda_model <- lda(paul_book_training_data$Purchase ~ paul_book_training_data$Months +paul_book_training_data$NoBought, data = paul_book_training_data)  ## using MASS package

paul_book_lda_model

## Call:
## lda(paul_book_training_data$Purchase ~ paul_book_training_data$Months + 
##     paul_book_training_data$NoBought, data = paul_book_training_data)
## 
## Prior probabilities of groups:
##     0     1 
## 0.917 0.083 
## 
## Group means:
##   paul_book_training_data$Months paul_book_training_data$NoBought
## 0                      12.731734                        0.3336968
## 1                       9.409639                        1.0000000
## 
## Coefficients of linear discriminants:
##                                          LD1
## paul_book_training_data$Months   -0.05098078
## paul_book_training_data$NoBought  1.41242601

Model developments with DDiscriminer package Observations are: –0.08558 is the cutoff score came from discriminant coefficient and weighted average –Fisher Discriminant corelation shows that fisher discriminant function is corelated 92.07% with NoBought and 39.09% correlated with Income –The power output shows that discriminant power of explanatory variables. “NoBought” having very low p-value compare with Months and NoBought is more powerful variable to explain. Therefor we conclude from here that these two variables are important predictors to make group separation. –

myIndVariable <- paul_book_training_data[,2:3]
myDepVariable <- paul_book_training_data[,4]
paul_book_lda_model_discri <- desDA(myIndVariable,myDepVariable)

paul_book_lda_model_discri

## 
## Descriptive Discriminant Analysis
## ---------------------------------
## $power      discriminant power
## $values     table of eigenvalues
## $discrivar  discriminant variables
## $discor     correlations
## $scores     discriminant scores
## ---------------------------------
## 
## $power
##           cor_ratio   wilks_lamb  F_statistic  p_values  
## Months     0.0131350   0.9868650  13.2832222    0.0002816
## NoBought   0.0728742   0.9271258  78.4450638    0.0000000
## 
## 
## $values
##      value     proportion  accumulated
## DF1    0.0868  100.0000    100.0000   
## 
## 
## $discrivar
##                DF1
## constant   0.08558
## Months    -0.05098
## NoBought   1.41243
## 
## 
## $discor
##               DF1
## Months    -0.3909
## NoBought   0.9207
## 
## 
## $scores
##         z1
## 1  -1.1380
## 2  -0.7301
## 3  -0.6791
## 4  -1.0360
## 5  -0.6791
## 6   2.6046
## ...

Now lets understand the cor_ratio and validate this value.. Observations are: First Anova with NoBought shows - Total Sum of Square:76.11 So contribution made by NoBought towards total sum of square is 33.8/463.7 = 0.07289 Second Anova with Months shows - Total Sum of Square: 76.11 So contribution made by Months towards total sum of square is 840/63950 = .0131

These two values are exactly matching with discri miner desc DA power output..This says that NoBought is relatively approx 7 times more important than Months in clasifying ans separating groups

myAnovamktg_nobought <- aov( paul_book_training_data$NoBought ~ paul_book_training_data$Purchase, data = paul_book_training_data)
summary(myAnovamktg_nobought)

##                                   Df Sum Sq Mean Sq F value Pr(>F)    
## paul_book_training_data$Purchase   1   33.8   33.79   78.44 <2e-16 ***
## Residuals                        998  429.9    0.43                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

myAnovamktg_Months <- aov(paul_book_training_data$Months ~ paul_book_training_data$Purchase, data = paul_book_training_data)
summary(myAnovamktg_Months)

##                                   Df Sum Sq Mean Sq F value   Pr(>F)    
## paul_book_training_data$Purchase   1    840   840.0   13.28 0.000282 ***
## Residuals                        998  63110    63.2                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Our data is biassed one and hence we can’t take straightaway cutroff point. We need to add a correction to that. So we will use mahalanobis method which will give equation for each of the class of the response variable which is 0 and 1 for Purchase.

Key Observations are Accuracy is 100 -8.2= 91.8% Two equations and sets of discriminant coefficients are given Confusion Matrix shows Accuracy is 91.8% and Misclassification Rate is 8.2%

myPaulBookMahalanobis <- linDA(myIndVariable,myDepVariable)
myPaulBookMahalanobis

## 
## Linear Discriminant Analysis
## -------------------------------------------
## $functions        discrimination functions
## $confusion        confusion matrix
## $scores           discriminant scores
## $classification   assigned class
## $error_rate       error rate
## -------------------------------------------
## 
## $functions
##           0       1     
## constant  -1.473  -4.295
## Months     0.200   0.143
## NoBought   0.699   2.267
## 
## 
## $confusion
##         predicted
## original    0    1
##        0  907   10
##        1   72   11
## 
## 
## $error_rate
## [1] 0.082
## 
## 
## $scores
##           0           1
## 1  3.315145  -0.8650023
## 2  1.718994  -2.0082531
## 3  1.519476  -2.1511595
## 4  2.916107  -1.1508150
## 5  1.519476  -2.1511595
## 6  1.120868   1.0966496
## ...
## 
## $classification
## [1] 0 0 0 0 0 0
## Levels: 0 1
## ...

paul_book_training_data$MahalaScoreClassZero <- myPaulBookMahalanobis$scores[1]
paul_book_training_data$MahalaScoreClassOne <- myPaulBookMahalanobis$scores[2]
paul_book_training_data$PredictedClass <- ifelse(paul_book_training_data$MahalaScoreClassZero > paul_book_training_data$MahalaScoreClassOne ,0,1 )


# plot(wine.lda.values$x[,1],wine.lda.values$x[,2]) # make a scatterplot
# text(wine.lda.values$x[,1],wine.lda.values$x[,2],Type,cex=0.7,pos=4,col="red") # add labels

Now lets test for testing on holdout sample

myLDA <- lda(paul_book_training_data$Purchase ~ paul_book_training_data$Months+paul_book_training_data$NoBought, data=paul_book_training_data)

mylinDAHoldOut <- predict(paul_book_lda_model, newdata=paul_book_testing_data, type='response')
paul_book_testing_data$PredictedPurchase <- mylinDAHoldOut$class

mycamaigntVIable <- table(actualclass=paul_book_testing_data$Purchase, predictedclass=paul_book_testing_data$PredictedPurchase)

mycampaignconfusionVImatrix <- confusionMatrix(mycamaigntVIable, cutoff = 0.5)
print(mycampaignconfusionVImatrix)

## Confusion Matrix and Statistics
## 
##            predictedclass
## actualclass   0   1
##           0 899  20
##           1  80   1
##                                           
##                Accuracy : 0.9             
##                  95% CI : (0.8797, 0.9179)
##     No Information Rate : 0.979           
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : -0.0142         
##  Mcnemar's Test P-Value : 3.635e-09       
##                                           
##             Sensitivity : 0.91828         
##             Specificity : 0.04762         
##          Pos Pred Value : 0.97824         
##          Neg Pred Value : 0.01235         
##              Prevalence : 0.97900         
##          Detection Rate : 0.89900         
##    Detection Prevalence : 0.91900         
##       Balanced Accuracy : 0.48295         
##                                           
##        'Positive' Class : 0               
##

book_company_marketing_response_analysis

Amit Kayal

January 20, 2018

Including Plots