# Load packages for analysis and this section will have all the required libraries mentioned for better clarity
library('ggplot2') # visualization
## Warning: package 'ggplot2' was built under R version 3.4.1
library('ggthemes') # visualization
## Warning: package 'ggthemes' was built under R version 3.4.1
library('scales') # visualization
## Warning: package 'scales' was built under R version 3.4.1
library('dplyr') # data manipulation
## Warning: package 'dplyr' was built under R version 3.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library('mice') # imputation
## Warning: package 'mice' was built under R version 3.4.2
## Loading required package: lattice
library('randomForest') # classification algorithm
## Warning: package 'randomForest' was built under R version 3.4.1
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library('rpart') # for decision tree
## Warning: package 'rpart' was built under R version 3.4.3
library('ROCR')
## Warning: package 'ROCR' was built under R version 3.4.1
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.4.1
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library('ROCR')
library('randomForest')
library('corrr')
## Warning: package 'corrr' was built under R version 3.4.1
library('corrplot')
## Warning: package 'corrplot' was built under R version 3.4.2
## corrplot 0.84 loaded
library('glue')
## Warning: package 'glue' was built under R version 3.4.2
##
## Attaching package: 'glue'
## The following object is masked from 'package:dplyr':
##
## collapse
library('caTools')
## Warning: package 'caTools' was built under R version 3.4.1
library('data.table')
## Warning: package 'data.table' was built under R version 3.4.2
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
require("GGally")
## Loading required package: GGally
## Warning: package 'GGally' was built under R version 3.4.3
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
require("geosphere")
## Loading required package: geosphere
## Warning: package 'geosphere' was built under R version 3.4.2
require("gmapsdistance")
## Loading required package: gmapsdistance
## Warning: package 'gmapsdistance' was built under R version 3.4.2
require("tidyr")
## Loading required package: tidyr
## Warning: package 'tidyr' was built under R version 3.4.2
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:mice':
##
## complete
library('corrplot')
#source("distance.R")
library('car')
## Warning: package 'car' was built under R version 3.4.2
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library('caret')
## Warning: package 'caret' was built under R version 3.4.3
library('gclus')
## Warning: package 'gclus' was built under R version 3.4.1
## Loading required package: cluster
## Warning: package 'cluster' was built under R version 3.4.2
library('visdat')
## Warning: package 'visdat' was built under R version 3.4.1
library('psych')
## Warning: package 'psych' was built under R version 3.4.2
##
## Attaching package: 'psych'
## The following object is masked from 'package:car':
##
## logit
## The following object is masked from 'package:randomForest':
##
## outlier
## The following objects are masked from 'package:scales':
##
## alpha, rescale
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library('leaflet')
## Warning: package 'leaflet' was built under R version 3.4.1
library('leaflet.extras')
## Warning: package 'leaflet.extras' was built under R version 3.4.1
# library("PerformanceAnalytics")
library('GPArotation')
## Warning: package 'GPArotation' was built under R version 3.4.1
library('MVN')
## Warning: package 'MVN' was built under R version 3.4.2
## sROC 0.1-2 loaded
##
## Attaching package: 'MVN'
## The following object is masked from 'package:psych':
##
## mardia
library('psych')
library('MASS')
## Warning: package 'MASS' was built under R version 3.4.3
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library('psy')
## Warning: package 'psy' was built under R version 3.4.1
##
## Attaching package: 'psy'
## The following object is masked from 'package:psych':
##
## wkappa
library('corpcor')
## Warning: package 'corpcor' was built under R version 3.4.1
library('fastmatch')
## Warning: package 'fastmatch' was built under R version 3.4.1
##
## Attaching package: 'fastmatch'
## The following object is masked from 'package:dplyr':
##
## coalesce
library('plyr')
## Warning: package 'plyr' was built under R version 3.4.1
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
library('car')
library('ggcorrplot')
## Warning: package 'ggcorrplot' was built under R version 3.4.2
library('cluster')
library('caTools')
library('rpart')
library('rpart.plot')
## Warning: package 'rpart.plot' was built under R version 3.4.3
library('rattle')
## Warning: package 'rattle' was built under R version 3.4.2
## Rattle: A free graphical interface for data science with R.
## Version 5.1.0 Copyright (c) 2006-2017 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
##
## Attaching package: 'rattle'
## The following object is masked from 'package:randomForest':
##
## importance
library('RColorBrewer')
## Warning: package 'RColorBrewer' was built under R version 3.4.1
library('data.table')
library('ROCR')
library('maptree')
## Warning: package 'maptree' was built under R version 3.4.2
library('tree')
## Warning: package 'tree' was built under R version 3.4.3
library('dummies') # for converting categorical into dummy one
## Warning: package 'dummies' was built under R version 3.4.1
## dummies-1.5.6 provided by Decision Patterns
library('caret')
library('pscl') ## for McFadden R2
## Warning: package 'pscl' was built under R version 3.4.3
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
library('randomForest')
library('StatMeasures')
## Warning: package 'StatMeasures' was built under R version 3.4.3
library('sqldf')
## Warning: package 'sqldf' was built under R version 3.4.3
## Loading required package: gsubfn
## Warning: package 'gsubfn' was built under R version 3.4.1
## Loading required package: proto
## Warning: package 'proto' was built under R version 3.4.1
## Loading required package: RSQLite
## Warning: package 'RSQLite' was built under R version 3.4.1
library('purrr')
## Warning: package 'purrr' was built under R version 3.4.3
##
## Attaching package: 'purrr'
## The following object is masked from 'package:plyr':
##
## compact
## The following object is masked from 'package:caret':
##
## lift
## The following object is masked from 'package:car':
##
## some
## The following object is masked from 'package:data.table':
##
## transpose
## The following object is masked from 'package:scales':
##
## discard
library('tidyr')
library('ggplot2')
library('gains')
## Warning: package 'gains' was built under R version 3.4.1
library('car')
library('MASS')
library('DiscriMiner')
## Warning: package 'DiscriMiner' was built under R version 3.4.3
library('klaR')
## Warning: package 'klaR' was built under R version 3.4.3
library('caret')
Load the data and do the basis analysis
paul_book_training_data <- read.csv('PaulBooks1.csv')
paul_book_testing_data <- read.csv('PaulBooks2.csv')
table(paul_book_training_data$Purchase)
##
## 0 1
## 917 83
summary(paul_book_training_data)
## ID Months NoBought Purchase
## Min. :1001 Min. : 1.00 Min. :0.000 Min. :0.000
## 1st Qu.:1251 1st Qu.: 7.00 1st Qu.:0.000 1st Qu.:0.000
## Median :1500 Median :12.00 Median :0.000 Median :0.000
## Mean :1500 Mean :12.46 Mean :0.389 Mean :0.083
## 3rd Qu.:1750 3rd Qu.:15.00 3rd Qu.:1.000 3rd Qu.:0.000
## Max. :2000 Max. :35.00 Max. :5.000 Max. :1.000
hist(paul_book_training_data$Months)
# scatterplot(paul_book_training_data$Months, paul_book_training_data$Purchase , data=paul_book_training_data, xlab="Months", ylab="Purchased", groups=paul_book_training_data$NoBought, by.groups=TRUE)
hist(paul_book_training_data$NoBought)
paul_book_test_data <- read.csv('PaulBooks2.csv')
table(paul_book_test_data$Purchase)
##
## 0 1
## 919 81
boxplot(paul_book_training_data$NoBought ~ paul_book_training_data$Purchase, horizontal=TRUE)
boxplot(paul_book_training_data$Months ~ paul_book_training_data$Purchase, horizontal = TRUE)
Model developments with MASS Package
paul_book_lda_model <- lda(paul_book_training_data$Purchase ~ paul_book_training_data$Months +paul_book_training_data$NoBought, data = paul_book_training_data) ## using MASS package
paul_book_lda_model
## Call:
## lda(paul_book_training_data$Purchase ~ paul_book_training_data$Months +
## paul_book_training_data$NoBought, data = paul_book_training_data)
##
## Prior probabilities of groups:
## 0 1
## 0.917 0.083
##
## Group means:
## paul_book_training_data$Months paul_book_training_data$NoBought
## 0 12.731734 0.3336968
## 1 9.409639 1.0000000
##
## Coefficients of linear discriminants:
## LD1
## paul_book_training_data$Months -0.05098078
## paul_book_training_data$NoBought 1.41242601
Model developments with DDiscriminer package Observations are: –0.08558 is the cutoff score came from discriminant coefficient and weighted average –Fisher Discriminant corelation shows that fisher discriminant function is corelated 92.07% with NoBought and 39.09% correlated with Income –The power output shows that discriminant power of explanatory variables. “NoBought” having very low p-value compare with Months and NoBought is more powerful variable to explain. Therefor we conclude from here that these two variables are important predictors to make group separation. –
myIndVariable <- paul_book_training_data[,2:3]
myDepVariable <- paul_book_training_data[,4]
paul_book_lda_model_discri <- desDA(myIndVariable,myDepVariable)
paul_book_lda_model_discri
##
## Descriptive Discriminant Analysis
## ---------------------------------
## $power discriminant power
## $values table of eigenvalues
## $discrivar discriminant variables
## $discor correlations
## $scores discriminant scores
## ---------------------------------
##
## $power
## cor_ratio wilks_lamb F_statistic p_values
## Months 0.0131350 0.9868650 13.2832222 0.0002816
## NoBought 0.0728742 0.9271258 78.4450638 0.0000000
##
##
## $values
## value proportion accumulated
## DF1 0.0868 100.0000 100.0000
##
##
## $discrivar
## DF1
## constant 0.08558
## Months -0.05098
## NoBought 1.41243
##
##
## $discor
## DF1
## Months -0.3909
## NoBought 0.9207
##
##
## $scores
## z1
## 1 -1.1380
## 2 -0.7301
## 3 -0.6791
## 4 -1.0360
## 5 -0.6791
## 6 2.6046
## ...
Now lets understand the cor_ratio and validate this value.. Observations are: First Anova with NoBought shows - Total Sum of Square:76.11 So contribution made by NoBought towards total sum of square is 33.8/463.7 = 0.07289 Second Anova with Months shows - Total Sum of Square: 76.11 So contribution made by Months towards total sum of square is 840/63950 = .0131
These two values are exactly matching with discri miner desc DA power output..This says that NoBought is relatively approx 7 times more important than Months in clasifying ans separating groups
myAnovamktg_nobought <- aov( paul_book_training_data$NoBought ~ paul_book_training_data$Purchase, data = paul_book_training_data)
summary(myAnovamktg_nobought)
## Df Sum Sq Mean Sq F value Pr(>F)
## paul_book_training_data$Purchase 1 33.8 33.79 78.44 <2e-16 ***
## Residuals 998 429.9 0.43
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
myAnovamktg_Months <- aov(paul_book_training_data$Months ~ paul_book_training_data$Purchase, data = paul_book_training_data)
summary(myAnovamktg_Months)
## Df Sum Sq Mean Sq F value Pr(>F)
## paul_book_training_data$Purchase 1 840 840.0 13.28 0.000282 ***
## Residuals 998 63110 63.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Our data is biassed one and hence we can’t take straightaway cutroff point. We need to add a correction to that. So we will use mahalanobis method which will give equation for each of the class of the response variable which is 0 and 1 for Purchase.
Key Observations are Accuracy is 100 -8.2= 91.8% Two equations and sets of discriminant coefficients are given Confusion Matrix shows Accuracy is 91.8% and Misclassification Rate is 8.2%
myPaulBookMahalanobis <- linDA(myIndVariable,myDepVariable)
myPaulBookMahalanobis
##
## Linear Discriminant Analysis
## -------------------------------------------
## $functions discrimination functions
## $confusion confusion matrix
## $scores discriminant scores
## $classification assigned class
## $error_rate error rate
## -------------------------------------------
##
## $functions
## 0 1
## constant -1.473 -4.295
## Months 0.200 0.143
## NoBought 0.699 2.267
##
##
## $confusion
## predicted
## original 0 1
## 0 907 10
## 1 72 11
##
##
## $error_rate
## [1] 0.082
##
##
## $scores
## 0 1
## 1 3.315145 -0.8650023
## 2 1.718994 -2.0082531
## 3 1.519476 -2.1511595
## 4 2.916107 -1.1508150
## 5 1.519476 -2.1511595
## 6 1.120868 1.0966496
## ...
##
## $classification
## [1] 0 0 0 0 0 0
## Levels: 0 1
## ...
paul_book_training_data$MahalaScoreClassZero <- myPaulBookMahalanobis$scores[1]
paul_book_training_data$MahalaScoreClassOne <- myPaulBookMahalanobis$scores[2]
paul_book_training_data$PredictedClass <- ifelse(paul_book_training_data$MahalaScoreClassZero > paul_book_training_data$MahalaScoreClassOne ,0,1 )
# plot(wine.lda.values$x[,1],wine.lda.values$x[,2]) # make a scatterplot
# text(wine.lda.values$x[,1],wine.lda.values$x[,2],Type,cex=0.7,pos=4,col="red") # add labels
Now lets test for testing on holdout sample
myLDA <- lda(paul_book_training_data$Purchase ~ paul_book_training_data$Months+paul_book_training_data$NoBought, data=paul_book_training_data)
mylinDAHoldOut <- predict(paul_book_lda_model, newdata=paul_book_testing_data, type='response')
paul_book_testing_data$PredictedPurchase <- mylinDAHoldOut$class
mycamaigntVIable <- table(actualclass=paul_book_testing_data$Purchase, predictedclass=paul_book_testing_data$PredictedPurchase)
mycampaignconfusionVImatrix <- confusionMatrix(mycamaigntVIable, cutoff = 0.5)
print(mycampaignconfusionVImatrix)
## Confusion Matrix and Statistics
##
## predictedclass
## actualclass 0 1
## 0 899 20
## 1 80 1
##
## Accuracy : 0.9
## 95% CI : (0.8797, 0.9179)
## No Information Rate : 0.979
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.0142
## Mcnemar's Test P-Value : 3.635e-09
##
## Sensitivity : 0.91828
## Specificity : 0.04762
## Pos Pred Value : 0.97824
## Neg Pred Value : 0.01235
## Prevalence : 0.97900
## Detection Rate : 0.89900
## Detection Prevalence : 0.91900
## Balanced Accuracy : 0.48295
##
## 'Positive' Class : 0
##