library(PerformanceAnalytics)
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(car)
## Loading required package: carData
library(readxl)
library(e1071)
##
## Attaching package: 'e1071'
## The following objects are masked from 'package:PerformanceAnalytics':
##
## kurtosis, skewness
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:xts':
##
## first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble 3.0.3 v purrr 0.3.4
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::first() masks xts::first()
## x dplyr::lag() masks stats::lag()
## x dplyr::last() masks xts::last()
## x purrr::lift() masks caret::lift()
## x dplyr::recode() masks car::recode()
## x purrr::some() masks car::some()
library(broom)
BBBC_Train <- read_excel("BBBC-Train.xlsx")
BBBC_Test <- read_excel("BBBC-Test.xlsx")
#Removing the Observation ID variable
BBBC_Train$Observation <- NULL
BBBC_Test$Observation <- NULL
#Creating a copy of the dataset
BBBC.T1 <- BBBC_Train
summary(BBBC.T1)
## Choice Gender Amount_purchased Frequency
## Min. :0.00 Min. :0.0000 Min. : 15.0 Min. : 2.00
## 1st Qu.:0.00 1st Qu.:0.0000 1st Qu.:126.8 1st Qu.: 6.00
## Median :0.00 Median :1.0000 Median :203.0 Median :12.00
## Mean :0.25 Mean :0.6587 Mean :200.9 Mean :12.31
## 3rd Qu.:0.25 3rd Qu.:1.0000 3rd Qu.:273.0 3rd Qu.:16.00
## Max. :1.00 Max. :1.0000 Max. :474.0 Max. :36.00
## Last_purchase First_purchase P_Child P_Youth
## Min. : 1.000 Min. : 2.00 Min. :0.0000 Min. :0.0000
## 1st Qu.: 1.000 1st Qu.:12.00 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 2.000 Median :18.00 Median :0.0000 Median :0.0000
## Mean : 3.199 Mean :22.58 Mean :0.7394 Mean :0.3375
## 3rd Qu.: 4.000 3rd Qu.:30.00 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :12.000 Max. :96.00 Max. :8.0000 Max. :4.0000
## P_Cook P_DIY P_Art
## Min. :0.00 Min. :0.0000 Min. :0.000
## 1st Qu.:0.00 1st Qu.:0.0000 1st Qu.:0.000
## Median :0.00 Median :0.0000 Median :0.000
## Mean :0.76 Mean :0.3912 Mean :0.425
## 3rd Qu.:1.00 3rd Qu.:1.0000 3rd Qu.:1.000
## Max. :6.00 Max. :4.0000 Max. :5.000
#Converting Choice and Gender to factors
BBBC.T1$Choice <-as.factor(BBBC_Train$Choice)
BBBC.T1$Gender <- as.factor(BBBC_Train$Gender)
BBBC_Test$Choice <- as.factor(BBBC_Test$Choice)
BBBC_Test$Gender <- as.factor(BBBC_Test$Gender)
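A quick sanity check (a sketch, not part of the original analysis): confirm the factor levels line up between the train and test sets before modeling.
stopifnot(identical(levels(BBBC.T1$Choice), levels(BBBC_Test$Choice)),
          identical(levels(BBBC.T1$Gender), levels(BBBC_Test$Gender)))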
#Exploratory Analysis
par(mfrow=c(2,2))
plot(BBBC.T1$Gender,BBBC.T1$Amount_purchased, col=c('Red','green'))
plot(BBBC.T1$Gender,BBBC.T1$P_Child, col=c('Red','green'))
plot(BBBC.T1$Gender,BBBC.T1$P_Youth, col=c('Red','green'))
plot(BBBC.T1$Gender,BBBC.T1$P_Cook, col=c('Red','green'))
plot(BBBC.T1$Gender,BBBC.T1$P_DIY, col=c('Red','green'))
plot(BBBC.T1$Gender,BBBC.T1$P_Art, col=c('Red','green'))
hist(BBBC.T1$P_Child, main = "Number of Children's Books Purchased")
hist(BBBC.T1$P_Youth, main = "Number of Youth Books Purchased")
hist(BBBC.T1$P_Cook, main = "Number of Cookbooks Purchased")
hist(BBBC.T1$P_DIY, main = "Number of DIY Books Purchased")
hist(BBBC.T1$P_Art, main = "Number of Art Books Purchased")
Stats and facts (calculated in Excel): total female = 546, total male = 1054.

| Category | Total purchased by female | Total purchased by male |
|---|---|---|
| P_Child | 437 | 746 |
| P_Youth | 191 | 349 |
| P_Cook | 436 | 780 |
| P_DIY | 227 | 399 |
| P_Art | 234 | 446 |
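The Excel tallies can be reproduced in R; a sketch assuming Gender == 0 codes female and 1 codes male (consistent with the totals above), and dplyr >= 1.0 for across():
BBBC.T1 %>%
  group_by(Gender) %>%
  summarise(n = n(), across(P_Child:P_Art, sum))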
#6. Correlation
chart.Correlation(BBBC_Train, histogram=FALSE, pch=19)
First_purchase and Last_purchase are highly correlated (r = 0.81).
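The pairwise value can be confirmed directly (a sketch):
cor(BBBC_Train$First_purchase, BBBC_Train$Last_purchase)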
#Linear Regression and assumptions
par(mfrow=c(2,2)) # set 2 rows and 2 column plot layout
mod_1 <- lm(Choice ~., data=BBBC_Train) # linear model
plot(mod_1)
summary(mod_1)
##
## Call:
## lm(formula = Choice ~ ., data = BBBC_Train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9603 -0.2462 -0.1161 0.1622 1.0588
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3642284 0.0307411 11.848 < 2e-16 ***
## Gender -0.1309205 0.0200303 -6.536 8.48e-11 ***
## Amount_purchased 0.0002736 0.0001110 2.464 0.0138 *
## Frequency -0.0090868 0.0021791 -4.170 3.21e-05 ***
## Last_purchase 0.0970286 0.0135589 7.156 1.26e-12 ***
## First_purchase -0.0020024 0.0018160 -1.103 0.2704
## P_Child -0.1262584 0.0164011 -7.698 2.41e-14 ***
## P_Youth -0.0963563 0.0201097 -4.792 1.81e-06 ***
## P_Cook -0.1414907 0.0166064 -8.520 < 2e-16 ***
## P_DIY -0.1352313 0.0197873 -6.834 1.17e-11 ***
## P_Art 0.1178494 0.0194427 6.061 1.68e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3788 on 1589 degrees of freedom
## Multiple R-squared: 0.2401, Adjusted R-squared: 0.2353
## F-statistic: 50.2 on 10 and 1589 DF, p-value: < 2.2e-16
#VIF using Linear Model
car::vif(mod_1)
## Gender Amount_purchased Frequency Last_purchase
## 1.005801 1.248066 3.253860 18.770402
## First_purchase P_Child P_Youth P_Cook
## 9.685333 3.360349 1.775022 3.324928
## P_DIY P_Art
## 2.016910 2.273771
#Base logit model with all predictors
glm.fit1 <- glm(Choice~., data =BBBC.T1, family = 'binomial')
#Excluding Last_purchase and First_purchase
glm.fit2 <- glm(Choice~ Gender+Amount_purchased+Frequency+P_Child+P_Youth+P_Cook+P_DIY+P_Art, data = BBBC.T1, family= 'binomial')
#Checking collinearity with the base logit model
car::vif(glm.fit1)
## Gender Amount_purchased Frequency Last_purchase
## 1.023359 1.232172 2.490447 17.706670
## First_purchase P_Child P_Youth P_Cook
## 9.247748 2.992269 1.761546 3.229097
## P_DIY P_Art
## 1.992698 1.938089
Collinearity is high for Last_purchase (VIF = 17.7) and First_purchase (VIF = 9.2), so both are excluded in glm.fit2.
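A sketch that flags the offenders programmatically, using the common VIF > 5 rule of thumb:
v <- car::vif(glm.fit1)
names(v)[v > 5]   # expected: Last_purchase, First_purchase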
#Checking collinearity excluding Last_purchase and First_purchase
car::vif(glm.fit2)
## Gender Amount_purchased Frequency P_Child
## 1.020217 1.213528 1.015899 1.215500
## P_Youth P_Cook P_DIY P_Art
## 1.081019 1.228798 1.179821 1.229491
# Predict the probability (p) from the logistic regression fit run above
probabilities <- predict(glm.fit2, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
head(predicted.classes)
## 1 2 3 4 5 6
## "neg" "pos" "neg" "neg" "pos" "neg"
# Select only numeric predictors
mydata2 <- BBBC.T1 %>%
dplyr::select_if(is.numeric)
predictors <- colnames(mydata2)
# Bind the logit and tidying the data for plot
mydata2 <- mydata2 %>%
mutate(logit = log(probabilities/(1-probabilities))) %>%
gather(key = "predictors", value = "predictor.value", -logit)
# Create the scatter plots
ggplot(mydata2, aes(logit, predictor.value))+
geom_point(size = 0.5, alpha = 0.5) +
geom_smooth(method = "loess") +
theme_bw() +
facet_wrap(~predictors, scales = "free_y")
plot(glm.fit2, which=4, id.n=3)
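To list the top influential observations from the Cook's distance plot numerically (a sketch):
head(sort(cooks.distance(glm.fit2), decreasing = TRUE), 3)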
#GLM Model with interaction terms
set.seed(123)
train.control <- trainControl(method = "cv", number = 10)
# Train the model
glm.fit3 <- train(Choice~ Gender+Amount_purchased+Frequency+Last_purchase+P_Child+P_Youth+P_Cook+P_DIY+P_Art+
Gender*P_Child+Gender*P_Youth+Gender*P_Cook+Gender*P_DIY+Gender*P_Art,
data = BBBC.T1,
method = "glm",
family = binomial,
trControl = train.control)
summary(glm.fit3)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.49064 -0.67179 -0.43069 -0.01095 2.79158
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.292559 0.221113 -1.323 0.185796
## Gender1 -0.863087 0.194887 -4.429 9.48e-06 ***
## Amount_purchased 0.001876 0.000794 2.362 0.018164 *
## Frequency -0.090221 0.010634 -8.484 < 2e-16 ***
## Last_purchase 0.549378 0.079275 6.930 4.21e-12 ***
## P_Child -0.806151 0.141345 -5.703 1.17e-08 ***
## P_Youth -0.752065 0.193530 -3.886 0.000102 ***
## P_Cook -0.876094 0.139047 -6.301 2.96e-10 ***
## P_DIY -0.704329 0.179889 -3.915 9.03e-05 ***
## P_Art 0.492890 0.174077 2.831 0.004634 **
## `Gender1:P_Child` -0.016242 0.146983 -0.111 0.912010
## `Gender1:P_Youth` 0.189251 0.228454 0.828 0.407445
## `Gender1:P_Cook` -0.085407 0.146986 -0.581 0.561204
## `Gender1:P_DIY` -0.398897 0.223753 -1.783 0.074626 .
## `Gender1:P_Art` 0.286821 0.206231 1.391 0.164292
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1799.5 on 1599 degrees of freedom
## Residual deviance: 1388.2 on 1585 degrees of freedom
## AIC: 1418.2
##
## Number of Fisher Scoring iterations: 5
None of the interaction terms is statistically significant at the 5% level.
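The interaction block can also be tested jointly; a sketch using a likelihood-ratio test on plain glm() refits of the two nested models (caret's cross-validation is set aside here):
glm.base <- glm(Choice ~ Gender + Amount_purchased + Frequency + Last_purchase +
                  P_Child + P_Youth + P_Cook + P_DIY + P_Art,
                data = BBBC.T1, family = binomial)
glm.inter <- update(glm.base, . ~ . + Gender:(P_Child + P_Youth + P_Cook + P_DIY + P_Art))
anova(glm.base, glm.inter, test = "Chisq")   # 5-df test of all Gender interactions at once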
pred.glm <- predict(glm.fit3, BBBC_Test)
confusionMatrix(pred.glm, BBBC_Test$Choice)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1971 127
## 1 125 77
##
## Accuracy : 0.8904
## 95% CI : (0.877, 0.9029)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 0.9997
##
## Kappa : 0.3192
##
## Mcnemar's Test P-Value : 0.9498
##
## Sensitivity : 0.9404
## Specificity : 0.3775
## Pos Pred Value : 0.9395
## Neg Pred Value : 0.3812
## Prevalence : 0.9113
## Detection Rate : 0.8570
## Detection Prevalence : 0.9122
## Balanced Accuracy : 0.6589
##
## 'Positive' Class : 0
##
#GLM model excluding First_purchase
set.seed(123)
train.control <- trainControl(method = "cv", number = 10)
# Train the model
glm.fit4 <- train(Choice~ Gender+Amount_purchased+Frequency+Last_purchase+P_Child+P_Youth+P_Cook+P_DIY+P_Art, data = BBBC.T1, method = "glm",family = binomial, trControl = train.control)
pred.glm <- predict(glm.fit4, BBBC_Test)
confusionMatrix(pred.glm, BBBC_Test$Choice)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1973 127
## 1 123 77
##
## Accuracy : 0.8913
## 95% CI : (0.8779, 0.9037)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 0.9995
##
## Kappa : 0.3216
##
## Mcnemar's Test P-Value : 0.8495
##
## Sensitivity : 0.9413
## Specificity : 0.3775
## Pos Pred Value : 0.9395
## Neg Pred Value : 0.3850
## Prevalence : 0.9113
## Detection Rate : 0.8578
## Detection Prevalence : 0.9130
## Balanced Accuracy : 0.6594
##
## 'Positive' Class : 0
##
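confusionMatrix() treats the first factor level (0, non-buyers) as the positive class by default; a buyer-focused view is available explicitly (a sketch):
confusionMatrix(pred.glm, BBBC_Test$Choice, positive = "1")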
#SVM Models
set.seed(123)
model1 <- Choice ~ Gender + Amount_purchased + Frequency + Last_purchase +
  P_Child + P_Youth + P_Cook + P_DIY + P_Art   # Gender is already a factor in BBBC.T1
tune <- tune.svm(model1, data = BBBC.T1, gamma =seq(.01, 0.1, by = .01), cost = seq(0.1,1, by = 0.1))
tune$best.parameters
## gamma cost
## 90 0.1 0.9
tune$performances
## gamma cost error dispersion
## 1 0.01 0.1 0.250000 0.03750000
## 2 0.02 0.1 0.250000 0.03750000
## 3 0.03 0.1 0.245000 0.03343734
## 4 0.04 0.1 0.240000 0.03513367
## 5 0.05 0.1 0.236875 0.03649700
## 6 0.06 0.1 0.235000 0.04084609
## 7 0.07 0.1 0.235000 0.04052605
## 8 0.08 0.1 0.233750 0.04105806
## 9 0.09 0.1 0.234375 0.04169270
## 10 0.10 0.1 0.235625 0.04187966
## 11 0.01 0.2 0.248125 0.03461720
## 12 0.02 0.2 0.223125 0.03842042
## 13 0.03 0.2 0.220000 0.03793727
## 14 0.04 0.2 0.216250 0.03622844
## 15 0.05 0.2 0.216875 0.03609041
## 16 0.06 0.2 0.218125 0.03755205
## 17 0.07 0.2 0.220625 0.03819381
## 18 0.08 0.2 0.220000 0.03712778
## 19 0.09 0.2 0.220000 0.03701070
## 20 0.10 0.2 0.220625 0.03964359
## 21 0.01 0.3 0.229375 0.03715700
## 22 0.02 0.3 0.213125 0.03625838
## 23 0.03 0.3 0.215000 0.03574602
## 24 0.04 0.3 0.217500 0.03382451
## 25 0.05 0.3 0.218750 0.03333333
## 26 0.06 0.3 0.218750 0.03535534
## 27 0.07 0.3 0.220000 0.03593976
## 28 0.08 0.3 0.221875 0.03611445
## 29 0.09 0.3 0.221250 0.03670453
## 30 0.10 0.3 0.220625 0.03762133
## 31 0.01 0.4 0.220000 0.03861761
## 32 0.02 0.4 0.215000 0.03763863
## 33 0.03 0.4 0.216875 0.03633013
## 34 0.04 0.4 0.216250 0.03574602
## 35 0.05 0.4 0.217500 0.03533078
## 36 0.06 0.4 0.217500 0.03606033
## 37 0.07 0.4 0.218750 0.03510896
## 38 0.08 0.4 0.218125 0.03637789
## 39 0.09 0.4 0.219375 0.03685204
## 40 0.10 0.4 0.220625 0.03633013
## 41 0.01 0.5 0.213750 0.03939649
## 42 0.02 0.5 0.215625 0.03764440
## 43 0.03 0.5 0.215000 0.03634805
## 44 0.04 0.5 0.215625 0.03623443
## 45 0.05 0.5 0.215625 0.03489195
## 46 0.06 0.5 0.216875 0.03333984
## 47 0.07 0.5 0.216250 0.03322900
## 48 0.08 0.5 0.217500 0.03382451
## 49 0.09 0.5 0.218125 0.03403557
## 50 0.10 0.5 0.220625 0.03680490
## 51 0.01 0.6 0.212500 0.03761556
## 52 0.02 0.6 0.215000 0.03705758
## 53 0.03 0.6 0.215000 0.03821086
## 54 0.04 0.6 0.216250 0.03425801
## 55 0.05 0.6 0.216875 0.03359920
## 56 0.06 0.6 0.216875 0.03385657
## 57 0.07 0.6 0.217500 0.03224795
## 58 0.08 0.6 0.217500 0.03197764
## 59 0.09 0.6 0.218125 0.03313090
## 60 0.10 0.6 0.218125 0.03685204
## 61 0.01 0.7 0.213125 0.03755205
## 62 0.02 0.7 0.214375 0.03864569
## 63 0.03 0.7 0.215625 0.03922535
## 64 0.04 0.7 0.216875 0.03536148
## 65 0.05 0.7 0.217500 0.03408018
## 66 0.06 0.7 0.217500 0.03408018
## 67 0.07 0.7 0.216875 0.03333984
## 68 0.08 0.7 0.217500 0.03073181
## 69 0.09 0.7 0.216250 0.03413108
## 70 0.10 0.7 0.216250 0.03574602
## 71 0.01 0.8 0.214375 0.03819381
## 72 0.02 0.8 0.213750 0.03872983
## 73 0.03 0.8 0.213750 0.03917553
## 74 0.04 0.8 0.216875 0.03536148
## 75 0.05 0.8 0.216875 0.03461720
## 76 0.06 0.8 0.216875 0.03411200
## 77 0.07 0.8 0.217500 0.03197764
## 78 0.08 0.8 0.217500 0.03278190
## 79 0.09 0.8 0.215000 0.03335936
## 80 0.10 0.8 0.210000 0.03361857
## 81 0.01 0.9 0.213125 0.03957785
## 82 0.02 0.9 0.214375 0.03975293
## 83 0.03 0.9 0.215000 0.03786856
## 84 0.04 0.9 0.216875 0.03536148
## 85 0.05 0.9 0.216250 0.03451047
## 86 0.06 0.9 0.216875 0.03333984
## 87 0.07 0.9 0.216875 0.03385657
## 88 0.08 0.9 0.215625 0.03362503
## 89 0.09 0.9 0.211250 0.03304563
## 90 0.10 0.9 0.208750 0.03216710
## 91 0.01 1.0 0.213750 0.03961621
## 92 0.02 1.0 0.213750 0.03917553
## 93 0.03 1.0 0.215625 0.03647321
## 94 0.04 1.0 0.216875 0.03536148
## 95 0.05 1.0 0.215625 0.03451675
## 96 0.06 1.0 0.218125 0.03273552
## 97 0.07 1.0 0.215625 0.03451675
## 98 0.08 1.0 0.213750 0.03557562
## 99 0.09 1.0 0.209375 0.03513985
## 100 0.10 1.0 0.211250 0.03030516
The lowest cross-validated error (0.20875) occurs at gamma = 0.10 and cost = 0.9, matching tune$best.parameters above.
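The optimum can also be read off programmatically. Note that tune.svm() tunes with svm()'s default radial kernel, so the selected gamma and cost are reused with a different (polynomial) kernel in the fit below.
tune$best.performance   # lowest cross-validated error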
mysvm <- svm(formula = model1, data = BBBC.T1, kernel = "polynomial",
             gamma = tune$best.parameters$gamma, cost = tune$best.parameters$cost)
summary(mysvm)
##
## Call:
## svm(formula = model1, data = BBBC.T1, kernel = "polynomial", gamma = tune$best.parameters$gamma,
## cost = tune$best.parameters$cost)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: polynomial
## cost: 0.9
## degree: 3
## coef.0: 0
##
## Number of Support Vectors: 752
##
## ( 356 396 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
svmpredict <- predict(mysvm,BBBC_Test, type = "response")
# Cross tabs or confusion matrix
table(pred=svmpredict,true=BBBC_Test$Choice)
## true
## pred 0 1
## 0 2056 167
## 1 40 37
(2056+37)/(2300)
## [1] 0.91
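The same accuracy computed directly from the predictions (a sketch):
mean(svmpredict == BBBC_Test$Choice)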
BBBC_Test$svmpredict <- predict(mysvm,BBBC_Test, type = "response")
write.csv(BBBC_Test,file="finalModel.csv")
#Model Comparison
| Model | Accuracy |
|---|---|
| glm.fit3 (with interactions) | 0.8904 |
| glm.fit4 | 0.8913 |
| mysvm (polynomial SVM) | 0.9100 |

All three accuracies sit near the test set's no-information rate of 0.9113, so specificity on the buyer class (Choice = 1) is worth weighing alongside raw accuracy when choosing a model.