Splitting Data
bbbc_test <- read_excel("BBBC-Test.xlsx")
bbbc_test <- (subset(bbbc_test, select=-c(Observation)))
bbbc_test$Gender[bbbc_test$Gender==1]="Male";
bbbc_test$Gender[bbbc_test$Gender==0]="Female";
bbbc_train <- read_excel("BBBC-Train.xlsx")
bbbc_train <- (subset(bbbc_train, select=-c(Observation)))
bbbc_train$Gender[bbbc_train$Gender==1]="Male";
bbbc_train$Gender[bbbc_train$Gender==0]="Female";
anyNA(bbbc_train)
## [1] FALSE
anyNA(bbbc_test)
## [1] FALSE
str(bbbc_train)
## tibble [1,600 x 11] (S3: tbl_df/tbl/data.frame)
## $ Choice : num [1:1600] 1 1 1 1 1 1 1 1 1 1 ...
## $ Gender : chr [1:1600] "Male" "Male" "Male" "Male" ...
## $ Amount_purchased: num [1:1600] 113 418 336 180 320 268 198 280 393 138 ...
## $ Frequency : num [1:1600] 8 6 18 16 2 4 2 6 12 10 ...
## $ Last_purchase : num [1:1600] 1 11 6 5 3 1 12 2 11 7 ...
## $ First_purchase : num [1:1600] 8 66 32 42 18 4 62 12 50 38 ...
## $ P_Child : num [1:1600] 0 0 2 2 0 0 2 0 3 2 ...
## $ P_Youth : num [1:1600] 1 2 0 0 0 0 3 2 0 3 ...
## $ P_Cook : num [1:1600] 0 3 1 0 0 0 2 0 3 0 ...
## $ P_DIY : num [1:1600] 0 2 1 1 1 0 1 0 0 0 ...
## $ P_Art : num [1:1600] 0 3 2 1 2 0 2 0 2 1 ...
summary(bbbc_train$Choice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 0.00 0.25 0.25 1.00
table(bbbc_train$Choice)
##
## 0 1
## 1200 400
Exploratory Analysis Note high correlation
#pairs(bbbc_train)
pairs(subset(bbbc_train, select=-c(Gender)))
plot(bbbc_train)
#cor(subset(bbbc_train, select=-c(Gender)))
Multicollinearity (taking out Last Purchased due to high VIF/correlation)
bbbc_logit <- glm(Choice~., data = bbbc_train, family = binomial)
vif(bbbc_logit)
## Gender Amount_purchased Frequency Last_purchase
## 1.023359 1.232172 2.490447 17.706670
## First_purchase P_Child P_Youth P_Cook
## 9.247748 2.992269 1.761546 3.229097
## P_DIY P_Art
## 1.992698 1.938089
bbbc_train = (subset(bbbc_train, select=-c(Last_purchase)))
bbbc_test = (subset(bbbc_test, select=-c(Last_purchase)))
bbbc_logit <- glm(Choice~., data = bbbc_train, family = binomial)
vif(bbbc_logit)
## Gender Amount_purchased Frequency First_purchase
## 1.021977 1.220305 2.173240 6.886806
## P_Child P_Youth P_Cook P_DIY
## 1.904631 1.320305 2.060140 1.462770
## P_Art
## 1.603865
bbbc_train = (subset(bbbc_train, select=-c(First_purchase)))
bbbc_test = (subset(bbbc_test, select=-c(First_purchase)))
bbbc_logit <- glm(Choice~., data = bbbc_train, family = binomial)
vif(bbbc_logit)
## Gender Amount_purchased Frequency P_Child
## 1.020217 1.213528 1.015899 1.215500
## P_Youth P_Cook P_DIY P_Art
## 1.081019 1.228798 1.179821 1.229491
Linear Regression
note that this method gives continuous values and not probabilities and therefore is not suited for classification assume probabilities
bbbc_lm <- lm(Choice~., data = bbbc_train)
summary(bbbc_lm)
##
## Call:
## lm(formula = Choice ~ ., data = bbbc_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9501 -0.2518 -0.1273 0.1509 1.1211
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3731865 0.0302933 12.319 < 2e-16 ***
## GenderMale -0.1263728 0.0203683 -6.204 6.99e-10 ***
## Amount_purchased 0.0003688 0.0001123 3.283 0.00105 **
## Frequency -0.0112345 0.0012344 -9.101 < 2e-16 ***
## P_Child -0.0275983 0.0100284 -2.752 0.00599 **
## P_Youth -0.0014841 0.0159946 -0.093 0.92609
## P_Cook -0.0428346 0.0102155 -4.193 2.90e-05 ***
## P_DIY -0.0384262 0.0153017 -2.511 0.01213 *
## P_Art 0.2183323 0.0140081 15.586 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3856 on 1591 degrees of freedom
## Multiple R-squared: 0.2114, Adjusted R-squared: 0.2075
## F-statistic: 53.32 on 8 and 1591 DF, p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(bbbc_lm)
predict = predict(bbbc_lm, bbbc_test)
predict_choice = ifelse(predict>0.3, 1,0)
plot(bbbc_train$Amount_purchased, bbbc_train$Choice)
abline(lm(bbbc_train$Choice ~ bbbc_train$Amount_purchased))
caret::confusionMatrix(as.factor(predict_choice),as.factor(bbbc_test$Choice), positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1621 71
## 1 475 133
##
## Accuracy : 0.7626
## 95% CI : (0.7447, 0.7799)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2246
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.65196
## Specificity : 0.77338
## Pos Pred Value : 0.21875
## Neg Pred Value : 0.95804
## Prevalence : 0.08870
## Detection Rate : 0.05783
## Detection Prevalence : 0.26435
## Balanced Accuracy : 0.71267
##
## 'Positive' Class : 1
##
make Choice a factor for Logistic Regression and SVM
bbbc_test[["Choice"]] = factor(bbbc_test[["Choice"]])
bbbc_train[["Choice"]] = factor(bbbc_train[["Choice"]])
Logistic Regression with all predictors
bbbc_logit <- glm(Choice~., data = bbbc_train, family = binomial)
#cooks distance plot
dev <-residuals(bbbc_logit, type = "deviance")
pea <-residuals(bbbc_logit, type = "pearson")
devstd<-residuals(bbbc_logit, type = "deviance")/sqrt(1 - hatvalues(bbbc_logit)) # standardized deviance residuals
peastd <-residuals(bbbc_logit, type = "pearson")/sqrt(1 - hatvalues(bbbc_logit)) # standardized pearson residuals
dev.new(width = 1000, height = 1000, unit = "px")
par(mfrow=c(1,2))
plot(devstd[bbbc_logit$model$Choice==1], col = "red",
ylim = c(-3.5,3.5), ylab = "std. deviance residuals", xlab = "ID")
points(devstd[bbbc_logit$model$Choice==0], col = "blue")
plot(peastd[bbbc_logit$model$Choice==1], col = "red",
ylim = c(-3.5,3.5), ylab = "std. Pearson residuals", xlab = "ID")
points(peastd[bbbc_logit$model$Choice==0], col = "blue")
#model selection
#taking out outliers
cooks_sd_cutoff = 3*sd(cooks.distance(bbbc_logit))
out <- which(cooks.distance(bbbc_logit)>cooks_sd_cutoff)
bbbc_logit <- glm(Choice~., data = bbbc_train[-out,], family = binomial)
summary(bbbc_logit)
##
## Call:
## glm(formula = Choice ~ ., family = binomial, data = bbbc_train[-out,
## ])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0699 -0.6284 -0.3964 -0.1077 2.4597
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.047670 0.220341 -0.216 0.828718
## GenderMale -0.880793 0.145243 -6.064 1.33e-09 ***
## Amount_purchased 0.002656 0.000833 3.188 0.001431 **
## Frequency -0.117777 0.012245 -9.619 < 2e-16 ***
## P_Child -0.281328 0.084375 -3.334 0.000855 ***
## P_Youth -0.234805 0.127903 -1.836 0.066385 .
## P_Cook -0.419486 0.085446 -4.909 9.14e-07 ***
## P_DIY -0.319053 0.121319 -2.630 0.008542 **
## P_Art 1.493295 0.110991 13.454 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1696.9 on 1558 degrees of freedom
## Residual deviance: 1263.8 on 1550 degrees of freedom
## AIC: 1281.8
##
## Number of Fisher Scoring iterations: 5
#Maximized cutoff (change predict object)
pred <- prediction(predict(bbbc_logit, bbbc_test, type = "response"),
bbbc_test$Choice)
#Predicted Probability and True Classification
auc <- round(as.numeric(performance(pred, measure = "auc")@y.values),3)
false.rates <-performance(pred, "fpr","fnr")
accuracy <-performance(pred, "acc","err")
plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values),
type="l", lwd=2,
ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values),
type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, padj=-2, col='red')
min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x
abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)
#ROC plot (don't need to change)
perf <- performance(pred, "tpr","fpr")
plot(perf,colorize = T, main = "ROC Curve")
text(0.5,0.5, paste("AUC:", auc))
#Confusion Matrix (change predict object)
predict = predict(bbbc_logit, bbbc_test, type = "response")
predict_choice = ifelse(predict>optimal, 1,0)
caret::confusionMatrix(as.factor(predict_choice),as.factor(bbbc_test$Choice), positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1501 58
## 1 595 146
##
## Accuracy : 0.7161
## 95% CI : (0.6972, 0.7344)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1973
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.71569
## Specificity : 0.71613
## Pos Pred Value : 0.19703
## Neg Pred Value : 0.96280
## Prevalence : 0.08870
## Detection Rate : 0.06348
## Detection Prevalence : 0.32217
## Balanced Accuracy : 0.71591
##
## 'Positive' Class : 1
##
#diagnostics
cooks_distance = cooks.distance(bbbc_logit)
plot(cooks_distance,col="pink", pch=19, cex=1)
text(cooks.distance(bbbc_logit),labels = rownames(bbbc_train))
#hoslem.test(bbbc_logit$Choice, fitted(bbbc_logit), g=10)
exp(coef(bbbc_logit))
## (Intercept) GenderMale Amount_purchased Frequency
## 0.9534484 0.4144543 1.0026594 0.8888940
## P_Child P_Youth P_Cook P_DIY
## 0.7547806 0.7907248 0.6573845 0.7268366
## P_Art
## 4.4517406
LR = Anova(bbbc_logit, test = "LR")
Wald = Anova(bbbc_logit, test = "Wald")
score = anova(bbbc_logit, test = "Rao")
data.frame(LR = LR[,1], Score = score$Rao[2], Wald = Wald$Chisq)
## LR Score Wald
## 1 37.102971 32.5259 36.775120
## 2 10.303922 32.5259 10.165776
## 3 122.455019 32.5259 92.521765
## 4 11.893374 32.5259 11.117279
## 5 3.472190 32.5259 3.370205
## 6 26.822816 32.5259 24.101850
## 7 7.218459 32.5259 6.916238
## 8 233.897272 32.5259 181.015591
data.frame(p.LR = LR$`Pr(>Chisq)`, p.Score = score$`Pr(>Chi)`[2], p.Wald = Wald$`Pr(>Chisq)`)
## p.LR p.Score p.Wald
## 1 1.120529e-09 1.176148e-08 1.325714e-09
## 2 1.327478e-03 1.176148e-08 1.430717e-03
## 3 1.835293e-28 1.176148e-08 6.658840e-22
## 4 5.633399e-04 1.176148e-08 8.552718e-04
## 5 6.240862e-02 1.176148e-08 6.638548e-02
## 6 2.229890e-07 1.176148e-08 9.137245e-07
## 7 7.215762e-03 1.176148e-08 8.541649e-03
## 8 8.422534e-53 1.176148e-08 2.908566e-41
Logistic Regression taking out P_Youth (BEST LOGIT MODEL)
bbbc_logit <- glm(Choice~.-P_Youth, data = bbbc_train, family = binomial)
#cooks distance plot
dev <-residuals(bbbc_logit, type = "deviance")
pea <-residuals(bbbc_logit, type = "pearson")
devstd<-residuals(bbbc_logit, type = "deviance")/sqrt(1 - hatvalues(bbbc_logit)) # standardized deviance residuals
peastd <-residuals(bbbc_logit, type = "pearson")/sqrt(1 - hatvalues(bbbc_logit)) # standardized pearson residuals
dev.new(width = 1000, height = 1000, unit = "px")
par(mfrow=c(1,2))
plot(devstd[bbbc_logit$model$Choice==1], col = "red",
ylim = c(-3.5,3.5), ylab = "std. deviance residuals", xlab = "ID")
points(devstd[bbbc_logit$model$Choice==0], col = "blue")
plot(peastd[bbbc_logit$model$Choice==1], col = "red",
ylim = c(-3.5,3.5), ylab = "std. Pearson residuals", xlab = "ID")
points(peastd[bbbc_logit$model$Choice==0], col = "blue")
#model selection
#taking out outliers
cooks_sd_cutoff = 3*sd(cooks.distance(bbbc_logit))
out <- which(cooks.distance(bbbc_logit)>cooks_sd_cutoff)
bbbc_logit <- glm(Choice~.-P_Youth, data = bbbc_train[-out,], family = binomial)
summary(bbbc_logit)
##
## Call:
## glm(formula = Choice ~ . - P_Youth, family = binomial, data = bbbc_train[-out,
## ])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0258 -0.6400 -0.4045 -0.1091 2.4655
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.0595999 0.2181315 -0.273 0.784677
## GenderMale -0.8572699 0.1442686 -5.942 2.81e-09 ***
## Amount_purchased 0.0024290 0.0008208 2.959 0.003084 **
## Frequency -0.1174284 0.0121335 -9.678 < 2e-16 ***
## P_Child -0.2865928 0.0834734 -3.433 0.000596 ***
## P_Cook -0.4020912 0.0837769 -4.800 1.59e-06 ***
## P_DIY -0.3378003 0.1195938 -2.825 0.004734 **
## P_Art 1.4704568 0.1098880 13.381 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1707.9 on 1561 degrees of freedom
## Residual deviance: 1280.2 on 1554 degrees of freedom
## AIC: 1296.2
##
## Number of Fisher Scoring iterations: 5
#Maximized cutoff (change predict object)
pred <- prediction(predict(bbbc_logit, bbbc_test, type = "response"),
bbbc_test$Choice)
#Predicted Probability and True Classification
auc <- round(as.numeric(performance(pred, measure = "auc")@y.values),3)
false.rates <-performance(pred, "fpr","fnr")
accuracy <-performance(pred, "acc","err")
plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values),
type="l", lwd=2,
ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values),
type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, padj=-2, col='red')
min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x
abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)
#ROC plot (don't need to change)
perf <- performance(pred, "tpr","fpr")
plot(perf,colorize = T, main = "ROC Curve")
text(0.5,0.5, paste("AUC:", auc))
#Confusion Matrix (change predict object)
predict = predict(bbbc_logit, bbbc_test, type = "response")
predict_choice = ifelse(predict>optimal, 1,0)
caret::confusionMatrix(as.factor(predict_choice),as.factor(bbbc_test$Choice), positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1511 57
## 1 585 147
##
## Accuracy : 0.7209
## 95% CI : (0.702, 0.7391)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2036
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.72059
## Specificity : 0.72090
## Pos Pred Value : 0.20082
## Neg Pred Value : 0.96365
## Prevalence : 0.08870
## Detection Rate : 0.06391
## Detection Prevalence : 0.31826
## Balanced Accuracy : 0.72074
##
## 'Positive' Class : 1
##
#diagnostics
cooks_distance = cooks.distance(bbbc_logit)
plot(cooks_distance,col="pink", pch=19, cex=1)
text(cooks.distance(bbbc_logit),labels = rownames(bbbc_train))
#hoslem.test(bbbc_logit$Choice, fitted(bbbc_logit), g=10)
exp(coef(bbbc_logit))
## (Intercept) GenderMale Amount_purchased Frequency
## 0.9421414 0.4243189 1.0024319 0.8892042
## P_Child P_Cook P_DIY P_Art
## 0.7508174 0.6689197 0.7133377 4.3512224
LR = Anova(bbbc_logit, test = "LR")
Wald = Anova(bbbc_logit, test = "Wald")
score = anova(bbbc_logit, test = "Rao")
data.frame(LR = LR[,1], Score = score$Rao[2], Wald = Wald$Chisq)
## LR Score Wald
## 1 35.568898 31.07381 35.309507
## 2 8.850279 31.07381 8.757069
## 3 123.870373 31.07381 93.663831
## 4 12.612136 31.07381 11.787838
## 5 25.581283 31.07381 23.035677
## 6 8.344074 31.07381 7.978166
## 7 230.574526 31.07381 179.062355
data.frame(p.LR = LR$`Pr(>Chisq)`, p.Score = score$`Pr(>Chi)`[2], p.Wald = Wald$`Pr(>Chisq)`)
## p.LR p.Score p.Wald
## 1 2.461871e-09 2.483999e-08 2.812558e-09
## 2 2.930440e-03 2.483999e-08 3.084043e-03
## 3 8.993083e-29 2.483999e-08 3.739320e-22
## 4 3.832502e-04 2.483999e-08 5.961892e-04
## 5 4.241336e-07 2.483999e-08 1.590228e-06
## 6 3.869474e-03 2.483999e-08 4.734489e-03
## 7 4.467337e-52 2.483999e-08 7.765136e-41
Coefficients (subtract 1 from coefficient and multiply by 100 to get percentage change per unit increase)
exp(coef(bbbc_logit))
## (Intercept) GenderMale Amount_purchased Frequency
## 0.9421414 0.4243189 1.0024319 0.8892042
## P_Child P_Cook P_DIY P_Art
## 0.7508174 0.6689197 0.7133377 4.3512224
Logistic Regression AIC criteria (takes out P_youth)
bbbc_logit <- glm(Choice~., data = bbbc_train, family = binomial)
#cooks distance plot
dev <-residuals(bbbc_logit, type = "deviance")
pea <-residuals(bbbc_logit, type = "pearson")
devstd<-residuals(bbbc_logit, type = "deviance")/sqrt(1 - hatvalues(bbbc_logit)) # standardized deviance residuals
peastd <-residuals(bbbc_logit, type = "pearson")/sqrt(1 - hatvalues(bbbc_logit)) # standardized pearson residuals
dev.new(width = 1000, height = 1000, unit = "px")
par(mfrow=c(1,2))
plot(devstd[bbbc_logit$model$Choice==1], col = "red",
ylim = c(-3.5,3.5), ylab = "std. deviance residuals", xlab = "ID")
points(devstd[bbbc_logit$model$Choice==0], col = "blue")
plot(peastd[bbbc_logit$model$Choice==1], col = "red",
ylim = c(-3.5,3.5), ylab = "std. Pearson residuals", xlab = "ID")
points(peastd[bbbc_logit$model$Choice==0], col = "blue")
#model selection (AIC)
bbbc_logit = step(bbbc_logit, direction="both",test="Chisq", trace = F)
summary(bbbc_logit)
##
## Call:
## glm(formula = Choice ~ Gender + Amount_purchased + Frequency +
## P_Child + P_Cook + P_DIY + P_Art, family = binomial, data = bbbc_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.30792 -0.69156 -0.47311 -0.02466 2.84228
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.2894506 0.2026211 -1.429 0.15314
## GenderMale -0.8120440 0.1345723 -6.034 1.6e-09 ***
## Amount_purchased 0.0023859 0.0007678 3.108 0.00189 **
## Frequency -0.0885491 0.0103772 -8.533 < 2e-16 ***
## P_Child -0.1964495 0.0720116 -2.728 0.00637 **
## P_Cook -0.2940503 0.0727520 -4.042 5.3e-05 ***
## P_DIY -0.2823065 0.1076089 -2.623 0.00870 **
## P_Art 1.2441330 0.0988603 12.585 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1799.5 on 1599 degrees of freedom
## Residual deviance: 1445.1 on 1592 degrees of freedom
## AIC: 1461.1
##
## Number of Fisher Scoring iterations: 5
#cooks_sd_cutoff = 3*sd(cooks.distance(AIC))
#out <- which(cooks.distance(AIC)>cooks_sd_cutoff)
#taking out outliers
cooks_sd_cutoff = 3*sd(cooks.distance(bbbc_logit))
out <- which(cooks.distance(bbbc_logit)>cooks_sd_cutoff)
bbbc_logit <- glm(Choice ~ Gender + Amount_purchased + Frequency +
P_Child + P_Cook + P_DIY + P_Art, data = bbbc_train[-out,], family = binomial)
#Maximized cutoff (change predict object)
pred <- prediction(predict(bbbc_logit, bbbc_test, type = "response"),
bbbc_test$Choice)
#Predicted Probability and True Classification
auc <- round(as.numeric(performance(pred, measure = "auc")@y.values),3)
false.rates <-performance(pred, "fpr","fnr")
accuracy <-performance(pred, "acc","err")
plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values),
type="l", lwd=2,
ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values),
type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, padj=-2, col='red')
min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x
abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)
#ROC plot (don't need to change)
perf <- performance(pred, "tpr","fpr")
plot(perf,colorize = T, main = "ROC Curve")
text(0.5,0.5, paste("AUC:", auc))
#Confusion Matrix (change predict object)
predict = predict(bbbc_logit, bbbc_test, type = "response")
predict_choice = ifelse(predict>optimal, 1,0)
caret::confusionMatrix(as.factor(predict_choice),as.factor(bbbc_test$Choice), positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1511 57
## 1 585 147
##
## Accuracy : 0.7209
## 95% CI : (0.702, 0.7391)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2036
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.72059
## Specificity : 0.72090
## Pos Pred Value : 0.20082
## Neg Pred Value : 0.96365
## Prevalence : 0.08870
## Detection Rate : 0.06391
## Detection Prevalence : 0.31826
## Balanced Accuracy : 0.72074
##
## 'Positive' Class : 1
##
#diagnostics
cooks_distance = cooks.distance(bbbc_logit)
plot(cooks_distance,col="pink", pch=19, cex=1)
text(cooks.distance(bbbc_logit),labels = rownames(bbbc_train))
#hoslem.test(bbbc_logit$Choice, fitted(bbbc_logit), g=10)
exp(coef(bbbc_logit))
## (Intercept) GenderMale Amount_purchased Frequency
## 0.9421414 0.4243189 1.0024319 0.8892042
## P_Child P_Cook P_DIY P_Art
## 0.7508174 0.6689197 0.7133377 4.3512224
LR = Anova(bbbc_logit, test = "LR")
Wald = Anova(bbbc_logit, test = "Wald")
score = anova(bbbc_logit, test = "Rao")
data.frame(LR = LR[,1], Score = score$Rao[2], Wald = Wald$Chisq)
## LR Score Wald
## 1 35.568898 31.07381 35.309507
## 2 8.850279 31.07381 8.757069
## 3 123.870373 31.07381 93.663831
## 4 12.612136 31.07381 11.787838
## 5 25.581283 31.07381 23.035677
## 6 8.344074 31.07381 7.978166
## 7 230.574526 31.07381 179.062355
data.frame(p.LR = LR$`Pr(>Chisq)`, p.Score = score$`Pr(>Chi)`[2], p.Wald = Wald$`Pr(>Chisq)`)
## p.LR p.Score p.Wald
## 1 2.461871e-09 2.483999e-08 2.812558e-09
## 2 2.930440e-03 2.483999e-08 3.084043e-03
## 3 8.993083e-29 2.483999e-08 3.739320e-22
## 4 3.832502e-04 2.483999e-08 5.961892e-04
## 5 4.241336e-07 2.483999e-08 1.590228e-06
## 6 3.869474e-03 2.483999e-08 4.734489e-03
## 7 4.467337e-52 2.483999e-08 7.765136e-41
Logistic Regression with BIC criteria (takes out predictor P_youth)
bbbc_logit <- glm(Choice~., data = bbbc_train, family = binomial)
#cooks distance plot
dev <-residuals(bbbc_logit, type = "deviance")
pea <-residuals(bbbc_logit, type = "pearson")
devstd<-residuals(bbbc_logit, type = "deviance")/sqrt(1 - hatvalues(bbbc_logit)) # standardized deviance residuals
peastd <-residuals(bbbc_logit, type = "pearson")/sqrt(1 - hatvalues(bbbc_logit)) # standardized pearson residuals
dev.new(width = 1000, height = 1000, unit = "px")
par(mfrow=c(1,2))
plot(devstd[bbbc_logit$model$Choice==1], col = "red",
ylim = c(-3.5,3.5), ylab = "std. deviance residuals", xlab = "ID")
points(devstd[bbbc_logit$model$Choice==0], col = "blue")
plot(peastd[bbbc_logit$model$Choice==1], col = "red",
ylim = c(-3.5,3.5), ylab = "std. Pearson residuals", xlab = "ID")
points(peastd[bbbc_logit$model$Choice==0], col = "blue")
#model selection (BIC)
bbbc_logit = step(bbbc_logit, direction="both",test="Chisq", trace = F,k=log(nrow(bbbc_train)))
summary(bbbc_logit)
##
## Call:
## glm(formula = Choice ~ Gender + Amount_purchased + Frequency +
## P_Child + P_Cook + P_Art, family = binomial, data = bbbc_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2775 -0.6899 -0.4718 -0.0161 2.8399
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.3080705 0.2017240 -1.527 0.12671
## GenderMale -0.8036007 0.1341843 -5.989 2.11e-09 ***
## Amount_purchased 0.0021863 0.0007609 2.873 0.00406 **
## Frequency -0.0878272 0.0103437 -8.491 < 2e-16 ***
## P_Child -0.2193596 0.0713916 -3.073 0.00212 **
## P_Cook -0.3273889 0.0719981 -4.547 5.44e-06 ***
## P_Art 1.2061765 0.0970177 12.433 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1799.5 on 1599 degrees of freedom
## Residual deviance: 1452.2 on 1593 degrees of freedom
## AIC: 1466.2
##
## Number of Fisher Scoring iterations: 5
#identifying outliers
cooks_sd_cutoff = 3*sd(cooks.distance(bbbc_logit))
out <- which(cooks.distance(bbbc_logit)>cooks_sd_cutoff)
#fit BIC model without outliers
bbbc_logit <- glm(Choice ~ Gender + Amount_purchased + Frequency +
P_Child + P_Cook + P_Art, data = bbbc_train[-out,], family = binomial)
#Maximized cutoff (change predict object)
pred <- prediction(predict(bbbc_logit, bbbc_test, type = "response"),
bbbc_test$Choice)
#Predicted Probability and True Classification
auc <- round(as.numeric(performance(pred, measure = "auc")@y.values),3)
false.rates <-performance(pred, "fpr","fnr")
accuracy <-performance(pred, "acc","err")
plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values),
type="l", lwd=2,
ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values),
type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, padj=-2, col='red')
min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x
abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)
#ROC plot (don't need to change)
perf <- performance(pred, "tpr","fpr")
plot(perf,colorize = T, main = "ROC Curve")
text(0.5,0.5, paste("AUC:", auc))
#Confusion Matrix (change predict object)
predict = predict(bbbc_logit, bbbc_test, type = "response")
predict_choice = ifelse(predict>optimal, 1,0)
caret::confusionMatrix(as.factor(predict_choice),as.factor(bbbc_test$Choice), positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1499 59
## 1 597 145
##
## Accuracy : 0.7148
## 95% CI : (0.6958, 0.7332)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1945
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.71078
## Specificity : 0.71517
## Pos Pred Value : 0.19542
## Neg Pred Value : 0.96213
## Prevalence : 0.08870
## Detection Rate : 0.06304
## Detection Prevalence : 0.32261
## Balanced Accuracy : 0.71298
##
## 'Positive' Class : 1
##
#diagnostics
cooks_distance = cooks.distance(bbbc_logit)
plot(cooks_distance,col="pink", pch=19, cex=1)
text(cooks.distance(bbbc_logit),labels = rownames(bbbc_train))
#hoslem.test(bbbc_logit$Choice, fitted(bbbc_logit), g=10)
exp(coef(bbbc_logit))
## (Intercept) GenderMale Amount_purchased Frequency
## 0.9537936 0.4249002 1.0020992 0.8896104
## P_Child P_Cook P_Art
## 0.7377371 0.6418250 4.1014105
LR = Anova(bbbc_logit, test = "LR")
Wald = Anova(bbbc_logit, test = "Wald")
score = anova(bbbc_logit, test = "Rao")
data.frame(LR = LR[,1], Score = score$Rao[2], Wald = Wald$Chisq)
## LR Score Wald
## 1 35.996590 32.0851 35.727003
## 2 6.765368 32.0851 6.712547
## 3 123.677827 32.0851 93.494151
## 4 14.980828 32.0851 13.877597
## 5 31.983008 32.0851 28.249544
## 6 220.492576 32.0851 173.461103
data.frame(p.LR = LR$`Pr(>Chisq)`, p.Score = score$`Pr(>Chi)`[2], p.Wald = Wald$`Pr(>Chisq)`)
## p.LR p.Score p.Wald
## 1 1.976631e-09 1.475649e-08 2.269958e-09
## 2 9.294376e-03 1.475649e-08 9.573691e-03
## 3 9.909506e-29 1.475649e-08 4.074026e-22
## 4 1.086090e-04 1.475649e-08 1.951104e-04
## 5 1.555270e-08 1.475649e-08 1.066397e-07
## 6 7.062212e-50 1.475649e-08 1.297988e-39
SVM Radial Attempt with VIF less than 5 criteria (BEST SO FAR)
bbbc_train[["Choice"]] = factor(bbbc_train[["Choice"]])
#bbbc_tuned = tune.svm(Choice~., data = bbbc_train, gamma = seq(0.01, 0.1, by = 0.01), cost = seq(0.1, 1, by = 0.1))
#bbbc_tuned$best.parameters
mysvm = svm(formula = Choice~., data= bbbc_train, gamma = 0.1, cost = 10000)
summary(mysvm)
##
## Call:
## svm(formula = Choice ~ ., data = bbbc_train, gamma = 0.1, cost = 10000)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 10000
##
## Number of Support Vectors: 712
##
## ( 296 416 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
svmpredict = predict(mysvm, bbbc_test, type = "response")
#table(pred = svmpredict, true = bbbc_test$Choice)
#table(pred = svmpredict, true = bbbc_test$Choice)
caret::confusionMatrix(as.factor(svmpredict),as.factor(bbbc_test$Choice), positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1817 135
## 1 279 69
##
## Accuracy : 0.82
## 95% CI : (0.8037, 0.8355)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1556
##
## Mcnemar's Test P-Value : 2.094e-12
##
## Sensitivity : 0.3382
## Specificity : 0.8669
## Pos Pred Value : 0.1983
## Neg Pred Value : 0.9308
## Prevalence : 0.0887
## Detection Rate : 0.0300
## Detection Prevalence : 0.1513
## Balanced Accuracy : 0.6026
##
## 'Positive' Class : 1
##
#table(bbbc_test$Choice)
#table(svmpredict)
SVM Linear Attempt with VIF less than 5 criteria
g/c: 0.03 0.7 0.1 0.8
bbbc_train[["Choice"]] = factor(bbbc_train[["Choice"]])
#bbbc_tuned = tune.svm(Choice~., data = bbbc_train, gamma = seq(0.01, 0.1, by = 0.01), cost = seq(0.1, 1, by = 0.1))
#bbbc_tuned$best.parameters
mysvm = svm(formula = Choice~., data= bbbc_train, gamma = 0.1, cost = 100000, kernel = "linear")
summary(mysvm)
##
## Call:
## svm(formula = Choice ~ ., data = bbbc_train, gamma = 0.1, cost = 1e+05,
## kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1e+05
##
## Number of Support Vectors: 1133
##
## ( 377 756 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
svmpredict = predict(mysvm, bbbc_test, type = "response")
#table(pred = svmpredict, true = bbbc_test$Choice)
#table(pred = svmpredict, true = bbbc_test$Choice)
caret::confusionMatrix(as.factor(svmpredict),as.factor(bbbc_test$Choice), positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1720 140
## 1 376 64
##
## Accuracy : 0.7757
## 95% CI : (0.758, 0.7926)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0883
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.31373
## Specificity : 0.82061
## Pos Pred Value : 0.14545
## Neg Pred Value : 0.92473
## Prevalence : 0.08870
## Detection Rate : 0.02783
## Detection Prevalence : 0.19130
## Balanced Accuracy : 0.56717
##
## 'Positive' Class : 1
##
#table(bbbc_test$Choice)
#table(svmpredict)
Radial but reintroduce variables we took out due to VIF
bbbc_test <- read_excel("BBBC-Test.xlsx")
bbbc_test <- (subset(bbbc_test, select=-c(Observation)))
bbbc_test$Gender[bbbc_test$Gender==1]="Male";
bbbc_test$Gender[bbbc_test$Gender==0]="Female";
bbbc_train <- read_excel("BBBC-Train.xlsx")
bbbc_train <- (subset(bbbc_train, select=-c(Observation)))
bbbc_train$Gender[bbbc_train$Gender==1]="Male";
bbbc_train$Gender[bbbc_train$Gender==0]="Female";
anyNA(bbbc_train)
## [1] FALSE
anyNA(bbbc_test)
## [1] FALSE
bbbc_test[["Choice"]] = factor(bbbc_test[["Choice"]])
bbbc_train[["Choice"]] = factor(bbbc_train[["Choice"]])
#bbbc_tuned = tune.svm(Choice~., data = bbbc_train, gamma = seq(0.01, 0.1, by = 0.01), cost = seq(0.1, 1, by = 0.1))
#bbbc_tuned$best.parameters
mysvm = svm(formula = Choice~., data= bbbc_train, gamma = 0.1, cost = 10000)
summary(mysvm)
##
## Call:
## svm(formula = Choice ~ ., data = bbbc_train, gamma = 0.1, cost = 10000)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 10000
##
## Number of Support Vectors: 674
##
## ( 275 399 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
svmpredict = predict(mysvm, bbbc_test, type = "response")
#table(pred = svmpredict, true = bbbc_test$Choice)
#table(pred = svmpredict, true = bbbc_test$Choice)
caret::confusionMatrix(as.factor(svmpredict),as.factor(bbbc_test$Choice), positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1776 119
## 1 320 85
##
## Accuracy : 0.8091
## 95% CI : (0.7925, 0.825)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1827
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.41667
## Specificity : 0.84733
## Pos Pred Value : 0.20988
## Neg Pred Value : 0.93720
## Prevalence : 0.08870
## Detection Rate : 0.03696
## Detection Prevalence : 0.17609
## Balanced Accuracy : 0.63200
##
## 'Positive' Class : 1
##
#table(bbbc_test$Choice)
#table(svmpredict)
SVM Linear All Predictors
bbbc_test[["Choice"]] = factor(bbbc_test[["Choice"]])
bbbc_train[["Choice"]] = factor(bbbc_train[["Choice"]])
#bbbc_tuned = tune.svm(Choice~., data = bbbc_train, gamma = seq(0.01, 0.1, by = 0.01), cost = seq(0.1, 1, by = 0.1))
#bbbc_tuned$best.parameters
mysvm = svm(formula = Choice~., data= bbbc_train, gamma = 0.1, cost = 10000, kernel = "linear")
summary(mysvm)
##
## Call:
## svm(formula = Choice ~ ., data = bbbc_train, gamma = 0.1, cost = 10000,
## kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 10000
##
## Number of Support Vectors: 777
##
## ( 369 408 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
svmpredict = predict(mysvm, bbbc_test, type = "response")
#table(pred = svmpredict, true = bbbc_test$Choice)
#table(pred = svmpredict, true = bbbc_test$Choice)
caret::confusionMatrix(as.factor(svmpredict),as.factor(bbbc_test$Choice), positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1988 148
## 1 108 56
##
## Accuracy : 0.8887
## 95% CI : (0.8751, 0.9013)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 0.99990
##
## Kappa : 0.2446
##
## Mcnemar's Test P-Value : 0.01479
##
## Sensitivity : 0.27451
## Specificity : 0.94847
## Pos Pred Value : 0.34146
## Neg Pred Value : 0.93071
## Prevalence : 0.08870
## Detection Rate : 0.02435
## Detection Prevalence : 0.07130
## Balanced Accuracy : 0.61149
##
## 'Positive' Class : 1
##
polynomial svm (worst sensitivity)
bbbc_test[["Choice"]] = factor(bbbc_test[["Choice"]])
bbbc_train[["Choice"]] = factor(bbbc_train[["Choice"]])
#bbbc_tuned = tune.svm(Choice~., data = bbbc_train, gamma = seq(0.01, 0.1, by = 0.01), cost = seq(0.1, 1, by = 0.1))
#bbbc_tuned$best.parameters
mysvm = svm(formula = Choice~., data= bbbc_train, gamma = 0.1, cost = 10000, kernel = "polynomial")
summary(mysvm)
##
## Call:
## svm(formula = Choice ~ ., data = bbbc_train, gamma = 0.1, cost = 10000,
## kernel = "polynomial")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: polynomial
## cost: 10000
## degree: 3
## coef.0: 0
##
## Number of Support Vectors: 664
##
## ( 306 358 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
svmpredict = predict(mysvm, bbbc_test, type = "response")
#table(pred = svmpredict, true = bbbc_test$Choice)
#table(pred = svmpredict, true = bbbc_test$Choice)
caret::confusionMatrix(as.factor(svmpredict),as.factor(bbbc_test$Choice), positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1875 140
## 1 221 64
##
## Accuracy : 0.843
## 95% CI : (0.8275, 0.8577)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1766
##
## Mcnemar's Test P-Value : 2.548e-05
##
## Sensitivity : 0.31373
## Specificity : 0.89456
## Pos Pred Value : 0.22456
## Neg Pred Value : 0.93052
## Prevalence : 0.08870
## Detection Rate : 0.02783
## Detection Prevalence : 0.12391
## Balanced Accuracy : 0.60414
##
## 'Positive' Class : 1
##
SVM Linear with class weights (no change)
bbbc_tuned = tune.svm(Choice~., data = bbbc_train, gamma = seq(0.01, 0.1, by = 0.01), cost = seq(0.1, 1, by = 0.1), class.weights= c("0" = 1, "1" = 10))
bbbc_tuned$best.parameters
## gamma cost class.weights
## 61 0.01 0.7 1
mysvm = svm(formula = Choice~., data= bbbc_train, gamma = bbbc_tuned$best.parameters$gamma, cost = bbbc_tuned$best.parameters$cost, kernel = "linear")
summary(mysvm)
##
## Call:
## svm(formula = Choice ~ ., data = bbbc_train, gamma = bbbc_tuned$best.parameters$gamma,
## cost = bbbc_tuned$best.parameters$cost, kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.7
##
## Number of Support Vectors: 739
##
## ( 367 372 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
svmpredict = predict(mysvm, bbbc_test, type = "response")
table(pred = svmpredict, true = bbbc_test$Choice)
## true
## pred 0 1
## 0 2011 147
## 1 85 57
caret::confusionMatrix(as.factor(svmpredict),as.factor(bbbc_test$Choice), positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2011 147
## 1 85 57
##
## Accuracy : 0.8991
## 95% CI : (0.8861, 0.9111)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 0.9802
##
## Kappa : 0.2768
##
## Mcnemar's Test P-Value : 6.206e-05
##
## Sensitivity : 0.27941
## Specificity : 0.95945
## Pos Pred Value : 0.40141
## Neg Pred Value : 0.93188
## Prevalence : 0.08870
## Detection Rate : 0.02478
## Detection Prevalence : 0.06174
## Balanced Accuracy : 0.61943
##
## 'Positive' Class : 1
##
questions how to prioritize sensitivity? should we try interaction terms?