library(dplyr) #Functions for editing data frames.
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(haven) #Lets R recognize other data file types besides csv.
library(rpart) #Functions for creating trees.
library(rpart.plot) #Functions for plotting trees from rpart.
library(caret) #Functions for model training and evaluation (e.g., confusionMatrix).
## Loading required package: ggplot2
## Loading required package: lattice
Set Working Directory
setwd("G:/My Drive/PTMBACLASSES/DA1ClassPTMBA/ProblemSet4B")
Read the data
PhoneData<-read_sav("SmartPhone_data.sav")
Create Provider Dummies
library(fastDummies)
PhoneData_dum<-dummy_cols(PhoneData, select_columns=c('Provider'), remove_first_dummy = TRUE)
PhoneData_dum1<-select(PhoneData_dum, -Provider)
Note: no other pre-processing appears to be needed for logistic regression or trees.
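As a quick check (our addition, not part of the original pre-processing), we can confirm there are no missing values before splitting:
colSums(is.na(PhoneData_dum1)) #Zero counts in every column mean no imputation is needed.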
Set up training and test set.
phone_train<-filter(PhoneData_dum1, partition=="train") %>% select(-partition)
phone_test<-filter(PhoneData_dum1, partition=="test") %>% select(-partition)
You will be using the data file named SmartPhone_data.sav. It includes demographic and ownership information on 100 telecom customers. It also includes a data partitioning variable (“partition”). The variable SmartPhone indicates whether the customer owns a smartphone (=1) or not (=0).
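Optionally, a quick look at the data confirms the structure described above (glimpse() comes from dplyr, which is already loaded; this check is our addition):
glimpse(PhoneData)          #Variable types and a preview of values.
table(PhoneData$partition)  #How many customers fall in the train vs. test partition.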
phone_all_LR<-glm(SmartPhone~., data=phone_train, family="binomial")
summary(phone_all_LR)
##
## Call:
## glm(formula = SmartPhone ~ ., family = "binomial", data = phone_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -11.29981 2.93589 -3.849 0.000119 ***
## Minutes 0.01777 0.00720 2.469 0.013561 *
## Income 0.05959 0.03101 1.922 0.054653 .
## MonthsService 0.03470 0.05597 0.620 0.535318
## LongDistance 1.23133 0.72140 1.707 0.087851 .
## Bill 0.04359 0.02201 1.980 0.047663 *
## Provider_TMobile 0.62993 0.80784 0.780 0.435527
## Provider_Verizon 0.67284 0.88977 0.756 0.449532
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 103.456 on 75 degrees of freedom
## Residual deviance: 68.387 on 68 degrees of freedom
## AIC: 84.387
##
## Number of Fisher Scoring iterations: 5
phone_null_LR<-glm(SmartPhone~1, data=phone_train, family="binomial")
summary(phone_null_LR)
##
## Call:
## glm(formula = SmartPhone ~ 1, family = "binomial", data = phone_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.3185 0.2323 1.371 0.17
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 103.46 on 75 degrees of freedom
## Residual deviance: 103.46 on 75 degrees of freedom
## AIC: 105.46
##
## Number of Fisher Scoring iterations: 4
step_LR<-step(phone_all_LR, scope=formula(phone_all_LR))
## Start: AIC=84.39
## SmartPhone ~ Minutes + Income + MonthsService + LongDistance +
## Bill + Provider_TMobile + Provider_Verizon
##
## Df Deviance AIC
## - MonthsService 1 68.777 82.777
## - Provider_Verizon 1 68.967 82.967
## - Provider_TMobile 1 68.999 82.999
## <none> 68.387 84.387
## - LongDistance 1 71.554 85.554
## - Income 1 72.295 86.295
## - Bill 1 72.742 86.742
## - Minutes 1 75.499 89.499
##
## Step: AIC=82.78
## SmartPhone ~ Minutes + Income + LongDistance + Bill + Provider_TMobile +
## Provider_Verizon
##
## Df Deviance AIC
## - Provider_TMobile 1 69.536 81.536
## - Provider_Verizon 1 69.574 81.574
## <none> 68.777 82.777
## + MonthsService 1 68.387 84.387
## - LongDistance 1 72.396 84.396
## - Income 1 72.797 84.797
## - Bill 1 73.605 85.605
## - Minutes 1 76.763 88.763
##
## Step: AIC=81.54
## SmartPhone ~ Minutes + Income + LongDistance + Bill + Provider_Verizon
##
## Df Deviance AIC
## - Provider_Verizon 1 69.751 79.751
## <none> 69.536 81.536
## - LongDistance 1 72.578 82.578
## + Provider_TMobile 1 68.777 82.777
## + MonthsService 1 68.999 82.999
## - Income 1 73.894 83.894
## - Bill 1 74.514 84.514
## - Minutes 1 76.835 86.835
##
## Step: AIC=79.75
## SmartPhone ~ Minutes + Income + LongDistance + Bill
##
## Df Deviance AIC
## <none> 69.751 79.751
## - LongDistance 1 72.604 80.604
## + MonthsService 1 69.130 81.130
## + Provider_Verizon 1 69.536 81.536
## + Provider_TMobile 1 69.574 81.574
## - Bill 1 74.540 82.540
## - Income 1 74.717 82.717
## - Minutes 1 77.231 85.231
Let's compare the full model with the stepwise model by evaluating both on the training set.
library(mosaic)
## Registered S3 method overwritten by 'mosaic':
## method from
## fortify.SpatialPolygonsDataFrame ggplot2
##
## The 'mosaic' package masks several functions from core packages in order to add
## additional features. The original behavior of these functions should not be affected by this.
##
## Attaching package: 'mosaic'
## The following object is masked from 'package:Matrix':
##
## mean
## The following object is masked from 'package:caret':
##
## dotPlot
## The following object is masked from 'package:ggplot2':
##
## stat
## The following objects are masked from 'package:dplyr':
##
## count, do, tally
## The following objects are masked from 'package:stats':
##
## binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
## quantile, sd, t.test, var
## The following objects are masked from 'package:base':
##
## max, mean, min, prod, range, sample, sum
#Predictions on the training set: stepwise model, then all-variables model.
pred_LR_step<-predict(step_LR, phone_train, type="response") %>% round()
pred_LR_all<-predict(phone_all_LR, phone_train, type="response") %>% round()
#Training % correct: stepwise model first, then all-variables model.
mean(~(SmartPhone == pred_LR_step), data=phone_train)
## [1] 0.7894737
mean(~(SmartPhone == pred_LR_all), data=phone_train)
## [1] 0.8157895
The full model has the higher training % correct: 81.58% vs. 78.95% for the stepwise model.
Let's check the confusion matrix (classification table).
#step regression
tally(SmartPhone~pred_LR_step, data=phone_train) %>%addmargins()
## pred_LR_step
## SmartPhone 0 1 Sum
## 0 23 9 32
## 1 7 37 44
## Sum 30 46 76
tally(SmartPhone~pred_LR_step, data=phone_train)%>%prop.table(margin=1)%>%round(2)
## pred_LR_step
## SmartPhone 0 1
## 0 0.72 0.28
## 1 0.16 0.84
#All variables
tally(SmartPhone~pred_LR_all, data=phone_train) %>%addmargins()
## pred_LR_all
## SmartPhone 0 1 Sum
## 0 24 8 32
## 1 6 38 44
## Sum 30 46 76
tally(SmartPhone~pred_LR_all, data=phone_train)%>%prop.table(margin=1)%>%round(2)
## pred_LR_all
## SmartPhone 0 1
## 0 0.75 0.25
## 1 0.14 0.86
The all-variables model also outperforms the stepwise model at correctly classifying customers who actually own a smartphone: 86% vs. 84%.
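The same breakdown can also be obtained with caret's confusionMatrix(), which reports sensitivity and specificity directly. A sketch for the all-variables training predictions (the factor conversion and the positive="1" setting are our choices, not part of the original):
#Sensitivity in this output is the % correct among actual owners.
confusionMatrix(factor(pred_LR_all, levels=c(0,1)),
                factor(as.numeric(phone_train$SmartPhone), levels=c(0,1)),
                positive="1")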
summary(step_LR)
##
## Call:
## glm(formula = SmartPhone ~ Minutes + Income + LongDistance +
## Bill, family = "binomial", data = phone_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.096342 2.627966 -3.842 0.000122 ***
## Minutes 0.017132 0.006704 2.555 0.010609 *
## Income 0.063702 0.029680 2.146 0.031847 *
## LongDistance 1.086961 0.662013 1.642 0.100610
## Bill 0.042417 0.020467 2.073 0.038219 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 103.456 on 75 degrees of freedom
## Residual deviance: 69.751 on 71 degrees of freedom
## AIC: 79.751
##
## Number of Fisher Scoring iterations: 5
Minutes, Income, and Bill all significantly increase the likelihood of owning a smartphone; LongDistance is not significant at the 5% level.
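For interpretation (an extra step beyond the assignment), exponentiating the coefficients turns them into odds ratios; values above 1 increase the odds of ownership:
exp(coef(step_LR)) %>% round(3) #Odds ratios for the stepwise model's predictors.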
#STEP LOGISTIC REGRESSION
#predict step on test set
pred_LR_step_test<-predict(step_LR, phone_test, type="response") %>% round()
#step logistic regression
mean(~(SmartPhone == pred_LR_step_test), data=phone_test)
## [1] 0.625
tally(SmartPhone~pred_LR_step_test, data=phone_test)%>%prop.table(margin=1)%>%round(2)
## pred_LR_step_test
## SmartPhone 0 1
## 0 0.46 0.54
## 1 0.18 0.82
#ALL VARIABLES LOGISTIC REGRESSION
#predict all on test set
pred_LR_all_test<-predict(phone_all_LR , phone_test, type="response")%>% round()
#all-variables logistic regression
mean(~(SmartPhone == pred_LR_all_test), data=phone_test)
## [1] 0.6666667
tally(SmartPhone~pred_LR_all_test, data=phone_test)%>%prop.table(margin=1)%>%round(2)
## pred_LR_all_test
## SmartPhone 0 1
## 0 0.46 0.54
## 1 0.09 0.91
On the test set, the all-variables logistic regression again performs better on both counts: overall % correct is 66.7% vs. 62.5%, and % correct for owners is 91% vs. 82% for the stepwise model.
The all-variables model is much better at classifying actual owners (91% correct) than non-owners (46% correct).
First, let's load the libraries needed for trees.
library(rpart)
library(rpart.plot)
Next, let's rebuild the training and test sets from the original data, since trees do not require the dummy variables.
phone_tree_train<-filter(PhoneData, partition=="train") %>% select(-partition)
phone_tree_test<-filter(PhoneData, partition=="test") %>% select(-partition)
tree_cp01<-rpart(SmartPhone~., data=phone_tree_train, method="class", cp=0.01)
rpart.plot(tree_cp01, roundint = FALSE, nn=TRUE, extra=1)
rpart.plot(tree_cp01, roundint = FALSE, nn=TRUE, extra=4)
Yes: the tree assigns this customer to a terminal node with a 62% chance of owning a smartphone, so the prediction is "owner".
Now let's fit the CP=0.07 tree and plot the results.
tree_cp07<-rpart(SmartPhone~., data=phone_tree_train, method="class", cp=0.07)
rpart.plot(tree_cp07, roundint = FALSE, nn=TRUE, extra=1)
rpart.plot(tree_cp07, roundint = FALSE, nn=TRUE, extra=4)
Now let’s predict and calculate accuracy in training data
#CP=0.01
pred_tree_01<-predict(tree_cp01, phone_tree_train, type="class")
mean(~(SmartPhone==pred_tree_01), data=phone_tree_train)
## [1] 0.8026316
tally(SmartPhone~pred_tree_01, data=phone_tree_train)%>%prop.table(margin=1)%>%round(2)
## pred_tree_01
## SmartPhone 0 1
## 0 0.72 0.28
## 1 0.14 0.86
#CP=0.07
pred_tree_07<-predict(tree_cp07, phone_tree_train, type="class")
mean(~(SmartPhone==pred_tree_07), data=phone_tree_train)
## [1] 0.7763158
tally(SmartPhone~pred_tree_07, data=phone_tree_train)%>%prop.table(margin=1)%>%round(2)
## pred_tree_07
## SmartPhone 0 1
## 0 0.56 0.44
## 1 0.07 0.93
% correct at CP=0.01 is 80.26% and at CP=0.07 is 77.63%. Percent correct for owners is 86% with CP=0.01 and 93% with CP=0.07.
Not really; I would have guessed CP=0.01 would do better, since it splits the data into finer slices.
The CP=0.01 tree has the higher overall % correct, which would lead us to use it; however, the CP=0.07 tree actually does a better job of correctly identifying actual owners.
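To see how the complexity parameter drives the number of splits, rpart's cp table can be inspected (an extra check we add here, not part of the original output):
printcp(tree_cp01) #Each row shows a cp threshold, the number of splits, and the cross-validated error.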
pred_tree_01_test<-predict(tree_cp01, phone_tree_test, type="class")
mean(~(SmartPhone==pred_tree_01_test), data=phone_tree_test)
## [1] 0.5833333
tally(SmartPhone~pred_tree_01_test, data=phone_tree_test)%>%prop.table(margin=1)%>%round(2)
## pred_tree_01_test
## SmartPhone 0 1
## 0 0.38 0.62
## 1 0.18 0.82
% correct in the test data is 58.33%.
I am taking this to mean correctly predicting non-ownership (prediction 0, observation 0). Stepwise regression has this at 46% and the CP=0.01 tree at 38%, so stepwise logistic regression performs better.
For correctly classifying owners, they are actually equal at 82%; the all-variables logistic regression is better at 91%.
Pre-processing for kNN, starting from scratch
PhoneData_knn<-read_sav("SmartPhone_data.sav") #Re-read the data so the kNN pre-processing starts from a clean copy.
PD_knn_dum<-dummy_cols(PhoneData_knn, select_columns = c('Provider'), remove_first_dummy = TRUE)
PD_knn<-select(PD_knn_dum, -Provider)
#Min-max normalize the numeric predictors to the 0-1 range so no single variable dominates the distance calculation.
phone_knn<-mutate(PD_knn,
Minutes=(Minutes-min(Minutes))/(max(Minutes)-min(Minutes)),
Income= (Income-min(Income))/(max(Income)-min(Income)),
MonthsService = (MonthsService-min(MonthsService))/(max(MonthsService)-min(MonthsService)),
Bill = (Bill - min(Bill))/(max(Bill)-min(Bill)))
phone_train_knn<-filter(phone_knn, partition=="train")%>%select(-partition)
phone_test_knn<-filter(phone_knn, partition=="test")%>%select(-partition)
#Predictor sets for kNN: drop the outcome (SmartPhone) and the variables excluded from the distance calculation.
phone_knn_train<-select(phone_train_knn,-c(Provider_TMobile, Provider_Verizon, MonthsService, SmartPhone))
phone_knn_test<-select(phone_test_knn,-c(Provider_TMobile, Provider_Verizon, MonthsService, SmartPhone))
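As an aside, the four rescaling formulas above can be written more compactly with dplyr's across(). A sketch under the same min-max assumption (rescale01 and phone_knn_alt are hypothetical names, used so the original objects are not overwritten):
#Min-max scale the numeric predictors to the 0-1 range in one call.
rescale01<-function(x) (x - min(x)) / (max(x) - min(x))
phone_knn_alt<-mutate(PD_knn, across(c(Minutes, Income, MonthsService, Bill), rescale01))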
Running kNN with k=3 on the training data
library(FNN)
knn_3_train<-knn(phone_knn_train, phone_knn_train, phone_train_knn$SmartPhone, k=3)
Checking % correct
library(mosaic)
mean(~(SmartPhone==knn_3_train), data=phone_train_knn)
## [1] 0.8552632
tally(SmartPhone~knn_3_train, data=phone_train_knn)%>%prop.table(margin=1)%>%round(2)
## knn_3_train
## SmartPhone 0 1
## 0 0.84 0.16
## 1 0.14 0.86
%correct is 85.53%
library(FNN)
knn_5_train<-knn(phone_knn_train, phone_knn_train, phone_train_knn$SmartPhone, k=5)
mean(~(SmartPhone==knn_5_train), data=phone_train_knn)
## [1] 0.8157895
tally(SmartPhone~knn_5_train, data=phone_train_knn)%>%prop.table(margin=1)%>%round(2)
## knn_5_train
## SmartPhone 0 1
## 0 0.72 0.28
## 1 0.11 0.89
%correct training is 81.58%
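Rather than fitting each k by hand, one could sweep several values of k and compare training accuracy. A small sketch using the objects defined above (the loop, the grid of k values, and the name pred_k are illustrative additions, not from the original):
#Training % correct for a range of k values.
for (k in c(1, 3, 5, 7, 9)) {
  pred_k<-knn(phone_knn_train, phone_knn_train, phone_train_knn$SmartPhone, k=k)
  cat("k =", k, " training % correct =", round(mean(phone_train_knn$SmartPhone == pred_k), 4), "\n")
}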
Let's evaluate the k=5 model on the test set.
library(FNN)
knn_5_test<-knn(phone_knn_train, phone_knn_test, phone_train_knn$SmartPhone, k=5)
mean(~(SmartPhone==knn_5_test), data=phone_test_knn)
## [1] 0.625
tally(SmartPhone~knn_5_test, data=phone_test_knn)%>%prop.table(margin=1)%>%round(2)
## knn_5_test
## SmartPhone 0 1
## 0 0.46 0.54
## 1 0.18 0.82
The k=3 model did best on the training data, so let's evaluate it on the test set as well.
library(FNN)
knn_3_test<-knn(phone_knn_train, phone_knn_test, phone_train_knn$SmartPhone, k=3)
library(mosaic)
mean(~(SmartPhone==knn_3_test), data=phone_test_knn)
## [1] 0.625
tally(SmartPhone~knn_3_test, data=phone_test_knn)%>%prop.table(margin=1)%>%round(2)
## knn_3_test
## SmartPhone 0 1
## 0 0.46 0.54
## 1 0.18 0.82
In the test data the k=3 model gets % correct of 62.5%, the same as k=5.
I don’t think so.
It is better at predicting that an owner actually owns a smartphone (82% correct) than that a non-owner does not (46% correct).
I assume this refers to correctly classifying non-owners. On the test set: stepwise logistic 46%, CP=0.01 tree 38%, kNN with k=3 46%.
The neural net with 3 hidden nodes has the higher % correct, but the neural net with 5 hidden nodes offers no additional advantage.
I am not sure I could have.
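The neural-network models phone_nnet3 and phone_nnet5, along with phone_train_nnet and phone_test_nnet, were fit earlier in the assignment and their code is not reproduced in this section. Purely as a hedged sketch of how they might have been created, assuming the nnet package, a single hidden layer with 3 or 5 nodes, and the same min-max-scaled data used for kNN (the object assignments, seed, and maxit value are our assumptions):
#Hypothetical reconstruction; the actual fitting code is not shown in this section.
library(nnet)
phone_train_nnet<-phone_train_knn   #scaled training data, SmartPhone included
phone_test_nnet<-phone_test_knn     #scaled test data, SmartPhone included
set.seed(1)                         #nnet starts from random weights, so results vary without a seed
phone_nnet3<-nnet(SmartPhone~., data=phone_train_nnet, size=3, maxit=500)
phone_nnet5<-nnet(SmartPhone~., data=phone_train_nnet, size=5, maxit=500)
Because the starting weights are random, the accuracies reported below would only be reproduced with the original settings.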
nnet3_pred_test<-predict(phone_nnet3, phone_test_nnet)%>%round()
mean(~(SmartPhone==nnet3_pred_test), data=phone_test_nnet)
## [1] 0.6666667
tally(SmartPhone~nnet3_pred_test, data=phone_test_nnet)%>%prop.table(margin=1)%>%round(2)
## nnet3_pred_test
## SmartPhone 0 1
## 0 0.46 0.54
## 1 0.09 0.91
%correct in test is 66.67%
Let's run the prediction for the neural net with 5 hidden nodes.
nnet5_pred_test<-predict(phone_nnet5, phone_test_nnet)%>%round()
mean(~(SmartPhone==nnet5_pred_test), data=phone_test_nnet)
## [1] 0.6666667
tally(SmartPhone~nnet5_pred_test, data=phone_test_nnet)%>%prop.table(margin=1)%>%round(2)
## nnet5_pred_test
## SmartPhone 0 1
## 0 0.46 0.54
## 1 0.09 0.91
To summarize, here are the test-set results for all methods:
#step logistic regression
mean(~(SmartPhone == pred_LR_step_test), data=phone_test)
## [1] 0.625
tally(SmartPhone~pred_LR_step_test, data=phone_test)%>%prop.table(margin=1)%>%round(2)
## pred_LR_step_test
## SmartPhone 0 1
## 0 0.46 0.54
## 1 0.18 0.82
#all-variables logistic regression
mean(~(SmartPhone == pred_LR_all_test), data=phone_test)
## [1] 0.6666667
tally(SmartPhone~pred_LR_all_test, data=phone_test)%>%prop.table(margin=1)%>%round(2)
## pred_LR_all_test
## SmartPhone 0 1
## 0 0.46 0.54
## 1 0.09 0.91
#CP01 Tree
pred_tree_01_test<-predict(tree_cp01, phone_tree_test, type="class")
mean(~(SmartPhone==pred_tree_01_test), data=phone_tree_test)
## [1] 0.5833333
tally(SmartPhone~pred_tree_01_test, data=phone_tree_test)%>%prop.table(margin=1)%>%round(2)
## pred_tree_01_test
## SmartPhone 0 1
## 0 0.38 0.62
## 1 0.18 0.82
#CP07 Tree
pred_tree_07_test<-predict(tree_cp07, phone_tree_test, type="class")
mean(~(SmartPhone==pred_tree_07_test), data=phone_tree_test)
## [1] 0.5833333
tally(SmartPhone~pred_tree_07_test, data=phone_tree_test)%>%prop.table(margin=1)%>%round(2)
## pred_tree_07_test
## SmartPhone 0 1
## 0 0.31 0.69
## 1 0.09 0.91
#kNN 3
mean(~(SmartPhone==knn_3_test), data=phone_test_knn)
## [1] 0.625
tally(SmartPhone~knn_3_test, data=phone_test_knn)%>%prop.table(margin=1)%>%round(2)
## knn_3_test
## SmartPhone 0 1
## 0 0.46 0.54
## 1 0.18 0.82
#kNN5
mean(~(SmartPhone==knn_5_test), data=phone_test_knn)
## [1] 0.625
tally(SmartPhone~knn_5_test, data=phone_test_knn)%>%prop.table(margin=1)%>%round(2)
## knn_5_test
## SmartPhone 0 1
## 0 0.46 0.54
## 1 0.18 0.82
#nnet 3
nnet3_pred_test<-predict(phone_nnet3, phone_test_nnet)%>%round()
mean(~(SmartPhone==nnet3_pred_test), data=phone_test_nnet)
## [1] 0.6666667
tally(SmartPhone~nnet3_pred_test, data=phone_test_nnet)%>%prop.table(margin=1)%>%round(2)
## nnet3_pred_test
## SmartPhone 0 1
## 0 0.46 0.54
## 1 0.09 0.91
#nnet 5
nnet5_pred_test<-predict(phone_nnet5, phone_test_nnet)%>%round()
mean(~(SmartPhone==nnet5_pred_test), data=phone_test_nnet)
## [1] 0.6666667
tally(SmartPhone~nnet5_pred_test, data=phone_test_nnet)%>%prop.table(margin=1)%>%round(2)
## nnet5_pred_test
## SmartPhone 0 1
## 0 0.46 0.54
## 1 0.09 0.91
| TEST          | % correct | % pred owner if owner |
|---------------|-----------|-----------------------|
| All logistic  | 66.67%    | 91% |
| Step Logistic | 62.50%    | 82% |
| CP01 Tree     | 58.33%    | 82% |
| CP07 Tree     | 58.33%    | 91% |
| kNN3          | 62.50%    | 82% |
| kNN5          | 62.50%    | 82% |
| nnet 3        | 66.67%    | 91% |
| nnet 5        | 66.67%    | 91% |
Logistic regression with all variables performs on par with the neural networks: both achieve the highest test-set % correct (66.67%) and correctly classify 91% of actual owners.