library(dplyr) #Functions for editing data frames.
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(haven) #Lets R recognize other data file types besides csv.
library(rpart) #Functions for creating trees.
library(rpart.plot) #Functions for plotting trees from rpart.
library(caret) #Functions for model training and evaluation (e.g., confusionMatrix).
## Loading required package: ggplot2
## Loading required package: lattice
Set Working Directory
setwd("G:/My Drive/PTMBACLASSES/DA1ClassPTMBA/ProblemSet4B")
Read the data
PhoneData<-read_sav("SmartPhone_data.sav")
Create Provider Dummies
library(fastDummies)
PhoneData_dum<-dummy_cols(PhoneData, select_columns=c('Provider'), remove_first_dummy = TRUE)
PhoneData_dum1<-select(PhoneData_dum, -Provider)
Note: no other pre-processing appears to be needed for logistic regression or trees.
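As a quick check (our addition, not part of the original pre-processing), we can confirm there are no missing values before splitting:
colSums(is.na(PhoneData_dum1)) #Zero counts in every column mean no imputation is needed.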
Set up training and test set.
phone_train<-filter(PhoneData_dum1, partition=="train") %>% select(-partition)
phone_test<-filter(PhoneData_dum1, partition=="test") %>% select(-partition)
You will be using the data file named SmartPhone_data.sav. It includes demographic and ownership information on 100 telecom customers. It also includes a data partitioning variable (“partition”). The variable SmartPhone indicates whether the customer owns a smartphone (=1) or not (=0).
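Optionally, a quick look at the data confirms the structure described above (glimpse() comes from dplyr, which is already loaded; this check is our addition):
glimpse(PhoneData)          #Variable types and a preview of values.
table(PhoneData$partition)  #How many customers fall in the train vs. test partition.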
phone_all_LR<-glm(SmartPhone~., data=phone_train, family="binomial")
summary(phone_all_LR)
##
## Call:
## glm(formula = SmartPhone ~ ., family = "binomial", data = phone_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -11.29981 2.93589 -3.849 0.000119 ***
## Minutes 0.01777 0.00720 2.469 0.013561 *
## Income 0.05959 0.03101 1.922 0.054653 .
## MonthsService 0.03470 0.05597 0.620 0.535318
## LongDistance 1.23133 0.72140 1.707 0.087851 .
## Bill 0.04359 0.02201 1.980 0.047663 *
## Provider_TMobile 0.62993 0.80784 0.780 0.435527
## Provider_Verizon 0.67284 0.88977 0.756 0.449532
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 103.456 on 75 degrees of freedom
## Residual deviance: 68.387 on 68 degrees of freedom
## AIC: 84.387
##
## Number of Fisher Scoring iterations: 5
phone_null_LR<-glm(SmartPhone~1, data=phone_train, family="binomial")
summary(phone_null_LR)
##
## Call:
## glm(formula = SmartPhone ~ 1, family = "binomial", data = phone_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.3185 0.2323 1.371 0.17
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 103.46 on 75 degrees of freedom
## Residual deviance: 103.46 on 75 degrees of freedom
## AIC: 105.46
##
## Number of Fisher Scoring iterations: 4
step_LR<-step(phone_all_LR, scope=formula(phone_all_LR))
## Start: AIC=84.39
## SmartPhone ~ Minutes + Income + MonthsService + LongDistance +
## Bill + Provider_TMobile + Provider_Verizon
##
## Df Deviance AIC
## - MonthsService 1 68.777 82.777
## - Provider_Verizon 1 68.967 82.967
## - Provider_TMobile 1 68.999 82.999
## <none> 68.387 84.387
## - LongDistance 1 71.554 85.554
## - Income 1 72.295 86.295
## - Bill 1 72.742 86.742
## - Minutes 1 75.499 89.499
##
## Step: AIC=82.78
## SmartPhone ~ Minutes + Income + LongDistance + Bill + Provider_TMobile +
## Provider_Verizon
##
## Df Deviance AIC
## - Provider_TMobile 1 69.536 81.536
## - Provider_Verizon 1 69.574 81.574
## <none> 68.777 82.777
## + MonthsService 1 68.387 84.387
## - LongDistance 1 72.396 84.396
## - Income 1 72.797 84.797
## - Bill 1 73.605 85.605
## - Minutes 1 76.763 88.763
##
## Step: AIC=81.54
## SmartPhone ~ Minutes + Income + LongDistance + Bill + Provider_Verizon
##
## Df Deviance AIC
## - Provider_Verizon 1 69.751 79.751
## <none> 69.536 81.536
## - LongDistance 1 72.578 82.578
## + Provider_TMobile 1 68.777 82.777
## + MonthsService 1 68.999 82.999
## - Income 1 73.894 83.894
## - Bill 1 74.514 84.514
## - Minutes 1 76.835 86.835
##
## Step: AIC=79.75
## SmartPhone ~ Minutes + Income + LongDistance + Bill
##
## Df Deviance AIC
## <none> 69.751 79.751
## - LongDistance 1 72.604 80.604
## + MonthsService 1 69.130 81.130
## + Provider_Verizon 1 69.536 81.536
## + Provider_TMobile 1 69.574 81.574
## - Bill 1 74.540 82.540
## - Income 1 74.717 82.717
## - Minutes 1 77.231 85.231
Let's compare the full model with the stepwise model by evaluating both on the training set.
library(mosaic)
## Registered S3 method overwritten by 'mosaic':
## method from
## fortify.SpatialPolygonsDataFrame ggplot2
##
## The 'mosaic' package masks several functions from core packages in order to add
## additional features. The original behavior of these functions should not be affected by this.
##
## Attaching package: 'mosaic'
## The following object is masked from 'package:Matrix':
##
## mean
## The following object is masked from 'package:caret':
##
## dotPlot
## The following object is masked from 'package:ggplot2':
##
## stat
## The following objects are masked from 'package:dplyr':
##
## count, do, tally
## The following objects are masked from 'package:stats':
##
## binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
## quantile, sd, t.test, var
## The following objects are masked from 'package:base':
##
## max, mean, min, prod, range, sample, sum
#Predictions on the training set: stepwise model, then all-variables model.
pred_LR_step<-predict(step_LR, phone_train, type="response") %>% round()
pred_LR_all<-predict(phone_all_LR, phone_train, type="response") %>% round()
#Training % correct: stepwise model first, then all-variables model.
mean(~(SmartPhone == pred_LR_step), data=phone_train)
## [1] 0.7894737
mean(~(SmartPhone == pred_LR_all), data=phone_train)
## [1] 0.8157895
The full model has the higher training % correct: 81.58% vs. 78.95% for the stepwise model.
Let's check the confusion matrix (classification table).
#step regression
tally(SmartPhone~pred_LR_step, data=phone_train) %>%addmargins()
## pred_LR_step
## SmartPhone 0 1 Sum
## 0 23 9 32
## 1 7 37 44
## Sum 30 46 76
tally(SmartPhone~pred_LR_step, data=phone_train)%>%prop.table(margin=1)%>%round(2)
## pred_LR_step
## SmartPhone 0 1
## 0 0.72 0.28
## 1 0.16 0.84
#All variables
tally(SmartPhone~pred_LR_all, data=phone_train) %>%addmargins()
## pred_LR_all
## SmartPhone 0 1 Sum
## 0 24 8 32
## 1 6 38 44
## Sum 30 46 76
tally(SmartPhone~pred_LR_all, data=phone_train)%>%prop.table(margin=1)%>%round(2)
## pred_LR_all
## SmartPhone 0 1
## 0 0.75 0.25
## 1 0.14 0.86
The all-variables model also outperforms the stepwise model at correctly classifying customers who actually own a smartphone: 86% vs. 84%.
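The same breakdown can also be obtained with caret's confusionMatrix(), which reports sensitivity and specificity directly. A sketch for the all-variables training predictions (the factor conversion and the positive="1" setting are our choices, not part of the original):
#Sensitivity in this output is the % correct among actual owners.
confusionMatrix(factor(pred_LR_all, levels=c(0,1)),
                factor(as.numeric(phone_train$SmartPhone), levels=c(0,1)),
                positive="1")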
summary(step_LR)
##
## Call:
## glm(formula = SmartPhone ~ Minutes + Income + LongDistance +
## Bill, family = "binomial", data = phone_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.096342 2.627966 -3.842 0.000122 ***
## Minutes 0.017132 0.006704 2.555 0.010609 *
## Income 0.063702 0.029680 2.146 0.031847 *
## LongDistance 1.086961 0.662013 1.642 0.100610
## Bill 0.042417 0.020467 2.073 0.038219 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 103.456 on 75 degrees of freedom
## Residual deviance: 69.751 on 71 degrees of freedom
## AIC: 79.751
##
## Number of Fisher Scoring iterations: 5
Minutes, Income, and Bill all significantly increase the likelihood of owning a smartphone; LongDistance is not significant at the 5% level.
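For interpretation (an extra step beyond the assignment), exponentiating the coefficients turns them into odds ratios; values above 1 increase the odds of ownership:
exp(coef(step_LR)) %>% round(3) #Odds ratios for the stepwise model's predictors.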
#STEP LOGISTIC REGRESSION
#predict step on test set
pred_LR_step_test<-predict(step_LR, phone_test, type="response") %>% round()
#step logistic regression
mean(~(SmartPhone == pred_LR_step_test), data=phone_test)
## [1] 0.625
tally(SmartPhone~pred_LR_step_test, data=phone_test)%>%prop.table(margin=1)%>%round(2)
## pred_LR_step_test
## SmartPhone 0 1
## 0 0.46 0.54
## 1 0.18 0.82
#ALL VARIABLES LOGISTIC REGRESSION
#predict all on test set
pred_LR_all_test<-predict(phone_all_LR , phone_test, type="response")%>% round()
#all-variables logistic regression
mean(~(SmartPhone == pred_LR_all_test), data=phone_test)
## [1] 0.6666667
tally(SmartPhone~pred_LR_all_test, data=phone_test)%>%prop.table(margin=1)%>%round(2)
## pred_LR_all_test
## SmartPhone 0 1
## 0 0.46 0.54
## 1 0.09 0.91
On the test set, the all-variables logistic regression again performs better on both counts: overall % correct is 66.7% vs. 62.5%, and % correct for owners is 91% vs. 82% for the stepwise model.
The all-variables model is much better at classifying actual owners (91% correct) than non-owners (46% correct).
First, let's load the libraries needed for trees.
library(rpart)
library(rpart.plot)
Next, let's rebuild the training and test sets from the original data, since trees do not require the dummy variables.
phone_tree_train<-filter(PhoneData, partition=="train") %>% select(-partition)
phone_tree_test<-filter(PhoneData, partition=="test") %>% select(-partition)
tree_cp01<-rpart(SmartPhone~., data=phone_tree_train, method="class", cp=0.01)
rpart.plot(tree_cp01, roundint = FALSE, nn=TRUE, extra=1)
rpart.plot(tree_cp01, roundint = FALSE, nn=TRUE, extra=4)
Yes: the tree assigns this customer to a terminal node with a 62% chance of owning a smartphone, so the prediction is "owner".
Now let's fit the CP=0.07 tree and plot the results.
tree_cp07<-rpart(SmartPhone~., data=phone_tree_train, method="class", cp=0.07)
rpart.plot(tree_cp07, roundint = FALSE, nn=TRUE, extra=1)
rpart.plot(tree_cp07, roundint = FALSE, nn=TRUE, extra=4)
Now let’s predict and calculate accuracy in training data
#CP=0.01
pred_tree_01<-predict(tree_cp01, phone_tree_train, type="class")
mean(~(SmartPhone==pred_tree_01), data=phone_tree_train)
## [1] 0.8026316
tally(SmartPhone~pred_tree_01, data=phone_tree_train)%>%prop.table(margin=1)%>%round(2)
## pred_tree_01
## SmartPhone 0 1
## 0 0.72 0.28
## 1 0.14 0.86
#CP=0.07
pred_tree_07<-predict(tree_cp07, phone_tree_train, type="class")
mean(~(SmartPhone==pred_tree_07), data=phone_tree_train)
## [1] 0.7763158
tally(SmartPhone~pred_tree_07, data=phone_tree_train)%>%prop.table(margin=1)%>%round(2)
## pred_tree_07
## SmartPhone 0 1
## 0 0.56 0.44
## 1 0.07 0.93
% correct at CP=0.01 is 80.26% and at CP=0.07 is 77.63%. Percent correct for owners is 86% with CP=0.01 and 93% with CP=0.07.
Not really; I would have guessed CP=0.01 would do better, since it splits the data into finer slices.
The CP=0.01 tree has the higher overall % correct, which would lead us to use it; however, the CP=0.07 tree actually does a better job of correctly identifying actual owners.
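To see how the complexity parameter drives the number of splits, rpart's cp table can be inspected (an extra check we add here, not part of the original output):
printcp(tree_cp01) #Each row shows a cp threshold, the number of splits, and the cross-validated error.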
pred_tree_01_test<-predict(tree_cp01, phone_tree_test, type="class")
mean(~(SmartPhone==pred_tree_01_test), data=phone_tree_test)
## [1] 0.5833333
tally(SmartPhone~pred_tree_01_test, data=phone_tree_test)%>%prop.table(margin=1)%>%round(2)
## pred_tree_01_test
## SmartPhone 0 1
## 0 0.38 0.62
## 1 0.18 0.82
% correct in the test data is 58.33%.
I am taking this to mean correctly predicting non-ownership (prediction 0, observation 0). Stepwise regression has this at 46% and the CP=0.01 tree at 38%, so stepwise logistic regression performs better.
For correctly classifying owners, they are actually equal at 82%; the all-variables logistic regression is better at 91%.
Pre-processing for kNN, starting from scratch
PhoneData_knn<-read_sav("SmartPhone_data.sav") #Re-read the data so the kNN pre-processing starts from a clean copy.
PD_knn_dum<-dummy_cols(PhoneData_knn, select_columns = c('Provider'), remove_first_dummy = TRUE)
PD_knn<-select(PD_knn_dum, -Provider)
#Min-max normalize the numeric predictors to the 0-1 range so no single variable dominates the distance calculation.
phone_knn<-mutate(PD_knn,
Minutes=(Minutes-min(Minutes))/(max(Minutes)-min(Minutes)),
Income= (Income-min(Income))/(max(Income)-min(Income)),
MonthsService = (MonthsService-min(MonthsService))/(max(MonthsService)-min(MonthsService)),
Bill = (Bill - min(Bill))/(max(Bill)-min(Bill)))
phone_train_knn<-filter(phone_knn, partition=="train")%>%select(-partition)
phone_test_knn<-filter(phone_knn, partition=="test")%>%select(-partition)
#Predictor sets for kNN: drop the outcome (SmartPhone) and the variables excluded from the distance calculation.
phone_knn_train<-select(phone_train_knn,-c(Provider_TMobile, Provider_Verizon, MonthsService, SmartPhone))
phone_knn_test<-select(phone_test_knn,-c(Provider_TMobile, Provider_Verizon, MonthsService, SmartPhone))
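As an aside, the four rescaling formulas above can be written more compactly with dplyr's across(). A sketch under the same min-max assumption (rescale01 and phone_knn_alt are hypothetical names, used so the original objects are not overwritten):
#Min-max scale the numeric predictors to the 0-1 range in one call.
rescale01<-function(x) (x - min(x)) / (max(x) - min(x))
phone_knn_alt<-mutate(PD_knn, across(c(Minutes, Income, MonthsService, Bill), rescale01))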
Running kNN with k=3 on the training data
library(FNN)
knn_3_train<-knn(phone_knn_train, phone_knn_train, phone_train_knn$SmartPhone, k=3)
Checking % correct
library(mosaic)
mean(~(SmartPhone==knn_3_train), data=phone_train_knn)
## [1] 0.8552632
tally(SmartPhone~knn_3_train, data=phone_train_knn)%>%prop.table(margin=1)%>%round(2)
## knn_3_train
## SmartPhone 0 1
## 0 0.84 0.16
## 1 0.14 0.86
%correct is 85.53%
library(FNN)
knn_5_train<-knn(phone_knn_train, phone_knn_train, phone_train_knn$SmartPhone, k=5)
mean(~(SmartPhone==knn_5_train), data=phone_train_knn)
## [1] 0.8157895
tally(SmartPhone~knn_5_train, data=phone_train_knn)%>%prop.table(margin=1)%>%round(2)
## knn_5_train
## SmartPhone 0 1
## 0 0.72 0.28
## 1 0.11 0.89
%correct training is 81.58%
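Rather than fitting each k by hand, one could sweep several values of k and compare training accuracy. A small sketch using the objects defined above (the loop, the grid of k values, and the name pred_k are illustrative additions, not from the original):
#Training % correct for a range of k values.
for (k in c(1, 3, 5, 7, 9)) {
  pred_k<-knn(phone_knn_train, phone_knn_train, phone_train_knn$SmartPhone, k=k)
  cat("k =", k, " training % correct =", round(mean(phone_train_knn$SmartPhone == pred_k), 4), "\n")
}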
Let's evaluate the k=5 model on the test set.
library(FNN)
knn_5_test<-knn(phone_knn_train, phone_knn_test, phone_train_knn$SmartPhone, k=5)
mean(~(SmartPhone==knn_5_test), data=phone_test_knn)
## [1] 0.625
tally(SmartPhone~knn_5_test, data=phone_test_knn)%>%prop.table(margin=1)%>%round(2)
## knn_5_test
## SmartPhone 0 1
## 0 0.46 0.54
## 1 0.18 0.82
The k=3 model did best on the training data, so let's evaluate it on the test set as well.
library(FNN)
knn_3_test<-knn(phone_knn_train, phone_knn_test, phone_train_knn$SmartPhone, k=3)
library(mosaic)
mean(~(SmartPhone==knn_3_test), data=phone_test_knn)
## [1] 0.625
tally(SmartPhone~knn_3_test, data=phone_test_knn)%>%prop.table(margin=1)%>%round(2)
## knn_3_test
## SmartPhone 0 1
## 0 0.46 0.54
## 1 0.18 0.82
In the test data the k=3 model gets % correct of 62.5%, the same as k=5.
I don’t think so.
It is better at predicting that an owner actually owns a smartphone (82% correct) than that a non-owner does not (46% correct).
I assume this refers to correctly classifying non-owners. On the test set: stepwise logistic 46%, CP=0.01 tree 38%, kNN with k=3 46%.
The neural net with 3 hidden nodes has the higher % correct, but the neural net with 5 hidden nodes offers no additional advantage.
I am not sure I could have.
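The neural-network models phone_nnet3 and phone_nnet5, along with phone_train_nnet and phone_test_nnet, were fit earlier in the assignment and their code is not reproduced in this section. Purely as a hedged sketch of how they might have been created, assuming the nnet package, a single hidden layer with 3 or 5 nodes, and the same min-max-scaled data used for kNN (the object assignments, seed, and maxit value are our assumptions):
#Hypothetical reconstruction; the actual fitting code is not shown in this section.
library(nnet)
phone_train_nnet<-phone_train_knn   #scaled training data, SmartPhone included
phone_test_nnet<-phone_test_knn     #scaled test data, SmartPhone included
set.seed(1)                         #nnet starts from random weights, so results vary without a seed
phone_nnet3<-nnet(SmartPhone~., data=phone_train_nnet, size=3, maxit=500)
phone_nnet5<-nnet(SmartPhone~., data=phone_train_nnet, size=5, maxit=500)
Because the starting weights are random, the accuracies reported below would only be reproduced with the original settings.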
nnet3_pred_test<-predict(phone_nnet3, phone_test_nnet)%>%round()
mean(~(SmartPhone==nnet3_pred_test), data=phone_test_nnet)
## [1] 0.6666667
tally(SmartPhone~nnet3_pred_test, data=phone_test_nnet)%>%prop.table(margin=1)%>%round(2)
## nnet3_pred_test
## SmartPhone 0 1
## 0 0.46 0.54
## 1 0.09 0.91
%correct in test is 66.67%
Let's run the prediction for the neural net with 5 hidden nodes.
nnet5_pred_test<-predict(phone_nnet5, phone_test_nnet)%>%round()
mean(~(SmartPhone==nnet5_pred_test), data=phone_test_nnet)
## [1] 0.6666667
tally(SmartPhone~nnet5_pred_test, data=phone_test_nnet)%>%prop.table(margin=1)%>%round(2)
## nnet5_pred_test
## SmartPhone 0 1
## 0 0.46 0.54
## 1 0.09 0.91
To summarize, here are the test-set results for all methods:
#step logistic regression
mean(~(SmartPhone == pred_LR_step_test), data=phone_test)
## [1] 0.625
tally(SmartPhone~pred_LR_step_test, data=phone_test)%>%prop.table(margin=1)%>%round(2)
## pred_LR_step_test
## SmartPhone 0 1
## 0 0.46 0.54
## 1 0.18 0.82
#all-variables logistic regression
mean(~(SmartPhone == pred_LR_all_test), data=phone_test)
## [1] 0.6666667
tally(SmartPhone~pred_LR_all_test, data=phone_test)%>%prop.table(margin=1)%>%round(2)
## pred_LR_all_test
## SmartPhone 0 1
## 0 0.46 0.54
## 1 0.09 0.91
#CP01 Tree
pred_tree_01_test<-predict(tree_cp01, phone_tree_test, type="class")
mean(~(SmartPhone==pred_tree_01_test), data=phone_tree_test)
## [1] 0.5833333
tally(SmartPhone~pred_tree_01_test, data=phone_tree_test)%>%prop.table(margin=1)%>%round(2)
## pred_tree_01_test
## SmartPhone 0 1
## 0 0.38 0.62
## 1 0.18 0.82
#CP07 Tree
pred_tree_07_test<-predict(tree_cp07, phone_tree_test, type="class")
mean(~(SmartPhone==pred_tree_07_test), data=phone_tree_test)
## [1] 0.5833333
tally(SmartPhone~pred_tree_07_test, data=phone_tree_test)%>%prop.table(margin=1)%>%round(2)
## pred_tree_07_test
## SmartPhone 0 1
## 0 0.31 0.69
## 1 0.09 0.91
#kNN 3
mean(~(SmartPhone==knn_3_test), data=phone_test_knn)
## [1] 0.625
tally(SmartPhone~knn_3_test, data=phone_test_knn)%>%prop.table(margin=1)%>%round(2)
## knn_3_test
## SmartPhone 0 1
## 0 0.46 0.54
## 1 0.18 0.82
#kNN5
mean(~(SmartPhone==knn_5_test), data=phone_test_knn)
## [1] 0.625
tally(SmartPhone~knn_5_test, data=phone_test_knn)%>%prop.table(margin=1)%>%round(2)
## knn_5_test
## SmartPhone 0 1
## 0 0.46 0.54
## 1 0.18 0.82
#nnet 3
nnet3_pred_test<-predict(phone_nnet3, phone_test_nnet)%>%round()
mean(~(SmartPhone==nnet3_pred_test), data=phone_test_nnet)
## [1] 0.6666667
tally(SmartPhone~nnet3_pred_test, data=phone_test_nnet)%>%prop.table(margin=1)%>%round(2)
## nnet3_pred_test
## SmartPhone 0 1
## 0 0.46 0.54
## 1 0.09 0.91
#nnet 5
nnet5_pred_test<-predict(phone_nnet5, phone_test_nnet)%>%round()
mean(~(SmartPhone==nnet5_pred_test), data=phone_test_nnet)
## [1] 0.6666667
tally(SmartPhone~nnet5_pred_test, data=phone_test_nnet)%>%prop.table(margin=1)%>%round(2)
## nnet5_pred_test
## SmartPhone 0 1
## 0 0.46 0.54
## 1 0.09 0.91
| TEST          | % correct | % pred owner if owner |
|---------------|-----------|-----------------------|
| All logistic  | 66.67%    | 91% |
| Step Logistic | 62.50%    | 82% |
| CP01 Tree     | 58.33%    | 82% |
| CP07 Tree     | 58.33%    | 91% |
| kNN3          | 62.50%    | 82% |
| kNN5          | 62.50%    | 82% |
| nnet 3        | 66.67%    | 91% |
| nnet 5        | 66.67%    | 91% |
Logistic regression with all variables performs on par with the neural networks: both achieve the highest test-set % correct (66.67%) and correctly classify 91% of actual owners.