Load libraries

library(readr)
library(dplyr)
library(ggplot2)
library(caret)
library(corrplot)
library(knitr)
library(kableExtra)

Load Source data

# Read the completed survey (used below for training and testing) and the
# incomplete survey (whose brand preference is predicted at the end).
survey <- read.csv(file = "CompleteResponses.csv")
test_incomplete <- read.csv(file = "SurveyIncomplete.csv")

Data Preprocessing

# Number of rows and columns in the completed survey.
dim(survey)
## [1] 9898    7
# Column types and a preview of the first values in each column.
str(survey)
## 'data.frame':    9898 obs. of  7 variables:
##  $ salary : num  119807 106880 78021 63690 50874 ...
##  $ age    : int  45 63 23 51 20 56 24 62 29 41 ...
##  $ elevel : int  0 1 0 3 3 3 4 3 4 1 ...
##  $ car    : int  14 11 15 6 14 14 8 3 17 5 ...
##  $ zipcode: int  4 6 2 5 4 3 5 0 0 4 ...
##  $ credit : num  442038 45007 48795 40889 352951 ...
##  $ brand  : int  0 1 0 1 0 1 1 1 0 1 ...
# Column names only.
names(survey)
## [1] "salary"  "age"     "elevel"  "car"     "zipcode" "credit"  "brand"
## Check unique values of columns
# Frequency of each code in the integer-coded columns (brand is the outcome).
table(survey$brand)
## 
##    0    1 
## 3744 6154
table(survey$elevel)
## 
##    0    1    2    3    4 
## 2052 1948 1983 1947 1968
table(survey$car)
## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
## 492 509 488 479 505 477 495 511 487 500 473 498 473 494 542 470 508 524 489 484
table(survey$zipcode)
## 
##    0    1    2    3    4    5    6    7    8 
## 1085 1053 1112 1080 1087 1108 1155 1083 1135
## Check for null values
# Total count of NA cells across the whole data frame.
sum(is.na(survey))
## [1] 0

Change brand column to a factor for classification model

# Encode the outcome as a factor so the models below are fit as classifiers.
survey[["brand"]] <- as.factor(survey[["brand"]])

Classification Modeling

Step 1: Set the seed number

# Fix the RNG seed before the data partition below so the split is repeatable.
set.seed(123)

Step 2: Partition the data into training and test sets

# Split on the outcome: 75% of the rows go to training, the rest to testing.
row_index <- createDataPartition(
  y = survey$brand,
  p = 0.75,
  list = FALSE
)
training_data <- survey[row_index, ]
testing_data <- survey[-row_index, ]
str(testing_data)
## 'data.frame':    2474 obs. of  7 variables:
##  $ salary : num  50874 37803 82475 107710 148495 ...
##  $ age    : int  20 41 33 75 62 42 26 59 63 36 ...
##  $ elevel : int  3 1 4 0 2 4 0 2 0 4 ...
##  $ car    : int  14 5 13 16 9 8 18 5 7 8 ...
##  $ zipcode: int  4 4 3 2 1 2 2 6 7 7 ...
##  $ credit : num  352951 493219 424657 209002 377495 ...
##  $ brand  : Factor w/ 2 levels "0","1": 1 2 1 2 2 2 1 2 2 1 ...

Step 3: Cross Validation

# Resampling scheme for the auto- and grid-tuned models:
# 10-fold cross-validation, run once.
cv <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 1
)

Step 4: Compare different models

Compare Random Forest model (Auto tuning, Manual tuning, Random tuning, with selected features) vs C5.0 model (Auto, Manual, Random, Selected features)

Random Forest

Model 1: Random Forest with Auto Tuning and all features

# Model 1: random forest on all predictors; tuneLength = 2 asks caret for
# two candidate mtry values, scored by cross-validated accuracy.
set.seed(123)
rf_model <- train(
  brand ~ .,
  data = training_data,
  method = "rf",
  trControl = cv,
  metric = "Accuracy",
  tuneLength = 2
)

rf_model
## Random Forest 
## 
## 7424 samples
##    6 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times) 
## Summary of sample sizes: 6681, 6681, 6681, 6682, 6681, 6683, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   2     0.9175632  0.8250521
##   6     0.9125812  0.8140643
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.

Check feature importance

# Plot unscaled variable importance for the auto-tuned random forest.
ggplot(varImp(rf_model, scale = FALSE)) +
  theme_bw()

Salary, age, credit are top features.

Model 2: Random Forest with manual tuning of mtry (5 different values)

# Model 2: random forest on all predictors with an explicit mtry grid.
# Building the grid draws no random numbers, so it is defined before seeding.
rfGrid <- expand.grid(mtry = c(2, 3, 4, 5, 6))

set.seed(123)
rf_mod_manual <- train(
  brand ~ .,
  data = training_data,
  method = "rf",
  trControl = cv,
  metric = "Accuracy",
  tuneGrid = rfGrid
)

rf_mod_manual
## Random Forest 
## 
## 7424 samples
##    6 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times) 
## Summary of sample sizes: 6681, 6681, 6681, 6682, 6681, 6683, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   2     0.9176981  0.8252764
##   3     0.9176985  0.8252930
##   4     0.9171598  0.8240410
##   5     0.9155429  0.8203682
##   6     0.9132536  0.8155551
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.

Check feature importance

# Plot unscaled variable importance for the grid-tuned random forest.
ggplot(varImp(rf_mod_manual, scale = FALSE)) +
  theme_bw() +
  ggtitle("Feature Importances by Random Forest model")

Model 3: Random Forest with Random tuning of mtry

# Model 3: random forest with random search over mtry.
# Resampling here is 10-fold CV repeated twice, with randomly drawn candidates.
cv_random <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 2,
  search = "random"
)

set.seed(123)
rf_mod_random <- train(
  brand ~ .,
  data = training_data,
  method = "rf",
  trControl = cv_random,
  metric = "Accuracy",
  tuneLength = 5
)

rf_mod_random
## Random Forest 
## 
## 7424 samples
##    6 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 2 times) 
## Summary of sample sizes: 6681, 6681, 6681, 6682, 6681, 6683, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   2     0.9187749  0.8276598
##   3     0.9166870  0.8231164
##   6     0.9128485  0.8146097
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.

Model 4: Random Forest, Auto tuning and training with selected features- salary,age,credit

# Model 4: random forest restricted to salary, age and credit, auto-tuned
# over two candidate mtry values.
set.seed(123)
rf_mod_features <- train(
  brand ~ salary + age + credit,
  data = training_data,
  method = "rf",
  trControl = cv,
  metric = "Accuracy",
  tuneLength = 2
)

rf_mod_features
## Random Forest 
## 
## 7424 samples
##    3 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times) 
## Summary of sample sizes: 6681, 6681, 6681, 6682, 6681, 6683, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   2     0.9144674  0.8180274
##   3     0.9119079  0.8124586
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.

Model 5: Random forest, manual tuning and selected features

# Model 5: random forest on salary, age and credit with a manual mtry grid.
# mtry is the number of predictors sampled at each split, so it cannot exceed
# the three predictors in this formula. The previous grid (2 through 6)
# included 4, 5 and 6, which randomForest resets to the valid range with a
# warning, so those rows were repeated fits of mtry = 3 rather than distinct
# candidates. Re-knit to refresh the printed results for this model.
rfGrid <- expand.grid(mtry = c(1, 2, 3))

set.seed(123)
rf_mod_manual_features <- train(
  brand ~ salary + age + credit,
  data = training_data,
  method = "rf",
  trControl = cv,
  metric = "Accuracy",
  tuneGrid = rfGrid
)

rf_mod_manual_features
## Random Forest 
## 
## 7424 samples
##    3 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times) 
## Summary of sample sizes: 6681, 6681, 6681, 6682, 6681, 6683, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   2     0.9151398  0.8194564
##   3     0.9106935  0.8098159
##   4     0.9096166  0.8076917
##   5     0.9102892  0.8091063
##   6     0.9123109  0.8133812
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.

Storing the model results for easy reference

# Collect the resampling results of every random forest model, tagging each
# set of rows with the model it came from.
results_1 <- data.frame(rf_model$results)
results_1[["Model_name"]] <- "Random forest Auto tuning"

results_2 <- data.frame(rf_mod_manual$results)
results_2[["Model_name"]] <- "Random Forest Manual tuning"

results_3 <- data.frame(rf_mod_features$results)
results_3[["Model_name"]] <- "Random Forest Auto tuning Selected features"

results_4 <- data.frame(rf_mod_random$results)
results_4[["Model_name"]] <- "Random Forest Random tuning"

results_5 <- data.frame(rf_mod_manual_features$results)
results_5[["Model_name"]] <- "Random Forest Manual tuning Selected features"

# Stack in the same order as before so the table rows are unchanged.
Random_Forest_results <- rbind(
  results_1,
  results_2,
  results_3,
  results_4,
  results_5
)
kable(
  Random_Forest_results,
  format = "html",
  caption = "Results of all Random Forest models",
  digits = 4
) %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Results of all Random Forest models
mtry Accuracy Kappa AccuracySD KappaSD Model_name
2 0.9176 0.8251 0.0069 0.0147 Random forest Auto tuning
6 0.9126 0.8141 0.0048 0.0104 Random forest Auto tuning
2 0.9177 0.8253 0.0066 0.0142 Random Forest Manual tuning
3 0.9177 0.8253 0.0068 0.0145 Random Forest Manual tuning
4 0.9172 0.8240 0.0072 0.0155 Random Forest Manual tuning
5 0.9155 0.8204 0.0062 0.0132 Random Forest Manual tuning
6 0.9133 0.8156 0.0077 0.0162 Random Forest Manual tuning
2 0.9145 0.8180 0.0087 0.0186 Random Forest Auto tuning Selected features
3 0.9119 0.8125 0.0067 0.0144 Random Forest Auto tuning Selected features
2 0.9188 0.8277 0.0075 0.0158 Random Forest Random tuning
3 0.9167 0.8231 0.0063 0.0134 Random Forest Random tuning
6 0.9128 0.8146 0.0073 0.0152 Random Forest Random tuning
2 0.9151 0.8195 0.0085 0.0184 Random Forest Manual tuning Selected features
3 0.9107 0.8098 0.0088 0.0190 Random Forest Manual tuning Selected features
4 0.9096 0.8077 0.0086 0.0185 Random Forest Manual tuning Selected features
5 0.9103 0.8091 0.0082 0.0176 Random Forest Manual tuning Selected features
6 0.9123 0.8134 0.0083 0.0178 Random Forest Manual tuning Selected features

C5.0 Model

Model 6: C5.0 model with Auto tuning and all features

# Model 6: C5.0 on all predictors, auto-tuned with tuneLength = 2.
set.seed(123)
C5_model <- train(
  brand ~ .,
  data = training_data,
  method = "C5.0",
  trControl = cv,
  metric = "Accuracy",
  tuneLength = 2
)

C5_model
## C5.0 
## 
## 7424 samples
##    6 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times) 
## Summary of sample sizes: 6681, 6681, 6681, 6682, 6681, 6683, ... 
## Resampling results across tuning parameters:
## 
##   model  winnow  trials  Accuracy   Kappa    
##   rules  FALSE    1      0.8715035  0.7374790
##   rules  FALSE   10      0.9136570  0.8154378
##   rules   TRUE    1      0.8739270  0.7423625
##   rules   TRUE   10      0.9155420  0.8197228
##   tree   FALSE    1      0.8688088  0.7251092
##   tree   FALSE   10      0.9152735  0.8201520
##   tree    TRUE    1      0.8721757  0.7320037
##   tree    TRUE   10      0.9159439  0.8217090
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were trials = 10, model = tree and winnow
##  = TRUE.

Check feature importance

# Plot unscaled variable importance for the auto-tuned C5.0 model.
ggplot(varImp(C5_model, scale = FALSE)) +
  theme_bw()

Model 7: C5.0 model, Manual tuning and with all features

# Model 7: C5.0 on all predictors with an explicit grid over boosting trials,
# model type and winnowing. The grid is built before seeding because it draws
# no random numbers.
C5grid <- expand.grid(
  trials = c(1, 5, 10),
  model = c("tree", "rules"),
  winnow = c(TRUE, FALSE)
)

set.seed(123)
C5_mod_manual <- train(
  brand ~ .,
  data = training_data,
  method = "C5.0",
  trControl = cv,
  metric = "Accuracy",
  tuneGrid = C5grid
)

C5_mod_manual
## C5.0 
## 
## 7424 samples
##    6 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times) 
## Summary of sample sizes: 6681, 6681, 6681, 6682, 6681, 6683, ... 
## Resampling results across tuning parameters:
## 
##   model  winnow  trials  Accuracy   Kappa    
##   rules  FALSE    1      0.8715035  0.7374790
##   rules  FALSE    5      0.9133862  0.8164083
##   rules  FALSE   10      0.9136570  0.8154378
##   rules   TRUE    1      0.8739270  0.7423625
##   rules   TRUE    5      0.9128493  0.8157683
##   rules   TRUE   10      0.9155420  0.8197228
##   tree   FALSE    1      0.8688088  0.7251092
##   tree   FALSE    5      0.9143323  0.8186833
##   tree   FALSE   10      0.9152735  0.8201520
##   tree    TRUE    1      0.8721757  0.7320037
##   tree    TRUE    5      0.9096162  0.8085107
##   tree    TRUE   10      0.9159439  0.8217090
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were trials = 10, model = tree and winnow
##  = TRUE.

Check feature importance

# Plot unscaled variable importance for the grid-tuned C5.0 model.
ggplot(varImp(C5_mod_manual, scale = FALSE)) +
  theme_bw()

Model 8: C5.0 model , Random tuning and all features

# Model 8: C5.0 on all predictors with random search over the tuning
# parameters. The control object matches the one used for the random-search
# random forest: 10-fold CV repeated twice.
cv_random <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 2,
  search = "random"
)

set.seed(123)
C5_mod_random <- train(
  brand ~ .,
  data = training_data,
  method = "C5.0",
  trControl = cv_random,
  metric = "Accuracy",
  tuneLength = 5
)

C5_mod_random
## C5.0 
## 
## 7424 samples
##    6 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 2 times) 
## Summary of sample sizes: 6681, 6681, 6681, 6682, 6681, 6683, ... 
## Resampling results across tuning parameters:
## 
##   model  winnow  trials  Accuracy   Kappa    
##   rules  FALSE   50      0.9142596  0.8175991
##   rules  FALSE   51      0.9142596  0.8175991
##   rules   TRUE   42      0.9176262  0.8249437
##   tree   FALSE   67      0.9160127  0.8218737
##   tree    TRUE   14      0.9145302  0.8188681
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were trials = 42, model = rules and
##  winnow = TRUE.

Model 9: C5.0 model , Auto tuning and selected features

# Model 9: C5.0 restricted to salary, age, car and credit, auto-tuned with
# tuneLength = 2.
set.seed(123)
C5_mod_features <- train(
  brand ~ salary + age + car + credit,
  data = training_data,
  method = "C5.0",
  trControl = cv,
  metric = "Accuracy",
  tuneLength = 2
)

C5_mod_features
## C5.0 
## 
## 7424 samples
##    4 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times) 
## Summary of sample sizes: 6681, 6681, 6681, 6682, 6681, 6683, ... 
## Resampling results across tuning parameters:
## 
##   model  winnow  trials  Accuracy   Kappa    
##   rules  FALSE    1      0.8719087  0.7372298
##   rules  FALSE   10      0.9171583  0.8230345
##   rules   TRUE    1      0.8708292  0.7354013
##   rules   TRUE   10      0.9152728  0.8187035
##   tree   FALSE    1      0.8705602  0.7276431
##   tree   FALSE   10      0.9162158  0.8220200
##   tree    TRUE    1      0.8697507  0.7262263
##   tree    TRUE   10      0.9166185  0.8231757
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were trials = 10, model = rules and
##  winnow = FALSE.

Model 10: C5.0 model, manual tuning and selected features

# Model 10: C5.0 on salary, age, car and credit with an explicit grid
# (1 to 3 boosting trials, both model types, winnowing on and off).
C5grid_features <- expand.grid(
  trials = c(1, 2, 3),
  model = c("tree", "rules"),
  winnow = c(TRUE, FALSE)
)

set.seed(123)
C5_mod_manual_features <- train(
  brand ~ salary + age + car + credit,
  data = training_data,
  method = "C5.0",
  trControl = cv,
  metric = "Accuracy",
  tuneGrid = C5grid_features
)

C5_mod_manual_features
## C5.0 
## 
## 7424 samples
##    4 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times) 
## Summary of sample sizes: 6681, 6681, 6681, 6682, 6681, 6683, ... 
## Resampling results across tuning parameters:
## 
##   model  winnow  trials  Accuracy   Kappa    
##   rules  FALSE   1       0.8719087  0.7372298
##   rules  FALSE   2       0.9053070  0.7951399
##   rules  FALSE   3       0.9108295  0.8122261
##   rules   TRUE   1       0.8708292  0.7354013
##   rules   TRUE   2       0.9034237  0.7915440
##   rules   TRUE   3       0.9088089  0.8082261
##   tree   FALSE   1       0.8705602  0.7276431
##   tree   FALSE   2       0.9073238  0.8035706
##   tree   FALSE   3       0.9102912  0.8102284
##   tree    TRUE   1       0.8697507  0.7262263
##   tree    TRUE   2       0.9031483  0.7939397
##   tree    TRUE   3       0.9123124  0.8153281
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were trials = 3, model = tree and winnow
##  = TRUE.

Storing the results for easy reference

# Collect the resampling results of every C5.0 model, tagging each set of
# rows with the model it came from.
C5_result1 <- data.frame(C5_model$results)
C5_result1["Model_name"] <- "C5 model Auto tuning"
C5_result2 <- data.frame(C5_mod_manual$results)
C5_result2["Model_name"] <- "C5 model Manual tuning"
C5_result3 <- data.frame(C5_mod_features$results)
C5_result3["Model_name"] <- "C5 model Auto tuning selected features"
C5_result4 <- data.frame(C5_mod_random$results)
C5_result4["Model_name"] <- "C5 model Random tuning"
C5_result5 <- data.frame(C5_mod_manual_features$results)
C5_result5["Model_name"] <- "C5 model Manual tuning selected features"

C5_model_results <- rbind(
  C5_result1,
  C5_result2,
  C5_result3,
  C5_result4,
  C5_result5
)
# The caption previously read "Results of all Random Forest models", copied
# from the random forest table; this table holds the C5.0 results.
kable(
  C5_model_results,
  format = "html",
  caption = "Results of all C5.0 models",
  digits = 4
) %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Results of all C5.0 models
model winnow trials Accuracy Kappa AccuracySD KappaSD Model_name
5 rules FALSE 1 0.8715 0.7375 0.0517 0.0967 C5 model Auto tuning
7 rules TRUE 1 0.8739 0.7424 0.0516 0.0967 C5 model Auto tuning
1 tree FALSE 1 0.8688 0.7251 0.0526 0.1105 C5 model Auto tuning
3 tree TRUE 1 0.8722 0.7320 0.0528 0.1120 C5 model Auto tuning
6 rules FALSE 10 0.9137 0.8154 0.0052 0.0118 C5 model Auto tuning
8 rules TRUE 10 0.9155 0.8197 0.0059 0.0132 C5 model Auto tuning
2 tree FALSE 10 0.9153 0.8202 0.0069 0.0143 C5 model Auto tuning
4 tree TRUE 10 0.9159 0.8217 0.0079 0.0169 C5 model Auto tuning
71 rules FALSE 1 0.8715 0.7375 0.0517 0.0967 C5 model Manual tuning
10 rules TRUE 1 0.8739 0.7424 0.0516 0.0967 C5 model Manual tuning
13 tree FALSE 1 0.8688 0.7251 0.0526 0.1105 C5 model Manual tuning
41 tree TRUE 1 0.8722 0.7320 0.0528 0.1120 C5 model Manual tuning
81 rules FALSE 5 0.9134 0.8164 0.0085 0.0178 C5 model Manual tuning
11 rules TRUE 5 0.9128 0.8158 0.0052 0.0111 C5 model Manual tuning
21 tree FALSE 5 0.9143 0.8187 0.0081 0.0166 C5 model Manual tuning
51 tree TRUE 5 0.9096 0.8085 0.0096 0.0202 C5 model Manual tuning
9 rules FALSE 10 0.9137 0.8154 0.0052 0.0118 C5 model Manual tuning
12 rules TRUE 10 0.9155 0.8197 0.0059 0.0132 C5 model Manual tuning
31 tree FALSE 10 0.9153 0.8202 0.0069 0.0143 C5 model Manual tuning
61 tree TRUE 10 0.9159 0.8217 0.0079 0.0169 C5 model Manual tuning
52 rules FALSE 1 0.8719 0.7372 0.0512 0.0962 C5 model Auto tuning selected features
72 rules TRUE 1 0.8708 0.7354 0.0515 0.0959 C5 model Auto tuning selected features
14 tree FALSE 1 0.8706 0.7276 0.0524 0.1113 C5 model Auto tuning selected features
32 tree TRUE 1 0.8698 0.7262 0.0534 0.1139 C5 model Auto tuning selected features
62 rules FALSE 10 0.9172 0.8230 0.0092 0.0199 C5 model Auto tuning selected features
82 rules TRUE 10 0.9153 0.8187 0.0084 0.0184 C5 model Auto tuning selected features
22 tree FALSE 10 0.9162 0.8220 0.0057 0.0124 C5 model Auto tuning selected features
42 tree TRUE 10 0.9166 0.8232 0.0071 0.0150 C5 model Auto tuning selected features
53 tree TRUE 14 0.9145 0.8189 0.0080 0.0166 C5 model Random tuning
33 rules TRUE 42 0.9176 0.8249 0.0097 0.0206 C5 model Random tuning
15 rules FALSE 50 0.9143 0.8176 0.0111 0.0240 C5 model Random tuning
23 rules FALSE 51 0.9143 0.8176 0.0111 0.0240 C5 model Random tuning
43 tree FALSE 67 0.9160 0.8219 0.0089 0.0189 C5 model Random tuning
73 rules FALSE 1 0.8719 0.7372 0.0512 0.0962 C5 model Manual tuning selected features
101 rules TRUE 1 0.8708 0.7354 0.0515 0.0959 C5 model Manual tuning selected features
16 tree FALSE 1 0.8706 0.7276 0.0524 0.1113 C5 model Manual tuning selected features
44 tree TRUE 1 0.8698 0.7262 0.0534 0.1139 C5 model Manual tuning selected features
83 rules FALSE 2 0.9053 0.7951 0.0071 0.0172 C5 model Manual tuning selected features
111 rules TRUE 2 0.9034 0.7915 0.0108 0.0241 C5 model Manual tuning selected features
24 tree FALSE 2 0.9073 0.8036 0.0097 0.0196 C5 model Manual tuning selected features
54 tree TRUE 2 0.9031 0.7939 0.0133 0.0277 C5 model Manual tuning selected features
91 rules FALSE 3 0.9108 0.8122 0.0066 0.0137 C5 model Manual tuning selected features
121 rules TRUE 3 0.9088 0.8082 0.0093 0.0192 C5 model Manual tuning selected features
34 tree FALSE 3 0.9103 0.8102 0.0090 0.0192 C5 model Manual tuning selected features
63 tree TRUE 3 0.9123 0.8153 0.0084 0.0173 C5 model Manual tuning selected features

Choose the best model for prediction

Random Forest manual tuned model with all features is selected for predicting the test data.

# Score the held-out test set with the grid-tuned random forest and report
# accuracy and kappa against the true brand labels.
set.seed(123)

# Use `<-` for assignment, consistent with the rest of the script.
Predicted <- predict(rf_mod_manual, testing_data)

postResample(Predicted, testing_data$brand)
##  Accuracy     Kappa 
## 0.9264349 0.8445849
# Keep the test rows side by side with their predicted brand, then count the
# predictions per class.
Actual_vs_Predicted <- data.frame(testing_data, Predicted)

summary(Predicted)
##    0    1 
##  966 1508

Plotting Confusion matrix

# Full confusion-matrix report: predictions versus the true test labels.
confusionMatrix(
  data = Predicted,
  reference = testing_data[["brand"]]
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0  860  106
##          1   76 1432
##                                           
##                Accuracy : 0.9264          
##                  95% CI : (0.9154, 0.9364)
##     No Information Rate : 0.6217          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.8446          
##                                           
##  Mcnemar's Test P-Value : 0.03159         
##                                           
##             Sensitivity : 0.9188          
##             Specificity : 0.9311          
##          Pos Pred Value : 0.8903          
##          Neg Pred Value : 0.9496          
##              Prevalence : 0.3783          
##          Detection Rate : 0.3476          
##    Detection Prevalence : 0.3905          
##       Balanced Accuracy : 0.9249          
##                                           
##        'Positive' Class : 0               
## 
# Cross-tabulate predicted against actual brand and reshape to long form
# (one row per cell) for plotting.
confusion_matrix <- as.data.frame(table(Predicted, testing_data$brand))
names(confusion_matrix) <- c("Predicted", "Actual", "Count")

# Heat map of the confusion matrix with the cell counts printed on each tile.
ggplot(
  data = confusion_matrix,
  aes(x = Predicted, y = Actual, fill = Count)
) +
  ggtitle("Confusion Matrix") +
  geom_tile() +
  theme_bw() +
  scale_fill_distiller(palette = "Greens", direction = 1) +
  scale_x_discrete(labels = c("0" = "Acer", "1" = "Sony")) +
  scale_y_discrete(labels = c("0" = "Acer", "1" = "Sony")) +
  geom_text(aes(label = Count), color = "black")

Predict Incomplete survey data

Check the brand column

# Column types and a preview of the incomplete survey data.
str(test_incomplete)
## 'data.frame':    5000 obs. of  7 variables:
##  $ salary : num  150000 82524 115647 141443 149211 ...
##  $ age    : int  76 51 34 22 56 26 64 50 26 46 ...
##  $ elevel : int  1 1 0 3 0 4 3 3 2 3 ...
##  $ car    : int  3 8 10 18 5 12 1 9 3 18 ...
##  $ zipcode: int  3 3 2 2 3 1 2 0 4 6 ...
##  $ credit : num  377980 141658 360980 282736 215667 ...
##  $ brand  : int  1 0 1 1 1 1 1 1 1 0 ...
# Frequency of the brand codes recorded in the incomplete survey.
table(test_incomplete$brand)
## 
##    0    1 
## 4937   63

The brand column is recorded as 0 for almost all incomplete survey rows (4937 of 5000), so it cannot be used as ground truth.

Converting the brand column of incomplete survey data

# Give brand the same type (factor) as in the completed survey.
test_incomplete[["brand"]] <- as.factor(test_incomplete[["brand"]])

Predict the brand column of incomplete survey data

# Predict the preferred brand for every incomplete survey row with the
# grid-tuned random forest, and keep the predictions next to the inputs.
set.seed(123)
predicted_brand <- predict(rf_mod_manual, test_incomplete)
predicted_incomplete_survey <- data.frame(test_incomplete, predicted_brand)

postResample() cannot be applied here because there is no ground truth to compare the predictions against.

Check the distribution of Acer and Sony

# Number of incomplete-survey rows predicted for each brand code.
summary(predicted_brand)
##    0    1 
## 1884 3116

Distribution of predicted brand in incomplete survey

# Bar chart of the predicted brand counts for the incomplete survey, with the
# count printed above each bar.
ggplot(
  data = predicted_incomplete_survey,
  aes(x = predicted_brand, fill = predicted_brand)
) +
  geom_bar() +
  theme_bw() +
  xlab("Preferred brand") +
  ylab("Number of Customers") +
  ggtitle("Predicted computer brand of Incomplete Survey ") +
  scale_x_discrete(labels = c("0" = "Acer", "1" = "Sony")) +
  scale_y_continuous(limits = c(0, 4000)) +
  scale_fill_discrete(
    name = "Preferred Brand",
    labels = c("0" = "Acer", "1" = "Sony")
  ) +
  geom_text(stat = "count", aes(label = ..count..), vjust = -1) +
  theme(axis.text.x = element_text(size = 10)) +
  theme(axis.text.y = element_text(size = 10))

Check the distribution of preferred brand in both completed and incomplete survey

Drop the incomplete brand column and rename the predicted brand column as brand.

# Keep the six predictor columns plus the predicted brand (column 8),
# dropping the original brand column (7), then rename the prediction to
# "brand" so the columns line up with the completed survey.
Completed_survey <- predicted_incomplete_survey[c(seq_len(6), 8)]
colnames(Completed_survey)[7] <- "brand"

Joining both survey data sets to check the distribution of preferred brand across all 14,898 customers (9,898 complete + 5,000 incomplete)

# Stack the completed survey on top of the incomplete survey, which now
# carries the predicted brand.
Totalsurvey <- rbind(survey, Completed_survey)

# Bar chart of brand preference across all customers, with counts on the bars.
ggplot(data = Totalsurvey, aes(x = brand, fill = brand)) +
  geom_bar() +
  theme_bw() +
  xlab("Preferred brand") +
  ylab("Number of Customers") +
  ggtitle("Preferred computer brand of all customers ") +
  scale_x_discrete(labels = c("0" = "Acer", "1" = "Sony")) +
  scale_y_continuous(limits = c(0, 10000)) +
  scale_fill_discrete(
    name = "Preferred Brand",
    labels = c("0" = "Acer", "1" = "Sony")
  ) +
  geom_text(stat = "count", aes(label = ..count..), vjust = -1) +
  theme(axis.text.x = element_text(size = 10)) +
  theme(axis.text.y = element_text(size = 10))

Brand Preference in terms of customer percentage

# Brand counts over the combined (completed + predicted) survey data.
table(Totalsurvey$brand)
## 
##    0    1 
## 5628 9270
# Same distribution as a share of all customers: each bar's height is its
# count divided by the total count, shown on a percent axis.
ggplot(Totalsurvey, aes(brand, fill = brand)) +
  geom_bar(aes(y = (..count..) / sum(..count..))) +
  theme_bw() +
  scale_y_continuous(labels = scales::percent) +
  ggtitle("Brand preference in terms of percentage of total Customers ") +
  scale_x_discrete(labels = c("0" = "Acer", "1" = "Sony")) +
  scale_fill_discrete(
    name = "Preferred Brand",
    labels = c("0" = "Acer", "1" = "Sony")
  ) +
  xlab("Brand Preference") +
  ylab("Percentage of customers")

Summary:

  1. Models tried are Random Forest and C5.0
  2. Salary, age, and credit are the three most important features, with salary being the most important. People who earn over 130k prefer only Sony.
  3. Cross-validated training accuracy peaked at 91.88% for Random Forest (random tuning, mtry = 2).
  4. Testing accuracy was 92.64%
  5. Preferred brand is predicted for the incomplete survey data, which has 5000 observations. 1884 customers are predicted to prefer Acer and 3116 customers are predicted to prefer Sony.
  6. 37.8% of the total customers (9898 + 5000 = 14898) prefer Acer, whereas the remaining 62.2% prefer Sony.