library(readr)
library(dplyr)
library(ggplot2)
library(caret)
library(corrplot)
library(knitr)
library(kableExtra)
survey <- read.csv("CompleteResponses.csv")
test_incomplete <- read.csv("SurveyIncomplete.csv")
dim(survey)
## [1] 9898 7
str(survey)
## 'data.frame': 9898 obs. of 7 variables:
## $ salary : num 119807 106880 78021 63690 50874 ...
## $ age : int 45 63 23 51 20 56 24 62 29 41 ...
## $ elevel : int 0 1 0 3 3 3 4 3 4 1 ...
## $ car : int 14 11 15 6 14 14 8 3 17 5 ...
## $ zipcode: int 4 6 2 5 4 3 5 0 0 4 ...
## $ credit : num 442038 45007 48795 40889 352951 ...
## $ brand : int 0 1 0 1 0 1 1 1 0 1 ...
names(survey)
## [1] "salary" "age" "elevel" "car" "zipcode" "credit" "brand"
## Check unique values of columns
table(survey$brand)
##
## 0 1
## 3744 6154
table(survey$elevel)
##
## 0 1 2 3 4
## 2052 1948 1983 1947 1968
table(survey$car)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 492 509 488 479 505 477 495 511 487 500 473 498 473 494 542 470 508 524 489 484
table(survey$zipcode)
##
## 0 1 2 3 4 5 6 7 8
## 1085 1053 1112 1080 1087 1108 1155 1083 1135
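The same uniqueness checks can be run across all of the categorical-style columns in one call (a minimal sketch using sapply; the column names follow the str() output above):
# number of distinct values per categorical-style column (sketch)
sapply(survey[c("brand", "elevel", "car", "zipcode")], function(x) length(unique(x)))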
## Check for null values
sum(is.na(survey))
## [1] 0
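If the total had not been zero, a per-column breakdown would show where the gaps are (a small sketch using base R):
# missing values per column (all zero here)
colSums(is.na(survey))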
survey$brand <- as.factor(survey$brand)
set.seed(123)
row_index <- createDataPartition(survey$brand,
p = .75,
list = FALSE)
training_data <- survey[row_index,]
testing_data <- survey[-row_index,]
str(testing_data)
## 'data.frame': 2474 obs. of 7 variables:
## $ salary : num 50874 37803 82475 107710 148495 ...
## $ age : int 20 41 33 75 62 42 26 59 63 36 ...
## $ elevel : int 3 1 4 0 2 4 0 2 0 4 ...
## $ car : int 14 5 13 16 9 8 18 5 7 8 ...
## $ zipcode: int 4 4 3 2 1 2 2 6 7 7 ...
## $ credit : num 352951 493219 424657 209002 377495 ...
## $ brand : Factor w/ 2 levels "0","1": 1 2 1 2 2 2 1 2 2 1 ...
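Because createDataPartition() stratifies on the outcome, the class balance of brand should be nearly identical in both splits (roughly 38% vs 62%, matching the full data); a quick check (sketch):
# class proportions in each split should match the full data
prop.table(table(training_data$brand))
prop.table(table(testing_data$brand))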
cv <- trainControl(method = "repeatedcv",
number = 10,
repeats = 1)
Compare Random Forest models (auto tuning, manual tuning, random tuning, and tuning with selected features) against C5.0 models tuned in the same ways.
set.seed(123)
rf_model <- train(brand~.,
data = training_data,
method = "rf",
trControl=cv,
metric="Accuracy",
tuneLength = 2)
rf_model
## Random Forest
##
## 7424 samples
## 6 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 6681, 6681, 6681, 6682, 6681, 6683, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9175632 0.8250521
## 6 0.9125812 0.8140643
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
ggplot(varImp(rf_model,scale=FALSE)) + theme_bw()
Salary, age, and credit are the top features.
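The importance scores behind the plot can also be read off numerically (a sketch; varImp() returns an object whose importance slot is a data frame):
# numeric importance scores behind the plot above
varImp(rf_model, scale = FALSE)$importance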
set.seed(123)
rfGrid <- expand.grid(mtry=c(2,3,4,5,6))
rf_mod_manual <- train(brand~.,
data = training_data,
method = "rf",
trControl=cv,
metric="Accuracy",
tuneGrid=rfGrid)
rf_mod_manual
## Random Forest
##
## 7424 samples
## 6 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 6681, 6681, 6681, 6682, 6681, 6683, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9176981 0.8252764
## 3 0.9176985 0.8252930
## 4 0.9171598 0.8240410
## 5 0.9155429 0.8203682
## 6 0.9132536 0.8155551
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.
ggplot(varImp(rf_mod_manual,scale=FALSE)) + theme_bw()+
ggtitle ("Feature Importances by Random Forest model")
set.seed(123)
cv_random <- trainControl(method = "repeatedcv",
number = 10,
repeats = 2,
search="random")
rf_mod_random <- train(brand~.,
data = training_data,
method = "rf",
trControl=cv_random,
metric="Accuracy",
tuneLength=5)
rf_mod_random
## Random Forest
##
## 7424 samples
## 6 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 2 times)
## Summary of sample sizes: 6681, 6681, 6681, 6682, 6681, 6683, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9187749 0.8276598
## 3 0.9166870 0.8231164
## 6 0.9128485 0.8146097
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
set.seed(123)
rf_mod_features <- train(brand~ salary+age+credit,
data = training_data,
method = "rf",
trControl=cv,
metric="Accuracy",
tuneLength = 2)
rf_mod_features
## Random Forest
##
## 7424 samples
## 3 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 6681, 6681, 6681, 6682, 6681, 6683, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9144674 0.8180274
## 3 0.9119079 0.8124586
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
set.seed(123)
rfGrid <- expand.grid(mtry=c(2,3,4,5,6))
rf_mod_manual_features <- train(brand~salary+age+credit,
data = training_data,
method = "rf",
trControl=cv,
metric="Accuracy",
tuneGrid=rfGrid)
rf_mod_manual_features
## Random Forest
##
## 7424 samples
## 3 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 6681, 6681, 6681, 6682, 6681, 6683, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9151398 0.8194564
## 3 0.9106935 0.8098159
## 4 0.9096166 0.8076917
## 5 0.9102892 0.8091063
## 6 0.9123109 0.8133812
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
results_1 <- data.frame(rf_model$results)
results_1['Model_name']<-"Random Forest Auto tuning"
results_2 <- data.frame(rf_mod_manual$results)
results_2['Model_name']<-"Random Forest Manual tuning"
results_3 <- data.frame(rf_mod_features$results)
results_3['Model_name']<-"Random Forest Auto tuning Selected features"
results_4<- data.frame(rf_mod_random$results)
results_4['Model_name']<-"Random Forest Random tuning"
results_5<- data.frame(rf_mod_manual_features$results)
results_5['Model_name']<-"Random Forest Manual tuning Selected features"
Random_Forest_results <- rbind(results_1,results_2,results_3,results_4,results_5)
kable(Random_Forest_results,format = "html",caption= "Results of all Random Forest models",digits=4)%>% kable_styling(bootstrap_options = "striped", full_width = F)
| mtry | Accuracy | Kappa | AccuracySD | KappaSD | Model_name |
|---|---|---|---|---|---|
| 2 | 0.9176 | 0.8251 | 0.0069 | 0.0147 | Random Forest Auto tuning |
| 6 | 0.9126 | 0.8141 | 0.0048 | 0.0104 | Random Forest Auto tuning |
| 2 | 0.9177 | 0.8253 | 0.0066 | 0.0142 | Random Forest Manual tuning |
| 3 | 0.9177 | 0.8253 | 0.0068 | 0.0145 | Random Forest Manual tuning |
| 4 | 0.9172 | 0.8240 | 0.0072 | 0.0155 | Random Forest Manual tuning |
| 5 | 0.9155 | 0.8204 | 0.0062 | 0.0132 | Random Forest Manual tuning |
| 6 | 0.9133 | 0.8156 | 0.0077 | 0.0162 | Random Forest Manual tuning |
| 2 | 0.9145 | 0.8180 | 0.0087 | 0.0186 | Random Forest Auto tuning Selected features |
| 3 | 0.9119 | 0.8125 | 0.0067 | 0.0144 | Random Forest Auto tuning Selected features |
| 2 | 0.9188 | 0.8277 | 0.0075 | 0.0158 | Random Forest Random tuning |
| 3 | 0.9167 | 0.8231 | 0.0063 | 0.0134 | Random Forest Random tuning |
| 6 | 0.9128 | 0.8146 | 0.0073 | 0.0152 | Random Forest Random tuning |
| 2 | 0.9151 | 0.8195 | 0.0085 | 0.0184 | Random Forest Manual tuning Selected features |
| 3 | 0.9107 | 0.8098 | 0.0088 | 0.0190 | Random Forest Manual tuning Selected features |
| 4 | 0.9096 | 0.8077 | 0.0086 | 0.0185 | Random Forest Manual tuning Selected features |
| 5 | 0.9103 | 0.8091 | 0.0082 | 0.0176 | Random Forest Manual tuning Selected features |
| 6 | 0.9123 | 0.8134 | 0.0083 | 0.0178 | Random Forest Manual tuning Selected features |
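To make the table easier to compare, the best row per model variant can be pulled out with dplyr (a sketch; assumes dplyr >= 1.0 for slice_max()):
# best cross-validated accuracy per Random Forest variant
Random_Forest_results %>%
  group_by(Model_name) %>%
  slice_max(Accuracy, n = 1, with_ties = FALSE) %>%
  arrange(desc(Accuracy)) %>%
  select(Model_name, mtry, Accuracy, Kappa)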
set.seed(123)
C5_model <- train(brand~.,
data=training_data,
method="C5.0",
trControl=cv,
metric="Accuracy",
tuneLength = 2)
C5_model
## C5.0
##
## 7424 samples
## 6 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 6681, 6681, 6681, 6682, 6681, 6683, ...
## Resampling results across tuning parameters:
##
## model winnow trials Accuracy Kappa
## rules FALSE 1 0.8715035 0.7374790
## rules FALSE 10 0.9136570 0.8154378
## rules TRUE 1 0.8739270 0.7423625
## rules TRUE 10 0.9155420 0.8197228
## tree FALSE 1 0.8688088 0.7251092
## tree FALSE 10 0.9152735 0.8201520
## tree TRUE 1 0.8721757 0.7320037
## tree TRUE 10 0.9159439 0.8217090
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were trials = 10, model = tree and winnow
## = TRUE.
ggplot(varImp(C5_model,scale=FALSE)) + theme_bw()
set.seed(123)
C5grid<-expand.grid(trials =c(1,5,10),
model = c("tree", "rules"),
winnow = c(TRUE, FALSE))
C5_mod_manual <- train(brand~.,
data=training_data,
method="C5.0",
trControl=cv,
metric="Accuracy",
tuneGrid = C5grid)
C5_mod_manual
## C5.0
##
## 7424 samples
## 6 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 6681, 6681, 6681, 6682, 6681, 6683, ...
## Resampling results across tuning parameters:
##
## model winnow trials Accuracy Kappa
## rules FALSE 1 0.8715035 0.7374790
## rules FALSE 5 0.9133862 0.8164083
## rules FALSE 10 0.9136570 0.8154378
## rules TRUE 1 0.8739270 0.7423625
## rules TRUE 5 0.9128493 0.8157683
## rules TRUE 10 0.9155420 0.8197228
## tree FALSE 1 0.8688088 0.7251092
## tree FALSE 5 0.9143323 0.8186833
## tree FALSE 10 0.9152735 0.8201520
## tree TRUE 1 0.8721757 0.7320037
## tree TRUE 5 0.9096162 0.8085107
## tree TRUE 10 0.9159439 0.8217090
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were trials = 10, model = tree and winnow
## = TRUE.
ggplot(varImp(C5_mod_manual,scale=FALSE)) + theme_bw()
set.seed(123)
cv_random <- trainControl(method = "repeatedcv",
number = 10,
repeats = 2,
search="random")
C5_mod_random <- train(brand~.,
data=training_data,
method="C5.0",
trControl=cv_random,
metric="Accuracy",
tuneLength=5)
C5_mod_random
## C5.0
##
## 7424 samples
## 6 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 2 times)
## Summary of sample sizes: 6681, 6681, 6681, 6682, 6681, 6683, ...
## Resampling results across tuning parameters:
##
## model winnow trials Accuracy Kappa
## rules FALSE 50 0.9142596 0.8175991
## rules FALSE 51 0.9142596 0.8175991
## rules TRUE 42 0.9176262 0.8249437
## tree FALSE 67 0.9160127 0.8218737
## tree TRUE 14 0.9145302 0.8188681
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were trials = 42, model = rules and
## winnow = TRUE.
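The winning hyper-parameters of any caret model can also be extracted programmatically from the bestTune slot (sketch):
# best hyper-parameter combination found by the random search
C5_mod_random$bestTune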
set.seed(123)
C5_mod_features <- train(brand~ salary+age+car+credit,
data = training_data,
method = "C5.0",
trControl=cv,
metric="Accuracy",
tuneLength = 2)
C5_mod_features
## C5.0
##
## 7424 samples
## 4 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 6681, 6681, 6681, 6682, 6681, 6683, ...
## Resampling results across tuning parameters:
##
## model winnow trials Accuracy Kappa
## rules FALSE 1 0.8719087 0.7372298
## rules FALSE 10 0.9171583 0.8230345
## rules TRUE 1 0.8708292 0.7354013
## rules TRUE 10 0.9152728 0.8187035
## tree FALSE 1 0.8705602 0.7276431
## tree FALSE 10 0.9162158 0.8220200
## tree TRUE 1 0.8697507 0.7262263
## tree TRUE 10 0.9166185 0.8231757
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were trials = 10, model = rules and
## winnow = FALSE.
set.seed(123)
C5grid_features<-expand.grid(trials =c(1,2,3),
model = c("tree", "rules"),
winnow = c(TRUE, FALSE))
C5_mod_manual_features <- train(brand~salary+age+car+credit,
data=training_data,
method="C5.0",
trControl=cv,
metric="Accuracy",
tuneGrid = C5grid_features)
C5_mod_manual_features
## C5.0
##
## 7424 samples
## 4 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 6681, 6681, 6681, 6682, 6681, 6683, ...
## Resampling results across tuning parameters:
##
## model winnow trials Accuracy Kappa
## rules FALSE 1 0.8719087 0.7372298
## rules FALSE 2 0.9053070 0.7951399
## rules FALSE 3 0.9108295 0.8122261
## rules TRUE 1 0.8708292 0.7354013
## rules TRUE 2 0.9034237 0.7915440
## rules TRUE 3 0.9088089 0.8082261
## tree FALSE 1 0.8705602 0.7276431
## tree FALSE 2 0.9073238 0.8035706
## tree FALSE 3 0.9102912 0.8102284
## tree TRUE 1 0.8697507 0.7262263
## tree TRUE 2 0.9031483 0.7939397
## tree TRUE 3 0.9123124 0.8153281
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were trials = 3, model = tree and winnow
## = TRUE.
C5_result1 <- data.frame(C5_model$results)
C5_result1['Model_name']<-"C5 model Auto tuning"
C5_result2 <- data.frame(C5_mod_manual$results)
C5_result2['Model_name']<-"C5 model Manual tuning"
C5_result3 <- data.frame(C5_mod_features$results)
C5_result3['Model_name']<-"C5 model Auto tuning selected features"
C5_result4 <- data.frame(C5_mod_random$results)
C5_result4['Model_name']<-"C5 model Random tuning"
C5_result5 <- data.frame(C5_mod_manual_features$results)
C5_result5['Model_name']<-"C5 model Manual tuning selected features"
C5_model_results <- rbind(C5_result1,C5_result2,C5_result3,C5_result4,C5_result5)
kable(C5_model_results,format = "html",caption= "Results of all C5.0 models",digits=4)%>% kable_styling(bootstrap_options = "striped", full_width = F)
| | model | winnow | trials | Accuracy | Kappa | AccuracySD | KappaSD | Model_name |
|---|---|---|---|---|---|---|---|---|
| 5 | rules | FALSE | 1 | 0.8715 | 0.7375 | 0.0517 | 0.0967 | C5 model Auto tuning |
| 7 | rules | TRUE | 1 | 0.8739 | 0.7424 | 0.0516 | 0.0967 | C5 model Auto tuning |
| 1 | tree | FALSE | 1 | 0.8688 | 0.7251 | 0.0526 | 0.1105 | C5 model Auto tuning |
| 3 | tree | TRUE | 1 | 0.8722 | 0.7320 | 0.0528 | 0.1120 | C5 model Auto tuning |
| 6 | rules | FALSE | 10 | 0.9137 | 0.8154 | 0.0052 | 0.0118 | C5 model Auto tuning |
| 8 | rules | TRUE | 10 | 0.9155 | 0.8197 | 0.0059 | 0.0132 | C5 model Auto tuning |
| 2 | tree | FALSE | 10 | 0.9153 | 0.8202 | 0.0069 | 0.0143 | C5 model Auto tuning |
| 4 | tree | TRUE | 10 | 0.9159 | 0.8217 | 0.0079 | 0.0169 | C5 model Auto tuning |
| 71 | rules | FALSE | 1 | 0.8715 | 0.7375 | 0.0517 | 0.0967 | C5 model Manual tuning |
| 10 | rules | TRUE | 1 | 0.8739 | 0.7424 | 0.0516 | 0.0967 | C5 model Manual tuning |
| 13 | tree | FALSE | 1 | 0.8688 | 0.7251 | 0.0526 | 0.1105 | C5 model Manual tuning |
| 41 | tree | TRUE | 1 | 0.8722 | 0.7320 | 0.0528 | 0.1120 | C5 model Manual tuning |
| 81 | rules | FALSE | 5 | 0.9134 | 0.8164 | 0.0085 | 0.0178 | C5 model Manual tuning |
| 11 | rules | TRUE | 5 | 0.9128 | 0.8158 | 0.0052 | 0.0111 | C5 model Manual tuning |
| 21 | tree | FALSE | 5 | 0.9143 | 0.8187 | 0.0081 | 0.0166 | C5 model Manual tuning |
| 51 | tree | TRUE | 5 | 0.9096 | 0.8085 | 0.0096 | 0.0202 | C5 model Manual tuning |
| 9 | rules | FALSE | 10 | 0.9137 | 0.8154 | 0.0052 | 0.0118 | C5 model Manual tuning |
| 12 | rules | TRUE | 10 | 0.9155 | 0.8197 | 0.0059 | 0.0132 | C5 model Manual tuning |
| 31 | tree | FALSE | 10 | 0.9153 | 0.8202 | 0.0069 | 0.0143 | C5 model Manual tuning |
| 61 | tree | TRUE | 10 | 0.9159 | 0.8217 | 0.0079 | 0.0169 | C5 model Manual tuning |
| 52 | rules | FALSE | 1 | 0.8719 | 0.7372 | 0.0512 | 0.0962 | C5 model Auto tuning selected features |
| 72 | rules | TRUE | 1 | 0.8708 | 0.7354 | 0.0515 | 0.0959 | C5 model Auto tuning selected features |
| 14 | tree | FALSE | 1 | 0.8706 | 0.7276 | 0.0524 | 0.1113 | C5 model Auto tuning selected features |
| 32 | tree | TRUE | 1 | 0.8698 | 0.7262 | 0.0534 | 0.1139 | C5 model Auto tuning selected features |
| 62 | rules | FALSE | 10 | 0.9172 | 0.8230 | 0.0092 | 0.0199 | C5 model Auto tuning selected features |
| 82 | rules | TRUE | 10 | 0.9153 | 0.8187 | 0.0084 | 0.0184 | C5 model Auto tuning selected features |
| 22 | tree | FALSE | 10 | 0.9162 | 0.8220 | 0.0057 | 0.0124 | C5 model Auto tuning selected features |
| 42 | tree | TRUE | 10 | 0.9166 | 0.8232 | 0.0071 | 0.0150 | C5 model Auto tuning selected features |
| 53 | tree | TRUE | 14 | 0.9145 | 0.8189 | 0.0080 | 0.0166 | C5 model Random tuning |
| 33 | rules | TRUE | 42 | 0.9176 | 0.8249 | 0.0097 | 0.0206 | C5 model Random tuning |
| 15 | rules | FALSE | 50 | 0.9143 | 0.8176 | 0.0111 | 0.0240 | C5 model Random tuning |
| 23 | rules | FALSE | 51 | 0.9143 | 0.8176 | 0.0111 | 0.0240 | C5 model Random tuning |
| 43 | tree | FALSE | 67 | 0.9160 | 0.8219 | 0.0089 | 0.0189 | C5 model Random tuning |
| 73 | rules | FALSE | 1 | 0.8719 | 0.7372 | 0.0512 | 0.0962 | C5 model Manual tuning selected features |
| 101 | rules | TRUE | 1 | 0.8708 | 0.7354 | 0.0515 | 0.0959 | C5 model Manual tuning selected features |
| 16 | tree | FALSE | 1 | 0.8706 | 0.7276 | 0.0524 | 0.1113 | C5 model Manual tuning selected features |
| 44 | tree | TRUE | 1 | 0.8698 | 0.7262 | 0.0534 | 0.1139 | C5 model Manual tuning selected features |
| 83 | rules | FALSE | 2 | 0.9053 | 0.7951 | 0.0071 | 0.0172 | C5 model Manual tuning selected features |
| 111 | rules | TRUE | 2 | 0.9034 | 0.7915 | 0.0108 | 0.0241 | C5 model Manual tuning selected features |
| 24 | tree | FALSE | 2 | 0.9073 | 0.8036 | 0.0097 | 0.0196 | C5 model Manual tuning selected features |
| 54 | tree | TRUE | 2 | 0.9031 | 0.7939 | 0.0133 | 0.0277 | C5 model Manual tuning selected features |
| 91 | rules | FALSE | 3 | 0.9108 | 0.8122 | 0.0066 | 0.0137 | C5 model Manual tuning selected features |
| 121 | rules | TRUE | 3 | 0.9088 | 0.8082 | 0.0093 | 0.0192 | C5 model Manual tuning selected features |
| 34 | tree | FALSE | 3 | 0.9103 | 0.8102 | 0.0090 | 0.0192 | C5 model Manual tuning selected features |
| 63 | tree | TRUE | 3 | 0.9123 | 0.8153 | 0.0084 | 0.0173 | C5 model Manual tuning selected features |
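Before settling on a final model, the two manually tuned candidates can be compared fold by fold with caret's resamples() interface (a sketch; it assumes both models share the same resampling indices, which holds here because set.seed(123) was called before each train() with the same trainControl object):
# fold-by-fold comparison of the two manually tuned models
model_comparison <- resamples(list(RF_manual = rf_mod_manual,
                                   C5_manual = C5_mod_manual))
summary(model_comparison)
bwplot(model_comparison, metric = "Accuracy")  # lattice plot; lattice is attached with caret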
The manually tuned Random Forest model with all features is selected for predicting the test data.
set.seed(123)
Predicted <- predict(rf_mod_manual, testing_data)
postResample(Predicted,testing_data$brand)
## Accuracy Kappa
## 0.9264349 0.8445849
Actual_vs_Predicted <- data.frame(testing_data,Predicted)
summary(Predicted)
## 0 1
## 966 1508
confusionMatrix(data = Predicted, reference = testing_data$brand)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 860 106
## 1 76 1432
##
## Accuracy : 0.9264
## 95% CI : (0.9154, 0.9364)
## No Information Rate : 0.6217
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.8446
##
## Mcnemar's Test P-Value : 0.03159
##
## Sensitivity : 0.9188
## Specificity : 0.9311
## Pos Pred Value : 0.8903
## Neg Pred Value : 0.9496
## Prevalence : 0.3783
## Detection Rate : 0.3476
## Detection Prevalence : 0.3905
## Balanced Accuracy : 0.9249
##
## 'Positive' Class : 0
##
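The headline numbers can be reproduced directly from the 2x2 table, which makes their meaning explicit (a sketch; note that caret treats class "0" as the positive class here):
cm <- table(Predicted = Predicted, Actual = testing_data$brand)
cm["0", "0"] / sum(cm[, "0"])  # sensitivity: 860 / (860 + 76)   ~ 0.9188
cm["1", "1"] / sum(cm[, "1"])  # specificity: 1432 / (106 + 1432) ~ 0.9311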
confusion_matrix <- as.data.frame(table(Predicted, testing_data$brand))
names(confusion_matrix) <- c("Predicted","Actual","Count")
ggplot(data = confusion_matrix, aes(x = Predicted, y=Actual,fill = Count))+
ggtitle("Confusion Matrix")+
geom_tile()+ theme_bw()+
scale_fill_distiller(palette="Greens", direction=1)+
scale_x_discrete(labels=c("0"="Acer","1"="Sony"))+
scale_y_discrete(labels=c("0"="Acer","1"="Sony"))+
geom_text(aes(label=Count), color="black")
str(test_incomplete)
## 'data.frame': 5000 obs. of 7 variables:
## $ salary : num 150000 82524 115647 141443 149211 ...
## $ age : int 76 51 34 22 56 26 64 50 26 46 ...
## $ elevel : int 1 1 0 3 0 4 3 3 2 3 ...
## $ car : int 3 8 10 18 5 12 1 9 3 18 ...
## $ zipcode: int 3 3 2 2 3 1 2 0 4 6 ...
## $ credit : num 377980 141658 360980 282736 215667 ...
## $ brand : int 1 0 1 1 1 1 1 1 1 0 ...
table(test_incomplete$brand)
##
## 0 1
## 4937 63
In the incomplete survey, the brand column is only a placeholder (4,937 of the 5,000 rows are 0), so it must be predicted rather than used.
test_incomplete$brand <- as.factor(test_incomplete$brand)
set.seed(123)
predicted_brand <- predict(rf_mod_manual,test_incomplete)
predicted_incomplete_survey <- data.frame(test_incomplete,predicted_brand)
summary(predicted_brand)
## 0 1
## 1884 3116
ggplot(data=predicted_incomplete_survey,aes(x=predicted_brand,fill=predicted_brand))+geom_bar()+theme_bw()+
xlab("Preferred brand")+ylab("Number of Customers")+
ggtitle("Predicted computer brand of Incomplete Survey ")+
scale_x_discrete(labels=c("0"="Acer","1"="Sony"))+
scale_y_continuous(limits=c(0,4000))+
scale_fill_discrete(name="Preferred Brand",labels=c("0"="Acer","1"="Sony"))+
geom_text(stat='count', aes(label=..count..), vjust=-1)+
theme(axis.text.x=element_text(size=10))+
theme(axis.text.y=element_text(size=10))
Drop the placeholder brand column and rename the predicted_brand column to brand.
Completed_survey <- predicted_incomplete_survey[c(1:6,8)]
names(Completed_survey)[7]<- "brand"
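The same two steps can be written with dplyr, which is already loaded (an equivalent sketch):
# drop the placeholder column and promote the predictions to "brand"
Completed_survey <- predicted_incomplete_survey %>%
  select(-brand) %>%
  rename(brand = predicted_brand)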
Totalsurvey <- rbind(survey,Completed_survey)
ggplot(data=Totalsurvey,aes(x=brand,fill=brand))+geom_bar()+theme_bw()+
xlab("Preferred brand")+ylab("Number of Customers")+
ggtitle("Preferred computer brand of all customers ")+
scale_x_discrete(labels=c("0"="Acer","1"="Sony"))+
scale_y_continuous(limits = c(0,10000))+
scale_fill_discrete(name="Preferred Brand",labels=c("0"="Acer","1"="Sony"))+
geom_text(stat='count', aes(label=..count..), vjust=-1)+
theme(axis.text.x=element_text(size=10))+
theme(axis.text.y=element_text(size=10))
table(Totalsurvey$brand)
##
## 0 1
## 5628 9270
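Expressed as proportions (a one-line sketch that matches the percentage plot below; roughly 38% Acer vs 62% Sony):
# share of each brand in the combined survey
round(prop.table(table(Totalsurvey$brand)), 3)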
ggplot(Totalsurvey, aes(brand,fill=brand)) +
geom_bar(aes(y = (..count..)/sum(..count..))) + theme_bw()+
scale_y_continuous(labels=scales::percent) +
ggtitle ("Brand preference in terms of percentage of total Customers ")+
scale_x_discrete(labels=c("0"="Acer","1"="Sony"))+
scale_fill_discrete(name="Preferred Brand",labels=c("0"="Acer","1"="Sony"))+
xlab("Brand Preference")+
ylab("Percentage of customers")