We are asked to predict customers’ brand preference (Acer vs. Sony) and provide a recommendation to the sales team. We build our models on CompleteResponses and then apply them to the incomplete surveys (SurveyIncomplete).
# Load libraries
library(readr)
library(caret)
library(ggplot2)
# Set seed
set.seed(998)
# Load the complete-responses dataset
CompleteResponses <- read.csv("CompleteResponses.csv")
# Check the data structure
str(CompleteResponses)
## 'data.frame': 9898 obs. of 7 variables:
## $ salary : num 119807 106880 78021 63690 50874 ...
## $ age : int 45 63 23 51 20 56 24 62 29 41 ...
## $ elevel : int 0 1 0 3 3 3 4 3 4 1 ...
## $ car : int 14 11 15 6 14 14 8 3 17 5 ...
## $ zipcode: int 4 6 2 5 4 3 5 0 0 4 ...
## $ credit : num 442038 45007 48795 40889 352951 ...
## $ brand : int 0 1 0 1 0 1 1 1 0 1 ...
# Summary statistics for each attribute
summary(CompleteResponses)
## salary age elevel car
## Min. : 20000 Min. :20.00 Min. :0.000 Min. : 1.00
## 1st Qu.: 52082 1st Qu.:35.00 1st Qu.:1.000 1st Qu.: 6.00
## Median : 84950 Median :50.00 Median :2.000 Median :11.00
## Mean : 84871 Mean :49.78 Mean :1.983 Mean :10.52
## 3rd Qu.:117162 3rd Qu.:65.00 3rd Qu.:3.000 3rd Qu.:15.75
## Max. :150000 Max. :80.00 Max. :4.000 Max. :20.00
## zipcode credit brand
## Min. :0.000 Min. : 0 Min. :0.0000
## 1st Qu.:2.000 1st Qu.:120807 1st Qu.:0.0000
## Median :4.000 Median :250607 Median :1.0000
## Mean :4.041 Mean :249176 Mean :0.6217
## 3rd Qu.:6.000 3rd Qu.:374640 3rd Qu.:1.0000
## Max. :8.000 Max. :500000 Max. :1.0000
# Count the missing values in the dataset
sum(is.na(CompleteResponses))
## [1] 0
# Convert the categorical attributes below to factors
CompleteResponses$elevel <- as.factor(CompleteResponses$elevel)
CompleteResponses$car <- as.factor(CompleteResponses$car)
CompleteResponses$zipcode <- as.factor(CompleteResponses$zipcode)
CompleteResponses$brand <- as.factor(CompleteResponses$brand)
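The four calls above can also be written as a single step; a compact sketch, equivalent to the as.factor() lines above (catCols is just an illustrative helper name):
# Sketch: convert the same categorical columns in one call
catCols <- c("elevel", "car", "zipcode", "brand")
CompleteResponses[catCols] <- lapply(CompleteResponses[catCols], as.factor)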
# Define a 75%/25% train/test split of the dataset
inTraining <- createDataPartition(CompleteResponses$brand, p = .75, list = FALSE)
training <- CompleteResponses[inTraining,]
testing <- CompleteResponses[-inTraining,]
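Because createDataPartition() stratifies on the outcome, the brand distribution should be nearly identical in both splits; a quick sketch to confirm:
# Sketch: check that the class balance of brand is preserved in the stratified split
prop.table(table(training$brand))
prop.table(table(testing$brand))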
# 10-fold cross-validation
fitControl <- trainControl(method = "repeatedcv", number = 10)
# Train a C5.0 model with tuneLength = 2
C50 <- train(brand~., data = training, method = "C5.0", trControl=fitControl, tuneLength = 2)
# Training results
C50
## C5.0
##
## 7424 samples
## 6 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 6681, 6682, 6681, 6681, 6682, 6681, ...
## Resampling results across tuning parameters:
##
## model winnow trials Accuracy Kappa
## rules FALSE 1 0.8100775 0.6161377
## rules FALSE 10 0.9216012 0.8333742
## rules TRUE 1 0.8207093 0.6367364
## rules TRUE 10 0.9209292 0.8315294
## tree FALSE 1 0.8100775 0.6161377
## tree FALSE 10 0.9238938 0.8381171
## tree TRUE 1 0.8209785 0.6373269
## tree TRUE 10 0.9217371 0.8335757
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were trials = 10, model = tree and winnow
## = FALSE.
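Since ggplot2 is already loaded, the tuning results can also be inspected visually; a minimal sketch using caret's ggplot method for train objects:
# Sketch: plot cross-validated accuracy across the C5.0 tuning grid (trials, model, winnow)
ggplot(C50)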
# Check for variable importance
C50imp <- varImp(C50)
C50imp
## C5.0 variable importance
##
## only 20 most important variables shown (out of 34)
##
## Overall
## salary 100.00
## age 85.17
## zipcode4 38.24
## credit 16.55
## zipcode1 16.31
## car13 8.01
## elevel1 0.00
## car3 0.00
## car8 0.00
## car9 0.00
## car16 0.00
## car20 0.00
## car7 0.00
## elevel3 0.00
## zipcode5 0.00
## car10 0.00
## car19 0.00
## car4 0.00
## car5 0.00
## car6 0.00
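The importance scores can be plotted straight from the varImp object; a quick sketch showing only the top predictors:
# Sketch: plot the 10 most important predictors from the C5.0 model
plot(C50imp, top = 10)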
# Train a Random Forest with 10-fold cross-validation and manually tune 5 mtry values
# 10-fold cross-validation (Random Forest)
fitControl <- trainControl(method = "repeatedcv", number=10, repeats = 1)
# Dataframe for manual tuning of mtry
rfGrid <- expand.grid(mtry=c(1,2,3,4,5))
# Note the system.time() wrapper: it measures the execution time of the training call
system.time(rfFitm1 <- train(brand~., data = training, method = "rf", trControl=fitControl, tuneGrid=rfGrid))
## user system elapsed
## 409.81 5.20 416.73
# Training results
rfFitm1
## Random Forest
##
## 7424 samples
## 6 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 6681, 6682, 6681, 6682, 6681, 6682, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 1 0.6217673 0.0000000
## 2 0.6217673 0.0000000
## 3 0.7241366 0.3332566
## 4 0.8390274 0.6467964
## 5 0.8881987 0.7616528
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 5.
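With both models trained under the same 10-fold cross-validation scheme, caret's resamples() gives a side-by-side comparison of Accuracy and Kappa; a minimal sketch (resamps is an illustrative name; a strictly paired comparison would require identical resampling indices for both models):
# Sketch: compare cross-validated Accuracy and Kappa of the C5.0 and Random Forest models
resamps <- resamples(list(C5.0 = C50, RF = rfFitm1))
summary(resamps)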
# Random Forest variable importance
RFimp <- varImp(rfFitm1)
RFimp
## rf variable importance
##
## only 20 most important variables shown (out of 34)
##
## Overall
## salary 100.0000
## age 42.6666
## credit 19.7986
## elevel3 0.9979
## elevel4 0.9511
## elevel1 0.8959
## elevel2 0.8584
## zipcode1 0.5007
## zipcode4 0.4768
## zipcode7 0.4446
## zipcode5 0.4408
## zipcode3 0.3853
## zipcode2 0.3833
## zipcode6 0.3830
## car7 0.2278
## car10 0.1904
## car15 0.1805
## zipcode8 0.1786
## car12 0.1765
## car2 0.1468
# Load the incomplete survey dataset
SurveyIncomplete <- read_csv("SurveyIncomplete.csv")
## Parsed with column specification:
## cols(
## salary = col_double(),
## age = col_double(),
## elevel = col_double(),
## car = col_double(),
## zipcode = col_double(),
## credit = col_double(),
## brand = col_double()
## )
# Re-check the structure of CompleteResponses (now with factor columns); the analogous check applies to SurveyIncomplete
str(CompleteResponses)
## 'data.frame': 9898 obs. of 7 variables:
## $ salary : num 119807 106880 78021 63690 50874 ...
## $ age : int 45 63 23 51 20 56 24 62 29 41 ...
## $ elevel : Factor w/ 5 levels "0","1","2","3",..: 1 2 1 4 4 4 5 4 5 2 ...
## $ car : Factor w/ 20 levels "1","2","3","4",..: 14 11 15 6 14 14 8 3 17 5 ...
## $ zipcode: Factor w/ 9 levels "0","1","2","3",..: 5 7 3 6 5 4 6 1 1 5 ...
## $ credit : num 442038 45007 48795 40889 352951 ...
## $ brand : Factor w/ 2 levels "0","1": 1 2 1 2 1 2 2 2 1 2 ...
# Summary statistics for SurveyIncomplete
summary(SurveyIncomplete)
## salary age elevel car
## Min. : 20000 Min. :20.00 Min. :0.000 Min. : 1.0
## 1st Qu.: 52590 1st Qu.:35.00 1st Qu.:1.000 1st Qu.: 6.0
## Median : 86221 Median :50.00 Median :2.000 Median :11.0
## Mean : 85794 Mean :49.94 Mean :2.009 Mean :10.6
## 3rd Qu.:118535 3rd Qu.:65.00 3rd Qu.:3.000 3rd Qu.:16.0
## Max. :150000 Max. :80.00 Max. :4.000 Max. :20.0
## zipcode credit brand
## Min. :0.000 Min. : 0 Min. :0.0000
## 1st Qu.:2.000 1st Qu.:122311 1st Qu.:0.0000
## Median :4.000 Median :250974 Median :0.0000
## Mean :4.038 Mean :249546 Mean :0.0126
## 3rd Qu.:6.000 3rd Qu.:375653 3rd Qu.:0.0000
## Max. :8.000 Max. :500000 Max. :1.0000
# Convert the same categorical attributes to factors
SurveyIncomplete$brand <- as.factor(SurveyIncomplete$brand)
SurveyIncomplete$elevel <- as.factor(SurveyIncomplete$elevel)
SurveyIncomplete$car <- as.factor(SurveyIncomplete$car)
SurveyIncomplete$zipcode <- as.factor(SurveyIncomplete$zipcode)
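As a precaution, the factor levels in SurveyIncomplete should match the levels the models saw during training (they do here, since both surveys use the same coding); a defensive sketch:
# Sketch: force the incomplete-survey factors to use the training-data level sets
for (col in c("elevel", "car", "zipcode", "brand")) {
  SurveyIncomplete[[col]] <- factor(SurveyIncomplete[[col]],
                                    levels = levels(CompleteResponses[[col]]))
}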
# Make a prediction with the C5.0 model on SurveyIncomplete
prediction <- predict(C50, SurveyIncomplete)
summary(prediction)
## 0 1
## 1907 3093
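Expressed as proportions, the predicted split is easier to report to the sales team; a quick sketch:
# Sketch: predicted brand preference as proportions of the 5000 incomplete surveys
prop.table(table(prediction))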
# Confusion Matrix
confusionMatrix(prediction, SurveyIncomplete$brand)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1901 6
## 1 3036 57
##
## Accuracy : 0.3916
## 95% CI : (0.378, 0.4053)
## No Information Rate : 0.9874
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0117
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.38505
## Specificity : 0.90476
## Pos Pred Value : 0.99685
## Neg Pred Value : 0.01843
## Prevalence : 0.98740
## Detection Rate : 0.38020
## Detection Prevalence : 0.38140
## Balanced Accuracy : 0.64491
##
## 'Positive' Class : 0
##
# postResample: SurveyIncomplete has no reliable ground truth for brand, so the low Accuracy and Kappa here are expected and not meaningful
postResample(prediction, SurveyIncomplete$brand)
## Accuracy Kappa
## 0.39160000 0.01171402
# Make a prediction with the C5.0 model on the 25% testing split from CompleteResponses
prediction_testing <- predict(C50, testing)
summary(prediction_testing)
## 0 1
## 949 1525
# Confusion Matrix
confusionMatrix(prediction_testing, testing$brand)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 841 108
## 1 95 1430
##
## Accuracy : 0.9179
## 95% CI : (0.9064, 0.9285)
## No Information Rate : 0.6217
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.826
##
## Mcnemar's Test P-Value : 0.3997
##
## Sensitivity : 0.8985
## Specificity : 0.9298
## Pos Pred Value : 0.8862
## Neg Pred Value : 0.9377
## Prevalence : 0.3783
## Detection Rate : 0.3399
## Detection Prevalence : 0.3836
## Balanced Accuracy : 0.9141
##
## 'Positive' Class : 0
##
# PostResample
postResample(prediction_testing, testing$brand)
## Accuracy Kappa
## 0.9179466 0.8260379
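To answer the original question, the observed preferences from CompleteResponses can be combined with the predictions for the incomplete surveys; a sketch (overall is an illustrative name):
# Sketch: overall preference count = observed (complete) + predicted (incomplete) responses
overall <- table(CompleteResponses$brand) + table(prediction)
overall
prop.table(overall)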
Accuracy & Kappa and Preference Count