16 de marzo de 2019

Defining the problem

The sales team engaged a market research firm to conduct a survey of our existing customers.

  • Objective: find out which of two brands of computers our customers prefer.

  • Problem: the answer to the brand preference question was not properly captured for all of the respondents.

Our work in that task would be:

  1. to investigate if customer responses to some survey questions (e.g. income, age, etc.) enable us to predict the answer to the brand preference question.

  2. to make those predictions and provide the sales team with a complete view of what brand our customers prefer.

Import the data

library(dplyr)
glimpse(completed_survey)
## Observations: 9,898
## Variables: 7
## $ salary  <dbl> 119806.54, 106880.48, 78020.75, 63689.94, 50873.62, 13...
## $ age     <int> 45, 63, 23, 51, 20, 56, 24, 62, 29, 41, 48, 52, 52, 33...
## $ elevel  <int> 0, 1, 0, 3, 3, 3, 4, 3, 4, 1, 4, 1, 3, 4, 2, 1, 2, 1, ...
## $ car     <int> 14, 11, 15, 6, 14, 14, 8, 3, 17, 5, 16, 6, 20, 13, 6, ...
## $ zipcode <int> 4, 6, 2, 5, 4, 3, 5, 0, 0, 4, 5, 0, 4, 3, 3, 4, 7, 2, ...
## $ credit  <dbl> 442037.71, 45007.18, 48795.32, 40888.88, 352951.50, 13...
## $ brand   <int> 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, ...
glimpse(incompleted_survey)
## Observations: 5,000
## Variables: 7
## $ salary  <dbl> 110499.74, 140893.78, 119159.65, 20000.00, 93956.32, 4...
## $ age     <int> 54, 44, 49, 56, 59, 71, 32, 33, 32, 58, 22, 47, 41, 40...
## $ elevel  <int> 3, 4, 2, 0, 1, 2, 1, 4, 1, 2, 1, 4, 2, 4, 3, 1, 4, 3, ...
## $ car     <int> 15, 20, 1, 9, 15, 7, 17, 17, 19, 8, 12, 12, 1, 9, 4, 1...
## $ zipcode <int> 4, 7, 3, 1, 1, 2, 1, 0, 2, 4, 1, 2, 3, 2, 4, 3, 8, 8, ...
## $ credit  <dbl> 354724.18, 395015.34, 122025.09, 99629.62, 458679.83, ...
## $ brand   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...

Data wrangling: giving names

Let’s put the brand information in a nicer way:

# transform the survey result to names
completed_survey$brand_name <- apply(completed_survey["brand"],
                                     MARGIN = 2,
                                     function(x) if_else(x == 0, "Acer", "Sony"))
salary age elevel car zipcode credit brand brand_name
119806.54 45 0 14 4 442037.71 0 Acer
106880.48 63 1 11 6 45007.18 1 Sony
78020.75 23 0 15 2 48795.32 0 Acer
63689.94 51 3 6 5 40888.88 1 Sony
50873.62 20 3 14 4 352951.50 0 Acer
130812.74 56 3 14 3 135943.02 1 Sony

Data wrangling: tranforming to factor

Now is the moment to transform the variables to factors from the incomplete survay:

categ_var <- c("elevel", "car", "zipcode", "brand_name")
completed_survey[categ_var] <- lapply(completed_survey[categ_var], as.factor)

# let's check the structure of the dataset
glimpse(completed_survey)
## Observations: 9,898
## Variables: 8
## $ salary     <dbl> 119806.54, 106880.48, 78020.75, 63689.94, 50873.62,...
## $ age        <int> 45, 63, 23, 51, 20, 56, 24, 62, 29, 41, 48, 52, 52,...
## $ elevel     <fct> 0, 1, 0, 3, 3, 3, 4, 3, 4, 1, 4, 1, 3, 4, 2, 1, 2, ...
## $ car        <fct> 14, 11, 15, 6, 14, 14, 8, 3, 17, 5, 16, 6, 20, 13, ...
## $ zipcode    <fct> 4, 6, 2, 5, 4, 3, 5, 0, 0, 4, 5, 0, 4, 3, 3, 4, 7, ...
## $ credit     <dbl> 442037.71, 45007.18, 48795.32, 40888.88, 352951.50,...
## $ brand      <int> 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, ...
## $ brand_name <fct> Acer, Sony, Acer, Sony, Acer, Sony, Sony, Sony, Ace...
# We can apply the same process to the incomplete survey
incompleted_survey$brand_name <- apply(incompleted_survey["brand"],
                                       MARGIN = 2,
                                       function(x) if_else(x == 0, "Acer", "Sony"))
incompleted_survey[categ_var] <- lapply(incompleted_survey[categ_var], as.factor)

1st exploration

Understand the brand preference with the complete survey.

  • Table of absolute frequency
table(completed_survey$brand_name, dnn = c("Brand"))
## Brand
## Acer Sony 
## 3744 6154
  • Table of relative frequency
round(prop.table(table(completed_survey$brand_name, dnn = c("Brand"))),2)
## Brand
## Acer Sony 
## 0.38 0.62

1st modalisation

library(caret)
set.seed(2019)

# train and test
train_ids <- createDataPartition(y = completed_survey$brand,
                                 p = 0.75,
                                 list = F)
train <- completed_survey[train_ids,]
test <- completed_survey[-train_ids,]

# cross validation
ctrl <- trainControl(method = "repeatedcv",
                     repeats = 5,
                     number = 3)

# Creating the model with caret and using the model rpart
mod_caret_dt <- caret::train(brand_name ~ .,
                             data = train %>% dplyr::select(-brand),
                             method = "rpart",
                             trControl = ctrl)

1st error check: Accuracy and kappa

Results on train

train_results <- predict(object = mod_caret_dt, 
                         newdata = train)
postResample(train$brand_name, train_results)
##  Accuracy     Kappa 
## 0.8634159 0.7056679

Results on test

test_results <- predict(object = mod_caret_dt, 
                        newdata = test)
postResample(test$brand_name, test_results)
##  Accuracy     Kappa 
## 0.8666128 0.7095651

1st error check: variable importance

1st error check: confusion matrix

Confusion matrix on train

round(prop.table(confusionMatrix(train$brand_name, train_results)$table),2)
##           Reference
## Prediction Acer Sony
##       Acer 0.30 0.08
##       Sony 0.05 0.57

Confusion matrix on test

round(prop.table(confusionMatrix(test$brand_name, test_results)$table),2)
##           Reference
## Prediction Acer Sony
##       Acer 0.29 0.08
##       Sony 0.05 0.58

2nd exploration

Looking for the relation between the important variable and brand. Goal create a model that is not overfitted to the data:

completed_survey %>% 
  ggplot(aes(age, salary)) +
    geom_point(aes(color = brand_name))

2nd modalisation

This time im only going to use salary and age to maket the predictions:

Results on train

train_results_2ndmod <- predict(object = mod_caret_dt_subs, 
                                newdata = train)
postResample(train$brand_name, train_results_2ndmod)
##  Accuracy     Kappa 
## 0.8634159 0.7056679

Results on test

test_results_2ndmod <- predict(object = mod_caret_dt_subs, 
                               newdata = test)
postResample(test$brand_name, test_results_2ndmod)
##  Accuracy     Kappa 
## 0.8666128 0.7095651

2nd error check: confusion matrix

We are going to compare the confusion matrix for the first model (all the variables) and the model with salary and age:

Rpart model with all the variables

round(prop.table(confusionMatrix(test$brand_name, test_results)$table),2)
##           Reference
## Prediction Acer Sony
##       Acer 0.29 0.08
##       Sony 0.05 0.58

Rpart model with only salary and age

round(prop.table(confusionMatrix(test$brand_name, test_results_2ndmod)$table),2)
##           Reference
## Prediction Acer Sony
##       Acer 0.29 0.08
##       Sony 0.05 0.58

2nd error check: visualization of the errors

3rd modalization

Creation of the model with KNN

# Creating the model with caret and using the model rpart
mod_caret_knn_subs <- caret::train(brand_name ~ salary + age,
                                   data = train %>% dplyr::select(-brand),
                                   method = "knn",
                                   trControl = ctrl,
                                   preProcess = c("center","scale")) 

3rd error check: train and test metrics

Results on train

train_results_3rdmod <- predict(object = mod_caret_knn_subs, 
                                newdata = train)
postResample(train$brand_name, train_results_3rdmod)
##  Accuracy     Kappa 
## 0.9335938 0.8597098

Results on test

test_results_3rdmod <- predict(object = mod_caret_knn_subs, 
                                 newdata = test)
postResample(test$brand_name, test_results_3rdmod)
##  Accuracy     Kappa 
## 0.9223929 0.8340976

3rd error check: confusion matrix

rpart model with only salary and age

round(prop.table(confusionMatrix(test$brand_name, test_results_2ndmod)$table),2)
##           Reference
## Prediction Acer Sony
##       Acer 0.29 0.08
##       Sony 0.05 0.58

knn model with only salary and age

round(prop.table(confusionMatrix(test$brand_name, test_results_3rdmod)$table),2)
##           Reference
## Prediction Acer Sony
##       Acer 0.33 0.04
##       Sony 0.04 0.59

3rd error check: visualization of the errors

4rth modalisation: trying SVM algorithm

library(e1071)
mod_svm <- svm(brand_name ~ salary + age, 
               data = train)

Results on train

train_results_4rthmod <- predict(object = mod_svm, 
                                newdata = train)
postResample(train$brand_name, train_results_4rthmod)
##  Accuracy     Kappa 
## 0.9172953 0.8253844

Results on test

test_results_4rthmod <- predict(object = mod_svm, 
                                newdata = test)
postResample(test$brand_name, test_results_4rthmod)
##  Accuracy     Kappa 
## 0.9215845 0.8325183

4rth error check: confusion matrix

KNN model with only salary and age

# rpart model with only salary and age
round(prop.table(confusionMatrix(test$brand_name, test_results_3rdmod)$table),3)
##           Reference
## Prediction  Acer  Sony
##       Acer 0.334 0.037
##       Sony 0.040 0.588

SVM model with only salary and age

# rpart model with only salary and age
round(prop.table(confusionMatrix(test$brand_name, test_results_4rthmod)$table),3)
##           Reference
## Prediction  Acer  Sony
##       Acer 0.335 0.037
##       Sony 0.042 0.587

4rth error check: error visualization

Extra: support lines plot for SVM