16 de marzo de 2019

Defining the problem

The sales team engaged a market research firm to conduct a survey of our existing customers.

  • Objective: find out which of two brands of computers our customers prefer.

  • Problem: the answer to the brand preference question was not properly captured for all of the respondents.

Our work in that task would be:

  1. to investigate if customer responses to some survey questions (e.g. income, age, etc.) enable us to predict the answer to the brand preference question.

  2. to make those predictions and provide the sales team with a complete view of what brand our customers prefer.

Import the data

library(dplyr)
glimpse(completed_survey)
## Observations: 9,898
## Variables: 7
## $ salary  <dbl> 119806.54, 106880.48, 78020.75, 63689.94, 50873.62, 130812.74…
## $ age     <int> 45, 63, 23, 51, 20, 56, 24, 62, 29, 41, 48, 52, 52, 33, 62, 2…
## $ elevel  <int> 0, 1, 0, 3, 3, 3, 4, 3, 4, 1, 4, 1, 3, 4, 2, 1, 2, 1, 2, 0, 0…
## $ car     <int> 14, 11, 15, 6, 14, 14, 8, 3, 17, 5, 16, 6, 20, 13, 6, 11, 7, …
## $ zipcode <int> 4, 6, 2, 5, 4, 3, 5, 0, 0, 4, 5, 0, 4, 3, 3, 4, 7, 2, 8, 2, 8…
## $ credit  <dbl> 442037.71, 45007.18, 48795.32, 40888.88, 352951.50, 135943.02…
## $ brand   <int> 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0…
glimpse(incompleted_survey)
## Observations: 5,000
## Variables: 7
## $ salary  <dbl> 110499.74, 140893.78, 119159.65, 20000.00, 93956.32, 41365.43…
## $ age     <int> 54, 44, 49, 56, 59, 71, 32, 33, 32, 58, 22, 47, 41, 40, 62, 5…
## $ elevel  <int> 3, 4, 2, 0, 1, 2, 1, 4, 1, 2, 1, 4, 2, 4, 3, 1, 4, 3, 3, 3, 0…
## $ car     <int> 15, 20, 1, 9, 15, 7, 17, 17, 19, 8, 12, 12, 1, 9, 4, 13, 6, 1…
## $ zipcode <int> 4, 7, 3, 1, 1, 2, 1, 0, 2, 4, 1, 2, 3, 2, 4, 3, 8, 8, 6, 8, 2…
## $ credit  <dbl> 354724.182, 395015.339, 122025.085, 99629.621, 458679.826, 21…
## $ brand   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…

Data wrangling: giving names

Let’s put the brand information in a nicer way:

# transform the survey result to names
completed_survey$brand_name <- apply(completed_survey["brand"],
                                     MARGIN = 2,
                                     function(x) if_else(x == 0, "Acer", "Sony"))
salary age elevel car zipcode credit brand brand_name
119806.54 45 0 14 4 442037.71 0 Acer
106880.48 63 1 11 6 45007.18 1 Sony
78020.75 23 0 15 2 48795.32 0 Acer
63689.94 51 3 6 5 40888.88 1 Sony
50873.62 20 3 14 4 352951.50 0 Acer
130812.74 56 3 14 3 135943.02 1 Sony

Data wrangling: tranforming to factor

Now is the moment to transform the variables to factors from the incomplete survay:

categ_var <- c("elevel", "car", "zipcode", "brand_name")
completed_survey[categ_var] <- lapply(completed_survey[categ_var], as.factor)

# let's check the structure of the dataset
glimpse(completed_survey)
## Observations: 9,898
## Variables: 8
## $ salary     <dbl> 119806.54, 106880.48, 78020.75, 63689.94, 50873.62, 130812…
## $ age        <int> 45, 63, 23, 51, 20, 56, 24, 62, 29, 41, 48, 52, 52, 33, 62…
## $ elevel     <fct> 0, 1, 0, 3, 3, 3, 4, 3, 4, 1, 4, 1, 3, 4, 2, 1, 2, 1, 2, 0…
## $ car        <fct> 14, 11, 15, 6, 14, 14, 8, 3, 17, 5, 16, 6, 20, 13, 6, 11, …
## $ zipcode    <fct> 4, 6, 2, 5, 4, 3, 5, 0, 0, 4, 5, 0, 4, 3, 3, 4, 7, 2, 8, 2…
## $ credit     <dbl> 442037.71, 45007.18, 48795.32, 40888.88, 352951.50, 135943…
## $ brand      <int> 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1…
## $ brand_name <fct> Acer, Sony, Acer, Sony, Acer, Sony, Sony, Sony, Acer, Sony…
# We can apply the same process to the incomplete survey
incompleted_survey$brand_name <- apply(incompleted_survey["brand"],
                                       MARGIN = 2,
                                       function(x) if_else(x == 0, "Acer", "Sony"))
incompleted_survey[categ_var] <- lapply(incompleted_survey[categ_var], as.factor)

1st exploration

Understand the brand preference with the complete survey.

  • Table of absolute frequency
table(completed_survey$brand_name, dnn = c("Brand"))
## Brand
## Acer Sony 
## 3744 6154
  • Table of relative frequency
round(prop.table(table(completed_survey$brand_name, dnn = c("Brand"))),2)
## Brand
## Acer Sony 
## 0.38 0.62

1st modalisation

library(caret)
set.seed(2019)

# train and test
train_ids <- createDataPartition(y = completed_survey$brand,
                                 p = 0.75,
                                 list = F)
train <- completed_survey[train_ids,]
test <- completed_survey[-train_ids,]

# cross validation
ctrl <- trainControl(method = "repeatedcv",
                     repeats = 5,
                     number = 3)

# Creating the model with caret and using the model rpart
mod_caret_dt <- caret::train(brand_name ~ .,
                             data = train %>% dplyr::select(-brand),
                             method = "rpart",
                             trControl = ctrl)

1st error check: Accuracy and kappa

Results on train

train_results <- predict(object = mod_caret_dt, 
                         newdata = train)
postResample(train$brand_name, train_results)
##  Accuracy     Kappa 
## 0.7637392 0.5236003

Results on test

test_results <- predict(object = mod_caret_dt, 
                        newdata = test)
postResample(test$brand_name, test_results)
##  Accuracy     Kappa 
## 0.7599030 0.5159364

1st error check: variable importance

1st error check: confusion matrix

Confusion matrix on train

round(prop.table(confusionMatrix(train$brand_name, train_results)$table),2)
##           Reference
## Prediction Acer Sony
##       Acer 0.31 0.06
##       Sony 0.17 0.45

Confusion matrix on test

round(prop.table(confusionMatrix(test$brand_name, test_results)$table),2)
##           Reference
## Prediction Acer Sony
##       Acer 0.31 0.07
##       Sony 0.17 0.45

2nd exploration

Looking for the relation between the important variable and brand. Goal create a model that is not overfitted to the data:

completed_survey %>% 
  ggplot(aes(age, salary)) +
    geom_point(aes(color = brand_name))

2nd modalisation

This time im only going to use salary and age to maket the predictions:

Results on train

train_results_2ndmod <- predict(object = mod_caret_dt_subs, 
                                newdata = train)
postResample(train$brand_name, train_results_2ndmod)
##  Accuracy     Kappa 
## 0.7637392 0.5236003

Results on test

test_results_2ndmod <- predict(object = mod_caret_dt_subs, 
                               newdata = test)
postResample(test$brand_name, test_results_2ndmod)
##  Accuracy     Kappa 
## 0.7599030 0.5159364

2nd error check: confusion matrix

We are going to compare the confusion matrix for the first model (all the variables) and the model with salary and age:

Rpart model with all the variables

round(prop.table(confusionMatrix(test$brand_name, test_results)$table),2)
##           Reference
## Prediction Acer Sony
##       Acer 0.31 0.07
##       Sony 0.17 0.45

Rpart model with only salary and age

round(prop.table(confusionMatrix(test$brand_name, test_results_2ndmod)$table),2)
##           Reference
## Prediction Acer Sony
##       Acer 0.31 0.07
##       Sony 0.17 0.45

2nd error check: visualization of the errors

Probability to be Acer

3rd modalization

Creation of the model with KNN

# Creating the model with caret and using the model rpart
mod_caret_knn_subs <- caret::train(brand_name ~ salary + age,
                                   data = train %>% dplyr::select(-brand),
                                   method = "knn",
                                   trControl = ctrl,
                                   preProcess = c("center","scale")) 

3rd error check: train and test metrics

Results on train

train_results_3rdmod <- predict(object = mod_caret_knn_subs, 
                                newdata = train)
postResample(train$brand_name, train_results_3rdmod)
##  Accuracy     Kappa 
## 0.9338631 0.8587763

Results on test

test_results_3rdmod <- predict(object = mod_caret_knn_subs, 
                                 newdata = test)
postResample(test$brand_name, test_results_3rdmod)
##  Accuracy     Kappa 
## 0.9215845 0.8350790

3rd error check: confusion matrix

rpart model with only salary and age

round(prop.table(confusionMatrix(test$brand_name, test_results_2ndmod)$table),2)
##           Reference
## Prediction Acer Sony
##       Acer 0.31 0.07
##       Sony 0.17 0.45

knn model with only salary and age

round(prop.table(confusionMatrix(test$brand_name, test_results_3rdmod)$table),2)
##           Reference
## Prediction Acer Sony
##       Acer 0.35 0.04
##       Sony 0.04 0.57

3rd error check: visualization of the errors

Probability to be Acer

4rth modalisation: trying SVM algorithm

library(e1071)
mod_svm <- svm(brand_name ~ salary + age, 
               data = train)

Results on train

train_results_4rthmod <- predict(object = mod_svm, 
                                newdata = train)
postResample(train$brand_name, train_results_4rthmod)
##  Accuracy     Kappa 
## 0.9174300 0.8247185

Results on test

test_results_4rthmod <- predict(object = mod_svm, 
                                newdata = test)
postResample(test$brand_name, test_results_4rthmod)
##  Accuracy     Kappa 
## 0.9143088 0.8200521

4rth error check: confusion matrix

KNN model with only salary and age

# rpart model with only salary and age
round(prop.table(confusionMatrix(test$brand_name, test_results_3rdmod)$table),3)
##           Reference
## Prediction  Acer  Sony
##       Acer 0.350 0.038
##       Sony 0.041 0.572

SVM model with only salary and age

# rpart model with only salary and age
round(prop.table(confusionMatrix(test$brand_name, test_results_4rthmod)$table),3)
##           Reference
## Prediction  Acer  Sony
##       Acer 0.348 0.040
##       Sony 0.046 0.566

4rth error check: error visualization

Probability to be Acer

Extra: support lines plot for SVM