Background
We are asked to build predictive models that fill in the customers’ brand preference (Acer or Sony) where it is missing from the incomplete survey responses.
Dataset Information
Pre-processing
# load the libraries
library(readr)
library(caret)
library(ggplot2)
# fix the random seed so that later random steps are reproducible
set.seed(seed = 123)
# read the survey file into a data frame and print its column types
complete <- read.csv(file = "CompleteResponses.csv")
str(object = complete)
## 'data.frame': 9898 obs. of 7 variables:
## $ salary : num 119807 106880 78021 63690 50874 ...
## $ age : int 45 63 23 51 20 56 24 62 29 41 ...
## $ elevel : int 0 1 0 3 3 3 4 3 4 1 ...
## $ car : int 14 11 15 6 14 14 8 3 17 5 ...
## $ zipcode: int 4 6 2 5 4 3 5 0 0 4 ...
## $ credit : num 442038 45007 48795 40889 352951 ...
## $ brand : int 0 1 0 1 0 1 1 1 0 1 ...
# count the missing values in each column, then total them
sum(colSums(is.na(complete)))
## [1] 0
# convert the integer-coded columns (and the brand outcome) to factors
complete[c("elevel", "car", "zipcode", "brand")] <- lapply(
  complete[c("elevel", "car", "zipcode", "brand")],
  as.factor
)
Build the Models
# split the rows: 75% go to the training set, the remaining 25% to testing
inTraining <- createDataPartition(
  y = complete[["brand"]],
  p = 0.75,
  list = FALSE
)
training <- complete[inTraining, ]
testing <- complete[-inTraining, ]
# resampling scheme passed to model training: 10-fold cross-validation
fitControl <- trainControl(method = "cv", number = 10)
Evaluate the Models
Make Predictions