Main Body
Preparing R, and the Data:
library(ggplot2)
library(readr)
library(ggplot2)
library(binr)
library(C50)
library(caret)
## Loading required package: lattice
#Import the data and getting the summary
CompleteResponses_original <- read.csv("CompleteResponses.csv")
CompleteResponses_beforePP <- read.csv("CompleteResponses.csv")
summary(CompleteResponses_original)
## salary age elevel car
## Min. : 20000 Min. :20.00 Min. :0.000 Min. : 1.00
## 1st Qu.: 52082 1st Qu.:35.00 1st Qu.:1.000 1st Qu.: 6.00
## Median : 84950 Median :50.00 Median :2.000 Median :11.00
## Mean : 84871 Mean :49.78 Mean :1.983 Mean :10.52
## 3rd Qu.:117162 3rd Qu.:65.00 3rd Qu.:3.000 3rd Qu.:15.75
## Max. :150000 Max. :80.00 Max. :4.000 Max. :20.00
## zipcode credit brand
## Min. :0.000 Min. : 0 Min. :0.0000
## 1st Qu.:2.000 1st Qu.:120807 1st Qu.:0.0000
## Median :4.000 Median :250607 Median :1.0000
## Mean :4.041 Mean :249176 Mean :0.6217
## 3rd Qu.:6.000 3rd Qu.:374640 3rd Qu.:1.0000
## Max. :8.000 Max. :500000 Max. :1.0000
Pre-process the data
Firstly, its important to check if we have any NA into our data set
#Checking Missing information
my_na <- is.na(CompleteResponses_original)
sum(my_na)
## [1] 0
This mean the dataset used to develop a model didn’t have missing values.
After, its important to transform some features into categories:
#Converting features in categories
CompleteResponses_beforePP$brand <-as.factor(CompleteResponses_beforePP$brand)
CompleteResponses_beforePP$car <- as.factor(CompleteResponses_beforePP$car)
CompleteResponses_beforePP$elevel <- as.factor(CompleteResponses_beforePP$elevel)
CompleteResponses_beforePP$zipcode <- as.factor(CompleteResponses_beforePP$zipcode)
Feature Engineering
Key relation: AGE, SALARY, and BRAND
Ploting the differents features a looking for a relationship I found this:
#The only relation that we found was between AGE and SALARY
ggplot(CompleteResponses_beforePP, aes(x=salary, y=age, col=brand )) + geom_point()
We can see a correlation between salary and age. Looking at the age, we can see a patter if we divide it in tree groups (20 to 40, 40 to 60, and 60 to 80). And regarding to the salary, if we divide it in five groups($20.000 to $46.000, $46.000 to $72.000, …)
Lets divide this features using bin
#Dividing the age (3) and the salary (5)
CompleteResponses_beforePP$age_bin <- cut(CompleteResponses_beforePP$age, 3)
CompleteResponses_beforePP$salary_bin <- cut(CompleteResponses_beforePP$salary, 5)
Now, its time to transform them as a factor
#Converting them as a categorical
CompleteResponses_beforePP$age_bin <- as.factor(CompleteResponses_beforePP$age_bin)
CompleteResponses_beforePP$salary_bin <- as.factor(CompleteResponses_beforePP$salary_bin)
Finally, I created a new data set to train our model
#Creating a new data base with only the Brand, and the new categories (age_bin, and salary_bin)
features <- c("brand", "age_bin", "salary_bin")
CompleteResponses_afprepo <- CompleteResponses_beforePP [, features]
This is what the new data set looks like:
head(CompleteResponses_afprepo)
## brand age_bin salary_bin
## 1 0 (40,60] (9.8e+04,1.24e+05]
## 2 1 (60,80.1] (9.8e+04,1.24e+05]
## 3 0 (19.9,40] (7.2e+04,9.8e+04]
## 4 1 (40,60] (4.6e+04,7.2e+04]
## 5 0 (19.9,40] (4.6e+04,7.2e+04]
## 6 1 (40,60] (1.24e+05,1.5e+05]
Train and Assess the models
set.seed(998)
#Define an 75%/25% train/test.
inTraining <- createDataPartition(CompleteResponses_afprepo$brand, p = .75, list = FALSE)
training <- CompleteResponses_afprepo[inTraining,]
testing <- CompleteResponses_afprepo[-inTraining,]
#Cross Validation
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 1)
Decision tree
C.50, 10 folds, automatic tun, tuneLength 2.
Model1 <- train(brand~., data = training, method = "C5.0", trControl=fitControl,
tuneLength = 2)
Model1
## C5.0
##
## 7424 samples
## 2 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 6681, 6682, 6681, 6681, 6682, 6681, ...
## Resampling results across tuning parameters:
##
## model winnow trials Accuracy Kappa
## rules FALSE 1 0.9112291 0.8130879
## rules FALSE 10 0.9013897 0.7896372
## rules TRUE 1 0.9112291 0.8130879
## rules TRUE 10 0.9013897 0.7896372
## tree FALSE 1 0.9112291 0.8130879
## tree FALSE 10 0.9035481 0.7946562
## tree TRUE 1 0.9112291 0.8130879
## tree TRUE 10 0.9035481 0.7946562
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were trials = 1, model = rules
## and winnow = TRUE.
Random Forest
with 10-fold cross validation and manually tune 5 different mtry values.
rfGrid <- expand.grid(mtry=c(1,2,3,4,5))
Model2 <- train(brand~., data = training, method = "rf", trControl=fitControl,
tuneGrid=rfGrid)
Model2
## Random Forest
##
## 7424 samples
## 2 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 6682, 6681, 6681, 6681, 6682, 6681, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 1 0.7578097 0.4152580
## 2 0.8500787 0.6657816
## 3 0.9027435 0.7934072
## 4 0.9112340 0.8131112
## 5 0.9112340 0.8131112
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 4.
eXtreme Gradient Boosting
automatic tun, tuneLength 2.
Model3 <- train(brand~., data = training, method = "xgbTree", trControl=fitControl,
tuneLength = 2)
Model3
## eXtreme Gradient Boosting
##
## 7424 samples
## 2 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 6681, 6682, 6683, 6682, 6681, 6682, ...
## Resampling results across tuning parameters:
##
## eta max_depth colsample_bytree subsample nrounds Accuracy
## 0.3 1 0.6 0.5 50 0.7316841
## 0.3 1 0.6 0.5 100 0.7316841
## 0.3 1 0.6 1.0 50 0.7316841
## 0.3 1 0.6 1.0 100 0.7316841
## 0.3 1 0.8 0.5 50 0.7316841
## 0.3 1 0.8 0.5 100 0.7316841
## 0.3 1 0.8 1.0 50 0.7316841
## 0.3 1 0.8 1.0 100 0.7316841
## 0.3 2 0.6 0.5 50 0.8900911
## 0.3 2 0.6 0.5 100 0.9112373
## 0.3 2 0.6 1.0 50 0.9053153
## 0.3 2 0.6 1.0 100 0.9112373
## 0.3 2 0.8 0.5 50 0.9112373
## 0.3 2 0.8 0.5 100 0.9112373
## 0.3 2 0.8 1.0 50 0.9032858
## 0.3 2 0.8 1.0 100 0.9112373
## 0.4 1 0.6 0.5 50 0.7316841
## 0.4 1 0.6 0.5 100 0.7316841
## 0.4 1 0.6 1.0 50 0.7316841
## 0.4 1 0.6 1.0 100 0.7316841
## 0.4 1 0.8 0.5 50 0.7316841
## 0.4 1 0.8 0.5 100 0.7316841
## 0.4 1 0.8 1.0 50 0.7316841
## 0.4 1 0.8 1.0 100 0.7316841
## 0.4 2 0.6 0.5 50 0.9112373
## 0.4 2 0.6 0.5 100 0.9112373
## 0.4 2 0.6 1.0 50 0.9112373
## 0.4 2 0.6 1.0 100 0.9112373
## 0.4 2 0.8 0.5 50 0.9112373
## 0.4 2 0.8 0.5 100 0.9112373
## 0.4 2 0.8 1.0 50 0.9112373
## 0.4 2 0.8 1.0 100 0.9112373
## Kappa
## 0.4341213
## 0.4341213
## 0.4341213
## 0.4341213
## 0.4341213
## 0.4341213
## 0.4341213
## 0.4341213
## 0.7628439
## 0.8130903
## 0.7986915
## 0.8130903
## 0.8130903
## 0.8130903
## 0.7945293
## 0.8130903
## 0.4341213
## 0.4341213
## 0.4341213
## 0.4341213
## 0.4341213
## 0.4341213
## 0.4341213
## 0.4341213
## 0.8130903
## 0.8130903
## 0.8130903
## 0.8130903
## 0.8130903
## 0.8130903
## 0.8130903
## 0.8130903
##
## Tuning parameter 'gamma' was held constant at a value of 0
##
## Tuning parameter 'min_child_weight' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 50, max_depth = 2,
## eta = 0.3, gamma = 0, colsample_bytree = 0.8, min_child_weight = 1
## and subsample = 0.5.
Comparison between three models:
Classifier <- c('C5.0','Random Forest','eXtreme Boosting Trees')
Accuracy <- c(0.9112291, 0.9112340, 0.9112373)
Kappa <- c(0.8130879, 0.8131112, 0.8130903)
metrics <- data.frame(Classifier, Accuracy, Kappa)
metrics
## Classifier Accuracy Kappa
## 1 C5.0 0.9112291 0.8130879
## 2 Random Forest 0.9112340 0.8131112
## 3 eXtreme Boosting Trees 0.9112373 0.8130903
I picked the most accurate model after optimization which was the eXtreme Boosting Trees.
Using the model to make predictions
Pre-Process
#Read the newdata----
surveyIncomplete <- read.csv("SurveyIncomplete.csv")
#preprocess_newdata ----
surveyIncomplete$age_bin <- cut(surveyIncomplete$age, 3)
surveyIncomplete$salary_bin <- cut(surveyIncomplete$salary, 5)
surveyIncomplete$age_bin <- as.factor(surveyIncomplete$age_bin)
surveyIncomplete$salary_bin <- as.factor(surveyIncomplete$salary_bin)
#feature selection----
features <- c("brand", "age_bin", "salary_bin")
surveyIncomplete_afprepo <- surveyIncomplete [, features]
This is the new data set to fill with the predictions:
head(surveyIncomplete_afprepo)
## brand age_bin salary_bin
## 1 0 (40,60] (9.8e+04,1.24e+05]
## 2 0 (40,60] (1.24e+05,1.5e+05]
## 3 0 (40,60] (9.8e+04,1.24e+05]
## 4 0 (40,60] (1.99e+04,4.6e+04]
## 5 0 (40,60] (7.2e+04,9.8e+04]
## 6 0 (60,80.1] (1.99e+04,4.6e+04]
Apply the Model3
predictions <- predict(Model3, surveyIncomplete_afprepo)
summary(predictions)
## 0 1
## 1967 3033
We had 5,000 surveys without the answer about the brand. Using our model, we can predict that 1,967 persons would choose Acer,and 3,033 would choose Sony.
Checking the results
#create new table with surveyIncomplete and predictions----
predictions_table <- surveyIncomplete[1:6]
predictions_table$brand <- predictions
head(predictions_table)
## salary age elevel car zipcode credit brand
## 1 110499.74 54 3 15 4 354724.18 0
## 2 140893.78 44 4 20 7 395015.34 1
## 3 119159.65 49 2 1 3 122025.09 0
## 4 20000.00 56 0 9 1 99629.62 1
## 5 93956.32 59 1 15 1 458679.83 0
## 6 41365.43 71 2 7 2 216839.72 0
Plot: Relation between Age, Salary, and Brand (Predictions)
ggplot(predictions_table, aes(x=age, y=salary, col = brand)) + geom_point() +
labs(x="Age", y="Salary", title="Relation between Age, Salary, and Brand (Predictions)")
Plot: Relation between Age, Salary, and Brand (CompleteResponses)
ggplot(CompleteResponses_beforePP, aes(x=age, y=salary, col = brand)) + geom_point() +
labs(x="Age", y="Salary", title="Relation between Age, Salary, and Brand (CompleteResponses)")
As we can see, the graphics, using the real responses, and the predictions- looks the practically the same. Thats mean our model works.
Final Results
Which brand do our customers prefer?
#According to the Complete Surveys:
summary(CompleteResponses_beforePP$brand)
## 0 1
## 3744 6154
#According to the Incomplete Surveys (predictions):
summary(predictions)
## 0 1
## 1967 3033
#Total:
answers <- summary(CompleteResponses_beforePP$brand) + summary(predictions)
answers
## 0 1
## 5711 9187
To make a graphic that shows the difference and the final result, we can create a new data set with all the surveys:
#Create a new data set with all the survey's answers: real responses and predictions
table.total <- rbind(CompleteResponses_beforePP[1:7], predictions_table[, c(1:7)])
Let´s make a graphic:
#change 0 to Acer and 1 to Sony, and add a column brand_name
table.total$brand_name <- ifelse(table.total$brand == 1, "Sony", "Acer")
#using ggplot to draw barchart for incomplete data
ggplot(data = table.total, mapping = aes(x = brand_name)) +
geom_bar(position = "dodge") +
labs(x="Brand", y="Count", title="What brand do our customers prefer?") +
geom_text(stat='count', aes(label=..count..), vjust=5, col="white", size=4)