We were asked by Danielle Sherman, CTO of Blackwell Electronics (an eletronic retailer) to predict the customers’ brand preferences that were missing from an incomplete surveys.
These predictions were carried out following the next steps:You can see a section of the used survey here (5) and the complete code we used in R here (6).
All variables seem to have a practically uniform distribution. At first sight, it seems that:
We’ll explore these relations between variables deeply, with statistical analyse, in the next section
## Loading required package: pacman
#Relation brand-variables
for(i in 1:ncol(Survey)) {
if ((names(Survey[i])) == "brand"){
p1<-ggplot(Survey, aes(x = brand)) + geom_bar(fill="deepskyblue4") +
labs(x="Brand")
print("Plot Brand")
print(p1)
} else if (is.numeric(Survey[[i]]) == "TRUE"){
p1<-ggplot(Survey, aes(x = Survey[[i]], fill=brand)) + geom_histogram(color="black",bins = 10) +
labs(x=colnames(Survey[i]))
print(paste("Plot Brand-", colnames(Survey[i])))
print(p1)
} else {
p1<-ggplot(Survey, aes(x = Survey[[i]], fill = brand)) + geom_bar(position = "fill") +
labs(x=colnames(Survey[i]))
print(paste("Plot Brand-", colnames(Survey[i])))
print(p1)
}
}
## [1] "Plot Brand- salary"
## [1] "Plot Brand- age"
## [1] "Plot Brand- elevel"
## [1] "Plot Brand- car"
## [1] "Plot Brand- zipcode"
## [1] "Plot Brand- credit"
## [1] "Plot Brand"
ggplot(Survey, aes(x = age, y = salary, color = brand)) + geom_point()
Only the salary seems to create significative differences in the Brand (p=0). Although the age doesn’t have a p<0.05, it’s close, so we’re also going to consider this variable for the model.
MatrixTest<-matrix(ncol=7,nrow=7)
rownames(MatrixTest)<-c("Salary", "Age", "EducationalLevel", "Car", "ZipCode", "Credit", "Brand")
colnames(MatrixTest)<-c("Salary", "Age", "EducationalLevel", "Car", "ZipCode", "Credit", "Brand")
for (i in 1:ncol(Survey)){
for (j in 1:ncol(Survey)){
if (is.numeric(Survey[[i]]) == "TRUE" & is.numeric(Survey[[j]]) == "TRUE"){
MatrixTest[i,j]<-(cor(Survey[[i]], Survey[[j]]))
} else if (is.numeric(Survey[[i]]) == "TRUE" & is.numeric(Survey[[j]]) == "FALSE"){
MatrixTest[i,j]<-(summary(aov((Survey[[i]]~Survey[[j]]), data=Survey))[[1]][[5]][1])
} else if (is.numeric(Survey[[i]]) == "FALSE" & is.numeric(Survey[[j]]) == "TRUE"){
MatrixTest[i,j]<-(summary(aov((Survey[[j]]~Survey[[i]]), data=Survey))[[1]][[5]][1])
} else {
MatrixTest[i,j]<-(chisq.test(Survey[[i]], Survey[[j]])$p.value)
}
}
}
MatrixTest<-round(MatrixTest, digits = 5)
library(knitr)
kable(MatrixTest)
| Salary | Age | EducationalLevel | Car | ZipCode | Credit | Brand | |
|---|---|---|---|---|---|---|---|
| Salary | 1.00000 | 0.00703 | 0.61800 | 0.60367 | 0.30284 | -0.02405 | 0.00000 |
| Age | 0.00703 | 1.00000 | 0.50033 | 0.57927 | 0.02954 | -0.00496 | 0.15252 |
| EducationalLevel | 0.61800 | 0.50033 | 0.00000 | 0.87523 | 0.14492 | 0.76093 | 0.95628 |
| Car | 0.60367 | 0.57927 | 0.87523 | 0.00000 | 0.43745 | 0.09066 | 0.87598 |
| ZipCode | 0.30284 | 0.02954 | 0.14492 | 0.43745 | 0.00000 | 0.29748 | 0.49532 |
| Credit | -0.02405 | -0.00496 | 0.76093 | 0.09066 | 0.29748 | 1.00000 | 0.56212 |
| Brand | 0.00000 | 0.15252 | 0.95628 | 0.87598 | 0.49532 | 0.56212 | 0.00000 |
For this reason, we firstly built training and testing sets and prepared the cross validation
# set seed and define an 75%/25% train/test split of the dataset
set.seed(314)
inTraining<-createDataPartition(Survey$brand, p=0.75, list=FALSE)
train<- Survey[inTraining,]
test <- Survey[-inTraining,]
# cross validation
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 5)
KNNfit1: All variables
KNNfit1<-train(brand~., data= train, method="knn", trControl=fitControl,
preProcess=c("center", "scale"), tuneLength=5)
KNNfit1
## k-Nearest Neighbors
##
## 7501 samples
## 6 predictor
## 2 classes: 'Acer', 'Sony'
##
## Pre-processing: centered (34), scaled (34)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 6751, 6751, 6751, 6751, 6751, 6751, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.5642978 0.04753774
## 7 0.5945867 0.09432167
## 9 0.6231152 0.14894735
## 11 0.6328213 0.16584136
## 13 0.6286359 0.15342835
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 11.
KNNfit2: Salary
KNNfit2<-train(brand~salary, data=train, method="knn", trControl=fitControl,
preProcess=c("center", "scale"), tuneLength=5)
KNNfit2
## k-Nearest Neighbors
##
## 7501 samples
## 1 predictor
## 2 classes: 'Acer', 'Sony'
##
## Pre-processing: centered (1), scaled (1)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 6750, 6750, 6752, 6751, 6751, 6751, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.6909520 0.3401435
## 7 0.6982281 0.3540756
## 9 0.7059348 0.3713329
## 11 0.7104392 0.3805094
## 13 0.7145990 0.3898755
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 13.
KNNfit3: Salary + Age
KNNfit3<-train(brand~salary + age, data=train, method="knn", trControl=fitControl,
preProcess=c("center", "scale"), tuneLength=5)
KNNfit3
## k-Nearest Neighbors
##
## 7501 samples
## 2 predictor
## 2 classes: 'Acer', 'Sony'
##
## Pre-processing: centered (2), scaled (2)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 6751, 6752, 6750, 6752, 6751, 6751, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.9138249 0.8173721
## 7 0.9136652 0.8169669
## 9 0.9168120 0.8237625
## 11 0.9186247 0.8275884
## 13 0.9200649 0.8305691
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 13.
DTfit1: All variables
# cross-validation
fitControldt <- trainControl(method = "repeatedcv", number = 10, repeats = 2)
DTfit1<-train(brand~., data= train, method="parRF", trControl=fitControldt, ntree=50)
DTfit1
## Parallel Random Forest
##
## 7501 samples
## 6 predictor
## 2 classes: 'Acer', 'Sony'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 2 times)
## Summary of sample sizes: 6751, 6751, 6750, 6752, 6750, 6751, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.6248499 0.01180913
## 18 0.9197431 0.82965211
## 34 0.9141444 0.81775573
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 18.
DTfit2: Salary
DTfit2<-train(brand~salary, data= train, method="parRF", trControl=fitControldt,ntree=50)
DTfit2
## Parallel Random Forest
##
## 7501 samples
## 1 predictor
## 2 classes: 'Acer', 'Sony'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 2 times)
## Summary of sample sizes: 6751, 6751, 6750, 6752, 6751, 6751, ...
## Resampling results:
##
## Accuracy Kappa
## 0.6417134 0.2396669
##
## Tuning parameter 'mtry' was held constant at a value of 2
DTfit3: Salary + Age
DTfit3<-train(brand~salary + age, data= train, method="parRF", trControl=fitControldt,ntree=50)
## note: only 1 unique complexity parameters in default grid. Truncating the grid to 1 .
DTfit3
## Parallel Random Forest
##
## 7501 samples
## 2 predictor
## 2 classes: 'Acer', 'Sony'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 2 times)
## Summary of sample sizes: 6750, 6752, 6751, 6750, 6751, 6750, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9062809 0.8007857
##
## Tuning parameter 'mtry' was held constant at a value of 2
Plot Decision Tree (Salary + Age)
ct<-ctree(brand~salary + age, data=Survey, controls = ctree_control(maxdepth=3))
plot(ct)
Alt text
Here you can see the variables of the survey
head(Survey)
## # A tibble: 6 x 7
## salary age elevel car zipcode credit brand
## <dbl> <dbl> <ord> <fctr> <fctr> <dbl> <fctr>
## 1 119807 45.0 0 14 4 442038 Acer
## 2 106880 63.0 1 11 6 45007 Sony
## 3 78021 23.0 0 15 2 48795 Acer
## 4 63690 51.0 3 6 5 40889 Sony
## 5 50874 20.0 3 14 4 352951 Acer
## 6 130813 56.0 3 14 3 135943 Sony
Elevel: the highest level of education he have obtained
-0.Less than High School Degree
-1.High School Degree
-2.Some College
-3.4-Year College Degree
-4.Master’s, Doctoral or Professional Degree
ZipCode: 1 of the following 9 regions ini the U.S.
-0.New England
-1.Mid-Atlantic
-2.East North Central
-3.West North Central
-4.South Atlantic
-5.East South Central
-6.West South Central
->7.Mountain
-8.Pacific
Brand: which brand of computers do you prefer
-0.Acer
-1.Sonye
Here you can obtain all the code used for this task.
Includes
#Load Libraries: p_load can install, load, and update packages
if(require("pacman")=="FALSE"){
install.packages("pacman")
}
pacman::p_load(caret, readxl, ggplot2, plyr, clusterSim, party)
# Load Data
setwd("C:/SARA/Ubiqum/Section2/Task2")
Survey<-read_excel("Survey_Key_and_Complete_Responses_excel.xlsx", sheet = 2)
Cleaning and Exploring Data
#Rename some variables
Survey$brand[Survey$brand=="0"] <-"Acer"
Survey$brand[Survey$brand=="1"] <-"Sony"
# Data Type. Education Level to ordinal. Car, zipcode and brand to factor
Survey$elevel<-as.ordered(Survey$elevel)
Survey$car<-as.factor(Survey$car)
Survey$zipcode<-as.factor(Survey$zipcode)
Survey$brand<-as.factor(Survey$brand)
#sum(is.na(Survey)) is 0
#Relation brand-variables
for(i in 1:ncol(Survey)) {
if(is.numeric(Survey[[i]]) == "TRUE"){
p1<-ggplot(Survey, aes(x = Survey[[i]], fill=brand)) + geom_histogram(color="black",bins = 10) +
labs(x=colnames(Survey[i]))
print(paste("Plot Brand-", colnames(Survey[i])))
print(p1)
} else {
p1<-ggplot(Survey, aes(x = Survey[[i]], fill = brand)) + geom_bar(position = "fill") +
labs(x=colnames(Survey[i]))
print(paste("Plot Brand-", colnames(Survey[i])))
print(p1)
}
}
# Removing redundance
MatrixTest<-matrix(ncol=7,nrow=7)
rownames(MatrixTest)<-c("Salary", "Age", "EducationalLevel", "Car", "ZipCode", "Credit", "Brand")
colnames(MatrixTest)<-c("Salary", "Age", "EducationalLevel", "Car", "ZipCode", "Credit", "Brand")
for (i in 1:ncol(Survey)){
for (j in 1:ncol(Survey)){
if (is.numeric(Survey[[i]]) == "TRUE" & is.numeric(Survey[[j]]) == "TRUE"){
MatrixTest[i,j]<-(cor(Survey[[i]], Survey[[j]]))
} else if (is.numeric(Survey[[i]]) == "TRUE" & is.numeric(Survey[[j]]) == "FALSE"){
MatrixTest[i,j]<-(summary(aov((Survey[[i]]~Survey[[j]]), data=Survey))[[1]][[5]][1])
} else if (is.numeric(Survey[[i]]) == "FALSE" & is.numeric(Survey[[j]]) == "TRUE"){
MatrixTest[i,j]<-(summary(aov((Survey[[j]]~Survey[[i]]), data=Survey))[[1]][[5]][1])
} else {
MatrixTest[i,j]<-(chisq.test(Survey[[i]], Survey[[j]])$p.value)
}
}
}
MatrixTest<-round(MatrixTest, digits = 5)
Create training and testing sets
# set seed and define an 75%/25% train/test split of the dataset
set.seed(314)
inTraining<-createDataPartition(Survey$brand, p=0.75, list=FALSE)
train<- Survey[inTraining,]
test <- Survey[-inTraining,]
Create a predictive model: KNN
# 10 fold cross validation
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 5)
# Train KNN model
KNNfit1<-train(brand~., data= train, method="knn", trControl=fitControl, preProcess=c("center", "scale"), tuneLength=5)
KNNfit2<-train(brand~salary, data=train, method="knn", trControl=fitControl, preProcess=c("center", "scale"), tuneLength=5)
KNNfit3<-train(brand~salary + age, data=train, method="knn", trControl=fitControl, preProcess=c("center", "scale"), tuneLength=5)
# Predictor variables
predictors(KNNfit1)
predictors(KNNfit2)
predictors(KNNfit3)
# Make predictions
testPredKNN1<-predict(KNNfit1, test)
testPredKNN2<-predict(KNNfit2, test)
testPredKNN3<-predict(KNNfit3, test)
# Performance measurment
postResample(testPredKNN1, test$brand)
postResample(testPredKNN2, test$brand)
postResample(testPredKNN3, test$brand)
# Plot predicted verses actual
plot(testPredKNN1, test$brand)
plot(KNNfit1)
plot(testPredKNN2, test$brand)
plot(KNNfit2)
plot(testPredKNN3, test$brand)
plot(KNNfit3)
Create a predictive model: DT
#### CREATE A PREDICTIVE MODEL: Decision Tree ####
fitControldt <- trainControl(method = "repeatedcv", number = 10, repeats = 2)
# Train DT model
DTfit1<-train(brand~., data= train, method="parRF", trControl=fitControldt, ntree=50, do.trace=10)
DTfit2<-train(brand~salary, data= train, method="parRF", trControl=fitControldt,ntree=50, do.trace=10)
DTfit3<-train(brand~salary + age, data= train, method="parRF", trControl=fitControldt,ntree=50, do.trace=10)
# Predictor variables
predictors(DTfit1)
predictors(DTfit2)
predictors(DTfit3)
# Make predictions
testPredDTfit1<-predict(DTfit1, test)
testPredDTfit2<-predict(DTfit2, test)
testPredDTfit3<-predict(DTfit3, test)
# Performance measurment
postResample(testPredDTfit1, test$brand)
postResample(testPredDTfit2, test$brand)
postResample(testPredDTfit3, test$brand)
# Plot predicted verses actual
plot(testPredDTfit1, test$brand)
plot(DTfit1)
plot(testPredDTfit2, test$brand)
plot(DTfit2)
plot(testPredDTfit3, test$brand)
plot(DTfit3)
Completing Survey
SurveyIncomplete<-read.csv("SurveyIncomplete.csv")
# Data Type. Education Level to ordinal and make of the car, zipcode and brand to factor
SurveyIncomplete$elevel<-as.ordered(SurveyIncomplete$elevel)
SurveyIncomplete$car<-as.factor(SurveyIncomplete$car)
SurveyIncomplete$zipcode<-as.factor(SurveyIncomplete$zipcode)
SurveyIncomplete$brand<-as.factor(SurveyIncomplete$brand)
Su
PredSurveyIncomp<-predict(KNNfit3, newdata = SurveyIncomplete)
summary(PredSurveyIncomp)
barplot(summary(PredSurveyIncomp), col="deepskyblue4", border="black")