#### Import Data ####
#packages used throughout this analysis: ggplot2 (plots), party (ctree), caret (model training)
library(ggplot2)
library(party)
library(caret)
#read train file
complete <- read.csv("CompleteResponses.csv", header = TRUE, check.names = FALSE, sep = ",", quote = "\'", as.is = TRUE)
#read predict file
incomplete <- read.csv("SurveyIncomplete.csv", header = TRUE, check.names = FALSE, sep = ",", quote = "\'", as.is = TRUE)
In the “complete” survey data set we have 7 attributes: salary, age, elevel, car, zipcode, credit, and brand. The variable “brand” will be our dependent variable.
Next, we check the data types in the two data sets:
#### Data structure####
str(complete) # 9,898 obs of 7 vars
## 'data.frame': 9898 obs. of 7 variables:
## $ salary : num 119807 106880 78021 63690 50874 ...
## $ age : int 45 63 23 51 20 56 24 62 29 41 ...
## $ elevel : int 0 1 0 3 3 3 4 3 4 1 ...
## $ car : int 14 11 15 6 14 14 8 3 17 5 ...
## $ zipcode: int 4 6 2 5 4 3 5 0 0 4 ...
## $ credit : num 442038 45007 48795 40889 352951 ...
## $ brand : int 0 1 0 1 0 1 1 1 0 1 ...
str(incomplete) # 5,000 obs of 7 vars
## 'data.frame': 5000 obs. of 7 variables:
## $ salary : num 110500 140894 119160 20000 93956 ...
## $ age : int 54 44 49 56 59 71 32 33 32 58 ...
## $ elevel : int 3 4 2 0 1 2 1 4 1 2 ...
## $ car : int 15 20 1 9 15 7 17 17 19 8 ...
## $ zipcode: int 4 7 3 1 1 2 1 0 2 4 ...
## $ credit : num 354724 395015 122025 99630 458680 ...
## $ brand : int 0 0 0 0 0 0 0 0 0 0 ...
#Recode the brand values from 0/1 to Acer/Sony
complete$brand[complete$brand=="0"] <-"Acer"
complete$brand[complete$brand=="1"] <-"Sony"
#Convert brand to factor
complete$brand <- factor(complete$brand)
The brand bar plot below shows that, in the complete survey, 61.54% of customers chose the Sony brand.
ggplot(complete, aes(x=brand, fill=brand)) + geom_bar() + ggtitle("Brand") +
geom_text(stat="count",aes(label=..count..,y=..count..), vjust=10)
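The percentage quoted above can be verified directly from the data; a quick sketch using base R:
#share of each brand in the complete survey (should show roughly 61.54% Sony, per the bar plot)
round(prop.table(table(complete$brand)) * 100, 2)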
Plot Brand - Salary
ggplot(complete, aes(x=salary, fill=brand)) + geom_histogram(color="grey", bins=20) + ggtitle("Brand - Salary")
Plot Brand - Age
ggplot(complete, aes(x=age, fill=brand)) + geom_histogram(color="grey", bins=20) + ggtitle("Brand - Age")
Plot Brand - Education Level
ggplot(complete, aes(x=elevel, fill=brand)) + geom_histogram(color="grey", bins=20) + ggtitle("Brand - Education Level")
Plot Brand - Zip Code
ggplot(complete, aes(x=zipcode, fill=brand)) + geom_histogram(color="grey", bins=20) + ggtitle("Brand - Zip Code")
Plot Brand - Salary + Age
p <- ggplot(complete, aes(x=salary, y=age, colour = brand))
p <- p + geom_point() + ggtitle("Brand - Salary + Age")
p
This plot shows that age and salary have some relationship with the brand variable. We can split age into 3 bins and salary into 5 bins to see it more clearly, as sketched below.
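A minimal sketch of that binning, using base R's cut() on a copy of the data (equal-width breaks are an assumption here, not a choice made in the analysis):
#bin age into 3 equal-width groups and salary into 5, on a copy so the original columns stay untouched
binned <- transform(complete,
                    bin_age    = cut(age, breaks = 3),
                    bin_salary = cut(salary, breaks = 5))
#brand share per salary bin, one panel per age bin
ggplot(binned, aes(x = bin_salary, fill = brand)) +
  geom_bar(position = "fill") +
  facet_wrap(~bin_age) +
  ggtitle("Brand share by salary bin, per age bin")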
A decision tree is a very useful method for determining the relevant variables in a data set. Below you can find the code for the decision tree and its output:
ct1<-ctree(brand~., data=complete, controls = ctree_control(maxdepth=3))
plot(ct1, main="Plot Decision Tree")
According to this decision tree, brand as the dependent variable is related to the salary and age variables. We could use these variables to train the models and then check the accuracy and kappa results.
set.seed(123) # set random seed (random selection can be reproduced)
# create the training partition that is 75% of total obs
inTraining <- createDataPartition(complete$brand, p = .75, list = FALSE)
trainSet <- complete[inTraining,]
testSet <- complete[-inTraining,]
# str(trainSet) # 7501 obs of 7 vars (I have Age and bin_Age; I should remove the first)
# str(testSet) # 2499 obs of 7 var
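Since createDataPartition samples within the levels of brand, the class balance should carry over to both partitions; a quick check:
#brand proportions should be nearly identical in both partitions
round(prop.table(table(trainSet$brand)), 4)
round(prop.table(table(testSet$brand)), 4)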
#Train Control | using repeated cross-validation (10 folds, 5 repeats)
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 5)
#KNN1 with all variables except car | because I consider that this variable has no correlation with brand
# KNNfit1<-train(brand~.-car, data= trainSet, method="knn", trControl=fitControl,
# preProcess=c("center", "scale"), tuneLength=5)
# KNNfit1
# k Accuracy Kappa
# 13 0.8180 0.6130
#KNN2 with Salary and Age variables
# KNNfit2<-train(brand~salary+age, data= trainSet, method="knn", trControl=fitControl,
# preProcess=c("center", "scale"), tuneLength=5)
# KNNfit2
# k Accuracy Kappa
# 13 0.9167 0.8233
# KNN3 with Salary variable
# KNNfit3<-train(brand~salary, data= trainSet, method="knn", trControl=fitControl,
# preProcess=c("center", "scale"), tuneLength=5)
# KNNfit3
# k Accuracy Kappa
# 13 0.7081 0.3737
#RF1 with all variables -car
# RFfit1 <-train(brand~.-car, data= trainSet, method="parRF",trControl=fitControl, ntree=50, do.trace=10)
# RFfit1
# mtry Accuracy Kappa
# 19 0.9190 0.8281
#RF2 with Salary and Age variables
# RFfit2 <-train(brand~salary+age, data= trainSet, method="parRF",trControl=fitControl, ntree=50, do.trace=10)
# RFfit2
# Accuracy Kappa
# 0.9045 0.7972
#RF3 with Salary variable
# RFfit3 <-train(brand~salary, data= trainSet, method="parRF",trControl=fitControl, ntree=50, do.trace=10)
# RFfit3
# Accuracy Kappa
# 0.6415 0.2378
#CTree1 with all variables -car
# ctreefit1 <- train(brand~.-car, data = trainSet, method = "ctree", trControl = fitControl)
# ctreefit1
# mincriterion Accuracy Kappa
# 0.01 0.9142 0.8179
#CTree2 with Salary and Age variables
# ctreefit2 <- train(brand~salary+age, data = trainSet, method = "ctree", trControl = fitControl)
# ctreefit2
# mincriterion Accuracy Kappa
# 0.01 0.9147 0.8188
#CTree3 with Salary variable
# ctreefit3 <- train(brand~salary, data = trainSet, method = "ctree", trControl = fitControl)
# ctreefit3
# mincriterion Accuracy Kappa
# 0.01 0.7263 0.4182
#C5.0 with all variables -car
# C50fit1 <- train(brand~.-car, data= trainSet, method="C5.0", trControl=fitControl)
# C50fit1
# model winnow trials Accuracy Kappa
# rules TRUE 20 0.9186 0.8267
#C5.0 with Salary and Age variables
# C50fit2 <- train(brand~salary+age, data= trainSet, method="C5.0", trControl=fitControl)
# C50fit2
# model winnow trials Accuracy Kappa
# tree TRUE 20 0.9186 0.8275
#C5.0 with Salary variable
# C50fit3 <- train(brand~salary, data= trainSet, method="C5.0", trControl=fitControl)
# C50fit3
# model winnow trials Accuracy Kappa
# rules TRUE 1 0.7243 0.4219
# results <- resamples(list(KNN1=KNNfit1, KNN2=KNNfit2, KNN3=KNNfit3,RF1=RFfit1,RF2=RFfit2,RF3=RFfit3,
# CTree1=ctreefit1,CTree2=ctreefit2,CTree3=ctreefit3, C50fit1=C50fit1, C50fit2=C50fit2, C50fit3=C50fit3))
# summary(results)
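Besides summary(), caret can draw the resample distributions directly; a minimal sketch, commented out like the chunk above since it needs the results object to have been built:
# bwplot(results, metric = c("Accuracy", "Kappa"))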
[Output: summary(results) table of Accuracy and Kappa for all 12 models]
This table shows the results of all the models trained above. Comparing their median metrics, the RF1 model has the highest values, with 0.9179 accuracy and a 0.8249 kappa statistic. For this reason, we will apply this model to the test set and then check the results again. If the results are good enough to predict the customers' brand preference, we will apply this model to the “incomplete” data set to predict the missing preferences.
# testPredRF1<-predict(RFfit1,testSet)
# testPredKNN2<-predict(KNNfit2,testSet)
# testPredCTree2<-predict(ctreefit2,testSet)
# testPredC50fit1<-predict(C50fit1,testSet)
# testPredC50fit2<-predict(C50fit2,testSet)
# predRF1<-postResample(testPredRF1, testSet$brand)
# predRF1
# Accuracy Kappa
# 0.9187551 0.8281817
# predKNN2<-postResample(testPredKNN2, testSet$brand)
# predKNN2
# Accuracy Kappa
# 0.9244139 0.8398827
# predCTree2<-postResample(testPredCTree2, testSet$brand)
# predCTree2
# Accuracy Kappa
# 0.9232013 0.8362564
# PredC50fit1<-postResample(testPredC50fit1, testSet$brand)
# PredC50fit1
# Accuracy Kappa
# 0.9308812 0.8533993
# PredC50fit2<-postResample(testPredC50fit2, testSet$brand)
# PredC50fit2
# Accuracy Kappa
# 0.9232013 0.8375509
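Beyond overall accuracy and kappa, caret's confusionMatrix gives per-class detail on the same held-out predictions; a minimal sketch for RF1, commented out like the chunks above:
# confusionMatrix(testPredRF1, testSet$brand)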
# Pred_Incomp <-predict(RFfit1, newdata = incomplete)
#
# summary(Pred_Incomp)
## acer sony
## 1901 3099
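As a final sanity check, the predicted labels could be written back into the incomplete survey and combined with the complete responses to estimate the overall brand split; a sketch assuming the objects above, commented out like the chunks above:
# incomplete$brand <- Pred_Incomp
# round(prop.table(table(complete$brand) + table(incomplete$brand)), 4)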