This report describe prediction the customers and classify the customers into three segments using Machine Learning Algorithm.
The dataset using in this report for modeling is An automobile company. The dataset is hosted in Kaggle. It can be downloaded here: https://www.kaggle.com/kaushiksuresh147/customer-segmentation
The report is structured as follows: 1. Data Extraction
2. Exploratory Data Analysis (EDA)
3. Data Preparation
4. Modeling
5. Evaluation
6. Recommendation
rm(list = ls())
library(ggplot2)
library(gridExtra)
library(corrgram)
library(tidyverse)
library(dplyr)
library(scales)
library(treemapify)
library(party)
library(randomForest)
library(e1071)
library(caret)
library(tibble)
customer_df <- read.csv("../data/Train.csv")
test_df <- read.csv("../data/Test.csv")
## statistical summary
summary(customer_df)
## ID Gender Ever_Married Age
## Min. :458982 Length:8068 Length:8068 Min. :18.00
## 1st Qu.:461241 Class :character Class :character 1st Qu.:30.00
## Median :463473 Mode :character Mode :character Median :40.00
## Mean :463479 Mean :43.47
## 3rd Qu.:465744 3rd Qu.:53.00
## Max. :467974 Max. :89.00
##
## Graduated Profession Work_Experience Spending_Score
## Length:8068 Length:8068 Min. : 0.000 Length:8068
## Class :character Class :character 1st Qu.: 0.000 Class :character
## Mode :character Mode :character Median : 1.000 Mode :character
## Mean : 2.642
## 3rd Qu.: 4.000
## Max. :14.000
## NA's :829
## Family_Size Var_1 Segmentation
## Min. :1.00 Length:8068 Length:8068
## 1st Qu.:2.00 Class :character Class :character
## Median :3.00 Mode :character Mode :character
## Mean :2.85
## 3rd Qu.:4.00
## Max. :9.00
## NA's :335
summary(test_df)
## ID Gender Ever_Married Age
## Min. :458989 Length:2627 Length:2627 Min. :18.00
## 1st Qu.:461163 Class :character Class :character 1st Qu.:30.00
## Median :463379 Mode :character Mode :character Median :41.00
## Mean :463434 Mean :43.65
## 3rd Qu.:465696 3rd Qu.:53.00
## Max. :467968 Max. :89.00
##
## Graduated Profession Work_Experience Spending_Score
## Length:2627 Length:2627 Min. : 0.000 Length:2627
## Class :character Class :character 1st Qu.: 0.000 Class :character
## Mode :character Mode :character Median : 1.000 Mode :character
## Mean : 2.553
## 3rd Qu.: 4.000
## Max. :14.000
## NA's :269
## Family_Size Var_1
## Min. :1.000 Length:2627
## 1st Qu.:2.000 Class :character
## Median :2.000 Mode :character
## Mean :2.825
## 3rd Qu.:4.000
## Max. :9.000
## NA's :113
## structure of data
str(customer_df)
## 'data.frame': 8068 obs. of 11 variables:
## $ ID : int 462809 462643 466315 461735 462669 461319 460156 464347 465015 465176 ...
## $ Gender : chr "Male" "Female" "Female" "Male" ...
## $ Ever_Married : chr "No" "Yes" "Yes" "Yes" ...
## $ Age : int 22 38 67 67 40 56 32 33 61 55 ...
## $ Graduated : chr "No" "Yes" "Yes" "Yes" ...
## $ Profession : chr "Healthcare" "Engineer" "Engineer" "Lawyer" ...
## $ Work_Experience: num 1 NA 1 0 NA 0 1 1 0 1 ...
## $ Spending_Score : chr "Low" "Average" "Low" "High" ...
## $ Family_Size : num 4 3 1 2 6 2 3 3 3 4 ...
## $ Var_1 : chr "Cat_4" "Cat_4" "Cat_6" "Cat_6" ...
## $ Segmentation : chr "D" "A" "B" "B" ...
str(test_df)
## 'data.frame': 2627 obs. of 10 variables:
## $ ID : int 458989 458994 458996 459000 459001 459003 459005 459008 459013 459014 ...
## $ Gender : chr "Female" "Male" "Female" "Male" ...
## $ Ever_Married : chr "Yes" "Yes" "Yes" "Yes" ...
## $ Age : int 36 37 69 59 19 47 61 47 50 19 ...
## $ Graduated : chr "Yes" "Yes" "No" "No" ...
## $ Profession : chr "Engineer" "Healthcare" "" "Executive" ...
## $ Work_Experience: num 0 8 0 11 NA 0 5 1 2 0 ...
## $ Spending_Score : chr "Low" "Average" "Low" "High" ...
## $ Family_Size : num 1 4 1 2 4 5 3 3 4 4 ...
## $ Var_1 : chr "Cat_6" "Cat_6" "Cat_6" "Cat_6" ...
customer_df = customer_df[ , -1]
customer_df = customer_df[ , -9]
customer_df = customer_df[ , -9]
test_df = test_df[ , -1]
test_df = test_df[ , -9]
is.na(customer_df$Ever_Married) #ubah yang kosong jadi NA
customer_df$Ever_Married[customer_df$Ever_Married == ""]
customer_df <- customer_df %>%
mutate(Ever_Married =
replace(Ever_Married, Ever_Married == "", NA))
test_df$Ever_Married[test_df$Ever_Married == ""]
test_df <- test_df %>%
mutate(Ever_Married =
replace(Ever_Married, Ever_Married == "", NA))
customer_df <- customer_df %>%
mutate(Graduated =
replace(Graduated, Graduated == "", NA))
customer_df <- customer_df[complete.cases(customer_df),]
test_df <- test_df %>%
mutate(Graduated =
replace(Graduated, Graduated == "", NA))
customer_df <- customer_df[complete.cases(customer_df),]
customer_df <- customer_df %>%
mutate(Profession =
replace(Profession, Profession == "", NA))
test_df <- test_df %>%
mutate(Profession =
replace(Profession, Profession == "", NA))
test_df$Ever_Married[test_df$Ever_Married == ""]
test_df <- test_df %>%
mutate(Ever_Married =
replace(Ever_Married, Ever_Married == "", NA))
customer_df$Work_Experience[customer_df$Work_Experience == ""]
customer_df <- customer_df %>%
mutate(Work_Experience =
replace(Work_Experience, Work_Experience == "", NA))
test_df$Work_Experience[test_df$Work_Experience == ""]
test_df <- test_df %>%
mutate(Work_Experience =
replace(Work_Experience, Work_Experience == "", NA))
customer_df$Age[customer_df$Age == ""]
customer_df <- customer_df %>%
mutate(Age =
replace(Age, Age == "", NA))
test_df$Age[test_df$Age == ""]
test_df <- test_df %>%
mutate(Age =
replace(Age, Age == "", NA))
customer_df$Family_Size[customer_df$Family_Size == ""]
customer_df <- customer_df %>%
mutate(Family_Size =
replace(Family_Size, Family_Size == "", NA))
test_df$Family_Size[test_df$Family_Size == ""]
test_df <- test_df %>%
mutate(Family_Size =
replace(Family_Size, Family_Size == "", NA))
customer_df$Family_Size[customer_df$Family_Size == ""]
customer_df <- customer_df %>%
mutate(Family_Size =
replace(Family_Size, Family_Size == "", NA))
test_df$Family_Size[test_df$Family_Size == ""]
test_df <- test_df %>%
mutate(Family_Size =
replace(Family_Size, Family_Size == "", NA))
customer_df <- customer_df[complete.cases(customer_df),]
test_df <- test_df[complete.cases(test_df),]
test_df$Family_Size <- as.numeric(test_df$Family_Size)
test_df$Work_Experience <- as.numeric(test_df$Work_Experience)
test_df$Gender <- as.factor(test_df$Gender)
test_df$Ever_Married <- as.factor(test_df$Ever_Married)
test_df$Age <- as.numeric(test_df$Age)
test_df$Graduated <- as.factor(test_df$Graduated)
test_df$Spending_Score <- as.factor(test_df$Spending_Score)
test_df$Profession <- as.factor(test_df$Profession)
customer_df$Family_Size <- as.numeric(customer_df$Family_Size)
customer_df$Work_Experience <- as.numeric(customer_df$Work_Experience)
customer_df$Gender <- as.factor(customer_df$Gender)
customer_df$Ever_Married <- as.factor(customer_df$Ever_Married)
customer_df$Age <- as.numeric(customer_df$Age)
customer_df$Graduated <- as.factor(customer_df$Graduated)
customer_df$Spending_Score <- as.factor(customer_df$Spending_Score)
customer_df$Profession <- as.factor(customer_df$Profession)
pl1 <- ggplot(customer_df, aes(x = Spending_Score, fill = Spending_Score),) +
geom_bar() +
stat_count(geom = "text", color = "white", size = 3,
aes(label = ..count..), position=position_stack(vjust = 0.5)) +
labs(title = "Customers Based on Spending_Score",
x = "Spending_Score", y = "Customers") +
theme(plot.title = element_text(hjust = 0.5))
pl2 <- ggplot(customer_df, aes(x = Gender, fill = Gender),) +
geom_bar() +
stat_count(geom = "text", color = "white", size = 3,
aes(label = ..count..), position=position_stack(vjust = 0.5)) +
labs(title = "Customers Based on Gender",
x = "Gender", y = "Customers") +
theme(plot.title = element_text(hjust = 0.5))
pl3 <- ggplot(customer_df, aes(x = Graduated, fill = Graduated),) +
geom_bar() +
stat_count(geom = "text", color = "white", size = 3,
aes(label = ..count..), position=position_stack(vjust = 0.5)) +
labs(title = "Customers Based on Graduated Status",
x = "Graduated", y = "Customers") +
theme(plot.title = element_text(hjust = 0.5))
pl4 <- ggplot(customer_df, aes(x = Profession, fill = Profession),) +
geom_bar() +
stat_count(geom = "text", color = "white", size = 3,
aes(label = ..count..), position=position_stack(vjust = 0.5)) +
labs(title = "Customers Based on Profession",
x = "Profession", y = "Customers") +
theme(plot.title = element_text(hjust = 0.5))
pl5 <-ggplot(customer_df, aes(x = Family_Size, fill = Family_Size),) +
geom_bar() +
stat_count(geom = "text", color = "white", size = 3,
aes(label = ..count..), position=position_stack(vjust = 0.5)) +
labs(title = "Customers Based on Family_Size",
x = "Family_Size", y = "Customers") +
theme(plot.title = element_text(hjust = 0.5))
pl1
pl2
pl3
pl4
pl5
grid.arrange(pl1,pl2,pl3,pl4,pl5)
p1 <- ggplot(customer_df, aes(x=Spending_Score, fill = Ever_Married)) +
geom_bar(position = "stack") +
stat_count(geom = "text", color = "white", size = 3.5,
aes(label = ..count..), position=position_stack(vjust = 0.5)) +
labs(title = "Customer Spending_Score by Ever Married") +
theme(plot.title = element_text(hjust = 0.5))
p1
p2 <- ggplot(customer_df, aes(x=Spending_Score, fill = Gender)) +
geom_bar(position = "stack") +
stat_count(geom = "text", color = "white", size = 3.5,
aes(label = ..count..), position=position_stack(vjust = 0.5)) +
labs(title = "Customer Spending_Score by Gender") +
theme(plot.title = element_text(hjust = 0.5))
p2
p3 <- ggplot(customer_df, aes(x=Profession, fill = Spending_Score)) +
geom_bar(position = "stack") +
stat_count(geom = "text", color = "white", size = 3.5,
aes(label = ..count..), position=position_stack(vjust = 0.5)) +
labs(title = "Customer Profession by Spending Score") +
theme(plot.title = element_text(hjust = 0.5))
p3
p4 <- ggplot(customer_df, aes(x=Graduated, fill = Spending_Score)) +
geom_bar(position = "stack") +
stat_count(geom = "text", color = "white", size = 3.5,
aes(label = ..count..), position=position_stack(vjust = 0.5)) +
labs(title = "Customer Graduated by Spending Score") +
theme(plot.title = element_text(hjust = 0.5))
p4
p5 <- ggplot(customer_df, aes(x=Family_Size, fill = Spending_Score)) +
geom_bar(position = "stack") +
stat_count(geom = "text", color = "white", size = 3.5,
aes(label = ..count..), position=position_stack(vjust = 0.5)) +
labs(title = "Customer Family_Size by Spending Score") +
theme(plot.title = element_text(hjust = 0.5))
p5
grid.arrange(p1,p2,p3,p4,p5)
ggplot(customer_df, aes(x=Age, y=Family_Size, color=Spending_Score,
shape=Graduated)) +
geom_point() +
geom_jitter() +
facet_grid(~Profession) +
facet_wrap(~ Spending_Score) +
labs(title = "Customer Spending_Score by Age, Graduated and Family Size") +
theme(plot.title = element_text(hjust = 0.5))
Data cleaning has been done when we want to do Exploratory Data Analysis so we don’t need to do it again.
We dont need to divided the dataset because the company already provided the training and testing data.
Create classification model using Decision Tree, Random Forest and Support Vector Machine (SVM). We will create three models: without PCA & One Hot Encoding, with PCA, with OHE and with PCA & OHE.
library(party)
model.dt <- ctree(formula = Spending_Score ~ .,
data = customer_df)
model.dt
plot(model.dt)
pred.dt <- predict(model.dt, test_df)
pred.dt
cm.dt <- table(test_df$Spending_Score, pred.dt,
dnn = c("Actual", "Predicted"))
cm.dt
## Predicted
## Actual Average High Low
## Average 475 28 26
## High 84 203 24
## Low 172 48 1118
set.seed(2021)
model.forest <- randomForest(formula = Spending_Score ~ .,
data = customer_df)
pred.forest <- predict(model.forest, test_df)
cm.forest <- table(test_df$Spending_Score, pred.forest,
dnn = c("Actual", "Predicted"))
cm.forest
## Predicted
## Actual Average High Low
## Average 496 27 6
## High 87 216 8
## Low 185 52 1101
model.svm <- svm(formula = Spending_Score ~ .,
data = customer_df)
pred.svm <- predict(model.svm, test_df)
cm.svm <- table(test_df$Spending_Score, pred.svm,
dnn = c("Actual", "Predicted"))
cm.svm
## Predicted
## Actual Average High Low
## Average 490 23 16
## High 98 194 19
## Low 190 48 1100
confusionMatrix(pred.dt, test_df$Spending_Score )
## Confusion Matrix and Statistics
##
## Reference
## Prediction Average High Low
## Average 475 84 172
## High 28 203 48
## Low 26 24 1118
##
## Overall Statistics
##
## Accuracy : 0.8246
## 95% CI : (0.808, 0.8404)
## No Information Rate : 0.6143
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6927
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: Average Class: High Class: Low
## Sensitivity 0.8979 0.6527 0.8356
## Specificity 0.8448 0.9593 0.9405
## Pos Pred Value 0.6498 0.7276 0.9572
## Neg Pred Value 0.9627 0.9431 0.7822
## Prevalence 0.2429 0.1428 0.6143
## Detection Rate 0.2181 0.0932 0.5133
## Detection Prevalence 0.3356 0.1281 0.5363
## Balanced Accuracy 0.8713 0.8060 0.8880
confusionMatrix(pred.svm, test_df$Spending_Score )
## Confusion Matrix and Statistics
##
## Reference
## Prediction Average High Low
## Average 490 98 190
## High 23 194 48
## Low 16 19 1100
##
## Overall Statistics
##
## Accuracy : 0.8191
## 95% CI : (0.8023, 0.8351)
## No Information Rate : 0.6143
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6858
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: Average Class: High Class: Low
## Sensitivity 0.9263 0.62379 0.8221
## Specificity 0.8253 0.96197 0.9583
## Pos Pred Value 0.6298 0.73208 0.9692
## Neg Pred Value 0.9721 0.93884 0.7718
## Prevalence 0.2429 0.14279 0.6143
## Detection Rate 0.2250 0.08907 0.5051
## Detection Prevalence 0.3572 0.12167 0.5211
## Balanced Accuracy 0.8758 0.79288 0.8902
confusionMatrix(pred.forest, test_df$Spending_Score )
## Confusion Matrix and Statistics
##
## Reference
## Prediction Average High Low
## Average 496 87 185
## High 27 216 52
## Low 6 8 1101
##
## Overall Statistics
##
## Accuracy : 0.8324
## 95% CI : (0.8161, 0.8479)
## No Information Rate : 0.6143
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7113
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: Average Class: High Class: Low
## Sensitivity 0.9376 0.69453 0.8229
## Specificity 0.8351 0.95769 0.9833
## Pos Pred Value 0.6458 0.73220 0.9874
## Neg Pred Value 0.9766 0.94955 0.7770
## Prevalence 0.2429 0.14279 0.6143
## Detection Rate 0.2277 0.09917 0.5055
## Detection Prevalence 0.3526 0.13545 0.5119
## Balanced Accuracy 0.8863 0.82611 0.9031
acc_df <- data.frame(models = c("Decision Tree",
"Support Vector Machine",
"Random Forest"),
racc = c("0.8246",
"0.8191",
"0.8324"))
summary_model <- ggplot(acc_df, aes(x = models,
y = racc,
fill = models)) +
geom_bar(stat = "identity")
summary_model
customer_df2 <- customer_df
test_df2 <- test_df
Gender_df <- data.frame(customer_df2$Gender)
colnames(Gender_df) <- "Gender"
df1 <- dummyVars("~.", data = Gender_df)
df2 <- data.frame(predict(df1, newdata = Gender_df))
customer_df2 <- cbind(customer_df2, df2)
customer_df2$Gender<- NULL
Ever_Married_df <- data.frame(customer_df2$Ever_Married)
colnames(Ever_Married_df) <- "Ever_Married"
df3 <- dummyVars("~.", data = Ever_Married_df)
df4 <- data.frame(predict(df3, newdata = Ever_Married_df))
customer_df2 <- cbind(customer_df2, df4)
customer_df2$Ever_Married<- NULL
Graduated_df <- data.frame(customer_df2$Graduated)
colnames(Graduated_df) <- "Graduated"
df5 <- dummyVars("~.", data = Graduated_df)
df6 <- data.frame(predict(df5, newdata = Graduated_df))
customer_df2 <- cbind(customer_df2, df6)
customer_df2$Graduated<- NULL
Profession_df <- data.frame(customer_df2$Profession)
colnames(Profession_df) <- "Profession"
df7 <- dummyVars("~.", data = Profession_df)
df8 <- data.frame(predict(df7, newdata = Profession_df))
customer_df2 <- cbind(customer_df2, df8)
customer_df2$Profession<- NULL
Gender_df2 <- data.frame(test_df2$Gender)
colnames(Gender_df2) <- "Gender"
df11 <- dummyVars("~.", data = Gender_df2)
df12 <- data.frame(predict(df1, newdata = Gender_df2))
test_df2 <- cbind(test_df2, df12)
test_df2$Gender<- NULL
Ever_Married_df2 <- data.frame(test_df2$Ever_Married)
colnames(Ever_Married_df2) <- "Ever_Married"
df13 <- dummyVars("~.", data = Ever_Married_df2)
df14 <- data.frame(predict(df13, newdata = Ever_Married_df2))
test_df2 <- cbind(test_df2, df14)
test_df2$Ever_Married<- NULL
"
Graduated_df2 <- data.frame(test_df2$Graduated)
colnames(Graduated_df2) <- "Graduated"
df15 <- dummyVars("~.", data = Graduated_df2)
df16 <- data.frame(predict(df15, newdata = Graduated_df2))
test_df2 <- cbind(test_df2, df16)
test_df2$Graduated<- NULL
Profession_df2 <- data.frame(test_df2$Profession)
colnames(Profession_df2) <- "Profession"
df17 <- dummyVars("~.", data = Profession_df2)
df18 <- data.frame(predict(df17, newdata = Profession_df2))
test_df2 <- cbind(test_df2, df18)
test_df2$Profession<- NULL
library(party)
model.dt_OHE <- ctree(formula = Spending_Score ~ .,
data = customer_df2)
model.dt_OHE
plot(model.dt_OHE)
pred.dt_OHE <- predict(model.dt_OHE, test_df2)
pred.dt_OHE
cm.dt_OHE <- table(test_df2$Spending_Score, pred.dt_OHE,
dnn = c("Actual", "Predicted"))
cm.dt_OHE
## Predicted
## Actual Average High Low
## Average 488 28 13
## High 92 206 13
## Low 184 51 1103
set.seed(2021)
model.forest_OHE <- randomForest(formula = Spending_Score ~ .,
data = customer_df2)
model.forest_OHE
pred.forest_OHE <- predict(model.forest_OHE, test_df2)
cm.forest_OHE <- table(test_df2$Spending_Score, pred.forest_OHE,
dnn = c("Actual", "Predicted"))
cm.forest
## Predicted
## Actual Average High Low
## Average 496 27 6
## High 87 216 8
## Low 185 52 1101
model.svm_OHE <- svm(formula = Spending_Score ~ .,
data = customer_df2)
model.svm_OHE
pred.svm_OHE <- predict(model.svm_OHE, test_df2)
cm.svm_OHE <- table(test_df2$Spending_Score, pred.svm_OHE,
dnn = c("Actual", "Predicted"))
cm.svm_OHE
## Predicted
## Actual Average High Low
## Average 480 22 27
## High 92 196 23
## Low 176 51 1111
confusionMatrix(pred.dt_OHE, test_df2$Spending_Score )
## Confusion Matrix and Statistics
##
## Reference
## Prediction Average High Low
## Average 488 92 184
## High 28 206 51
## Low 13 13 1103
##
## Overall Statistics
##
## Accuracy : 0.8251
## 95% CI : (0.8085, 0.8408)
## No Information Rate : 0.6143
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6972
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: Average Class: High Class: Low
## Sensitivity 0.9225 0.66238 0.8244
## Specificity 0.8326 0.95769 0.9690
## Pos Pred Value 0.6387 0.72281 0.9770
## Neg Pred Value 0.9710 0.94453 0.7760
## Prevalence 0.2429 0.14279 0.6143
## Detection Rate 0.2241 0.09458 0.5064
## Detection Prevalence 0.3508 0.13085 0.5184
## Balanced Accuracy 0.8776 0.81003 0.8967
confusionMatrix(pred.svm_OHE, test_df2$Spending_Score )
## Confusion Matrix and Statistics
##
## Reference
## Prediction Average High Low
## Average 480 92 176
## High 22 196 51
## Low 27 23 1111
##
## Overall Statistics
##
## Accuracy : 0.8205
## 95% CI : (0.8037, 0.8364)
## No Information Rate : 0.6143
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6859
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: Average Class: High Class: Low
## Sensitivity 0.9074 0.63023 0.8303
## Specificity 0.8375 0.96090 0.9405
## Pos Pred Value 0.6417 0.72862 0.9569
## Neg Pred Value 0.9657 0.93976 0.7768
## Prevalence 0.2429 0.14279 0.6143
## Detection Rate 0.2204 0.08999 0.5101
## Detection Prevalence 0.3434 0.12351 0.5331
## Balanced Accuracy 0.8724 0.79556 0.8854
confusionMatrix(pred.forest_OHE, test_df2$Spending_Score )
## Confusion Matrix and Statistics
##
## Reference
## Prediction Average High Low
## Average 491 90 184
## High 25 207 46
## Low 13 14 1108
##
## Overall Statistics
##
## Accuracy : 0.8292
## 95% CI : (0.8127, 0.8448)
## No Information Rate : 0.6143
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7036
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: Average Class: High Class: Low
## Sensitivity 0.9282 0.66559 0.8281
## Specificity 0.8338 0.96197 0.9679
## Pos Pred Value 0.6418 0.74460 0.9762
## Neg Pred Value 0.9731 0.94526 0.7795
## Prevalence 0.2429 0.14279 0.6143
## Detection Rate 0.2254 0.09504 0.5087
## Detection Prevalence 0.3512 0.12764 0.5211
## Balanced Accuracy 0.8810 0.81378 0.8980
acc_OHE_df <- data.frame(models_OHE = c("Decision Tree OHE",
"Support Vector Machine OHE",
"Random Forest OHE"),
racc_OHE = c("0.8251",
"0.8205",
"0.8292"))
summary_model_OHE <- ggplot(acc_OHE_df, aes(x = models_OHE,
y = racc_OHE,
fill = models_OHE)) +
geom_bar(stat = "identity")
summary_model_OHE
grid.arrange(summary_model,summary_model_OHE)
pca_df <- prcomp(customer_df2 %>% select(-Spending_Score), scale = TRUE)
customer_df3 <- tbl_df(pca_df$x)
test_df3 <- tbl_df(predict(pca_df, newdata = test_df2 %>% select(-Spending_Score)))
dim(pca_df$x)
## [1] 6718 18
biplot(pca_df, scale = 0)
## Variance
pr.var <- pca_df$sdev^2
pr.var
## [1] 3.493869e+00 2.397251e+00 1.905694e+00 1.292728e+00 1.246989e+00
## [6] 1.125133e+00 1.110051e+00 1.056509e+00 9.684018e-01 9.268617e-01
## [11] 7.995518e-01 7.829040e-01 6.121335e-01 2.819218e-01 2.149223e-29
## [16] 7.153723e-30 1.231423e-30 8.138717e-32
pve <- pr.var / sum (pr.var)
pve
plot(pve, type = "b", ylim = c(0,1),
xlab = "Principal Component",
ylab = "PVE")
plot(cumsum(pve), type = "b", ylim = c(0,1),
xlab = "Principal Component",
ylab = "Cumulative PVE")
cumsum(pve)
Spending_Score <- customer_df$Spending_Score
customer_df3 <- cbind(customer_df3[1:15], Spending_Score)
Spending_Score <- test_df$Spending_Score
test_df3 <- cbind(test_df3[1:15], Spending_Score)
model.dt_OHE_PCA <- ctree(formula = Spending_Score ~ .,
data = customer_df3)
model.dt_OHE_PCA
plot(model.dt_OHE_PCA)
### Predict Decision Tree with OHE and PCA
pred.dt_OHE_PCA <- predict(model.dt_OHE_PCA, test_df3)
pred.dt_OHE_PCA
cm.dt_OHE_PCA <- table(test_df3$Spending_Score, pred.dt_OHE_PCA,
dnn = c("Actual", "Predicted"))
cm.dt_OHE_PCA
## Predicted
## Actual Average High Low
## Average 437 34 58
## High 72 197 42
## Low 175 48 1115
set.seed(2021)
model.forest_OHE_PCA <- randomForest(formula = Spending_Score ~ .,
data = customer_df3)
pred.forest_OHE_PCA <- predict(model.forest_OHE_PCA, test_df3)
cm.forest_OHE_PCA <- table(test_df3$Spending_Score, pred.forest_OHE_PCA,
dnn = c("Actual", "Predicted"))
cm.forest_OHE_PCA
## Predicted
## Actual Average High Low
## Average 411 35 83
## High 52 225 34
## Low 91 47 1200
model.svm_OHE_PCA <- svm(formula = Spending_Score ~ .,
data = customer_df3)
pred.svm_OHE_PCA <- predict(model.svm_OHE_PCA, test_df3)
cm.svm_OHE_PCA <- table(test_df3$Spending_Score, pred.svm_OHE_PCA,
dnn = c("Actual", "Predicted"))
cm.svm_OHE_PCA
## Predicted
## Actual Average High Low
## Average 474 22 33
## High 86 201 24
## Low 176 52 1110
confusionMatrix(pred.dt_OHE_PCA, test_df3$Spending_Score )
## Confusion Matrix and Statistics
##
## Reference
## Prediction Average High Low
## Average 437 72 175
## High 34 197 48
## Low 58 42 1115
##
## Overall Statistics
##
## Accuracy : 0.803
## 95% CI : (0.7857, 0.8195)
## No Information Rate : 0.6143
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.65
##
## Mcnemar's Test P-Value : 1.087e-15
##
## Statistics by Class:
##
## Class: Average Class: High Class: Low
## Sensitivity 0.8261 0.63344 0.8333
## Specificity 0.8502 0.95608 0.8810
## Pos Pred Value 0.6389 0.70609 0.9177
## Neg Pred Value 0.9384 0.93997 0.7684
## Prevalence 0.2429 0.14279 0.6143
## Detection Rate 0.2006 0.09045 0.5119
## Detection Prevalence 0.3140 0.12810 0.5579
## Balanced Accuracy 0.8381 0.79476 0.8571
confusionMatrix(pred.svm_OHE_PCA, test_df3$Spending_Score )
## Confusion Matrix and Statistics
##
## Reference
## Prediction Average High Low
## Average 474 86 176
## High 22 201 52
## Low 33 24 1110
##
## Overall Statistics
##
## Accuracy : 0.8196
## 95% CI : (0.8028, 0.8355)
## No Information Rate : 0.6143
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6838
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: Average Class: High Class: Low
## Sensitivity 0.8960 0.64630 0.8296
## Specificity 0.8411 0.96036 0.9321
## Pos Pred Value 0.6440 0.73091 0.9512
## Neg Pred Value 0.9619 0.94220 0.7745
## Prevalence 0.2429 0.14279 0.6143
## Detection Rate 0.2176 0.09229 0.5096
## Detection Prevalence 0.3379 0.12626 0.5358
## Balanced Accuracy 0.8686 0.80333 0.8809
confusionMatrix(pred.forest_OHE_PCA, test_df3$Spending_Score )
## Confusion Matrix and Statistics
##
## Reference
## Prediction Average High Low
## Average 411 52 91
## High 35 225 47
## Low 83 34 1200
##
## Overall Statistics
##
## Accuracy : 0.843
## 95% CI : (0.827, 0.858)
## No Information Rate : 0.6143
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7127
##
## Mcnemar's Test P-Value : 0.123
##
## Statistics by Class:
##
## Class: Average Class: High Class: Low
## Sensitivity 0.7769 0.7235 0.8969
## Specificity 0.9133 0.9561 0.8607
## Pos Pred Value 0.7419 0.7329 0.9112
## Neg Pred Value 0.9273 0.9540 0.8397
## Prevalence 0.2429 0.1428 0.6143
## Detection Rate 0.1887 0.1033 0.5510
## Detection Prevalence 0.2544 0.1410 0.6047
## Balanced Accuracy 0.8451 0.8398 0.8788
a
acc_OHE_PCA_df <- data.frame(models_OHE_PCA = c("Decision Tree OHE",
"Support Vector Machine OHE",
"Random Forest OHE"),
racc_OHE_PCA = c("0.8030",
"0.8196",
"0.8430"))
summary_model_OHE_PCA <- ggplot(acc_OHE_PCA_df, aes(x = models_OHE_PCA,
y = racc_OHE_PCA,
fill = models_OHE_PCA)) +
geom_bar(stat = "identity")
summary_model_OHE_PCA
grid.arrange(summary_model,summary_model_OHE,summary_model_OHE_PCA)
The accuracy is good enough (84.16%). That can be deploy.
There are opportunities for improvements. For example, use certain formula to predict
Focus on spending_score there are so many customers who rated low score. This is important to increase sales for the new market.
From three models that we already tried, Random Forest with PCA using OHE dataset
is the best method to get better accuracy but it must still can be improve