This report describe prediction the customers and classify the customers into three segments using Machine Learning Algorithm.
The dataset using in this report for modeling is An automobile company. The dataset is hosted in Kaggle. It can be downloaded here: https://www.kaggle.com/kaushiksuresh147/customer-segmentation
The report is structured as follows: 1. Data Extraction
2. Exploratory Data Analysis (EDA)
3. Data Preparation
4. Modeling
5. Evaluation
6. Recommendation
rm(list = ls())
library(ggplot2)
library(gridExtra)
library(corrgram)
library(tidyverse)
library(dplyr)
library(scales)
library(treemapify)
library(party)
library(randomForest)
library(e1071)
library(caret)
library(tibble)
Imported library for this report
customer_df <- read.csv("../data/Train.csv")
test_df <- read.csv("../data/Test.csv")
## statistical summary
summary(customer_df)
## ID Gender Ever_Married Age
## Min. :458982 Length:8068 Length:8068 Min. :18.00
## 1st Qu.:461241 Class :character Class :character 1st Qu.:30.00
## Median :463473 Mode :character Mode :character Median :40.00
## Mean :463479 Mean :43.47
## 3rd Qu.:465744 3rd Qu.:53.00
## Max. :467974 Max. :89.00
##
## Graduated Profession Work_Experience Spending_Score
## Length:8068 Length:8068 Min. : 0.000 Length:8068
## Class :character Class :character 1st Qu.: 0.000 Class :character
## Mode :character Mode :character Median : 1.000 Mode :character
## Mean : 2.642
## 3rd Qu.: 4.000
## Max. :14.000
## NA's :829
## Family_Size Var_1 Segmentation
## Min. :1.00 Length:8068 Length:8068
## 1st Qu.:2.00 Class :character Class :character
## Median :3.00 Mode :character Mode :character
## Mean :2.85
## 3rd Qu.:4.00
## Max. :9.00
## NA's :335
summary(test_df)
## ID Gender Ever_Married Age
## Min. :458989 Length:2627 Length:2627 Min. :18.00
## 1st Qu.:461163 Class :character Class :character 1st Qu.:30.00
## Median :463379 Mode :character Mode :character Median :41.00
## Mean :463434 Mean :43.65
## 3rd Qu.:465696 3rd Qu.:53.00
## Max. :467968 Max. :89.00
##
## Graduated Profession Work_Experience Spending_Score
## Length:2627 Length:2627 Min. : 0.000 Length:2627
## Class :character Class :character 1st Qu.: 0.000 Class :character
## Mode :character Mode :character Median : 1.000 Mode :character
## Mean : 2.553
## 3rd Qu.: 4.000
## Max. :14.000
## NA's :269
## Family_Size Var_1
## Min. :1.000 Length:2627
## 1st Qu.:2.000 Class :character
## Median :2.000 Mode :character
## Mean :2.825
## 3rd Qu.:4.000
## Max. :9.000
## NA's :113
We can see the min, median, and max values of each numeric variable.
It is intersting to see that the value is NA. This could be an incorrect data.
## structure of data
str(customer_df)
## 'data.frame': 8068 obs. of 11 variables:
## $ ID : int 462809 462643 466315 461735 462669 461319 460156 464347 465015 465176 ...
## $ Gender : chr "Male" "Female" "Female" "Male" ...
## $ Ever_Married : chr "No" "Yes" "Yes" "Yes" ...
## $ Age : int 22 38 67 67 40 56 32 33 61 55 ...
## $ Graduated : chr "No" "Yes" "Yes" "Yes" ...
## $ Profession : chr "Healthcare" "Engineer" "Engineer" "Lawyer" ...
## $ Work_Experience: num 1 NA 1 0 NA 0 1 1 0 1 ...
## $ Spending_Score : chr "Low" "Average" "Low" "High" ...
## $ Family_Size : num 4 3 1 2 6 2 3 3 3 4 ...
## $ Var_1 : chr "Cat_4" "Cat_4" "Cat_6" "Cat_6" ...
## $ Segmentation : chr "D" "A" "B" "B" ...
str(test_df)
## 'data.frame': 2627 obs. of 10 variables:
## $ ID : int 458989 458994 458996 459000 459001 459003 459005 459008 459013 459014 ...
## $ Gender : chr "Female" "Male" "Female" "Male" ...
## $ Ever_Married : chr "Yes" "Yes" "Yes" "Yes" ...
## $ Age : int 36 37 69 59 19 47 61 47 50 19 ...
## $ Graduated : chr "Yes" "Yes" "No" "No" ...
## $ Profession : chr "Engineer" "Healthcare" "" "Executive" ...
## $ Work_Experience: num 0 8 0 11 NA 0 5 1 2 0 ...
## $ Spending_Score : chr "Low" "Average" "Low" "High" ...
## $ Family_Size : num 1 4 1 2 4 5 3 3 4 4 ...
## $ Var_1 : chr "Cat_6" "Cat_6" "Cat_6" "Cat_6" ...
There are 8068 observation and 11 variables in ‘data.frame’ train and 2627 and 10 variables in ‘data.frame’ test There are three type of data, int, num and char
customer_df = customer_df[ , -1]
customer_df = customer_df[ , -9]
customer_df = customer_df[ , -9]
test_df = test_df[ , -1]
test_df = test_df[ , -9]
There are nomore column that we dont need
### remove the missing value so we have to make the empty data into NA
is.na(customer_df$Ever_Married) #ubah yang kosong jadi NA
customer_df$Ever_Married[customer_df$Ever_Married == ""]
customer_df <- customer_df %>%
mutate(Ever_Married =
replace(Ever_Married, Ever_Married == "", NA))
test_df$Ever_Married[test_df$Ever_Married == ""]
test_df <- test_df %>%
mutate(Ever_Married =
replace(Ever_Married, Ever_Married == "", NA))
customer_df <- customer_df %>%
mutate(Graduated =
replace(Graduated, Graduated == "", NA))
customer_df <- customer_df[complete.cases(customer_df),]
test_df <- test_df %>%
mutate(Graduated =
replace(Graduated, Graduated == "", NA))
customer_df <- customer_df[complete.cases(customer_df),]
customer_df <- customer_df %>%
mutate(Profession =
replace(Profession, Profession == "", NA))
test_df <- test_df %>%
mutate(Profession =
replace(Profession, Profession == "", NA))
test_df$Ever_Married[test_df$Ever_Married == ""]
test_df <- test_df %>%
mutate(Ever_Married =
replace(Ever_Married, Ever_Married == "", NA))
customer_df$Work_Experience[customer_df$Work_Experience == ""]
customer_df <- customer_df %>%
mutate(Work_Experience =
replace(Work_Experience, Work_Experience == "", NA))
test_df$Work_Experience[test_df$Work_Experience == ""]
test_df <- test_df %>%
mutate(Work_Experience =
replace(Work_Experience, Work_Experience == "", NA))
customer_df$Age[customer_df$Age == ""]
customer_df <- customer_df %>%
mutate(Age =
replace(Age, Age == "", NA))
test_df$Age[test_df$Age == ""]
test_df <- test_df %>%
mutate(Age =
replace(Age, Age == "", NA))
customer_df$Family_Size[customer_df$Family_Size == ""]
customer_df <- customer_df %>%
mutate(Family_Size =
replace(Family_Size, Family_Size == "", NA))
test_df$Family_Size[test_df$Family_Size == ""]
test_df <- test_df %>%
mutate(Family_Size =
replace(Family_Size, Family_Size == "", NA))
customer_df$Family_Size[customer_df$Family_Size == ""]
customer_df <- customer_df %>%
mutate(Family_Size =
replace(Family_Size, Family_Size == "", NA))
test_df$Family_Size[test_df$Family_Size == ""]
test_df <- test_df %>%
mutate(Family_Size =
replace(Family_Size, Family_Size == "", NA))
customer_df <- customer_df[complete.cases(customer_df),]
test_df <- test_df[complete.cases(test_df),]
After remove column that we don’t need and remove missing value we got 6718 observation and 8 variables in ‘data.frame’ train and 2178 obesevation and 8 variables
test_df$Family_Size <- as.numeric(test_df$Family_Size)
test_df$Work_Experience <- as.numeric(test_df$Work_Experience)
test_df$Gender <- as.factor(test_df$Gender)
test_df$Ever_Married <- as.factor(test_df$Ever_Married)
test_df$Age <- as.numeric(test_df$Age)
test_df$Graduated <- as.factor(test_df$Graduated)
test_df$Spending_Score <- as.factor(test_df$Spending_Score)
test_df$Profession <- as.factor(test_df$Profession)
customer_df$Family_Size <- as.numeric(customer_df$Family_Size)
customer_df$Work_Experience <- as.numeric(customer_df$Work_Experience)
customer_df$Gender <- as.factor(customer_df$Gender)
customer_df$Ever_Married <- as.factor(customer_df$Ever_Married)
customer_df$Age <- as.numeric(customer_df$Age)
customer_df$Graduated <- as.factor(customer_df$Graduated)
customer_df$Spending_Score <- as.factor(customer_df$Spending_Score)
customer_df$Profession <- as.factor(customer_df$Profession)
After we change the data type there are two type: num and factor
pl1 <- ggplot(customer_df, aes(x = Spending_Score, fill = Spending_Score),) +
geom_bar() +
stat_count(geom = "text", color = "white", size = 3,
aes(label = ..count..), position=position_stack(vjust = 0.5)) +
labs(title = "Customers Based on Spending_Score",
x = "Spending_Score", y = "Customers") +
theme(plot.title = element_text(hjust = 0.5))
pl2 <- ggplot(customer_df, aes(x = Gender, fill = Gender),) +
geom_bar() +
stat_count(geom = "text", color = "white", size = 3,
aes(label = ..count..), position=position_stack(vjust = 0.5)) +
labs(title = "Customers Based on Gender",
x = "Gender", y = "Customers") +
theme(plot.title = element_text(hjust = 0.5))
pl3 <- ggplot(customer_df, aes(x = Graduated, fill = Graduated),) +
geom_bar() +
stat_count(geom = "text", color = "white", size = 3,
aes(label = ..count..), position=position_stack(vjust = 0.5)) +
labs(title = "Customers Based on Graduated Status",
x = "Graduated", y = "Customers") +
theme(plot.title = element_text(hjust = 0.5))
pl4 <- ggplot(customer_df, aes(x = Profession, fill = Profession),) +
geom_bar() +
stat_count(geom = "text", color = "white", size = 3,
aes(label = ..count..), position=position_stack(vjust = 0.5)) +
labs(title = "Customers Based on Profession",
x = "Profession", y = "Customers") +
theme(plot.title = element_text(hjust = 0.5))
pl5 <-ggplot(customer_df, aes(x = Family_Size, fill = Family_Size),) +
geom_bar() +
stat_count(geom = "text", color = "white", size = 3,
aes(label = ..count..), position=position_stack(vjust = 0.5)) +
labs(title = "Customers Based on Family_Size",
x = "Family_Size", y = "Customers") +
theme(plot.title = element_text(hjust = 0.5))
pl1
pl2
pl3
pl4
pl5
grid.arrange(pl1,pl2,pl3,pl4,pl5)
Based the plot we can see visualization of data.
Customer dominated by male customer
p1 <- ggplot(customer_df, aes(x=Spending_Score, fill = Ever_Married)) +
geom_bar(position = "stack") +
stat_count(geom = "text", color = "white", size = 3.5,
aes(label = ..count..), position=position_stack(vjust = 0.5)) +
labs(title = "Customer Spending_Score by Ever Married") +
theme(plot.title = element_text(hjust = 0.5))
p1
p2 <- ggplot(customer_df, aes(x=Spending_Score, fill = Gender)) +
geom_bar(position = "stack") +
stat_count(geom = "text", color = "white", size = 3.5,
aes(label = ..count..), position=position_stack(vjust = 0.5)) +
labs(title = "Customer Spending_Score by Gender") +
theme(plot.title = element_text(hjust = 0.5))
p2
p3 <- ggplot(customer_df, aes(x=Profession, fill = Spending_Score)) +
geom_bar(position = "stack") +
stat_count(geom = "text", color = "white", size = 3.5,
aes(label = ..count..), position=position_stack(vjust = 0.5)) +
labs(title = "Customer Profession by Spending Score") +
theme(plot.title = element_text(hjust = 0.5))
p3
p4 <- ggplot(customer_df, aes(x=Graduated, fill = Spending_Score)) +
geom_bar(position = "stack") +
stat_count(geom = "text", color = "white", size = 3.5,
aes(label = ..count..), position=position_stack(vjust = 0.5)) +
labs(title = "Customer Graduated by Spending Score") +
theme(plot.title = element_text(hjust = 0.5))
p4
p5 <- ggplot(customer_df, aes(x=Family_Size, fill = Spending_Score)) +
geom_bar(position = "stack") +
stat_count(geom = "text", color = "white", size = 3.5,
aes(label = ..count..), position=position_stack(vjust = 0.5)) +
labs(title = "Customer Family_Size by Spending Score") +
theme(plot.title = element_text(hjust = 0.5))
p5
grid.arrange(p1,p2,p3,p4,p5)
From this plot, we know the customer
Spending_Score based on several variables
ggplot(customer_df, aes(x=Age, y=Family_Size, color=Spending_Score,
shape=Graduated)) +
geom_point() +
geom_jitter() +
facet_grid(~Profession) +
facet_wrap(~ Spending_Score) +
labs(title = "Customer Spending_Score by Age, Graduated and Family Size") +
theme(plot.title = element_text(hjust = 0.5))
From this plot, we know the customer
Spending_Score based on several variables
We can see scactter customer in each Spending_Score class
Data cleaning has been done when we want to do Exploratory Data Analysis so we don’t need to do it again.
We dont need to divided the dataset because the company already provided the training and testing data.
Create classification model using Decision Tree, Random Forest and Support Vector Machine (SVM). We will create three models: without PCA & One Hot Encoding, with PCA, with OHE and with PCA & OHE.
library(party)
model.dt <- ctree(formula = Spending_Score ~ .,
data = customer_df)
model.dt
plot(model.dt)
We can see the plot of tree model
pred.dt <- predict(model.dt, test_df)
pred.dt
cm.dt <- table(test_df$Spending_Score, pred.dt,
dnn = c("Actual", "Predicted"))
cm.dt
## Predicted
## Actual Average High Low
## Average 475 28 26
## High 84 203 24
## Low 172 48 1118
set.seed(2021)
model.forest <- randomForest(formula = Spending_Score ~ .,
data = customer_df)
pred.forest <- predict(model.forest, test_df)
cm.forest <- table(test_df$Spending_Score, pred.forest,
dnn = c("Actual", "Predicted"))
cm.forest
## Predicted
## Actual Average High Low
## Average 496 27 6
## High 87 216 8
## Low 185 52 1101
model.svm <- svm(formula = Spending_Score ~ .,
data = customer_df)
pred.svm <- predict(model.svm, test_df)
cm.svm <- table(test_df$Spending_Score, pred.svm,
dnn = c("Actual", "Predicted"))
cm.svm
## Predicted
## Actual Average High Low
## Average 490 23 16
## High 98 194 19
## Low 190 48 1100
confusionMatrix(pred.dt, test_df$Spending_Score )
## Confusion Matrix and Statistics
##
## Reference
## Prediction Average High Low
## Average 475 84 172
## High 28 203 48
## Low 26 24 1118
##
## Overall Statistics
##
## Accuracy : 0.8246
## 95% CI : (0.808, 0.8404)
## No Information Rate : 0.6143
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6927
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: Average Class: High Class: Low
## Sensitivity 0.8979 0.6527 0.8356
## Specificity 0.8448 0.9593 0.9405
## Pos Pred Value 0.6498 0.7276 0.9572
## Neg Pred Value 0.9627 0.9431 0.7822
## Prevalence 0.2429 0.1428 0.6143
## Detection Rate 0.2181 0.0932 0.5133
## Detection Prevalence 0.3356 0.1281 0.5363
## Balanced Accuracy 0.8713 0.8060 0.8880
confusionMatrix(pred.svm, test_df$Spending_Score )
## Confusion Matrix and Statistics
##
## Reference
## Prediction Average High Low
## Average 490 98 190
## High 23 194 48
## Low 16 19 1100
##
## Overall Statistics
##
## Accuracy : 0.8191
## 95% CI : (0.8023, 0.8351)
## No Information Rate : 0.6143
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6858
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: Average Class: High Class: Low
## Sensitivity 0.9263 0.62379 0.8221
## Specificity 0.8253 0.96197 0.9583
## Pos Pred Value 0.6298 0.73208 0.9692
## Neg Pred Value 0.9721 0.93884 0.7718
## Prevalence 0.2429 0.14279 0.6143
## Detection Rate 0.2250 0.08907 0.5051
## Detection Prevalence 0.3572 0.12167 0.5211
## Balanced Accuracy 0.8758 0.79288 0.8902
confusionMatrix(pred.forest, test_df$Spending_Score )
## Confusion Matrix and Statistics
##
## Reference
## Prediction Average High Low
## Average 496 87 185
## High 27 216 52
## Low 6 8 1101
##
## Overall Statistics
##
## Accuracy : 0.8324
## 95% CI : (0.8161, 0.8479)
## No Information Rate : 0.6143
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7113
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: Average Class: High Class: Low
## Sensitivity 0.9376 0.69453 0.8229
## Specificity 0.8351 0.95769 0.9833
## Pos Pred Value 0.6458 0.73220 0.9874
## Neg Pred Value 0.9766 0.94955 0.7770
## Prevalence 0.2429 0.14279 0.6143
## Detection Rate 0.2277 0.09917 0.5055
## Detection Prevalence 0.3526 0.13545 0.5119
## Balanced Accuracy 0.8863 0.82611 0.9031
acc_df <- data.frame(models = c("Decision Tree",
"Support Vector Machine",
"Random Forest"),
racc = c("0.8246",
"0.8191",
"0.8324"))
summary_model <- ggplot(acc_df, aes(x = models,
y = racc,
fill = models)) +
geom_bar(stat = "identity")
summary_model
From that summary of
accuracy based from each model, and model Random Forest is the best score in accuracy
customer_df2 <- customer_df
test_df2 <- test_df
Gender_df <- data.frame(customer_df2$Gender)
colnames(Gender_df) <- "Gender"
df1 <- dummyVars("~.", data = Gender_df)
df2 <- data.frame(predict(df1, newdata = Gender_df))
customer_df2 <- cbind(customer_df2, df2)
customer_df2$Gender<- NULL
Ever_Married_df <- data.frame(customer_df2$Ever_Married)
colnames(Ever_Married_df) <- "Ever_Married"
df3 <- dummyVars("~.", data = Ever_Married_df)
df4 <- data.frame(predict(df3, newdata = Ever_Married_df))
customer_df2 <- cbind(customer_df2, df4)
customer_df2$Ever_Married<- NULL
Graduated_df <- data.frame(customer_df2$Graduated)
colnames(Graduated_df) <- "Graduated"
df5 <- dummyVars("~.", data = Graduated_df)
df6 <- data.frame(predict(df5, newdata = Graduated_df))
customer_df2 <- cbind(customer_df2, df6)
customer_df2$Graduated<- NULL
Profession_df <- data.frame(customer_df2$Profession)
colnames(Profession_df) <- "Profession"
df7 <- dummyVars("~.", data = Profession_df)
df8 <- data.frame(predict(df7, newdata = Profession_df))
customer_df2 <- cbind(customer_df2, df8)
customer_df2$Profession<- NULL
Gender_df2 <- data.frame(test_df2$Gender)
colnames(Gender_df2) <- "Gender"
df11 <- dummyVars("~.", data = Gender_df2)
df12 <- data.frame(predict(df1, newdata = Gender_df2))
test_df2 <- cbind(test_df2, df12)
test_df2$Gender<- NULL
Ever_Married_df2 <- data.frame(test_df2$Ever_Married)
colnames(Ever_Married_df2) <- "Ever_Married"
df13 <- dummyVars("~.", data = Ever_Married_df2)
df14 <- data.frame(predict(df13, newdata = Ever_Married_df2))
test_df2 <- cbind(test_df2, df14)
test_df2$Ever_Married<- NULL
"
Graduated_df2 <- data.frame(test_df2$Graduated)
colnames(Graduated_df2) <- "Graduated"
df15 <- dummyVars("~.", data = Graduated_df2)
df16 <- data.frame(predict(df15, newdata = Graduated_df2))
test_df2 <- cbind(test_df2, df16)
test_df2$Graduated<- NULL
Profession_df2 <- data.frame(test_df2$Profession)
colnames(Profession_df2) <- "Profession"
df17 <- dummyVars("~.", data = Profession_df2)
df18 <- data.frame(predict(df17, newdata = Profession_df2))
test_df2 <- cbind(test_df2, df18)
test_df2$Profession<- NULL
dim(customer_df2)
## [1] 6718 19
dim(test_df2)
## [1] 2178 19
Number of columns in customer_df is now 19 and Number of columns in test_df is now 19.
library(party)
model.dt_OHE <- ctree(formula = Spending_Score ~ .,
data = customer_df2)
model.dt_OHE
plot(model.dt_OHE)
We can see the plot of tree model with OHE
pred.dt_OHE <- predict(model.dt_OHE, test_df2)
pred.dt_OHE
cm.dt_OHE <- table(test_df2$Spending_Score, pred.dt_OHE,
dnn = c("Actual", "Predicted"))
cm.dt_OHE
## Predicted
## Actual Average High Low
## Average 488 28 13
## High 92 206 13
## Low 184 51 1103
set.seed(2021)
model.forest_OHE <- randomForest(formula = Spending_Score ~ .,
data = customer_df2)
model.forest_OHE
pred.forest_OHE <- predict(model.forest_OHE, test_df2)
cm.forest_OHE <- table(test_df2$Spending_Score, pred.forest_OHE,
dnn = c("Actual", "Predicted"))
cm.forest
## Predicted
## Actual Average High Low
## Average 496 27 6
## High 87 216 8
## Low 185 52 1101
model.svm_OHE <- svm(formula = Spending_Score ~ .,
data = customer_df2)
model.svm_OHE
pred.svm_OHE <- predict(model.svm_OHE, test_df2)
cm.svm_OHE <- table(test_df2$Spending_Score, pred.svm_OHE,
dnn = c("Actual", "Predicted"))
cm.svm_OHE
## Predicted
## Actual Average High Low
## Average 480 22 27
## High 92 196 23
## Low 176 51 1111
confusionMatrix(pred.dt_OHE, test_df2$Spending_Score )
## Confusion Matrix and Statistics
##
## Reference
## Prediction Average High Low
## Average 488 92 184
## High 28 206 51
## Low 13 13 1103
##
## Overall Statistics
##
## Accuracy : 0.8251
## 95% CI : (0.8085, 0.8408)
## No Information Rate : 0.6143
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6972
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: Average Class: High Class: Low
## Sensitivity 0.9225 0.66238 0.8244
## Specificity 0.8326 0.95769 0.9690
## Pos Pred Value 0.6387 0.72281 0.9770
## Neg Pred Value 0.9710 0.94453 0.7760
## Prevalence 0.2429 0.14279 0.6143
## Detection Rate 0.2241 0.09458 0.5064
## Detection Prevalence 0.3508 0.13085 0.5184
## Balanced Accuracy 0.8776 0.81003 0.8967
confusionMatrix(pred.svm_OHE, test_df2$Spending_Score )
## Confusion Matrix and Statistics
##
## Reference
## Prediction Average High Low
## Average 480 92 176
## High 22 196 51
## Low 27 23 1111
##
## Overall Statistics
##
## Accuracy : 0.8205
## 95% CI : (0.8037, 0.8364)
## No Information Rate : 0.6143
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6859
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: Average Class: High Class: Low
## Sensitivity 0.9074 0.63023 0.8303
## Specificity 0.8375 0.96090 0.9405
## Pos Pred Value 0.6417 0.72862 0.9569
## Neg Pred Value 0.9657 0.93976 0.7768
## Prevalence 0.2429 0.14279 0.6143
## Detection Rate 0.2204 0.08999 0.5101
## Detection Prevalence 0.3434 0.12351 0.5331
## Balanced Accuracy 0.8724 0.79556 0.8854
confusionMatrix(pred.forest_OHE, test_df2$Spending_Score )
## Confusion Matrix and Statistics
##
## Reference
## Prediction Average High Low
## Average 491 90 184
## High 25 207 46
## Low 13 14 1108
##
## Overall Statistics
##
## Accuracy : 0.8292
## 95% CI : (0.8127, 0.8448)
## No Information Rate : 0.6143
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7036
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: Average Class: High Class: Low
## Sensitivity 0.9282 0.66559 0.8281
## Specificity 0.8338 0.96197 0.9679
## Pos Pred Value 0.6418 0.74460 0.9762
## Neg Pred Value 0.9731 0.94526 0.7795
## Prevalence 0.2429 0.14279 0.6143
## Detection Rate 0.2254 0.09504 0.5087
## Detection Prevalence 0.3512 0.12764 0.5211
## Balanced Accuracy 0.8810 0.81378 0.8980
acc_OHE_df <- data.frame(models_OHE = c("Decision Tree OHE",
"Support Vector Machine OHE",
"Random Forest OHE"),
racc_OHE = c("0.8251",
"0.8205",
"0.8292"))
summary_model_OHE <- ggplot(acc_OHE_df, aes(x = models_OHE,
y = racc_OHE,
fill = models_OHE)) +
geom_bar(stat = "identity")
summary_model_OHE
grid.arrange(summary_model,summary_model_OHE)
From that summary of accuracy based from each model, and model Random Forest is the best score in accuracy
pca_df <- prcomp(customer_df2 %>% select(-Spending_Score), scale = TRUE)
customer_df3 <- tbl_df(pca_df$x)
test_df3 <- tbl_df(predict(pca_df, newdata = test_df2 %>% select(-Spending_Score)))
dim(pca_df$x)
## [1] 6718 18
biplot(pca_df, scale = 0)
## Variance
pr.var <- pca_df$sdev^2
pr.var
## [1] 3.493869e+00 2.397251e+00 1.905694e+00 1.292728e+00 1.246989e+00
## [6] 1.125133e+00 1.110051e+00 1.056509e+00 9.684018e-01 9.268617e-01
## [11] 7.995518e-01 7.829040e-01 6.121335e-01 2.819218e-01 2.149223e-29
## [16] 7.153723e-30 1.231423e-30 8.138717e-32
pve <- pr.var / sum (pr.var)
pve
plot(pve, type = "b", ylim = c(0,1),
xlab = "Principal Component",
ylab = "PVE")
plot(cumsum(pve), type = "b", ylim = c(0,1),
xlab = "Principal Component",
ylab = "Cumulative PVE")
cumsum(pve)
Spending_Score <- customer_df$Spending_Score
customer_df3 <- cbind(customer_df3[1:15], Spending_Score)
Spending_Score <- test_df$Spending_Score
test_df3 <- cbind(test_df3[1:15], Spending_Score)
We use only 15 from 18 principal component for the best result
model.dt_OHE_PCA <- ctree(formula = Spending_Score ~ .,
data = customer_df3)
model.dt_OHE_PCA
plot(model.dt_OHE_PCA)
We can see the plot of tree model with OHE and PCA
pred.dt_OHE_PCA <- predict(model.dt_OHE_PCA, test_df3)
pred.dt_OHE_PCA
cm.dt_OHE_PCA <- table(test_df3$Spending_Score, pred.dt_OHE_PCA,
dnn = c("Actual", "Predicted"))
cm.dt_OHE_PCA
## Predicted
## Actual Average High Low
## Average 437 34 58
## High 72 197 42
## Low 175 48 1115
set.seed(2021)
model.forest_OHE_PCA <- randomForest(formula = Spending_Score ~ .,
data = customer_df3)
pred.forest_OHE_PCA <- predict(model.forest_OHE_PCA, test_df3)
cm.forest_OHE_PCA <- table(test_df3$Spending_Score, pred.forest_OHE_PCA,
dnn = c("Actual", "Predicted"))
cm.forest_OHE_PCA
## Predicted
## Actual Average High Low
## Average 411 35 83
## High 52 225 34
## Low 91 47 1200
model.svm_OHE_PCA <- svm(formula = Spending_Score ~ .,
data = customer_df3)
pred.svm_OHE_PCA <- predict(model.svm_OHE_PCA, test_df3)
cm.svm_OHE_PCA <- table(test_df3$Spending_Score, pred.svm_OHE_PCA,
dnn = c("Actual", "Predicted"))
cm.svm_OHE_PCA
## Predicted
## Actual Average High Low
## Average 474 22 33
## High 86 201 24
## Low 176 52 1110
confusionMatrix(pred.dt_OHE_PCA, test_df3$Spending_Score )
## Confusion Matrix and Statistics
##
## Reference
## Prediction Average High Low
## Average 437 72 175
## High 34 197 48
## Low 58 42 1115
##
## Overall Statistics
##
## Accuracy : 0.803
## 95% CI : (0.7857, 0.8195)
## No Information Rate : 0.6143
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.65
##
## Mcnemar's Test P-Value : 1.087e-15
##
## Statistics by Class:
##
## Class: Average Class: High Class: Low
## Sensitivity 0.8261 0.63344 0.8333
## Specificity 0.8502 0.95608 0.8810
## Pos Pred Value 0.6389 0.70609 0.9177
## Neg Pred Value 0.9384 0.93997 0.7684
## Prevalence 0.2429 0.14279 0.6143
## Detection Rate 0.2006 0.09045 0.5119
## Detection Prevalence 0.3140 0.12810 0.5579
## Balanced Accuracy 0.8381 0.79476 0.8571
confusionMatrix(pred.svm_OHE_PCA, test_df3$Spending_Score )
## Confusion Matrix and Statistics
##
## Reference
## Prediction Average High Low
## Average 474 86 176
## High 22 201 52
## Low 33 24 1110
##
## Overall Statistics
##
## Accuracy : 0.8196
## 95% CI : (0.8028, 0.8355)
## No Information Rate : 0.6143
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6838
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: Average Class: High Class: Low
## Sensitivity 0.8960 0.64630 0.8296
## Specificity 0.8411 0.96036 0.9321
## Pos Pred Value 0.6440 0.73091 0.9512
## Neg Pred Value 0.9619 0.94220 0.7745
## Prevalence 0.2429 0.14279 0.6143
## Detection Rate 0.2176 0.09229 0.5096
## Detection Prevalence 0.3379 0.12626 0.5358
## Balanced Accuracy 0.8686 0.80333 0.8809
confusionMatrix(pred.forest_OHE_PCA, test_df3$Spending_Score )
## Confusion Matrix and Statistics
##
## Reference
## Prediction Average High Low
## Average 411 52 91
## High 35 225 47
## Low 83 34 1200
##
## Overall Statistics
##
## Accuracy : 0.843
## 95% CI : (0.827, 0.858)
## No Information Rate : 0.6143
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7127
##
## Mcnemar's Test P-Value : 0.123
##
## Statistics by Class:
##
## Class: Average Class: High Class: Low
## Sensitivity 0.7769 0.7235 0.8969
## Specificity 0.9133 0.9561 0.8607
## Pos Pred Value 0.7419 0.7329 0.9112
## Neg Pred Value 0.9273 0.9540 0.8397
## Prevalence 0.2429 0.1428 0.6143
## Detection Rate 0.1887 0.1033 0.5510
## Detection Prevalence 0.2544 0.1410 0.6047
## Balanced Accuracy 0.8451 0.8398 0.8788
a
acc_OHE_PCA_df <- data.frame(models_OHE_PCA = c("Decision Tree OHE",
"Support Vector Machine OHE",
"Random Forest OHE"),
racc_OHE_PCA = c("0.8030",
"0.8196",
"0.8430"))
summary_model_OHE_PCA <- ggplot(acc_OHE_PCA_df, aes(x = models_OHE_PCA,
y = racc_OHE_PCA,
fill = models_OHE_PCA)) +
geom_bar(stat = "identity")
summary_model_OHE_PCA
grid.arrange(summary_model,summary_model_OHE,summary_model_OHE_PCA)
From that summary of accuracy based from each model, and model Random Forest with OHE and PCA is the best score in accuracy