This dataset explores and analyse various facets of customer interactions within an e-commerce environment. It captures essential details such as customer demographics, purchase history and payment behaviours as the dataset provides various variables such as Customer ID, Customer Name, Customer Age, Gender, Purchase Date, Product Category, Product Price, Quantity, Total Purchase Amount, Payment Method, Returns, and Churn. The dataset enables diverse analytical applications, which data scientists can leverage this information to gain insights into customer behavior, understand purchasing patterns and etc. The objective of this is to predict the monthly transaction amount sold by the store by using regression analysis and predicting the churn rate based on customer purchasing behavior.
To predict the monthly transaction amount.
To identify the best machine learning model for predicting whether a customer will churn or not based on their purchasing behavior.
Dataset Link: https://www.kaggle.com/datasets/shriyashjagtap/e-commerce-customer-for-behavior-analysis
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ✔ readr 2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(pastecs)
##
## Attaching package: 'pastecs'
##
## The following object is masked from 'package:tidyr':
##
## extract
##
## The following objects are masked from 'package:dplyr':
##
## first, last
library(DataExplorer)
library(tidyr)
library(corrplot)
## corrplot 0.92 loaded
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(xgboost)
##
## Attaching package: 'xgboost'
##
## The following object is masked from 'package:dplyr':
##
## slice
library(rpart)
library(rpart.plot)
library(tree)
library(splitstackshape)
library(dummy)
## dummy 0.1.3
## dummyNews()
library(class)
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-8
library(e1071)
if (!requireNamespace("DMwR", quietly = TRUE)) {
install.packages("DMwR")
}
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(DMwR)
## Loading required package: grid
library(Boruta)
data <- read.csv("ecommerce_customer_data_large.csv")
#info and summary
head(data)
## Customer.ID Purchase.Date Product.Category Product.Price Quantity
## 1 44605 2023-05-03 21:30:02 Home 177 1
## 2 44605 2021-05-16 13:57:44 Electronics 174 3
## 3 44605 2020-07-13 06:16:57 Books 413 1
## 4 44605 2023-01-17 13:14:36 Electronics 396 3
## 5 44605 2021-05-01 11:29:27 Books 259 4
## 6 13738 2022-08-25 06:48:33 Home 191 3
## Total.Purchase.Amount Payment.Method Customer.Age Returns Customer.Name Age
## 1 2427 PayPal 31 1 John Rivera 31
## 2 2448 PayPal 31 1 John Rivera 31
## 3 2345 Credit Card 31 1 John Rivera 31
## 4 937 Cash 31 0 John Rivera 31
## 5 2598 PayPal 31 1 John Rivera 31
## 6 3722 Credit Card 27 1 Lauren Johnson 27
## Gender Churn
## 1 Female 0
## 2 Female 0
## 3 Female 0
## 4 Female 0
## 5 Female 0
## 6 Female 0
summary(data)
## Customer.ID Purchase.Date Product.Category Product.Price
## Min. : 1 Length:250000 Length:250000 Min. : 10.0
## 1st Qu.:12590 Class :character Class :character 1st Qu.:132.0
## Median :25011 Mode :character Mode :character Median :255.0
## Mean :25018 Mean :254.7
## 3rd Qu.:37441 3rd Qu.:377.0
## Max. :50000 Max. :500.0
##
## Quantity Total.Purchase.Amount Payment.Method Customer.Age
## Min. :1.000 Min. : 100 Length:250000 Min. :18.0
## 1st Qu.:2.000 1st Qu.:1476 Class :character 1st Qu.:30.0
## Median :3.000 Median :2725 Mode :character Median :44.0
## Mean :3.005 Mean :2725 Mean :43.8
## 3rd Qu.:4.000 3rd Qu.:3975 3rd Qu.:57.0
## Max. :5.000 Max. :5350 Max. :70.0
##
## Returns Customer.Name Age Gender
## Min. :0.0 Length:250000 Min. :18.0 Length:250000
## 1st Qu.:0.0 Class :character 1st Qu.:30.0 Class :character
## Median :1.0 Mode :character Median :44.0 Mode :character
## Mean :0.5 Mean :43.8
## 3rd Qu.:1.0 3rd Qu.:57.0
## Max. :1.0 Max. :70.0
## NA's :47382
## Churn
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.2005
## 3rd Qu.:0.0000
## Max. :1.0000
##
# Check for missing values
sum(is.na(data))
## [1] 47382
# Fill missing values in 'Returns' column with 0
data$Returns[is.na(data$Returns)] <- 0
# Investigate the redundancy between 'Customer Age' and 'Age'
data %>%
summarise(all_equal = all(Customer.Age == Age))
## all_equal
## 1 TRUE
# they are the same, we can drop one of the columns
data <- data %>% select(-Customer.Age)
# View the changes
head(data)
## Customer.ID Purchase.Date Product.Category Product.Price Quantity
## 1 44605 2023-05-03 21:30:02 Home 177 1
## 2 44605 2021-05-16 13:57:44 Electronics 174 3
## 3 44605 2020-07-13 06:16:57 Books 413 1
## 4 44605 2023-01-17 13:14:36 Electronics 396 3
## 5 44605 2021-05-01 11:29:27 Books 259 4
## 6 13738 2022-08-25 06:48:33 Home 191 3
## Total.Purchase.Amount Payment.Method Returns Customer.Name Age Gender Churn
## 1 2427 PayPal 1 John Rivera 31 Female 0
## 2 2448 PayPal 1 John Rivera 31 Female 0
## 3 2345 Credit Card 1 John Rivera 31 Female 0
## 4 937 Cash 0 John Rivera 31 Female 0
## 5 2598 PayPal 1 John Rivera 31 Female 0
## 6 3722 Credit Card 1 Lauren Johnson 27 Female 0
# Summary statistics for numerical columns (excluding Churn)
numerical_cols <- sapply(data, is.numeric) & names(data) != "Churn"
summary(data[, numerical_cols])
## Customer.ID Product.Price Quantity Total.Purchase.Amount
## Min. : 1 Min. : 10.0 Min. :1.000 Min. : 100
## 1st Qu.:12590 1st Qu.:132.0 1st Qu.:2.000 1st Qu.:1476
## Median :25011 Median :255.0 Median :3.000 Median :2725
## Mean :25018 Mean :254.7 Mean :3.005 Mean :2725
## 3rd Qu.:37441 3rd Qu.:377.0 3rd Qu.:4.000 3rd Qu.:3975
## Max. :50000 Max. :500.0 Max. :5.000 Max. :5350
## Returns Age
## Min. :0.0000 Min. :18.0
## 1st Qu.:0.0000 1st Qu.:30.0
## Median :0.0000 Median :44.0
## Mean :0.4059 Mean :43.8
## 3rd Qu.:1.0000 3rd Qu.:57.0
## Max. :1.0000 Max. :70.0
# Define the specific columns for which we want frequency tables
selected_cols <- c("Product.Category", "Payment.Method", "Returns", "Gender", "Churn")
# Frequency counts for the selected columns
for (col_name in selected_cols) {
cat("\nFrequency table for", col_name, ":\n")
print(table(data[[col_name]]))
}
##
## Frequency table for Product.Category :
##
## Books Clothing Electronics Home
## 62247 62581 62630 62542
##
## Frequency table for Payment.Method :
##
## Cash Credit Card PayPal
## 83012 83547 83441
##
## Frequency table for Returns :
##
## 0 1
## 148524 101476
##
## Frequency table for Gender :
##
## Female Male
## 124324 125676
##
## Frequency table for Churn :
##
## 0 1
## 199870 50130
# Convert the 'Purchase Date' column to a date format (without time)
data$Purchase.Date <- as.Date(data$Purchase.Date)
head(data)
## Customer.ID Purchase.Date Product.Category Product.Price Quantity
## 1 44605 2023-05-03 Home 177 1
## 2 44605 2021-05-16 Electronics 174 3
## 3 44605 2020-07-13 Books 413 1
## 4 44605 2023-01-17 Electronics 396 3
## 5 44605 2021-05-01 Books 259 4
## 6 13738 2022-08-25 Home 191 3
## Total.Purchase.Amount Payment.Method Returns Customer.Name Age Gender Churn
## 1 2427 PayPal 1 John Rivera 31 Female 0
## 2 2448 PayPal 1 John Rivera 31 Female 0
## 3 2345 Credit Card 1 John Rivera 31 Female 0
## 4 937 Cash 0 John Rivera 31 Female 0
## 5 2598 PayPal 1 John Rivera 31 Female 0
## 6 3722 Credit Card 1 Lauren Johnson 27 Female 0
#data visualisation (categorical)
# Convert 'Churn' and 'Returns' to a factor (or character)
data$Churn <- as.factor(data$Churn)
data$Returns <- as.factor(data$Returns)
# Create a long format of the data for the selected categorical variables
data_long <- data %>%
select(Product.Category, Payment.Method, Gender, Churn, Returns) %>%
pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value")
# Create bar plots for each variable using facets
ggplot(data_long, aes(x = Value)) +
geom_bar() +
facet_wrap(~ Variable, scales = "free_x") +
theme_minimal() +
labs(title = "Bar Plots for Categorical Variables", x = "", y = "Count") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Create a bar graph for Churn by Gender
ggplot(data, aes(x = Gender, fill = as.factor(Churn))) +
geom_bar(position = "dodge") +
labs(title = "Bar Graph of Churn by Gender", x = "Gender", y = "Count", fill = "Churn") +
theme_minimal()
# Create a bar graph for Returns by Gender
ggplot(data, aes(x = Gender, fill = as.factor(Returns))) +
geom_bar(position = "dodge") +
labs(title = "Bar Graph of Returns by Gender", x = "Gender", y = "Count", fill = "Returns") +
theme_minimal()
# Bar Graph for Count of Sales per Product Category
ggplot(data, aes(x = `Product.Category`)) +
geom_bar() +
labs(title = "Count of Sales per Product Category", x = "Product Category", y = "Count") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Boxplot for Sales Amount by Product Category
ggplot(data, aes(x = `Product.Category`, y = `Total.Purchase.Amount`)) +
geom_boxplot() +
labs(title = "Sales Amount by Product Category", x = "Product Category", y = "Total Purchase Amount") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
#historgram fpr numerical variables
# Create a long format of the data for the selected numerical variables
data_long_num <- data %>%
select(Product.Price, Quantity, Age) %>%
pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value")
# Create histograms for each variable using facets
ggplot(data_long_num, aes(x = Value)) +
geom_histogram(bins = 30, fill = "blue", color = "black") +
facet_wrap(~ Variable, scales = "free") +
theme_minimal() +
labs(title = "Histograms for Numerical Variables", x = "", y = "Frequency")
# Create a boxplot for Age by Churn
ggplot(data, aes(x = as.factor(Churn), y = Age)) +
geom_boxplot() +
labs(title = "Boxplot of Age by Churn", x = "Churn", y = "Age") +
theme_minimal()
# Create a boxplot for Age by Return
ggplot(data, aes(x = as.factor(Returns), y = Age)) +
geom_boxplot() +
labs(title = "Boxplot of Age by Churn", x = "Returns", y = "Age") +
theme_minimal()
#pie charts
# Function to create a pie chart with percentage labels for a given variable
create_pie_chart <- function(data, variable_name) {
pie_data <- data %>%
count(!!sym(variable_name)) %>%
mutate(Percentage = n / sum(n) * 100)
ggplot(pie_data, aes(x = "", y = n, fill = !!sym(variable_name))) +
geom_bar(stat = "identity", width = 1) +
coord_polar("y", start = 0) +
geom_text(aes(label = paste0(round(Percentage, 1), "%")),
position = position_stack(vjust = 0.5)) +
labs(title = paste("Pie Chart of", variable_name), x = NULL, y = NULL, fill = variable_name) +
theme_minimal() +
theme(axis.line = element_blank(), axis.text = element_blank(), axis.ticks = element_blank(),
legend.position = "bottom")
}
# Create pie charts with percentages
create_pie_chart(data, "Churn")
create_pie_chart(data, "Returns")
create_pie_chart(data, "Gender")
#correlaiton
# Selecting only numerical variables for correlation analysis
numerical_vars <- data %>% select(`Product.Price`, Quantity, `Total.Purchase.Amount`, `Age`)
# Calculate the correlation matrix
cor_matrix <- cor(numerical_vars, use = "complete.obs")
# Visualizing the correlation matrix
corrplot(cor_matrix, method = "circle", type = "upper", order = "hclust",
tl.col = "black", tl.srt = 45, addCoef.col = "black")
#+1 indicates a perfect positive linear relationship.
#-1 indicates a perfect negative linear relationship.
#0 indicates no linear relationship.
#A correlation coefficient of 0.06 suggests a very weak positive linear relationship between Age and Total Purchase Amount.
#This means that as Age increases, there is a slightly positive trend in the Total Purchase Amount, but this trend is very weak and not strongly predictive.
#In practical terms, a correlation of 0.06 is close to zero, implying that Age is not a strong predictor of the Total Purchase Amount in this dataset.
# Categorizing Age into groups
data$AgeGroup <- cut(data$Age, breaks = c(18, 25, 35, 45, 55, 65, Inf),
labels = c("18-25", "26-35", "36-45", "46-55", "56-65", "65+"), right = FALSE)
# Analysis 1: Purchasing Patterns by Age Group
age_group_analysis <- data %>%
group_by(AgeGroup) %>%
summarise(AverageSpend = mean(Total.Purchase.Amount),
TotalPurchases = n())
# Analysis 2: Purchasing Behavior by Gender
gender_analysis <- data %>%
group_by(Gender) %>%
summarise(AverageSpend = mean(Total.Purchase.Amount),
TotalPurchases = n())
# Output the results
print(age_group_analysis)
## # A tibble: 6 × 3
## AgeGroup AverageSpend TotalPurchases
## <fct> <dbl> <int>
## 1 18-25 2601. 34182
## 2 26-35 2656. 47679
## 3 36-45 2698. 46794
## 4 46-55 2756. 46729
## 5 56-65 2812. 46381
## 6 65+ 2846. 28235
print(gender_analysis)
## # A tibble: 2 × 3
## Gender AverageSpend TotalPurchases
## <chr> <dbl> <int>
## 1 Female 2723. 124324
## 2 Male 2728. 125676
#nothing pops out from above analysis as well, everything seems very evenly distributed in this dataset
#from gender, to total spending, to product category, age
##REGRESSION
#add month number and year number
new_data <- data %>% mutate(month_number = format(as.Date(Purchase.Date), "%m")) %>%
mutate(year_number = format(as.Date(Purchase.Date), "%Y"))
new_data <- new_data[order(new_data$Purchase.Date), ]
#create new collomn for purchase date to change it into first of the month
new_data$Purchase_Date <- paste(new_data$year_number, new_data$month_number, '01', sep = "-")
new_data$Purchase_Date <- as.Date(new_data$Purchase_Date)
#filter unfinished month transaction
new_data <- new_data %>% filter(Purchase_Date < as.Date("2023-09-01"))
#aggregate the data for Purchase_Date, month_number,year_number to summarise total amount
new_data <- new_data %>% group_by(Purchase_Date, month_number, year_number) %>% summarise(total_amount = sum(Total.Purchase.Amount))
## `summarise()` has grouped output by 'Purchase_Date', 'month_number'. You can
## override using the `.groups` argument.
new_data
## # A tibble: 44 × 4
## # Groups: Purchase_Date, month_number [44]
## Purchase_Date month_number year_number total_amount
## <date> <chr> <chr> <int>
## 1 2020-01-01 01 2020 15566765
## 2 2020-02-01 02 2020 14660710
## 3 2020-03-01 03 2020 15517357
## 4 2020-04-01 04 2020 15060419
## 5 2020-05-01 05 2020 15517765
## 6 2020-06-01 06 2020 15062859
## 7 2020-07-01 07 2020 15575247
## 8 2020-08-01 08 2020 16190648
## 9 2020-09-01 09 2020 15134038
## 10 2020-10-01 10 2020 15758325
## # ℹ 34 more rows
# Split the data into training and testing sets
test_1 <- new_data %>% filter(Purchase_Date >= as.Date("2022-08-01"))
# Training set
train_1 <- new_data %>% filter(Purchase_Date < as.Date("2022-08-01"))
# add new column to specify the value type
train_1$type <- "Train"
test_1$type <- "Test"
# create data frame to display year range of test and train data
train_test <- rbind(train_1, test_1)
#create model for total_amount using purchase date and month number as predictor
M1_1 <- lm(total_amount~ Purchase_Date + month_number, data =train_1)
#create model for total_amount using month number as predictor
M1_2 <- lm(total_amount~month_number, data =train_1)
#create model for total_amount using purchase date as predictor
M1_3 <- lm(total_amount~Purchase_Date, data =train_1)
print("M1_1")
## [1] "M1_1"
summary(M1_1)
##
## Call:
## lm(formula = total_amount ~ Purchase_Date + month_number, data = train_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -340476 -98645 -18200 119557 348250
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.518e+07 2.547e+06 5.960 1.22e-05 ***
## Purchase_Date 1.626e+01 1.366e+02 0.119 0.90657
## month_number02 -1.166e+06 1.655e+05 -7.047 1.42e-06 ***
## month_number03 5.649e+04 1.656e+05 0.341 0.73702
## month_number04 -3.913e+05 1.659e+05 -2.359 0.02985 *
## month_number05 1.318e+05 1.662e+05 0.793 0.43823
## month_number06 -3.325e+05 1.667e+05 -1.995 0.06147 .
## month_number07 3.077e+05 1.673e+05 1.840 0.08236 .
## month_number08 5.680e+05 1.850e+05 3.070 0.00659 **
## month_number09 -3.307e+05 1.851e+05 -1.786 0.09095 .
## month_number10 1.507e+05 1.854e+05 0.813 0.42688
## month_number11 -4.825e+05 1.857e+05 -2.598 0.01816 *
## month_number12 1.710e+05 1.861e+05 0.919 0.37024
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 202600 on 18 degrees of freedom
## Multiple R-squared: 0.8925, Adjusted R-squared: 0.8208
## F-statistic: 12.45 on 12 and 18 DF, p-value: 2.315e-06
cat("Mean Squared Error (MSE):", mean(residuals(M1_1)^2), "\n")
## Mean Squared Error (MSE): 23835107443
print("M1_2")
## [1] "M1_2"
summary(M1_2)
##
## Call:
## lm(formula = total_amount ~ month_number, data = train_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -334536 -103096 -19941 121041 342304
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15484042 113900 135.944 < 2e-16 ***
## month_number02 -1165636 161079 -7.236 7.19e-07 ***
## month_number03 57450 161079 0.357 0.72528
## month_number04 -389807 161079 -2.420 0.02571 *
## month_number05 133751 161079 0.830 0.41665
## month_number06 -330054 161079 -2.049 0.05453 .
## month_number07 310680 161079 1.929 0.06884 .
## month_number08 568519 180091 3.157 0.00519 **
## month_number09 -329678 180091 -1.831 0.08289 .
## month_number10 152170 180091 0.845 0.40865
## month_number11 -480526 180091 -2.668 0.01519 *
## month_number12 173510 180091 0.963 0.34742
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 197300 on 19 degrees of freedom
## Multiple R-squared: 0.8924, Adjusted R-squared: 0.8301
## F-statistic: 14.32 on 11 and 19 DF, p-value: 5.888e-07
cat("Mean Squared Error (MSE):", mean(residuals(M1_2)^2), "\n")
## Mean Squared Error (MSE): 23853867113
print("M1_3")
## [1] "M1_3"
summary(M1_3)
##
## Call:
## lm(formula = total_amount ~ Purchase_Date, data = train_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1400681 -229723 88538 294036 858166
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.358e+07 6.006e+06 2.261 0.0314 *
## Purchase_Date 9.484e+01 3.209e+02 0.296 0.7696
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 486100 on 29 degrees of freedom
## Multiple R-squared: 0.003004, Adjusted R-squared: -0.03138
## F-statistic: 0.08738 on 1 and 29 DF, p-value: 0.7696
cat("Mean Squared Error (MSE):", mean(residuals(M1_3)^2), "\n")
## Mean Squared Error (MSE): 221005251920
# create new df for fitted df and assign value type for the the dataset
fit_df <- data.frame(train_1[-5]) %>% mutate(type="Fitted")
# create new df for prediction df and assign value type for the the dataset
testset_all <- data.frame(test_1[-5]) %>%
mutate(type="Predicted")
# create new df for prediction row
Aug2022<-data.frame(Purchase_Date = "2022/08/01",
month_number="08",
year_number= "2022",
total_amount=0,
type="Predicted")
# convert the date column to the right date format to merge with predicted df later
Aug2022$Purchase_Date <- as.Date(Aug2022$Purchase_Date,format = "%Y/%m/%d")
## Use predict function + model we just built and apply to Aug 2022 data frame
fit_df$total_amount <- predict(M1_1, train_1)
testset_all$total_amount <- predict(M1_1, test_1)
pred <- predict(M1_1, Aug2022)
Aug2022$total_amount <- pred
# combine the predicted df with the new predicted row
testset_all <- rbind(testset_all, Aug2022)
testset_all
## Purchase_Date month_number year_number total_amount type
## 1 2022-08-01 08 2022 16061463 Predicted
## 2 2022-09-01 09 2022 15163266 Predicted
## 3 2022-10-01 10 2022 15645114 Predicted
## 4 2022-11-01 11 2022 15012418 Predicted
## 5 2022-12-01 12 2022 15666454 Predicted
## 6 2023-01-01 01 2023 15495917 Predicted
## 7 2023-02-01 02 2023 14330281 Predicted
## 8 2023-03-01 03 2023 15553362 Predicted
## 9 2023-04-01 04 2023 15106104 Predicted
## 10 2023-05-01 05 2023 15629663 Predicted
## 11 2023-06-01 06 2023 15165858 Predicted
## 12 2023-07-01 07 2023 15806592 Predicted
## 13 2023-08-01 08 2023 16067398 Predicted
## 14 2022-08-01 08 2022 16061463 Predicted
# create new column to specify value type for the dataset
train_test$type <- "Real"
# make sure our variables have the right format for plotting
fit_df <- as.data.frame(fit_df)
testset_all <- as.data.frame(testset_all)
# merge all necessary variables into 1 single df variable with rbind
Final_data <- rbind(fit_df, testset_all,train_test)
# Plot goodness of the fit from our ML R
Final_data %>%
filter(Purchase_Date < as.Date("2022-08-01")) %>%
filter(!(type %in% "Predicted")) %>%
ggplot() +
aes(x = Purchase_Date, y = total_amount, colour = type) +
geom_line(size = 0.5) +
scale_color_hue(direction = 1) +
labs(
x = "Year",
y = "Total Amount Amount",
title = "Fit from MLR"
) +
theme_light()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Plot the comparison between our MLR prediction on test set with the actual values
ggplot(Final_data) +
aes(x = Purchase_Date, y = total_amount, colour = type) +
geom_line(size = 0.5) +
scale_color_hue(direction = 1) +
labs(x = "Year", y = "Total amount") +
theme_light() +
theme(plot.title = element_text(face = "bold"))
#evaluation metrics for train vs prediction
evaluation_metrics_train <- data.frame(
evaluation_metrics = "Fitted value vs Train data",
R2 = summary(M1_1)$r.squared,
RMSE = modelr::rmse(M1_1, data = train_1),
MAE = modelr::mae(M1_1, data = train_1)
)
evaluation_metrics_test <- data.frame(
evaluation_metrics = "Predictions vs Test data",
R2 = summary(M1_1)$r.squared,
RMSE = modelr::rmse(M1_1, data = test_1),
MAE = modelr::mae(M1_1, data = test_1)
)
evaluation_metrics <- rbind(evaluation_metrics_train, evaluation_metrics_test)
evaluation_metrics
## evaluation_metrics R2 RMSE MAE
## 1 Fitted value vs Train data 0.8924754 154386.2 123102.4
## 2 Predictions vs Test data 0.8924754 387010.8 331014.9
# Model test error
test_error <- (evaluation_metrics_test$RMSE)/mean(test_1$total_amount)
test_error
## [1] 0.02531444
##Data PreProcessing
#Eliminating customer names, as the customer ID serves as the unique identifier, and age has been replaced with the new 'AgeGroup' field
data <- data[, !colnames(data) %in% c( "Customer.Name", "Age")]
# Filter Churn 1 and Churn 0 samples
churn_1_samples <- data[data$Churn == 1, ]
churn_0_samples <- data[data$Churn == 0, ]
# Randomly select 5000 samples from each class
set.seed(42)
churn_1_samples <- churn_1_samples[sample(nrow(churn_1_samples), 5000), ]
churn_0_samples <- churn_0_samples[sample(nrow(churn_0_samples), 5000), ]
# merging the sampled datasets
balanced_data <- rbind(churn_1_samples, churn_0_samples)
# Shuffle the rows to mix Churn 1 and Churn 0 samples
balanced_data <- data[sample(nrow(balanced_data)), ]
#convert categorical value to factors
balanced_data$Gender <- as.factor(balanced_data$Gender)
balanced_data$Product.Category <- as.factor(balanced_data$Product.Category)
balanced_data$Payment.Method <- as.factor(balanced_data$Payment.Method)
# Run Boruta on the dataset
boruta_output <- Boruta(Churn ~ ., data=na.omit(balanced_data), doTrace=0)
print(boruta_output)
## Boruta performed 86 iterations in 2.005741 mins.
## 3 attributes confirmed important: AgeGroup, Customer.ID, Gender;
## 7 attributes confirmed unimportant: Payment.Method, Product.Category,
## Product.Price, Purchase.Date, Quantity and 2 more;
# identify significant variables
boruta_signif <- getSelectedAttributes(boruta_output, withTentative = TRUE)
roughFixMod <- TentativeRoughFix(boruta_output)
## Warning in TentativeRoughFix(boruta_output): There are no Tentative attributes!
## Returning original object.
boruta_signif <- getSelectedAttributes(roughFixMod)
# Variable Importance Scores
imps <- attStats(roughFixMod)
imps2 = imps[imps$decision != 'Rejected', c('meanImp', 'decision')]
head(imps2[order(-imps2$meanImp), ])
## meanImp decision
## Customer.ID 34.39889 Confirmed
## AgeGroup 23.47961 Confirmed
## Gender 18.67113 Confirmed
# filtering out all not important columns
balanced_data <- balanced_data %>%
select(boruta_signif,"Churn")
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(boruta_signif)
##
## # Now:
## data %>% select(all_of(boruta_signif))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Split the data into training and testing sets
set.seed(42)
splitIndex <- createDataPartition(balanced_data$Churn, p = 0.8, list = FALSE)
train_data <- balanced_data[splitIndex, ]
test_data <- balanced_data[-splitIndex, ]
# Convert categorical columns to numeric
train_data <- train_data %>% mutate(across(-Churn, as.numeric))
test_data <- test_data %>% mutate(across(-Churn, as.numeric))
# Feature scaling for numerical columns
numeric_cols <- sapply(train_data, is.numeric)
train_data[, numeric_cols] <- scale(train_data[, numeric_cols])
test_data[, numeric_cols] <- scale(test_data[, numeric_cols])
print("Counts of Churn in test_data:")
## [1] "Counts of Churn in test_data:"
prop.table(table(test_data$Churn))
##
## 0 1
## 0.801901 0.198099
train_data_balanced <- SMOTE(Churn ~ ., train_data, perc.over = 100, perc.under = 200)
# Compare counts of Churn in both classes
cat("Comparison of Churn counts in both classes:\n")
## Comparison of Churn counts in both classes:
prop.table(table(train_data_balanced$Churn))
##
## 0 1
## 0.5 0.5
# Model 1: k-NN
train_data_balanced$Churn <- as.factor(train_data_balanced$Churn)
knn_model <- knn(train_data_balanced[, -which(names(train_data_balanced) == "Churn")],
test_data[, -which(names(test_data) == "Churn")],
train_data_balanced$Churn, k = 15)
# Model 2: Random Forest
rf_model <- randomForest(Churn ~ ., data = train_data_balanced, ntree = 400, mtry = 4)
## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range
rf_predictions <- predict(rf_model, newdata = test_data, type = "response")
# Model 3: Generalized Linear Model (Logistic Regression)
glm_model <- glm(Churn ~ ., data = train_data_balanced, family = binomial, control = glm.control(maxit = 300))
glm_predictions <- predict(glm_model, newdata = test_data, type = "response")
# Convert predicted probabilities to class labels (0 or 1)
glm_predictions <- ifelse(glm_predictions > 0.5, 1, 0)
# Convert data to DMatrix format
train_data_balanced$Churn <- as.numeric(train_data_balanced$Churn) - 1
dtrain <- xgb.DMatrix(data = as.matrix(train_data_balanced[, -which(names(train_data_balanced) == "Churn")]), label = train_data_balanced$Churn)
dtest <- xgb.DMatrix(data = as.matrix(test_data[, -which(names(test_data) == "Churn")]), label = test_data$Churn)
# Model 4: XGBoost model
xgb_params <- list(objective = "binary:logistic", eval_metric = "logloss")
xgb_model <- xgb.train(params = xgb_params, data = dtrain, nrounds = 300)
xgb_predictions <- predict(xgb_model, newdata = as.matrix(test_data[, -which(names(test_data) == "Churn")]))
# Train SVM model
train_data_balanced$Churn <- as.factor(train_data_balanced$Churn)
svm_model <- svm(Churn ~ ., data = train_data_balanced, kernel = "linear", cost = 5)
# Make predictions on test_data
svm_predictions <- predict(svm_model, newdata = test_data[, -which(names(test_data) == "Churn")])
# Ensure predictions have the same levels as Churn in test_data
svm_predictions <- factor(svm_predictions, levels = levels(test_data$Churn))
# Evaluate the SVM model
confusion_matrix_svm <- confusionMatrix(svm_predictions, test_data$Churn)
# Evaluate the models
confusion_matrix_knn <- confusionMatrix(knn_model, test_data$Churn)
confusion_matrix_rf <- confusionMatrix(rf_predictions, test_data$Churn)
confusion_matrix <- confusionMatrix(factor(glm_predictions, levels = levels(test_data$Churn)), test_data$Churn)
confusion_matrix_xgb <- confusionMatrix(factor(ifelse(xgb_predictions > 0.5, 1, 0), levels = levels(test_data$Churn)), test_data$Churn)
confusion_matrix_svm <- confusionMatrix(svm_predictions, test_data$Churn)
# Display confusion matrices
print("Confusion Matrix for k-NN:")
## [1] "Confusion Matrix for k-NN:"
print(confusion_matrix_knn)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 852 72
## 1 751 324
##
## Accuracy : 0.5883
## 95% CI : (0.5664, 0.61)
## No Information Rate : 0.8019
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2125
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.5315
## Specificity : 0.8182
## Pos Pred Value : 0.9221
## Neg Pred Value : 0.3014
## Prevalence : 0.8019
## Detection Rate : 0.4262
## Detection Prevalence : 0.4622
## Balanced Accuracy : 0.6748
##
## 'Positive' Class : 0
##
print("Confusion Matrix for Random Forest:")
## [1] "Confusion Matrix for Random Forest:"
print(confusion_matrix_rf)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1404 112
## 1 199 284
##
## Accuracy : 0.8444
## 95% CI : (0.8278, 0.86)
## No Information Rate : 0.8019
## P-Value [Acc > NIR] : 5.372e-07
##
## Kappa : 0.5477
##
## Mcnemar's Test P-Value : 1.079e-06
##
## Sensitivity : 0.8759
## Specificity : 0.7172
## Pos Pred Value : 0.9261
## Neg Pred Value : 0.5880
## Prevalence : 0.8019
## Detection Rate : 0.7024
## Detection Prevalence : 0.7584
## Balanced Accuracy : 0.7965
##
## 'Positive' Class : 0
##
print("Confusion Matrix for Generalized Linear Model:")
## [1] "Confusion Matrix for Generalized Linear Model:"
print(confusion_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 903 202
## 1 700 194
##
## Accuracy : 0.5488
## 95% CI : (0.5267, 0.5708)
## No Information Rate : 0.8019
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0361
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.5633
## Specificity : 0.4899
## Pos Pred Value : 0.8172
## Neg Pred Value : 0.2170
## Prevalence : 0.8019
## Detection Rate : 0.4517
## Detection Prevalence : 0.5528
## Balanced Accuracy : 0.5266
##
## 'Positive' Class : 0
##
print("Confusion Matrix for XGBoost:")
## [1] "Confusion Matrix for XGBoost:"
print(confusion_matrix_xgb)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1322 147
## 1 281 249
##
## Accuracy : 0.7859
## 95% CI : (0.7672, 0.8037)
## No Information Rate : 0.8019
## P-Value [Acc > NIR] : 0.9649
##
## Kappa : 0.4022
##
## Mcnemar's Test P-Value : 1.286e-10
##
## Sensitivity : 0.8247
## Specificity : 0.6288
## Pos Pred Value : 0.8999
## Neg Pred Value : 0.4698
## Prevalence : 0.8019
## Detection Rate : 0.6613
## Detection Prevalence : 0.7349
## Balanced Accuracy : 0.7267
##
## 'Positive' Class : 0
##
print("Confusion Matrix for SVM:")
## [1] "Confusion Matrix for SVM:"
print(confusion_matrix_svm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 817 164
## 1 786 232
##
## Accuracy : 0.5248
## 95% CI : (0.5026, 0.5469)
## No Information Rate : 0.8019
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.06
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.5097
## Specificity : 0.5859
## Pos Pred Value : 0.8328
## Neg Pred Value : 0.2279
## Prevalence : 0.8019
## Detection Rate : 0.4087
## Detection Prevalence : 0.4907
## Balanced Accuracy : 0.5478
##
## 'Positive' Class : 0
##
# Testing accuracy for Random Forest
test_predictions_rf <- predict(rf_model, newdata = test_data)
test_accuracy_rf <- sum(test_predictions_rf == test_data$Churn) / nrow(test_data) * 100
cat("Testing Accuracy for Random Forest:", test_accuracy_rf, "%\n")
## Testing Accuracy for Random Forest: 84.49225 %
# Testing accuracy for XGBoost
test_accuracy_xgb <- sum(xgb_predictions > 0.5 & test_data$Churn == 1 | xgb_predictions <= 0.5 & test_data$Churn == 0) / nrow(test_data) * 100
cat("Testing Accuracy for XGBoost:", test_accuracy_xgb, "%\n")
## Testing Accuracy for XGBoost: 78.58929 %
# Testing accuracy for k-NN
test_predictions_knn <- knn_model
test_accuracy_knn <- sum(test_predictions_knn == test_data$Churn) / nrow(test_data) * 100
cat("Testing Accuracy for k-NN:", test_accuracy_knn, "%\n")
## Testing Accuracy for k-NN: 58.82941 %
# Testing accuracy for Linear model
test_accuracy_glm <- sum(glm_predictions == test_data$Churn) / nrow(test_data) * 100
cat("Testing Accuracy for linear model:", test_accuracy_glm, "%\n")
## Testing Accuracy for linear model: 54.87744 %
# Testing accuracy for SVM
test_accuracy_svm <- sum(svm_predictions == test_data$Churn) / nrow(test_data) * 100
cat("Testing Accuracy for SVM:", test_accuracy_svm, "%\n")
## Testing Accuracy for SVM: 52.47624 %
Based on the prediction using regression, model 1, using the date and month as predictor, has the higest R Squared of 0.8925 and have 2% error rate that indicates a better fit and good performance for regression.
For classification models, Random Forest demonstrates the highest accuracy (84%) and a balanced trade-off between sensitivity and specificity.
For a churn prediction model, it’s crucial to minimize false negatives (customers predicted as not churning but actually churn), emphasizing high sensitivity.
Based on the objective to predict churn, Random Forest appears to be the most suitable model.