How does a financial institution optimize its product offering? How should a credit card issuer use digital and emerging technologies to acquire and retain the best customers?
Data Source: Supermarket Sales Data
Invoice ID: Computer-generated sales slip invoice identification number
Branch: Branch of the supercenter (3 branches are available, identified as A, B, and C).
City: Location of supercenters
Customer type: Type of customer, recorded as Member for customers using a member card and Normal for those without one.
Gender: Gender type of customer
Product line: General item categorization groups - Electronic accessories, Fashion accessories, Food and beverages, Health and beauty, Home and lifestyle, Sports and travel
Unit price: Price of each product in $
Quantity: Number of products purchased by customer
Tax: 5% tax charged on the customer's purchase
Total: Total price including tax
Date: Date of purchase (Record available from January 2019 to March 2019)
Time: Purchase time (10am to 9pm)
Payment: Payment used by customer for purchase (3 methods are available – Cash, Credit card and Ewallet)
COGS: Cost of goods sold
Gross margin percentage: Gross margin as a percentage of revenue
Gross income: Gross income earned on the sale
Rating: Customer satisfaction rating of the overall shopping experience (on a scale of 1 to 10)
URL: https://www.kaggle.com/datasets/neoanderson144/supermarketsales?select=supermarketsales.json
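As a quick illustration of how these fields fit together (an assumption based on the descriptions above, not code from the original analysis), Tax is 5% of Unit price times Quantity, Total adds that tax to COGS, and gross income is the difference:
# Illustrative values only -- not taken from the dataset
unit_price <- 10; quantity <- 2
cogs_chk   <- unit_price * quantity   # cost of goods sold: 20
tax_chk    <- 0.05 * cogs_chk         # the "Tax" field: 1
total_chk  <- cogs_chk + tax_chk      # the "Total" field: 21
income_chk <- total_chk - cogs_chk    # the "gross income" field: 1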
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(jsonlite)
library(viridis)
## Loading required package: viridisLite
library(hrbrthemes)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
## Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
## if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggplot2)
# read, flatten json file, and convert to data frame
salesdt <- flatten(fromJSON(txt = "C:/Users/Anhuynh/Desktop/ITU/Classes/Principles of Business Analytics/Assignments/Dataset/supermarketsales.json"))
tab_list <- lapply(1:nrow(salesdt),
function(i) data.frame(salesdt[i, -17], salesdt[i, 17],
stringsAsFactors = FALSE))
library(dplyr)
flat_table <- bind_rows(tab_list)
# calculate Revenue: gross margin % is a share of revenue, so Revenue = COGS / (1 - margin/100)
salesdf <- cbind(flat_table, Revenue = flat_table$cogs/(1-flat_table$gross.margin.percentage/100))
# rename a column
colnames(salesdf)[17] <- "Rating"
# convert from "char" to "date"
salesdf$Date <- mdy(salesdf$Date)
##Find missing values
colSums(is.na(salesdf))
## Invoice.ID Branch City
## 0 0 0
## Customer.type Gender Product.line
## 0 0 0
## Unit.price Quantity Tax.5.
## 0 0 0
## Total Date Time
## 0 0 0
## Payment cogs gross.margin.percentage
## 0 0 0
## gross.income Rating Revenue
## 0 0 0
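No missing values are present. If any had appeared, a hedged option (not part of the original workflow) would be to inspect and drop the incomplete rows before modeling:
# Hypothetical handling if NAs had been present (not needed here)
salesdf[!complete.cases(salesdf), ]   # inspect incomplete rows
salesdf_complete <- na.omit(salesdf)  # or drop them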
# Changes in revenue over Payment type and Customer type
ggplot(salesdf, aes(fill=Customer.type, y=Revenue, x=Payment)) +
geom_bar(position="stack", stat="identity") +
scale_fill_viridis(discrete = T) +
ggtitle("Changes in Revenue over Payment types") +
theme(text = element_text(size = 12),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank()) +
xlab("") +
labs(x="Payment type",y="Revenue ($)")
ggplot(salesdf, aes(fill=Customer.type, y=Revenue, x=City)) +
geom_bar(position="stack", stat="identity") +
scale_fill_viridis(discrete = T) +
ggtitle("Changes in Revenue across City") +
theme(text = element_text(size = 12),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank()) +
xlab("") +
labs(x="City",y="Revenue ($)")
ggplot(salesdf, aes(fill=Customer.type, y=Revenue, x=Product.line)) +
geom_bar(position="stack", stat="identity") +
scale_fill_viridis(discrete = T) +
ggtitle("Changes in Revenue across Product lines") +
theme(text = element_text(size = 12)) +
xlab("") +
labs(x="Product line",y="Revenue ($)") +
theme(legend.title = element_blank(), axis.text.x=element_text(angle=45,hjust=1,vjust=1),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank())
# Distribution of the Rating variable: how many customers fall into each bucket of the rating scale.
hist(salesdf$Rating[which(salesdf$Customer.type == "Member")], col = "maroon",
xlab='Rating scale', main=paste0(
"Histogram of Member Customers across Rating"))
rug(salesdf$Rating)
abline(v = median(salesdf$Rating), col = "magenta", lwd = 4)
hist(salesdf$Rating[which(salesdf$Customer.type == "Normal")], col = "gold",
xlab='Rating scale', main=paste0(
"Histogram of Normal Customers across Rating"))
rug(salesdf$Rating)
abline(v = median(salesdf$Rating), col = "magenta", lwd = 4)
library(reshape2)
# create new dataset, convert column values into rows
salesdf$Date <- as.factor(salesdf$Date)
sales <- melt(salesdf[c("cogs", "Revenue", "Date")])
## Using Date as id variables
ggplot(sales, aes(fill=variable, y=value, x=Date)) +
geom_bar(position="dodge", stat="identity") +
scale_fill_viridis(discrete = T) +
ggtitle("Revenue Vs. Cost of Goods Sold") +
theme(text = element_text(size = 12)) +
xlab("") +
labs(x="Date",y="Value ($)") +
theme(legend.title = element_blank(), axis.text.x=element_text(angle=45,hjust=1,vjust=1),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank()) +
theme(legend.position="bottom")
# Identify important variables and build models
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 0.2.0 ──
## ✔ broom 0.8.0 ✔ rsample 0.1.1
## ✔ dials 1.0.0 ✔ tibble 3.1.7
## ✔ infer 1.0.2 ✔ tidyr 1.2.0
## ✔ modeldata 0.1.1 ✔ tune 0.2.0
## ✔ parsnip 1.0.0 ✔ workflows 0.2.6
## ✔ purrr 0.3.4 ✔ workflowsets 0.2.1
## ✔ recipes 0.2.0 ✔ yardstick 1.0.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ purrr::flatten() masks jsonlite::flatten()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
## Step 1: Split the data into training and test sets
set.seed(2022)
salesdf_split <- initial_split(salesdf, prop = 3/4)
sales_train <- training(salesdf_split) # training set
sales_test <- testing(salesdf_split) # test set
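A quick sanity check of the split sizes (a small addition; the counts line up with the 750 training samples and 250 test observations reported by the models below):
nrow(sales_train)   # expected: 750 (three quarters of the 1,000 invoices)
nrow(sales_test)    # expected: 250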
Feature Engineering.
sales_train_imp <- sales_train %>%
transmute(
Branch,
City,
Customer.type = ifelse(Customer.type == "Member", 1, 0),
Gender = ifelse(Gender == "Female", 1, 0),
Product.line,
Unit.price,
Quantity,
Tax.5.,
Total,
Payment = ifelse(Payment == "Credit card", 1, 0) ,
cogs,
gross.income,
Rating
)
sales_test_imp <- sales_test %>%
transmute(
Branch,
City,
Customer.type = ifelse(Customer.type == "Member", 1, 0),
Gender = ifelse(Gender == "Female", 1, 0),
Product.line,
Unit.price,
Quantity,
Tax.5.,
Total,
Payment = ifelse(Payment == "Credit card", 1, 0) ,
cogs,
gross.income,
Rating
)
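One caveat: encoding Payment as a single 0/1 flag treats Cash and Ewallet as the same category. A hedged alternative (not used in this report) keeps Payment as a factor so the models can still distinguish all three methods:
# Hypothetical variant of the transmute() above -- Payment kept as a three-level factor
sales_train_alt <- sales_train %>%
  transmute(Customer.type = ifelse(Customer.type == "Member", 1, 0),
            Gender = ifelse(Gender == "Female", 1, 0),
            Payment = factor(Payment),   # Cash / Credit card / Ewallet kept distinct
            Unit.price, Quantity, Tax.5., Total, cogs, gross.income, Rating)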
library(corrplot)
## corrplot 0.92 loaded
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:parsnip':
##
## translate
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
## Mark the insignificant coefficients according to the specified p-value significance level
cor_9 <- rcorr(as.matrix(sales_train_imp[-c(1,2,5)]))
sales_train_imp_cor <- cor_9$r
p_mat <- cor_9$P
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(sales_train_imp_cor, method = "color", col = col(200),
type = "upper", order = "hclust",
addCoef.col = "black", # Add coefficient of correlation
tl.col = "darkblue", tl.srt = 45, #Text label color and rotation
# Combine with significance level
p.mat = p_mat, sig.level = 0.01,
# hide correlation coefficient on the principal diagonal
diag = FALSE
)
library(e1071)
##
## Attaching package: 'e1071'
## The following object is masked from 'package:Hmisc':
##
## impute
## The following object is masked from 'package:tune':
##
## tune
## The following object is masked from 'package:rsample':
##
## permutations
## The following object is masked from 'package:parsnip':
##
## tune
library(gmodels)
library(caret)
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
## The following objects are masked from 'package:yardstick':
##
## precision, recall, sensitivity, specificity
## The following object is masked from 'package:purrr':
##
## lift
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:gmodels':
##
## ci
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(ROCR)
## Naive Bayes
# accuracy = (TP + TN) / (TP + TN + FP + FN)
# error rate = 1 - accuracy
# NOTE: Customer.type is passed in with the predictors here, so the label leaks into the
# features and the cross table below is perfect by construction (see the corrected sketch after it)
sales_classifier <- naiveBayes(sales_train_imp, sales_train_imp$Customer.type)
sales_class_pred <- predict(sales_classifier, sales_test_imp )
CrossTable(sales_class_pred, sales_test_imp$Customer.type,
prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
dnn = c('predicted', 'actual'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 250
##
##
## | actual
## predicted | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 140 | 0 | 140 |
## | 0.560 | 0.000 | |
## -------------|-----------|-----------|-----------|
## 1 | 0 | 110 | 110 |
## | 0.000 | 0.440 | |
## -------------|-----------|-----------|-----------|
## Column Total | 140 | 110 | 250 |
## -------------|-----------|-----------|-----------|
##
##
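The perfect table above follows from passing Customer.type in with the predictors, so the classifier effectively sees the label it is being asked to predict. A hedged re-fit that keeps the target out of the feature matrix (expect far lower, more realistic accuracy) would look like this:
# Sketch: train on predictors only, with the label held out
train_x <- sales_train_imp[, setdiff(names(sales_train_imp), "Customer.type")]
test_x  <- sales_test_imp[,  setdiff(names(sales_test_imp),  "Customer.type")]
sales_classifier2 <- naiveBayes(train_x, factor(sales_train_imp$Customer.type))
sales_class_pred2 <- predict(sales_classifier2, test_x)
CrossTable(sales_class_pred2, sales_test_imp$Customer.type,
           prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
           dnn = c('predicted', 'actual'))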
Random Forest Model
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(caret)
library(pROC)
library(ROCR)
sales_train_imp$Customer.type <- factor(sales_train_imp$Customer.type)
sales_test_imp$Customer.type <- factor(sales_test_imp$Customer.type)
## Check the OOB error against the number of trees to choose ntree (100 is used below)
sales_rf <- randomForest(Customer.type ~ .,
data = sales_train_imp)
plot(sales_rf)
## Build model
control_rf <- trainControl(method = "cv", 5)
sales_rf <- train(Customer.type ~ .,
data = sales_train_imp, method="rf",
trControl=control_rf, ntree=100
)
sales_rf
## Random Forest
##
## 750 samples
## 12 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 600, 600, 600, 599, 601
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.4694578 -0.061161991
## 10 0.4735025 -0.058528987
## 18 0.5055472 0.007279281
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 18.
predict_rf <- predict(sales_rf, sales_test_imp)
confusionMatrix(sales_test_imp$Customer.type, predict_rf)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 62 78
## 1 53 57
##
## Accuracy : 0.476
## 95% CI : (0.4127, 0.5399)
## No Information Rate : 0.54
## P-Value [Acc > NIR] : 0.9817
##
## Kappa : -0.038
##
## Mcnemar's Test P-Value : 0.0360
##
## Sensitivity : 0.5391
## Specificity : 0.4222
## Pos Pred Value : 0.4429
## Neg Pred Value : 0.5182
## Prevalence : 0.4600
## Detection Rate : 0.2480
## Detection Prevalence : 0.5600
## Balanced Accuracy : 0.4807
##
## 'Positive' Class : 0
##
# Out-of-sample-error in test set
OOSE <- 1 - as.numeric(confusionMatrix(sales_test_imp$Customer.type, predict_rf)$overall[1])
OOSE
## [1] 0.524
plot(sales_rf)
# Re-fit with randomForest() directly; method and trControl are caret arguments and are not used by randomForest()
sales_randomF <- randomForest(Customer.type ~ .,
data = sales_train_imp, ntree = 80, mtry = 10,
importance = TRUE,
proximity = TRUE
)
varImpPlot(sales_randomF, sort=T, n.var = 10, main = 'Top Feature Importance')
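For a numeric view of the same ranking (a small addition using randomForest's importance() accessor, available because the model was fitted with importance = TRUE):
# Importance scores behind the plot above
round(importance(sales_randomF), 2)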
Tree Model
library(caret)
library(rlang)
##
## Attaching package: 'rlang'
## The following objects are masked from 'package:purrr':
##
## %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
## flatten_lgl, flatten_raw, invoke, splice
## The following objects are masked from 'package:jsonlite':
##
## flatten, unbox
library(rpart)
##
## Attaching package: 'rpart'
## The following object is masked from 'package:dials':
##
## prune
library(rpart.plot)
sales_train_imp$Customer.type <- factor(sales_train$Customer.type, levels = c("Member", "Normal"))
sales_test_imp$Customer.type <- factor(sales_test$Customer.type, levels = c("Member", "Normal"))
# build model with Classification Tree
modFit.rpart <- train(Customer.type ~ .,
data = sales_train_imp,
method = "rpart" )
pred.rpart <- predict(modFit.rpart, sales_test_imp)
confusionMatrix(pred.rpart, sales_test_imp$Customer.type)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Member Normal
## Member 71 85
## Normal 39 55
##
## Accuracy : 0.504
## 95% CI : (0.4403, 0.5676)
## No Information Rate : 0.56
## P-Value [Acc > NIR] : 0.9673
##
## Kappa : 0.0367
##
## Mcnemar's Test P-Value : 5.32e-05
##
## Sensitivity : 0.6455
## Specificity : 0.3929
## Pos Pred Value : 0.4551
## Neg Pred Value : 0.5851
## Prevalence : 0.4400
## Detection Rate : 0.2840
## Detection Prevalence : 0.6240
## Balanced Accuracy : 0.5192
##
## 'Positive' Class : Member
##
sales_tree = rpart(Customer.type ~ .,
data = sales_train_imp )
rpart.plot(sales_tree, type = 2, extra = 101, leaf.round = 3, fallen.leaves = TRUE,
varlen = 0, tweak = 1.1
)
Market Basket Analysis
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Attaching package: 'arules'
## The following object is masked from 'package:recipes':
##
## discretize
## The following object is masked from 'package:dplyr':
##
## recode
## The following objects are masked from 'package:base':
##
## abbreviate, write
## Find association rules between product lines and customers' Ratings
# Convert to transaction class: one "transaction" per Rating value, containing the product lines bought at that rating
trans_product <- as(split(salesdf[,"Product.line"], salesdf[,"Rating"]), "transactions")
## Warning in asMethod(object): removing duplicated items in transactions
summary(trans_product)
## transactions as itemMatrix in sparse format with
## 61 rows (elements/itemsets/transactions) and
## 6 columns (items) and a density of 0.931694
##
## most frequent items:
## Fashion accessories Electronic accessories Sports and travel
## 59 58 57
## Food and beverages Home and lifestyle (Other)
## 56 56 55
##
## element (itemset/transaction) length distribution:
## sizes
## 3 4 5 6
## 1 2 18 40
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.00 5.00 6.00 5.59 6.00 6.00
##
## includes extended item information - examples:
## labels
## 1 Electronic accessories
## 2 Fashion accessories
## 3 Food and beverages
##
## includes extended transaction information - examples:
## transactionID
## 1 4
## 2 4.1
## 3 4.2
inspect(trans_product[1:5])
## items transactionID
## [1] {Electronic accessories,
## Fashion accessories,
## Food and beverages,
## Health and beauty,
## Sports and travel} 4
## [2] {Electronic accessories,
## Fashion accessories,
## Food and beverages,
## Health and beauty,
## Home and lifestyle,
## Sports and travel} 4.1
## [3] {Electronic accessories,
## Fashion accessories,
## Food and beverages,
## Health and beauty,
## Home and lifestyle,
## Sports and travel} 4.2
## [4] {Electronic accessories,
## Fashion accessories,
## Food and beverages,
## Health and beauty,
## Home and lifestyle,
## Sports and travel} 4.3
## [5] {Electronic accessories,
## Fashion accessories,
## Food and beverages,
## Health and beauty,
## Home and lifestyle,
## Sports and travel} 4.4
itemFrequency(trans_product[, 1:3])
## Electronic accessories Fashion accessories Food and beverages
## 0.9508197 0.9672131 0.9180328
#Visualizing item support – item frequency plots
itemFrequencyPlot(trans_product, support = 0.1)
itemFrequencyPlot(trans_product, topN = 3)
## Training a model on the data
# support = 0.06: with 61 rating-level transactions this works out to an absolute minimum support count of 3 (reported by apriori below)
product_rules <- apriori(trans_product, parameter = list(support = 0.06, confidence = 0.25, minlen = 2))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.25 0.1 1 none FALSE TRUE 5 0.06 2
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 3
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[6 item(s), 61 transaction(s)] done [0.00s].
## sorting and recoding items ... [6 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.00s].
## writing ... [186 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
inspect(product_rules[1:3])
## lhs rhs support confidence coverage
## [1] {Health and beauty} => {Home and lifestyle} 0.8196721 0.9090909 0.9016393
## [2] {Home and lifestyle} => {Health and beauty} 0.8196721 0.8928571 0.9180328
## [3] {Health and beauty} => {Food and beverages} 0.8196721 0.9090909 0.9016393
## lift count
## [1] 0.9902597 50
## [2] 0.9902597 50
## [3] 0.9902597 50
## Evaluating model performance
summary(product_rules)
## set of 186 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3 4 5 6
## 30 60 60 30 6
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 3.000 4.000 3.581 4.000 6.000
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.6557 Min. :0.8696 Min. :0.6721 Min. :0.9644
## 1st Qu.:0.7254 1st Qu.:0.9188 1st Qu.:0.7869 1st Qu.:0.9881
## Median :0.7705 Median :0.9362 Median :0.8443 Median :0.9974
## Mean :0.7805 Mean :0.9330 Mean :0.8368 Mean :1.0014
## 3rd Qu.:0.8197 3rd Qu.:0.9455 3rd Qu.:0.8852 3rd Qu.:1.0132
## Max. :0.9180 Max. :0.9821 Max. :0.9672 Max. :1.0474
## count
## Min. :40.00
## 1st Qu.:44.25
## Median :47.00
## Mean :47.61
## 3rd Qu.:50.00
## Max. :56.00
##
## mining info:
## data ntransactions support confidence
## trans_product 61 0.06 0.25
## call
## apriori(data = trans_product, parameter = list(support = 0.06, confidence = 0.25, minlen = 2))
## Improving model performance
inspect(sort(product_rules, by = "lift")[1:5])
## lhs rhs support confidence coverage lift count
## [1] {Home and lifestyle,
## Sports and travel} => {Food and beverages} 0.8196721 0.9615385 0.8524590 1.047390 50
## [2] {Fashion accessories,
## Home and lifestyle,
## Sports and travel} => {Food and beverages} 0.8032787 0.9607843 0.8360656 1.046569 49
## [3] {Electronic accessories,
## Home and lifestyle,
## Sports and travel} => {Food and beverages} 0.7704918 0.9591837 0.8032787 1.044825 47
## [4] {Electronic accessories,
## Fashion accessories,
## Home and lifestyle,
## Sports and travel} => {Food and beverages} 0.7540984 0.9583333 0.7868852 1.043899 46
## [5] {Health and beauty,
## Home and lifestyle,
## Sports and travel} => {Food and beverages} 0.7213115 0.9565217 0.7540984 1.041925 44
Comments on inspect(product_rules[1:3]): The lift of a rule measures how much more likely one item or item set is to appear, given that another item or item set is present, relative to its typical rate of appearance. The first rule reads "if a customer buys Health and beauty, they also buy Home and lifestyle." With a support of about 82% and a confidence of 91%, the rule covers about 82 percent of the rating-level transactions and is correct in 91 percent of those containing Health and beauty. Lift compares that confidence with how often Home and lifestyle appears overall: it shows up in roughly 92 percent of transactions (56 of 61), so lift = 0.909 / 0.918, or about 0.99, which matches the value shown. A lift of about 1 means the two product lines co-occur roughly as often as would be expected if they were independent, so these rules are not particularly strong signals.
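The reported lift can be reproduced from the quantities above (the confidence is copied from the inspect() output; the right-hand-side support comes from itemFrequency()):
# Lift check for {Health and beauty} => {Home and lifestyle}
conf_rule <- 0.9090909                                           # confidence from inspect() above
supp_rhs  <- itemFrequency(trans_product)["Home and lifestyle"]  # about 0.918 (56 of 61)
conf_rule / supp_rhs                                             # about 0.99, matching the reported lift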
URL: https://www.kaggle.com/datasets/dhanushnarayananr/credit-card-fraud
Feature Explanation:
distance_from_home - the distance from home where the transaction happened.
distance_from_last_transaction - the distance from the previous transaction.
ratio_to_median_purchase_price - ratio of the transaction's purchase price to the median purchase price.
repeat_retailer - whether the transaction happened at the same retailer (1 = Yes / 0 = No).
used_chip - whether the transaction was made through a chip (credit card) (1 = Yes / 0 = No).
used_pin_number - whether the transaction used a PIN number (1 = Yes / 0 = No).
online_order - whether the transaction was an online order (1 = Yes / 0 = No).
fraud - whether the transaction is fraudulent (1 = Yes / 0 = No).
credit_dt <- read.csv(file = "C:/Users/Anhuynh/Desktop/ITU/Classes/Principles of Business Analytics/Assignments/Dataset/card_transdata.csv")
##Find missing values
colSums(is.na(credit_dt))
## distance_from_home distance_from_last_transaction
## 0 0
## ratio_to_median_purchase_price repeat_retailer
## 0 0
## used_chip used_pin_number
## 0 0
## online_order fraud
## 0 0
library(corrplot)
library(Hmisc)
## Mark the insignificant coefficients according to the specified p-value significance level
cor_9 <- rcorr(as.matrix(credit_dt))
credit_cor <- cor_9$r
p_mat <- cor_9$P
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(credit_cor, method = "color", col = col(200),
type = "upper", order = "hclust",
addCoef.col = "black", # Add coefficient of correlation
tl.col = "darkblue", tl.srt = 45, #Text label color and rotation
# Combine with significance level
p.mat = p_mat, sig.level = 0.01,
# hide correlation coefficient on the principal diagonal
diag = FALSE
)
library(ggplot2)
library(scales)
library(stringr)
##
## Attaching package: 'stringr'
## The following object is masked from 'package:recipes':
##
## fixed
##Exploratory Data Analysis
# fraud
count_fraud <- table(credit_dt$fraud)
df_fraud <- data.frame(transactions = c("No Fraud", "Fraud", "Total" ),
count = c(912597, 87403, 1000000))
ggplot(data=df_fraud, aes(x = transactions, y = count, fill = transactions)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(label= paste(count,"\n", "(", round(count/10^6*100,2), "%" , ")")), vjust=1, color="black", size=3) +
labs(title="Fraud Count") +
theme(legend.position="none") +
scale_y_continuous(labels =
label_number(scale = 1e-3, prefix = "", suffix = "K", big.mark = ',', accuracy = 1))
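The counts in df_fraud are typed in by hand; an equivalent sketch (an assumption, not from the original code) derives them from count_fraud so the plot stays in sync with the data:
# Build the summary frame from the table instead of hard-coding the counts
df_fraud2 <- data.frame(transactions = c("No Fraud", "Fraud"),
                        count = as.numeric(count_fraud[c("0", "1")]))
df_fraud2 <- rbind(df_fraud2, data.frame(transactions = "Total", count = sum(df_fraud2$count)))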
# repeat_retailer
count_retailer <- table(credit_dt$repeat_retailer)
df_retailer <- data.frame(transactions = c("Different Retailer", "Same Retailer", "Total" ),
count = c(118464, 881536, 1000000))
ggplot(data=df_retailer, aes(x = transactions, y = count, fill = transactions)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(label= paste(count,"\n", "(", round(count/10^6*100,2), "%" , ")")), vjust=1, color="black", size=3) +
labs(title="Repeated Retailer Count") +
theme(legend.position="none") +
scale_y_continuous(labels =
label_number(scale = 1e-3, prefix = "", suffix = "K", big.mark = ',', accuracy = 1))
# used_chip (credit card)
count_chip <- table(credit_dt$used_chip)
df_chip <- data.frame(transactions = c("Without Credit Card", "With Credit Card", "Total" ),
count = c(649601, 350399, 1000000))
ggplot(data=df_chip, aes(x = transactions, y = count, fill = transactions)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(label= paste(count,"\n", "(", round(count/10^6*100,2), "%" , ")")), vjust=1, color="black", size=3) +
labs(title="Credit Card User Count") +
theme(legend.position="none") +
scale_y_continuous(labels =
label_number(scale = 1e-3, prefix = "", suffix = "K", big.mark = ',', accuracy = 1))
# used_pin_number (PIN number)
count_pin <- table(credit_dt$used_pin_number)
df_pin <- data.frame(transactions = c("Without PIN number", "With PIN number", "Total" ),
count = c(899392, 100608, 1000000))
ggplot(data=df_pin, aes(x = transactions, y = count, fill = transactions)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(label= paste(count,"\n", "(", round(count/10^6*100,2), "%" , ")")), vjust=1, color="black", size=3) +
labs(title="PIN Number User Count") +
theme(legend.position="none") +
scale_y_continuous(labels =
label_number(scale = 1e-3, prefix = "", suffix = "K", big.mark = ',', accuracy = 1))
# online_order
count_order <- table(credit_dt$online_order)
df_order <- data.frame(transactions = c("Without Online Order", "With Online Order", "Total" ),
count = c(349448, 650552, 1000000))
ggplot(data=df_order, aes(x = transactions, y = count, fill = transactions)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(label= paste(count,"\n", "(", round(count/10^6*100,2), "%" , ")")), vjust=1, color="black", size=3) +
labs(title="Online Order Count") +
theme(legend.position="none") +
scale_y_continuous(labels =
label_number(scale = 1e-3, prefix = "", suffix = "K", big.mark = ',', accuracy = 1))
## Distribution of the continuous variables: how many transactions fall into each bucket of measurements.
# distance_from_home
## with Fraud
hist(credit_dt$distance_from_home[which(credit_dt$fraud == 1)], col = "green")
rug(credit_dt$distance_from_home[which(credit_dt$fraud == 1)])
abline(v = median(credit_dt$distance_from_home[which(credit_dt$fraud == 1)]), col = "magenta", lwd = 4)
## without Fraud
hist(credit_dt$distance_from_home[which(credit_dt$fraud == 0)], col = "light green")
rug(credit_dt$distance_from_home[which(credit_dt$fraud == 0)])
abline(v = median(credit_dt$distance_from_home[which(credit_dt$fraud == 0)]), col = "magenta", lwd = 4)
# distance_from_last_transaction
hist(credit_dt$distance_from_last_transaction[which(credit_dt$fraud == 1)], col = "blue")
rug(credit_dt$distance_from_last_transaction[which(credit_dt$fraud == 1)])
abline(v = median(credit_dt$distance_from_last_transaction[which(credit_dt$fraud == 1)]), col = "magenta", lwd = 4)
hist(credit_dt$distance_from_last_transaction[which(credit_dt$fraud == 0)], col = "light blue")
rug(credit_dt$distance_from_last_transaction[which(credit_dt$fraud == 0)])
abline(v = median(credit_dt$distance_from_last_transaction[which(credit_dt$fraud == 0)]), col = "magenta", lwd = 4)
# ratio_to_median_purchase_price
hist(credit_dt$ratio_to_median_purchase_price[which(credit_dt$fraud == 1)], col = "maroon")
rug(credit_dt$ratio_to_median_purchase_price[which(credit_dt$fraud == 1)])
abline(v = median(credit_dt$ratio_to_median_purchase_price[which(credit_dt$fraud == 1)]), col = "magenta", lwd = 4)
hist(credit_dt$ratio_to_median_purchase_price[which(credit_dt$fraud == 0)], col = "purple")
rug(credit_dt$ratio_to_median_purchase_price[which(credit_dt$fraud == 0)])
abline(v = median(credit_dt$ratio_to_median_purchase_price[which(credit_dt$fraud == 0)]), col = "magenta", lwd = 4)
Data Modeling:
library(tidymodels)
## Split the data into training and test sets
set.seed(2023)
credit_split <- initial_split(credit_dt, prop = 3/4)
credit_train <- training(credit_split) # training set
credit_test <- testing(credit_split) # test set
Logistic Regression Model
library(caret)
library(pROC)
library(ROCR)
credit_train$fraud <- factor(credit_train$fraud)
credit_test$fraud <- factor(credit_test$fraud)
# The receiver operating characteristic (ROC) curve is commonly used to examine the tradeoff between detecting true positives and avoiding false positives.
credit_gml <- glm(fraud ~ distance_from_home + distance_from_last_transaction + ratio_to_median_purchase_price + repeat_retailer + used_chip + used_pin_number + online_order, family = "binomial", data = credit_train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
credit.probs <- predict(credit_gml, credit_test, type="response")
pr <- prediction(credit.probs, credit_test$fraud)
# plotting ROC curve
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.9668457
# Calculate a test misclassification rate
table(credit_test$fraud) / nrow(credit_test)
##
## 0 1
## 0.912112 0.087888
What does AUC mean? The Area Under the Curve (AUC) is the measure of the ability of a classifier to distinguish between classes and is used as a summary of the ROC curve. The higher the AUC, the better the performance of the model at distinguishing between the positive and negative classes.
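As a cross-check (a hedged addition; pROC is already loaded above), the same area can be computed directly from the fitted probabilities and should agree with the ROCR value of about 0.967:
# AUC via pROC
roc_obj <- roc(credit_test$fraud, credit.probs)
auc(roc_obj)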
The table above shows the class distribution of the test set: roughly 8.79% of test transactions are fraudulent. A model that simply predicted "non-Fraud" for every transaction would therefore misclassify about 8.79% of them, which serves as a baseline misclassification rate.
Calculate the cost saving per transaction. Assume that a fraudulent transaction missed by the model and predicted as "non-Fraud" (a False Negative) costs about $4 to handle, while handling a fraud that is correctly flagged as "Fraud" (a True Positive) is roughly four times cheaper, about $1 per transaction; reviewing a false alarm (False Positive) is also assumed to cost $1.
FN (predicted not fraud, actually fraud): $4
TP (predicted fraud, actually fraud): $1
FP (predicted fraud, actually not fraud): $1
TN (predicted not fraud, actually not fraud): $0
Cost per transaction = FN($4) + TP($1) + FP($1) + TN($0)
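The same rule can be written as a small helper (a sketch of the cost scheme above; the rates are proportions of test transactions, and the example call uses made-up rates for illustration):
# Cost per transaction under the assumed $4 / $1 / $1 / $0 scheme
cost_per_txn <- function(fn_rate, tp_rate, fp_rate, tn_rate = 0) {
  4 * fn_rate + 1 * tp_rate + 1 * fp_rate + 0 * tn_rate
}
cost_per_txn(fn_rate = 0.02, tp_rate = 0.07, fp_rate = 0.05)  # illustrative rates only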
Apply this cost evaluation to our model. We start by fitting the model, and making predictions in the form of probabilities.
## let’s create a threshold vector and a cost vector.
# threshold vector
thresh <- seq(0.1,1.0, length = 10)
#cost vector
cost = rep(0,length(thresh))
dim(credit_test) ## 250,000
## [1] 250000 8
# cost as a function of threshold
for (i in 1:length(thresh)){
glm.pred = rep("No", length(credit.probs))
glm.pred <- ifelse(credit.probs > thresh[[i]], 1, 0)
glm.pred <- as.factor(glm.pred)
credit_test$fraud <- as.factor(credit_test$fraud)
x <- confusionMatrix(glm.pred, credit_test$fraud)
TN <- x$table[1]/250000
FP <- x$table[2]/250000
FN <- x$table[3]/250000
TP <- x$table[4]/250000
cost[i] = FN*4 + TP*1 + FP*1 + TN*0
}
## Warning in confusionMatrix.default(glm.pred, credit_test$fraud): Levels are not
## in the same order for reference and data. Refactoring data to match.
##Confusion Matrix
x
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 228028 21972
## 1 0 0
##
## Accuracy : 0.9121
## 95% CI : (0.911, 0.9132)
## No Information Rate : 0.9121
## P-Value [Acc > NIR] : 0.5018
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.9121
## Neg Pred Value : NaN
## Prevalence : 0.9121
## Detection Rate : 0.9121
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : 0
##
Let's assume the company currently uses a "simple" approach that keeps the default threshold of 0.5. We make predictions at that threshold and calculate the cost.
# simple model: assume threshold is 0.5
glm.pred = rep("No", length(credit.probs))
glm.pred <- ifelse(credit.probs > 0.5, 1, 0)
glm.pred <- as.factor(glm.pred)
credit_test$fraud <- as.factor(credit_test$fraud)
x <- confusionMatrix(glm.pred, credit_test$fraud)
TN <- x$table[1]/250000
FP <- x$table[2]/250000
FN <- x$table[3]/250000
TP <- x$table[4]/250000
cost_simple = FN*4 + TP*1 + FP*1 + TN*0
# putting results in a dataframe for plotting
dat <- data.frame(
model = c(rep("optimized",10),"simple"),
cost_per_transaction = c(cost, cost_simple),
threshold = c(thresh, 0.5)
)
# plotting
ggplot(dat, aes(x = threshold, y = cost_per_transaction, group = model, colour = model)) +
geom_line() +
geom_point()
Looking at the results, the cost per transaction bottoms out at a lower threshold (around 0.2 on the curve), while the "simple" model the company currently implements sits at a threshold of 0.5 and costs more per transaction; the gap between the two works out to about $0.037 per transaction.
If 1,000,000 transactions were switched from the existing model to the new one, the cost saving would be:
# cost savings of optimized model (threshold = 0.2) compared to baseline model (threshold = 0.5)
savings_per_transaction = cost_simple - min(cost)
total_savings = 1000000*savings_per_transaction
total_savings
## [1] 37052
Classification Tree Model
library(rlang)
library(rpart.plot)
credit_tree = rpart(fraud ~ distance_from_home + distance_from_last_transaction + ratio_to_median_purchase_price + repeat_retailer + used_chip + used_pin_number + online_order,
data = credit_train )
rpart.plot(credit_tree, type = 2, extra = 101, leaf.round = 3, fallen.leaves = TRUE,
varlen = 0, tweak = 1.2)
credit_tree_tst_pred = predict(credit_tree, credit_test, type = "class")
table(predicted = credit_tree_tst_pred, actual = credit_test$fraud)
## actual
## predicted 0 1
## 0 227774 358
## 1 254 21614
# function to predict accuracy of Classification tree model
calc_acc = function(actual, predicted) {
mean(actual == predicted)
}
# Accuracy
(tree_tst_acc = calc_acc(predicted = credit_tree_tst_pred, actual = credit_test$fraud))
## [1] 0.997552
# Out-of-sample-error in test set
OOSE <- 1 - as.numeric(confusionMatrix(credit_test$fraud, credit_tree_tst_pred)$overall[1])
OOSE
## [1] 0.002448