Business Goals:

  1. How does a financial institution increase market share in consumer credit cards?
  2. How should a credit card issuer or merchant use emerging technology to acquire the best customers?
  3. How does a financial institution grow its credit card portfolio while mitigating credit risk?

How does a financial institution optimize its product offering? How should a credit card issuer use digital and emerging technology to acquire and retain the best customers?

Data Sources: Supermarket Sales Data

Invoice ID: Computer-generated sales slip invoice identification number

Branch: Branch of supercenter (3 branches are available identified by A, B and C).

City: Location of supercenters

Customer type: Type of customer, recorded as Member for customers using a member card and Normal for those without one.

Gender: Gender type of customer

Product line: General item categorization groups - Electronic accessories, Fashion accessories, Food and beverages, Health and beauty, Home and lifestyle, Sports and travel

Unit price: Price of each product in $

Quantity: Number of products purchased by customer

Tax: 5% tax applied to the customer's purchase

Total: Total price including tax

Date: Date of purchase (Record available from January 2019 to March 2019)

Time: Purchase time (10am to 9pm)

Payment: Payment used by customer for purchase (3 methods are available – Cash, Credit card and Ewallet)

COGS: Cost of goods sold

Gross margin percentage: Gross margin percentage

Gross income: Gross income

Rating: Customer satisfaction rating of their overall shopping experience (on a scale of 1 to 10)

URL: https://www.kaggle.com/datasets/neoanderson144/supermarketsales?select=supermarketsales.json

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(jsonlite)
library(viridis)
## Loading required package: viridisLite
library(hrbrthemes)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
##       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(ggplot2)
# read, flatten json file, and convert to data frame

salesdt <- flatten(fromJSON(txt = "C:/Users/Anhuynh/Desktop/ITU/Classes/Principles of Business Analytics/Assignments/Dataset/supermarketsales.json"))

tab_list <- lapply(1:nrow(salesdt),
                   function(i) data.frame(salesdt[i, -17], salesdt[i, 17],
                                          stringsAsFactors = FALSE))
flat_table <- bind_rows(tab_list)
# calculate Revenue 
salesdf <- cbind(flat_table, Revenue = flat_table$cogs/(1-flat_table$gross.margin.percentage/100))
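# quick check of the revenue formula with illustrative numbers (not taken from the data):
# a row with cogs = 500 and a gross margin of 4.76% implies
500 / (1 - 4.76/100)   # Revenue of about 525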

# rename a column
colnames(salesdf)[17] <- "Rating"

# convert from "char" to "date"
salesdf$Date <- mdy(salesdf$Date)
##Find missing values
colSums(is.na(salesdf))
##              Invoice.ID                  Branch                    City 
##                       0                       0                       0 
##           Customer.type                  Gender            Product.line 
##                       0                       0                       0 
##              Unit.price                Quantity                  Tax.5. 
##                       0                       0                       0 
##                   Total                    Date                    Time 
##                       0                       0                       0 
##                 Payment                    cogs gross.margin.percentage 
##                       0                       0                       0 
##            gross.income                  Rating                 Revenue 
##                       0                       0                       0
# Changes in revenue over Payment type and Customer type
ggplot(salesdf, aes(fill = Customer.type, y = Revenue, x = Payment)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(discrete = TRUE) +
  ggtitle("Changes in Revenue over Payment types") +
  labs(x = "Payment type", y = "Revenue ($)") +
  theme(text = element_text(size = 12),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank())

ggplot(salesdf, aes(fill = Customer.type, y = Revenue, x = City)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(discrete = TRUE) +
  ggtitle("Changes in Revenue across City") +
  labs(x = "City", y = "Revenue ($)") +
  theme(text = element_text(size = 12),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank())

ggplot(salesdf, aes(fill = Customer.type, y = Revenue, x = Product.line)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(discrete = TRUE) +
  ggtitle("Changes in Revenue across Product lines") +
  labs(x = "Product line", y = "Revenue ($)") +
  theme(text = element_text(size = 12),
        legend.title = element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank())

# Distribution of the Rating variable, i.e., how many customers fall into each rating bucket.

hist(salesdf$Rating[which(salesdf$Customer.type == "Member")], col = "maroon",
     xlab = "Rating scale", main = "Histogram of Member Customers across Rating")
rug(salesdf$Rating[which(salesdf$Customer.type == "Member")])
abline(v = median(salesdf$Rating[which(salesdf$Customer.type == "Member")]), col = "magenta", lwd = 4)

hist(salesdf$Rating[which(salesdf$Customer.type == "Normal")], col = "gold",
     xlab = "Rating scale", main = "Histogram of Normal Customers across Rating")
rug(salesdf$Rating[which(salesdf$Customer.type == "Normal")])
abline(v = median(salesdf$Rating[which(salesdf$Customer.type == "Normal")]), col = "magenta", lwd = 4)

library(reshape2)
# create new dataset, convert column values into rows
salesdf$Date <- as.factor(salesdf$Date)
sales <- melt(salesdf[c("cogs", "Revenue", "Date")])
## Using Date as id variables
ggplot(sales, aes(fill = variable, y = value, x = Date)) +
  geom_bar(position = "dodge", stat = "identity") +
  scale_fill_viridis(discrete = TRUE) +
  ggtitle("Revenue Vs. Cost of Goods Sold") +
  labs(x = "Date", y = "Value ($)") +
  theme(text = element_text(size = 12),
        legend.title = element_blank(),
        legend.position = "bottom",
        axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank())
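For reference, melt() above stacks the cogs and Revenue columns into a long variable/value pair keyed by Date, which is the shape the dodged bars expect. A tiny illustration with made-up numbers:

# illustration only: Date stays as the id column, while cogs and Revenue become
# levels of a 'variable' column with their values stored in 'value'
toy <- data.frame(Date = c("2019-01-01", "2019-01-02"), cogs = c(100, 200), Revenue = c(105, 210))
melt(toy, id.vars = "Date")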

# Identify important variables and build models

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 0.2.0 ──
## ✔ broom        0.8.0     ✔ rsample      0.1.1
## ✔ dials        1.0.0     ✔ tibble       3.1.7
## ✔ infer        1.0.2     ✔ tidyr        1.2.0
## ✔ modeldata    0.1.1     ✔ tune         0.2.0
## ✔ parsnip      1.0.0     ✔ workflows    0.2.6
## ✔ purrr        0.3.4     ✔ workflowsets 0.2.1
## ✔ recipes      0.2.0     ✔ yardstick    1.0.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ purrr::flatten() masks jsonlite::flatten()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ recipes::step()  masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
## Step 1: Split the data into training and test sets
set.seed(2022)
salesdf_split <- initial_split(salesdf,  prop = 3/4)
sales_train <- training(salesdf_split) # training set
sales_test <- testing(salesdf_split) # test set
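With prop = 3/4 on the 1,000 supermarket transactions, this leaves 750 rows for training and 250 for testing, the counts that appear in the model summaries below; a quick check:

# confirm the split sizes (750 training rows, 250 test rows)
nrow(sales_train)
nrow(sales_test)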

Feature Engineering.

sales_train_imp <- sales_train %>%
  transmute(
    Branch,
    City,
    Customer.type = ifelse(Customer.type == "Member", 1, 0),
    Gender = ifelse(Gender == "Female", 1, 0),
    Product.line,
    Unit.price,
    Quantity,
    Tax.5.,
    Total,
    Payment = ifelse(Payment == "Credit card", 1, 0),
    cogs,
    gross.income,
    Rating
  )

sales_test_imp <- sales_test %>%
  transmute(
    Branch,
    City,
    Customer.type = ifelse(Customer.type == "Member", 1, 0),
    Gender = ifelse(Gender == "Female", 1, 0),
    Product.line,
    Unit.price,
    Quantity,
    Tax.5.,
    Total,
    Payment = ifelse(Payment == "Credit card", 1, 0),
    cogs,
    gross.income,
    Rating
  )
library(corrplot)
## corrplot 0.92 loaded
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following object is masked from 'package:parsnip':
## 
##     translate
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## Mark the insignificant coefficients according to the specified p-value significance level
cor_9 <- rcorr(as.matrix(sales_train_imp[-c(1,2,5)]))
sales_train_imp_cor <- cor_9$r
p_mat <- cor_9$P

col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))

corrplot(sales_train_imp_cor, method = "color", col = col(200),  
         type = "upper", order = "hclust", 
         addCoef.col = "black", # Add coefficient of correlation
         tl.col = "darkblue", tl.srt = 45, #Text label color and rotation
         # Combine with significance level
         p.mat = p_mat, sig.level = 0.01,  
         # hide correlation coefficient on the principal diagonal
         diag = FALSE 
         )

library(e1071)
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:Hmisc':
## 
##     impute
## The following object is masked from 'package:tune':
## 
##     tune
## The following object is masked from 'package:rsample':
## 
##     permutations
## The following object is masked from 'package:parsnip':
## 
##     tune
library(gmodels)
library(caret)
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
## The following objects are masked from 'package:yardstick':
## 
##     precision, recall, sensitivity, specificity
## The following object is masked from 'package:purrr':
## 
##     lift
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following object is masked from 'package:gmodels':
## 
##     ci
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(ROCR)

## Naive Bayes
# accuracy = (TP + TN) / (TP + TN + FP + FN)
# error rate = 1 - accuracy

# Note: Customer.type is still a column of sales_train_imp, so the label is handed to the
# classifier as a predictor; that leakage is what produces the perfect cross-table below.
sales_classifier <- naiveBayes(sales_train_imp, sales_train_imp$Customer.type)

sales_class_pred <- predict(sales_classifier, sales_test_imp)

CrossTable(sales_class_pred, sales_test_imp$Customer.type,
 prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
 dnn = c('predicted', 'actual'))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  250 
## 
##  
##              | actual 
##    predicted |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##            0 |       140 |         0 |       140 | 
##              |     0.560 |     0.000 |           | 
## -------------|-----------|-----------|-----------|
##            1 |         0 |       110 |       110 | 
##              |     0.000 |     0.440 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |       140 |       110 |       250 | 
## -------------|-----------|-----------|-----------|
## 
## 
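The perfect cross-table above follows from Customer.type being part of the predictor frame handed to naiveBayes(). A sketch of the same fit with the label dropped from the predictors (the object names here are ours; the resulting table will be far less flattering):

# sketch: remove the target column from the predictors so the classifier
# cannot see the label it is asked to predict
nb_train_x <- sales_train_imp[, setdiff(names(sales_train_imp), "Customer.type")]
nb_test_x  <- sales_test_imp[, setdiff(names(sales_test_imp), "Customer.type")]
nb_noleak  <- naiveBayes(nb_train_x, factor(sales_train_imp$Customer.type))
nb_pred    <- predict(nb_noleak, nb_test_x)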

Random Forest Model

library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(caret)
library(pROC)
library(ROCR)

sales_train_imp$Customer.type <- factor(sales_train_imp$Customer.type)

sales_test_imp$Customer.type <- factor(sales_test_imp$Customer.type)

## Fit a forest with the default number of trees and inspect how the error changes as trees are added
sales_rf <- randomForest(Customer.type ~ .,
                         data = sales_train_imp)

plot(sales_rf)

## Build model 
control_rf <- trainControl(method = "cv", 5)

sales_rf <- train(Customer.type ~ .,
                data = sales_train_imp, method="rf", 
                  trControl=control_rf, ntree=100
                  )
sales_rf
## Random Forest 
## 
## 750 samples
##  12 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 600, 600, 600, 599, 601 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa       
##    2    0.4694578  -0.061161991
##   10    0.4735025  -0.058528987
##   18    0.5055472   0.007279281
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 18.
predict_rf <- predict(sales_rf, sales_test_imp)

confusionMatrix(sales_test_imp$Customer.type, predict_rf)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 62 78
##          1 53 57
##                                           
##                Accuracy : 0.476           
##                  95% CI : (0.4127, 0.5399)
##     No Information Rate : 0.54            
##     P-Value [Acc > NIR] : 0.9817          
##                                           
##                   Kappa : -0.038          
##                                           
##  Mcnemar's Test P-Value : 0.0360          
##                                           
##             Sensitivity : 0.5391          
##             Specificity : 0.4222          
##          Pos Pred Value : 0.4429          
##          Neg Pred Value : 0.5182          
##              Prevalence : 0.4600          
##          Detection Rate : 0.2480          
##    Detection Prevalence : 0.5600          
##       Balanced Accuracy : 0.4807          
##                                           
##        'Positive' Class : 0               
## 
# Out-of-sample-error in test set
OOSE <- 1 - as.numeric(confusionMatrix(sales_test_imp$Customer.type, predict_rf)$overall[1])
OOSE
## [1] 0.524
plot(sales_rf)

# RandomForest Model
sales_randomF <- randomForest(Customer.type ~ .,
                              data = sales_train_imp,
                              ntree = 80, mtry = 10,
                              importance = TRUE,
                              proximity = TRUE)
varImpPlot(sales_randomF, sort=T, n.var = 10, main = 'Top Feature Importance')
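The numeric scores behind the plot can be pulled out with randomForest's importance() accessor if needed:

# numeric importance scores (including MeanDecreaseAccuracy and MeanDecreaseGini) behind the plot above
head(importance(sales_randomF))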

Tree Model

library(caret)
library(rlang)
## 
## Attaching package: 'rlang'
## The following objects are masked from 'package:purrr':
## 
##     %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
##     flatten_lgl, flatten_raw, invoke, splice
## The following objects are masked from 'package:jsonlite':
## 
##     flatten, unbox
library(rpart)
## 
## Attaching package: 'rpart'
## The following object is masked from 'package:dials':
## 
##     prune
library(rpart.plot)

sales_train_imp$Customer.type <- factor(sales_train$Customer.type, levels = c("Member", "Normal"))

sales_test_imp$Customer.type <- factor(sales_test$Customer.type, levels = c("Member", "Normal"))


# build model with Classification Tree 
modFit.rpart <- train(Customer.type ~ .,
                data = sales_train_imp, 
                      method = "rpart" )  

pred.rpart <- predict(modFit.rpart, sales_test_imp)

confusionMatrix(pred.rpart, sales_test_imp$Customer.type)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Member Normal
##     Member     71     85
##     Normal     39     55
##                                           
##                Accuracy : 0.504           
##                  95% CI : (0.4403, 0.5676)
##     No Information Rate : 0.56            
##     P-Value [Acc > NIR] : 0.9673          
##                                           
##                   Kappa : 0.0367          
##                                           
##  Mcnemar's Test P-Value : 5.32e-05        
##                                           
##             Sensitivity : 0.6455          
##             Specificity : 0.3929          
##          Pos Pred Value : 0.4551          
##          Neg Pred Value : 0.5851          
##              Prevalence : 0.4400          
##          Detection Rate : 0.2840          
##    Detection Prevalence : 0.6240          
##       Balanced Accuracy : 0.5192          
##                                           
##        'Positive' Class : Member          
## 
sales_tree = rpart(Customer.type ~ .,
                data = sales_train_imp )

rpart.plot(sales_tree, type = 2, extra = 101, leaf.round = 3, fallen.leaves = TRUE,
    varlen = 0, tweak = 1.1
    )

Market Basket Analysis

library(arules)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Attaching package: 'arules'
## The following object is masked from 'package:recipes':
## 
##     discretize
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
## Find association rules between product lines and customers' Ratings
# Convert to transaction class: each distinct Rating value becomes one "transaction"
# containing the product lines purchased at that rating
trans_product <- as(split(salesdf[, "Product.line"], salesdf[, "Rating"]), "transactions")
## Warning in asMethod(object): removing duplicated items in transactions
summary(trans_product)
## transactions as itemMatrix in sparse format with
##  61 rows (elements/itemsets/transactions) and
##  6 columns (items) and a density of 0.931694 
## 
## most frequent items:
##    Fashion accessories Electronic accessories      Sports and travel 
##                     59                     58                     57 
##     Food and beverages     Home and lifestyle                (Other) 
##                     56                     56                     55 
## 
## element (itemset/transaction) length distribution:
## sizes
##  3  4  5  6 
##  1  2 18 40 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3.00    5.00    6.00    5.59    6.00    6.00 
## 
## includes extended item information - examples:
##                   labels
## 1 Electronic accessories
## 2    Fashion accessories
## 3     Food and beverages
## 
## includes extended transaction information - examples:
##   transactionID
## 1             4
## 2           4.1
## 3           4.2
inspect(trans_product[1:5])
##     items                     transactionID
## [1] {Electronic accessories,               
##      Fashion accessories,                  
##      Food and beverages,                   
##      Health and beauty,                    
##      Sports and travel}                 4  
## [2] {Electronic accessories,               
##      Fashion accessories,                  
##      Food and beverages,                   
##      Health and beauty,                    
##      Home and lifestyle,                   
##      Sports and travel}                 4.1
## [3] {Electronic accessories,               
##      Fashion accessories,                  
##      Food and beverages,                   
##      Health and beauty,                    
##      Home and lifestyle,                   
##      Sports and travel}                 4.2
## [4] {Electronic accessories,               
##      Fashion accessories,                  
##      Food and beverages,                   
##      Health and beauty,                    
##      Home and lifestyle,                   
##      Sports and travel}                 4.3
## [5] {Electronic accessories,               
##      Fashion accessories,                  
##      Food and beverages,                   
##      Health and beauty,                    
##      Home and lifestyle,                   
##      Sports and travel}                 4.4
itemFrequency(trans_product[, 1:3])
## Electronic accessories    Fashion accessories     Food and beverages 
##              0.9508197              0.9672131              0.9180328
#Visualizing item support – item frequency plots
itemFrequencyPlot(trans_product, support = 0.1)

itemFrequencyPlot(trans_product, topN = 3)

## Training a model on the data
# with only 61 rating-level transactions, support = 0.06 corresponds to an absolute
# count of roughly 3-4 transactions (see "Absolute minimum support count" in the output below)
product_rules <- apriori(trans_product, parameter = list(support = 0.06, confidence = 0.25, minlen = 2))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##        0.25    0.1    1 none FALSE            TRUE       5    0.06      2
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 3 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[6 item(s), 61 transaction(s)] done [0.00s].
## sorting and recoding items ... [6 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.00s].
## writing ... [186 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
inspect(product_rules[1:3])
##     lhs                     rhs                  support   confidence coverage 
## [1] {Health and beauty}  => {Home and lifestyle} 0.8196721 0.9090909  0.9016393
## [2] {Home and lifestyle} => {Health and beauty}  0.8196721 0.8928571  0.9180328
## [3] {Health and beauty}  => {Food and beverages} 0.8196721 0.9090909  0.9016393
##     lift      count
## [1] 0.9902597 50   
## [2] 0.9902597 50   
## [3] 0.9902597 50
## Evaluating model performance
 summary(product_rules)
## set of 186 rules
## 
## rule length distribution (lhs + rhs):sizes
##  2  3  4  5  6 
## 30 60 60 30  6 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   3.000   4.000   3.581   4.000   6.000 
## 
## summary of quality measures:
##     support         confidence        coverage           lift       
##  Min.   :0.6557   Min.   :0.8696   Min.   :0.6721   Min.   :0.9644  
##  1st Qu.:0.7254   1st Qu.:0.9188   1st Qu.:0.7869   1st Qu.:0.9881  
##  Median :0.7705   Median :0.9362   Median :0.8443   Median :0.9974  
##  Mean   :0.7805   Mean   :0.9330   Mean   :0.8368   Mean   :1.0014  
##  3rd Qu.:0.8197   3rd Qu.:0.9455   3rd Qu.:0.8852   3rd Qu.:1.0132  
##  Max.   :0.9180   Max.   :0.9821   Max.   :0.9672   Max.   :1.0474  
##      count      
##  Min.   :40.00  
##  1st Qu.:44.25  
##  Median :47.00  
##  Mean   :47.61  
##  3rd Qu.:50.00  
##  Max.   :56.00  
## 
## mining info:
##           data ntransactions support confidence
##  trans_product            61    0.06       0.25
##                                                                                            call
##  apriori(data = trans_product, parameter = list(support = 0.06, confidence = 0.25, minlen = 2))
## Improving model performance
inspect(sort(product_rules, by = "lift")[1:5])
##     lhs                          rhs                    support confidence  coverage     lift count
## [1] {Home and lifestyle,                                                                           
##      Sports and travel}       => {Food and beverages} 0.8196721  0.9615385 0.8524590 1.047390    50
## [2] {Fashion accessories,                                                                          
##      Home and lifestyle,                                                                           
##      Sports and travel}       => {Food and beverages} 0.8032787  0.9607843 0.8360656 1.046569    49
## [3] {Electronic accessories,                                                                       
##      Home and lifestyle,                                                                           
##      Sports and travel}       => {Food and beverages} 0.7704918  0.9591837 0.8032787 1.044825    47
## [4] {Electronic accessories,                                                                       
##      Fashion accessories,                                                                          
##      Home and lifestyle,                                                                           
##      Sports and travel}       => {Food and beverages} 0.7540984  0.9583333 0.7868852 1.043899    46
## [5] {Health and beauty,                                                                            
##      Home and lifestyle,                                                                           
##      Sports and travel}       => {Food and beverages} 0.7213115  0.9565217 0.7540984 1.041925    44

Comments on inspect(product_rules[1:3]): the lift of a rule measures how much more likely one item or item set is to be purchased, relative to its typical rate of purchase, given that another item or item set has been purchased. Take the first rule, "if a transaction contains Health and beauty, it also contains Home and lifestyle." With a support of about 82% and a confidence of 91%, the rule applies to about 82% of transactions and is correct in about 91% of the transactions that contain Health and beauty (its coverage of about 90% is the share of transactions containing Health and beauty at all). Home and lifestyle appears in about 92% of all transactions, so the lift is roughly 0.91 / 0.92 ≈ 0.99, matching the value shown: knowing that a transaction contains Health and beauty leaves the customer about as likely, in fact very slightly less likely, to buy Home and lifestyle as the average transaction. Because nearly every rating-level transaction contains almost every product line, all of these rules have lifts close to 1.
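A quick way to sanity-check the lift of the first rule is to recompute it from the confidence shown above and the overall support of its right-hand side, which itemFrequency() provides:

# lift of {Health and beauty} => {Home and lifestyle}, recomputed by hand
rhs_support <- itemFrequency(trans_product)["Home and lifestyle"]   # about 0.918
0.9090909 / rhs_support                                             # about 0.99, matching the inspect() output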

Part II: Credit Card Fraud

URL: https://www.kaggle.com/datasets/dhanushnarayananr/credit-card-fraud

Feature Explanation:

distance_from_home - the distance from home at which the transaction happened.

distance_from_last_transaction - the distance from the location of the last transaction.

ratio_to_median_purchase_price - ratio of the transaction's purchase price to the median purchase price.

repeat_retailer - whether the transaction happened at the same retailer. (1-Yes / 0-No)

used_chip - whether the transaction was made through a chip (credit card). (1-Yes / 0-No)

used_pin_number - whether the transaction used a PIN number. (1-Yes / 0-No)

online_order - whether the transaction is an online order. (1-Yes / 0-No)

fraud - whether the transaction is fraudulent. (1-Yes / 0-No)

credit_dt <- read.csv(file = "C:/Users/Anhuynh/Desktop/ITU/Classes/Principles of Business Analytics/Assignments/Dataset/card_transdata.csv")
##Find missing values
colSums(is.na(credit_dt))
##             distance_from_home distance_from_last_transaction 
##                              0                              0 
## ratio_to_median_purchase_price                repeat_retailer 
##                              0                              0 
##                      used_chip                used_pin_number 
##                              0                              0 
##                   online_order                          fraud 
##                              0                              0
library(corrplot)
library(Hmisc)

## Mark the insignificant coefficients according to the specified p-value significance level
cor_9 <- rcorr(as.matrix(credit_dt))
credit_cor <- cor_9$r
p_mat <- cor_9$P

col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))

corrplot(credit_cor, method = "color", col = col(200),  
         type = "upper", order = "hclust", 
         addCoef.col = "black", # Add coefficient of correlation
         tl.col = "darkblue", tl.srt = 45, #Text label color and rotation
         # Combine with significance level
         p.mat = p_mat, sig.level = 0.01,  
         # hide correlation coefficient on the principal diagonal
         diag = FALSE 
         )

library(ggplot2)
library(scales)
library(stringr)
## 
## Attaching package: 'stringr'
## The following object is masked from 'package:recipes':
## 
##     fixed
##Exploratory Data Analysis
# fraud
count_fraud <- table(credit_dt$fraud)

df_fraud <- data.frame(transactions = c("No Fraud", "Fraud", "Total" ), 
                      count = c(912597, 87403, 1000000))

ggplot(data = df_fraud, aes(x = transactions, y = count, fill = transactions)) +
  geom_bar(stat = "identity", width = 0.8) +
  geom_text(aes(label = paste(count, "\n", "(", round(count / 10^6 * 100, 2), "%", ")")),
            vjust = 1, color = "black", size = 3) +
  labs(title = "Fraud Count") +
  theme(legend.position = "none") +
  scale_y_continuous(labels = label_number(scale = 1e-3, suffix = "K", big.mark = ",", accuracy = 1))

# repeat_retailer
count_retailer <- table(credit_dt$repeat_retailer)

df_retailer <- data.frame(transactions = c("Different Retailer", "Same Retailer", "Total" ), 
                      count = c(118464, 881536, 1000000))

ggplot(data = df_retailer, aes(x = transactions, y = count, fill = transactions)) +
  geom_bar(stat = "identity", width = 0.8) +
  geom_text(aes(label = paste(count, "\n", "(", round(count / 10^6 * 100, 2), "%", ")")),
            vjust = 1, color = "black", size = 3) +
  labs(title = "Repeated Retailer Count") +
  theme(legend.position = "none") +
  scale_y_continuous(labels = label_number(scale = 1e-3, suffix = "K", big.mark = ",", accuracy = 1))

# used_chip (credit card)
count_chip <- table(credit_dt$used_chip)

df_chip <- data.frame(transactions = c("Without Credit Card", "With Credit Card", "Total" ), 
                      count = c(649601, 350399, 1000000))

ggplot(data = df_chip, aes(x = transactions, y = count, fill = transactions)) +
  geom_bar(stat = "identity", width = 0.8) +
  geom_text(aes(label = paste(count, "\n", "(", round(count / 10^6 * 100, 2), "%", ")")),
            vjust = 1, color = "black", size = 3) +
  labs(title = "Credit Card User Count") +
  theme(legend.position = "none") +
  scale_y_continuous(labels = label_number(scale = 1e-3, suffix = "K", big.mark = ",", accuracy = 1))

# used_pin_number (PIN number)
count_pin <- table(credit_dt$used_pin_number)

df_pin <- data.frame(transactions = c("Without PIN number", "With PIN number", "Total" ), 
                      count = c(899392, 100608, 1000000))

ggplot(data = df_pin, aes(x = transactions, y = count, fill = transactions)) +
  geom_bar(stat = "identity", width = 0.8) +
  geom_text(aes(label = paste(count, "\n", "(", round(count / 10^6 * 100, 2), "%", ")")),
            vjust = 1, color = "black", size = 3) +
  labs(title = "PIN Number User Count") +
  theme(legend.position = "none") +
  scale_y_continuous(labels = label_number(scale = 1e-3, suffix = "K", big.mark = ",", accuracy = 1))

# online_order
count_order <- table(credit_dt$online_order)

df_order <- data.frame(transactions = c("Without Online Order", "With Online Order", "Total" ), 
                      count = c(349448, 650552, 1000000))

ggplot(data = df_order, aes(x = transactions, y = count, fill = transactions)) +
  geom_bar(stat = "identity", width = 0.8) +
  geom_text(aes(label = paste(count, "\n", "(", round(count / 10^6 * 100, 2), "%", ")")),
            vjust = 1, color = "black", size = 3) +
  labs(title = "Online Order Count") +
  theme(legend.position = "none") +
  scale_y_continuous(labels = label_number(scale = 1e-3, suffix = "K", big.mark = ",", accuracy = 1))

## Distribution of each variable's values, i.e., how many transactions fall into each bucket of measurements.
# distance_from_home
## with Fraud
hist(credit_dt$distance_from_home[which(credit_dt$fraud == 1)], col = "green")
rug(credit_dt$distance_from_home[which(credit_dt$fraud == 1)])
abline(v = median(credit_dt$distance_from_home[which(credit_dt$fraud == 1)]), col = "magenta", lwd = 4)

## without Fraud
hist(credit_dt$distance_from_home[which(credit_dt$fraud == 0)], col = "light green")
rug(credit_dt$distance_from_home[which(credit_dt$fraud == 0)])
abline(v = median(credit_dt$distance_from_home[which(credit_dt$fraud == 0)]), col = "magenta", lwd = 4)

# distance_from_last_transaction
hist(credit_dt$distance_from_last_transaction[which(credit_dt$fraud == 1)], col = "blue")
rug(credit_dt$distance_from_last_transaction[which(credit_dt$fraud == 1)])
abline(v = median(credit_dt$distance_from_last_transaction[which(credit_dt$fraud == 1)]), col = "magenta", lwd = 4)

hist(credit_dt$distance_from_last_transaction[which(credit_dt$fraud == 0)], col = "light blue")
rug(credit_dt$distance_from_last_transaction[which(credit_dt$fraud == 0)])
abline(v = median(credit_dt$distance_from_last_transaction[which(credit_dt$fraud == 0)]), col = "magenta", lwd = 4)

# ratio_to_median_purchase_price
hist(credit_dt$ratio_to_median_purchase_price[which(credit_dt$fraud == 1)], col = "maroon")
rug(credit_dt$ratio_to_median_purchase_price[which(credit_dt$fraud == 1)])
abline(v = median(credit_dt$ratio_to_median_purchase_price[which(credit_dt$fraud == 1)]), col = "magenta", lwd = 4)

hist(credit_dt$ratio_to_median_purchase_price[which(credit_dt$fraud == 0)], col = "purple")
rug(credit_dt$ratio_to_median_purchase_price[which(credit_dt$fraud == 0)])
abline(v = median(credit_dt$ratio_to_median_purchase_price[which(credit_dt$fraud == 0)]), col = "magenta", lwd = 4)

Data Modeling:

library(tidymodels)
## Split the data into training and test sets
set.seed(2023)
credit_split <- initial_split(credit_dt,  prop = 3/4)
credit_train <- training(credit_split) # training set
credit_test <- testing(credit_split) # test set
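With prop = 3/4 on the 1,000,000 transactions, the training set holds 750,000 rows and the test set 250,000 (the dim() check further below confirms the latter):

# confirm the split sizes (750,000 training rows, 250,000 test rows)
nrow(credit_train)
nrow(credit_test)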

Logistic Regression Model

library(caret)
library(pROC)
library(ROCR)

credit_train$fraud <- factor(credit_train$fraud)
credit_test$fraud <- factor(credit_test$fraud)

# The receiver operating characteristic (ROC) curve is commonly used to examine the tradeoff between detecting true positives and avoiding false positives.
credit_glm <- glm(fraud ~ distance_from_home + distance_from_last_transaction + ratio_to_median_purchase_price + repeat_retailer + used_chip + used_pin_number + online_order, family = "binomial", data = credit_train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
credit.probs <- predict(credit_glm, credit_test, type = "response")

pr <- prediction(credit.probs, credit_test$fraud)

# plotting ROC curve
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)

auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.9668457
# Class distribution in the test set; the fraud rate is also the error rate of a trivial "always non-fraud" model
table(credit_test$fraud) / nrow(credit_test)
## 
##        0        1 
## 0.912112 0.087888

What does AUC mean? The Area Under the Curve (AUC) is the measure of the ability of a classifier to distinguish between classes and is used as a summary of the ROC curve. The higher the AUC, the better the performance of the model at distinguishing between the positive and negative classes.

The table above shows the class distribution in the test set: about 8.8% of test transactions are fraudulent. A trivial model that labels every transaction as non-fraud would therefore misclassify about 8.8% of them, which is the baseline error rate any classifier should beat.

Calculate the cost per transaction. Assume that a missed fraud costs about $4 per transaction (the cost of handling a fraudulent transaction that was predicted as non-fraud, i.e., a false negative), while handling a caught fraud is about four times cheaper, roughly $1 per transaction (a fraudulent transaction predicted as fraud, i.e., a true positive). Flagging a legitimate transaction costs the same $1 to investigate, and correctly ignoring a legitimate transaction costs nothing:

FN (predicted non-fraud, actually fraud): $4
TP (predicted fraud, actually fraud): $1
FP (predicted fraud, actually non-fraud): $1
TN (predicted non-fraud, actually non-fraud): $0

Cost = FN($4) + TP($1) + FP($1) + TN($0); a small helper capturing this bookkeeping is sketched below.
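One way to keep this bookkeeping in one place is a small helper (the function name is ours) that takes a 2x2 confusion table laid out the way confusionMatrix() returns it, rows = predicted (0, 1) and columns = actual (0, 1), and reports the expected cost per transaction under these assumptions:

# expected cost per transaction under the $4 FN / $1 TP / $1 FP / $0 TN assumption;
# m is a 2x2 table with rows = predicted (0, 1) and columns = actual (0, 1)
cost_per_txn <- function(m) {
  TN <- m[1, 1]; FP <- m[2, 1]; FN <- m[1, 2]; TP <- m[2, 2]
  (4 * FN + 1 * TP + 1 * FP + 0 * TN) / sum(m)
}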

Apply this cost evaluation to our model. We start by fitting the model, and making predictions in the form of probabilities.

## let’s create a threshold vector and a cost vector.
# threshold vector
thresh <- seq(0.1,1.0, length = 10)

#cost vector
cost = rep(0,length(thresh))

dim(credit_test) ## 250,000
## [1] 250000      8
# cost as a function of threshold
for (i in 1:length(thresh)){
  
  glm.pred = rep("No", length(credit.probs))
  glm.pred <- ifelse(credit.probs > thresh[[i]], 1, 0)
  glm.pred <- as.factor(glm.pred)
  credit_test$fraud <- as.factor(credit_test$fraud)
  
  x <- confusionMatrix(glm.pred, credit_test$fraud)
  TN <- x$table[1]/250000
  FP <- x$table[2]/250000
  FN <- x$table[3]/250000
  TP <- x$table[4]/250000
  
  cost[i] = FN*4 + TP*1 + FP*1 + TN*0
}
## Warning in confusionMatrix.default(glm.pred, credit_test$fraud): Levels are not
## in the same order for reference and data. Refactoring data to match.
##Confusion Matrix
x
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction      0      1
##          0 228028  21972
##          1      0      0
##                                          
##                Accuracy : 0.9121         
##                  95% CI : (0.911, 0.9132)
##     No Information Rate : 0.9121         
##     P-Value [Acc > NIR] : 0.5018         
##                                          
##                   Kappa : 0              
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 1.0000         
##             Specificity : 0.0000         
##          Pos Pred Value : 0.9121         
##          Neg Pred Value :    NaN         
##              Prevalence : 0.9121         
##          Detection Rate : 0.9121         
##    Detection Prevalence : 1.0000         
##       Balanced Accuracy : 0.5000         
##                                          
##        'Positive' Class : 0              
## 

Let's assume the company is currently using a "simple" model that just defaults to a threshold of 0.5. We need to fit that model, make predictions, and calculate the cost.

# simple model: assume threshold is 0.5
glm.pred = rep("No", length(credit.probs))
glm.pred <- ifelse(credit.probs > 0.5, 1, 0)
glm.pred <- as.factor(glm.pred)
credit_test$fraud <- as.factor(credit_test$fraud)

x <- confusionMatrix(glm.pred, credit_test$fraud)
TN <- x$table[1]/250000
FP <- x$table[2]/250000
FN <- x$table[3]/250000
TP <- x$table[4]/250000

cost_simple = FN*4 + TP*1 + FP*1 + TN*0
# putting results in a dataframe for plotting
dat <- data.frame(
  model = c(rep("optimized",10),"simple"),
  cost_per_transaction = c(cost, cost_simple),
  threshold = c(thresh, 0.5)
)

# plotting
ggplot(dat, aes(x = threshold, y = cost_per_transaction, group = model, colour = model)) +
  geom_line() +
  geom_point()

Looking at the results, the cost per transaction bottoms out at a threshold of about 0.2, while the "simple" model the company currently runs, with its threshold of 0.5, is noticeably more expensive; the savings calculation below puts the gap at roughly $0.04 per transaction.

If 1,000,000 transactions are switched from the existing model to the new one, the cost saving will be:

# cost savings of optimized model (threshold = 0.2) compared to baseline model (threshold = 0.5)
savings_per_transaction = cost_simple - min(cost)

total_savings = 1000000*savings_per_transaction

total_savings
## [1] 37052
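Spread over the 1,000,000 transactions, that is roughly $0.037 saved per transaction:

# implied saving per transaction (about $0.037)
savings_per_transaction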

Classification Tree Model.

library(rlang)
library(rpart.plot)

credit_tree = rpart(fraud ~ distance_from_home + distance_from_last_transaction + ratio_to_median_purchase_price + repeat_retailer + used_chip + used_pin_number + online_order, 
              data = credit_train )

rpart.plot(credit_tree, type = 2, extra = 101, leaf.round = 3, fallen.leaves = TRUE,
    varlen = 0, tweak = 1.2)

credit_tree_tst_pred = predict(credit_tree, credit_test, type = "class")
table(predicted = credit_tree_tst_pred, actual = credit_test$fraud)
##          actual
## predicted      0      1
##         0 227774    358
##         1    254  21614
# function to predict accuracy of Classification tree model
calc_acc = function(actual, predicted) {
  mean(actual == predicted)
}

# Accuracy
(tree_tst_acc = calc_acc(predicted = credit_tree_tst_pred, actual = credit_test$fraud))
## [1] 0.997552
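# cross-check against the confusion table above
(227774 + 21614) / 250000   # 0.997552, matching calc_acc()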
# Out-of-sample-error in test set
OOSE <- 1 - as.numeric(confusionMatrix(credit_test$fraud, credit_tree_tst_pred)$overall[1])
OOSE
## [1] 0.002448