Reading in the data and skimming it

credit <- 
  read_excel("credit fraud.xlsx") |> 
  mutate(Fraud = factor(Fraud,
                        levels = c("No", "Yes")))

#credit <- 
#  read_csv("Credit Fraud Full.csv") |> 
#  mutate(Fraud = factor(Fraud,
#                        levels = c("No", "Yes"))) |> 
#  dplyr::select(Fraud, Amount, PC1:PC10) |> 
#  filter(Amount < 1000)

N <- nrow(credit)

skim(credit)
Data summary
Name credit
Number of rows 2000
Number of columns 12
_______________________
Column type frequency:
factor 1
numeric 11
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
Fraud 0 1 FALSE 2 No: 1508, Yes: 492

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Amount 0 1 87.28 187.03 0.00 2.71 18.55 86.65 2125.87 ▇▁▁▁▁
PC1 0 1 1.12 2.70 -3.79 -0.61 0.40 2.03 12.11 ▃▇▂▁▁
PC2 0 1 0.17 3.53 -41.04 -0.19 0.08 0.55 20.01 ▁▁▁▇▁
PC3 0 1 -1.40 3.55 -24.59 -1.31 -0.28 0.24 11.15 ▁▁▁▇▁
PC4 0 1 0.01 1.03 -3.55 -0.69 0.03 0.70 3.31 ▁▃▇▅▁
PC5 0 1 -1.72 3.75 -19.21 -1.55 -0.25 0.35 4.00 ▁▁▁▃▇
PC6 0 1 0.09 0.90 -12.59 -0.20 -0.03 0.26 11.06 ▁▁▇▁▁
PC7 0 1 0.19 2.12 -22.80 -0.21 0.02 0.31 27.20 ▁▁▇▁▁
PC8 0 1 -0.02 0.99 -8.89 -0.56 -0.03 0.53 8.36 ▁▁▇▁▁
PC9 0 1 0.05 0.75 -7.26 -0.07 0.02 0.23 6.03 ▁▁▇▁▁
PC10 0 1 0.02 0.38 -8.23 -0.06 0.02 0.12 2.54 ▁▁▁▇▁

Exploratory analysis: Creating graphs to compare fraud vs no fraud

Box plots per variable by Fraud

First just check Amount and PC1 - PC5:

credit |> 
  select(Fraud, Amount:PC5) |> 
  pivot_longer(
    cols = Amount:PC5,
    values_to = "value",
    names_to = "variable"
  ) |>
  
  ggplot(
    mapping = aes(
      x = value, 
      fill = Fraud
    )
  ) + 
  geom_boxplot() + 
  facet_wrap(
    facets = ~ variable, 
    scales = 'free',
    ncol = 2
  ) + 
  scale_y_continuous(breaks = NULL) + 
  
  theme_test() + 
  
  scale_fill_manual(
    values = c(No = "steelblue", Yes = "tomato")
  )

Now check Amount and PC6 - PC10

credit |> 
  select(Fraud, PC6:PC10) |> 
  pivot_longer(
    cols = PC6:PC10,
    values_to = "value",
    names_to = "variable"
  ) |>
  
  ggplot(
    mapping = aes(
      x = value, 
      fill = Fraud
    )
  ) + 
  geom_boxplot() + 
  facet_wrap(
    facets = ~ variable, 
    scales = 'free',
    ncol = 2
  ) + 
  
  labs(x = NULL) +
  
  theme_test() + 
  
  theme(legend.position = c(0.8, 0.15)) + 
  
  scale_y_continuous(breaks = NULL) + 
  
  scale_fill_manual(
    values = c(No = "steelblue", Yes = "tomato")
  )

It looks like there is a difference in fraud cases for PC1, PC2, PC3, PC5, PC7, PC9, and PC10. Let’s use a Partial F-test to test for a difference in the variables, accounting for the other variables:

# Reading in the Partial F-test Function
source('Partial F Test function.R')

good_vars <- 
  Partial_F(
    Y = credit |> select(where(is.numeric)), 
    x = credit$Fraud
  ) |> 
  rownames_to_column(var = "Variables") |>
  filter(P_value < 0.01) |> 
  arrange(Variables)

good_vars
##   Variables Partial_Test F_stat P_value
## 1    Amount        0.321  49.93   0e+00
## 2       PC1        0.334 134.15   0e+00
## 3      PC10        0.315  13.56   2e-04
## 4       PC2        0.315  14.29   2e-04
## 5       PC3        0.314   9.62   2e-03
## 6       PC4        0.315  12.85   3e-04
## 7       PC5        0.380 427.99   0e+00

Now we’ll just keep the variables with a p-value less than 0.01

credit2 <- 
  credit |> 
  dplyr::select(Fraud, 
                good_vars$Variables)

LDA with Fisher’s Method “by hand”

Using Fisher’s method of Linear discriminant analysis for 2 groups, we start by converting all of the values into the singular linear discriminant:

\[z = \textbf{a}'\textbf{y} \\ \textbf{a} = (\bar{\textbf{y}}_1 - \bar{\textbf{y}}_2)' \textbf{S}_{\textrm{pl}}^{-1}\]

We need to find the mean vectors for both Fraud and non-fraud groups, \(\bar{\textbf{y}}_1, \bar{\textbf{y}}_2\), and the pooled covariance matrix,

\[\textbf{S}_{\textrm{pl}}^{-1} = \textbf{E}/(N-k)\]

where \(\textbf{E}\) is our error matrix from a MANOVA

# First the sample mean vectors: y_bar1, y_bar2
credit_means <- 
  credit2 |> 
  
  summarize(
    .by = Fraud,
    across(.cols = where(is.numeric),
                   .fns = mean)
    ) |> 
  
  column_to_rownames(var="Fraud") |> 
  
  # Transposing the results
  t() |> 
  
  data.frame() |> 
  
  mutate(diff = No - Yes) |> 
  
  round(digits = 3)

credit_means
##            Yes     No    diff
## Amount 122.211 75.879 -46.332
## PC1      4.542  0.009  -4.533
## PC10     0.076  0.000  -0.076
## PC2      0.571  0.034  -0.537
## PC3     -5.677  0.000   5.677
## PC4     -0.109  0.047   0.156
## PC5     -6.972 -0.013   6.959

Pooled covariance matrix:

Spl <- 
  manova(
    cbind(Amount,PC1,PC10,PC2,PC3,PC4,PC5) ~ Fraud, 
    data = credit2
  ) |> 
  summary() |> 
  pluck("SS") |> 
  pluck("Residuals") |> 
  divide_by(N-2)

signif(Spl, digits = 2)
##         Amount    PC1   PC10    PC2   PC3    PC4    PC5
## Amount 35000.0 -18.00 -4.500 10.000 23.00 -0.400 49.000
## PC1      -18.0   3.50 -0.110  0.470 -2.40  0.180 -1.900
## PC10      -4.5  -0.11  0.140 -0.025  0.19 -0.033 -0.051
## PC2       10.0   0.47 -0.025 12.000 -0.21  0.640 -1.700
## PC3       23.0  -2.40  0.190 -0.210  6.60 -0.150  3.000
## PC4       -0.4   0.18 -0.033  0.640 -0.15  1.100 -0.140
## PC5       49.0  -1.90 -0.051 -1.700  3.00 -0.140  5.100

Calculating the linear discriminant function:

\[\textbf{a} = (\bar{\textbf{y}}_1 - \bar{\textbf{y}}_2)' \textbf{S}_{\textrm{pl}}^{-1}\]

ldf <- t(credit_means$Yes - credit_means$No) %*% solve(Spl)
ldf
##       Amount   PC1  PC10    PC2    PC3    PC4   PC5
## [1,] 0.00349 0.679 0.901 -0.114 -0.163 -0.331 -1.09

Calculating the linear discriminant for credit fraud:

\[z = \textbf{Y}\textbf{a}\]

ld_score <- 
  # Getting our Y matrix:
  (credit2 |> 
  dplyr::select(-Fraud) |> 
  unlist() |> 
  matrix(nrow = nrow(credit2))) %*% t(ldf) # Now multiplying by a

ld_data <- 
  data.frame(
    score = ld_score,
             Fraud = credit2$Fraud
    )

gg_fraud_ld <- 
  
  ggplot(
    data = ld_data,
    mapping = aes(
      x = ld_score,
      fill = Fraud
    )
  ) + 
  
  geom_density(alpha = 0.5) + 
  
  coord_cartesian(expand = F) + 
  
  theme(legend.position = c(0.7, 0.4)) + 
  
  scale_fill_manual(
    values = c(No = "steelblue", Yes = "tomato")
  ) + 
  
  labs(x = "Linear Discriminant") 
  
  
gg_fraud_ld

Finding the cutoff assuming equal priors and misclassification costs:

\[\frac{1}{2}\textbf{a}'(\bar{\textbf{y}}_1 + \bar{\textbf{y}}_2)\]

ld_cutoff <- 0.5 * ldf %*% as.numeric(credit_means[ ,"No"] + credit_means[ ,"Yes"]) |> 
             as.numeric()

ld_cutoff
## [1] 6.16
# Adding the cutoff line to the graph
gg_fraud_ld <- 
gg_fraud_ld + 
  
  geom_vline(
    xintercept = ld_cutoff,
    linewidth = 1,
    linetype = "dashed",
    color = "orange"
  ) + 
  
  annotate(
    geom = "text",
    label = "p[yes]==p[no]", 
    x = 20, y = .30, 
    color = "orange",
    size = 5,
    parse = T
  ) 
  

gg_fraud_ld

Making a prediction assuming P(Yes) = P(No)

ld_data <- 
  ld_data |> 
  mutate(pred_equal = if_else(score > ld_cutoff, "Yes", "No"))

tibble(ld_data)
## # A tibble: 2,000 × 3
##    score Fraud pred_equal
##    <dbl> <fct> <chr>     
##  1  7.74 Yes   Yes       
##  2  5.19 Yes   No        
##  3  4.17 Yes   No        
##  4 10.9  Yes   Yes       
##  5  9.93 Yes   Yes       
##  6 17.7  Yes   Yes       
##  7 17.2  Yes   Yes       
##  8 15.6  Yes   Yes       
##  9 15.2  Yes   Yes       
## 10  9.65 Yes   Yes       
## # ℹ 1,990 more rows

The full data had only 0.2% of purchases as fraud, so our priors are p_yes = 0.002 and p_no = 0.998. If we want to adjust for priors, we need to add ln(p2/p1) to the cutoff

ld_prior_cutoff <- ld_cutoff + log(0.998/0.002)
ld_prior_cutoff
## [1] 12.4
# Adding the cutoff line to the graph
gg_fraud_ld <- 
  gg_fraud_ld + 
  
  geom_vline(
    xintercept = ld_prior_cutoff,
    linewidth = 1,
    linetype = "dashed",
    color = "orchid"
  ) + 
  
  annotate(
    geom = "text",
    label = "p[yes] == 0.002", 
    x = 20, y = .25, 
    color = "orchid",
    size = 5,
    parse = T
  ) 



gg_fraud_ld

Making a prediction assuming P(Yes) = 500*P(No)

ld_data <- 
  ld_data |> 
  mutate(pred_priors = if_else(score > ld_prior_cutoff, "Yes", "No"))

tibble(ld_data)
## # A tibble: 2,000 × 4
##    score Fraud pred_equal pred_priors
##    <dbl> <fct> <chr>      <chr>      
##  1  7.74 Yes   Yes        No         
##  2  5.19 Yes   No         No         
##  3  4.17 Yes   No         No         
##  4 10.9  Yes   Yes        No         
##  5  9.93 Yes   Yes        No         
##  6 17.7  Yes   Yes        Yes        
##  7 17.2  Yes   Yes        Yes        
##  8 15.6  Yes   Yes        Yes        
##  9 15.2  Yes   Yes        Yes        
## 10  9.65 Yes   Yes        No         
## # ℹ 1,990 more rows

Lastly, if the cost of misclassifying an actual fraudulent purchase as legitimate is 100x the cost as the reverse:

\[c_{12} = 100*c_{21}\]

we can also adjust for the misclassification cost by using \[\frac{1}{2}\textbf{a}'(\bar{\textbf{y}}_1 + \bar{\textbf{y}}_2) +\ln \left( \frac{p_1}{p_2} \right) + \ln \left( \frac{c_{12}}{c_{21}} \right)\]

ld_cost_cutoff <- ld_prior_cutoff + log(1/100)

ld_cost_cutoff
## [1] 7.77
gg_fraud_ld + 
  
  geom_vline(
    xintercept = ld_cost_cutoff,
    linewidth = 1,
    linetype = "dashed",
    color = "black"
  ) + 
  
  annotate(
    geom = "text",
    label = "c[12] == 100*c[21]", 
    x = 20, y = .20, 
    color = "black",
    size = 5,
    parse = T
  ) 

ld_data <- 
  ld_data |> 
  mutate(pred_cost = if_else(score > ld_cost_cutoff, "Yes", "No"))

tibble(ld_data)
## # A tibble: 2,000 × 5
##    score Fraud pred_equal pred_priors pred_cost
##    <dbl> <fct> <chr>      <chr>       <chr>    
##  1  7.74 Yes   Yes        No          No       
##  2  5.19 Yes   No         No          No       
##  3  4.17 Yes   No         No          No       
##  4 10.9  Yes   Yes        No          Yes      
##  5  9.93 Yes   Yes        No          Yes      
##  6 17.7  Yes   Yes        Yes         Yes      
##  7 17.2  Yes   Yes        Yes         Yes      
##  8 15.6  Yes   Yes        Yes         Yes      
##  9 15.2  Yes   Yes        Yes         Yes      
## 10  9.65 Yes   Yes        No          Yes      
## # ℹ 1,990 more rows
lda_boxplot_methods <- 
  ld_data |> 
  pivot_longer(
    cols = pred_equal:pred_cost,
    names_to = "method",
    values_to = "pred_fraud"
  ) |> 
  
  mutate(method = as_factor(method)) |> 
  
  ggplot(
    mapping = aes(
      x = score,
      fill = pred_fraud
    )
  ) + 
  
  geom_boxplot() + 
  
  scale_fill_manual(
    values = c(No = "steelblue", Yes = "tomato")
  ) + 
  
  facet_wrap(
    facets = ~ method,
    ncol = 1
  ) + 
  
  theme_bw() +
  
  theme(legend.position = "top") +
  
  labs(fill = "Predicted Fraud") +
  
  geom_vline(
    data = data.frame(cutoffs = c(ld_cutoff, ld_prior_cutoff, ld_cost_cutoff),
                      method = c("pred_equal", "pred_priors", "pred_cost") |> 
                        as_factor()), 
    mapping = aes(xintercept = cutoffs),
    linetype = "dashed") + 
  
  scale_y_continuous(breaks = NULL) 



lda_boxplot_methods

LDA: Linear Discriminant Classification Using the lda() Function

The lda() function in the MASS package creates the linear classification rules. The arguments are:

  1. formula = Grouping Variable ~ Numeric Variables
  2. data = the data set
  3. prior = a vector of prior probabilities
fraud_lda <- 
  MASS::lda(
    Fraud ~ .,
    data = credit2, 
    prior = c(0.5,0.5)
  )

fraud_lda
## Call:
## lda(Fraud ~ ., data = credit2, prior = c(0.5, 0.5))
## 
## Prior probabilities of groups:
##  No Yes 
## 0.5 0.5 
## 
## Group means:
##     Amount     PC1     PC10    PC2       PC3    PC4     PC5
## No    75.9 0.00921 7.61e-05 0.0338 -0.000261  0.047 -0.0127
## Yes  122.2 4.54203 7.57e-02 0.5706 -5.676883 -0.109 -6.9717
## 
## Coefficients of linear discriminants:
##             LD1
## Amount  0.00102
## PC1     0.19766
## PC10    0.26143
## PC2    -0.03329
## PC3    -0.04749
## PC4    -0.09652
## PC5    -0.31678

Next, creating a dataset to store the lda results with the true status of the transaction.

credit_ld <- 
  data.frame(
    Fraud = credit$Fraud,
    ld_fraud = predict(fraud_lda)$class,
    LDA = as.numeric(predict(fraud_lda)$x)
  )

head(credit_ld)
##   Fraud ld_fraud    LDA
## 1   Yes      Yes  0.460
## 2   Yes       No -0.283
## 3   Yes       No -0.581
## 4   Yes      Yes  1.381
## 5   Yes      Yes  1.098
## 6   Yes      Yes  3.366

Plotting the results and the cutoff: \[\frac{\bar{z}_1 + \bar{z}_2}{2}\]

credit_ld |> 
  ggplot(mapping = aes(x = LDA)) + 
  
  geom_density(
    mapping = aes(fill = Fraud), 
    color = "black",
    alpha = 0.5
  ) + 
  
  geom_vline(
    xintercept = 0, 
    linetype = "dashed"
  ) + 
  
  coord_cartesian(expand = F) + 
  
  theme(
    legend.position = c(0.7, 0.4)
  ) + 
  
  scale_fill_manual(
    values = c(No = "blue", Yes = "tomato")
  )

Creating the confusion matrix for apparent error rate

The confusionMatrix() function in the caret package will create a confusion matrix along with a few helpful probabilities. The arguments are:

  1. data = a vector of the predicted class
  2. reference = the true values of the class being predicted
  3. positive = the “success” group (if our response variable is binary)
# confusionMatrix function in caret package
cm_lda <- 
  confusionMatrix(
    data = predict(fraud_lda) |> pluck("class"),
    reference = credit2$Fraud,
    positive = "Yes",
    prevalence = 0.5
    )


cm_lda
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  1501   91
##        Yes    7  401
##                                        
##                Accuracy : 0.951        
##                  95% CI : (0.941, 0.96)
##     No Information Rate : 0.754        
##     P-Value [Acc > NIR] : <2e-16       
##                                        
##                   Kappa : 0.86         
##                                        
##  Mcnemar's Test P-Value : <2e-16       
##                                        
##             Sensitivity : 0.815        
##             Specificity : 0.995        
##          Pos Pred Value : 0.994        
##          Neg Pred Value : 0.843        
##              Prevalence : 0.500        
##          Detection Rate : 0.201        
##    Detection Prevalence : 0.204        
##       Balanced Accuracy : 0.905        
##                                        
##        'Positive' Class : Yes          
## 

LDA: Linear Discriminant Classification: With Priors

With MASS::lda(), we can set the priors using prior = c(p1, p2). For the full data, the probability a transaction was fraud was 0.002 (0.2%)

fraud_prior <- c(0.998, 0.002)

fraud_ldp <- 
  MASS::lda(
    Fraud ~ .,
    data = credit2, 
    prior = fraud_prior
  )


### Creating the confusion matrix for apparent error rate after adjusting
### for priors. Need to add prevalence = P(Yes)=0.002
cm_ldp <- 
  confusionMatrix(
    data = predict(fraud_ldp) |> pluck("class"), 
    reference = credit2$Fraud, 
    prevalence = 0.002,
    positive = "Yes"
  )

cm_ldp
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  1508  266
##        Yes    0  226
##                                         
##                Accuracy : 0.867         
##                  95% CI : (0.851, 0.882)
##     No Information Rate : 0.754         
##     P-Value [Acc > NIR] : <2e-16        
##                                         
##                   Kappa : 0.562         
##                                         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.459         
##             Specificity : 1.000         
##          Pos Pred Value : 1.000         
##          Neg Pred Value : 0.999         
##              Prevalence : 0.002         
##          Detection Rate : 0.113         
##    Detection Prevalence : 0.113         
##       Balanced Accuracy : 0.730         
##                                         
##        'Positive' Class : Yes           
## 

LDA: Linear Discriminant Classification: Misclass costs

So far we’ve treated the cost of misclassifying a purchase as equal. A bank is likely going to consider a fraudulent purchase as much more costly and denying a customer a legit purchase. Let’s assume that the ratio is 100x higher for misclassifying a fraudulent purchase as legit than vice versa.

We can adjust the priors by increasing fraud prior by the misclassification cost by adjusting the priors:

cost_prior <- c(0.998, 0.002*100)/sum(0.998, 0.002*100)


fraud_ldc <- 
  MASS::lda(
    Fraud ~ ., 
    data = credit2, 
    prior = cost_prior
  )

### Creating the confusion matrix for apparent error rate after adjusting
### for priors and misclass costs:
cm_ldc <- 
  confusionMatrix(
    data = predict(fraud_ldc) |> pluck("class"), 
    reference = credit2$Fraud, 
    prevalence = 0.002,
    positive = "Yes"
  )

cm_ldc
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  1508  139
##        Yes    0  353
##                                         
##                Accuracy : 0.93          
##                  95% CI : (0.918, 0.941)
##     No Information Rate : 0.754         
##     P-Value [Acc > NIR] : <2e-16        
##                                         
##                   Kappa : 0.793         
##                                         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.717         
##             Specificity : 1.000         
##          Pos Pred Value : 1.000         
##          Neg Pred Value : 0.999         
##              Prevalence : 0.002         
##          Detection Rate : 0.176         
##    Detection Prevalence : 0.176         
##       Balanced Accuracy : 0.859         
##                                         
##        'Positive' Class : Yes           
## 
bind_cols(
  #.id = "method",
  "measure" = names(cm_lda$byClass),
  "lda" = cm_lda$byClass,
  "priors" = cm_ldp$byClass,
  "cost" = cm_ldc$byClass
) |> 
  mutate(
    across(
      .cols = where(is.numeric),
      .fns = round,
      digits = 3
    )
  )
## # A tibble: 11 × 4
##    measure                lda priors  cost
##    <chr>                <dbl>  <dbl> <dbl>
##  1 Sensitivity          0.815  0.459 0.717
##  2 Specificity          0.995  1     1    
##  3 Pos Pred Value       0.994  1     1    
##  4 Neg Pred Value       0.843  0.999 0.999
##  5 Precision            0.983  1     1    
##  6 Recall               0.815  0.459 0.717
##  7 F1                   0.891  0.63  0.836
##  8 Prevalence           0.5    0.002 0.002
##  9 Detection Rate       0.2    0.113 0.176
## 10 Detection Prevalence 0.204  0.113 0.176
## 11 Balanced Accuracy    0.905  0.73  0.859

Let’s see how the predictions differ using equal priors, priors, and misclassification costs:

ld_predictions <- 
  left_join(
    x = cm_lda$table |> 
        data.frame() |> 
        rename(LDA = Freq),
  
    y = cm_ldp$table |> 
        data.frame() |> 
        rename(priors = Freq)) |> 
  
    left_join(
      y = cm_ldc$table |> 
          data.frame() |> 
          rename(cost = Freq)) |> 
  
  rename(Actual = Reference)

ld_predictions
##   Prediction Actual  LDA priors cost
## 1         No     No 1501   1508 1508
## 2        Yes     No    7      0    0
## 3         No    Yes   91    266  139
## 4        Yes    Yes  401    226  353

Now let’s compare the misclassification costs of predictions using priors vs using priors and misclass costs:

Correct -> Cost = 0

Predicting Fraudulent purchase as Legitimate -> Cost = 100

Predicting a Legitimate purchase as Fraudulent -> Cost = 1

ld_predictions |> 
  mutate(
    misclass_cost = case_when(
      Prediction == Actual ~ 0,   # No cost when correct
      Prediction == "No"  & Actual == "Yes" ~ 100,  # Misclassifying an actual fraud purchase as legitimate
      Prediction == "Yes" & Actual == "No"  ~ 1,    # Misclassifying a legitimate purchase as fraudulent
    )
  ) |> 
  
  summarize(
    priors_cost = mean(priors*misclass_cost)/sum(LDA),
    cost_cost = mean(cost*misclass_cost)/sum(LDA)
  )
##   priors_cost cost_cost
## 1        3.33      1.74