# Packages used in this analysis
library(readxl)     # read_excel()
library(tidyverse)  # dplyr, tidyr, ggplot2, purrr, tibble
library(skimr)      # skim()
library(magrittr)   # divide_by()
library(caret)      # confusionMatrix()

credit <-
  read_excel("credit fraud.xlsx") |>
  mutate(Fraud = factor(Fraud,
                        levels = c("No", "Yes")))
#credit <-
# read_csv("Credit Fraud Full.csv") |>
# mutate(Fraud = factor(Fraud,
# levels = c("No", "Yes"))) |>
# dplyr::select(Fraud, Amount, PC1:PC10) |>
# filter(Amount < 1000)
N <- nrow(credit)
skim(credit)
Name | credit
Number of rows | 2000
Number of columns | 12
Column type frequency: | factor 1, numeric 11
Group variables | None
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
Fraud | 0 | 1 | FALSE | 2 | No: 1508, Yes: 492 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
Amount | 0 | 1 | 87.28 | 187.03 | 0.00 | 2.71 | 18.55 | 86.65 | 2125.87 | ▇▁▁▁▁ |
PC1 | 0 | 1 | 1.12 | 2.70 | -3.79 | -0.61 | 0.40 | 2.03 | 12.11 | ▃▇▂▁▁ |
PC2 | 0 | 1 | 0.17 | 3.53 | -41.04 | -0.19 | 0.08 | 0.55 | 20.01 | ▁▁▁▇▁ |
PC3 | 0 | 1 | -1.40 | 3.55 | -24.59 | -1.31 | -0.28 | 0.24 | 11.15 | ▁▁▁▇▁ |
PC4 | 0 | 1 | 0.01 | 1.03 | -3.55 | -0.69 | 0.03 | 0.70 | 3.31 | ▁▃▇▅▁ |
PC5 | 0 | 1 | -1.72 | 3.75 | -19.21 | -1.55 | -0.25 | 0.35 | 4.00 | ▁▁▁▃▇ |
PC6 | 0 | 1 | 0.09 | 0.90 | -12.59 | -0.20 | -0.03 | 0.26 | 11.06 | ▁▁▇▁▁ |
PC7 | 0 | 1 | 0.19 | 2.12 | -22.80 | -0.21 | 0.02 | 0.31 | 27.20 | ▁▁▇▁▁ |
PC8 | 0 | 1 | -0.02 | 0.99 | -8.89 | -0.56 | -0.03 | 0.53 | 8.36 | ▁▁▇▁▁ |
PC9 | 0 | 1 | 0.05 | 0.75 | -7.26 | -0.07 | 0.02 | 0.23 | 6.03 | ▁▁▇▁▁ |
PC10 | 0 | 1 | 0.02 | 0.38 | -8.23 | -0.06 | 0.02 | 0.12 | 2.54 | ▁▁▁▇▁ |
First, check Amount and PC1–PC5:
credit |>
select(Fraud, Amount:PC5) |>
pivot_longer(
cols = Amount:PC5,
values_to = "value",
names_to = "variable"
) |>
ggplot(
mapping = aes(
x = value,
fill = Fraud
)
) +
geom_boxplot() +
facet_wrap(
facets = ~ variable,
scales = 'free',
ncol = 2
) +
scale_y_continuous(breaks = NULL) +
theme_test() +
scale_fill_manual(
values = c(No = "steelblue", Yes = "tomato")
)
Now check PC6–PC10:
credit |>
select(Fraud, PC6:PC10) |>
pivot_longer(
cols = PC6:PC10,
values_to = "value",
names_to = "variable"
) |>
ggplot(
mapping = aes(
x = value,
fill = Fraud
)
) +
geom_boxplot() +
facet_wrap(
facets = ~ variable,
scales = 'free',
ncol = 2
) +
labs(x = NULL) +
theme_test() +
theme(legend.position = c(0.8, 0.15)) +
scale_y_continuous(breaks = NULL) +
scale_fill_manual(
values = c(No = "steelblue", Yes = "tomato")
)
It looks like the distributions differ between fraud and non-fraud cases for PC1, PC2, PC3, PC5, PC7, PC9, and PC10. Let’s use a partial F-test to test for a difference in each variable, accounting for the other variables:
# Reading in the Partial F-test Function
source('Partial F Test function.R')
good_vars <-
Partial_F(
Y = credit |> select(where(is.numeric)),
x = credit$Fraud
) |>
rownames_to_column(var = "Variables") |>
filter(P_value < 0.01) |>
arrange(Variables)
good_vars
## Variables Partial_Test F_stat P_value
## 1 Amount 0.321 49.93 0e+00
## 2 PC1 0.334 134.15 0e+00
## 3 PC10 0.315 13.56 2e-04
## 4 PC2 0.315 14.29 2e-04
## 5 PC3 0.314 9.62 2e-03
## 6 PC4 0.315 12.85 3e-04
## 7 PC5 0.380 427.99 0e+00
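The sourced Partial_F() function isn’t shown here, but the statistic behind it is the standard partial Wilks’ lambda comparison (e.g., Rencher): refit the MANOVA without variable \(y_j\) and compare it to the full model. As a sketch, with \(k\) groups and \(p\) candidate variables (here \(k = 2\), \(p = 11\)):
\[\Lambda_j = \frac{\Lambda_{\text{full}}}{\Lambda_{-j}}, \qquad F_j = \frac{1 - \Lambda_j}{\Lambda_j} \cdot \frac{N - k - p + 1}{k - 1}\]
which is compared to an F distribution with \(k-1\) and \(N-k-p+1\) degrees of freedom. The Partial_Test column above appears to be \(\Lambda_{-j}\), the lambda for the model with that variable removed.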
Now we’ll keep only the variables with a p-value less than 0.01:
credit2 <-
  credit |>
  dplyr::select(Fraud,
                all_of(good_vars$Variables))
Using Fisher’s method of linear discriminant analysis for two groups, we start by projecting each observation vector onto the single linear discriminant:
\[z = \textbf{a}'\textbf{y}, \qquad \textbf{a}' = (\bar{\textbf{y}}_1 - \bar{\textbf{y}}_2)' \textbf{S}_{\textrm{pl}}^{-1}\]
We need to find the mean vectors for the fraud and non-fraud groups, \(\bar{\textbf{y}}_1, \bar{\textbf{y}}_2\), and the pooled covariance matrix,
\[\textbf{S}_{\textrm{pl}} = \textbf{E}/(N-k)\]
where \(\textbf{E}\) is the error (within-groups) SSCP matrix from a MANOVA.
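Equivalently, with two groups the pooled covariance matrix is the sample-size-weighted average of the two group covariance matrices:
\[\textbf{S}_{\textrm{pl}} = \frac{(n_1 - 1)\textbf{S}_1 + (n_2 - 1)\textbf{S}_2}{N - 2}\]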
# First the sample mean vectors: y_bar1, y_bar2
credit_means <-
credit2 |>
summarize(
.by = Fraud,
across(.cols = where(is.numeric),
.fns = mean)
) |>
column_to_rownames(var="Fraud") |>
# Transposing the results
t() |>
data.frame() |>
mutate(diff = No - Yes) |>
round(digits = 3)
credit_means
## Yes No diff
## Amount 122.211 75.879 -46.332
## PC1 4.542 0.009 -4.533
## PC10 0.076 0.000 -0.076
## PC2 0.571 0.034 -0.537
## PC3 -5.677 0.000 5.677
## PC4 -0.109 0.047 0.156
## PC5 -6.972 -0.013 6.959
Pooled covariance matrix:
Spl <-
manova(
cbind(Amount,PC1,PC10,PC2,PC3,PC4,PC5) ~ Fraud,
data = credit2
) |>
summary() |>
pluck("SS") |>
pluck("Residuals") |>
divide_by(N-2)
signif(Spl, digits = 2)
## Amount PC1 PC10 PC2 PC3 PC4 PC5
## Amount 35000.0 -18.00 -4.500 10.000 23.00 -0.400 49.000
## PC1 -18.0 3.50 -0.110 0.470 -2.40 0.180 -1.900
## PC10 -4.5 -0.11 0.140 -0.025 0.19 -0.033 -0.051
## PC2 10.0 0.47 -0.025 12.000 -0.21 0.640 -1.700
## PC3 23.0 -2.40 0.190 -0.210 6.60 -0.150 3.000
## PC4 -0.4 0.18 -0.033 0.640 -0.15 1.100 -0.140
## PC5 49.0 -1.90 -0.051 -1.700 3.00 -0.140 5.100
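As a sanity check (a sketch, not part of the original workflow), we can pool the two within-group covariance matrices directly; this should reproduce Spl up to floating-point error:
# Pooling by hand: ((n1 - 1)S1 + (n2 - 1)S2) / (N - 2)
Spl_check <-
  credit2 |>
  group_split(Fraud) |>
  map(\(d) (nrow(d) - 1) * cov(dplyr::select(d, -Fraud))) |>
  reduce(`+`) |>
  divide_by(N - 2)
all.equal(Spl, Spl_check, check.attributes = FALSE)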
Calculating the linear discriminant function (taking group 1 = Yes, to match the code below):
\[\textbf{a}' = (\bar{\textbf{y}}_1 - \bar{\textbf{y}}_2)' \textbf{S}_{\textrm{pl}}^{-1}\]
ldf <- t(credit_means$Yes - credit_means$No) %*% solve(Spl)
ldf
## Amount PC1 PC10 PC2 PC3 PC4 PC5
## [1,] 0.00349 0.679 0.901 -0.114 -0.163 -0.331 -1.09
Calculating the linear discriminant scores for the credit data:
\[\textbf{z} = \textbf{Y}\textbf{a}\]
# Building the Y matrix, then multiplying by a: z = Ya
Y <-
  credit2 |>
  dplyr::select(-Fraud) |>
  as.matrix()
ld_score <- Y %*% t(ldf)
ld_data <-
data.frame(
score = ld_score,
Fraud = credit2$Fraud
)
gg_fraud_ld <-
ggplot(
data = ld_data,
mapping = aes(
x = score,
fill = Fraud
)
) +
geom_density(alpha = 0.5) +
coord_cartesian(expand = FALSE) +
theme(legend.position = c(0.7, 0.4)) +
scale_fill_manual(
values = c(No = "steelblue", Yes = "tomato")
) +
labs(x = "Linear Discriminant")
gg_fraud_ld
Finding the cutoff assuming equal priors and equal misclassification costs:
\[\frac{1}{2}\textbf{a}'(\bar{\textbf{y}}_1 + \bar{\textbf{y}}_2)\]
ld_cutoff <-
  (0.5 * ldf %*% as.numeric(credit_means[, "No"] + credit_means[, "Yes"])) |>
  as.numeric()
ld_cutoff
## [1] 6.16
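Since \(z\) is linear in \(\textbf{y}\), this cutoff is also the midpoint of the two group means of the scores. A quick check, which should agree with ld_cutoff up to the rounding applied to credit_means:
ld_data |>
  summarize(.by = Fraud,
            z_bar = mean(score)) |>
  summarize(midpoint = mean(z_bar))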
# Adding the cutoff line to the graph
gg_fraud_ld <-
gg_fraud_ld +
geom_vline(
xintercept = ld_cutoff,
linewidth = 1,
linetype = "dashed",
color = "orange"
) +
annotate(
geom = "text",
label = "p[yes]==p[no]",
x = 20, y = .30,
color = "orange",
size = 5,
parse = TRUE
)
gg_fraud_ld
Making a prediction assuming P(Yes) = P(No):
ld_data <-
ld_data |>
mutate(pred_equal = if_else(score > ld_cutoff, "Yes", "No"))
tibble(ld_data)
## # A tibble: 2,000 × 3
## score Fraud pred_equal
## <dbl> <fct> <chr>
## 1 7.74 Yes Yes
## 2 5.19 Yes No
## 3 4.17 Yes No
## 4 10.9 Yes Yes
## 5 9.93 Yes Yes
## 6 17.7 Yes Yes
## 7 17.2 Yes Yes
## 8 15.6 Yes Yes
## 9 15.2 Yes Yes
## 10 9.65 Yes Yes
## # ℹ 1,990 more rows
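A quick cross-tabulation of the equal-priors predictions against the true class (just counts; the formal confusion matrices come later with caret):
ld_data |>
  count(Fraud, pred_equal)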
The full data had only 0.2% of purchases flagged as fraud, so our priors are \(p_{yes} = 0.002\) and \(p_{no} = 0.998\). To adjust for the priors, we add \(\ln(p_{no}/p_{yes})\) to the cutoff:
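Numerically, \(\ln(0.998/0.002) = \ln(499) \approx 6.21\), so the cutoff moves from about 6.16 to about \(6.16 + 6.21 = 12.37\):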
ld_prior_cutoff <- ld_cutoff + log(0.998/0.002)
ld_prior_cutoff
## [1] 12.4
# Adding the cutoff line to the graph
gg_fraud_ld <-
gg_fraud_ld +
geom_vline(
xintercept = ld_prior_cutoff,
linewidth = 1,
linetype = "dashed",
color = "orchid"
) +
annotate(
geom = "text",
label = "p[yes] == 0.002",
x = 20, y = .25,
color = "orchid",
size = 5,
parse = TRUE
)
gg_fraud_ld
Making a prediction using the unequal priors, \(p_{no} = 0.998 = 499 \cdot p_{yes}\):
ld_data <-
ld_data |>
mutate(pred_priors = if_else(score > ld_prior_cutoff, "Yes", "No"))
tibble(ld_data)
## # A tibble: 2,000 × 4
## score Fraud pred_equal pred_priors
## <dbl> <fct> <chr> <chr>
## 1 7.74 Yes Yes No
## 2 5.19 Yes No No
## 3 4.17 Yes No No
## 4 10.9 Yes Yes No
## 5 9.93 Yes Yes No
## 6 17.7 Yes Yes Yes
## 7 17.2 Yes Yes Yes
## 8 15.6 Yes Yes Yes
## 9 15.2 Yes Yes Yes
## 10 9.65 Yes Yes No
## # ℹ 1,990 more rows
Lastly, if the cost of misclassifying an actual fraudulent purchase as legitimate, \(c_{12}\), is 100 times the cost of the reverse, \(c_{21}\):
\[c_{12} = 100 \cdot c_{21}\]
we can also adjust for the misclassification costs by using
\[\frac{1}{2}\textbf{a}'(\bar{\textbf{y}}_1 + \bar{\textbf{y}}_2) + \ln \left( \frac{p_2}{p_1} \right) + \ln \left( \frac{c_{21}}{c_{12}} \right)\]
With group 1 = Yes, the added cost term here is \(\ln(1/100)\):
ld_cost_cutoff <- ld_prior_cutoff + log(1/100)
ld_cost_cutoff
## [1] 7.77
gg_fraud_ld +
geom_vline(
xintercept = ld_cost_cutoff,
linewidth = 1,
linetype = "dashed",
color = "black"
) +
annotate(
geom = "text",
label = "c[12] == 100*c[21]",
x = 20, y = .20,
color = "black",
size = 5,
parse = TRUE
)
ld_data <-
ld_data |>
mutate(pred_cost = if_else(score > ld_cost_cutoff, "Yes", "No"))
tibble(ld_data)
## # A tibble: 2,000 × 5
## score Fraud pred_equal pred_priors pred_cost
## <dbl> <fct> <chr> <chr> <chr>
## 1 7.74 Yes Yes No No
## 2 5.19 Yes No No No
## 3 4.17 Yes No No No
## 4 10.9 Yes Yes No Yes
## 5 9.93 Yes Yes No Yes
## 6 17.7 Yes Yes Yes Yes
## 7 17.2 Yes Yes Yes Yes
## 8 15.6 Yes Yes Yes Yes
## 9 15.2 Yes Yes Yes Yes
## 10 9.65 Yes Yes No Yes
## # ℹ 1,990 more rows
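Before plotting, a compact comparison of the apparent accuracy of the three rules (apparent accuracy is optimistic because it reuses the training data):
ld_data |>
  summarize(
    across(
      .cols = starts_with("pred_"),
      .fns = \(p) mean(p == Fraud)
    )
  )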
lda_boxplot_methods <-
ld_data |>
pivot_longer(
cols = pred_equal:pred_cost,
names_to = "method",
values_to = "pred_fraud"
) |>
mutate(method = as_factor(method)) |>
ggplot(
mapping = aes(
x = score,
fill = pred_fraud
)
) +
geom_boxplot() +
scale_fill_manual(
values = c(No = "steelblue", Yes = "tomato")
) +
facet_wrap(
facets = ~ method,
ncol = 1
) +
theme_bw() +
theme(legend.position = "top") +
labs(fill = "Predicted Fraud") +
geom_vline(
data = data.frame(cutoffs = c(ld_cutoff, ld_prior_cutoff, ld_cost_cutoff),
method = c("pred_equal", "pred_priors", "pred_cost") |>
as_factor()),
mapping = aes(xintercept = cutoffs),
linetype = "dashed") +
scale_y_continuous(breaks = NULL)
lda_boxplot_methods
The lda() Function

The lda() function in the MASS package creates the linear classification rules. The arguments are:

- formula = grouping variable ~ numeric variables
- data = the data set
- prior = a vector of prior probabilities

fraud_lda <-
MASS::lda(
Fraud ~ .,
data = credit2,
prior = c(0.5,0.5)
)
fraud_lda
## Call:
## lda(Fraud ~ ., data = credit2, prior = c(0.5, 0.5))
##
## Prior probabilities of groups:
## No Yes
## 0.5 0.5
##
## Group means:
## Amount PC1 PC10 PC2 PC3 PC4 PC5
## No 75.9 0.00921 7.61e-05 0.0338 -0.000261 0.047 -0.0127
## Yes 122.2 4.54203 7.57e-02 0.5706 -5.676883 -0.109 -6.9717
##
## Coefficients of linear discriminants:
## LD1
## Amount 0.00102
## PC1 0.19766
## PC10 0.26143
## PC2 -0.03329
## PC3 -0.04749
## PC4 -0.09652
## PC5 -0.31678
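MASS scales its discriminant so that the pooled within-group variance of the scores is 1, so its coefficients should only match our hand-computed \(\textbf{a}\) up to a constant multiple (and possibly a sign flip). A quick check that the element-wise ratios are roughly constant:
as.numeric(fraud_lda$scaling) / as.numeric(ldf)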
Next, creating a dataset to store the LDA results along with the true status of each transaction:
credit_ld <-
data.frame(
Fraud = credit$Fraud,
ld_fraud = predict(fraud_lda)$class,
LDA = as.numeric(predict(fraud_lda)$x)
)
head(credit_ld)
## Fraud ld_fraud LDA
## 1 Yes Yes 0.460
## 2 Yes No -0.283
## 3 Yes No -0.581
## 4 Yes Yes 1.381
## 5 Yes Yes 1.098
## 6 Yes Yes 3.366
Plotting the results and the cutoff: \[\frac{\bar{z}_1 + \bar{z}_2}{2}\] Since predict() centers the scores at the prior-weighted grand mean, with equal priors this cutoff is 0:
credit_ld |>
ggplot(mapping = aes(x = LDA)) +
geom_density(
mapping = aes(fill = Fraud),
color = "black",
alpha = 0.5
) +
geom_vline(
xintercept = 0,
linetype = "dashed"
) +
coord_cartesian(expand = F) +
theme(
legend.position = c(0.7, 0.4)
) +
scale_fill_manual(
values = c(No = "blue", Yes = "tomato")
)
The confusionMatrix() function in the caret package will create a confusion matrix along with a few helpful probabilities. The arguments are:

- data = a vector of the predicted classes
- reference = the true values of the class being predicted
- positive = the “success” group (if our response variable is binary)

# confusionMatrix function in caret package
cm_lda <-
confusionMatrix(
data = predict(fraud_lda) |> pluck("class"),
reference = credit2$Fraud,
positive = "Yes",
prevalence = 0.5
)
cm_lda
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1501 91
## Yes 7 401
##
## Accuracy : 0.951
## 95% CI : (0.941, 0.96)
## No Information Rate : 0.754
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.86
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.815
## Specificity : 0.995
## Pos Pred Value : 0.994
## Neg Pred Value : 0.843
## Prevalence : 0.500
## Detection Rate : 0.201
## Detection Prevalence : 0.204
## Balanced Accuracy : 0.905
##
## 'Positive' Class : Yes
##
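From the table, the apparent error rate is the complement of the accuracy:
# (false negatives + false positives) / N
(91 + 7) / N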
With MASS::lda(), we can set the priors using prior = c(p1, p2). For the full data, the probability that a transaction was fraudulent was 0.002 (0.2%):
fraud_prior <- c(0.998, 0.002)
fraud_ldp <-
MASS::lda(
Fraud ~ .,
data = credit2,
prior = fraud_prior
)
### Creating the confusion matrix for apparent error rate after adjusting
### for priors. Need to add prevalence = P(Yes)=0.002
cm_ldp <-
confusionMatrix(
data = predict(fraud_ldp) |> pluck("class"),
reference = credit2$Fraud,
prevalence = 0.002,
positive = "Yes"
)
cm_ldp
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1508 266
## Yes 0 226
##
## Accuracy : 0.867
## 95% CI : (0.851, 0.882)
## No Information Rate : 0.754
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.562
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.459
## Specificity : 1.000
## Pos Pred Value : 1.000
## Neg Pred Value : 0.999
## Prevalence : 0.002
## Detection Rate : 0.113
## Detection Prevalence : 0.113
## Balanced Accuracy : 0.730
##
## 'Positive' Class : Yes
##
So far we’ve treated the costs of misclassifying a purchase as equal. A bank will likely consider missing a fraudulent purchase much more costly than denying a customer a legitimate purchase. Let’s assume the cost of misclassifying a fraudulent purchase as legitimate is 100 times higher than the reverse.
We can account for the misclassification costs by inflating the fraud prior by the cost ratio and then renormalizing so the priors sum to 1:
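With \(c_{12} = 100 \, c_{21}\), the adjusted priors work out to
\[p^*_{no} = \frac{0.998}{0.998 + 0.2} \approx 0.833, \qquad p^*_{yes} = \frac{0.002 \times 100}{0.998 + 0.2} \approx 0.167\]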
cost_prior <- c(0.998, 0.002*100)/sum(0.998, 0.002*100)
fraud_ldc <-
MASS::lda(
Fraud ~ .,
data = credit2,
prior = cost_prior
)
### Creating the confusion matrix for apparent error rate after adjusting
### for priors and misclass costs:
cm_ldc <-
confusionMatrix(
data = predict(fraud_ldc) |> pluck("class"),
reference = credit2$Fraud,
prevalence = 0.002,
positive = "Yes"
)
cm_ldc
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1508 139
## Yes 0 353
##
## Accuracy : 0.93
## 95% CI : (0.918, 0.941)
## No Information Rate : 0.754
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.793
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.717
## Specificity : 1.000
## Pos Pred Value : 1.000
## Neg Pred Value : 0.999
## Prevalence : 0.002
## Detection Rate : 0.176
## Detection Prevalence : 0.176
## Balanced Accuracy : 0.859
##
## 'Positive' Class : Yes
##
bind_cols(
#.id = "method",
"measure" = names(cm_lda$byClass),
"lda" = cm_lda$byClass,
"priors" = cm_ldp$byClass,
"cost" = cm_ldc$byClass
) |>
mutate(
  across(
    .cols = where(is.numeric),
    .fns = \(x) round(x, digits = 3)
  )
)
## # A tibble: 11 × 4
## measure lda priors cost
## <chr> <dbl> <dbl> <dbl>
## 1 Sensitivity 0.815 0.459 0.717
## 2 Specificity 0.995 1 1
## 3 Pos Pred Value 0.994 1 1
## 4 Neg Pred Value 0.843 0.999 0.999
## 5 Precision 0.983 1 1
## 6 Recall 0.815 0.459 0.717
## 7 F1 0.891 0.63 0.836
## 8 Prevalence 0.5 0.002 0.002
## 9 Detection Rate 0.2 0.113 0.176
## 10 Detection Prevalence 0.204 0.113 0.176
## 11 Balanced Accuracy 0.905 0.73 0.859
Let’s see how the predictions differ using equal priors, unequal priors, and priors adjusted for misclassification costs:
ld_predictions <-
  left_join(
    x = cm_lda$table |>
      data.frame() |>
      rename(LDA = Freq),
    y = cm_ldp$table |>
      data.frame() |>
      rename(priors = Freq),
    by = join_by(Prediction, Reference)
  ) |>
  left_join(
    y = cm_ldc$table |>
      data.frame() |>
      rename(cost = Freq),
    by = join_by(Prediction, Reference)
  ) |>
  rename(Actual = Reference)
ld_predictions
## Prediction Actual LDA priors cost
## 1 No No 1501 1508 1508
## 2 Yes No 7 0 0
## 3 No Yes 91 266 139
## 4 Yes Yes 401 226 353
Now let’s compare the expected misclassification costs of the predictions using priors alone vs. priors plus misclassification costs:

- Correct prediction -> cost = 0
- Predicting a fraudulent purchase as legitimate -> cost = 100
- Predicting a legitimate purchase as fraudulent -> cost = 1
ld_predictions |>
mutate(
misclass_cost = case_when(
Prediction == Actual ~ 0, # No cost when correct
Prediction == "No" & Actual == "Yes" ~ 100, # Misclassifying an actual fraud purchase as legitimate
Prediction == "Yes" & Actual == "No" ~ 1, # Misclassifying a legitimate purchase as fraudulent
)
) |>
summarize(
priors_cost = mean(priors*misclass_cost)/sum(LDA),
cost_cost = mean(cost*misclass_cost)/sum(LDA)
)
## priors_cost cost_cost
## 1 3.33 1.74
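Adjusting for the misclassification costs roughly halves the average misclassification cost relative to adjusting for the priors alone (1.74 vs. 3.33).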