library(knitr)
===============================================
Displaying Top 5 rows of the dataset.
head(df, n = 5)
Description and Basic Summary of the Dataset, which explains the type of columns present, their datatypes, total number of entries in that column
summary(df)
## accountNumber customerId creditLimit
## Length:786363 Length:786363 Min. : 250
## Class :character Class :character 1st Qu.: 5000
## Mode :character Mode :character Median : 7500
## Mean :10759
## 3rd Qu.:15000
## Max. :50000
## availableMoney transactionDateTime transactionAmount
## Min. :-1006 Length:786363 Min. : 0.00
## 1st Qu.: 1077 Class :character 1st Qu.: 33.65
## Median : 3185 Mode :character Median : 87.90
## Mean : 6251 Mean : 136.99
## 3rd Qu.: 7500 3rd Qu.: 191.48
## Max. :50000 Max. :2011.54
## merchantName acqCountry merchantCountryCode
## Length:786363 Length:786363 Length:786363
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## posEntryMode posConditionCode merchantCategoryCode
## Length:786363 Length:786363 Length:786363
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## currentExpDate accountOpenDate
## Length:786363 Length:786363
## Class :character Class :character
## Mode :character Mode :character
##
##
##
## dateOfLastAddressChange cardCVV
## Length:786363 Length:786363
## Class :character Class :character
## Mode :character Mode :character
##
##
##
## enteredCVV cardLast4Digits transactionType
## Length:786363 Length:786363 Length:786363
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## echoBuffer currentBalance merchantCity
## Length:786363 Min. : 0.0 Length:786363
## Class :character 1st Qu.: 689.9 Class :character
## Mode :character Median : 2451.8 Mode :character
## Mean : 4508.7
## 3rd Qu.: 5291.1
## Max. :47498.8
## merchantState merchantZip cardPresent
## Length:786363 Length:786363 Mode :logical
## Class :character Class :character FALSE:433495
## Mode :character Mode :character TRUE :352868
##
##
##
## posOnPremises recurringAuthInd
## Length:786363 Length:786363
## Class :character Class :character
## Mode :character Mode :character
##
##
##
## expirationDateKeyInMatch isFraud
## Mode :logical Mode :logical
## FALSE:785320 FALSE:773946
## TRUE :1043 TRUE :12417
##
##
##
Displaying All the columns in the data at a glance. Columns Names:
colnames(df)
## [1] "accountNumber" "customerId"
## [3] "creditLimit" "availableMoney"
## [5] "transactionDateTime" "transactionAmount"
## [7] "merchantName" "acqCountry"
## [9] "merchantCountryCode" "posEntryMode"
## [11] "posConditionCode" "merchantCategoryCode"
## [13] "currentExpDate" "accountOpenDate"
## [15] "dateOfLastAddressChange" "cardCVV"
## [17] "enteredCVV" "cardLast4Digits"
## [19] "transactionType" "echoBuffer"
## [21] "currentBalance" "merchantCity"
## [23] "merchantState" "merchantZip"
## [25] "cardPresent" "posOnPremises"
## [27] "recurringAuthInd" "expirationDateKeyInMatch"
## [29] "isFraud"
Dimension of dataset:
dim(df)
## [1] 786363 29
Number of Records = 786363 Number of Attributes/Columns = 29
Null values in data:
df[df == ""] <- NA
colSums(is.na(df))
## accountNumber customerId
## 0 0
## creditLimit availableMoney
## 0 0
## transactionDateTime transactionAmount
## 0 0
## merchantName acqCountry
## 0 4562
## merchantCountryCode posEntryMode
## 724 4054
## posConditionCode merchantCategoryCode
## 409 0
## currentExpDate accountOpenDate
## 0 0
## dateOfLastAddressChange cardCVV
## 0 0
## enteredCVV cardLast4Digits
## 0 0
## transactionType echoBuffer
## 698 786363
## currentBalance merchantCity
## 0 786363
## merchantState merchantZip
## 786363 786363
## cardPresent posOnPremises
## 0 786363
## recurringAuthInd expirationDateKeyInMatch
## 786363 0
## isFraud
## 0
Unique values in data:
require(dplyr)
sapply(df, n_distinct)
## accountNumber customerId
## 5000 5000
## creditLimit availableMoney
## 10 521916
## transactionDateTime transactionAmount
## 776637 66038
## merchantName acqCountry
## 2490 5
## merchantCountryCode posEntryMode
## 5 6
## posConditionCode merchantCategoryCode
## 4 19
## currentExpDate accountOpenDate
## 165 1820
## dateOfLastAddressChange cardCVV
## 2184 899
## enteredCVV cardLast4Digits
## 976 5246
## transactionType echoBuffer
## 4 1
## currentBalance merchantCity
## 487318 1
## merchantState merchantZip
## 1 1
## cardPresent posOnPremises
## 2 1
## recurringAuthInd expirationDateKeyInMatch
## 1 2
## isFraud
## 2
Response attribute “isFraud” :
require(dplyr)
dplyr::count(df, isFraud, sort = TRUE)
isFraud attribute has 1.6% True Values.
df$isFraud <- as.integer(df$isFraud)
df$cardPresent <- as.integer(df$cardPresent)
df$expirationDateKeyInMatch <- as.integer(df$expirationDateKeyInMatch)
library(RColorBrewer)
e = count(df, creditLimit)
e <- e[complete.cases(e), ]
coul <- brewer.pal(8, "Set2")
par(mar = c(4.1, 4 ,2.1 ,0))
barplot(height= e$n, names= e$creditLimit, xlab = 'creditLimit' , ylab = 'Frequency', col=coul)
library(RColorBrewer)
coul <- brewer.pal(8, "Set2")
e = count(df, merchantCountryCode)
e <- e[complete.cases(e), ]
par(mar = c(4.1, 4 ,2.1 ,0))
barplot(height= e$n, names= e$merchantCountryCode, xlab = 'merchantCountryCode' , ylab = 'Frequency', col=coul, horiz = TRUE)
library(plotrix)
e = count(df, posEntryMode)
e <- e[complete.cases(e), ]
pie3D(e$n,labels = e$posEntryMode,explode = 0.1, main = "Pie Chart of posEntryMode ", mar = rep(1, 4),col = hcl.colors(length(e$n), "Spectral"))
library(RColorBrewer)
coul <- brewer.pal(8, "Set2")
e = count(df, posConditionCode)
e <- e[complete.cases(e), ]
par(mar = c(4.1, 4 ,1.1 ,0))
barplot(height= e$n, names= e$posConditionCode, xlab = 'posConditionCode' , ylab = 'Frequency', col=coul)
library(lessR)
merchant_with_fraud1 = subset(df, transactionType != is.na(transactionType), select = transactionType)
cols <- hcl.colors(length(unique(merchant_with_fraud1$transactionType)), "Zissou 1")
PieChart(transactionType,data = merchant_with_fraud1,values="%",
fill = "viridis",
main = "Transaction types",
color = "black",
lwd = 1.5,
values_color = c(rep("white", 2), 1),
values_size = 0.8)
## >>> Suggestions
## PieChart(transactionType, hole=0) # traditional pie chart
## PieChart(transactionType, values="%") # display %'s on the chart
## PieChart(transactionType) # bar chart
## Plot(transactionType) # bubble plot
## Plot(transactionType, values="count") # lollipop plot
##
## --- transactionType ---
##
## transactinTyp Count Prop
## -------------------------------------
## ADDRESS_VERIFICATION 20169 0.026
## PURCHASE 745193 0.948
## REVERSAL 20303 0.026
## -------------------------------------
## Total 785665 1.000
##
## Chi-squared test of null hypothesis of equal probabilities
## Chisq = 1337879.797, df = 2, p-value = 0.000
library(plotrix)
e = count(df, cardPresent)
e <- e[complete.cases(e), ]
pie3D(e$n,labels = e$cardPresent,explode = 0.1, main = "Card Present during transaction ", mar = rep(1.75, 4),col = hcl.colors(length(e$n), "Spectral"))
library(RColorBrewer)
coul <- brewer.pal(8, "Set2")
e = count(df, expirationDateKeyInMatch)
e <- e[complete.cases(e), ]
par(mar = c(4.1, 4 ,1.1 ,0))
barplot(height= e$n, names= e$expirationDateKeyInMatch, xlab = 'expirationDateKeyInMatch' , ylab = 'Frequency', col=coul)
count(df, expirationDateKeyInMatch)
Plot showing frequency of Fraud
As described earlier, it is clearly an example of an IMBALANCED DATSET with respect to isFraud attribute. But It is expected to be like that in real world scenario as Frauds are always less than 5% with respect to whole dataset.
Here it is 1.6%
Finding Correlation and Causation aspect with respect to fraud transactions
Here we can clearly see which are the Top Merchants:
To know more about the transaction data set it is important to know the Top Merchants where most of the Transactions happen. It helps in Optimizing the metrics and to prioritize the business and data strategies accordingly.
important_merchants = count(df, merchantName)
important_merchants <- filter(important_merchants, n > 10000)
arrange(important_merchants, -n)
Here we can clearly see that most transactions happen at:
library("ggplot2")
e = count(df, merchantCategoryCode)
e <- e[complete.cases(e), ]
ggplot(e, aes(reorder(merchantCategoryCode, -n), n)) +
geom_bar(stat = "identity",color='skyblue',fill='steelblue')+
theme(axis.text.x = element_text(angle = 90, size = 10))
df_num = df[ , c("creditLimit", "availableMoney","transactionAmount", "currentBalance")]
head(df_num, n = 5)
summary(df_num)
## creditLimit availableMoney transactionAmount
## Min. : 250 Min. :-1006 Min. : 0.00
## 1st Qu.: 5000 1st Qu.: 1077 1st Qu.: 33.65
## Median : 7500 Median : 3185 Median : 87.90
## Mean :10759 Mean : 6251 Mean : 136.99
## 3rd Qu.:15000 3rd Qu.: 7500 3rd Qu.: 191.48
## Max. :50000 Max. :50000 Max. :2011.54
## currentBalance
## Min. : 0.0
## 1st Qu.: 689.9
## Median : 2451.8
## Mean : 4508.7
## 3rd Qu.: 5291.1
## Max. :47498.8
Through analysis it can be seen that most of the Credit limits are under 10,000.
But some goes till 50,000.
While data Modelling this information can be really useful.
par(mfrow=c(2,2))
hist(df[ , c("creditLimit")], main="creditLimit", xlab="Frequency", ylab="credit Limit", col = "blue")
hist(df[ , c("availableMoney")], main="availableMoney", xlab="Frequency", ylab="availableMoney", col = "blue")
hist(df[ , c("transactionAmount")], main="transactionAmount", xlab="Frequency", ylab="transaction Amount", col = "blue")
hist(df[ , c("currentBalance")], main="currentBalance", xlab="Frequency", ylab="currentBalance", col = "blue")
par(mfrow=c(2,2))
boxplot(df_num[ , c("creditLimit")], col = "green", main="credit Limit")
boxplot(df_num[ , c("availableMoney")], col = "red", main="available Money")
boxplot(df_num[ , c("transactionAmount")], col = "blue", main="transaction Amount")
boxplot(df_num[ , c("currentBalance")], col = "purple", main="current Balance")
merchant_with_fraud = subset(df, isFraud == '1', select = merchantCategoryCode)
head(merchant_with_fraud, n = 10)
library("ggplot2")
e = count(merchant_with_fraud, merchantCategoryCode)
e <- e[complete.cases(e), ]
ggplot(e, aes(reorder(merchantCategoryCode, -n), n)) +
geom_bar(stat = "identity",color='skyblue',fill='brown')+
theme(axis.text.x = element_text(angle = 90, size = 10))
merchant_name__with_fraud = subset(df, isFraud == '1', select = merchantName)
important_merchants = count(merchant_name__with_fraud, merchantName)
important_merchants = filter(important_merchants, n > 300)
arrange(important_merchants, -n)
accountNumber_with_fraud = subset(df, isFraud == '1', select = accountNumber)
important_merchants = count(accountNumber_with_fraud, accountNumber)
important_merchants = filter(important_merchants, n > 200)
arrange(important_merchants, -n)
customers_with_fraud = subset(df, isFraud == '1', select = customerId)
important_merchants = count(customers_with_fraud, customerId)
important_merchants = filter(important_merchants, n > 200)
arrange(important_merchants, -n)
library(psych)
corPlot(df_num, cex = 1.2)
drops <- c('echoBuffer','merchantCity','merchantState','merchantZip','posOnPremises','recurringAuthInd')
df <- df[ , !(names(df) %in% drops)]
head(df, n = 5)
df_2 <- df
df_2 <- subset(df_2, transactionType != "ADDRESS_VERIFICATION" | is.na(transactionType))
df_2$transactionDate <- as.Date(df_2$transactionDateTime)
head(df_2, n = 5)
duplicate_trans = df_2[duplicated(df_2[,c('accountNumber', 'transactionAmount','merchantName', 'acqCountry' , 'accountOpenDate' ,'merchantCategoryCode','cardLast4Digits')]) | duplicated(df_2[,c('accountNumber', 'transactionAmount','merchantName', 'acqCountry' , 'accountOpenDate' ,'merchantCategoryCode','cardLast4Digits')], fromLast=TRUE), ]
head(duplicate_trans, n = 5)
dim(duplicate_trans)
## [1] 100716 24
duplicate_trans_fraud = subset(duplicate_trans, isFraud == '1', select = merchantName)
dup_fraud = count(duplicate_trans_fraud, merchantName)
dup_fraud = filter(dup_fraud, n >= 30)
arrange(dup_fraud, -n)
###Question: ### Reversal Transactions can be found directly by using the attribute Transaction Type.
nrow(subset(df_2, transactionType == 'REVERSAL'))
## [1] 20303
sum(subset(df_2, transactionType == 'REVERSAL')$transactionAmount)
## [1] 2821792
###Question: ### Merchnats where Frauds took place according to the given attribute transaction type = Reversal in the original dataset:
df_2_rev = subset(df_2, isFraud == '1', select = c(merchantName, transactionType))
df_2_rev = subset(df_2_rev, transactionType == 'REVERSAL', select = merchantName)
df_2_rev_fraud = count(df_2_rev, merchantName)
df_2_rev_fraud = filter(df_2_rev_fraud, n >= 10)
arrange(df_2_rev_fraud, -n)
###Question: ### 2nd method to find Reversal Transaction is to look directly into the duplicated transactions:
nrow(subset(duplicate_trans, transactionType == 'REVERSAL'))
## [1] 17826
sum(subset(duplicate_trans, transactionType == 'REVERSAL')$transactionAmount)
## [1] 2669860
duplicate_trans_fraud = subset(duplicate_trans, isFraud == '1', select = c(merchantName, transactionType))
reverse_trans_fraud = subset(duplicate_trans_fraud, transactionType == 'REVERSAL', select = merchantName)
rev_fraud = count(reverse_trans_fraud, merchantName)
rev_fraud = filter(rev_fraud, n >= 10)
arrange(rev_fraud, -n)
library(ggparliament)
library(tidyverse)
## ── Attaching packages ─────────────────── tidyverse 1.3.1 ──
## ✔ tibble 3.1.7 ✔ purrr 0.3.4
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────── tidyverse_conflicts() ──
## ✖ psych::%+%() masks ggplot2::%+%()
## ✖ psych::alpha() masks ggplot2::alpha()
## ✖ dplyr::filter() masks stats::filter()
## ✖ purrr::flatten() masks jsonlite::flatten()
## ✖ dplyr::lag() masks stats::lag()
## ✖ lessR::recode() masks dplyr::recode()
## ✖ lessR::rename() masks dplyr::rename()
ru_semicircle <- parliament_data(election_data = rev_fraud,
type = "semicircle", # Parliament type
parl_rows = 5, # Number of rows of the parliament
party_seats = rev_fraud$n) # Seats per party
ggplot(ru_semicircle, aes(x = x, y = y, colour = merchantName)) +
geom_parliament_seats() +
theme_ggparliament() +
labs(title = "merchantName")
{
df <- df[df$creditLimit > quantile(df$creditLimit, .25) - 1.5*IQR(df$creditLimit) &
df$creditLimit < quantile(df$creditLimit, .75) + 1.5*IQR(df$creditLimit), ]
head(df, n = 5)
}
library(RColorBrewer)
coul <- brewer.pal(8, "Set2")
e = count(df, creditLimit)
e <- e[complete.cases(e), ]
par(mar = c(4, 4 ,2 ,0))
barplot(height= e$n, names= e$creditLimit, col=coul, xlab = 'creditLimit' , ylab = 'Frequency' )
df <- df[df$availableMoney > quantile(df$availableMoney, .25) - 1.5*IQR(df$availableMoney) &
df$availableMoney < quantile(df$availableMoney, .75) + 1.5*IQR(df$availableMoney), ]
df <- df[df$transactionAmount > quantile(df$transactionAmount, .25) - 1.5*IQR(df$transactionAmount) &
df$transactionAmount < quantile(df$transactionAmount, .75) + 1.5*IQR(df$transactionAmount), ]
df <- df[df$currentBalance > quantile(df$currentBalance, .25) - 1.5*IQR(df$currentBalance) &
df$currentBalance < quantile(df$currentBalance, .75) + 1.5*IQR(df$currentBalance), ]
par(mfrow=c(2,2))
hist(df[ , c("creditLimit")], main="creditLimit", xlab="Frequency", ylab="credit Limit", col = "blue")
hist(df[ , c("availableMoney")], main="availableMoney", xlab="Frequency", ylab="availableMoney", col = "blue")
hist(df[ , c("transactionAmount")], main="transactionAmount", xlab="Frequency", ylab="transactionAmount", col = "blue")
hist(df[ , c("currentBalance")], main="currentBalance", xlab="Frequency", ylab="currentBalance", col = "blue")
par(mfrow=c(2,2))
boxplot(df[ , c("creditLimit")], col = "green", main="credit Limit")
boxplot(df[ , c("availableMoney")], col = "red", main="availableMoney")
boxplot(df[ , c("transactionAmount")], col = "blue", main="transaction Amounty")
boxplot(df[ , c("currentBalance")], col = "purple", main="current Balance")
df$CVVMatched <- df$cardCVV == df$enteredCVV
df$CVVMatched <- as.integer(df$CVVMatched)
dim(df)
## [1] 617105 24
df_time <- data.frame(df)
head(df_time,n=5 )
library("lubridate")
df_time <- df_time %>%
mutate(hour_admit = hour(strptime(transactionDateTime, format = "%Y-%m-%dT%H:%M:%S"))) %>%
mutate(time_period = case_when(
hour_admit > 05 & hour_admit < 11 ~ 0,
hour_admit >= 11 & hour_admit < 17 ~ 1,
hour_admit >= 17 & hour_admit < 23 ~ 2,
hour_admit >=23 | hour_admit <= 5 ~ 3))
df$transactionTimeCat = df_time$time_period
head(df, n=5)
count(df, transactionTimeCat)
count(df, CVVMatched)
drops <- c('accountNumber','customerId', 'transactionDateTime', 'cardLast4Digits', 'cardCVV', 'enteredCVV', 'accountOpenDate', 'dateOfLastAddressChange', 'currentExpDate')
df <- df[ , !(names(df) %in% drops)]
print('Final total columns')
## [1] "Final total columns"
ncol(df)
## [1] 16
library(superml)
le <- LabelEncoder$new()
columns <- c('merchantName', 'acqCountry', 'merchantCountryCode', 'merchantCategoryCode', 'transactionType', 'posEntryMode', 'posConditionCode')
label <- LabelEncoder$new()
df$merchantName <- label$fit_transform(df$merchantName)
df$acqCountry <- label$fit_transform(df$acqCountry)
df$merchantCountryCode <- label$fit_transform(df$merchantCountryCode)
df$merchantCategoryCode <- label$fit_transform(df$merchantCategoryCode)
df$transactionType <- label$fit_transform(df$transactionType)
df$posEntryMode <- label$fit_transform(df$posEntryMode)
df$posConditionCode <- label$fit_transform(df$posConditionCode)
df = df %>%
mutate_all(~ifelse(is.na(.), mode(., na.rm = TRUE), .))
#install.packages(c("zoo","xts","quantmod"))
#install.packages("smotefamily")
library(DMwR2) # for smote implementation
library(ROSE)# for ROSE sampling
# smote
set.seed(9560)
rose_train <- ROSE(isFraud ~ ., data = df)$data
table(rose_train$isFraud)
##
## 0 1
## 309174 307931
under_v1 <- ovun.sample(isFraud ~ .,
data = df,
method = "under",
N = 2 * sum(df$isFraud == 1))
df_under = under_v1$data
table(df_under$isFraud)
##
## 0 1
## 8847 8847
# Libraries
library(pROC, quietly=TRUE)
library(microbenchmark, quietly=TRUE)
# Set seed so the train/test split is reproducible
# Loading package
library(caTools)
library(ROCR)
ds_noads = df_under
#split <- sample.split(ds_noads, SplitRatio = 0.7)
#train <- subset(ds_noads, split == "TRUE")
#test <- subset(ds_noads, split == "FALSE")
#create a list of random number ranging from 1 to number of rows from actual data
#and 70% of the data into training data
data2 = sort(sample(nrow(ds_noads), nrow(ds_noads)*.8))
#creating training data set by selecting the output row values
train <- ds_noads[data2,]
#creating test data set by not selecting the output row values
test <- ds_noads[-data2,]
# Training model
logistic_model <- glm(isFraud ~ .,
data = train,
family = "binomial")
#logistic_model
# Summary
summary(logistic_model)
##
## Call:
## glm(formula = isFraud ~ ., family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0817 -1.0694 -0.5585 1.0963 2.0292
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value
## (Intercept) 0.203188709 0.168595143 1.205
## creditLimit 0.000011916 0.000006803 1.752
## availableMoney -0.000012060 0.000008381 -1.439
## transactionAmount 0.004758081 0.000169930 28.000
## merchantName -0.000465575 0.000041221 -11.295
## acqCountry 0.043301686 0.228171315 0.190
## merchantCountryCode 0.084819309 0.233430307 0.363
## posEntryMode -0.113553464 0.017284093 -6.570
## posConditionCode -0.075115314 0.039741254 -1.890
## merchantCategoryCode 0.006528148 0.004391111 1.487
## transactionType -0.027811227 0.050030075 -0.556
## currentBalance NA NA NA
## cardPresent -0.420620035 0.041709599 -10.084
## expirationDateKeyInMatch 0.795534257 0.514831351 1.545
## CVVMatched -0.433319209 0.159294140 -2.720
## transactionTimeCat -0.014773995 0.015988819 -0.924
## Pr(>|z|)
## (Intercept) 0.22813
## creditLimit 0.07984 .
## availableMoney 0.15017
## transactionAmount < 0.0000000000000002 ***
## merchantName < 0.0000000000000002 ***
## acqCountry 0.84948
## merchantCountryCode 0.71634
## posEntryMode 0.0000000000504 ***
## posConditionCode 0.05874 .
## merchantCategoryCode 0.13710
## transactionType 0.57829
## currentBalance NA
## cardPresent < 0.0000000000000002 ***
## expirationDateKeyInMatch 0.12229
## CVVMatched 0.00652 **
## transactionTimeCat 0.35548
## ---
## Signif. codes:
## 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 19623 on 14154 degrees of freedom
## Residual deviance: 18139 on 14140 degrees of freedom
## AIC: 18169
##
## Number of Fisher Scoring iterations: 4
dim(train)
## [1] 14155 16
dim(test)
## [1] 3539 16
predict_reg <- predict(logistic_model,
test, type = "response")
#predict_reg
# Changing probabilities
predict_reg <- ifelse(predict_reg >0.5, 1, 0)
# Evaluating model accuracy
# using confusion matrix
table(test$isFraud, predict_reg)
## predict_reg
## 0 1
## 0 1209 547
## 1 680 1103
missing_classerr <- mean(predict_reg != test$isFraud)
print(paste('Accuracy =', 1 - missing_classerr))
## [1] "Accuracy = 0.65329189036451"
as.data.frame(table(ds_noads$isFraud))
ROCPred <- prediction(predict_reg, test$isFraud)
ROCPer <- performance(ROCPred, measure = "tpr",
x.measure = "fpr")
auc <- performance(ROCPred, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.6535584
# Plotting curve
plot(ROCPer)
plot(ROCPer, colorize = TRUE,
print.cutoffs.at = seq(0.1, by = 0.1),
main = "ROC CURVE LOGISTIC REGRESSION")
abline(a = 0, b = 1)
auc <- round(auc, 4)
legend(.6, .4, auc, title = "AUC", cex = 1)
library(e1071)
library(caret)
confusionMatrix(factor(predict_reg), factor(test$isFraud))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1209 680
## 1 547 1103
##
## Accuracy : 0.6533
## 95% CI : (0.6373, 0.669)
## No Information Rate : 0.5038
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.3069
##
## Mcnemar's Test P-Value : 0.0001643
##
## Sensitivity : 0.6885
## Specificity : 0.6186
## Pos Pred Value : 0.6400
## Neg Pred Value : 0.6685
## Prevalence : 0.4962
## Detection Rate : 0.3416
## Detection Prevalence : 0.5338
## Balanced Accuracy : 0.6536
##
## 'Positive' Class : 0
##
confusionMatrix(factor(predict_reg), factor(test$isFraud), mode = "everything", positive="1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1209 680
## 1 547 1103
##
## Accuracy : 0.6533
## 95% CI : (0.6373, 0.669)
## No Information Rate : 0.5038
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.3069
##
## Mcnemar's Test P-Value : 0.0001643
##
## Sensitivity : 0.6186
## Specificity : 0.6885
## Pos Pred Value : 0.6685
## Neg Pred Value : 0.6400
## Precision : 0.6685
## Recall : 0.6186
## F1 : 0.6426
## Prevalence : 0.5038
## Detection Rate : 0.3117
## Detection Prevalence : 0.4662
## Balanced Accuracy : 0.6536
##
## 'Positive' Class : 1
##
fitControl <- trainControl(method="cv",
number = 5,
preProcOptions = list(thresh = 0.99), # threshold for pca preprocess
classProbs = TRUE,
summaryFunction = twoClassSummary)
train$isFraud <- factor(train$isFraud)
model_knn <- train(make.names(isFraud)~.,
train,
method="knn",
metric="ROC",
preProcess = c('center', 'scale'),
tuneLength=10,
trControl=fitControl)
pred_knn <- predict(model_knn, test)
ROCPred <- prediction(as.numeric(pred_knn), test$isFraud)
ROCPer <- performance(ROCPred, measure = "tpr",
x.measure = "fpr")
auc <- performance(ROCPred, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.6590938
# Plotting curve
plot(ROCPer)
plot(ROCPer, colorize = TRUE,
print.cutoffs.at = seq(0.1, by = 0.1),
main = "ROC CURVE KNN")
abline(a = 0, b = 1)
auc <- round(auc, 4)
legend(.6, .4, auc, title = "AUC", cex = 1)
raw.data = df_under
nrows <- nrow(raw.data)
set.seed(314)
indexT <- sample(1:nrow(raw.data), 0.8 * nrows)
#separate train and validation set
trainset = raw.data[indexT,]
verset = raw.data[-indexT,]
train = trainset
test = verset
library(rpart) # for regression trees
library(randomForest) # for random forests
# train a decision tree based on our dataset
tree.model <- rpart(isFraud ~ ., data = train)
# plot our regression tree
plot(tree.model, uniform=TRUE)
# add text labels & make them 60% as big as they are by default
text(tree.model, cex=.6)
raw.data = df_under
nrows <- nrow(raw.data)
set.seed(314)
indexT <- sample(1:nrow(raw.data), 0.8 * nrows)
#separate train and validation set
trainset = raw.data[indexT,]
verset = raw.data[-indexT,]
n <- names(trainset)
rf.form <- as.formula(paste("isFraud ~", paste(n[!n %in% "isFraud"], collapse = " + ")))
trainset.rf <- randomForest(rf.form,trainset,ntree=100,importance=T)
varimp <- data.frame(trainset.rf$importance)
vi1 <- ggplot(varimp, aes(x=reorder(rownames(varimp),IncNodePurity), y=IncNodePurity)) +
geom_bar(stat="identity", fill="tomato", colour="black") +
coord_flip() + theme_bw(base_size = 8) +
labs(title="Prediction using RandomForest with 100 trees", subtitle="Variable importance (IncNodePurity)", x="Variable", y="Variable importance (IncNodePurity)")
vi2 <- ggplot(varimp, aes(x=reorder(rownames(varimp),X.IncMSE), y=X.IncMSE)) +
geom_bar(stat="identity", fill="lightblue", colour="black") +
coord_flip() + theme_bw(base_size = 8) +
labs(title="Prediction using RandomForest with 100 trees", subtitle="Variable importance (%IncMSE)", x="Variable", y="Variable importance (%IncMSE)")
library(gridExtra)
library(grid)
library(ggplot2)
library(lattice)
grid.arrange(vi1, vi2, ncol=2)
library(xgboost, quietly=TRUE)
raw.data = df_under
nrows <- nrow(raw.data)
set.seed(314)
indexT <- sample(1:nrow(raw.data), 0.8 * nrows)
#separate train and validation set
trainset = raw.data[indexT,]
verset = raw.data[-indexT,]
train = trainset
test = verset
xgb.data.train <- xgb.DMatrix(as.matrix(train[, colnames(train) != "isFraud"]), label = train$isFraud)
xgb.data.test <- xgb.DMatrix(as.matrix(test[, colnames(test) != "isFraud"]), label = test$isFraud)
# Get the time to train the xgboost model
xgb.bench.speed = microbenchmark(
xgb.model.speed <- xgb.train(data = xgb.data.train
, params = list(objective = "binary:logistic"
, eta = 0.1
, max.depth = 3
, min_child_weight = 100
, subsample = 1
, colsample_bytree = 1
, nthread = 3
, eval_metric = "auc"
)
, watchlist = list(test = xgb.data.test)
, nrounds = 500
, early_stopping_rounds = 40
, print_every_n = 20
)
, times = 5L
)
## [1] test-auc:0.693403
## Will train until test_auc hasn't improved in 40 rounds.
##
## [21] test-auc:0.717561
## [41] test-auc:0.725942
## [61] test-auc:0.729602
## [81] test-auc:0.733082
## [101] test-auc:0.735312
## [121] test-auc:0.736935
## [141] test-auc:0.738546
## [161] test-auc:0.739718
## [181] test-auc:0.740832
## [201] test-auc:0.741978
## [221] test-auc:0.742289
## [241] test-auc:0.743184
## [261] test-auc:0.743543
## [281] test-auc:0.744220
## [301] test-auc:0.744372
## [321] test-auc:0.745138
## [341] test-auc:0.745831
## [361] test-auc:0.745897
## [381] test-auc:0.746428
## [401] test-auc:0.747213
## [421] test-auc:0.747397
## [441] test-auc:0.747453
## [461] test-auc:0.747624
## [481] test-auc:0.747817
## [500] test-auc:0.747818
## [1] test-auc:0.693403
## Will train until test_auc hasn't improved in 40 rounds.
##
## [21] test-auc:0.717561
## [41] test-auc:0.725942
## [61] test-auc:0.729602
## [81] test-auc:0.733082
## [101] test-auc:0.735312
## [121] test-auc:0.736935
## [141] test-auc:0.738546
## [161] test-auc:0.739718
## [181] test-auc:0.740832
## [201] test-auc:0.741978
## [221] test-auc:0.742289
## [241] test-auc:0.743184
## [261] test-auc:0.743543
## [281] test-auc:0.744220
## [301] test-auc:0.744372
## [321] test-auc:0.745138
## [341] test-auc:0.745831
## [361] test-auc:0.745897
## [381] test-auc:0.746428
## [401] test-auc:0.747213
## [421] test-auc:0.747397
## [441] test-auc:0.747453
## [461] test-auc:0.747624
## [481] test-auc:0.747817
## [500] test-auc:0.747818
## [1] test-auc:0.693403
## Will train until test_auc hasn't improved in 40 rounds.
##
## [21] test-auc:0.717561
## [41] test-auc:0.725942
## [61] test-auc:0.729602
## [81] test-auc:0.733082
## [101] test-auc:0.735312
## [121] test-auc:0.736935
## [141] test-auc:0.738546
## [161] test-auc:0.739718
## [181] test-auc:0.740832
## [201] test-auc:0.741978
## [221] test-auc:0.742289
## [241] test-auc:0.743184
## [261] test-auc:0.743543
## [281] test-auc:0.744220
## [301] test-auc:0.744372
## [321] test-auc:0.745138
## [341] test-auc:0.745831
## [361] test-auc:0.745897
## [381] test-auc:0.746428
## [401] test-auc:0.747213
## [421] test-auc:0.747397
## [441] test-auc:0.747453
## [461] test-auc:0.747624
## [481] test-auc:0.747817
## [500] test-auc:0.747818
## [1] test-auc:0.693403
## Will train until test_auc hasn't improved in 40 rounds.
##
## [21] test-auc:0.717561
## [41] test-auc:0.725942
## [61] test-auc:0.729602
## [81] test-auc:0.733082
## [101] test-auc:0.735312
## [121] test-auc:0.736935
## [141] test-auc:0.738546
## [161] test-auc:0.739718
## [181] test-auc:0.740832
## [201] test-auc:0.741978
## [221] test-auc:0.742289
## [241] test-auc:0.743184
## [261] test-auc:0.743543
## [281] test-auc:0.744220
## [301] test-auc:0.744372
## [321] test-auc:0.745138
## [341] test-auc:0.745831
## [361] test-auc:0.745897
## [381] test-auc:0.746428
## [401] test-auc:0.747213
## [421] test-auc:0.747397
## [441] test-auc:0.747453
## [461] test-auc:0.747624
## [481] test-auc:0.747817
## [500] test-auc:0.747818
## [1] test-auc:0.693403
## Will train until test_auc hasn't improved in 40 rounds.
##
## [21] test-auc:0.717561
## [41] test-auc:0.725942
## [61] test-auc:0.729602
## [81] test-auc:0.733082
## [101] test-auc:0.735312
## [121] test-auc:0.736935
## [141] test-auc:0.738546
## [161] test-auc:0.739718
## [181] test-auc:0.740832
## [201] test-auc:0.741978
## [221] test-auc:0.742289
## [241] test-auc:0.743184
## [261] test-auc:0.743543
## [281] test-auc:0.744220
## [301] test-auc:0.744372
## [321] test-auc:0.745138
## [341] test-auc:0.745831
## [361] test-auc:0.745897
## [381] test-auc:0.746428
## [401] test-auc:0.747213
## [421] test-auc:0.747397
## [441] test-auc:0.747453
## [461] test-auc:0.747624
## [481] test-auc:0.747817
## [500] test-auc:0.747818
print(xgb.bench.speed)
## Unit: seconds
## expr
## xgb.model.speed <- xgb.train(data = xgb.data.train, params = list(objective = "binary:logistic", eta = 0.1, max.depth = 3, min_child_weight = 100, subsample = 1, colsample_bytree = 1, nthread = 3, eval_metric = "auc"), watchlist = list(test = xgb.data.test), nrounds = 500, early_stopping_rounds = 40, print_every_n = 20)
## min lq mean median uq max neval
## 2.623388 2.633211 2.704979 2.735801 2.765213 2.76728 5
print(xgb.model.speed$bestScore)
## NULL
# Make predictions on test set for ROC curve
xgb.test.speed = predict(xgb.model.speed
, newdata = as.matrix(test[, colnames(test) != "isFraud"])
, ntreelimit = xgb.model.speed$bestInd)
#auc.xgb.speed = roc(test$Class, xgb.test.speed, plot = TRUE, col = "blue")
#print(auc.xgb.speed)
ROCPred <- prediction(xgb.test.speed, test$isFraud)
ROCPer <- performance(ROCPred, measure = "tpr",
x.measure = "fpr")
auc <- performance(ROCPred, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.7478539
# Plotting curve
plot(ROCPer)
plot(ROCPer, colorize = TRUE,
print.cutoffs.at = seq(0.1, by = 0.1),
main = "ROC CURVE XGBOOST")
abline(a = 0, b = 1)
auc <- round(auc, 4)
legend(.6, .4, auc, title = "AUC", cex = 1)
df_under.pca <- prcomp(df_under[,c(1:15)], center = TRUE,scale. = TRUE)
summary(df_under.pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 1.4445 1.4019 1.2840 1.09533 1.05290
## Proportion of Variance 0.1391 0.1310 0.1099 0.07998 0.07391
## Cumulative Proportion 0.1391 0.2701 0.3800 0.46001 0.53392
## PC6 PC7 PC8 PC9
## Standard deviation 1.00436 1.00003 0.99841 0.98256
## Proportion of Variance 0.06725 0.06667 0.06646 0.06436
## Cumulative Proportion 0.60116 0.66783 0.73429 0.79865
## PC10 PC11 PC12 PC13
## Standard deviation 0.97034 0.95195 0.84932 0.64800
## Proportion of Variance 0.06277 0.06041 0.04809 0.02799
## Cumulative Proportion 0.86142 0.92184 0.96993 0.99792
## PC14 PC15
## Standard deviation 0.17661 0.0000000000001122
## Proportion of Variance 0.00208 0.0000000000000000
## Cumulative Proportion 1.00000 1.0000000000000000
library(devtools)
install_github("vqv/ggbiplot")
library(ggbiplot)
ggbiplot(df_under.pca, labels=rownames(df_under))
Logisitc_Regression <- c("65.3%")
KNN <- c("65.9%")
XGBoost <- c("75%")
results <- data.frame(XGBoost, KNN, Logisitc_Regression)
results
======================================================================================================================================