#Name: Tanjil Azad
#StudentID: 2012193630
#Final Project
# Load the transaction records. The file name is a relative path, so it is
# looked up in the current working directory.
Data <- read.csv(file = "Fraud Directory.csv")
# Preview the first ten rows.
head(Data, n = 10)
## step type amount nameOrig oldbalanceOrg newbalanceOrig nameDest
## 1 1 PAYMENT 9839.64 C1231006815 170136.0 160296.36 M1979787155
## 2 1 PAYMENT 1864.28 C1666544295 21249.0 19384.72 M2044282225
## 3 1 TRANSFER 181.00 C1305486145 181.0 0.00 C553264065
## 4 1 CASH_OUT 181.00 C840083671 181.0 0.00 C38997010
## 5 1 PAYMENT 11668.14 C2048537720 41554.0 29885.86 M1230701703
## 6 1 PAYMENT 7817.71 C90045638 53860.0 46042.29 M573487274
## 7 1 PAYMENT 7107.77 C154988899 183195.0 176087.23 M408069119
## 8 1 PAYMENT 7861.64 C1912850431 176087.2 168225.59 M633326333
## 9 1 PAYMENT 4024.36 C1265012928 2671.0 0.00 M1176932104
## 10 1 DEBIT 5337.77 C712410124 41720.0 36382.23 C195600860
## oldbalanceDest newbalanceDest isFraud isFlaggedFraud
## 1 0 0.00 0 0
## 2 0 0.00 0 0
## 3 0 0.00 1 0
## 4 21182 0.00 1 0
## 5 0 0.00 0 0
## 6 0 0.00 0 0
## 7 0 0.00 0 0
## 8 0 0.00 0 0
## 9 0 0.00 0 0
## 10 41898 40348.79 0 0
# Preview the last ten rows.
tail(Data, n = 10)
## step type amount nameOrig oldbalanceOrg newbalanceOrig
## 8841 7 CASH_OUT 10606.95 C1985832925 0 0
## 8842 7 CASH_OUT 178055.24 C778616125 0 0
## 8843 7 CASH_OUT 55593.37 C802853600 0 0
## 8844 7 CASH_OUT 287860.49 C1398464310 0 0
## 8845 7 CASH_OUT 112423.36 C1319381263 0 0
## 8846 7 CASH_OUT 41059.08 C634320051 0 0
## 8847 7 CASH_OUT 357427.06 C1584938872 0 0
## 8848 7 CASH_OUT 26205.91 C1541339374 0 0
## 8849 7 CASH_OUT 94641.18 C878530251 0 0
## 8850 7 CASH_OUT 176673.12 C1726238623 0 0
## nameDest oldbalanceDest newbalanceDest isFraud isFlaggedFraud
## 8841 C1440282299 100424.0 111030.9 0 0
## 8842 C1778300121 182825.7 360880.9 0 0
## 8843 C1588476205 61609.0 117202.4 0 0
## 8844 C289039926 1322467.9 1610328.4 0 0
## 8845 C97730845 14500000.0 14700000.0 0 0
## 8846 C1329589315 148954.2 225461.0 0 0
## 8847 C1313534832 376357.3 1005781.5 1 0
## 8848 C1297685781 1094402.8 1120608.7 0 0
## 8849 C187019413 136886.3 72085.7 1 0
## 8850 C1085479446 422281.8 346139.6 0 0
# Column-by-column summary of the whole data frame
summary(object = Data)
## step type amount nameOrig
## Min. :1.000 Length:8850 Min. : 2 Length:8850
## 1st Qu.:1.000 Class :character 1st Qu.: 4350 Class :character
## Median :4.000 Mode :character Median : 13066 Mode :character
## Mean :3.812 Mean : 102253
## 3rd Qu.:6.000 3rd Qu.: 114218
## Max. :7.000 Max. :10000000
## oldbalanceOrg newbalanceOrig nameDest oldbalanceDest
## Min. : 0 Min. : 0 Length:8850 Min. : 0
## 1st Qu.: 196 1st Qu.: 0 Class :character 1st Qu.: 0
## Median : 22142 Median : 11937 Mode :character Median : 0
## Mean : 942337 Mean : 964951 Mean : 951834
## 3rd Qu.: 200496 3rd Qu.: 199837 3rd Qu.: 294917
## Max. :12900000 Max. :13000000 Max. :19500000
## newbalanceDest isFraud isFlaggedFraud
## Min. : 0 Min. :0.00000 Min. :0
## 1st Qu.: 0 1st Qu.:0.00000 1st Qu.:0
## Median : 0 Median :0.00000 Median :0
## Mean : 1120269 Mean :0.02949 Mean :0
## 3rd Qu.: 247063 3rd Qu.:0.00000 3rd Qu.:0
## Max. :19200000 Max. :1.00000 Max. :0
# Inspect the data frame
# View() opens a spreadsheet-style viewer window, so it is only called when
# the session is interactive; batch runs skip straight to str().
if (interactive()) {
  View(Data)
}
# Compact listing of each column's type and first values.
str(Data)
## 'data.frame': 8850 obs. of 11 variables:
## $ step : int 1 1 1 1 1 1 1 1 1 1 ...
## $ type : chr "PAYMENT" "PAYMENT" "TRANSFER" "CASH_OUT" ...
## $ amount : num 9840 1864 181 181 11668 ...
## $ nameOrig : chr "C1231006815" "C1666544295" "C1305486145" "C840083671" ...
## $ oldbalanceOrg : num 170136 21249 181 181 41554 ...
## $ newbalanceOrig: num 160296 19385 0 0 29886 ...
## $ nameDest : chr "M1979787155" "M2044282225" "C553264065" "C38997010" ...
## $ oldbalanceDest: num 0 0 0 21182 0 ...
## $ newbalanceDest: num 0 0 0 0 0 ...
## $ isFraud : int 0 0 1 1 0 0 0 0 0 0 ...
## $ isFlaggedFraud: int 0 0 0 0 0 0 0 0 0 0 ...
# Summaries of individual columns
summary(Data[["nameOrig"]])
## Length Class Mode
## 8850 character character
# Mean of the oldbalanceOrg column.
mean(Data[["oldbalanceOrg"]])
## [1] 942336.5
# Standard deviation and median of the newbalanceOrig column.
sd(Data[["newbalanceOrig"]])
## [1] 2248578
median(Data[["newbalanceOrig"]])
## [1] 11937.4
# Box plot of newbalanceOrig (base graphics)
boxplot(Data$newbalanceOrig, col = "blue")
# Histogram of the same column.
# The column is passed as Data$newbalanceOrig because hist() builds its
# default title and x label from the expression it is given.
hist(Data$newbalanceOrig, col = "red", breaks = 10)
# Data visualization with ggplot2
library("ggplot2")
# Base plot object: amount on the x axis, type on the y axis. Later sections
# reuse p and add their own layers to it.
p <- ggplot(data = Data, mapping = aes(x = amount, y = type))
# Default points, then the same plot with larger red points.
p + geom_point()
p + geom_point(size = 3, colour = "red")
# Scatter plots of amount against newbalanceDest, built on the base plot p.
# Each layer overrides p's x/y mapping (amount, type), but ggplot2 keeps the
# axis titles from the plot-level mapping, so the titles are set explicitly
# to match what is actually drawn.
dest_axis_titles <- labs(x = "newbalanceDest", y = "amount")
p +
  geom_point(
    mapping = aes(x = newbalanceDest, y = amount, alpha = type),
    col = "blue"
  ) +
  dest_axis_titles
## Warning: Using alpha for a discrete variable is not advised.
p +
  geom_point(
    mapping = aes(x = newbalanceDest, y = amount, shape = type),
    col = "green"
  ) +
  dest_axis_titles
# Scatterplots for categorical
p +
  geom_point(
    mapping = aes(x = newbalanceDest, y = amount, alpha = type),
    col = "red"
  ) +
  dest_axis_titles
## Warning: Using alpha for a discrete variable is not advised.
# Facets: one panel per type, laid out in two rows
p +
  geom_point(
    mapping = aes(x = newbalanceDest, y = amount, alpha = type),
    col = "cyan"
  ) +
  facet_wrap(~type, nrow = 2) +
  dest_axis_titles
## Warning: Using alpha for a discrete variable is not advised.
# Bar charts
# f maps x to type, but every bar layer below overrides x. The axis title
# would otherwise still read "type", so it is set to the column that is
# actually counted.
f <- ggplot(Data, aes(type))
# Row counts per isFraud value, stacked by type.
f +
  geom_bar(mapping = aes(x = isFraud, fill = type)) +
  labs(x = "isFraud")
# Row counts per isFlaggedFraud value, stacked by type.
f +
  geom_bar(mapping = aes(x = isFlaggedFraud, fill = type)) +
  labs(x = "isFlaggedFraud")
# Row counts per isFraud value in a single colour.
f +
  geom_bar(mapping = aes(x = isFraud), fill = "blue") +
  labs(x = "isFraud")
# Box plots of amount per type, reusing the base plot p
# Default box plot with the coordinates flipped.
p +
  geom_boxplot() +
  coord_flip()
# Notched boxes drawn in red.
p + geom_boxplot(col = "red", notch = TRUE)
# Variable-width boxes (varwidth = TRUE).
p + geom_boxplot(varwidth = TRUE)
# Blue fill with a magenta outline.
p + geom_boxplot(col = "magenta", fill = "blue")
# Outlier markers are hidden and jittered points are overlaid instead.
p +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(width = 0.2)
# Histogram of newbalanceDest with a frequency polygon on top
q <- ggplot(data = Data, mapping = aes(x = newbalanceDest))
# Both layers use the default binning, so each one prints the bins = 30 note.
q +
  geom_histogram() +
  geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
library(ggplot2)
library(plotly)
## Warning: package 'plotly' was built under R version 4.3.1
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
# Scatterplot of amount against newbalanceDest
ggplot(data = Data, mapping = aes(x = newbalanceDest, y = amount)) +
  geom_point()
# Keep the bare plot in d so later steps can add layers to it.
d <- ggplot(data = Data, mapping = aes(x = newbalanceDest, y = amount))
d + geom_point()
# Adding a regression line (linear model fit) on top of the points
d + geom_point() + geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
# X and Y axis: store the points + fit plot in r for the axis-limit examples
r <- ggplot(data = Data, mapping = aes(x = newbalanceDest, y = amount)) +
  geom_point() +
  geom_smooth(method = "lm")
r
## `geom_smooth()` using formula = 'y ~ x'
# Delete the points outside the limits
# xlim()/ylim() discard rows outside the range (ggplot2 warns about removed
# rows), so the fitted line is based only on the rows that remain.
# newbalanceDest is a balance column reaching into the millions, so the x
# limit uses the same 0 to 1,000,000 window as the y limit.
r +
  xlim(c(0, 1000000)) +
  ylim(c(0, 1000000))
## `geom_smooth()` using formula = 'y ~ x'
# The limits above were not assigned back to r, so printing r again shows
# the full, unrestricted plot.
r
## `geom_smooth()` using formula = 'y ~ x'
# Change color and size of points
# coord_cartesian() sets the visible 0 to 1,000,000 window on both axes.
j <- ggplot(data = Data, mapping = aes(x = newbalanceDest, y = amount)) +
  geom_point(size = 2, col = "steelblue") +
  geom_smooth(method = "lm", col = "red") +
  coord_cartesian(xlim = c(0, 1000000), ylim = c(0, 1000000)) +
  labs(
    title = "New balance Dest Vs Amount",
    subtitle = "From Data dataset",
    x = "New balance Dest",
    y = "Amount",
    caption = "Dataset of Payment"
  )
j
## `geom_smooth()` using formula = 'y ~ x'
# Interactive plotly version of the same plot.
ggplotly(j)
## `geom_smooth()` using formula = 'y ~ x'
# Color to reflect categories
# Points are coloured by type; the red line is a linear model fit.
jj <- ggplot(Data, aes(x = newbalanceDest, y = amount)) +
  geom_point(aes(col = type), size = 2) + # Colour varies with type.
  # Line thickness is set with linewidth; size is deprecated for lines
  # since ggplot2 3.4.0.
  geom_smooth(method = "lm", col = "red", linewidth = 0.5) +
  coord_cartesian(xlim = c(0, 1000000), ylim = c(0, 1000000)) +
  labs(
    title = "New balance Dest Vs Amount",
    subtitle = "From Data dataset",
    y = "Amount",
    x = "New balance Dest",
    caption = "Dataset of Payment"
  )
jj
## `geom_smooth()` using formula = 'y ~ x'
# Interactive plotly version of the same plot.
ggplotly(jj)
## `geom_smooth()` using formula = 'y ~ x'
# Besides color, the size, shape, stroke (thickness of boundary) and fill
# (fill color) aesthetics can also be used to discriminate groupings.
# Hide the legend; "none" (lowercase) is the documented value for
# legend.position.
jj + theme(legend.position = "none")
## `geom_smooth()` using formula = 'y ~ x'
# Same plot with the Set1 Brewer palette for the point colours.
jj + scale_colour_brewer(palette = "Set1") + theme(legend.position = "none")
## `geom_smooth()` using formula = 'y ~ x'
# Base plot
# Rebuilds jj with smaller points (size = 1) for the axis examples below.
jj <- ggplot(Data, aes(x = newbalanceDest, y = amount)) +
  geom_point(aes(col = type), size = 1) + # Colour varies with type.
  # Line thickness is set with linewidth; size is deprecated for lines
  # since ggplot2 3.4.0.
  geom_smooth(method = "lm", col = "red", linewidth = 0.5) +
  coord_cartesian(xlim = c(0, 1000000), ylim = c(0, 1000000)) +
  labs(
    title = "New balance Dest Vs Amount",
    subtitle = "From Data dataset",
    y = "Amount",
    x = "New balance Dest",
    caption = "Dataset of Payment"
  )
jj
## `geom_smooth()` using formula = 'y ~ x'
# Interactive plotly version of the same plot.
ggplotly(jj)
## `geom_smooth()` using formula = 'y ~ x'
# Change breaks
# jj shows x from 0 to 1,000,000 (see its coord_cartesian() call), so the
# breaks are placed every 100,000 across that window: 11 breaks in total.
balance_breaks <- seq(0, 1000000, 100000)
jj +
  scale_x_continuous(breaks = balance_breaks)
## `geom_smooth()` using formula = 'y ~ x'
# Change labels
# One letter per break; letters[1:11] matches the 11 breaks above.
jj +
  scale_x_continuous(breaks = balance_breaks, labels = letters[1:11])
## `geom_smooth()` using formula = 'y ~ x'
# Change Axis Texts
# Both axes are labelled in thousands with a "K" suffix.
jj +
  scale_x_continuous(
    breaks = balance_breaks,
    labels = function(x) {
      paste0(x / 1000, "K")
    }
  ) +
  scale_y_continuous(
    breaks = seq(0, 1000000, 200000),
    labels = function(x) {
      paste0(x / 1000, "K")
    }
  )
## `geom_smooth()` using formula = 'y ~ x'
# Note that the echo = FALSE parameter was added to the
# code chunk to prevent printing of the R code that generated the
# plot.