Final Project Publish

#Name: Tanjil Azad
#StudentID: 2012193630

#Final Project
# Read the fraud transaction records from the CSV in the working directory.
Data <- read.csv(file = "Fraud Directory.csv")

# Preview the first ten rows.
head(Data, n = 10)
##    step     type   amount    nameOrig oldbalanceOrg newbalanceOrig    nameDest
## 1     1  PAYMENT  9839.64 C1231006815      170136.0      160296.36 M1979787155
## 2     1  PAYMENT  1864.28 C1666544295       21249.0       19384.72 M2044282225
## 3     1 TRANSFER   181.00 C1305486145         181.0           0.00  C553264065
## 4     1 CASH_OUT   181.00  C840083671         181.0           0.00   C38997010
## 5     1  PAYMENT 11668.14 C2048537720       41554.0       29885.86 M1230701703
## 6     1  PAYMENT  7817.71   C90045638       53860.0       46042.29  M573487274
## 7     1  PAYMENT  7107.77  C154988899      183195.0      176087.23  M408069119
## 8     1  PAYMENT  7861.64 C1912850431      176087.2      168225.59  M633326333
## 9     1  PAYMENT  4024.36 C1265012928        2671.0           0.00 M1176932104
## 10    1    DEBIT  5337.77  C712410124       41720.0       36382.23  C195600860
##    oldbalanceDest newbalanceDest isFraud isFlaggedFraud
## 1               0           0.00       0              0
## 2               0           0.00       0              0
## 3               0           0.00       1              0
## 4           21182           0.00       1              0
## 5               0           0.00       0              0
## 6               0           0.00       0              0
## 7               0           0.00       0              0
## 8               0           0.00       0              0
## 9               0           0.00       0              0
## 10          41898       40348.79       0              0
# Preview the last ten rows.
tail(Data, n = 10)
##      step     type    amount    nameOrig oldbalanceOrg newbalanceOrig
## 8841    7 CASH_OUT  10606.95 C1985832925             0              0
## 8842    7 CASH_OUT 178055.24  C778616125             0              0
## 8843    7 CASH_OUT  55593.37  C802853600             0              0
## 8844    7 CASH_OUT 287860.49 C1398464310             0              0
## 8845    7 CASH_OUT 112423.36 C1319381263             0              0
## 8846    7 CASH_OUT  41059.08  C634320051             0              0
## 8847    7 CASH_OUT 357427.06 C1584938872             0              0
## 8848    7 CASH_OUT  26205.91 C1541339374             0              0
## 8849    7 CASH_OUT  94641.18  C878530251             0              0
## 8850    7 CASH_OUT 176673.12 C1726238623             0              0
##         nameDest oldbalanceDest newbalanceDest isFraud isFlaggedFraud
## 8841 C1440282299       100424.0       111030.9       0              0
## 8842 C1778300121       182825.7       360880.9       0              0
## 8843 C1588476205        61609.0       117202.4       0              0
## 8844  C289039926      1322467.9      1610328.4       0              0
## 8845   C97730845     14500000.0     14700000.0       0              0
## 8846 C1329589315       148954.2       225461.0       0              0
## 8847 C1313534832       376357.3      1005781.5       1              0
## 8848 C1297685781      1094402.8      1120608.7       0              0
## 8849  C187019413       136886.3        72085.7       1              0
## 8850 C1085479446       422281.8       346139.6       0              0
# Per-column summary of the whole data frame
summary(object = Data)
##       step           type               amount           nameOrig        
##  Min.   :1.000   Length:8850        Min.   :       2   Length:8850       
##  1st Qu.:1.000   Class :character   1st Qu.:    4350   Class :character  
##  Median :4.000   Mode  :character   Median :   13066   Mode  :character  
##  Mean   :3.812                      Mean   :  102253                     
##  3rd Qu.:6.000                      3rd Qu.:  114218                     
##  Max.   :7.000                      Max.   :10000000                     
##  oldbalanceOrg      newbalanceOrig       nameDest         oldbalanceDest    
##  Min.   :       0   Min.   :       0   Length:8850        Min.   :       0  
##  1st Qu.:     196   1st Qu.:       0   Class :character   1st Qu.:       0  
##  Median :   22142   Median :   11937   Mode  :character   Median :       0  
##  Mean   :  942337   Mean   :  964951                      Mean   :  951834  
##  3rd Qu.:  200496   3rd Qu.:  199837                      3rd Qu.:  294917  
##  Max.   :12900000   Max.   :13000000                      Max.   :19500000  
##  newbalanceDest        isFraud        isFlaggedFraud
##  Min.   :       0   Min.   :0.00000   Min.   :0     
##  1st Qu.:       0   1st Qu.:0.00000   1st Qu.:0     
##  Median :       0   Median :0.00000   Median :0     
##  Mean   : 1120269   Mean   :0.02949   Mean   :0     
##  3rd Qu.:  247063   3rd Qu.:0.00000   3rd Qu.:0     
##  Max.   :19200000   Max.   :1.00000   Max.   :0
# Inspect the loaded data frame.
# View() opens a spreadsheet-style viewer window, which is only useful (and may
# only be available) in an interactive session, so it is skipped when the
# script is run in batch or knitted. str() prints the column structure either way.
if (interactive()) {
  View(Data)
}
str(Data)
## 'data.frame':    8850 obs. of  11 variables:
##  $ step          : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ type          : chr  "PAYMENT" "PAYMENT" "TRANSFER" "CASH_OUT" ...
##  $ amount        : num  9840 1864 181 181 11668 ...
##  $ nameOrig      : chr  "C1231006815" "C1666544295" "C1305486145" "C840083671" ...
##  $ oldbalanceOrg : num  170136 21249 181 181 41554 ...
##  $ newbalanceOrig: num  160296 19385 0 0 29886 ...
##  $ nameDest      : chr  "M1979787155" "M2044282225" "C553264065" "C38997010" ...
##  $ oldbalanceDest: num  0 0 0 21182 0 ...
##  $ newbalanceDest: num  0 0 0 0 0 ...
##  $ isFraud       : int  0 0 1 1 0 0 0 0 0 0 ...
##  $ isFlaggedFraud: int  0 0 0 0 0 0 0 0 0 0 ...
# Summary statistics for individual columns
summary(Data[["nameOrig"]])
##    Length     Class      Mode 
##      8850 character character
mean(Data[["oldbalanceOrg"]])
## [1] 942336.5
sd(Data[["newbalanceOrig"]])
## [1] 2248578
median(Data[["newbalanceOrig"]])
## [1] 11937.4
# Base-graphics box plot of the originator's new balance.
boxplot(Data$newbalanceOrig, col = "blue")

# Base-graphics histogram of the same column.
hist(Data$newbalanceOrig, col = "red", breaks = 10)


#Data visualization with ggplot2

library("ggplot2")

# Base plot with amount on x and transaction type on y. It is reused by the
# box plots further down, so its mapping is left unchanged.
p <- ggplot(Data, aes(amount, type))

p + geom_point()

p + geom_point(colour = "red", size = 3)

# Destination balance vs. amount. These plots get their own base object:
# overriding x and y inside a layer added to `p` kept the axis titles derived
# from p's mapping ("amount" / "type"), which mislabelled both axes.
p_dest <- ggplot(Data, aes(x = newbalanceDest, y = amount))

p_dest + geom_point(mapping = aes(alpha = type), col = "blue")
## Warning: Using alpha for a discrete variable is not advised.

p_dest + geom_point(mapping = aes(shape = type), col = "green")

#Scatterplots for categorical

p_dest + geom_point(mapping = aes(alpha = type), col = "red")
## Warning: Using alpha for a discrete variable is not advised.

#facets
# One panel per transaction type, arranged in two rows.

p_dest + geom_point(mapping = aes(alpha = type), col = "cyan") +
  facet_wrap(~type, nrow = 2)
## Warning: Using alpha for a discrete variable is not advised.

#barchart
# `f` maps type to x, but every bar layer below overrides x. The axis title
# would still be taken from f's mapping ("type"), so it is set explicitly to
# the variable that is actually counted.

f <- ggplot(Data, aes(type))


f + geom_bar(mapping = aes(x = isFraud, fill = type)) +
  labs(x = "isFraud")

f + geom_bar(mapping = aes(x = isFlaggedFraud, fill = type)) +
  labs(x = "isFlaggedFraud")

f +
  geom_bar(mapping = aes(x = isFraud), fill = "blue") +
  labs(x = "isFraud")

# Box plots of amount per transaction type, all built on the base plot `p`.

# Default box plot with the coordinates flipped.
p + geom_boxplot() + coord_flip()

# Notched boxes drawn in red.
p + geom_boxplot(col = "red", notch = TRUE)

# Variable-width boxes.
p + geom_boxplot(varwidth = TRUE)

# Blue fill with a magenta outline.
p + geom_boxplot(col = "magenta", fill = "blue")

# Outlier markers suppressed; the observations are drawn as jittered points.
p + geom_boxplot(outlier.shape = NA) + geom_jitter(width = 0.2)

# Histogram of the destination's new balance with a frequency polygon layer.
q <- ggplot(data = Data, mapping = aes(x = newbalanceDest))

q + geom_histogram() + geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

library(ggplot2)
library(plotly)
## Warning: package 'plotly' was built under R version 4.3.1
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
# Scatter plot of amount against the destination's new balance.
ggplot(data = Data, mapping = aes(x = newbalanceDest, y = amount)) +
  geom_point()

# Store the same base plot in `d` so layers can be added step by step.
d <- ggplot(data = Data, mapping = aes(x = newbalanceDest, y = amount))

d + geom_point()

# Same scatter plot with a linear-model smoother added.
d + geom_point() + geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

# Scatter plot with a linear-model smoother, stored in `r` for the axis-limit
# example that follows.
r <- ggplot(data = Data, mapping = aes(x = newbalanceDest, y = amount)) +
  geom_point() +
  geom_smooth(method = "lm")

r
## `geom_smooth()` using formula = 'y ~ x'

# Delete the points outside the limits
# xlim()/ylim() remove observations outside the given range, so the smoother
# is fitted only to what remains. The x range used to stop at 0.1, which on a
# balance axis reaching into the millions discarded every row with a non-zero
# destination balance; it now uses the same 0-1,000,000 window as the y axis.
r +
  xlim(c(0, 1000000)) +
  ylim(c(0, 1000000))
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 3736 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 3736 rows containing missing values (`geom_point()`).

# The limits above were added to a copy; `r` itself is unchanged.
print(r)
## `geom_smooth()` using formula = 'y ~ x'

# Styled scatter plot: fixed point colour and size, a red linear fit, a view
# zoomed to 0-1,000,000 on both axes, and descriptive titles.
j <- ggplot(data = Data, mapping = aes(x = newbalanceDest, y = amount)) +
  geom_point(size = 2, col = "steelblue") +
  geom_smooth(col = "red", method = "lm") +
  coord_cartesian(ylim = c(0, 1000000), xlim = c(0, 1000000)) +
  labs(
    x = "New balance Dest",
    y = "Amount",
    title = "New balance Dest Vs Amount",
    subtitle = "From Data dataset",
    caption = "Dataset of Payment"
  )
j
## `geom_smooth()` using formula = 'y ~ x'

# Interactive plotly version of the same plot.
ggplotly(p = j)
## `geom_smooth()` using formula = 'y ~ x'
#Color to reflect categories

jj <- ggplot(Data, aes(x = newbalanceDest, y = amount)) +
  geom_point(aes(col = type), size = 2) +  # Colour varies with transaction type.
  # Line thickness is set with `linewidth`; `size` for lines is deprecated
  # since ggplot2 3.4.0.
  geom_smooth(method = "lm", col = "red", linewidth = 0.5) +
  coord_cartesian(xlim = c(0, 1000000), ylim = c(0, 1000000)) +
  labs(
    title = "New balance Dest Vs Amount",
    subtitle = "From Data dataset",
    y = "Amount",
    x = "New balance Dest",
    caption = "Dataset of Payment"
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Draw the static plot, then its interactive plotly version.
print(jj)
## `geom_smooth()` using formula = 'y ~ x'

ggplotly(p = jj)
## `geom_smooth()` using formula = 'y ~ x'
#color, but size, shape, stroke (thickness of boundary) and fill (fill color) can be used to discriminate groupings.

# Hide the legend. The documented value for this is lowercase "none".
jj + theme(legend.position = "none")
## `geom_smooth()` using formula = 'y ~ x'

# Same plot with the "Set1" brewer palette for the point colours.
jj + scale_colour_brewer(palette = "Set1") + theme(legend.position = "none")
## `geom_smooth()` using formula = 'y ~ x'

# Base plot
# Redefines `jj` with smaller points; it is the starting point for the axis
# examples below.
jj <- ggplot(Data, aes(x = newbalanceDest, y = amount)) +
  geom_point(aes(col = type), size = 1) +  # Colour varies with transaction type.
  # Line thickness is set with `linewidth`; `size` for lines is deprecated
  # since ggplot2 3.4.0.
  geom_smooth(method = "lm", col = "red", linewidth = 0.5) +
  coord_cartesian(xlim = c(0, 1000000), ylim = c(0, 1000000)) +
  labs(
    title = "New balance Dest Vs Amount",
    subtitle = "From Data dataset",
    y = "Amount",
    x = "New balance Dest",
    caption = "Dataset of Payment"
  )
jj
## `geom_smooth()` using formula = 'y ~ x'

ggplotly(jj)
## `geom_smooth()` using formula = 'y ~ x'
# Change breaks
# Breaks are placed every 100,000 across the 0-1,000,000 window that `jj`
# zooms into. The previous seq(0, 0.1, 0.01) put all eleven ticks at the origin.
jj +
  scale_x_continuous(breaks = seq(0, 1000000, 100000))
## `geom_smooth()` using formula = 'y ~ x'

#Change labels
# Eleven breaks, so eleven letter labels.

jj +
  scale_x_continuous(breaks = seq(0, 1000000, 100000), labels = letters[1:11])
## `geom_smooth()` using formula = 'y ~ x'

# Change Axis Texts
# Both axes are labelled in thousands ("K"); the earlier percent format on x
# did not match a balance axis.
jj +
  scale_x_continuous(
    breaks = seq(0, 1000000, 100000),
    labels = function(x) paste0(x / 1000, "K")
  ) +
  scale_y_continuous(
    breaks = seq(0, 1000000, 200000),
    labels = function(x) paste0(x / 1000, "K")
  )
## `geom_smooth()` using formula = 'y ~ x'

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.