# 1. Use data visualization to show the ccFraud dataset
# with insights (7 different kinds of visualizations).
#
# Session housekeeping.
# The script no longer clears the workspace or changes the working directory:
# `rm(list = ls())` destroys whatever the caller had loaded, and a hard-coded
# absolute `setwd()` path only exists on one machine. In the recorded session
# both were no-ops (the workspace was empty and getwd() already pointed at the
# project directory), so the outputs below are unaffected.
ls()
## character(0)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 359246 19.2 592000 31.7 460000 24.6
## Vcells 549350 4.2 1023718 7.9 841966 6.5
getwd()
## [1] "/home/ajayohri/Desktop/test/homework3"
dir()
## [1] "ccFraud.csv" "code.R" "homework3.Rproj"
## [4] "homework4.R" "homework4.spin.R" "homework4.spin.Rmd"

# The data file is read by relative path, so fail early with a clear message
# if the script is started from somewhere other than the project directory.
if (!file.exists("ccFraud.csv")) {
  stop(
    "ccFraud.csv not found in ", getwd(),
    "; run this script from the project directory.",
    call. = FALSE
  )
}
library(data.table)
# Read the credit-card fraud data with data.table's fread().
# The progress messages fread printed in the recorded run are kept below as
# `##` output comments; left uncommented they are not valid R and stop the
# file from parsing.
fraud <- fread("ccFraud.csv")
##
## Read 29.2% of 10000000 rows
## Read 53.0% of 10000000 rows
## Read 78.6% of 10000000 rows
## Read 10000000 rows and 9 (of 9) columns from 0.272 GB file in 00:00:05
# First look at the loaded table: its class, column names, column types and
# per-column summary statistics. Lines starting with `##` are the output
# recorded when the script was run.

# fread() returns a data.table, which also inherits from data.frame.
class(fraud)
## [1] "data.table" "data.frame"
# Nine columns in total.
names(fraud)
## [1] "custID" "gender" "state" "cardholder"
## [5] "balance" "numTrans" "numIntlTrans" "creditLine"
## [9] "fraudRisk"
# Every column is read as integer, including the coded fields
# (gender, state, cardholder, fraudRisk).
str(fraud)
## Classes 'data.table' and 'data.frame': 10000000 obs. of 9 variables:
## $ custID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ gender : int 1 2 2 1 1 2 1 1 2 1 ...
## $ state : int 35 2 2 15 46 44 3 10 32 23 ...
## $ cardholder : int 1 1 1 1 1 2 1 1 1 1 ...
## $ balance : int 3000 0 0 0 0 5546 2000 6016 2428 0 ...
## $ numTrans : int 4 9 27 12 11 21 41 20 4 18 ...
## $ numIntlTrans: int 14 0 9 0 16 0 0 3 10 56 ...
## $ creditLine : int 2 18 16 5 7 13 1 6 22 5 ...
## $ fraudRisk : int 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Numeric summary of each column. Because the coded fields are still integers
# here, their means are proportions/averages of the codes rather than
# meaningful quantities; fraudRisk is 0/1, so its mean (0.0596) is the share
# of rows flagged 1.
summary(fraud)
## custID gender state cardholder
## Min. :1.0e+00 Min. :1.000 Min. : 1.00 Min. :1.00
## 1st Qu.:2.5e+06 1st Qu.:1.000 1st Qu.:10.00 1st Qu.:1.00
## Median :5.0e+06 Median :1.000 Median :24.00 Median :1.00
## Mean :5.0e+06 Mean :1.382 Mean :24.66 Mean :1.03
## 3rd Qu.:7.5e+06 3rd Qu.:2.000 3rd Qu.:38.00 3rd Qu.:1.00
## Max. :1.0e+07 Max. :2.000 Max. :51.00 Max. :2.00
## balance numTrans numIntlTrans creditLine
## Min. : 0 Min. : 0.00 Min. : 0.000 Min. : 1.000
## 1st Qu.: 0 1st Qu.: 10.00 1st Qu.: 0.000 1st Qu.: 4.000
## Median : 3706 Median : 19.00 Median : 0.000 Median : 6.000
## Mean : 4110 Mean : 28.94 Mean : 4.047 Mean : 9.134
## 3rd Qu.: 6000 3rd Qu.: 39.00 3rd Qu.: 4.000 3rd Qu.:11.000
## Max. :41485 Max. :100.00 Max. :60.000 Max. :75.000
## fraudRisk
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.0596
## 3rd Qu.:0.0000
## Max. :1.0000
# Frequency counts for each coded column. Columns are pulled out with `[[ ]]`
# (exact-name extraction); the `##` lines are the recorded output.
class(fraud)
## [1] "data.table" "data.frame"

# cardholder: code 1 accounts for the vast majority of rows.
table(fraud[["cardholder"]])
##
## 1 2
## 9699957 300043

# gender: two codes, roughly a 62/38 split.
table(fraud[["gender"]])
##
## 1 2
## 6178231 3821769

# state: 51 codes with very uneven row counts.
table(fraud[["state"]])
##
## 1 2 3 4 5 6 7 8 9
## 20137 162574 101740 202776 1216069 171774 121802 20603 30333
## 10 11 12 13 14 15 16 17 18
## 608630 303984 50438 111775 60992 404720 203143 91127 142170
## 19 20 21 22 23 24 25 26 27
## 151715 201918 202444 40819 304553 182201 203045 101829 30131
## 28 29 30 31 32 33 34 35 36
## 60617 303833 20215 40563 284428 81332 91326 608575 364531
## 37 38 39 40 41 42 43 44 45
## 122191 121846 405892 30233 152253 20449 203827 812638 91375
## 46 47 48 49 50 51
## 252812 20017 202972 182557 61385 20691

# fraudRisk: about 6% of rows carry the value 1.
table(fraud[["fraudRisk"]])
##
## 0 1
## 9403986 596014
# Convert the integer-coded categorical columns to factors so that the plots
# and models below treat them as groups rather than numbers.
# data.table's `:=` updates the listed columns in place in a single pass,
# instead of four separate `$<-` assignments on a 10-million-row table.
factor_cols <- c("gender", "state", "fraudRisk", "cardholder")
fraud[, (factor_cols) := lapply(.SD, as.factor), .SDcols = factor_cols]
# Box plots of the two account-level measures (balance and creditLine) against
# each grouping column. Columns are resolved through `data = fraud` rather
# than attach(fraud): attaching puts a copy of the table on the search path,
# where its names can be masked by (or mask) other objects, and it is never
# detached.

# Balance by state, gender, cardholder count and fraud flag.
boxplot(balance ~ state, data = fraud)

boxplot(balance ~ gender, data = fraud)

boxplot(balance ~ cardholder, data = fraud)

boxplot(balance ~ fraudRisk, data = fraud)

# Credit line by the same four groupings.
boxplot(creditLine ~ state, data = fraud)

boxplot(creditLine ~ gender, data = fraud)

boxplot(creditLine ~ cardholder, data = fraud)

boxplot(creditLine ~ fraudRisk, data = fraud)

#
# 2. Use data visualization to show the Boston dataset with insights (4 different kinds of visualizations).
# Boston housing data from MASS: median home value (medv) and nitric-oxide
# level (nox) against the Charles River flag (chas) and rooms per dwelling (rm).
library(MASS)
data("Boston")
names(Boston)
## [1] "crim" "zn" "indus" "chas" "nox" "rm" "age"
## [8] "dis" "rad" "tax" "ptratio" "black" "lstat" "medv"

# Columns are looked up via `data = Boston` instead of attach(Boston), which
# would leave the data set on the search path for the rest of the session.

# Scatter plots of medv against chas and against rm.
plot(medv ~ chas, data = Boston)

plot(medv ~ rm, data = Boston)

# medv split by the chas flag.
boxplot(medv ~ chas, data = Boston)

# rm is continuous, so grouping on its raw values draws one box per distinct
# value. Bin it into whole-room intervals first so each box summarises a
# meaningful group; include.lowest keeps a minimum that falls exactly on the
# first break.
rm_breaks <- seq(floor(min(Boston$rm)), ceiling(max(Boston$rm)))
boxplot(
  medv ~ cut(rm, breaks = rm_breaks, include.lowest = TRUE),
  data = Boston,
  xlab = "rm (binned)"
)

# nox split by the chas flag.
boxplot(nox ~ chas, data = Boston)
