# homework4.R

# 1. Use data visualization to show the ccFraud dataset
# with insights (7 different kinds of visualizations).
#
# Session housekeeping and data load ----
# The workspace is deliberately NOT cleared and the working directory is NOT
# changed here: both are global side effects that break when the script is
# sourced from another session or machine. Run the script from the project
# directory (the one that contains ccFraud.csv).

# Show what is currently defined in the workspace (informational only).
ls()

## character(0)

# Trigger a garbage collection and report memory usage before the large read.
gc()

##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 359246 19.2     592000 31.7   460000 24.6
## Vcells 549350  4.2    1023718  7.9   841966  6.5

# Confirm where relative paths will be resolved from.
getwd()

## [1] "/home/ajayohri/Desktop/test/homework3"

# List the files available in the working directory.
dir()

## [1] "ccFraud.csv"        "code.R"             "homework3.Rproj"   
## [4] "homework4.R"        "homework4.spin.R"   "homework4.spin.Rmd"

library(data.table)

# Path of the input file, relative to the working directory.
fraud_path <- "ccFraud.csv"

# Fail early with a clear message instead of a less specific reader error.
if (!file.exists(fraud_path)) {
  stop(
    "Cannot find '", fraud_path, "' in ", getwd(),
    "; run this script from the project directory.",
    call. = FALSE
  )
}

# fread() returns a data.table (see the class() output further down).
fraud <- fread(fraud_path)

## 
## Read 29.2% of 10000000 rows
## Read 53.0% of 10000000 rows
## Read 78.6% of 10000000 rows
## Read 10000000 rows and 9 (of 9) columns from 0.272 GB file in 00:00:05

# Structure overview of the loaded table ----

# Object class: fread() yields a data.table, which is also a data.frame.
class(fraud)

## [1] "data.table" "data.frame"

# Column names.
colnames(fraud)

## [1] "custID"       "gender"       "state"        "cardholder"  
## [5] "balance"      "numTrans"     "numIntlTrans" "creditLine"  
## [9] "fraudRisk"

# Compact per-column type and preview.
str(fraud)

## Classes 'data.table' and 'data.frame':   10000000 obs. of  9 variables:
##  $ custID      : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ gender      : int  1 2 2 1 1 2 1 1 2 1 ...
##  $ state       : int  35 2 2 15 46 44 3 10 32 23 ...
##  $ cardholder  : int  1 1 1 1 1 2 1 1 1 1 ...
##  $ balance     : int  3000 0 0 0 0 5546 2000 6016 2428 0 ...
##  $ numTrans    : int  4 9 27 12 11 21 41 20 4 18 ...
##  $ numIntlTrans: int  14 0 9 0 16 0 0 3 10 56 ...
##  $ creditLine  : int  2 18 16 5 7 13 1 6 22 5 ...
##  $ fraudRisk   : int  0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, ".internal.selfref")=<externalptr>

# Five-number summary plus mean for every column.
summary(fraud)

##      custID            gender          state         cardholder  
##  Min.   :1.0e+00   Min.   :1.000   Min.   : 1.00   Min.   :1.00  
##  1st Qu.:2.5e+06   1st Qu.:1.000   1st Qu.:10.00   1st Qu.:1.00  
##  Median :5.0e+06   Median :1.000   Median :24.00   Median :1.00  
##  Mean   :5.0e+06   Mean   :1.382   Mean   :24.66   Mean   :1.03  
##  3rd Qu.:7.5e+06   3rd Qu.:2.000   3rd Qu.:38.00   3rd Qu.:1.00  
##  Max.   :1.0e+07   Max.   :2.000   Max.   :51.00   Max.   :2.00  
##     balance         numTrans       numIntlTrans      creditLine    
##  Min.   :    0   Min.   :  0.00   Min.   : 0.000   Min.   : 1.000  
##  1st Qu.:    0   1st Qu.: 10.00   1st Qu.: 0.000   1st Qu.: 4.000  
##  Median : 3706   Median : 19.00   Median : 0.000   Median : 6.000  
##  Mean   : 4110   Mean   : 28.94   Mean   : 4.047   Mean   : 9.134  
##  3rd Qu.: 6000   3rd Qu.: 39.00   3rd Qu.: 4.000   3rd Qu.:11.000  
##  Max.   :41485   Max.   :100.00   Max.   :60.000   Max.   :75.000  
##    fraudRisk     
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.0596  
##  3rd Qu.:0.0000  
##  Max.   :1.0000

# Class is unchanged by the inspection calls above.
class(fraud)

## [1] "data.table" "data.frame"

# Frequency tables of the integer-coded categorical columns ----

# Counts per cardholder code.
table(fraud[["cardholder"]])

## 
##       1       2 
## 9699957  300043

# Counts per gender code.
table(fraud[["gender"]])

## 
##       1       2 
## 6178231 3821769

# Counts per state code.
table(fraud[["state"]])

## 
##       1       2       3       4       5       6       7       8       9 
##   20137  162574  101740  202776 1216069  171774  121802   20603   30333 
##      10      11      12      13      14      15      16      17      18 
##  608630  303984   50438  111775   60992  404720  203143   91127  142170 
##      19      20      21      22      23      24      25      26      27 
##  151715  201918  202444   40819  304553  182201  203045  101829   30131 
##      28      29      30      31      32      33      34      35      36 
##   60617  303833   20215   40563  284428   81332   91326  608575  364531 
##      37      38      39      40      41      42      43      44      45 
##  122191  121846  405892   30233  152253   20449  203827  812638   91375 
##      46      47      48      49      50      51 
##  252812   20017  202972  182557   61385   20691

# Counts per fraud-risk flag.
table(fraud[["fraudRisk"]])

## 
##       0       1 
## 9403986  596014

# Recode the integer-coded categorical columns as factors so the formula-based
# plots below treat them as grouping variables.
# `:=` updates the data.table by reference in a single pass, instead of four
# separate `$<-` assignments that can each copy the 10M-row table.
factor_cols <- c("gender", "state", "fraudRisk", "cardholder")
fraud[, (factor_cols) := lapply(.SD, as.factor), .SDcols = factor_cols]

# Box plots of balance and credit line, grouped by each categorical column.
# Columns are resolved through `data = fraud` rather than attach(): attach()
# leaves the table on the search path for the rest of the session (it was
# never detached) and can silently mask or be masked by other objects.

# Balance by group.
boxplot(balance ~ state, data = fraud)

boxplot(balance ~ gender, data = fraud)

boxplot(balance ~ cardholder, data = fraud)

boxplot(balance ~ fraudRisk, data = fraud)

# Credit line by group.
boxplot(creditLine ~ state, data = fraud)

boxplot(creditLine ~ gender, data = fraud)

boxplot(creditLine ~ cardholder, data = fraud)

boxplot(creditLine ~ fraudRisk, data = fraud)

#
# 2. Use data visualization to show the Boston dataset with insights (4 different kinds of visualizations)
# Load the Boston data set shipped with MASS into the workspace.
library(MASS)
data("Boston")
names(Boston)

##  [1] "crim"    "zn"      "indus"   "chas"    "nox"     "rm"      "age"    
##  [8] "dis"     "rad"     "tax"     "ptratio" "black"   "lstat"   "medv"

# Columns are resolved through `data = Boston` rather than attach(), so nothing
# is left on the search path and no column name (e.g. `rm`) can shadow or be
# shadowed by an object of the same name.

# Scatter plots of medv against chas and against rm.
plot(medv ~ chas, data = Boston)

plot(medv ~ rm, data = Boston)

# Box plots of medv grouped by chas and by rm.
boxplot(medv ~ chas, data = Boston)

# NOTE(review): `rm` appears to be a continuous variable (see ?Boston), so this
# draws one box per distinct value; consider binning it (e.g. with cut())
# if a grouped comparison is intended.
boxplot(medv ~ rm, data = Boston)

# Box plot of nox grouped by chas.
boxplot(nox ~ chas, data = Boston)

# homework4.R
#
# ajayohri
#
# Sat Dec 17 08:13:15 2016