# Read the external CSV file into a data frame called "cc.df".
# stringsAsFactors = TRUE is set explicitly: since R 4.0.0 read.csv() no
# longer converts character columns to factors by default, and the factor
# summaries of `default` / `student` later in this script depend on it.
cc.df <- read.csv("DefaultData.csv", stringsAsFactors = TRUE)
# Build a data.table holding the same data (used for grouped summaries)
library(data.table)
## Warning: package 'data.table' was built under R version 3.5.3
cc.dt <- data.table(cc.df)
# Column names of the data frame
names(cc.df)
## [1] "default" "student" "balance" "income"
# Dimensions of the data frame: number of rows, then number of columns
dim(cc.df)
## [1] 10000 4
# Column names again (same result as the first call above)
names(cc.df)
## [1] "default" "student" "balance" "income"
# Compact structure of the data frame: class, column types, first values
str(cc.df)
## 'data.frame': 10000 obs. of 4 variables:
## $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
## $ balance: num 730 817 1074 529 786 ...
## $ income : num 44362 12106 31767 35704 38463 ...
# Same structure view for the data.table version of the data
str(cc.dt)
## Classes 'data.table' and 'data.frame': 10000 obs. of 4 variables:
## $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
## $ balance: num 730 817 1074 529 786 ...
## $ income : num 44362 12106 31767 35704 38463 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Number of observations in each level of `default`
summary(cc.df[["default"]])
## No Yes
## 9667 333
# Cross-tabulation: `default` in the rows, `student` in the columns
defaultvsStudent <- table(cc.df[["default"]], cc.df[["student"]])
defaultvsStudent
##
## No Yes
## No 6850 2817
## Yes 206 127
# The same contingency table with row and column totals ("Sum") appended
addmargins(defaultvsStudent)
##
## No Yes Sum
## No 6850 2817 9667
## Yes 206 127 333
## Sum 7056 2944 10000
# Proportion of observations in each level of `default`
prop.table(table(cc.df[["default"]]))
##
## No Yes
## 0.9667 0.0333
# Descriptive statistics with psych::describe()
library(psych)
## Warning: package 'psych' was built under R version 3.5.3
# Income: variable index, mean and median (columns picked by name)
describe(cc.df$income)[, c("vars", "mean", "median")]
## vars mean median
## X1 1 33516.98 34552.64
# Income: minimum and maximum
describe(cc.df$income)[, c("min", "max")]
## min max
## X1 771.97 73554.23
## Descriptive statistics for every variable in the data frame;
## rows marked with * are the factor columns, summarised on their integer codes
describe(cc.df)[, c("vars", "n", "mean", "sd", "median", "min", "max")]
## vars n mean sd median min max
## default* 1 10000 1.03 0.18 1.00 1.00 2.00
## student* 2 10000 1.29 0.46 1.00 1.00 2.00
## balance 3 10000 835.37 483.71 823.64 0.00 2654.32
## income 4 10000 33516.98 13336.64 34552.64 771.97 73554.23
## Descriptive statistics: mean balance for each level of `default`.
## The column is selected by name instead of by position (3) and the grouping
## list is named, so the result columns read "default" and "balance" rather
## than the generic "Group.1" and "x".
aggregate(cc.df["balance"], by = list(default = cc.df$default), FUN = mean)
## Histogram of the balance
hist(cc.df$balance)
# Part 14: Mean, count and SD
## Group-by analysis: row count, mean and standard deviation of `balance`
## for every combination of `default` and `student`
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# NOTE(review): the summary below uses data.table syntax on cc.dt, not dplyr
# verbs; confirm whether dplyr is needed elsewhere before dropping it.
# A stray extra comma before `by` (an empty positional argument) was removed;
# the call now passes only i (empty), j and by.
cc.dt[, .(N = .N,
          meanBalance = mean(balance),
          sdBalance = sd(balance)),
      by = .(default, student)]
## default student N meanBalance sdBalance
## 1: No No 6850 744.5044 445.5151
## 2: No Yes 2817 948.4802 450.5537
## 3: Yes Yes 127 1860.3791 328.7356
## 4: Yes No 206 1678.4295 330.9141
## Horizontal boxplot of the credit card balance (all observations)
boxplot(cc.df$balance, horizontal = TRUE)
# Part 15b: Box plot for credit card balance vs default
## One box per level of `default`. The title previously read
## "hsitogram of balance", which was misspelled and named the wrong plot type.
## Axis labels are set explicitly so they do not depend on how the formula
## terms are deparsed.
boxplot(balance ~ default,
        data = cc.df,
        main = "Boxplot of balance by default status",
        xlab = "default",
        ylab = "balance",
        col = c("grey", "light blue"))