## Load Libraries ##
library(readr)
library(data.table)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
## Load data ##
# Single source of truth for the input file: both readers below must load the
# same CSV, so the path is defined once instead of being repeated as a literal.
data_path <- "./DefaultData.csv"
# readr reader; the result is a tibble (see the classes reported by str()).
df <- read_csv(data_path)
## Parsed with column specification:
## cols(
## default = col_character(),
## student = col_character(),
## balance = col_double(),
## income = col_double()
## )
## Looking into structure of the data ##
str(df)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 10000 obs. of 4 variables:
## $ default: chr "No" "No" "No" "No" ...
## $ student: chr "No" "Yes" "No" "No" ...
## $ balance: num 730 817 1074 529 786 ...
## $ income : num 44362 12106 31767 35704 38463 ...
## - attr(*, "spec")=
## .. cols(
## .. default = col_character(),
## .. student = col_character(),
## .. balance = col_double(),
## .. income = col_double()
## .. )
## Load into data table ##
# Same file read with data.table's fread(); the result is a data.table.
dt <- fread(data_path)
## Looking into structure of the data ##
str(dt)
## Classes 'data.table' and 'data.frame': 10000 obs. of 4 variables:
## $ default: chr "No" "No" "No" "No" ...
## $ student: chr "No" "Yes" "No" "No" ...
## $ balance: num 730 817 1074 529 786 ...
## $ income : num 44362 12106 31767 35704 38463 ...
## - attr(*, ".internal.selfref")=<externalptr>
dim(df)
## [1] 10000 4
colnames(df)
## [1] "default" "student" "balance" "income"
# Columns are referenced explicitly through `df` from here on instead of via
# attach(df): attached columns sit on the search path, where they are silently
# shadowed by same-named globals and by packages loaded later (psych, loaded
# below, exports its own `income` object).
sapply(df, class)
## default student balance income
## "character" "character" "numeric" "numeric"
sapply(dt, class)
## default student balance income
## "character" "character" "numeric" "numeric"
# Counts per default class. Naming the argument keeps the "default" header in
# the printed table; table() handles the character column directly, so no
# separate factor copy of the column is needed.
default_counts <- table(default = df$default)
default_counts
## default
## No Yes
## 9667 333
# Number of rows for every student/default combination.
df %>%
  group_by(student, default) %>%
  summarise(number = n())
## # A tibble: 4 x 3
## # Groups: student [2]
## student default number
## <chr> <chr> <int>
## 1 No No 6850
## 2 No Yes 206
## 3 Yes No 2817
## 4 Yes Yes 127
# Same cross-tabulation with row and column totals; with() scopes the column
# lookup to df for this one expression only.
with(df, addmargins(table(student, default), c(1, 2)))
## default
## student No Yes Sum
## No 6850 206 7056
## Yes 2817 127 2944
## Sum 9667 333 10000
# Share of each default class, in percent of all rows.
round(default_counts / nrow(df), 3) * 100
## default
## No Yes
## 96.7 3.3
## Mean of income ##
mean(df$income)
## [1] 33516.98
## Standard deviation of income ##
sd(df$income)
## [1] 13336.64
## Variance of income ##
var(df$income)
## [1] 177865955
## minimum income
round(min(df$income), 2)
## [1] 771.97
## maximum income
round(max(df$income), 2)
## [1] 73554.23
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# Convert the two Yes/No columns to factors in place so that describe() and
# the grouped summaries/plots that follow treat them as categorical.
df$default <- as.factor(df$default)
df$student <- as.factor(df$student)
# Keep only the headline statistics; columns are selected by name rather than
# by position so the selection does not depend on describe()'s column order.
describe(df, na.rm = TRUE)[, c("vars", "n", "mean", "sd", "median", "min", "max")]
## vars n mean sd median min max
## default* 1 10000 1.03 0.18 1.00 1.00 2.00
## student* 2 10000 1.29 0.46 1.00 1.00 2.00
## balance 3 10000 835.37 483.71 823.64 0.00 2654.32
## income 4 10000 33516.98 13336.64 34552.64 771.97 73554.23
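For the factor columns (marked with `*`), `describe()` codes the levels numerically (No = 1, Yes = 2) before computing the statistics. The mean of 1.29 for `student` therefore means that about 29% of the observations are students, and the mean of 1.03 for `default` means that about 3% of the observations defaulted.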
# Mean balance for each default class.
df %>%
  group_by(default) %>%
  summarise(avg_balance = mean(balance))
## # A tibble: 2 x 2
## default avg_balance
## <fct> <dbl>
## 1 No 804.
## 2 Yes 1748.
# Histogram of balance using bins of width 100.
plot <- ggplot(data = df, mapping = aes(x = balance)) +
  geom_histogram(binwidth = 100)
print(plot)
# Row count, mean and standard deviation of balance for every
# default/student combination.
df %>%
  group_by(default, student) %>%
  summarise(
    count = n(),
    mean_balance = mean(balance),
    sd_balance = sd(balance)
  )
## # A tibble: 4 x 5
## # Groups: default [2]
## default student count mean_balance sd_balance
## <fct> <fct> <int> <dbl> <dbl>
## 1 No No 6850 745. 446.
## 2 No Yes 2817 948. 451.
## 3 Yes No 206 1678. 331.
## 4 Yes Yes 127 1860. 329.
# Single boxplot of balance over all rows.
plot <- ggplot(data = df, mapping = aes(y = balance)) +
  geom_boxplot()
print(plot)
# Boxplots of balance, one per student level (distinguished by fill colour).
plot <- ggplot(data = df, mapping = aes(y = balance, fill = student)) +
  geom_boxplot()
print(plot)