Sameer Mathur
Reading, Processing and Describing ISLR Data
Using data.table()
---
library(ISLR)
library(data.table)
# converting the built-in Default data set to a data.table
default.dt <- data.table(Default)
# keeping a plain data.frame copy of the same data
default.df <- as.data.frame(default.dt)
# mean of income, evaluated inside the data.table
default.dt[, mean(income)]
[1] 33516.98
# standard deviation of income
default.dt[, sd(income)]
[1] 13336.64
# smallest income value
default.dt[, min(income)]
[1] 771.9677
# largest income value
default.dt[, max(income)]
[1] 73554.23
summary() function
library(psych)
# descriptive statistics using summary() function
summary(default.dt)
default student balance income
No :9667 No :7056 Min. : 0.0 Min. : 772
Yes: 333 Yes:2944 1st Qu.: 481.7 1st Qu.:21340
Median : 823.6 Median :34553
Mean : 835.4 Mean :33517
3rd Qu.:1166.3 3rd Qu.:43808
Max. :2654.3 Max. :73554
describe() function in psych Package
# descriptive statistics for all columns
library(psych)
describe(default.dt)
vars n mean sd median trimmed mad min
default* 1 10000 1.03 0.18 1.00 1.00 0.00 1.00
student* 2 10000 1.29 0.46 1.00 1.24 0.00 1.00
balance 3 10000 835.37 483.71 823.64 823.73 507.52 0.00
income 4 10000 33516.98 13336.64 34552.64 33305.57 16350.86 771.97
max range skew kurtosis se
default* 2.00 1.00 5.20 25.06 0.00
student* 2.00 1.00 0.90 -1.19 0.00
balance 2654.32 2654.32 0.25 -0.36 4.84
income 73554.23 72782.27 0.07 -0.90 133.37
library(psych)
# descriptive statistics, keeping only the named summary columns
describe(default.dt)[, c("vars", "n", "mean", "sd", "median", "min", "max")]
vars n mean sd median min max
default* 1 10000 1.03 0.18 1.00 1.00 2.00
student* 2 10000 1.29 0.46 1.00 1.00 2.00
balance 3 10000 835.37 483.71 823.64 0.00 2654.32
income 4 10000 33516.98 13336.64 34552.64 771.97 73554.23
# cross-tabulate default status (rows) by student status (columns) once;
# with() resolves the columns inside default.dt, so attach() is not needed
default.student.tab <- with(default.dt, table(default, student))
# number of defaulters by student
default.student.tab
student
default No Yes
No 6850 2817
Yes 206 127
# number of defaulters by student and adding sum
addmargins(default.student.tab)
student
default No Yes Sum
No 6850 2817 9667
Yes 206 127 333
Sum 7056 2944 10000
# proportion of loan defaulters by student
# (proportions are computed before the margins are added; computing them
# after addmargins() would count every customer four times, so the grand
# total would come out as 0.25 instead of 1)
round(addmargins(prop.table(default.student.tab)), 3)
student
default No Yes Sum
No 0.685 0.282 0.967
Yes 0.021 0.013 0.033
Sum 0.706 0.294 1.000
# percentage of loan defaulters by student
round(addmargins(prop.table(default.student.tab)) * 100, 2)
student
default No Yes Sum
No 68.50 28.17 96.67
Yes 2.06 1.27 3.33
Sum 70.56 29.44 100.00
xtabs() function
# number of defaulters by student
# default status in rows, student status in columns
z <- xtabs(~ default + student, data = default.dt)
z
student
default No Yes
No 6850 2817
Yes 206 127
# adding row and column totals (Sum)
addmargins(z)
student
default No Yes Sum
No 6850 2817 9667
Yes 206 127 333
Sum 7056 2944 10000
# number of defaulters by student, with column totals (Sum row)
addmargins(xtabs(~ default + student, data = default.dt), 1)
student
default No Yes
No 6850 2817
Yes 206 127
Sum 7056 2944
# number of defaulters by student, with row totals (Sum column)
addmargins(xtabs(~ default + student, data = default.dt), 2)
student
default No Yes Sum
No 6850 2817 9667
Yes 206 127 333
aggregate()
# average income by defaulters
# mean income per default status, with readable column names
agg1 <- setNames(
  aggregate(default.df$income, by = list(default.df$default), FUN = mean),
  c("Defaulters", "AverageIncome")
)
agg1
Defaulters AverageIncome
1 No 33566.17
2 Yes 32089.15
data.table function
library(data.table)
# row count, mean income and mean balance (2 decimals) per default status
default.dt[
  ,
  list(
    N = .N,
    MeanIncome = round(mean(income), 2),
    MeanBalance = round(mean(balance), 2)
  ),
  by = "default"
]
default N MeanIncome MeanBalance
1: No 9667 33566.17 803.94
2: Yes 333 32089.15 1747.82
aggregate()
# average income by defaulters and student
# mean income for every student/default combination; the grouping list is
# named so each label stays tied to the column it describes (student is
# the first grouping variable, default the second)
agg3 <- aggregate(
  default.df$income,
  by = list(Student = default.df$student, Defaulters = default.df$default),
  FUN = mean
)
# only the aggregated value column still needs a readable name
colnames(agg3)[3] <- "AverageIncome"
agg3
Student Defaulters AverageIncome
1 No No 39993.52
2 Yes No 17937.01
3 No Yes 40625.05
4 Yes Yes 18243.51
# row count, mean income and mean balance (2 decimals) for every
# student/default combination, sorted by student and then default
default.dt[
  ,
  list(
    N = .N,
    MeanIncome = round(mean(income), 2),
    MeanBalance = round(mean(balance), 2)
  ),
  by = c("student", "default")
][order(student, default)]
student default N MeanIncome MeanBalance
1: No No 6850 39993.52 744.50
2: No Yes 206 40625.05 1678.43
3: Yes No 2817 17937.01 948.48
4: Yes Yes 127 18243.51 1860.38