Sameer Mathur
Reading, Processing and Describing ISLR Data
Using data.table()
---
library(ISLR)
library(data.table)
# load the built-in ISLR `Default` data set as a data.table
default.dt <- as.data.table(Default)
# dimensions of the data table (rows, columns)
dim(default.dt)
[1] 10000 4
# names of the data columns
names(default.dt)
[1] "default" "student" "balance" "income"
# data types of the data columns: compact structure listing of default.dt
# (class, number of observations, and the type of every column)
str(default.dt)
Classes 'data.table' and 'data.frame': 10000 obs. of 4 variables:
$ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
$ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
$ balance: num 730 817 1074 529 786 ...
$ income : num 44362 12106 31767 35704 38463 ...
- attr(*, ".internal.selfref")=<externalptr>
library(psych)
# attaching data columns
# NOTE(review): attach() puts the columns of default.dt on the search path so
# they can be referenced by bare name outside default.dt[...] / `data =` calls.
# It is global state that can silently mask, or be masked by, other objects;
# prefer with(default.dt, ...) or a `data =` argument and drop this call once
# nothing depends on it.
attach(default.dt)
# descriptive statistics for all columns (psych::describe)
# factor columns are flagged with `*` in the printed table and appear to be
# summarised through their integer level codes
describe(default.dt)
vars n mean sd median trimmed mad min
default* 1 10000 1.03 0.18 1.00 1.00 0.00 1.00
student* 2 10000 1.29 0.46 1.00 1.24 0.00 1.00
balance 3 10000 835.37 483.71 823.64 823.73 507.52 0.00
income 4 10000 33516.98 13336.64 34552.64 33305.57 16350.86 771.97
max range skew kurtosis se
default* 2.00 1.00 5.20 25.06 0.00
student* 2.00 1.00 0.90 -1.19 0.00
balance 2654.32 2654.32 0.25 -0.36 4.84
income 73554.23 72782.27 0.07 -0.90 133.37
library(psych)
# descriptive statistics, restricted to selected summary columns
# columns are picked by name rather than by position so the selection stays
# correct (or fails loudly) if the layout of describe()'s result ever changes
describe(default.dt)[
  ,
  c("vars", "n", "mean", "sd", "median", "min", "max")
]
vars n mean sd median min max
default* 1 10000 1.03 0.18 1.00 1.00 2.00
student* 2 10000 1.29 0.46 1.00 1.00 2.00
balance 3 10000 835.37 483.71 823.64 0.00 2654.32
income 4 10000 33516.98 13336.64 34552.64 771.97 73554.23
# number of defaulters by student status
# columns are looked up inside default.dt explicitly instead of relying on a
# prior attach(), so the call cannot pick up unrelated objects of the same name
with(default.dt, table(default, student))
student
default No Yes
No 6850 2817
Yes 206 127
# percentage of all customers in each default-by-student cell
# (cells sum to 100 over the whole table); columns are resolved inside
# default.dt explicitly rather than through attach()
round(prop.table(with(default.dt, table(default, student))) * 100, 2)
student
default No Yes
No 68.50 28.17
Yes 2.06 1.27
# group size, mean income and mean balance (rounded to 2 dp) per default status
default.dt[,
  list(
    N = .N,
    MeanIncome = round(mean(income), 2),
    MeanBalance = round(mean(balance), 2)
  ),
  by = "default"
]
default N MeanIncome MeanBalance
1: No 9667 33566.17 803.94
2: Yes 333 32089.15 1747.82
# group size, mean income and mean balance (rounded to 2 dp) for every
# default x student combination, sorted by default then student
default.dt[,
  list(
    N = .N,
    MeanIncome = round(mean(income), 2),
    MeanBalance = round(mean(balance), 2)
  ),
  by = c("default", "student")
][order(default, student)]
default student N MeanIncome MeanBalance
1: No No 6850 39993.52 744.50
2: No Yes 2817 17937.01 948.48
3: Yes No 206 40625.05 1678.43
4: Yes Yes 127 18243.51 1860.38
# average income by default status
# the formula interface with `data =` reads the columns from default.dt
# directly, so the result does not depend on attach() having been called
agg1 <- aggregate(income ~ default, data = default.dt, FUN = mean)
# replace the raw column names with presentation labels
colnames(agg1) <- c("Defaulters" , "Average Income")
agg1
Defaulters Average Income
1 No 33566.17
2 Yes 32089.15
# box-and-whisker plot of income, one box per default status
boxplot(
  income ~ default,
  data = default.dt,
  col = c("grey", "light blue"),
  xlab = "Defaulters",
  ylab = "Income (USD)",
  main = "Boxplot of Income by Defaulters"
)
# boxplot of income for every default x student combination
# boxes are labelled "<default>.<student>"; the two colours recycle across the
# four boxes so that colour still follows default status (No = grey,
# Yes = light blue)
boxplot(income ~ default * student, data = default.dt,
        main = "Boxplot of Income by Defaulters and Student Status",
        xlab = "Defaulter.Student", ylab = "Income (USD)",
        col = c("grey", "light blue"))
# lattice box-and-whisker plot of income by default status,
# drawn in a separate panel for each student status
library(lattice)
bwplot(
  income ~ default | student,
  data = default.dt,
  xlab = "Defaulters",
  horizontal = FALSE
)
# plot of mean income per default status (gplots::plotmeans),
# with the group means printed on the plot
library(gplots)
plotmeans(
  income ~ default,
  data = default.dt,
  mean.labels = TRUE,
  frame = TRUE,
  xlab = "Defaulters",
  ylab = "Income (USD)"
)
# scatter plot of balance (y) against income (x)
# the formula method is used because it accepts `data =`; the default
# plot(x, y) method has no such argument, would forward it as a graphical
# parameter (triggering warnings) and only worked through attach()
plot(balance ~ income, data = default.dt,
     ylab = "Balance (USD)", xlab = "Income (USD)",
     main = "Scatter Plot of Income and Balance")
# scatter plot of balance against income in a single panel,
# with points distinguished by default status
# `groups` is spelled out in full (no partial argument matching) and
# auto.key adds the legend needed to tell the two groups apart
xyplot(balance ~ income, groups = default, data = default.dt,
       auto.key = TRUE,
       ylab = "Balance (USD)", xlab = "Income (USD)",
       main = "Scatter Plot of Income and Balance by Defaulters")
# scatter plot of balance against income, one panel per default status;
# each panel shows the points plus a smoothed trend and has its own axis scales
xyplot(
  balance ~ income | default,
  data = default.dt,
  type = c("p", "smooth"),
  scales = "free",
  xlab = "Income (USD)",
  ylab = "Balance (USD)",
  main = "Scatter Plot of Income and Balance by Defaulters"
)