Sameer Mathur
Reading, Processing and Describing ISLR Data
Using data.table()
---
library(ISLR)
library(data.table)
# built-in ISLR `Default` data set, held as a data.table
default.dt <- as.data.table(Default)
# the same rows and columns again, as a plain data frame
default.df <- as.data.frame(default.dt)
# dimensions of the data table (number of rows, number of columns)
c(nrow(default.dt), ncol(default.dt))
[1] 10000 4
# names of the data columns
names(default.dt)
[1] "default" "student" "balance" "income"
# first six rows
head(default.dt, n = 6L)
default student balance income
1: No No 729.5265 44361.625
2: No Yes 817.1804 12106.135
3: No No 1073.5492 31767.139
4: No No 529.2506 35704.494
5: No No 785.6559 38463.496
6: No Yes 919.5885 7491.559
# last six rows
tail(default.dt, n = 6L)
default student balance income
1: No Yes 172.4130 14955.94
2: No No 711.5550 52992.38
3: No No 757.9629 19660.72
4: No No 845.4120 58636.16
5: No No 1569.0091 36669.11
6: No Yes 200.9222 16862.95
# random few rows: some() from the car package prints a random sample of rows,
# so the rows shown below will differ from run to run
# NOTE(review): call set.seed() beforehand if the output must be reproducible
library(car)
some(default.dt)
default student balance income
1: No No 590.3998 42769.20
2: No No 1755.3889 35031.53
3: No No 1372.9358 35467.76
4: No No 1345.7070 40802.89
5: No Yes 1286.8398 17464.99
6: No No 902.7137 23847.39
7: No No 904.4241 56062.02
8: No No 1558.5970 38880.44
9: No No 784.4701 52617.21
10: No No 1391.0339 53255.02
# mean of income, computed inside the data.table
default.dt[, mean(income)]
[1] 33516.98
# standard deviation of income
# (the original called mean() here and therefore printed the mean again)
sd(default.dt$income)
[1] 13336.64
# smallest income in the data
default.dt[, min(income)]
[1] 771.9677
# largest income in the data
default.dt[, max(income)]
[1] 73554.23
# attaching data columns: puts the columns of default.dt on the search path so
# they can be referred to by bare name (e.g. default, student)
# NOTE(review): attach() is generally discouraged because attached names can be
# masked silently; with(default.dt, ...) or a data = argument is safer
attach(default.dt)
# data types of the data columns
str(default.dt)
Classes 'data.table' and 'data.frame': 10000 obs. of 4 variables:
$ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
$ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
$ balance: num 730 817 1074 529 786 ...
$ income : num 44362 12106 31767 35704 38463 ...
- attr(*, ".internal.selfref")=<externalptr>
summary() function
library(psych)
# descriptive statistics using summary() function:
# level counts for the factor columns (default, student) and
# min / quartiles / mean / max for the numeric columns (balance, income)
summary(default.dt)
default student balance income
No :9667 No :7056 Min. : 0.0 Min. : 772
Yes: 333 Yes:2944 1st Qu.: 481.7 1st Qu.:21340
Median : 823.6 Median :34553
Mean : 835.4 Mean :33517
3rd Qu.:1166.3 3rd Qu.:43808
Max. :2654.3 Max. :73554
describeBy() function in psych Package
library(psych)
# descriptive statistics using describeBy() function
# NOTE(review): no grouping variable is passed, and the table printed below is
# identical to the describe(default.dt) output; a group argument (for example
# the default column) was presumably intended - confirm
describeBy(default.dt)
vars n mean sd median trimmed mad min
default* 1 10000 1.03 0.18 1.00 1.00 0.00 1.00
student* 2 10000 1.29 0.46 1.00 1.24 0.00 1.00
balance 3 10000 835.37 483.71 823.64 823.73 507.52 0.00
income 4 10000 33516.98 13336.64 34552.64 33305.57 16350.86 771.97
max range skew kurtosis se
default* 2.00 1.00 5.20 25.06 0.00
student* 2.00 1.00 0.90 -1.19 0.00
balance 2654.32 2654.32 0.25 -0.36 4.84
income 73554.23 72782.27 0.07 -0.90 133.37
describe() function in psych Package
# descriptive statistics all columns
# psych is already loaded earlier in the script; the call is repeated here
library(psych)
# one row of descriptive statistics per column; the factor columns
# (default, student) appear with a trailing * in the printed table
describe(default.dt)
vars n mean sd median trimmed mad min
default* 1 10000 1.03 0.18 1.00 1.00 0.00 1.00
student* 2 10000 1.29 0.46 1.00 1.24 0.00 1.00
balance 3 10000 835.37 483.71 823.64 823.73 507.52 0.00
income 4 10000 33516.98 13336.64 34552.64 33305.57 16350.86 771.97
max range skew kurtosis se
default* 2.00 1.00 5.20 25.06 0.00
student* 2.00 1.00 0.90 -1.19 0.00
balance 2654.32 2654.32 0.25 -0.36 4.84
income 73554.23 72782.27 0.07 -0.90 133.37
library(psych)
# descriptive statistics, keeping only the headline columns (chosen by name)
describe(default.dt)[, c("vars", "n", "mean", "sd", "median", "min", "max")]
vars n mean sd median min max
default* 1 10000 1.03 0.18 1.00 1.00 2.00
student* 2 10000 1.29 0.46 1.00 1.00 2.00
balance 3 10000 835.37 483.71 823.64 0.00 2654.32
income 4 10000 33516.98 13336.64 34552.64 771.97 73554.23
# number of defaulters by student
# with() looks the columns up in default.dt directly, so the call does not
# depend on the data having been attach()ed; the dimension labels are unchanged
with(default.dt, table(default, student))
student
default No Yes
No 6850 2817
Yes 206 127
# number of defaulters by student, with row and column totals added
# with() looks the columns up in default.dt directly, so the call does not
# depend on the data having been attach()ed
addmargins(with(default.dt, table(default, student)))
student
default No Yes Sum
No 6850 2817 9667
Yes 206 127 333
Sum 7056 2944 10000
# joint proportions of default status by student status, with marginal totals
# prop.table() must run BEFORE addmargins(): the original divided the cells by
# the sum of a table that already contained the margins (4x the real total),
# so its grand total came out as 0.25 instead of 1
# three decimals are shown because several cells are only a few percent
round(addmargins(prop.table(with(default.dt, table(default, student)))), 3)
student
default No Yes Sum
No 0.685 0.282 0.967
Yes 0.021 0.013 0.033
Sum 0.706 0.294 1.000
# joint percentages of default status by student status, with marginal totals
# prop.table() must run BEFORE addmargins(): the original divided the cells by
# the sum of a table that already contained the margins (4x the real total),
# so its grand total came out as 25 instead of 100
round(
  addmargins(prop.table(with(default.dt, table(default, student)))) * 100,
  2
)
student
default No Yes Sum
No 68.50 28.17 96.67
Yes 2.06 1.27 3.33
Sum 70.56 29.44 100.00
xtabs() function
# number of defaulters by student
# cross-tabulation through the formula interface, totals on both dimensions
addmargins(xtabs(~ default + student, data = default.dt), margin = 1:2)
student
default No Yes Sum
No 6850 2817 9667
Yes 206 127 333
Sum 7056 2944 10000
# number of defaulters by student, with a "Sum" row of per-column totals only
addmargins(xtabs(~ default + student, data = default.dt), margin = 1)
student
default No Yes
No 6850 2817
Yes 206 127
Sum 7056 2944
# number of defaulters by student, with a "Sum" column of per-row totals only
addmargins(xtabs(~ default + student, data = default.dt), margin = 2)
student
default No Yes Sum
No 6850 2817 9667
Yes 206 127 333
library(data.table)
# group size, mean income and mean balance for each default status
default.dt[, list(N = .N,
                  MeanIncome = round(mean(income), 2),
                  MeanBalance = round(mean(balance), 2)),
           by = "default"]
default N MeanIncome MeanBalance
1: No 9667 33566.17 803.94
2: Yes 333 32089.15 1747.82
# group size, mean income and mean balance for each default x student group,
# with the result rows sorted by default and then student
default.dt[, list(N = .N,
                  MeanIncome = round(mean(income), 2),
                  MeanBalance = round(mean(balance), 2)),
           by = c("default", "student")][order(default, student)]
default student N MeanIncome MeanBalance
1: No No 6850 39993.52 744.50
2: No Yes 2817 17937.01 948.48
3: Yes No 206 40625.05 1678.43
4: Yes Yes 127 18243.51 1860.38
aggregate() function
# average income by defaulters
# mean income per default status, via the formula interface of aggregate()
agg1 <- aggregate(income ~ default, data = default.df, FUN = mean)
names(agg1) <- c("Defaulters", "AverageIncome")
agg1
Defaulters AverageIncome
1 No 33566.17
2 Yes 32089.15
aggregate() function
# standard deviation of income by defaulters
# standard deviation of income within each default status
agg2 <- aggregate(default.df$income, list(default.df$default), sd)
# the statistic is a standard deviation, so the column is labelled as one
# (it was previously named "AverageIncome", copied from the mean table)
colnames(agg2) <- c("Defaulters", "SDIncome")
agg2
Defaulters SDIncome
1 No 13318.25
2 Yes 13804.22
aggregate() function
# average income by defaulters and student
# mean income per default x student combination, via the formula interface
agg3 <- aggregate(income ~ default + student, data = default.df, FUN = mean)
names(agg3) <- c("Defaulters", "Student", "AverageIncome")
agg3
Defaulters Student AverageIncome
1 No No 39993.52
2 Yes No 40625.05
3 No Yes 17937.01
4 Yes Yes 18243.51
# income distribution for non-defaulters versus defaulters, one box each
boxplot(income ~ default,
        data = default.dt,
        col = c("grey", "light blue"),
        xlab = "Defaulters",
        ylab = "Income (USD)",
        main = "Boxplot of Income by Defaulters")
# boxplot of income for every default x student combination
# the two colours are recycled across the four boxes; default varies fastest,
# so grey marks default = No and light blue marks default = Yes
boxplot(income ~ default * student, data = default.dt,
        main = "Boxplot of Income by Defaulters and Student",
        xlab = "Defaulters | Student", ylab = "Income (USD)",
        col = c("grey", "light blue"))
library(lattice)
# lattice box-and-whisker plot of income by default status,
# drawn as one panel per student status
bwplot(income ~ default | student,
       data = default.dt,
       xlab = "Defaulters",
       horizontal = FALSE)
# group means of income for each default status, with the means labelled
library(gplots)
plotmeans(income ~ default,
          data = default.dt,
          mean.labels = TRUE,
          frame = TRUE,
          xlab = "Defaulters",
          ylab = "Income (USD)")
# scatter plot of balance against income, points distinguished by default status
# `groups` is spelled out in full (the original `group` only worked through
# partial argument matching) and auto.key adds the legend that tells the
# two groups apart
xyplot(balance ~ income, groups = default, data = default.dt,
       auto.key = TRUE,
       ylab = "Balance (USD)", xlab = "Income (USD)",
       main = "Scatter Plot of Income and Balance by Defaulters")
# balance against income in a separate panel per default status;
# each panel shows the points plus a smooth curve and has its own axis scales
xyplot(balance ~ income | default,
       data = default.dt,
       type = c("p", "smooth"),
       scales = "free",
       xlab = "Income (USD)",
       ylab = "Balance (USD)",
       main = "Scatter Plot of Income and Balance by Defaulters")