Importing Data

Sameer Mathur

Reading, Processing and Describing ISLR Data

Using data.table()

---

READING DATA

Importing Data

library(ISLR)
library(data.table)
# convert the built-in ISLR `Default` data set to a data.table
default.dt <- as.data.table(Default)
# dimensions (rows, columns) of the data table
dim(default.dt)
[1] 10000     4

Data Column Names

# names of the data columns
names(default.dt)
[1] "default" "student" "balance" "income" 

Data Structure

# data types of the data columns (compact structure of the data table)
str(default.dt)
Classes 'data.table' and 'data.frame':  10000 obs. of  4 variables:
 $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
 $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
 $ balance: num  730 817 1074 529 786 ...
 $ income : num  44362 12106 31767 35704 38463 ...
 - attr(*, ".internal.selfref")=<externalptr> 

Descriptive Statistics

library(psych)
# attaching data columns: puts the columns of default.dt on the search path
# so they can be referenced by bare name (default, student, balance, income).
# NOTE(review): attach() is discouraged; code that uses bare column names
# outside of `data =` / with() / default.dt[...] relies on this call.
attach(default.dt)
# descriptive statistics for all columns
describe(default.dt)
         vars     n     mean       sd   median  trimmed      mad    min
default*    1 10000     1.03     0.18     1.00     1.00     0.00   1.00
student*    2 10000     1.29     0.46     1.00     1.24     0.00   1.00
balance     3 10000   835.37   483.71   823.64   823.73   507.52   0.00
income      4 10000 33516.98 13336.64 34552.64 33305.57 16350.86 771.97
              max    range skew kurtosis     se
default*     2.00     1.00 5.20    25.06   0.00
student*     2.00     1.00 0.90    -1.19   0.00
balance   2654.32  2654.32 0.25    -0.36   4.84
income   73554.23 72782.27 0.07    -0.90 133.37

Descriptive Statistics

library(psych)
# descriptive statistics, keeping only the selected summary columns
describe(default.dt)[, c("vars", "n", "mean", "sd", "median", "min", "max")]
         vars     n     mean       sd   median    min      max
default*    1 10000     1.03     0.18     1.00   1.00     2.00
student*    2 10000     1.29     0.46     1.00   1.00     2.00
balance     3 10000   835.37   483.71   823.64   0.00  2654.32
income      4 10000 33516.98 13336.64 34552.64 771.97 73554.23

DATA SUMMARY: CATEGORICAL VARIABLES

Number of Defaulters by Student

# number of defaulters by student
# columns are looked up inside default.dt via with(), so this does not
# depend on the data having been attach()ed to the search path
with(default.dt, table(default, student))
       student
default   No  Yes
    No  6850 2817
    Yes  206  127

Percentage of Defaulters by Student

# percentage of defaulters by student
# prop.table() without a margin divides by the grand total, so each cell is
# a percentage of all observations; columns are resolved inside default.dt
round(prop.table(with(default.dt, table(default, student))) * 100, 2)
       student
default    No   Yes
    No  68.50 28.17
    Yes  2.06  1.27

DATA SUMMARY: CONTINUOUS VARIABLES

Average Income and Balance by Defaulters

# group count plus mean income and mean balance (2 decimals) per default level
default.dt[
  ,
  .(
    N = .N,
    MeanIncome = round(mean(income), 2),
    MeanBalance = round(mean(balance), 2)
  ),
  by = default
]
   default    N MeanIncome MeanBalance
1:      No 9667   33566.17      803.94
2:     Yes  333   32089.15     1747.82

Average Income and Balance by Defaulters and Student

# group count plus mean income and mean balance (2 decimals) for every
# default/student combination, sorted by default and then student
default.dt[
  ,
  .(
    N = .N,
    MeanIncome = round(mean(income), 2),
    MeanBalance = round(mean(balance), 2)
  ),
  by = c("default", "student")
][
  order(default, student)
]
   default student    N MeanIncome MeanBalance
1:      No      No 6850   39993.52      744.50
2:      No     Yes 2817   17937.01      948.48
3:     Yes      No  206   40625.05     1678.43
4:     Yes     Yes  127   18243.51     1860.38

ALTERNATE: Average Income by Defaulters

# average income by default status, using base aggregate()
# the formula interface with `data =` reads the columns from default.dt
# directly instead of relying on attach()ed globals
agg1 <- aggregate(income ~ default, data = default.dt, FUN = mean)
colnames(agg1) <- c("Defaulters", "Average Income")
agg1
  Defaulters Average Income
1         No       33566.17
2        Yes       32089.15

VISUALIZATION: CATEGORICAL VARIABLES

Boxplot of Income by Defaulters

# one box of income per default level
boxplot(
  income ~ default,
  data = default.dt,
  col = c("grey", "light blue"),
  main = "Boxplot of Income by Defaulters",
  xlab = "Defaulters",
  ylab = "Income (USD)"
)

plot of chunk unnamed-chunk-13

Boxplot of Income by Defaulters and Student

# boxplot of income by defaulters and student
# `default * student` draws one box per default/student combination, so the
# title and axis label name both grouping variables; the two colours are
# recycled across the boxes
boxplot(income ~ default * student, data = default.dt,
        main = "Boxplot of Income by Defaulters and Student",
        xlab = "Defaulters and Student", ylab = "Income (USD)",
        col = c("grey", "light blue"))

plot of chunk unnamed-chunk-15

ALTERNATE

library(lattice)
# lattice box-and-whisker plot of income by default, one panel per student level
bwplot(
  income ~ default | student,
  data = default.dt,
  xlab = "Defaulters",
  horizontal = FALSE
)

plot of chunk unnamed-chunk-17

Mean Plot of Income by Defaulters

library(gplots)
# plot of mean income for each default level, with the means labelled
plotmeans(
  income ~ default,
  data = default.dt,
  mean.labels = TRUE,
  frame = TRUE,
  xlab = "Defaulters",
  ylab = "Income (USD)"
)

plot of chunk unnamed-chunk-19

VISUALIZATION: CONTINUOUS VARIABLES

Scatter Plot of Income and Balance

# scatter plot of income (x axis) and balance (y axis)
# the formula method of plot() accepts `data =`; the default x/y method does
# not, so passing `data` there only produces "not a graphical parameter"
# warnings and the columns would have to come from attach()ed globals
plot(balance ~ income, data = default.dt,
     ylab = "Balance (USD)", xlab = "Income (USD)",
     main = "Scatter Plot of Income and Balance")

plot of chunk unnamed-chunk-21

Scatter Plot of Income and Balance by Defaulters

# scatter plot of income and balance, with points distinguished by default
# status in a single panel (requires lattice to be loaded)
# `groups` is spelled out in full rather than relying on partial argument
# matching of `group`; auto.key adds a legend so the groups can be identified
xyplot(balance ~ income, groups = default, data = default.dt,
       ylab = "Balance (USD)", xlab = "Income (USD)",
       main = "Scatter Plot of Income and Balance by Defaulters",
       auto.key = TRUE)

plot of chunk unnamed-chunk-23

ALTERNATE: Scatter Plot of Income and Balance by Defaulters

# scatter plot of income and balance, one panel per default level, drawing
# the points together with a smooth curve; scales are passed as "free"
xyplot(
  balance ~ income | default,
  data = default.dt,
  type = c("p", "smooth"),
  scales = "free",
  main = "Scatter Plot of Income and Balance by Defaulters",
  xlab = "Income (USD)",
  ylab = "Balance (USD)"
)

plot of chunk unnamed-chunk-25