Importing Data

Sameer Mathur

Reading, Processing and Describing ISLR Data

Using data.table()

---

READING AND SUMMARIZING DATA

Importing Data

library(ISLR)
library(data.table)
# data.table copy of the `Default` data set shipped with ISLR
default.dt <- as.data.table(Default)
# plain data.frame copy of the same data (used by the aggregate() examples)
default.df <- as.data.frame(default.dt)

Number of Rows and Columns

# dimensions of the data table: number of rows, then number of columns
c(nrow(default.dt), ncol(default.dt))
[1] 10000     4

Name of Data Columns

# column names of the data table
names(default.dt)
[1] "default" "student" "balance" "income" 

First Few Rows of Data Table

# first six rows of the data table
head(default.dt, n = 6L)
   default student   balance    income
1:      No      No  729.5265 44361.625
2:      No     Yes  817.1804 12106.135
3:      No      No 1073.5492 31767.139
4:      No      No  529.2506 35704.494
5:      No      No  785.6559 38463.496
6:      No     Yes  919.5885  7491.559

Last Few Rows of Data Table

# last six rows of the data table
tail(default.dt, n = 6L)
   default student   balance   income
1:      No     Yes  172.4130 14955.94
2:      No      No  711.5550 52992.38
3:      No      No  757.9629 19660.72
4:      No      No  845.4120 58636.16
5:      No      No 1569.0091 36669.11
6:      No     Yes  200.9222 16862.95

Random Few Rows of Data Table

# a random selection of ten rows, via some() from the car package
library(car)
some(default.dt, n = 10)
    default student   balance   income
 1:      No      No  590.3998 42769.20
 2:      No      No 1755.3889 35031.53
 3:      No      No 1372.9358 35467.76
 4:      No      No 1345.7070 40802.89
 5:      No     Yes 1286.8398 17464.99
 6:      No      No  902.7137 23847.39
 7:      No      No  904.4241 56062.02
 8:      No      No 1558.5970 38880.44
 9:      No      No  784.4701 52617.21
10:      No      No 1391.0339 53255.02

Mean of Income

# mean of the income column, computed inside the data.table
default.dt[, mean(income)]
[1] 33516.98

Standard Deviation (sd) of Income

# standard deviation (sd) of income
# (this chunk previously called mean() and so repeated the mean instead)
sd(default.dt$income)
[1] 13336.64

Minimum Value of Income

# smallest value in the income column
min(default.dt[["income"]])
[1] 771.9677

Maximum Value of Income

# largest value in the income column
max(default.dt[["income"]])
[1] 73554.23

Data Structure

# attaching data columns: attach() puts the columns of default.dt on the
# search path so that they can be referenced by bare name
# NOTE(review): attached columns can be masked by same-named global objects;
# with() or a `data =` argument is the safer way to reach them
attach(default.dt)
# data types of the data columns
str(default.dt)
Classes 'data.table' and 'data.frame':  10000 obs. of  4 variables:
 $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
 $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
 $ balance: num  730 817 1074 529 786 ...
 $ income : num  44362 12106 31767 35704 38463 ...
 - attr(*, ".internal.selfref")=<externalptr> 

Descriptive Statistics using summary() function

library(psych)
# descriptive statistics using summary() function
# (summary() is a base R function: counts per level for the factor columns,
# five-number summary plus mean for the numeric columns)
summary(default.dt)
 default    student       balance           income     
 No :9667   No :7056   Min.   :   0.0   Min.   :  772  
 Yes: 333   Yes:2944   1st Qu.: 481.7   1st Qu.:21340  
                       Median : 823.6   Median :34553  
                       Mean   : 835.4   Mean   :33517  
                       3rd Qu.:1166.3   3rd Qu.:43808  
                       Max.   :2654.3   Max.   :73554  

Descriptive Statistics using describeBy() function in psych Package

library(psych)
# descriptive statistics using describeBy() function
# NOTE(review): no grouping variable is passed here, and the printed result is
# the same as the describe() output in the following section -- presumably a
# `group` argument (e.g. default.dt$default) was intended; confirm
describeBy(default.dt)
         vars     n     mean       sd   median  trimmed      mad    min
default*    1 10000     1.03     0.18     1.00     1.00     0.00   1.00
student*    2 10000     1.29     0.46     1.00     1.24     0.00   1.00
balance     3 10000   835.37   483.71   823.64   823.73   507.52   0.00
income      4 10000 33516.98 13336.64 34552.64 33305.57 16350.86 771.97
              max    range skew kurtosis     se
default*     2.00     1.00 5.20    25.06   0.00
student*     2.00     1.00 0.90    -1.19   0.00
balance   2654.32  2654.32 0.25    -0.36   4.84
income   73554.23 72782.27 0.07    -0.90 133.37

Descriptive Statistics using describe() function in psych Package

library(psych)
# psych::describe() summary for every column
# (columns marked with * in the output are factors)
describe(x = default.dt)
         vars     n     mean       sd   median  trimmed      mad    min
default*    1 10000     1.03     0.18     1.00     1.00     0.00   1.00
student*    2 10000     1.29     0.46     1.00     1.24     0.00   1.00
balance     3 10000   835.37   483.71   823.64   823.73   507.52   0.00
income      4 10000 33516.98 13336.64 34552.64 33305.57 16350.86 771.97
              max    range skew kurtosis     se
default*     2.00     1.00 5.20    25.06   0.00
student*     2.00     1.00 0.90    -1.19   0.00
balance   2654.32  2654.32 0.25    -0.36   4.84
income   73554.23 72782.27 0.07    -0.90 133.37

Descriptive Statistics

library(psych)
# descriptive statistics, keeping only a chosen subset of the statistics
# (columns picked by name rather than by position)
describe(default.dt)[, c("vars", "n", "mean", "sd", "median", "min", "max")]
         vars     n     mean       sd   median    min      max
default*    1 10000     1.03     0.18     1.00   1.00     2.00
student*    2 10000     1.29     0.46     1.00   1.00     2.00
balance     3 10000   835.37   483.71   823.64   0.00  2654.32
income      4 10000 33516.98 13336.64 34552.64 771.97 73554.23

DATA SUMMARY: CATEGORICAL VARIABLES

Number of Defaulters by Student

# number of defaulters by student
# with() looks the columns up in default.dt directly, so the result does not
# depend on an earlier attach() or on same-named objects in the workspace
with(default.dt, table(default, student))
       student
default   No  Yes
    No  6850 2817
    Yes  206  127

Number of Defaulters by Student and Adding Sum

# number of defaulters by student, with row, column and grand totals added
# with() looks the columns up in default.dt directly, so the result does not
# depend on an earlier attach() or on same-named objects in the workspace
addmargins(with(default.dt, table(default, student)))
       student
default    No   Yes   Sum
    No   6850  2817  9667
    Yes   206   127   333
    Sum  7056  2944 10000

Proportion of Defaulters by Student

# proportion of loan defaulters by student
# Convert the counts to proportions first and add the margins afterwards;
# normalising a table that already contains its margins divides every cell by
# four times the sample size (the grand total then shows 0.25 instead of 1).
round(addmargins(prop.table(with(default.dt, table(default, student)))), 2)
       student
default   No  Yes  Sum
    No  0.69 0.28 0.97
    Yes 0.02 0.01 0.03
    Sum 0.71 0.29 1.00

Percentage of Defaulters by Student

# percentage of loan defaulters by student
# Convert the counts to proportions first and add the margins afterwards;
# normalising a table that already contains its margins divides every cell by
# four times the sample size (the grand total then shows 25 instead of 100).
round(addmargins(prop.table(with(default.dt, table(default, student)))) * 100, 2)
       student
default     No    Yes    Sum
    No   68.50  28.17  96.67
    Yes   2.06   1.27   3.33
    Sum  70.56  29.44 100.00

Number of Defaulters by Student using xtabs() function

# default-by-student counts from a formula, with totals on both margins
addmargins(A = xtabs(formula = ~ default + student, data = default.dt))
       student
default    No   Yes   Sum
    No   6850  2817  9667
    Yes   206   127   333
    Sum  7056  2944 10000

Number of Defaulters by Student (Columns Sum)

# default-by-student counts with an extra "Sum" row holding the column totals
addmargins(
  xtabs(~ default + student, data = default.dt),
  margin = 1
)
       student
default   No  Yes
    No  6850 2817
    Yes  206  127
    Sum 7056 2944

Number of Defaulters by Student (Rows Sum)

# default-by-student counts with an extra "Sum" column holding the row totals
addmargins(
  xtabs(~ default + student, data = default.dt),
  margin = 2
)
       student
default   No  Yes  Sum
    No  6850 2817 9667
    Yes  206  127  333

DATA SUMMARY: CONTINUOUS VARIABLES

Average Income and Balance by Defaulters

library(data.table)
# group size plus mean income and mean balance (rounded to 2 decimals)
# for each level of `default`
default.dt[
  ,
  list(
    N = .N,
    MeanIncome = round(mean(income), digits = 2),
    MeanBalance = round(mean(balance), digits = 2)
  ),
  by = "default"
]
   default    N MeanIncome MeanBalance
1:      No 9667   33566.17      803.94
2:     Yes  333   32089.15     1747.82

Average Income and Balance by Defaulters and Student

# group size plus mean income and mean balance (rounded to 2 decimals)
# for each default/student combination, rows sorted by default then student
default.dt[
  ,
  list(
    N = .N,
    MeanIncome = round(mean(income), digits = 2),
    MeanBalance = round(mean(balance), digits = 2)
  ),
  by = c("default", "student")
][order(default, student)]
   default student    N MeanIncome MeanBalance
1:      No      No 6850   39993.52      744.50
2:      No     Yes 2817   17937.01      948.48
3:     Yes      No  206   40625.05     1678.43
4:     Yes     Yes  127   18243.51     1860.38

ALTERNATE: Averages using data frame

Average Income by Defaulters using aggregate() function

# mean income within each level of `default`, from the data.frame copy
agg1 <- with(default.df, aggregate(income, by = list(default), FUN = mean))
# replace the generic column names produced by aggregate()
names(agg1) <- c("Defaulters", "AverageIncome")
agg1
  Defaulters AverageIncome
1         No      33566.17
2        Yes      32089.15

Standard Deviation of Income by Defaulters using aggregate() function

# standard deviation of income by defaulters
agg2 <- aggregate(default.df$income, list(default.df$default), sd)
# label the value column as a standard deviation (it was previously labelled
# "AverageIncome", which misdescribed the numbers)
colnames(agg2) <- c("Defaulters", "SDIncome")
agg2
  Defaulters SDIncome
1         No 13318.25
2        Yes 13804.22

Average Income by Defaulters and Student using aggregate() function

# mean income for every default/student combination, from the data.frame copy
agg3 <- with(
  default.df,
  aggregate(income, by = list(default, student), FUN = mean)
)
# replace the generic column names produced by aggregate()
names(agg3) <- c("Defaulters", "Student", "AverageIncome")
agg3
  Defaulters Student AverageIncome
1         No      No      39993.52
2        Yes      No      40625.05
3         No     Yes      17937.01
4        Yes     Yes      18243.51

VISUALIZATION: CATEGORICAL VARIABLES

Boxplot of Income by Defaulters

# income distribution for non-defaulters and defaulters, one box per group
boxplot(
  income ~ default,
  data = default.dt,
  col  = c("grey", "light blue"),
  main = "Boxplot of Income by Defaulters",
  xlab = "Defaulters",
  ylab = "Income (USD)"
)

plot of chunk unnamed-chunk-30

Boxplot of Income by Defaulters and Student

# boxplot of income by defaulters and student
# (one box per default/student combination; the two colours are recycled
# across the four boxes, so they alternate by default status)
boxplot(income ~ default * student, data = default.dt,
        main = "Boxplot of Income by Defaulters and Student",
        xlab = "Defaulters | Student", ylab = "Income (USD)",
        col = c("grey", "light blue"))

plot of chunk unnamed-chunk-32

ALTERNATE

# lattice box-and-whisker plot of income by default status,
# drawn in a separate panel for each level of `student`
library(lattice)
bwplot(
  income ~ default | student,
  data = default.dt,
  xlab = "Defaulters",
  horizontal = FALSE
)

plot of chunk unnamed-chunk-34

Mean Plot of Income by Defaulters

# plot of mean income for each default status (gplots::plotmeans),
# with the mean values printed on the plot
library(gplots)
plotmeans(
  income ~ default,
  data = default.dt,
  xlab = "Defaulters",
  ylab = "Income (USD)",
  frame = TRUE,
  mean.labels = TRUE
)

plot of chunk unnamed-chunk-36

VISUALIZATION: CONTINUOUS VARIABLES

Scatter Plot of Income and Balance by Defaulters

# scatter plot of income and balance by defaulters
# `groups` is spelled out in full (`group` only worked through partial
# argument matching); auto.key adds a legend so the two groups can be told
# apart
xyplot(balance ~ income, groups = default, data = default.dt,
       ylab = "Balance (USD)", xlab = "Income (USD)",
       main = "Scatter Plot of Income and Balance by Defaulters",
       auto.key = TRUE)

plot of chunk unnamed-chunk-38

ALTERNATE: Scatter Plot of Income and Balance by Defaulters

# balance against income in one panel per default status, each panel with
# its own axis scales, showing the points plus a smooth curve
xyplot(
  balance ~ income | default,
  data = default.dt,
  type = c("p", "smooth"),
  scales = "free",
  main = "Scatter Plot of Income and Balance by Defaulters",
  xlab = "Income (USD)",
  ylab = "Balance (USD)"
)

plot of chunk unnamed-chunk-40