Summarizing Data

Sameer Mathur

Reading, Processing and Describing ISLR Data

Using data.table()

---

Importing Data

library(ISLR)
library(data.table)
# convert the ISLR `Default` data set into a data.table
default.dt <- as.data.table(Default)
# keep a plain data.frame version as well (used by the aggregate() examples)
default.df <- as.data.frame(default.dt)

Mean of Income

# arithmetic mean of the income column
mean(default.dt[["income"]])
[1] 33516.98

Standard Deviation (sd) of Income

# sample standard deviation of the income column
sd(default.dt[["income"]])
[1] 13336.64

Minimum and Maximum Values of Income

# smallest observed income
min(default.dt[["income"]])
[1] 771.9677
# largest observed income
max(default.dt[["income"]])
[1] 73554.23

Descriptive Statistics using summary() function

# descriptive statistics using summary() function
# summary() is part of base R, so no additional package has to be loaded here:
# factor columns are reported as level counts, numeric columns as
# min / quartiles / mean / max
summary(default.dt)
 default    student       balance           income     
 No :9667   No :7056   Min.   :   0.0   Min.   :  772  
 Yes: 333   Yes:2944   1st Qu.: 481.7   1st Qu.:21340  
                       Median : 823.6   Median :34553  
                       Mean   : 835.4   Mean   :33517  
                       3rd Qu.:1166.3   3rd Qu.:43808  
                       Max.   :2654.3   Max.   :73554  

Descriptive Statistics using describe() function in psych Package

library(psych)
# describe() returns one row of statistics per column of the table;
# in the printed result the factor columns (default, student) are shown
# as numeric codes and marked with "*"
describe(x = default.dt)
         vars     n     mean       sd   median  trimmed      mad    min
default*    1 10000     1.03     0.18     1.00     1.00     0.00   1.00
student*    2 10000     1.29     0.46     1.00     1.24     0.00   1.00
balance     3 10000   835.37   483.71   823.64   823.73   507.52   0.00
income      4 10000 33516.98 13336.64 34552.64 33305.57 16350.86 771.97
              max    range skew kurtosis     se
default*     2.00     1.00 5.20    25.06   0.00
student*     2.00     1.00 0.90    -1.19   0.00
balance   2654.32  2654.32 0.25    -0.36   4.84
income   73554.23 72782.27 0.07    -0.90 133.37

Descriptive Statistics for Selected Columns of describe()

library(psych)
# keep only the vars, n, mean, sd, median, min and max columns of the
# describe() result (i.e. drop trimmed, mad, range, skew, kurtosis and se)
describe(default.dt)[, c("vars", "n", "mean", "sd", "median", "min", "max")]
         vars     n     mean       sd   median    min      max
default*    1 10000     1.03     0.18     1.00   1.00     2.00
student*    2 10000     1.29     0.46     1.00   1.00     2.00
balance     3 10000   835.37   483.71   823.64   0.00  2654.32
income      4 10000 33516.98 13336.64 34552.64 771.97 73554.23

DATA SUMMARY: CATEGORICAL VARIABLES

Attach a data table

# attach() places the columns of default.dt (e.g. default, student) on the
# search path so they can be referenced by bare name.
# NOTE(review): attach() is generally discouraged because attached names can
# mask or be masked by other objects; with(default.dt, ...) or a `data =`
# argument gives the same convenience without global state.
attach(default.dt)

Number of Defaulters by Student

# number of defaulters by student
# with() evaluates the call inside default.dt, so the columns are found
# without depending on what is currently on the search path
with(default.dt, table(default, student))
       student
default   No  Yes
    No  6850 2817
    Yes  206  127

Number of Defaulters by Student and Adding Sum

# number of defaulters by student, with row and column totals ("Sum") added
# with() evaluates the call inside default.dt, so the columns are found
# without depending on what is currently on the search path
addmargins(with(default.dt, table(default, student)))
       student
default    No   Yes   Sum
    No   6850  2817  9667
    Yes   206   127   333
    Sum  7056  2944 10000

Proportion of Defaulters by Student

# proportion of loan defaulters by student
# Convert the counts to proportions FIRST and add the margins afterwards.
# Calling prop.table() on a table that already contains the "Sum" margins
# counts every observation four times in the denominator, which shrinks all
# cells by a factor of 4 and makes the grand total 0.25 instead of 1.
round(addmargins(prop.table(with(default.dt, table(default, student)))), 3)
       student
default    No   Yes   Sum
    No  0.685 0.282 0.967
    Yes 0.021 0.013 0.033
    Sum 0.706 0.294 1.000

Percentage of Defaulters by Student

# percentage of loan defaulters by student
# Convert the counts to proportions FIRST and add the margins afterwards.
# Calling prop.table() on a table that already contains the "Sum" margins
# counts every observation four times in the denominator, which makes the
# grand total 25% instead of 100%.
round(addmargins(prop.table(with(default.dt, table(default, student)))) * 100, 2)
       student
default    No   Yes    Sum
    No  68.50 28.17  96.67
    Yes  2.06  1.27   3.33
    Sum 70.56 29.44 100.00

Number of Defaulters by Student using xtabs() function

# number of defaulters by student, built with the formula interface of xtabs()
# (columns are looked up in default.dt via the `data` argument)
z <- xtabs(~ default + student, data = default.dt)
z
       student
default   No  Yes
    No  6850 2817
    Yes  206  127
# append "Sum" totals along both dimensions (rows and columns) of z
addmargins(z, margin = c(1, 2))
       student
default    No   Yes   Sum
    No   6850  2817  9667
    Yes   206   127   333
    Sum  7056  2944 10000

Number of Defaulters by Student (Columns Sum)

# defaulters by student, with an extra "Sum" row holding the column totals
addmargins(
  xtabs(~ default + student, data = default.dt),
  margin = 1
)
       student
default   No  Yes
    No  6850 2817
    Yes  206  127
    Sum 7056 2944

Number of Defaulters by Student (Rows Sum)

# defaulters by student, with an extra "Sum" column holding the row totals
addmargins(
  xtabs(~ default + student, data = default.dt),
  margin = 2
)
       student
default   No  Yes  Sum
    No  6850 2817 9667
    Yes  206  127  333

DATA SUMMARY: CONTINUOUS VARIABLES

Average Income by Defaulters using aggregate()

# mean income within each level of default
agg1 <- aggregate(
  default.df$income,
  by = list(Defaulters = default.df$default),
  FUN = mean
)
# aggregate() calls the value column "x"; give it a descriptive name
colnames(agg1)[2] <- "AverageIncome"
agg1
  Defaulters AverageIncome
1         No      33566.17
2        Yes      32089.15

Average Income and Balance by Defaulters using data.table

library(data.table)
# per default status: group size, mean income and mean balance
# (both means rounded to 2 decimals)
default.dt[
  ,
  .(
    N = .N,
    MeanIncome = round(mean(income), 2),
    MeanBalance = round(mean(balance), 2)
  ),
  by = "default"
]
   default    N MeanIncome MeanBalance
1:      No 9667   33566.17      803.94
2:     Yes  333   32089.15     1747.82

Average Income by Defaulters and Student using aggregate()

# average income by defaulters and student
# The grouping variables are named inside `by` so that each output column is
# labelled after the variable it actually holds. Grouping by
# (student, default) and then labelling the columns
# ("Defaulters", "Student") by position swaps the two labels.
agg3 <- aggregate(
  default.df$income,
  by = list(Defaulters = default.df$default, Student = default.df$student),
  FUN = mean
)
# aggregate() calls the value column "x"; give it a descriptive name
colnames(agg3)[3] <- "AverageIncome"
agg3
  Defaulters Student AverageIncome
1         No      No      39993.52
2        Yes      No      40625.05
3         No     Yes      17937.01
4        Yes     Yes      18243.51

Average Income and Balance by Defaulters and Student

# per (student, default) combination: group size, mean income and mean
# balance (2 decimals); the result is then sorted by student, then default
default.dt[
  ,
  .(
    N = .N,
    MeanIncome = round(mean(income), 2),
    MeanBalance = round(mean(balance), 2)
  ),
  by = c("student", "default")
][order(student, default)]
   student default    N MeanIncome MeanBalance
1:      No      No 6850   39993.52      744.50
2:      No     Yes  206   40625.05     1678.43
3:     Yes      No 2817   17937.01      948.48
4:     Yes     Yes  127   18243.51     1860.38