Part 1a

# Read the credit-card default data into the data frame `df`.
# NOTE(review): the working directory below is machine-specific, so the
# script only runs as-is on a machine where this path exists.
setwd("D:/IIML/Term 4/MLM/Session 4")
# stringsAsFactors = TRUE keeps `default` and `student` as factors on
# R >= 4.0 too, where read.csv() no longer converts strings by default.
df <- read.csv("DefaultData.csv", stringsAsFactors = TRUE)

Part 1b

library(data.table)
## Warning: package 'data.table' was built under R version 3.5.3
# Keep a data.table copy of df alongside the plain data frame
dt <- as.data.table(df)

Part 2

# Display the data dimensions: number of rows, then number of columns
dim(df)
## [1] 10000     4

Part 3

# List the variable (column) names of the data frame
names(df)
## [1] "default" "student" "balance" "income"

Part 4

# Attach the data frame so its columns can be referenced by bare name.
# NOTE(review): attached names are fragile -- any package loaded afterwards
# can mask them (psych masks `income` further down). Prefer df$column.
attach(df)

Part 5

# Structure of the data frame: class, dimensions, and the type and first
# values of each column
str(df)
## 'data.frame':    10000 obs. of  4 variables:
##  $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
##  $ balance: num  730 817 1074 529 786 ...
##  $ income : num  44362 12106 31767 35704 38463 ...
# Same structure check for the data.table copy of the data
str(dt)
## Classes 'data.table' and 'data.frame':   10000 obs. of  4 variables:
##  $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
##  $ balance: num  730 817 1074 529 786 ...
##  $ income : num  44362 12106 31767 35704 38463 ...
##  - attr(*, ".internal.selfref")=<externalptr>

Part 6

# How many consumers default on their loan.
# The column is read from df explicitly instead of relying on attach();
# naming the argument keeps the "default" header in the printed table.
table(default = df$default)
## default
##   No  Yes 
## 9667  333

Part 7

# How many consumers default on their loan, further broken down by whether
# or not they are students.
# Columns are read from df explicitly instead of relying on attach(); the
# argument names keep the "default" / "student" dimension labels.
df_v_student <- table(default = df$default, student = df$student)
df_v_student
##        student
## default   No  Yes
##     No  6850 2817
##     Yes  206  127

Part 8

# Full contingency table of default status by student status, with row and
# column totals appended
addmargins(df_v_student, margin = c(1, 2))
##        student
## default    No   Yes   Sum
##     No   6850  2817  9667
##     Yes   206   127   333
##     Sum  7056  2944 10000

Part 9

# Percentage of defaulters and non-defaulters, rounded to 1 decimal place.
# The column is read from df explicitly instead of relying on attach();
# naming the argument keeps the "default" header in the printed table.
round(prop.table(table(default = df$default)) * 100, 1)
## default
##   No  Yes 
## 96.7  3.3

Part 10

# Mean, standard deviation and variance of income
paste("Mean Income=", mean(df[["income"]]))
## [1] "Mean Income= 33516.9818759574"
paste("Standard Deviation of Income=", sd(df[["income"]]))
## [1] "Standard Deviation of Income= 13336.6395627319"
paste("Variance of Income=", var(df[["income"]]))
## [1] "Variance of Income= 177865954.826226"

Part 11

# Minimum and maximum income, rounded to 2 decimal places
paste("Maximum Income=", round(max(df$income), 2))
## [1] "Maximum Income= 73554.23"
paste("Minimum Income=", round(min(df$income), 2))
## [1] "Minimum Income= 771.97"

Part 12

# Descriptive statistics for the default data
library(psych)
## Warning: package 'psych' was built under R version 3.5.3
## 
## Attaching package: 'psych'
## The following object is masked from 'df':
## 
##     income
# Keep only the headline statistics, selected by name so the result does
# not depend on the column order of describe()'s output
describe(df)[, c("vars", "n", "mean", "sd", "median", "min", "max")]
##          vars     n     mean       sd   median    min      max
## default*    1 10000     1.03     0.18     1.00   1.00     2.00
## student*    2 10000     1.29     0.46     1.00   1.00     2.00
## balance     3 10000   835.37   483.71   823.64   0.00  2654.32
## income      4 10000 33516.98 13336.64 34552.64 771.97 73554.23

Part 13

# Average balance, broken down by whether consumers default on their loan.
# The balance column is referenced by name rather than by position, and the
# summary function is passed explicitly as FUN.
aggregate(df$balance, by = list(df$default), FUN = mean)
##   Group.1         x
## 1      No  803.9438
## 2     Yes 1747.8217

Part 13b

# Histogram of credit card balance, bars filled in blue.
# No main/xlab is given, so hist() derives the default title and x-axis
# label from the argument expression.
hist(df$balance,col="blue")

Part 14

# Group size, mean and standard deviation of balance for each combination
# of default status and student status
dt[, .(N = .N, meanBalance = mean(balance), sdBalance = sd(balance)),
   by = .(default, student)]
##    default student    N meanBalance sdBalance
## 1:      No      No 6850    744.5044  445.5151
## 2:      No     Yes 2817    948.4802  450.5537
## 3:     Yes     Yes  127   1860.3791  328.7356
## 4:     Yes      No  206   1678.4295  330.9141

Part 15

# Horizontal box plot of credit card balance, filled in blue
boxplot(df$balance, col = "blue", horizontal = TRUE)

Part 16

# Boxplots of credit card balance, broken down by whether a consumer is a
# student or not a student.
# NOTE(review): groups by `student`, as the task statement asks, and the
# title describes a boxplot rather than a histogram -- confirm intent.
boxplot(balance ~ student,
        data = df,
        main = "Boxplot of balance by student status",
        col = c("grey", "light blue"))