Part 1a
# Read the credit-card default data into the data frame `df`.
# NOTE: setwd() ties the script to one machine's directory layout; it is kept
# so that any later relative paths resolve exactly as before.
setwd("D:/IIML/Term 4/MLM/Session 4")
# stringsAsFactors = TRUE keeps the text columns `default` and `student` as
# factors on R >= 4.0.0, where read.csv() no longer converts strings to
# factors by default.
df <- read.csv("DefaultData.csv", stringsAsFactors = TRUE)
Part 1b
library(data.table)
## Warning: package 'data.table' was built under R version 3.5.3
# data.table copy of `df`
dt <- as.data.table(df)
Part 2
# Number of rows and number of columns in the data frame
c(nrow(df), ncol(df))
## [1] 10000 4
Part 3
# Names of the variables (columns) in the data frame
names(df)
## [1] "default" "student" "balance" "income"
Part 4
# attach dataframe
# Puts the columns of `df` on the search path so that later statements can
# refer to them by bare name (e.g. `default`, `student`). Objects in the
# global environment, and packages attached after this call, take precedence
# over these columns, so the position of this call in the script matters.
attach(df)
Part 5
# Compact overview of the data frame: class, size and type of each column
str(object = df)
## 'data.frame': 10000 obs. of 4 variables:
## $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
## $ balance: num 730 817 1074 529 786 ...
## $ income : num 44362 12106 31767 35704 38463 ...
# Same overview for the data.table copy
str(object = dt)
## Classes 'data.table' and 'data.frame': 10000 obs. of 4 variables:
## $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
## $ balance: num 730 817 1074 529 786 ...
## $ income : num 44362 12106 31767 35704 38463 ...
## - attr(*, ".internal.selfref")=<externalptr>
Part 6
# How many consumers default on their loan.
# The column is read from `df` explicitly instead of through the attach()ed
# search path, so a global or package object named `default` cannot shadow it.
# Naming the argument keeps the "default" header in the printed table.
table(default = df$default)
## default
## No Yes
## 9667 333
Part 7
# Cross-tabulate loan default (rows) against student status (columns).
# Both columns are read from `df` explicitly instead of through the attach()ed
# search path, so objects named `default` or `student` elsewhere cannot shadow
# them. The argument names keep the "default"/"student" headers in the output.
df_v_student <- table(default = df$default, student = df$student)
df_v_student
## student
## default No Yes
## No 6850 2817
## Yes 206 127
Part 8
# Default-by-student contingency table with row and column totals appended
addmargins(A = df_v_student)
## student
## default No Yes Sum
## No 6850 2817 9667
## Yes 206 127 333
## Sum 7056 2944 10000
Part 9
# Percentage of defaulters and non-defaulters, rounded to 1 decimal place.
# The column is read from `df` explicitly so the result does not depend on
# attach(df) or on what else is named `default` on the search path; the
# argument name keeps the "default" header in the printed table.
round(100 * prop.table(table(default = df$default)), 1)
## default
## No Yes
## 96.7 3.3
Part 10
# Mean, standard deviation and variance of income, each printed with a label
paste("Mean Income=", mean(df[["income"]]))
## [1] "Mean Income= 33516.9818759574"
paste("Standard Deviation of Income=", sd(df[["income"]]))
## [1] "Standard Deviation of Income= 13336.6395627319"
paste("Variance of Income=", var(df[["income"]]))
## [1] "Variance of Income= 177865954.826226"
Part 11
# Maximum and minimum income, each rounded to 2 decimal places
paste("Maximum Income=", round(max(df$income), 2))
## [1] "Maximum Income= 73554.23"
paste("Minimum Income=", round(min(df$income), 2))
## [1] "Minimum Income= 771.97"
Part 12
# Descriptive statistics for the default data
library(psych)
## Warning: package 'psych' was built under R version 3.5.3
##
## Attaching package: 'psych'
## The following object is masked from 'df':
##
## income
# Keep only the vars, n, mean, sd, median, min and max columns of the summary
describe(df)[, c("vars", "n", "mean", "sd", "median", "min", "max")]
## vars n mean sd median min max
## default* 1 10000 1.03 0.18 1.00 1.00 2.00
## student* 2 10000 1.29 0.46 1.00 1.00 2.00
## balance 3 10000 835.37 483.71 823.64 0.00 2654.32
## income 4 10000 33516.98 13336.64 34552.64 771.97 73554.23
Part 13
# Mean balance within each level of `default`
aggregate(df[["balance"]], by = list(df[["default"]]), FUN = mean)
## Group.1 x
## 1 No 803.9438
## 2 Yes 1747.8217
Part 13b
# Distribution of balance as a histogram with blue bars
hist(x = df$balance, col = "blue")

Part 14
# Group count, mean balance and standard deviation of balance for every
# combination of default status and student status
dt[
  ,
  list(
    N = .N,
    meanBalance = mean(balance),
    sdBalance = sd(balance)
  ),
  by = c("default", "student")
]
## default student N meanBalance sdBalance
## 1: No No 6850 744.5044 445.5151
## 2: No Yes 2817 948.4802 450.5537
## 3: Yes Yes 127 1860.3791 328.7356
## 4: Yes No 206 1678.4295 330.9141
Part 15
# Single horizontal box plot of credit card balance, filled in blue
boxplot(x = df$balance, col = "blue", horizontal = TRUE)

Part 16
# Box plots of credit card balance, one box per student status (No / Yes).
# The grouping variable is `student`, as the task statement asks; the title
# and axis labels describe the plot that is actually drawn.
boxplot(balance ~ student,
  data = df,
  main = "Boxplot of balance by student status",
  xlab = "Student",
  ylab = "Balance",
  col = c("grey", "light blue"))
