# Exploratory analysis of the consumer default data in "DefaultData.csv".
# Lines starting with "##" are console output recorded from an earlier run.
#
# Columns are always referenced through `df` / `dt` (df$col, with(df, ...),
# data = df) instead of attach(df): the earlier run reported that loading
# psych masked `income` from the attached data frame, so bare column names
# silently depended on the order in which packages were loaded.

# Packages ----
library(data.table)
library(psych)

# Load the data ----
# Read the CSV into a data frame called "df". stringsAsFactors = TRUE keeps
# `default` and `student` as factors on R >= 4.0 as well (where the default
# changed to FALSE), which is what the str() output below shows.
df <- read.csv("DefaultData.csv", stringsAsFactors = TRUE)
# Convert the data frame into a data.table called "dt"
dt <- as.data.table(df)

# Structure of the data ----
# Display the data dimensions
dim(df)
## [1] 10000 4
# Display the column names
colnames(df)
## [1] "default" "student" "balance" "income"
# List the structure of all the columns in the data frame
str(df)
## 'data.frame': 10000 obs. of 4 variables:
## $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
## $ balance: num 730 817 1074 529 786 ...
## $ income : num 44362 12106 31767 35704 38463 ...
# List the structure of all the columns in the data.table
str(dt)
## Classes 'data.table' and 'data.frame': 10000 obs. of 4 variables:
## $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
## $ balance: num 730 817 1074 529 786 ...
## $ income : num 44362 12106 31767 35704 38463 ...
## - attr(*, ".internal.selfref")=<externalptr>

# Frequency tables ----
# with(df, table(...)) keeps the variable names as table headers.
# Count of consumer defaults on their loans
with(df, table(default))
## default
## No Yes
## 9667 333
# Count of consumer defaults on their loans broken down by student status
with(df, table(default, student))
## student
## default No Yes
## No 6850 2817
## Yes 206 127
# Contingency table of default by student
tab1 <- with(df, table(default, student))
# Add row and column totals (margins 1 and 2)
addmargins(tab1, c(1, 2))
## student
## default No Yes Sum
## No 6850 2817 9667
## Yes 206 127 333
## Sum 7056 2944 10000
# Joint proportions of default by student (each cell / grand total)
protable <- prop.table(tab1)
# Print the proportion table as percentages rounded to 1 decimal place
# (variable name spelling kept as-is so existing references keep working)
PercentPropotion <- round(protable * 100, 1)
PercentPropotion
## student
## default No Yes
## No 68.5 28.2
## Yes 2.1 1.3

# Descriptive statistics of income ----
# Display the mean, standard deviation and variance of income
mean(df$income)
## [1] 33516.98
sd(df$income)
## [1] 13336.64
var(df$income)
## [1] 177865955
# Min and max rounded to 2 decimal places
round(min(df$income), 2)
## [1] 771.97
round(max(df$income), 2)
## [1] 73554.23

# Descriptive statistics using package psych ----
# Keep only vars, n, mean, sd, median (columns 1:5) and min, max (8:9)
describe(df)[, c(1:5, 8:9)]
## vars n mean sd median min max
## default* 1 10000 1.03 0.18 1.00 1.00 2.00
## student* 2 10000 1.29 0.46 1.00 1.00 2.00
## balance 3 10000 835.37 483.71 823.64 0.00 2654.32
## income 4 10000 33516.98 13336.64 34552.64 771.97 73554.23
# Part 12b: Inference about the 1.29 mean value for student
# The 1.29 mean in the student row has no real significance on its own:
# student is a factor, so the summary is computed on its integer codes
# (1 = "No", 2 = "Yes"). It only reflects that about 29% of the
# observations are students (2944 of 10000).

# Mean balance by default status using the aggregate() function
aggregate(df$balance, by = list(df$default), FUN = mean)
## Group.1 x
## 1 No 803.9438
## 2 Yes 1747.8217

# Histogram of balance
hist(df$balance,
     main = "Histogram of Balance",
     xlab = "balance",
     col = "green")

# Summary statistics of balance by default and student (data.table) ----
tab2 <- dt[, .(N = .N,
               mean = mean(balance),
               sd = sd(balance)),
           by = .(default, student)][order(default)]
tab2
## default student N mean sd
## 1: No No 6850 744.5044 445.5151
## 2: No Yes 2817 948.4802 450.5537
## 3: Yes Yes 127 1860.3791 328.7356
## 4: Yes No 206 1678.4295 330.9141

# Boxplots ----
# Horizontal boxplot of balance for all observations
boxplot(df$balance,
        width = 0.5,
        horizontal = TRUE,
        main = "Boxplot for Balance",
        xlab = "Balance",
        col = "lightblue")
# Boxplots of balance grouped by default status
boxplot(balance ~ default,
        data = df,
        main = "Boxplot for Balance grouped by Default (Yes/No)",
        col = c("gray", "lightblue"))