# Load data.table first so the dependency of this section is visible up front.
library(data.table)
# Read the external data into the data frame "df.df".
# stringsAsFactors = TRUE keeps "default" and "student" as factors on R >= 4.0
# (where the default became FALSE), matching the str() output recorded below.
df.df <- read.csv("DefaultData.csv", stringsAsFactors = TRUE)
# data.table version of the same data, used for the grouped summaries.
dt.df <- data.table(df.df)
# Number of rows and columns in the data
c(nrow(df.df), ncol(df.df))
## [1] 10000 4
# Names of the variables
names(df.df)
## [1] "default" "student" "balance" "income"
#Part 4: the data frame df.df is deliberately NOT attached.
# attach() copies the columns onto the search path, where objects from packages
# loaded later can silently mask them. Columns are reached through
# with(df.df, ...) instead, which also keeps the variable names as table labels.
#Part 5a:data structure
str(df.df)
## 'data.frame': 10000 obs. of 4 variables:
## $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
## $ balance: num 730 817 1074 529 786 ...
## $ income : num 44362 12106 31767 35704 38463 ...
#Part 5b:list data structures of columns in data.table "dt.df"
str(dt.df)
## Classes 'data.table' and 'data.frame': 10000 obs. of 4 variables:
## $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
## $ balance: num 730 817 1074 529 786 ...
## $ income : num 44362 12106 31767 35704 38463 ...
## - attr(*, ".internal.selfref")=<externalptr>
# one-way frequency table of default; kept in tab2 for the proportions in Part 9
tab2 <- with(df.df, table(default))
tab2
## default
## No Yes
## 9667 333
#Part 7:
# two-way contingency table default x student; kept in tab1 for Part 8
tab1 <- with(df.df, table(default, student))
tab1
## student
## default No Yes
## No 6850 2817
## Yes 206 127
#Part 8:
# append a "Sum" column (margin = 2) holding the total of each default row
addmargins(tab1, 2)
## student
## default No Yes Sum
## No 6850 2817 9667
## Yes 206 127 333
#Part 9:
# proportion table for the variable default
protable <- prop.table(tab2)
# printing the proportion table as percentages, rounded to one decimal
PercentPropotion <- round(protable * 100, 1)
PercentPropotion
## default
## No Yes
## 96.7 3.3
#Part 10:
# The data frame is not attached again here: a second attach() only stacks
# another copy of the columns on the search path and triggers the
# "objects are masked" messages. The column is referenced explicitly instead.
# mean of income
mean(df.df$income)
## [1] 33516.98
# standard deviation of income
sd(df.df$income)
## [1] 13336.64
# variance of income
var(df.df$income)
## [1] 177865955
#Part 11:
# minimum income, rounded to 2 decimals. Rounding is monotone, so rounding the
# single extreme value gives the same result as the extreme of the rounded
# column, without rounding every observation.
minimum <- round(min(df.df$income), 2)
minimum
## [1] 771.97
# maximum income, rounded to 2 decimals
maximum <- round(max(df.df$income), 2)
maximum
## [1] 73554.23
#Part 12:
# descriptive statistics via describe() from the psych package
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'df.df':
##
## income
# keep only the summary columns shown below, picked by name
describe(df.df)[, c("vars", "n", "mean", "sd", "median", "min", "max")]
## vars n mean sd median min max
## default* 1 10000 1.03 0.18 1.00 1.00 2.00
## student* 2 10000 1.29 0.46 1.00 1.00 2.00
## balance 3 10000 835.37 483.71 823.64 0.00 2654.32
## income 4 10000 33516.98 13336.64 34552.64 771.97 73554.23
#Part 12b:
#There are more non-students than students in the sample.
#Part 13:
# Number of observations and mean balance for each level of default.
# data.table is already loaded and dt.df already built at the top of the
# script, so neither is repeated here.
tab1 <- dt.df[, .(N = .N, mean = mean(balance)), by = default][order(default)]
tab1
## default N mean
## 1: No 9667 803.9438
## 2: Yes 333 1747.8217
#Part 13b:
# histogram showing the distribution of the balance column
hist(
  x = df.df[["balance"]],
  col = "light blue",
  xlab = "balance",
  main = "Histogram of Balance"
)
#Part 14:
# Summary statistics by groups: count, mean and standard deviation of balance
# for every default x student combination.
# data.table is already loaded and dt.df already built at the top of the
# script, so neither is repeated here.
# Rows are ordered by BOTH grouping columns; ordering by default alone leaves
# the student rows inside each default group in first-appearance order.
tab1 <- dt.df[, .(N = .N, mean = mean(balance), sd = sd(balance)),
              by = .(default, student)][order(default, student)]
tab1
## default student N mean sd
## 1: No No 6850 744.5044 445.5151
## 2: No Yes 2817 948.4802 450.5537
## 3: Yes No 206 1678.4295 330.9141
## 4: Yes Yes 127 1860.3791 328.7356
#Part 15:
# horizontal boxplot of the balance column of df.df
boxplot(
  df.df[["balance"]],
  main = "boxplot for balance",
  xlab = "balance",
  col = "lightblue",
  horizontal = TRUE,
  width = 0.5
)
#Part 16:
# attach ggplot2 for the grouped boxplot
library(ggplot2)
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
# box plots of balance, one box per level of default
p <- ggplot(data = df.df, mapping = aes(x = default, y = balance))
p <- p + geom_boxplot()
# the title is added only to the printed plot; p itself stays untitled
p + labs(title = "Boxplot for balance, grouped by default(Yes/No)")