Part 1a: Read the data

# Read the external CSV file into a data frame called "df.df".
# stringsAsFactors = TRUE is set explicitly: since R 4.0.0 read.csv() no longer
# converts strings to factors by default, and the str() output recorded further
# down shows default and student as two-level factors.
df.df <- read.csv("DefaultData.csv", stringsAsFactors = TRUE)

Part 1b: Read the data into a data.table

# Load data.table and build a data.table from the data frame
library(data.table)
dt.df <- data.table(df.df)

Part 2: Data Dimensions

# Display the dimensions of the data frame: number of rows, then number of columns
dim(df.df)
## [1] 10000     4

Part 3: Column names

# List the names of the columns in the data frame
names(df.df)
## [1] "default" "student" "balance" "income"

#Part 4: attaching the dataframe df.df

# attach() puts the columns of df.df on the search path so that later parts can
# refer to them by bare name (default, student, balance, income).
# NOTE(review): attached names can be masked by objects from packages loaded
# afterwards, so prefer df.df$column or with(df.df, ...) in new code.
attach(df.df)

#Part 5a: structure of the data frame

# Show each column's name, type and first few values
str(df.df)
## 'data.frame':    10000 obs. of  4 variables:
##  $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
##  $ balance: num  730 817 1074 529 786 ...
##  $ income : num  44362 12106 31767 35704 38463 ...

#Part 5b: structure of the columns in the data.table dt.df

# Show each column's name, type and first few values for the data.table
str(dt.df)
## Classes 'data.table' and 'data.frame':   10000 obs. of  4 variables:
##  $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
##  $ balance: num  730 817 1074 529 786 ...
##  $ income : num  44362 12106 31767 35704 38463 ...
##  - attr(*, ".internal.selfref")=<externalptr>

Part 6: Display the count of consumers who default on their loan

# Count defaulters vs. non-defaulters. The column is looked up inside df.df
# rather than on the search path, so the result does not depend on attach().
with(df.df, table(default))
## default
##   No  Yes 
## 9667  333

#Part 7: cross-tabulate default status against student status

# Columns are resolved inside df.df, so this does not depend on attach().
with(df.df, table(default, student))
##        student
## default   No  Yes
##     No  6850 2817
##     Yes  206  127

#Part 8: contingency table of default by student, with row totals

# creating contingency table; columns are resolved inside df.df so the result
# does not depend on attach()
tab1 <- with(df.df, table(default, student))
# margin = 2 sums across the student dimension, appending a "Sum" column that
# holds the total for each default row
addmargins(tab1, 2)
##        student
## default   No  Yes  Sum
##     No  6850 2817 9667
##     Yes  206  127  333

#Part 9: percentage of defaulters and non-defaulters

# one-way frequency table of default, read from df.df directly rather than
# through attach()
tab2 <- with(df.df, table(default))
# proportion of observations in each default category
protable <- prop.table(tab2)
# express the proportions as percentages, rounded to one decimal place
PercentPropotion <- round(protable * 100, 1)
PercentPropotion
## default
##   No  Yes 
## 96.7  3.3

#Part 10: mean, standard deviation and variance of income

# income is read from df.df explicitly. Re-attaching df.df here is unnecessary
# (it is already attached in Part 4) and would only stack a duplicate entry on
# the search path and print masking messages.
# mean of income
mean(df.df$income)
## [1] 33516.98
# standard deviation of income
sd(df.df$income)
## [1] 13336.64
# variance of income
var(df.df$income)
## [1] 177865955

#Part 11: minimum and maximum income

# income is read from df.df explicitly so the result does not depend on attach().
# minimum income, rounded to 2 decimal places
minimum <- min(round(df.df$income, 2))
minimum
## [1] 771.97
# maximum income, rounded to 2 decimal places
maximum <- max(round(df.df$income, 2))
maximum
## [1] 73554.23

#Part 12:

# descriptive statistics using package psych
library(psych)
## 
## Attaching package: 'psych'
## The following object is masked from 'df.df':
## 
##     income
# keep only the vars, n, mean, sd, median, min and max columns of the summary
describe(df.df)[, c("vars", "n", "mean", "sd", "median", "min", "max")]
##          vars     n     mean       sd   median    min      max
## default*    1 10000     1.03     0.18     1.00   1.00     2.00
## student*    2 10000     1.29     0.46     1.00   1.00     2.00
## balance     3 10000   835.37   483.71   823.64   0.00  2654.32
## income      4 10000 33516.98 13336.64 34552.64 771.97 73554.23

#Part 12b:

# There are more non-students than students in the sample.

#Part 13: group size and mean balance by default status

library(data.table)
dt.df <- data.table(df.df)
# one row per default level: number of observations and mean balance
tab1 <- dt.df[, .(N = .N, mean = mean(balance)), by = .(default)]
# sort the summary rows by default level
tab1 <- tab1[order(default)]
tab1
##    default    N      mean
## 1:      No 9667  803.9438
## 2:     Yes  333 1747.8217

#Part 13b: histogram of balance

# distribution of balance across all observations
hist(
  df.df$balance,
  main = "Histogram of Balance",
  xlab = "balance",
  col = "light blue"
)

#Part 14: balance summary by default and student status

library(data.table)
dt.df <- data.table(df.df)
# group size, mean and standard deviation of balance for every
# default/student combination
tab1 <- dt.df[,
  .(N = .N, mean = mean(balance), sd = sd(balance)),
  by = .(default, student)
]
# rows are sorted by default only; student keeps its order of appearance
tab1 <- tab1[order(default)]
tab1
##    default student    N      mean       sd
## 1:      No      No 6850  744.5044 445.5151
## 2:      No     Yes 2817  948.4802 450.5537
## 3:     Yes     Yes  127 1860.3791 328.7356
## 4:     Yes      No  206 1678.4295 330.9141

#Part 15: box plot of balance

# horizontal box plot of the balance column of df.df
boxplot(
  df.df$balance,
  width = 0.5,
  horizontal = TRUE,
  main = "boxplot for balance",
  xlab = "balance",
  col = "lightblue"
)

#Part 16:

# loading the package
library(ggplot2)
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
# box plot of balance for each default group (Yes/No)
p <- ggplot(data = df.df, mapping = aes(x = default, y = balance)) +
  geom_boxplot()
p + labs(title = "Boxplot for balance, grouped by default(Yes/No)")