Part 1: Read the data

# Read the external CSV twice: into a base data.frame called "df" and into a
# data.table called "dt". Both reads convert string columns to factors.
library(data.table)
# NOTE(review): hard-coded, machine-specific working directory; adjust this path
# (or run from the data folder) before executing on another machine.
setwd("C:/4. Academics/4. Term 4/1. MLM/S1")
# stringsAsFactors = TRUE is spelled out because the read.csv() default changed
# to FALSE in R 4.0.0; without it "default" and "student" load as character
# columns instead of the factors the rest of this analysis displays.
df <- read.csv("DefaultData.csv", stringsAsFactors = TRUE)
dt <- fread(input = "DefaultData.csv", stringsAsFactors = TRUE)

Part 2: Data Dimensions

# Display the Data Dimensions
dim(df)

## [1] 10000     4

Part 3: Column names

# Display the column names
colnames(df)

## [1] "default" "student" "balance" "income"

Part 4: Attach dataframe “df”

# Put the columns of df on the search path so later code can refer to them by
# bare name (default, student, balance, income).
# NOTE(review): attached names can be masked by anything attached afterwards;
# loading psych in Part 12 reports that `income` from 'df' is masked, so a bare
# `income` used after that point would no longer be the column from df.
attach(df)

Part 5: Data structures of the columns in the dataframe “df”

str(df)

## 'data.frame':    10000 obs. of  4 variables:
##  $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
##  $ balance: num  730 817 1074 529 786 ...
##  $ income : num  44362 12106 31767 35704 38463 ...

str(dt)

## Classes 'data.table' and 'data.frame':   10000 obs. of  4 variables:
##  $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
##  $ balance: num  730 817 1074 529 786 ...
##  $ income : num  44362 12106 31767 35704 38463 ...
##  - attr(*, ".internal.selfref")=<externalptr>

Part 6: Count how many consumers default on their loan

# Count defaulters vs. non-defaulters. The column is looked up inside df via
# with() rather than through the attach()ed search path, so the result cannot
# be affected by another object named `default`; the printed header is still
# "default".
with(df, table(default))

## default
##   No  Yes 
## 9667  333

Part 7: Count how many consumers default on their loan, further broken down by whether or not they are students

# Cross-tabulate default status (rows) against student status (columns).
# with() resolves both columns inside df instead of relying on attach().
with(df, table(default, student))

##        student
## default   No  Yes
##     No  6850 2817
##     Yes  206  127

Part 8: Complete contingency table of defaulters broken down by students

# Build the default x student contingency table, resolving the columns inside
# df rather than through the attach()ed search path.
tab1 <- with(df, table(default, student))
# Add both row and column totals (margins 1 and 2), including the grand total.
addmargins(tab1, c(1, 2))

##        student
## default    No   Yes   Sum
##     No   6850  2817  9667
##     Yes   206   127   333
##     Sum  7056  2944 10000

Part 9: Calculate the percentage of Defaulters and non-Defaulters, rounded to 1 decimal place

# Share of non-defaulters and defaulters as proportions of all consumers.
# The column is resolved inside df via with() instead of relying on attach().
protable <- prop.table(with(df, table(default)))
# Convert to percentages and round to 1 decimal place.
round(protable * 100, 1)

## default
##   No  Yes 
## 96.7  3.3

Part 10: Mean, Standard Deviation and Variance Of The Income

# Income is referenced as df$income rather than the bare attached name: the
# psych package loaded in Part 12 masks `income`, so the bare name is only
# safe while this code runs before that library() call.
mean(df$income)

## [1] 33516.98

sd(df$income)

## [1] 13336.64

var(df$income)

## [1] 177865955

Part 11: Minimum And Maximum Income, rounding it to 2 decimal places

# Smallest and largest income, rounded to 2 decimal places. df$income is used
# explicitly so the result does not depend on the attach()ed search path.
round(min(df$income), 2)

## [1] 771.97

round(max(df$income), 2)

## [1] 73554.23

Part 12: Descriptive Statistics

library(psych)

## 
## Attaching package: 'psych'

## The following object is masked from 'df':
## 
##     income

describe(df)

##          vars     n     mean       sd   median  trimmed      mad    min
## default*    1 10000     1.03     0.18     1.00     1.00     0.00   1.00
## student*    2 10000     1.29     0.46     1.00     1.24     0.00   1.00
## balance     3 10000   835.37   483.71   823.64   823.73   507.52   0.00
## income      4 10000 33516.98 13336.64 34552.64 33305.57 16350.86 771.97
##               max    range skew kurtosis     se
## default*     2.00     1.00 5.20    25.06   0.00
## student*     2.00     1.00 0.90    -1.19   0.00
## balance   2654.32  2654.32 0.25    -0.36   4.84
## income   73554.23 72782.27 0.07    -0.90 133.37

# 12B: The statistics for "student" should be disregarded: it is a categorical variable (marked with * in the output), so its mean and related summaries carry no meaningful interpretation. The same applies to "default".

Part 13: Average of balance, broken down by whether consumers default on their loan and Histogram of balance

# Mean balance per default status. The columns are resolved inside df via
# with() instead of the attach()ed search path; the result keeps the generic
# column names Group.1 (default status) and x (mean balance).
with(df, aggregate(balance, list(default), mean))

##   Group.1         x
## 1      No  803.9438
## 2     Yes 1747.8217

# Histogram of balance. with() looks the column up inside df (no reliance on
# attach()) while keeping the default title and axis label based on "balance".
with(df, hist(balance))

Part 14: breakdown of the mean and standard deviation of the balance, with respect to whether someone is a student and whether he or she has defaulted in payment, as shown in the following output

#aggregate(balance ~ default+student, data = df, FUN = function(x) c(N=length(x),MeanBalance = mean(x), SDBalance = sd(x) ) )
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:data.table':
## 
##     between, first, last

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Group the data by student status and default status, then report for each
# combination the number of rows plus the mean and standard deviation of
# balance (missing values ignored).
group <- df %>% group_by(student, default)
group %>%
  summarise(
    N = n(),
    MeanBalance = mean(balance, na.rm = TRUE),
    SDBalance = sd(balance, na.rm = TRUE)
  )

## # A tibble: 4 x 5
## # Groups:   student [?]
##   student default     N MeanBalance SDBalance
##   <fct>   <fct>   <int>       <dbl>     <dbl>
## 1 No      No       6850        745.      446.
## 2 No      Yes       206       1678.      331.
## 3 Yes     No       2817        948.      451.
## 4 Yes     Yes       127       1860.      329.

Part 15: Box-Plot for credit card balance

# Horizontal box plot of credit card balance. The title previously named a
# "Price" variable that does not exist in this dataset; it now names balance.
boxplot(df$balance, horizontal = TRUE, main = "boxplot for variable Balance")

# Part 16: Boxplots for credit card balance, broken down by whether a consumer is a student or not a student

# Box plots of credit card balance split by student status (No = white,
# Yes = red). data = df makes the formula look its variables up in df instead
# of relying on attach(); the title now names balance rather than a
# nonexistent "Price" variable.
boxplot(
  balance ~ student,
  data = df,
  main = "Boxplot for Variable Balance grouped by student",
  col = c("white", "red")
)

Exploratory Data Analysis on the Credit Card default dataset

Mudit Sinha

today

Part 1: Read the data

Part 2: Data Dimensions

Part 3: Column names

Part 4: Attach dataframe “df”

Part 5: Data structures of the columns in the dataframe “df”

Part 6: Count how many consumers default on their loan

Part 7: Count how many consumers default on their loan, further broken down by whether or not they are students

Part 8: Complete contingency table of defaulters broken down by students

Part 9: Calculate the percentage of Defaulters and non-Defaulters, rounded to 1 decimal place

Part 10: Mean, Standard Deviation and Variance Of The Income

Part 11: Minimum And Maximum Income, rounding it to 2 decimal places

Part 12: Descriptive Statistics

Part 13: Average of balance, broken down by whether consumers default on their loan and Histogram of balance

Part 14: breakdown of the mean and standard deviation of the balance, with respect to whether someone is a student and whether he or she has defaulted in payment, as shown in the following output

Part 15: Box-Plot for credit card balance