Part 1a: Read the data as a Dataframe

# reading external data and storing into a dataframe called "df"
# stringsAsFactors = TRUE keeps default and student as factors on R >= 4.0 too,
# matching the str() output shown in Part 5a
df <- read.csv("DefaultData.csv", stringsAsFactors = TRUE)

Part 1b: Read the data as a Datatable

# load data.table, then convert the data frame "df" (read in Part 1a)
# into a data.table called "dt"
library(data.table)
dt <- as.data.table(df)

Part 2: Data Dimensions

# Show the number of rows followed by the number of columns of df
c(nrow(df), ncol(df))

## [1] 10000     4

Part 3: Column names

# Show the names of the columns of df
names(df)

## [1] "default" "student" "balance" "income"

Part 4: Attach Data

# Attach the Data
# attach() puts df's columns on the search path, which is why later parts can
# refer to default, student, balance and income without the df$ prefix.
# NOTE(review): attached names can be masked by packages loaded afterwards;
# the psych start-up message in Part 12a reports `income` as masked from 'df'.
attach(df)

Part 5a: List Data Structures of all the Columns in Dataframe

# Compact overview of df: one line per column with its type and first values
utils::str(df)

## 'data.frame':    10000 obs. of  4 variables:
##  $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
##  $ balance: num  730 817 1074 529 786 ...
##  $ income : num  44362 12106 31767 35704 38463 ...

Part 5b: List Data Structures of all the Columns in Datatable

# Compact overview of dt: one line per column with its type and first values
utils::str(dt)

## Classes 'data.table' and 'data.frame':   10000 obs. of  4 variables:
##  $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
##  $ balance: num  730 817 1074 529 786 ...
##  $ income : num  44362 12106 31767 35704 38463 ...
##  - attr(*, ".internal.selfref")=<externalptr>

Part 6: Count of Consumer Defaults

# Count of consumers who did / did not default on their loans.
# with() evaluates the call inside df, so the result does not depend on
# df being attached to the search path.
with(df, table(default))

## default
##   No  Yes 
## 9667  333

Part 7: Count of Consumer Defaults broken down by students

# Count of loan defaults (rows) broken down by student status (columns).
# with() evaluates the call inside df, so the result does not depend on
# df being attached to the search path.
with(df, table(default, student))

##        student
## default   No  Yes
##     No  6850 2817
##     Yes  206  127

Part 8: Count of Consumer Defaults broken down by students along with sum

# contingency table of default (rows) by student (columns), built from df
# directly rather than from attached column names
tab1 <- with(df, table(default, student))
# append a "Sum" margin to both the rows and the columns
addmargins(tab1, margin = c(1, 2))

##        student
## default    No   Yes   Sum
##     No   6850  2817  9667
##     Yes   206   127   333
##     Sum  7056  2944 10000

Part 9: Percentage of Consumer Defaults broken down by students

# share of all consumers falling in each default-by-student cell of tab1
protable <- prop.table(tab1)
# express the shares as percentages rounded to 1 decimal place, then print
percent_proportion <- round(100 * protable, digits = 1)
percent_proportion

##        student
## default   No  Yes
##     No  68.5 28.2
##     Yes  2.1  1.3

Part 10: Descriptive Statistics

# Mean, standard deviation and variance of income.
# The column is taken from df explicitly: loading psych (Part 12a) masks the
# attached `income`, so the bare name is not safe to rely on.
mean(df$income)

## [1] 33516.98

sd(df$income)

## [1] 13336.64

var(df$income)

## [1] 177865955

Part 11: Max & Min

# Min & Max of income, rounded to 2 decimal places.
# df$income is used because loading psych (Part 12a) masks the attached
# `income`.
round(min(df$income), 2)

## [1] 771.97

round(max(df$income), 2)

## [1] 73554.23

Part 12a: Descriptive Statistics

# descriptive statistics using package psych
# NOTE(review): the start-up message below reports that `income` is masked
# from 'df', so after this call a bare `income` no longer resolves to the
# attached column; use df$income instead.
library(psych)

## 
## Attaching package: 'psych'

## The following object is masked from 'df':
## 
##     income

# keep only vars, n, mean, sd, median, min and max from the describe() summary
describe(df)[, c("vars", "n", "mean", "sd", "median", "min", "max")]

##          vars     n     mean       sd   median    min      max
## default*    1 10000     1.03     0.18     1.00   1.00     2.00
## student*    2 10000     1.29     0.46     1.00   1.00     2.00
## balance     3 10000   835.37   483.71   823.64   0.00  2654.32
## income      4 10000 33516.98 13336.64 34552.64 771.97 73554.23

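Part 12b: Inference about 1.29 mean value for student

# the 1.29 mean value in the student column has no real significance, as student is a factor variable: describe() has simply averaged its integer level codes (1 = "No", 2 = "Yes"), so 1.29 only reflects that about 29% of the consumers are students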

Part 13: Display means of balance, broken down by consumer defaults

# mean of balance within each level of default, using the aggregate() function;
# the columns are selected from df by name instead of by position or via attach()
aggregate(df[["balance"]],
          by = list(df[["default"]]),
          FUN = mean)

##   Group.1         x
## 1      No  803.9438
## 2     Yes 1747.8217

Part 13b: Display Histogram of balance

# histogram of the balance column, taken from df explicitly so the plot does
# not depend on df being attached
hist(
  df$balance,
  main = "Histogram of Balance",
  xlab = "balance",
  col = "green"
)

Part 14: Display Breakdown of Mean & SD of balance with respect to student and default

# group size, mean and standard deviation of balance for every
# default/student combination, with the rows sorted by default
library(data.table)
tab2 <- dt[,
  list(N = .N, mean = mean(balance), sd = sd(balance)),
  by = c("default", "student")
]
tab2 <- tab2[order(default)]
tab2

##    default student    N      mean       sd
## 1:      No      No 6850  744.5044 445.5151
## 2:      No     Yes 2817  948.4802 450.5537
## 3:     Yes     Yes  127 1860.3791 328.7356
## 4:     Yes      No  206 1678.4295 330.9141

Part 15: Display Box Plot of balance

# horizontal boxplot of the balance column, taken from df explicitly so the
# plot does not depend on df being attached
boxplot(
  df$balance,
  width = 0.5,
  horizontal = TRUE,
  main = "Boxplot for Balance",
  xlab = "Balance",
  col = "lightblue"
)

Part 16: Display Box Plot of balance broken down by Default

# side-by-side boxplots of balance, one box per level of default;
# data = df resolves the formula variables without relying on attach()
boxplot(
  balance ~ default,
  data = df,
  main = "Boxplot for Balance grouped by Default (Yes/No)",
  col = c("gray", "lightblue")
)

Getting Started

Sameer Mathur

today

Part 1a: Read the data as a Dataframe

Part 1b: Read the data as a Datatable

Part 2: Data Dimensions

Part 3: Column names

Part 4: Attach Data

Part 5a: List Data Structures of all the Columns in Dataframe

Part 5b: List Data Structures of all the Columns in Datatable

Part 6: Count of Consumer Defaults

Part 7: Count of Consumer Defaults broken down by students

Part 8: Count of Consumer Defaults broken down by students along with sum

Part 9: Percentage of Consumer Defaults broken down by students

Part 10: Descriptive Statistics

Part 11: Max & Min

Part 12a: Descriptive Statistics

Part 13: Display means of balance, broken down by consumer defaults

Part 13b: Display Histogram of balance

Part 14: Display Breakdown of Mean & SD of balance with respect to student and default

Part 15: Display Box Plot of balance

Part 16: Display Box Plot of balance broken down by Default