Part 1: Read the data as a data frame

# Read the credit-card default data into a data frame called "cc.df".
# stringsAsFactors = TRUE is set explicitly: since R 4.0.0 read.csv() no
# longer converts strings to factors by default, and the factor-based
# summaries further down (e.g. summary(cc.df$default)) need "default" and
# "student" to be factors.
cc.df <- read.csv("DefaultData.csv", stringsAsFactors = TRUE)

Part 2: Read the data as a data table

# Convert the data frame into a data.table named "cc.dt"
library(data.table)
## Warning: package 'data.table' was built under R version 3.5.3
cc.dt <- as.data.table(cc.df)

Part 2b: Column names

# Show the names of the columns in the data frame
names(cc.df)
## [1] "default" "student" "balance" "income"

Part 3: Data Dimensions

# Display the dimensions of the data frame (number of rows, number of columns)
dim(cc.df)
## [1] 10000     4

Part 4: Getting the col names of df

# List the data frame's column names
names(cc.df)
## [1] "default" "student" "balance" "income"

Part 5a: Listing the data structure of columns (data frame)

# Show the structure of the data frame: class, size, and each column's type
str(cc.df)
## 'data.frame':    10000 obs. of  4 variables:
##  $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
##  $ balance: num  730 817 1074 529 786 ...
##  $ income : num  44362 12106 31767 35704 38463 ...

Part 5b: Listing the data structure of columns (data table)

# Show the structure of the data table: class, size, and each column's type
str(cc.dt)
## Classes 'data.table' and 'data.frame':   10000 obs. of  4 variables:
##  $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
##  $ balance: num  730 817 1074 529 786 ...
##  $ income : num  44362 12106 31767 35704 38463 ...
##  - attr(*, ".internal.selfref")=<externalptr>

Part 6: counting the defaulters in the data

# Number of observations at each level of the "default" column
summary(cc.df[["default"]])
##   No  Yes 
## 9667  333

Part 7: Defaulters based on students

# Cross-tabulate default status (rows) against student status (columns)
defaultvsStudent <- table(cc.df[["default"]], cc.df[["student"]])
defaultvsStudent
##      
##         No  Yes
##   No  6850 2817
##   Yes  206  127

Part 8: Defaulters based on students, with row and column totals

# Default vs. student contingency table with row and column totals added
addmargins(defaultvsStudent)
##      
##          No   Yes   Sum
##   No   6850  2817  9667
##   Yes   206   127   333
##   Sum  7056  2944 10000

Part 9: Proportion of defaulters

# Share of observations at each level of "default"
prop.table(table(cc.df[["default"]]))
## 
##     No    Yes 
## 0.9667 0.0333

Part 10: Mean, variance and SD of income

# Mean, variance and standard deviation of income.
# The previous call, describe(cc.df$income)[, c(1, 3, 5)], picked columns by
# position and returned vars/mean/median, so neither the variance nor the SD
# was reported. Base R functions are used here and the results are named.
# psych is still loaded here because later parts call describe().
library(psych)
## Warning: package 'psych' was built under R version 3.5.3
c(mean     = mean(cc.df$income),
  variance = var(cc.df$income),
  sd       = sd(cc.df$income))
# (Printed output omitted here; re-run to regenerate it.)

Part 11: Min and Max income

# Smallest and largest income, taken from psych's describe() summary
describe(cc.df[["income"]])[, c("min", "max")]
##       min      max
## X1 771.97 73554.23

Part 12: Descriptive statistics

## Summary statistics for every column of the data frame
describe(cc.df)[, c("vars", "n", "mean", "sd", "median", "min", "max")]
##          vars     n     mean       sd   median    min      max
## default*    1 10000     1.03     0.18     1.00   1.00     2.00
## student*    2 10000     1.29     0.46     1.00   1.00     2.00
## balance     3 10000   835.37   483.71   823.64   0.00  2654.32
## income      4 10000 33516.98 13336.64 34552.64 771.97 73554.23

Part 13: Descriptive statistics for default data

## Mean balance for defaulters vs. non-defaulters.
## The formula interface refers to columns by name (instead of position 3)
## and labels the result "default" / "balance" rather than "Group.1" / "x".
aggregate(balance ~ default, data = cc.df, FUN = mean)

Part 13b: Histogram of balance

## Histogram of credit card balance (default breaks and labels)
hist(cc.df$balance)

# Part 14: Mean, count and SD

## group-by analysis
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Count, mean balance and SD of balance for each default/student combination.
# data.table syntax: DT[i, j, by] -- i is left empty to use all rows.
cc.dt[,
      .(N = .N,
        meanBalance = mean(balance),
        sdBalance = sd(balance)),
      by = .(default, student)]
##    default student    N meanBalance sdBalance
## 1:      No      No 6850    744.5044  445.5151
## 2:      No     Yes 2817    948.4802  450.5537
## 3:     Yes     Yes  127   1860.3791  328.7356
## 4:     Yes      No  206   1678.4295  330.9141

Part 15: Box plot for credit card balance

## Horizontal boxplot of the credit card balance
boxplot(cc.df[["balance"]], horizontal = TRUE)

# Part 15b: Box plot for credit card balance vs default

## Boxplots of credit card balance, one per default status.
## The title previously read "hsitogram of balance", but this is a boxplot.
## Using the formula with data = cc.df refers to the columns by name.
boxplot(balance ~ default,
        data = cc.df,
        main = "Boxplot of balance by default status",
        col = c("grey", "light blue"))