Reading the external data and storing it in a data frame called “df” (and in a data.table called “dt”)

library(data.table)
# Load DefaultData.csv twice: as a base data.frame (`df`) via read.csv() and
# as a data.table (`dt`) via fread(). Both readers are asked to convert text
# columns to factors so the two objects end up with the same column types.
#
# NOTE(review): machine-specific absolute path; the script only runs where
# this directory exists. Kept because code outside this section may rely on
# the working directory being set here.
setwd("E:/YASH/Books/MBA/TERM 4/MLM")
# read.csv() defaults to stringsAsFactors = FALSE since R 4.0.0, so request
# factors explicitly; otherwise `default` and `student` load as character.
df <- read.csv("DefaultData.csv", stringsAsFactors = TRUE)
dt <- fread(input = "DefaultData.csv", stringsAsFactors = TRUE)

Display the Data Dimensions

# Number of rows followed by number of columns of df.
c(nrow(df), ncol(df))

## [1] 10000     4

Display the column names

# Column names of df (names() and colnames() agree for a data frame).
names(df)

## [1] "default" "student" "balance" "income"

# Put df's columns on the search path so that later calls can refer to
# `default`, `student`, `balance` and `income` by bare name.
# NOTE(review): attach() makes those lookups depend on search-path order;
# loading psych further down reports that it masks `income`.
attach(df)

# Compact structure of the data.frame: one line per column with its type.
utils::str(df)

## 'data.frame':    10000 obs. of  4 variables:
##  $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
##  $ balance: num  730 817 1074 529 786 ...
##  $ income : num  44362 12106 31767 35704 38463 ...

# Same structure summary for the data.table copy read by fread().
utils::str(dt)

## Classes 'data.table' and 'data.frame':   10000 obs. of  4 variables:
##  $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
##  $ balance: num  730 817 1074 529 786 ...
##  $ income : num  44362 12106 31767 35704 38463 ...
##  - attr(*, ".internal.selfref")=<externalptr>

# Frequency of each default status. with() looks the column up in df itself
# instead of relying on the attach()ed copy on the search path.
with(df, table(default))

## default
##   No  Yes 
## 9667  333

# Two-way frequency table: default status (rows) by student status (columns),
# with both columns taken from df explicitly rather than via attach().
with(df, table(default, student))

##        student
## default   No  Yes
##     No  6850 2817
##     Yes  206  127

creating contingency table

# Keep the default-by-student contingency table for later use; the columns
# are read from df directly so the result does not depend on attach(df).
tab1 <- with(df, table(default, student))

Add row and column totals (margins) to the contingency table

# Append a "Sum" row and a "Sum" column (margins 1 and 2) to tab1.
addmargins(tab1, margin = c(1, 2))

##        student
## default    No   Yes   Sum
##     No   6850  2817  9667
##     Yes   206   127   333
##     Sum  7056  2944 10000

# Share of each default status as a percentage, rounded to one decimal place.
# The column is read from df explicitly rather than through attach(df).
protable <- prop.table(with(df, table(default)))
round(protable * 100, digits = 1)

## default
##   No  Yes 
## 96.7  3.3

# Summary statistics for income, read from df explicitly (df$income) so the
# results do not depend on which `income` object is first on the search path
# (loading psych later in this file masks the attach()ed `income`).
mean(df$income)

## [1] 33516.98

sd(df$income)

## [1] 13336.64

var(df$income)

## [1] 177865955

round(min(df$income), 2)

## [1] 771.97

round(max(df$income), 2)

## [1] 73554.23

library(psych)

## 
## Attaching package: 'psych'

## The following object is masked from 'df':
## 
##     income

# Per-column descriptive statistics from psych; the output marks the factor
# columns with a trailing "*".
psych::describe(df)

##          vars     n     mean       sd   median  trimmed      mad    min
## default*    1 10000     1.03     0.18     1.00     1.00     0.00   1.00
## student*    2 10000     1.29     0.46     1.00     1.24     0.00   1.00
## balance     3 10000   835.37   483.71   823.64   823.73   507.52   0.00
## income      4 10000 33516.98 13336.64 34552.64 33305.57 16350.86 771.97
##               max    range skew kurtosis     se
## default*     2.00     1.00 5.20    25.06   0.00
## student*     2.00     1.00 0.90    -1.19   0.00
## balance   2654.32  2654.32 0.25    -0.36   4.84
## income   73554.23 72782.27 0.07    -0.90 133.37

# Mean balance for each default status. Columns are taken from df explicitly
# instead of through attach(df); passing plain vectors keeps the default
# output column names (Group.1, x).
aggregate(df$balance, by = list(df$default), FUN = mean)

##   Group.1         x
## 1      No  803.9438
## 2     Yes 1747.8217

# Histogram of balance, read from df explicitly. The title and x-axis label
# are set by hand so they still read "balance" rather than "df$balance".
hist(df$balance, main = "Histogram of balance", xlab = "balance")

#aggregate(balance ~ default+student, data = df, FUN = function(x) c(N=length(x),MeanBalance = mean(x), SDBalance = sd(x) ) )

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:data.table':
## 
##     between, first, last

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Count, mean and standard deviation of balance for every
# student/default combination.
group <- df %>%
  group_by(student, default)
group %>%
  summarise(
    N = n(),
    MeanBalance = mean(balance, na.rm = TRUE),
    SDBalance = sd(balance, na.rm = TRUE)
  )

## # A tibble: 4 x 5
## # Groups:   student [2]
##   student default     N MeanBalance SDBalance
##   <fct>   <fct>   <int>       <dbl>     <dbl>
## 1 No      No       6850        745.      446.
## 2 No      Yes       206       1678.      331.
## 3 Yes     No       2817        948.      451.
## 4 Yes     Yes       127       1860.      329.

# Horizontal boxplot of balance over all observations. The title names the
# plotted variable (balance); the data has no "Price" column.
boxplot(df$balance, horizontal = TRUE, main = "Boxplot for variable balance")

# Balance split by student status. `data = df` resolves the formula's
# variables in df itself instead of relying on attach(df).
boxplot(balance ~ student,
        data = df,
        main = "Boxplot for variable balance grouped by student",
        col = c("white", "red"))

EDA

Yashodeep Dhonde

today

Reading the external data and storing it in a data frame called “df” (and in a data.table called “dt”)

Display the Data Dimensions

Display the column names

creating contingency table

Add row and column totals (margins) to the contingency table