library(data.table)
# NOTE(review): setwd() ties the script to one machine's directory layout.
# It is kept so that the relative file name below resolves exactly as before.
setwd("E:/YASH/Books/MBA/TERM 4/MLM")
# Load the same CSV twice: once as a base data.frame, once as a data.table.
# stringsAsFactors is spelled out for read.csv() because its default changed
# to FALSE in R 4.0.0; with TRUE, text columns are read as factors in both
# objects, matching the fread() call.
df <- read.csv("DefaultData.csv", stringsAsFactors = TRUE)
dt <- fread(input = "DefaultData.csv", stringsAsFactors = TRUE)
# Number of rows and columns in the data.frame.
dim(df)
## [1] 10000 4
# Column names of the data.frame.
names(df)
## [1] "default" "student" "balance" "income"
# Put df's columns on the search path. Statements elsewhere in this script
# may refer to columns by bare name, so this call is kept as is.
attach(df)
# Compact structure of the data.frame ...
str(df)
## 'data.frame': 10000 obs. of 4 variables:
## $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
## $ balance: num 730 817 1074 529 786 ...
## $ income : num 44362 12106 31767 35704 38463 ...
# ... and of the data.table read from the same file.
str(dt)
## Classes 'data.table' and 'data.frame': 10000 obs. of 4 variables:
## $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
## $ balance: num 730 817 1074 529 786 ...
## $ income : num 44362 12106 31767 35704 38463 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Frequency of each default status. with(df, ...) looks the columns up in df
# itself, so the result does not depend on attach() or on which objects a
# later library() call masks on the search path. The bare column names are
# still used as the table's dimension labels.
with(df, table(default))
## default
## No Yes
## 9667 333
# Cross-tabulation of default status (rows) by student status (columns),
# computed once, stored for the margin table below, and printed.
tab1 <- with(df, table(default, student))
tab1
## student
## default No Yes
## No 6850 2817
## Yes 206 127
# Same cross-tabulation with row and column totals appended.
addmargins(tab1, c(1, 2))
## student
## default No Yes Sum
## No 6850 2817 9667
## Yes 206 127 333
## Sum 7056 2944 10000
# Share of each default status, shown as a percentage with one decimal.
protable <- prop.table(with(df, table(default)))
round(protable * 100, 1)
## default
## No Yes
## 96.7 3.3
# Summary statistics for income, read straight from df. A bare `income`
# depends on attach(df) and stops pointing at this column once a package
# exporting an object of the same name is attached (the psych start-up
# message recorded later in this script reports exactly that masking).
mean(df$income)
## [1] 33516.98
sd(df$income)
## [1] 13336.64
var(df$income)
## [1] 177865955
# Range end points, rounded to two decimals.
round(min(df$income), 2)
## [1] 771.97
round(max(df$income), 2)
## [1] 73554.23
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'df':
##
## income
# Per-column descriptive statistics from psych::describe().
describe(df)
## vars n mean sd median trimmed mad min
## default* 1 10000 1.03 0.18 1.00 1.00 0.00 1.00
## student* 2 10000 1.29 0.46 1.00 1.24 0.00 1.00
## balance 3 10000 835.37 483.71 823.64 823.73 507.52 0.00
## income 4 10000 33516.98 13336.64 34552.64 33305.57 16350.86 771.97
## max range skew kurtosis se
## default* 2.00 1.00 5.20 25.06 0.00
## student* 2.00 1.00 0.90 -1.19 0.00
## balance 2654.32 2654.32 0.25 -0.36 4.84
## income 73554.23 72782.27 0.07 -0.90 133.37
# Mean balance per default status. Columns are taken from df explicitly
# rather than from the attach()ed search path, which (as the message above
# shows for income) can be masked by packages.
aggregate(df$balance, list(df$default), mean)
## Group.1 x
## 1 No 803.9438
## 2 Yes 1747.8217
# Histogram of balance; evaluating inside df keeps the default title and
# axis label ("balance") without relying on attach().
with(df, hist(balance))
#aggregate(balance ~ default+student, data = df, FUN = function(x) c(N=length(x),MeanBalance = mean(x), SDBalance = sd(x) ) )
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Group the rows by student status, then by default status.
group <- df %>%
  group_by(student, default)
# Per-group row count plus mean and standard deviation of balance.
group %>%
  summarise(
    N = n(),
    MeanBalance = mean(balance, na.rm = TRUE),
    SDBalance = sd(balance, na.rm = TRUE)
  )
## # A tibble: 4 x 5
## # Groups: student [2]
## student default N MeanBalance SDBalance
## <fct> <fct> <int> <dbl> <dbl>
## 1 No No 6850 745. 446.
## 2 No Yes 206 1678. 331.
## 3 Yes No 2817 948. 451.
## 4 Yes Yes 127 1860. 329.
# Horizontal boxplot of balance over all rows. The title names the variable
# actually plotted (balance); the previous text said "Price".
boxplot(df$balance, horizontal = TRUE, main = "boxplot for variable Balance")
# Balance split by student status. data = df resolves the formula's
# variables inside the data frame instead of relying on attach().
boxplot(
  balance ~ student,
  data = df,
  main = "Boxplot for Variable Balance grouped by student",
  col = c("white", "red")
)