# Load the MBA starting-salaries survey from the working directory.
# (The former paste(..., sep = " ") around a single string was a no-op.)
mba <- read.csv("MBA Starting Salaries Data.csv")
library(psych)
# Show the type and first values of every column.
str(mba)
## 'data.frame': 274 obs. of 13 variables:
## $ age : int 23 24 24 24 24 24 25 25 25 25 ...
## $ sex : int 2 1 1 1 2 1 1 2 1 1 ...
## $ gmat_tot: int 620 610 670 570 710 640 610 650 630 680 ...
## $ gmat_qpc: int 77 90 99 56 93 82 89 88 79 99 ...
## $ gmat_vpc: int 87 71 78 81 98 89 74 89 91 81 ...
## $ gmat_tpc: int 87 87 95 75 98 91 87 92 89 96 ...
## $ s_avg : num 3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
## $ f_avg : num 3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
## $ quarter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ work_yrs: int 2 2 2 1 2 2 2 2 2 2 ...
## $ frstlang: int 1 1 1 1 1 1 1 1 2 1 ...
## $ salary : int 0 0 0 0 999 0 0 0 999 998 ...
## $ satis : int 7 6 6 7 5 6 5 6 4 998 ...
# Per-column summary (min, quartiles, mean, max) of the raw survey data.
summary(object = mba)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:580.0 1st Qu.:72.00
## Median :27.00 Median :1.000 Median :620.0 Median :83.00
## Mean :27.36 Mean :1.248 Mean :619.5 Mean :80.64
## 3rd Qu.:29.00 3rd Qu.:1.000 3rd Qu.:660.0 3rd Qu.:93.00
## Max. :48.00 Max. :2.000 Max. :790.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :16.00 Min. : 0.0 Min. :2.000 Min. :0.000
## 1st Qu.:71.00 1st Qu.:78.0 1st Qu.:2.708 1st Qu.:2.750
## Median :81.00 Median :87.0 Median :3.000 Median :3.000
## Mean :78.32 Mean :84.2 Mean :3.025 Mean :3.062
## 3rd Qu.:91.00 3rd Qu.:94.0 3rd Qu.:3.300 3rd Qu.:3.250
## Max. :99.00 Max. :99.0 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0
## 1st Qu.:1.250 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 0
## Median :2.000 Median : 3.000 Median :1.000 Median : 999
## Mean :2.478 Mean : 3.872 Mean :1.117 Mean : 39026
## 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.:1.000 3rd Qu.: 97000
## Max. :4.000 Max. :22.000 Max. :2.000 Max. :220000
## satis
## Min. : 1.0
## 1st Qu.: 5.0
## Median : 6.0
## Mean :172.2
## 3rd Qu.: 7.0
## Max. :998.0
# Extended descriptive statistics from the psych package.
psych::describe(mba)
## vars n mean sd median trimmed mad min max
## age 1 274 27.36 3.71 27 26.76 2.97 22 48
## sex 2 274 1.25 0.43 1 1.19 0.00 1 2
## gmat_tot 3 274 619.45 57.54 620 618.86 59.30 450 790
## gmat_qpc 4 274 80.64 14.87 83 82.31 14.83 28 99
## gmat_vpc 5 274 78.32 16.86 81 80.33 14.83 16 99
## gmat_tpc 6 274 84.20 14.02 87 86.12 11.86 0 99
## s_avg 7 274 3.03 0.38 3 3.03 0.44 2 4
## f_avg 8 274 3.06 0.53 3 3.09 0.37 0 4
## quarter 9 274 2.48 1.11 2 2.47 1.48 1 4
## work_yrs 10 274 3.87 3.23 3 3.29 1.48 0 22
## frstlang 11 274 1.12 0.32 1 1.02 0.00 1 2
## salary 12 274 39025.69 50951.56 999 33607.86 1481.12 0 220000
## satis 13 274 172.18 371.61 6 91.50 1.48 1 998
## range skew kurtosis se
## age 26 2.16 6.45 0.22
## sex 1 1.16 -0.66 0.03
## gmat_tot 340 -0.01 0.06 3.48
## gmat_qpc 71 -0.92 0.30 0.90
## gmat_vpc 83 -1.04 0.74 1.02
## gmat_tpc 99 -2.28 9.02 0.85
## s_avg 2 -0.06 -0.38 0.02
## f_avg 4 -2.08 10.85 0.03
## quarter 3 0.02 -1.35 0.07
## work_yrs 22 2.78 9.80 0.20
## frstlang 1 2.37 3.65 0.02
## salary 220000 0.70 -1.05 3078.10
## satis 997 1.77 1.13 22.45
# Bar chart of the number of respondents per value of sex.
barplot(height = table(mba$sex), col = "orange")
#par(mfrow=c(1,4))
# One horizontal box plot per GMAT measure; titles and fill colours are
# matched to the columns by position.
invisible(Map(
  function(scores, title, fill) {
    boxplot(scores, main = title, horizontal = TRUE, col = fill)
  },
  mba[c("gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc")],
  c("GMAT score distribution", "QPC distribution",
    "VPC distribution", "TPC distribution"),
  c("bisque", "grey", "peachpuff", "violet")
))
# Lay out the following plots two per page.
par(mfrow = c(1, 2))
with(mba, {
  # Grade averages.
  boxplot(s_avg, main = "spring average", col = "gold")
  boxplot(f_avg, main = "fall average", col = "darkgreen")
  # Counts per age, years of experience and first language.
  barplot(table(age), col = "darkblue")
  barplot(table(work_yrs),
          main = "Number of years of experience",
          col = "lightblue")
  barplot(table(frstlang),
          main = "First Language",
          xlab = "1->English 2-> Others",
          col = "brown")
  # Salary histogram; the x axis is restricted to 50000-220000.
  hist(salary,
       main = "Salary distribution",
       xlab = "Salary", ylab = "count",
       xlim = c(50000, 220000), ylim = c(0, 50),
       col = "beige")
  #par(mfrow=c(1,2))
  # Satisfaction histogram with unit-width bins up to the largest value in
  # the column; only the 1-7 range is displayed.
  hist(satis,
       main = "Degree of satisfaction with MBA program",
       xlab = " (1= low, 7 = high satisfaction)",
       xlim = c(1, 7),
       breaks = as.numeric(seq_len(max(satis))),
       col = "maroon")
  barplot(table(satis), col = "magenta")
})
#par(mfrow=c(1,3))
# Salary against GMAT total, quarter and years of work experience.
with(mba, {
  plot(gmat_tot, salary, cex = 1)
  plot(quarter, salary, cex = 1)
  plot(work_yrs, salary, cex = 1)
})
# Satisfaction against salary, y axis limited to the 1-7 scale.
plot(mba$salary, mba$satis, ylab = "satisfaction", ylim = c(1, 7), cex = 1)
with(mba, plot(salary, sex))
#barplot(table(mba$salary,mba$sex))
# Three plots per page: GMAT total against sex and both grade averages.
par(mfrow = c(1, 3))
with(mba, {
  plot(gmat_tot, sex, cex = 1)
  plot(gmat_tot, s_avg, cex = 1)
  plot(gmat_tot, f_avg, cex = 1)
})
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# Scatterplot matrix of the four GMAT measures with histograms on the diagonal.
# The arguments are passed by name, so keep `formula` first if editing.
scatterplotMatrix(formula=~gmat_tot+gmat_qpc + gmat_vpc+gmat_tpc,data=mba,diagonal="histogram")
# Scatterplot matrix of grade averages, work experience and salary.
scatterplotMatrix(formula=~s_avg + f_avg+work_yrs+salary,data=mba)
library(corrgram)
library(corrplot)
## corrplot 0.84 loaded
# Covariance matrix of the first ten columns (age ... work_yrs).
x <- cov(mba[, 1:10])
# Colour-coded correlation plot of the same ten columns.
corrplot(
  corr = cor(mba[, 1:10], use = "complete.obs"),
  method = "color",
  main = "corrplot"
)
# Corrgram of the whole data frame: shaded lower panels, correlations elsewhere.
corrgram(
  mba,
  order = NULL,
  panel = panel.cor,
  lower.panel = panel.shade,
  text.panel = panel.txt,
  main = "Corrgram"
)
# Print the correlation matrix derived from the covariance matrix.
cov2cor(x)
## age sex gmat_tot gmat_qpc gmat_vpc
## age 1.00000000 -0.028106442 -0.14593840 -0.21616985 -0.04417547
## sex -0.02810644 1.000000000 -0.05336820 -0.16377435 0.07488782
## gmat_tot -0.14593840 -0.053368202 1.00000000 0.72473781 0.74839187
## gmat_qpc -0.21616985 -0.163774346 0.72473781 1.00000000 0.15218014
## gmat_vpc -0.04417547 0.074887816 0.74839187 0.15218014 1.00000000
## gmat_tpc -0.16990307 -0.008090213 0.84779965 0.65137754 0.66621604
## s_avg 0.14970402 0.127115144 0.11311702 -0.02984873 0.20445365
## f_avg -0.01744806 0.091663891 0.10442409 0.07370455 0.07592225
## quarter -0.04967221 -0.133533171 -0.09223903 0.03636638 -0.17460736
## work_yrs 0.85829810 -0.011296374 -0.18235434 -0.23660827 -0.06639049
## gmat_tpc s_avg f_avg quarter work_yrs
## age -0.169903066 0.14970402 -0.01744806 -0.04967221 0.85829810
## sex -0.008090213 0.12711514 0.09166389 -0.13353317 -0.01129637
## gmat_tot 0.847799647 0.11311702 0.10442409 -0.09223903 -0.18235434
## gmat_qpc 0.651377538 -0.02984873 0.07370455 0.03636638 -0.23660827
## gmat_vpc 0.666216035 0.20445365 0.07592225 -0.17460736 -0.06639049
## gmat_tpc 1.000000000 0.11736245 0.07973210 -0.08303535 -0.17336186
## s_avg 0.117362449 1.00000000 0.55062139 -0.76211664 0.12929271
## f_avg 0.079732099 0.55062139 1.00000000 -0.44750637 -0.03905692
## quarter -0.083035351 -0.76211664 -0.44750637 1.00000000 -0.08602641
## work_yrs -0.173361859 0.12929271 -0.03905692 -0.08602641 1.00000000
# Students with a recorded salary of 0 versus students with a salary above 1000.
notplaced <- subset(mba, salary == 0)
placed <- subset(mba, salary > 1000)
# Mean GMAT total per value of sex, over the whole data set.
aggregate(gmat_tot ~ sex, data = mba, FUN = mean)
## sex gmat_tot
## 1 1 621.2136
## 2 2 614.1176
# Mean of the raw salary column per value of sex, over the whole data set.
aggregate(formula = salary ~ sex, data = mba, FUN = mean)
## sex salary
## 1 1 37013.62
## 2 2 45121.07
# Split the placed students by whether their GMAT total is above or below the
# mean GMAT total of the whole data set, then compare salaries.
x <- mean(mba$gmat_tot)
# The row index is computed from `placed`, so it must be applied to `placed`.
# Applying it to `mba` (as before) selected unrelated rows of the full data.
above_avg_gmat <- placed[which(placed$gmat_tot > x), ]
below_avg_gmat <- placed[which(placed$gmat_tot < x), ]
mean(above_avg_gmat$salary) # above average candidates
# NOTE: recorded output removed (it came from the mis-indexed subset); re-run to regenerate.
mean(below_avg_gmat$salary) # below average candidates
# NOTE: recorded output removed (it came from the mis-indexed subset); re-run to regenerate.
# Mean of the raw salary column per quarter over the whole data set.
aggregate(salary ~ quarter, data = mba, mean)
## quarter salary
## 1 1 54166.28
## 2 2 37261.01
## 3 3 34037.40
## 4 4 30225.80
# Mean salary per quarter within each GMAT group of placed students.
aggregate(salary ~ quarter, data = above_avg_gmat, mean)
# NOTE: recorded output removed (it came from the mis-indexed subset); re-run to regenerate.
aggregate(salary ~ quarter, data = below_avg_gmat, mean)
# NOTE: recorded output removed (it came from the mis-indexed subset); re-run to regenerate.
# Keep placed students with a positive salary and report their mean salary.
freshers <- subset(placed, salary > 0)
mean(freshers$salary)
## [1] 103030.7
# Mean GMAT total per quarter among students with a recorded salary of 0.
aggregate(formula = gmat_tot ~ quarter, data = notplaced, FUN = mean)
## quarter gmat_tot
## 1 1 631.1111
## 2 2 605.5556
## 3 3 611.7391
## 4 4 614.0909
# Contingency table of sex by salary for placed students.
mytable <- xtabs(formula = ~ sex + salary, data = placed)
print(mytable)
## salary
## sex 64000 77000 78256 82000 85000 86000 88000 88500 90000 92000 93000
## 1 0 1 0 0 1 0 0 1 3 2 2
## 2 1 0 1 1 3 2 1 0 0 1 1
## salary
## sex 95000 96000 96500 97000 98000 99000 100000 100400 101000 101100 101600
## 1 4 3 1 2 6 0 4 1 0 1 1
## 2 3 1 0 0 4 1 5 0 2 0 0
## salary
## sex 102500 103000 104000 105000 106000 107000 107300 107500 108000 110000
## 1 1 1 2 11 2 1 1 1 2 0
## 2 0 0 0 0 1 0 0 0 0 1
## salary
## sex 112000 115000 118000 120000 126710 130000 145800 146000 162000 220000
## 1 3 5 1 3 1 1 1 1 1 0
## 2 0 0 0 1 0 0 0 0 0 1
# Contingency table of salary by years of work experience for placed students.
mytable1 <- xtabs(formula = ~ salary + work_yrs, data = placed)
print(mytable1)
## work_yrs
## salary 0 1 2 3 4 5 6 7 8 10 15 16
## 64000 0 0 1 0 0 0 0 0 0 0 0 0
## 77000 0 0 1 0 0 0 0 0 0 0 0 0
## 78256 0 1 0 0 0 0 0 0 0 0 0 0
## 82000 0 1 0 0 0 0 0 0 0 0 0 0
## 85000 0 1 2 1 0 0 0 0 0 0 0 0
## 86000 0 0 1 1 0 0 0 0 0 0 0 0
## 88000 0 0 0 1 0 0 0 0 0 0 0 0
## 88500 0 0 0 1 0 0 0 0 0 0 0 0
## 90000 0 0 2 0 0 1 0 0 0 0 0 0
## 92000 0 0 3 0 0 0 0 0 0 0 0 0
## 93000 0 0 0 0 1 1 0 0 1 0 0 0
## 95000 1 1 2 2 0 1 0 0 0 0 0 0
## 96000 0 1 2 0 1 0 0 0 0 0 0 0
## 96500 0 0 1 0 0 0 0 0 0 0 0 0
## 97000 0 0 0 1 1 0 0 0 0 0 0 0
## 98000 0 0 7 1 1 0 0 1 0 0 0 0
## 99000 0 0 0 0 0 1 0 0 0 0 0 0
## 100000 0 0 6 1 1 0 1 0 0 0 0 0
## 100400 0 0 0 1 0 0 0 0 0 0 0 0
## 101000 0 0 2 0 0 0 0 0 0 0 0 0
## 101100 0 0 0 0 0 0 0 0 1 0 0 0
## 101600 0 0 0 1 0 0 0 0 0 0 0 0
## 102500 0 0 0 0 0 0 1 0 0 0 0 0
## 103000 0 0 0 1 0 0 0 0 0 0 0 0
## 104000 0 0 0 0 2 0 0 0 0 0 0 0
## 105000 0 0 4 4 0 1 1 0 0 0 0 1
## 106000 0 0 0 0 0 0 2 0 1 0 0 0
## 107000 0 0 1 0 0 0 0 0 0 0 0 0
## 107300 0 0 1 0 0 0 0 0 0 0 0 0
## 107500 0 0 0 1 0 0 0 0 0 0 0 0
## 108000 0 0 0 1 1 0 0 0 0 0 0 0
## 110000 0 0 0 0 0 0 1 0 0 0 0 0
## 112000 0 0 1 0 0 0 1 0 0 0 0 1
## 115000 0 2 0 1 2 0 0 0 0 0 0 0
## 118000 0 0 0 0 0 0 0 0 0 1 0 0
## 120000 0 0 0 1 0 2 0 0 1 0 0 0
## 126710 0 0 0 1 0 0 0 0 0 0 0 0
## 130000 0 0 0 0 1 0 0 0 0 0 0 0
## 145800 0 0 1 0 0 0 0 0 0 0 0 0
## 146000 0 0 0 0 0 0 0 0 0 0 1 0
## 162000 0 1 0 0 0 0 0 0 0 0 0 0
## 220000 0 0 0 0 0 0 0 0 0 0 1 0
# Contingency table of salary by first language for placed students.
mytable2 <- xtabs(formula = ~ salary + frstlang, data = placed)
print(mytable2)
## frstlang
## salary 1 2
## 64000 1 0
## 77000 1 0
## 78256 1 0
## 82000 1 0
## 85000 4 0
## 86000 2 0
## 88000 1 0
## 88500 1 0
## 90000 3 0
## 92000 3 0
## 93000 3 0
## 95000 7 0
## 96000 4 0
## 96500 1 0
## 97000 2 0
## 98000 8 2
## 99000 0 1
## 100000 9 0
## 100400 1 0
## 101000 2 0
## 101100 1 0
## 101600 1 0
## 102500 1 0
## 103000 1 0
## 104000 1 1
## 105000 11 0
## 106000 3 0
## 107000 1 0
## 107300 0 1
## 107500 1 0
## 108000 2 0
## 110000 1 0
## 112000 3 0
## 115000 5 0
## 118000 0 1
## 120000 4 0
## 126710 1 0
## 130000 1 0
## 145800 1 0
## 146000 1 0
## 162000 1 0
## 220000 0 1
# Contingency table of salary by GMAT total for placed students.
mytable3 <- xtabs(formula = ~ salary + gmat_tot, data = placed)
print(mytable3)
## gmat_tot
## salary 500 520 530 540 550 560 570 580 590 600 610 620 630 640 650 660
## 64000 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 77000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 78256 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 82000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 85000 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1
## 86000 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 88000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
## 88500 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 90000 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0
## 92000 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1
## 93000 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0
## 95000 0 0 1 0 0 2 0 0 0 0 2 0 0 0 0 0
## 96000 0 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0
## 96500 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 97000 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0
## 98000 0 0 0 0 0 1 3 1 1 0 1 0 0 0 0 0
## 99000 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## 100000 0 0 0 0 0 2 0 1 0 1 1 0 1 0 2 0
## 100400 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 101000 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0
## 101100 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 101600 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 102500 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 103000 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 104000 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0
## 105000 0 0 0 0 2 0 2 3 0 1 0 1 0 0 1 0
## 106000 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 107000 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
## 107300 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 107500 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 108000 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0
## 110000 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## 112000 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
## 115000 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 0
## 118000 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 120000 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0
## 126710 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## 130000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
## 145800 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 146000 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 162000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 220000 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## gmat_tot
## salary 670 680 700 710 720
## 64000 0 0 0 0 0
## 77000 0 0 0 0 0
## 78256 0 0 0 0 0
## 82000 1 0 0 0 0
## 85000 0 0 1 0 1
## 86000 0 1 0 0 0
## 88000 0 0 0 0 0
## 88500 0 0 0 0 0
## 90000 0 0 0 0 0
## 92000 0 0 0 1 0
## 93000 0 0 0 0 0
## 95000 2 0 0 0 0
## 96000 0 0 0 0 0
## 96500 0 0 0 0 0
## 97000 0 0 0 0 0
## 98000 1 1 0 1 0
## 99000 0 0 0 0 0
## 100000 0 0 0 1 0
## 100400 0 0 0 0 0
## 101000 0 0 0 0 0
## 101100 0 0 0 0 0
## 101600 0 0 0 0 0
## 102500 1 0 0 0 0
## 103000 0 0 0 0 0
## 104000 0 0 0 0 0
## 105000 0 1 0 0 0
## 106000 0 2 0 0 0
## 107000 0 0 0 0 0
## 107300 0 0 0 0 0
## 107500 0 0 0 0 0
## 108000 0 0 0 0 0
## 110000 0 0 0 0 0
## 112000 1 1 0 0 0
## 115000 0 0 0 1 0
## 118000 0 0 0 0 0
## 120000 1 0 1 0 0
## 126710 0 0 0 0 0
## 130000 0 0 0 0 0
## 145800 0 0 0 0 0
## 146000 0 0 0 0 0
## 162000 0 0 1 0 0
## 220000 0 0 0 0 0
# Chi-squared test of independence between salary and years of work experience.
# Most cells of this table are 0 or 1, hence the approximation warning below.
chisq.test(mytable1)
## Warning in chisq.test(mytable1): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable1
## X-squared = 535.23, df = 451, p-value = 0.003809
# Chi-squared test of independence between salary and first language.
# Most cells of this table are 0 or 1, hence the approximation warning below.
chisq.test(mytable2)
## Warning in chisq.test(mytable2): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable2
## X-squared = 69.847, df = 41, p-value = 0.003296
# Chi-squared test of independence between salary and GMAT total.
# Most cells of this table are 0 or 1, hence the approximation warning below.
chisq.test(mytable3)
## Warning in chisq.test(mytable3): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable3
## X-squared = 927.24, df = 820, p-value = 0.005279
# Unpaired, equal-variance two-sample t-test between the salary and the years
# of work experience of placed students.
# NOTE(review): the two samples are measured in different units, so the
# difference in means is hard to interpret -- confirm this is the intended test.
t.test(
  x = placed$salary,
  y = placed$work_yrs,
  paired = FALSE,
  var.equal = TRUE
)
##
## Two Sample t-test
##
## data: placed$salary and placed$work_yrs
## t = 58.516, df = 204, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 99555.62 106498.49
## sample estimates:
## mean of x mean of y
## 1.030307e+05 3.679612e+00
# Linear model of salary on the four GMAT measures, placed students only.
model1 <- lm(
  formula = salary ~ gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc,
  data = placed
)
summary(object = model1)
##
## Call:
## lm(formula = salary ~ gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc,
## data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40370 -8250 -2164 5253 100097
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 109539.54 48054.24 2.279 0.0248 *
## gmat_tot 55.01 181.71 0.303 0.7627
## gmat_qpc 718.40 541.90 1.326 0.1880
## gmat_vpc 546.10 543.85 1.004 0.3178
## gmat_tpc -1663.16 801.57 -2.075 0.0406 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17670 on 98 degrees of freedom
## Multiple R-squared: 0.06089, Adjusted R-squared: 0.02256
## F-statistic: 1.589 on 4 and 98 DF, p-value: 0.1834
# Linear model of salary on satisfaction, work experience and first language,
# placed students only.
model2 <- lm(
  formula = salary ~ satis + work_yrs + frstlang,
  data = placed
)
summary(object = model2)
##
## Call:
## lm(formula = salary ~ satis + work_yrs + frstlang, data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31764 -9640 -604 4816 76193
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 90600.7 13050.3 6.942 4.07e-10 ***
## satis -1913.1 2000.0 -0.957 0.3411
## work_yrs 2506.8 528.6 4.742 7.11e-06 ***
## frstlang 13541.5 6305.7 2.147 0.0342 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15740 on 99 degrees of freedom
## Multiple R-squared: 0.2466, Adjusted R-squared: 0.2237
## F-statistic: 10.8 on 3 and 99 DF, p-value: 3.354e-06
# Linear model of salary on age and sex, placed students only.
model3 <- lm(
  formula = salary ~ age + sex,
  data = placed
)
summary(object = model3)
##
## Call:
## lm(formula = salary ~ age + sex, data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29047 -9444 -1750 5428 84503
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36859.8 14123.5 2.610 0.0105 *
## age 2653.1 475.1 5.584 2.03e-07 ***
## sex -3743.6 3372.6 -1.110 0.2697
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15540 on 100 degrees of freedom
## Multiple R-squared: 0.2588, Adjusted R-squared: 0.244
## F-statistic: 17.46 on 2 and 100 DF, p-value: 3.144e-07
# Flag rows whose recorded salary exceeds 1000 as placed.
mba[["Placed"]] <- mba$salary > 1000
# Cross-tabulate placement against sex and show cell proportions with row and
# column totals, rounded to two decimals.
mytable_sex <- xtabs(formula = ~ Placed + sex, data = mba)
round(ftable(addmargins(prop.table(mytable_sex))), digits = 2)
## sex 1 2 Sum
## Placed
## FALSE 0.49 0.14 0.62
## TRUE 0.26 0.11 0.38
## Sum 0.75 0.25 1.00
# Cross-tabulate placement against first language and show cell proportions
# with row and column totals, rounded to two decimals.
mytable_frstlang <- xtabs(formula = ~ Placed + frstlang, data = mba)
round(ftable(addmargins(prop.table(mytable_frstlang))), digits = 2)
## frstlang 1 2 Sum
## Placed
## FALSE 0.53 0.09 0.62
## TRUE 0.35 0.03 0.38
## Sum 0.88 0.12 1.00
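# Hypothesis H1: Satisfaction with the MBA course does not depend on salary. (Note: the two chi-squared tests that follow check whether placement is independent of sex and of first language.)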
# Chi-squared test of independence between placement and sex (2 x 2 table).
chisq.test(x = mytable_sex)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: mytable_sex
## X-squared = 2.033, df = 1, p-value = 0.1539
# Chi-squared test of independence between placement and first language
# (2 x 2 table).
chisq.test(x = mytable_frstlang)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: mytable_frstlang
## X-squared = 3.0938, df = 1, p-value = 0.07859
# Re-read the survey, treating the value 999 as missing (NA) in every column.
# TRUE is spelled out because T is an ordinary, reassignable variable.
my_data <- read.csv(
  "MBA Starting Salaries Data.csv",
  header = TRUE,
  na.strings = c("999")
)
# Count missing values per column; vapply pins the result to one integer each.
vapply(my_data, function(column) sum(is.na(column)), integer(1))
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg
## 0 0 0 0 0 0 0 0
## quarter work_yrs frstlang salary satis
## 0 0 0 35 0
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.4, built: 2015-12-05)
## ## Copyright (C) 2005-2017 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
# Visualise which cells of my_data are missing.
missmap(my_data, main = "Missing values vs observed")
# Replace missing salaries with the mean of the observed salaries.
# TRUE is spelled out because T is an ordinary, reassignable variable.
my_data$salary[is.na(my_data$salary)] <- mean(my_data$salary, na.rm = TRUE)
# Work on a copy of mba and add a logical flag for salaries above 1000.
my_data1 <- mba
my_data1$GotPlaced <- my_data1$salary > 1000
# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(my_data1)
}
# Store the Placed column as a factor and show the resulting structure.
my_data1$Placed <- factor(my_data1$Placed)
str(my_data1)
## 'data.frame': 274 obs. of 15 variables:
## $ age : int 23 24 24 24 24 24 25 25 25 25 ...
## $ sex : int 2 1 1 1 2 1 1 2 1 1 ...
## $ gmat_tot : int 620 610 670 570 710 640 610 650 630 680 ...
## $ gmat_qpc : int 77 90 99 56 93 82 89 88 79 99 ...
## $ gmat_vpc : int 87 71 78 81 98 89 74 89 91 81 ...
## $ gmat_tpc : int 87 87 95 75 98 91 87 92 89 96 ...
## $ s_avg : num 3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
## $ f_avg : num 3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
## $ quarter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ work_yrs : int 2 2 2 1 2 2 2 2 2 2 ...
## $ frstlang : int 1 1 1 1 1 1 1 1 2 1 ...
## $ salary : int 0 0 0 0 999 0 0 0 999 998 ...
## $ satis : int 7 6 6 7 5 6 5 6 4 998 ...
## $ Placed : Factor w/ 2 levels "FALSE","TRUE": 1 1 1 1 1 1 1 1 1 1 ...
## $ GotPlaced: logi FALSE FALSE FALSE FALSE FALSE FALSE ...
# Logical placement flag: salary above 1000.
# NOTE(review): missing salaries were replaced by the column mean earlier, so
# those rows are flagged as placed whenever that mean exceeds 1000 -- confirm
# that this is intended.
my_data$GotPlaced <- my_data$salary > 1000
# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(my_data)
}
my_data1$Placed <- factor(my_data1$Placed)
# Recode the flag as a factor with labels "1" (placed) and "0" (not placed).
# This is done in one step rather than by overwriting TRUE/FALSE with strings,
# which relied on implicit logical-to-character coercion.
my_data$GotPlaced <- factor(ifelse(my_data$GotPlaced, "1", "0"))
# Hold-out split by row position: the first 150 rows are used for fitting and
# the remaining rows for evaluation.
# NOTE(review): the split is not randomised; if the file is ordered, the two
# parts may differ systematically -- confirm this is intended.
train <- my_data[1:150, ]
test <- my_data[151:nrow(my_data), ]

# Logistic regression for placement. Two corrections to the earlier fit:
#   * it is fitted on `train` only, so `test` is genuinely held out (the
#     earlier fit used all of my_data, including the test rows);
#   * `salary` is excluded, because GotPlaced is derived from salary > 1000
#     and is therefore predicted perfectly by it (the earlier fit did not
#     converge and reported accuracy and AUC of exactly 1).
model <- glm(
  GotPlaced ~ . - salary,
  family = binomial(link = "logit"),
  data = train
)
# NOTE: the outputs recorded for the calls below came from the earlier,
# leaking fit and were removed; re-run to regenerate them.
summary(model)
# Sequential analysis of deviance, terms added first to last.
anova(model, test = "Chisq")
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
# Pseudo-R-squared measures for the fitted model.
pR2(model)

# Predicted placement probabilities for the held-out rows, thresholded at 0.5.
fitted.results <- predict(model, newdata = test, type = "response")
fitted.results <- ifelse(fitted.results > 0.5, 1, 0)
# GotPlaced is a factor with labels "0"/"1"; compare on the label text.
misClasificError <- mean(fitted.results != as.character(test$GotPlaced))
print(paste('Accuracy', 1 - misClasificError))
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
# ROC curve and area under the curve on the held-out rows.
p <- predict(model, newdata = test, type = "response")
pr <- prediction(p, test$GotPlaced)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc