MBA Starting Salaries

Reading the data

mba <- read.csv(paste("MBA Starting Salaries Data.csv", sep=" "))

Creating summary statistics

library(psych)
str(mba)
## 'data.frame':    274 obs. of  13 variables:
##  $ age     : int  23 24 24 24 24 24 25 25 25 25 ...
##  $ sex     : int  2 1 1 1 2 1 1 2 1 1 ...
##  $ gmat_tot: int  620 610 670 570 710 640 610 650 630 680 ...
##  $ gmat_qpc: int  77 90 99 56 93 82 89 88 79 99 ...
##  $ gmat_vpc: int  87 71 78 81 98 89 74 89 91 81 ...
##  $ gmat_tpc: int  87 87 95 75 98 91 87 92 89 96 ...
##  $ s_avg   : num  3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
##  $ f_avg   : num  3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
##  $ quarter : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ work_yrs: int  2 2 2 1 2 2 2 2 2 2 ...
##  $ frstlang: int  1 1 1 1 1 1 1 1 2 1 ...
##  $ salary  : int  0 0 0 0 999 0 0 0 999 998 ...
##  $ satis   : int  7 6 6 7 5 6 5 6 4 998 ...
summary(mba)
##       age             sex           gmat_tot        gmat_qpc    
##  Min.   :22.00   Min.   :1.000   Min.   :450.0   Min.   :28.00  
##  1st Qu.:25.00   1st Qu.:1.000   1st Qu.:580.0   1st Qu.:72.00  
##  Median :27.00   Median :1.000   Median :620.0   Median :83.00  
##  Mean   :27.36   Mean   :1.248   Mean   :619.5   Mean   :80.64  
##  3rd Qu.:29.00   3rd Qu.:1.000   3rd Qu.:660.0   3rd Qu.:93.00  
##  Max.   :48.00   Max.   :2.000   Max.   :790.0   Max.   :99.00  
##     gmat_vpc        gmat_tpc        s_avg           f_avg      
##  Min.   :16.00   Min.   : 0.0   Min.   :2.000   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:78.0   1st Qu.:2.708   1st Qu.:2.750  
##  Median :81.00   Median :87.0   Median :3.000   Median :3.000  
##  Mean   :78.32   Mean   :84.2   Mean   :3.025   Mean   :3.062  
##  3rd Qu.:91.00   3rd Qu.:94.0   3rd Qu.:3.300   3rd Qu.:3.250  
##  Max.   :99.00   Max.   :99.0   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs         frstlang         salary      
##  Min.   :1.000   Min.   : 0.000   Min.   :1.000   Min.   :     0  
##  1st Qu.:1.250   1st Qu.: 2.000   1st Qu.:1.000   1st Qu.:     0  
##  Median :2.000   Median : 3.000   Median :1.000   Median :   999  
##  Mean   :2.478   Mean   : 3.872   Mean   :1.117   Mean   : 39026  
##  3rd Qu.:3.000   3rd Qu.: 4.000   3rd Qu.:1.000   3rd Qu.: 97000  
##  Max.   :4.000   Max.   :22.000   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :  1.0  
##  1st Qu.:  5.0  
##  Median :  6.0  
##  Mean   :172.2  
##  3rd Qu.:  7.0  
##  Max.   :998.0
describe(mba)
##          vars   n     mean       sd median  trimmed     mad min    max
## age         1 274    27.36     3.71     27    26.76    2.97  22     48
## sex         2 274     1.25     0.43      1     1.19    0.00   1      2
## gmat_tot    3 274   619.45    57.54    620   618.86   59.30 450    790
## gmat_qpc    4 274    80.64    14.87     83    82.31   14.83  28     99
## gmat_vpc    5 274    78.32    16.86     81    80.33   14.83  16     99
## gmat_tpc    6 274    84.20    14.02     87    86.12   11.86   0     99
## s_avg       7 274     3.03     0.38      3     3.03    0.44   2      4
## f_avg       8 274     3.06     0.53      3     3.09    0.37   0      4
## quarter     9 274     2.48     1.11      2     2.47    1.48   1      4
## work_yrs   10 274     3.87     3.23      3     3.29    1.48   0     22
## frstlang   11 274     1.12     0.32      1     1.02    0.00   1      2
## salary     12 274 39025.69 50951.56    999 33607.86 1481.12   0 220000
## satis      13 274   172.18   371.61      6    91.50    1.48   1    998
##           range  skew kurtosis      se
## age          26  2.16     6.45    0.22
## sex           1  1.16    -0.66    0.03
## gmat_tot    340 -0.01     0.06    3.48
## gmat_qpc     71 -0.92     0.30    0.90
## gmat_vpc     83 -1.04     0.74    1.02
## gmat_tpc     99 -2.28     9.02    0.85
## s_avg         2 -0.06    -0.38    0.02
## f_avg         4 -2.08    10.85    0.03
## quarter       3  0.02    -1.35    0.07
## work_yrs     22  2.78     9.80    0.20
## frstlang      1  2.37     3.65    0.02
## salary   220000  0.70    -1.05 3078.10
## satis       997  1.77     1.13   22.45

Vizualizing the distribution of each variables

Vizualizing the sex ratio of admits
barplot(table(mba$sex),col="orange")

We can observe that nearly 80% of admits are male and slightly over 20% are female
Vizualizing GMAT total distribution, Quant percentile distribution, Verbal percentile distribution, Total percentile distribution
#par(mfrow=c(1,4))
boxplot(mba$gmat_tot, main="GMAT score distribution",horizontal=TRUE,col="bisque")

boxplot(mba$gmat_qpc, main="QPC distribution",horizontal=TRUE,col="grey")

boxplot(mba$gmat_vpc, main="VPC distribution",horizontal=TRUE,col="peachpuff")

boxplot(mba$gmat_tpc, main="TPC distribution",horizontal=TRUE,col="violet")

Vizualizing Spring MBA average and Fall MBA average
par(mfrow=c(1,2))
boxplot(mba$s_avg, main="spring average",col="gold")
boxplot(mba$f_avg, main="fall average",col="darkgreen")

Vizualizing age distribution
barplot(table(mba$age),col="darkblue")

Vizualizing experience distribution
barplot(table(mba$work_yrs),main="Number of years of experience",col="lightblue")

Vizualizing first language distribution
barplot(table(mba$frstlang),main="First Language",xlab="1->English 2-> Others",col="brown")

Vizualizing salary distribution
hist(mba$salary, main="Salary distribution",xlim=c(50000,220000),ylim=c(0,50),xlab="Salary",ylab="count",col="beige")

Vizualizing Degree of Satisfaction distribution
#par(mfrow=c(1,2))
hist(mba$satis, main="Degree of satisfaction with MBA program", xlab=" (1= low, 7 = high satisfaction)", xlim = c(1,7),breaks = (1:ceiling(max(mba$satis)/1)*1),col="maroon")

barplot(table(mba$satis),col="magenta")

Scatterplots to understand correlation between variables

Vizualizing GMAT total vs Salary, Quartile ranking vs Salary, Work experience vs Salary
#par(mfrow=c(1,3))
with(mba, plot(gmat_tot,salary ,cex=1))

with(mba, plot(quarter, salary,cex=1))

with(mba, plot(work_yrs, salary,cex=1))

We can observe that the person with highest salary is not the one with highest GMAT score instead with below average GMAT score.
Vizualizing Salary vs Satisfaction
plot(mba$salary,mba$satis,cex=1,ylim=c(1,7),ylab="satisfaction")

Vizualizing Salary vs Sex
with(mba,plot(salary,sex))

#barplot(table(mba$salary,mba$sex))
Vizualizing GMAT total vs Sex, GMAT total vs Spring average, GMAT total vs Fall average
par(mfrow=c(1,3))
with(mba, plot(gmat_tot,sex,cex=1))
with(mba, plot(gmat_tot, s_avg,cex=1))
with(mba, plot(gmat_tot, f_avg,cex=1))

Vizualizing through ScatterPlotMatrix
library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
scatterplotMatrix(formula=~gmat_tot+gmat_qpc + gmat_vpc+gmat_tpc,data=mba,diagonal="histogram")

scatterplotMatrix(formula=~s_avg + f_avg+work_yrs+salary,data=mba)

Vizualizing correlation between variables
library(corrgram)
library(corrplot)
## corrplot 0.84 loaded
x<-cov(mba[, c(1:10)]) 
corrplot(corr=cor(mba[ , c(1:10)],use="complete.obs"), 
         method =c("color"),main="corrplot")

corrgram(mba,order = NULL,panel=panel.cor,lower.panel = panel.shade,text.panel=panel.txt,main="Corrgram")

cov2cor(x)
##                  age          sex    gmat_tot    gmat_qpc    gmat_vpc
## age       1.00000000 -0.028106442 -0.14593840 -0.21616985 -0.04417547
## sex      -0.02810644  1.000000000 -0.05336820 -0.16377435  0.07488782
## gmat_tot -0.14593840 -0.053368202  1.00000000  0.72473781  0.74839187
## gmat_qpc -0.21616985 -0.163774346  0.72473781  1.00000000  0.15218014
## gmat_vpc -0.04417547  0.074887816  0.74839187  0.15218014  1.00000000
## gmat_tpc -0.16990307 -0.008090213  0.84779965  0.65137754  0.66621604
## s_avg     0.14970402  0.127115144  0.11311702 -0.02984873  0.20445365
## f_avg    -0.01744806  0.091663891  0.10442409  0.07370455  0.07592225
## quarter  -0.04967221 -0.133533171 -0.09223903  0.03636638 -0.17460736
## work_yrs  0.85829810 -0.011296374 -0.18235434 -0.23660827 -0.06639049
##              gmat_tpc       s_avg       f_avg     quarter    work_yrs
## age      -0.169903066  0.14970402 -0.01744806 -0.04967221  0.85829810
## sex      -0.008090213  0.12711514  0.09166389 -0.13353317 -0.01129637
## gmat_tot  0.847799647  0.11311702  0.10442409 -0.09223903 -0.18235434
## gmat_qpc  0.651377538 -0.02984873  0.07370455  0.03636638 -0.23660827
## gmat_vpc  0.666216035  0.20445365  0.07592225 -0.17460736 -0.06639049
## gmat_tpc  1.000000000  0.11736245  0.07973210 -0.08303535 -0.17336186
## s_avg     0.117362449  1.00000000  0.55062139 -0.76211664  0.12929271
## f_avg     0.079732099  0.55062139  1.00000000 -0.44750637 -0.03905692
## quarter  -0.083035351 -0.76211664 -0.44750637  1.00000000 -0.08602641
## work_yrs -0.173361859  0.12929271 -0.03905692 -0.08602641  1.00000000
2a. Identify the crucial managerially relevant question(s) raised in the case
1. Is average gmat score of women > average gmat score of men?
  1. Is average salary of men > average salary of women?
  2. Do candidates with above average gmat score are placed better than below average gmat score candidates?
  3. Which quartile is highly populated?
  4. What is the average salary of people with 0-2 years of work experience?
  5. What is the relation candidates who are not yet placed with gmat score
Figure out how to answer these questions using the given dataset, using R
notplaced <- mba[which(mba$salary==0), ]
placed<- mba[which (mba$salary > 1000)  , ]
1. Comparing gmat scores of men and women
aggregate(gmat_tot ~ sex, data = mba, mean)
##   sex gmat_tot
## 1   1 621.2136
## 2   2 614.1176
2. Comparing salaries of men and women
aggregate(salary ~ sex, data = mba, mean)
##   sex   salary
## 1   1 37013.62
## 2   2 45121.07
3. Comparing below average and above average gmat scorers
x<-mean(mba$gmat_tot)
above_avg_gmat<- mba[which (placed$gmat_tot > x)  , ]
below_avg_gmat<- mba[which (placed$gmat_tot < x)  , ]

mean(above_avg_gmat$salary) #above average candidates
## [1] 34119.94
mean(below_avg_gmat$salary) #below average candidates
## [1] 38999.37
We can observe that below average candidates have performed better at placements
4. Analyzing quartile
aggregate(salary ~ quarter, data = mba, mean)
##   quarter   salary
## 1       1 54166.28
## 2       2 37261.01
## 3       3 34037.40
## 4       4 30225.80
aggregate(salary~quarter,data=above_avg_gmat,mean)
##   quarter     salary
## 1       1 50930.3333
## 2       2   499.1667
aggregate(salary~quarter,data=below_avg_gmat,mean)
##   quarter   salary
## 1       1 57696.39
## 2       2   436.75
5. Average salary of candidates with 0-2 years of experience
freshers <- placed[which (placed$salary >0)  , ]
mean(freshers$salary)
## [1] 103030.7
6. Analyzing not placed students
aggregate(gmat_tot~quarter,data=notplaced,mean)
##   quarter gmat_tot
## 1       1 631.1111
## 2       2 605.5556
## 3       3 611.7391
## 4       4 614.0909
2b. Who got how much salary?
Contingency table showing the affect of variousfactors on the starting salary
    mytable <-xtabs(~sex+salary,data=placed)
    mytable
##    salary
## sex 64000 77000 78256 82000 85000 86000 88000 88500 90000 92000 93000
##   1     0     1     0     0     1     0     0     1     3     2     2
##   2     1     0     1     1     3     2     1     0     0     1     1
##    salary
## sex 95000 96000 96500 97000 98000 99000 100000 100400 101000 101100 101600
##   1     4     3     1     2     6     0      4      1      0      1      1
##   2     3     1     0     0     4     1      5      0      2      0      0
##    salary
## sex 102500 103000 104000 105000 106000 107000 107300 107500 108000 110000
##   1      1      1      2     11      2      1      1      1      2      0
##   2      0      0      0      0      1      0      0      0      0      1
##    salary
## sex 112000 115000 118000 120000 126710 130000 145800 146000 162000 220000
##   1      3      5      1      3      1      1      1      1      1      0
##   2      0      0      0      1      0      0      0      0      0      1
 mytable1 <-xtabs(~salary+work_yrs,data=placed)
    mytable1
##         work_yrs
## salary   0 1 2 3 4 5 6 7 8 10 15 16
##   64000  0 0 1 0 0 0 0 0 0  0  0  0
##   77000  0 0 1 0 0 0 0 0 0  0  0  0
##   78256  0 1 0 0 0 0 0 0 0  0  0  0
##   82000  0 1 0 0 0 0 0 0 0  0  0  0
##   85000  0 1 2 1 0 0 0 0 0  0  0  0
##   86000  0 0 1 1 0 0 0 0 0  0  0  0
##   88000  0 0 0 1 0 0 0 0 0  0  0  0
##   88500  0 0 0 1 0 0 0 0 0  0  0  0
##   90000  0 0 2 0 0 1 0 0 0  0  0  0
##   92000  0 0 3 0 0 0 0 0 0  0  0  0
##   93000  0 0 0 0 1 1 0 0 1  0  0  0
##   95000  1 1 2 2 0 1 0 0 0  0  0  0
##   96000  0 1 2 0 1 0 0 0 0  0  0  0
##   96500  0 0 1 0 0 0 0 0 0  0  0  0
##   97000  0 0 0 1 1 0 0 0 0  0  0  0
##   98000  0 0 7 1 1 0 0 1 0  0  0  0
##   99000  0 0 0 0 0 1 0 0 0  0  0  0
##   100000 0 0 6 1 1 0 1 0 0  0  0  0
##   100400 0 0 0 1 0 0 0 0 0  0  0  0
##   101000 0 0 2 0 0 0 0 0 0  0  0  0
##   101100 0 0 0 0 0 0 0 0 1  0  0  0
##   101600 0 0 0 1 0 0 0 0 0  0  0  0
##   102500 0 0 0 0 0 0 1 0 0  0  0  0
##   103000 0 0 0 1 0 0 0 0 0  0  0  0
##   104000 0 0 0 0 2 0 0 0 0  0  0  0
##   105000 0 0 4 4 0 1 1 0 0  0  0  1
##   106000 0 0 0 0 0 0 2 0 1  0  0  0
##   107000 0 0 1 0 0 0 0 0 0  0  0  0
##   107300 0 0 1 0 0 0 0 0 0  0  0  0
##   107500 0 0 0 1 0 0 0 0 0  0  0  0
##   108000 0 0 0 1 1 0 0 0 0  0  0  0
##   110000 0 0 0 0 0 0 1 0 0  0  0  0
##   112000 0 0 1 0 0 0 1 0 0  0  0  1
##   115000 0 2 0 1 2 0 0 0 0  0  0  0
##   118000 0 0 0 0 0 0 0 0 0  1  0  0
##   120000 0 0 0 1 0 2 0 0 1  0  0  0
##   126710 0 0 0 1 0 0 0 0 0  0  0  0
##   130000 0 0 0 0 1 0 0 0 0  0  0  0
##   145800 0 0 1 0 0 0 0 0 0  0  0  0
##   146000 0 0 0 0 0 0 0 0 0  0  1  0
##   162000 0 1 0 0 0 0 0 0 0  0  0  0
##   220000 0 0 0 0 0 0 0 0 0  0  1  0
Work experience is an added advantage while getting placed
 mytable2<-xtabs(~salary+frstlang,data=placed)
    mytable2
##         frstlang
## salary    1  2
##   64000   1  0
##   77000   1  0
##   78256   1  0
##   82000   1  0
##   85000   4  0
##   86000   2  0
##   88000   1  0
##   88500   1  0
##   90000   3  0
##   92000   3  0
##   93000   3  0
##   95000   7  0
##   96000   4  0
##   96500   1  0
##   97000   2  0
##   98000   8  2
##   99000   0  1
##   100000  9  0
##   100400  1  0
##   101000  2  0
##   101100  1  0
##   101600  1  0
##   102500  1  0
##   103000  1  0
##   104000  1  1
##   105000 11  0
##   106000  3  0
##   107000  1  0
##   107300  0  1
##   107500  1  0
##   108000  2  0
##   110000  1  0
##   112000  3  0
##   115000  5  0
##   118000  0  1
##   120000  4  0
##   126710  1  0
##   130000  1  0
##   145800  1  0
##   146000  1  0
##   162000  1  0
##   220000  0  1
English communication is important
mytable3<-xtabs(~salary+gmat_tot,data=placed)
    mytable3
##         gmat_tot
## salary   500 520 530 540 550 560 570 580 590 600 610 620 630 640 650 660
##   64000    0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0
##   77000    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1
##   78256    0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   82000    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   85000    0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   1
##   86000    0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0
##   88000    0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0
##   88500    0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0
##   90000    0   0   0   0   0   0   0   1   0   0   0   0   1   0   1   0
##   92000    0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   1
##   93000    0   0   0   1   0   0   0   0   0   0   1   1   0   0   0   0
##   95000    0   0   1   0   0   2   0   0   0   0   2   0   0   0   0   0
##   96000    0   0   0   0   0   1   0   0   1   1   0   0   0   0   1   0
##   96500    1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   97000    0   0   0   0   0   0   0   1   0   0   0   1   0   0   0   0
##   98000    0   0   0   0   0   1   3   1   1   0   1   0   0   0   0   0
##   99000    0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0
##   100000   0   0   0   0   0   2   0   1   0   1   1   0   1   0   2   0
##   100400   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0
##   101000   0   0   0   0   0   0   0   0   0   1   0   1   0   0   0   0
##   101100   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1
##   101600   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0
##   102500   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   103000   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0
##   104000   0   0   1   0   0   1   0   0   0   0   0   0   0   0   0   0
##   105000   0   0   0   0   2   0   2   3   0   1   0   1   0   0   1   0
##   106000   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0
##   107000   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0
##   107300   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1
##   107500   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0
##   108000   0   0   0   0   0   0   1   0   0   1   0   0   0   0   0   0
##   110000   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0
##   112000   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0
##   115000   0   0   0   1   0   0   1   0   0   0   0   1   1   0   0   0
##   118000   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0
##   120000   0   0   0   0   0   0   0   0   0   2   0   0   0   0   0   0
##   126710   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0
##   130000   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0
##   145800   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0
##   146000   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0
##   162000   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   220000   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##         gmat_tot
## salary   670 680 700 710 720
##   64000    0   0   0   0   0
##   77000    0   0   0   0   0
##   78256    0   0   0   0   0
##   82000    1   0   0   0   0
##   85000    0   0   1   0   1
##   86000    0   1   0   0   0
##   88000    0   0   0   0   0
##   88500    0   0   0   0   0
##   90000    0   0   0   0   0
##   92000    0   0   0   1   0
##   93000    0   0   0   0   0
##   95000    2   0   0   0   0
##   96000    0   0   0   0   0
##   96500    0   0   0   0   0
##   97000    0   0   0   0   0
##   98000    1   1   0   1   0
##   99000    0   0   0   0   0
##   100000   0   0   0   1   0
##   100400   0   0   0   0   0
##   101000   0   0   0   0   0
##   101100   0   0   0   0   0
##   101600   0   0   0   0   0
##   102500   1   0   0   0   0
##   103000   0   0   0   0   0
##   104000   0   0   0   0   0
##   105000   0   1   0   0   0
##   106000   0   2   0   0   0
##   107000   0   0   0   0   0
##   107300   0   0   0   0   0
##   107500   0   0   0   0   0
##   108000   0   0   0   0   0
##   110000   0   0   0   0   0
##   112000   1   1   0   0   0
##   115000   0   0   0   1   0
##   118000   0   0   0   0   0
##   120000   1   0   1   0   0
##   126710   0   0   0   0   0
##   130000   0   0   0   0   0
##   145800   0   0   0   0   0
##   146000   0   0   0   0   0
##   162000   0   0   1   0   0
##   220000   0   0   0   0   0
Chisquare test
chisq.test(mytable1)
## Warning in chisq.test(mytable1): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mytable1
## X-squared = 535.23, df = 451, p-value = 0.003809
Since p<0.01 there is a relationship betweeen work experience and salary
chisq.test(mytable2)
## Warning in chisq.test(mytable2): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mytable2
## X-squared = 69.847, df = 41, p-value = 0.003296
Since p<0.01 we can say that there is a relationship between first language and salary
 chisq.test(mytable3)
## Warning in chisq.test(mytable3): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mytable3
## X-squared = 927.24, df = 820, p-value = 0.005279
Since p<0.01 we see there exists a relationship between Total GMAT score and starting salary.
T-Test
t.test(placed$salary,placed$work_yrs,var.equal=TRUE, paired=FALSE)
## 
##  Two Sample t-test
## 
## data:  placed$salary and placed$work_yrs
## t = 58.516, df = 204, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   99555.62 106498.49
## sample estimates:
##    mean of x    mean of y 
## 1.030307e+05 3.679612e+00
Regression Model
 model1 <- lm(salary ~gmat_tot+gmat_qpc+gmat_vpc+gmat_tpc, data = placed)
summary(model1)
## 
## Call:
## lm(formula = salary ~ gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc, 
##     data = placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -40370  -8250  -2164   5253 100097 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 109539.54   48054.24   2.279   0.0248 *
## gmat_tot        55.01     181.71   0.303   0.7627  
## gmat_qpc       718.40     541.90   1.326   0.1880  
## gmat_vpc       546.10     543.85   1.004   0.3178  
## gmat_tpc     -1663.16     801.57  -2.075   0.0406 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17670 on 98 degrees of freedom
## Multiple R-squared:  0.06089,    Adjusted R-squared:  0.02256 
## F-statistic: 1.589 on 4 and 98 DF,  p-value: 0.1834
Gmat_tpc is a significant variable in model 1 The multiple R squared value indicates that the model accounts for 6% of the variance in the variables
Model 2
model2<- lm(salary ~satis+work_yrs+frstlang, data = placed)
summary(model2)
## 
## Call:
## lm(formula = salary ~ satis + work_yrs + frstlang, data = placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -31764  -9640   -604   4816  76193 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  90600.7    13050.3   6.942 4.07e-10 ***
## satis        -1913.1     2000.0  -0.957   0.3411    
## work_yrs      2506.8      528.6   4.742 7.11e-06 ***
## frstlang     13541.5     6305.7   2.147   0.0342 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15740 on 99 degrees of freedom
## Multiple R-squared:  0.2466, Adjusted R-squared:  0.2237 
## F-statistic:  10.8 on 3 and 99 DF,  p-value: 3.354e-06
Work_yrs and frstlang are significant variables in model 2 The multiple R squared value indicates that the model accounts for 24.66% of the variance in the variables The residual error(15740) can be thought of as the average error in predicting salary using work experience, job satisfaction and first language.
Model 3
model3 <- lm(salary ~age+sex, data = placed)
summary(model3)
## 
## Call:
## lm(formula = salary ~ age + sex, data = placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -29047  -9444  -1750   5428  84503 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  36859.8    14123.5   2.610   0.0105 *  
## age           2653.1      475.1   5.584 2.03e-07 ***
## sex          -3743.6     3372.6  -1.110   0.2697    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15540 on 100 degrees of freedom
## Multiple R-squared:  0.2588, Adjusted R-squared:  0.244 
## F-statistic: 17.46 on 2 and 100 DF,  p-value: 3.144e-07
Age is a significant factor in model 3
We see that model 2 is better than model 1 and model 3, with a higher R-squared value.
2c. Comparing students with job and without job
mba$Placed = (mba$salary >1000)
mytable_sex <- xtabs(~ Placed+sex, data=mba)
round(ftable(addmargins(prop.table(mytable_sex))),2)
##        sex    1    2  Sum
## Placed                   
## FALSE      0.49 0.14 0.62
## TRUE       0.26 0.11 0.38
## Sum        0.75 0.25 1.00
26% of men were placed and 11% of women were placed and on the contrary 49% of men were not placed and only 14% of women were not placed.
mytable_frstlang <- xtabs(~ Placed+frstlang, data=mba)
round(ftable(addmargins(prop.table(mytable_frstlang))),2)
##        frstlang    1    2  Sum
## Placed                        
## FALSE           0.53 0.09 0.62
## TRUE            0.35 0.03 0.38
## Sum             0.88 0.12 1.00
Chi square test

Hypothesis H1: Satisfaction of MBA course does not depend on salary

chisq.test(mytable_sex)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  mytable_sex
## X-squared = 2.033, df = 1, p-value = 0.1539
chisq.test(mytable_frstlang)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  mytable_frstlang
## X-squared = 3.0938, df = 1, p-value = 0.07859
2d. Challenge
Data cleaning process
my_data<- read.csv('MBA Starting Salaries Data.csv',header=T,na.strings=c("999"))
sapply(my_data,function(x) sum(is.na(x)))
##      age      sex gmat_tot gmat_qpc gmat_vpc gmat_tpc    s_avg    f_avg 
##        0        0        0        0        0        0        0        0 
##  quarter work_yrs frstlang   salary    satis 
##        0        0        0       35        0
library(Amelia)
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.4, built: 2015-12-05)
## ## Copyright (C) 2005-2017 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
missmap(my_data, main = "Missing values vs observed")

Taking care of missing values
my_data$salary[is.na(my_data$salary)] <- mean(my_data$salary,na.rm=T)
Model fitting
my_data1<- mba
my_data1$GotPlaced<-(my_data1$salary>1000)
View(my_data1)
my_data1$Placed<-factor(my_data1$Placed)
str(my_data1)
## 'data.frame':    274 obs. of  15 variables:
##  $ age      : int  23 24 24 24 24 24 25 25 25 25 ...
##  $ sex      : int  2 1 1 1 2 1 1 2 1 1 ...
##  $ gmat_tot : int  620 610 670 570 710 640 610 650 630 680 ...
##  $ gmat_qpc : int  77 90 99 56 93 82 89 88 79 99 ...
##  $ gmat_vpc : int  87 71 78 81 98 89 74 89 91 81 ...
##  $ gmat_tpc : int  87 87 95 75 98 91 87 92 89 96 ...
##  $ s_avg    : num  3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
##  $ f_avg    : num  3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
##  $ quarter  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ work_yrs : int  2 2 2 1 2 2 2 2 2 2 ...
##  $ frstlang : int  1 1 1 1 1 1 1 1 2 1 ...
##  $ salary   : int  0 0 0 0 999 0 0 0 999 998 ...
##  $ satis    : int  7 6 6 7 5 6 5 6 4 998 ...
##  $ Placed   : Factor w/ 2 levels "FALSE","TRUE": 1 1 1 1 1 1 1 1 1 1 ...
##  $ GotPlaced: logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
my_data$GotPlaced = (my_data$salary >1000)
View(my_data)
my_data1$Placed <- factor(my_data1$Placed)

my_data$GotPlaced[my_data$GotPlaced == TRUE] <- '1' # placed
my_data$GotPlaced[my_data$GotPlaced == FALSE] <- '0' # not placed
my_data$GotPlaced <- factor(my_data$GotPlaced)
train <- my_data[1:150,]
test <- my_data[151:274,]
model <- glm(GotPlaced ~.,family=binomial(link='logit'),data=my_data)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model)
## 
## Call:
## glm(formula = GotPlaced ~ ., family = binomial(link = "logit"), 
##     data = my_data)
## 
## Deviance Residuals: 
##        Min          1Q      Median          3Q         Max  
## -9.246e-06  -3.906e-06   2.110e-08   2.110e-08   1.549e-05  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.512e+01  2.813e+05   0.000    1.000
## age          1.617e-02  5.819e+03   0.000    1.000
## sex         -3.923e-01  2.977e+04   0.000    1.000
## gmat_tot    -1.791e-03  6.870e+02   0.000    1.000
## gmat_qpc     1.476e-02  1.983e+03   0.000    1.000
## gmat_vpc     2.512e-02  1.994e+03   0.000    1.000
## gmat_tpc    -1.122e-02  9.011e+02   0.000    1.000
## s_avg       -9.101e-01  5.947e+04   0.000    1.000
## f_avg        1.556e-01  3.906e+04   0.000    1.000
## quarter      4.410e-02  1.672e+04   0.000    1.000
## work_yrs    -1.223e-02  6.730e+03   0.000    1.000
## frstlang     1.006e+00  4.402e+04   0.000    1.000
## salary       1.092e-03  5.254e-01   0.002    0.998
## satis       -2.740e-03  5.574e+01   0.000    1.000
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3.7983e+02  on 273  degrees of freedom
## Residual deviance: 5.9230e-09  on 260  degrees of freedom
## AIC: 28
## 
## Number of Fisher Scoring iterations: 25
Interpreting the results of our logistic regression model
anova(model, test="Chisq")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: GotPlaced
## 
## Terms added sequentially (first to last)
## 
## 
##          Df Deviance Resid. Df Resid. Dev Pr(>Chi)    
## NULL                       273     379.83             
## age       1     3.32       272     376.51  0.06845 .  
## sex       1     0.19       271     376.32  0.65984    
## gmat_tot  1     0.09       270     376.23  0.76724    
## gmat_qpc  1     0.36       269     375.87  0.54850    
## gmat_vpc  1     0.12       268     375.75  0.73432    
## gmat_tpc  1     0.14       267     375.61  0.70631    
## s_avg     1     0.30       266     375.32  0.58611    
## f_avg     1     0.37       265     374.94  0.54236    
## quarter   1     0.36       264     374.59  0.54962    
## work_yrs  1     0.64       263     373.94  0.42310    
## frstlang  1     0.00       262     373.94  0.98508    
## salary    1   373.94       261       0.00  < 2e-16 ***
## satis     1     0.00       260       0.00  0.99999    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
pR2(model)
##           llh       llhNull            G2      McFadden          r2ML 
## -2.961492e-09 -1.899150e+02  3.798301e+02  1.000000e+00  7.499867e-01 
##          r2CU 
##  1.000000e+00
Assessing the predictive ability of the model
fitted.results <- predict(model,newdata=subset(test,select=c(1:14)),type='response')
fitted.results <- ifelse(fitted.results > 0.5,1,0)

misClasificError <- mean(fitted.results != test$GotPlaced)
print(paste('Accuracy',1-misClasificError))
## [1] "Accuracy 1"
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
p <- predict(model, newdata=subset(test,select=c(1:14)), type="response")
pr <- prediction(p, test$GotPlaced)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)

auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 1