MBA SALARY and ADMISSION Case Study

# Load the MBA starting-salary survey from the working directory and print a
# per-column summary of the resulting data frame.
mbasalary <- read.csv(file = "MBA Starting Salaries Data.csv", header = TRUE)
summary(object = mbasalary)
##       age             sex           gmat_tot        gmat_qpc    
##  Min.   :22.00   Min.   :1.000   Min.   :450.0   Min.   :28.00  
##  1st Qu.:25.00   1st Qu.:1.000   1st Qu.:580.0   1st Qu.:72.00  
##  Median :27.00   Median :1.000   Median :620.0   Median :83.00  
##  Mean   :27.36   Mean   :1.248   Mean   :619.5   Mean   :80.64  
##  3rd Qu.:29.00   3rd Qu.:1.000   3rd Qu.:660.0   3rd Qu.:93.00  
##  Max.   :48.00   Max.   :2.000   Max.   :790.0   Max.   :99.00  
##     gmat_vpc        gmat_tpc        s_avg           f_avg      
##  Min.   :16.00   Min.   : 0.0   Min.   :2.000   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:78.0   1st Qu.:2.708   1st Qu.:2.750  
##  Median :81.00   Median :87.0   Median :3.000   Median :3.000  
##  Mean   :78.32   Mean   :84.2   Mean   :3.025   Mean   :3.062  
##  3rd Qu.:91.00   3rd Qu.:94.0   3rd Qu.:3.300   3rd Qu.:3.250  
##  Max.   :99.00   Max.   :99.0   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs         frstlang         salary      
##  Min.   :1.000   Min.   : 0.000   Min.   :1.000   Min.   :     0  
##  1st Qu.:1.250   1st Qu.: 2.000   1st Qu.:1.000   1st Qu.:     0  
##  Median :2.000   Median : 3.000   Median :1.000   Median :   999  
##  Mean   :2.478   Mean   : 3.872   Mean   :1.117   Mean   : 39026  
##  3rd Qu.:3.000   3rd Qu.: 4.000   3rd Qu.:1.000   3rd Qu.: 97000  
##  Max.   :4.000   Max.   :22.000   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :  1.0  
##  1st Qu.:  5.0  
##  Median :  6.0  
##  Mean   :172.2  
##  3rd Qu.:  7.0  
##  Max.   :998.0

Adding two columns for better representation

# Add text-label copies of the two coded columns; the numeric `sex` and
# `frstlang` columns themselves are left unchanged.
# Code 1 -> "Male", code 2 -> "Female".
mbasalary$sex1 <- replace(
  replace(mbasalary$sex, mbasalary$sex == 1, "Male"),
  mbasalary$sex == 2, "Female"
)

# Code 1 -> "English", code 2 -> "Non English".
mbasalary$frstlang1 <- replace(
  replace(mbasalary$frstlang, mbasalary$frstlang == 1, "English"),
  mbasalary$frstlang == 2, "Non English"
)

Barplot showing gender distribution in the dataset

# Count the students per gender label and draw the counts as bars.
a <- table(mbasalary[["sex1"]])
barplot(height = a, main = "Gender distribution in the dataset")

Boxplots showing various variables and their distribution

# Horizontal boxplots, one per numeric column. Each entry lists the column to
# plot, its x-axis label and the plot title, in the order the plots are drawn.
boxplot_specs <- list(
  c("gmat_tot", "GMAT Total Scores", "GMAT Total Distribution"),
  c("gmat_qpc", "Quantitative GMAT Scores", "Quantitative GMAT Score Distribution"),
  c("gmat_vpc", "Verbal GMAT Scores", "Verbal GMAT Score Distribution"),
  c("gmat_tpc", "Overall GMAT Percentile", "Overall GMAT Percentile Distribution"),
  c("s_avg", "Spring MBA Avg", "Spring MBA Avg Distribution"),
  c("f_avg", "Fall MBA Avg", "Fall MBA Avg Distribution"),
  c("work_yrs", "Work Ex", "Work Ex Distribution")
)

for (spec in boxplot_specs) {
  boxplot(mbasalary[[spec[1]]], horizontal = TRUE, xlab = spec[2],
          main = spec[3], las = 1)
}

Barplot showing distribution of quarters in the dataset

# Count the students in each quarter and draw the counts as bars.
b <- table(mbasalary[["quarter"]])
barplot(height = b)

Creating a dataframe for students who got a job and declared their salary

# Keep only the rows whose salary value is at least 1000 (rows with a missing
# salary are dropped as well), then show how those salaries are distributed.
yesjob <- subset(mbasalary, salary >= 1000)
boxplot(
  yesjob[["salary"]],
  horizontal = TRUE,
  las = 1,
  main = "Salary Distribution",
  xlab = "Salary"
)

Boxplot depicting satisfaction of students who completed the survey

# Keep only the rows with a satisfaction value below 10 (rows with a missing
# value are dropped as well), then show how those ratings are distributed.
nosatis <- subset(mbasalary, satis < 10)
boxplot(
  nosatis[["satis"]],
  horizontal = TRUE,
  las = 1,
  main = "Satisfaction Distribution",
  xlab = "Satisfaction on a Scale of 1-7"
)

Scatterplots depicting the relationships between salary and the other variables for the students who got a job offer

library(car)
## Warning: package 'car' was built under R version 3.4.3
# Salary of the placed students (y-axis) against each candidate explanatory
# variable (x-axis). Salary is kept on the y-axis in every plot so that the
# panels can be compared directly.
scatterplot(yesjob$sex, yesjob$salary)

scatterplot(yesjob$age, yesjob$salary)

scatterplot(yesjob$gmat_tot, yesjob$salary)

scatterplot(yesjob$work_yrs, yesjob$salary)

scatterplot(yesjob$s_avg, yesjob$salary)

scatterplot(yesjob$f_avg, yesjob$salary)

scatterplot(yesjob$satis, yesjob$salary)

# First language goes on the x-axis like every other explanatory variable
# (the two arguments were previously passed in the opposite order).
scatterplot(yesjob$frstlang, yesjob$salary)

Scatterplots depicting the relationships between satisfaction and the other variables for the students who completed the satisfaction survey

# Satisfaction rating against the other survey variables, for the rows kept in
# `nosatis`. Axis labels are passed explicitly so they read the same as the
# labels scatterplot() would derive from `nosatis$<column>` arguments.

# Gender code on the x-axis, satisfaction on the y-axis.
scatterplot(nosatis[["sex"]], nosatis[["satis"]],
            xlab = "nosatis$sex", ylab = "nosatis$satis")

# Satisfaction on the x-axis, each remaining variable in turn on the y-axis.
for (column in c("age", "gmat_tot", "work_yrs", "s_avg", "f_avg")) {
  scatterplot(nosatis[["satis"]], nosatis[[column]],
              xlab = "nosatis$satis", ylab = paste0("nosatis$", column))
}

Barplot showing the distribution of satisfaction by student’s first language

# Cross-tabulate first-language label (rows) against satisfaction rating
# (columns) and draw one bar per language side by side for each rating.
abc <- table(nosatis[["frstlang1"]], nosatis[["satis"]])
barplot(
  abc,
  beside = TRUE,
  col = c("darkblue", "red"),
  legend.text = rownames(abc),
  xlab = "Satisfaction on a scale of 1-7",
  ylab = "Frequency"
)

Scatterplots depicting the relationship between students’ GMAT total score and their MBA averages (spring and fall)

# GMAT total score (x-axis) against the spring and then the fall MBA average
# (y-axis), using every row of the data set. Axis labels are passed explicitly
# so they match the ones derived from `mbasalary$<column>` arguments.
for (avg_column in c("s_avg", "f_avg")) {
  scatterplot(mbasalary[["gmat_tot"]], mbasalary[[avg_column]],
              xlab = "mbasalary$gmat_tot",
              ylab = paste0("mbasalary$", avg_column))
}

CORRGRAM of all the variables in the original data set

library(corrgram)
## Warning: package 'corrgram' was built under R version 3.4.3
# Correlogram of the 13 original survey columns.
#
# `salary` contains the values 998/999 and `satis` contains 998; these are
# placeholder codes (this analysis labels them "Not Applicable" and filters
# them out elsewhere), not measurements. Left in as numbers they dominate any
# correlation involving those two columns, so they are turned into NA on a
# copy of the data before plotting. `mbasalary` itself is not modified.
corr_data <- mbasalary[c(1:13)]
corr_data$salary[corr_data$salary %in% c(998, 999)] <- NA
corr_data$satis[corr_data$satis %in% 998] <- NA

corrgram(corr_data, order=TRUE,
         main="MBA SALARY CORRGRAM",
         lower.panel=panel.shade, upper.panel=panel.pie,
         diag.panel=panel.minmax, text.panel=panel.txt)

Correlation between all the variables in the original data set

# Correlation matrix of the first 13 columns of the data set.
# NOTE(review): `salary` and `satis` are used as-is here, including the
# 998/999 values that this analysis later labels "Not Applicable", so the
# coefficients involving those two columns should be read with that in mind.
cor(mbasalary[c(1:13)])
##                  age          sex    gmat_tot    gmat_qpc    gmat_vpc
## age       1.00000000 -0.028106442 -0.14593840 -0.21616985 -0.04417547
## sex      -0.02810644  1.000000000 -0.05336820 -0.16377435  0.07488782
## gmat_tot -0.14593840 -0.053368202  1.00000000  0.72473781  0.74839187
## gmat_qpc -0.21616985 -0.163774346  0.72473781  1.00000000  0.15218014
## gmat_vpc -0.04417547  0.074887816  0.74839187  0.15218014  1.00000000
## gmat_tpc -0.16990307 -0.008090213  0.84779965  0.65137754  0.66621604
## s_avg     0.14970402  0.127115144  0.11311702 -0.02984873  0.20445365
## f_avg    -0.01744806  0.091663891  0.10442409  0.07370455  0.07592225
## quarter  -0.04967221 -0.133533171 -0.09223903  0.03636638 -0.17460736
## work_yrs  0.85829810 -0.011296374 -0.18235434 -0.23660827 -0.06639049
## frstlang  0.05692649  0.001536205 -0.13503402  0.13892774 -0.38980465
## salary   -0.06257355  0.068858628 -0.05497188 -0.04403293 -0.00613934
## satis    -0.12788825 -0.054602220  0.08255770  0.06060004  0.06262375
##              gmat_tpc       s_avg       f_avg       quarter     work_yrs
## age      -0.169903066  0.14970402 -0.01744806 -4.967221e-02  0.858298096
## sex      -0.008090213  0.12711514  0.09166389 -1.335332e-01 -0.011296374
## gmat_tot  0.847799647  0.11311702  0.10442409 -9.223903e-02 -0.182354339
## gmat_qpc  0.651377538 -0.02984873  0.07370455  3.636638e-02 -0.236608270
## gmat_vpc  0.666216035  0.20445365  0.07592225 -1.746074e-01 -0.066390490
## gmat_tpc  1.000000000  0.11736245  0.07973210 -8.303535e-02 -0.173361859
## s_avg     0.117362449  1.00000000  0.55062139 -7.621166e-01  0.129292714
## f_avg     0.079732099  0.55062139  1.00000000 -4.475064e-01 -0.039056921
## quarter  -0.083035351 -0.76211664 -0.44750637  1.000000e+00 -0.086026406
## work_yrs -0.173361859  0.12929271 -0.03905692 -8.602641e-02  1.000000000
## frstlang -0.103362747 -0.13631308 -0.03705695  9.949226e-02 -0.027866747
## salary    0.004930901  0.14583606  0.02944303 -1.643699e-01  0.009023407
## satis     0.092934266 -0.03268664  0.01089273 -1.267198e-05 -0.109255286
##              frstlang       salary         satis
## age       0.056926486 -0.062573547 -1.278882e-01
## sex       0.001536205  0.068858628 -5.460222e-02
## gmat_tot -0.135034017 -0.054971880  8.255770e-02
## gmat_qpc  0.138927742 -0.044032933  6.060004e-02
## gmat_vpc -0.389804653 -0.006139340  6.262375e-02
## gmat_tpc -0.103362747  0.004930901  9.293427e-02
## s_avg    -0.136313080  0.145836062 -3.268664e-02
## f_avg    -0.037056954  0.029443027  1.089273e-02
## quarter   0.099492259 -0.164369865 -1.267198e-05
## work_yrs -0.027866747  0.009023407 -1.092553e-01
## frstlang  1.000000000 -0.086592096  7.932264e-02
## salary   -0.086592096  1.000000000 -3.352171e-01
## satis     0.079322637 -0.335217114  1.000000e+00

Creating a new data set of students that were not placed and then comparing the first language and gender for all 3 groups, in order: all students, not-placed students and placed students

# Students whose salary value is 0 (rows with a missing salary are dropped).
nojob <- mbasalary[which(mbasalary$salary == 0),]

# First language (rows) by gender (columns) for each of the three groups.
print("All Students")
## [1] "All Students"
table(mbasalary$frstlang1,mbasalary$sex1)
##              
##               Female Male
##   English         60  182
##   Non English      8   24
print("Not placed Students")
## [1] "Not placed Students"
table(nojob$frstlang1,nojob$sex1)
##              
##               Female Male
##   English         22   60
##   Non English      1    7
print("Placed Students")
## [1] "Placed Students"
table(yesjob$frstlang1,yesjob$sex1)
##              
##               Female Male
##   English         28   68
##   Non English      3    4

Adding one column to the mbasalary dataframe for better visualization

# Placement-status label derived from the numeric salary column:
#   >= 1000    -> "Got an offer"   (same cut-off that defines `yesjob`)
#   0          -> "Job less"
#   998 or 999 -> "Not Applicable"
# Any other value keeps its number, written as text.
# Every condition is evaluated on the numeric `salary` column, so no rule
# depends on comparing already-converted text with a number.
mbasalary$salary1 <- as.character(mbasalary$salary)

mbasalary$salary1[mbasalary$salary >= 1000] <- "Got an offer"
mbasalary$salary1[mbasalary$salary == 0] <- "Job less"
mbasalary$salary1[mbasalary$salary %in% c(998, 999)] <- "Not Applicable"

Creating tables to visualize various factors that may impact job placement

# Placement status (rows) against gender label (columns).
table(mbasalary[["salary1"]], mbasalary[["sex1"]])
##                 
##                  Female Male
##   Got an offer       31   72
##   Job less           23   67
##   Not Applicable     14   67
# Placement status (rows) against first-language label (columns).
table(mbasalary[["salary1"]], mbasalary[["frstlang1"]])
##                 
##                  English Non English
##   Got an offer        96           7
##   Job less            82           8
##   Not Applicable      64          17
print("now comparing job placement vs satisfaction level")
## [1] "now comparing job placement vs satisfaction level"
# Placement status (rows) against the raw satisfaction value (columns).
table(mbasalary[["salary1"]], mbasalary[["satis"]])
##                 
##                   1  2  3  4  5  6  7 998
##   Got an offer    0  0  1  1 29 50 22   0
##   Job less        0  0  0  4 36 40 10   0
##   Not Applicable  1  1  4 12  9  7  1  46

Chi-sq test to check for association between Job placement and gender

# Pearson chi-squared test on the cross-tabulation of placement status
# (salary1) and gender label (sex1), using every row of mbasalary.
chisq.test(mbasalary$salary1,mbasalary$sex1)
## 
##  Pearson's Chi-squared test
## 
## data:  mbasalary$salary1 and mbasalary$sex1
## X-squared = 4.0288, df = 2, p-value = 0.1334

Chi-sq test to check for association between Job placement and first language

# Pearson chi-squared test on the cross-tabulation of placement status
# (salary1) and first-language label (frstlang1), using every row of mbasalary.
chisq.test(mbasalary$salary1,mbasalary$frstlang1)
## 
##  Pearson's Chi-squared test
## 
## data:  mbasalary$salary1 and mbasalary$frstlang1
## X-squared = 9.8645, df = 2, p-value = 0.00721

Using linear regression to build a model for predicting Salary using multiple factors

# Linear model of salary for the rows in `yesjob`, regressed on gender code,
# years of work experience, GMAT total, spring and fall MBA averages and
# first-language code. `sex` and `frstlang` enter as their numeric codes.
# NOTE(review): the terms are written as yesjob$<column> rather than bare
# column names with data = yesjob; revisit if predict() on new data is needed.
predi <- lm(yesjob$salary~yesjob$sex+yesjob$work_yrs+yesjob$gmat_tot+yesjob$s_avg+yesjob$f_avg+yesjob$frstlang)
# Print residual quantiles, the coefficient table and the fit statistics.
summary(predi)
## 
## Call:
## lm(formula = yesjob$salary ~ yesjob$sex + yesjob$work_yrs + yesjob$gmat_tot + 
##     yesjob$s_avg + yesjob$f_avg + yesjob$frstlang)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -32652  -8940  -1709   5186  83182 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     82356.30   24206.88   3.402 0.000976 ***
## yesjob$sex      -5886.39    3462.79  -1.700 0.092388 .  
## yesjob$work_yrs  2201.83     579.92   3.797 0.000257 ***
## yesjob$gmat_tot   -11.90      31.77  -0.375 0.708712    
## yesjob$s_avg     4851.02    4986.79   0.973 0.333110    
## yesjob$f_avg    -1153.74    3822.28  -0.302 0.763422    
## yesjob$frstlang 15101.77    6473.46   2.333 0.021743 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15760 on 96 degrees of freedom
## Multiple R-squared:  0.2678, Adjusted R-squared:  0.2221 
## F-statistic: 5.853 on 6 and 96 DF,  p-value: 3.114e-05

Using linear regression to build a model for predicting Salary using Work-ex and First language

# Reduced linear model of salary for the rows in `yesjob`, keeping only years
# of work experience and first-language code as explanatory variables.
predi1 <- lm(yesjob$salary~yesjob$work_yrs+yesjob$frstlang)
# Print residual quantiles, the coefficient table and the fit statistics.
summary(predi1)
## 
## Call:
## lm(formula = yesjob$salary ~ yesjob$work_yrs + yesjob$frstlang)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33972  -8955   -455   4545  76681 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      79941.4     6788.8  11.775  < 2e-16 ***
## yesjob$work_yrs   2483.3      527.9   4.704 8.18e-06 ***
## yesjob$frstlang  13064.0     6283.2   2.079   0.0402 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15740 on 100 degrees of freedom
## Multiple R-squared:  0.2396, Adjusted R-squared:  0.2244 
## F-statistic: 15.75 on 2 and 100 DF,  p-value: 1.128e-06

Using linear regression to build a model for predicting Salary using Gender, Gmat Total, and MBA performance

# Linear model of salary for the rows in `yesjob` using only gender code,
# GMAT total and the spring and fall MBA averages (no work experience or
# first language).
predi2  <- lm(yesjob$salary~yesjob$sex+yesjob$gmat_tot+yesjob$s_avg+yesjob$f_avg)
# Print residual quantiles, the coefficient table and the fit statistics.
summary(predi2)
## 
## Call:
## lm(formula = yesjob$salary ~ yesjob$sex + yesjob$gmat_tot + yesjob$s_avg + 
##     yesjob$f_avg)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -39889  -7926  -2357   4047 120521 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     122912.34   24422.85   5.033  2.2e-06 ***
## yesjob$sex       -6158.11    3825.41  -1.610   0.1107    
## yesjob$gmat_tot    -38.61      34.87  -1.107   0.2708    
## yesjob$s_avg      9603.44    5173.84   1.856   0.0664 .  
## yesjob$f_avg     -5752.29    4028.95  -1.428   0.1565    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17550 on 98 degrees of freedom
## Multiple R-squared:  0.07369,    Adjusted R-squared:  0.03588 
## F-statistic: 1.949 on 4 and 98 DF,  p-value: 0.1084

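From the above analysis we see that the best predictors of salary are work experience and first language: both are significant at the 5% level in every model that includes them, whereas gender is not (p ≈ 0.09 and p ≈ 0.11).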

Using tables to visualize Job Placement and Gender or First language

# Placement status (rows) against gender label (columns).
table(mbasalary[["salary1"]], mbasalary[["sex1"]])
##                 
##                  Female Male
##   Got an offer       31   72
##   Job less           23   67
##   Not Applicable     14   67
# Placement status (rows) against first-language label (columns).
table(mbasalary[["salary1"]], mbasalary[["frstlang1"]])
##                 
##                  English Non English
##   Got an offer        96           7
##   Job less            82           8
##   Not Applicable      64          17

Performing Chi-sq test to check association between various variables

# Placement status (salary1) against gender label (sex1).
chisq.test(mbasalary$salary1,mbasalary$sex1)
## 
##  Pearson's Chi-squared test
## 
## data:  mbasalary$salary1 and mbasalary$sex1
## X-squared = 4.0288, df = 2, p-value = 0.1334
# Placement status (salary1) against first-language label (frstlang1).
chisq.test(mbasalary$salary1,mbasalary$frstlang1)
## 
##  Pearson's Chi-squared test
## 
## data:  mbasalary$salary1 and mbasalary$frstlang1
## X-squared = 9.8645, df = 2, p-value = 0.00721
# First-language label against gender label; for this two-by-two table the
# test is reported with Yates' continuity correction (see the output header).
chisq.test(mbasalary$frstlang1,mbasalary$sex1)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  mbasalary$frstlang1 and mbasalary$sex1
## X-squared = 2.4871e-29, df = 1, p-value = 1

Creating a new dataframe to perform logistic regression

# Modelling frame for the logistic regression.
#  * Rows whose salary value is 998 are left out.
#  * `salary2` is the 0/1 outcome: it stays 0 where salary is 0 and becomes 1
#    for every salary value above 1. That covers the real salaries and also
#    the 999 code, so rows coded 999 count as 1 here.
mbasalary12 <- mbasalary[mbasalary$salary != 998, ]
mbasalary12$salary2 <- mbasalary12$salary
mbasalary12$salary2[mbasalary12$salary2 > 1] <- 1

Removing actual salary data from the dataframe and dividing it into two parts for training and testing.

We are using 90:10 split for the dataframe

# Drop the raw salary column by name, so the right column is removed even if
# the column order of the data frame changes.
mbasalary12 <- mbasalary12[setdiff(names(mbasalary12), "salary")]

# 90:10 split in row order: the first 90% of the rows (rounded down) are used
# for training and the remaining rows for testing. The sizes are computed from
# the data instead of being hard-coded, so the split stays valid if the number
# of rows changes.
n_rows <- nrow(mbasalary12)
n_train <- floor(0.9 * n_rows)

train <- mbasalary12[seq_len(n_train), ]
test <- mbasalary12[n_train + seq_len(n_rows - n_train), ]

Creating the logistic regression model

# Logistic regression of the 0/1 outcome `salary2` on the twelve survey
# variables. The model is fitted on the training rows only, so that the
# accuracy measured on `test` is computed on rows the model has not seen.
# NOTE(review): the printed model summary and the accuracy figure shown in
# this document were produced by a fit on all rows of mbasalary12; re-run the
# document to refresh them.
model <- glm(
  salary2 ~ age + sex + gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc +
    s_avg + f_avg + quarter + work_yrs + frstlang + satis,
  family = binomial(link = "logit"),
  data = train
)
summary(model)
## 
## Call:
## glm(formula = salary2 ~ age + sex + gmat_tot + gmat_qpc + gmat_vpc + 
##     gmat_tpc + s_avg + f_avg + quarter + work_yrs + frstlang + 
##     satis, family = binomial(link = "logit"), data = mbasalary12)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9069  -1.2617   0.8233   0.9796   1.5543  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  7.229484   3.907020   1.850   0.0643 .
## age         -0.182398   0.075593  -2.413   0.0158 *
## sex         -0.071435   0.332632  -0.215   0.8300  
## gmat_tot    -0.006845   0.010361  -0.661   0.5088  
## gmat_qpc     0.012925   0.028238   0.458   0.6472  
## gmat_vpc     0.016111   0.027631   0.583   0.5598  
## gmat_tpc     0.011001   0.019087   0.576   0.5644  
## s_avg        0.032437   0.621421   0.052   0.9584  
## f_avg       -0.190403   0.340350  -0.559   0.5759  
## quarter     -0.141159   0.192870  -0.732   0.4642  
## work_yrs     0.097279   0.083493   1.165   0.2440  
## frstlang     0.608889   0.578531   1.052   0.2926  
## satis       -0.158727   0.157510  -1.008   0.3136  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 305.89  on 227  degrees of freedom
## Residual deviance: 291.96  on 215  degrees of freedom
## AIC: 317.96
## 
## Number of Fisher Scoring iterations: 4

Now predicting job placement using the model and checking its accuracy.

# Predicted probabilities for the test rows (first twelve columns are passed
# as predictors), turned into a 0/1 class with a 0.5 cut-off.
fitted.results <- ifelse(
  predict(model, newdata = test[, 1:12, drop = FALSE], type = "response") > 0.5,
  1,
  0
)

# Share of test rows whose predicted class differs from the recorded outcome;
# accuracy is reported as one minus that share.
misClasificError <- mean(fitted.results != test$salary2)
print(paste("Accuracy", 1 - misClasificError))
## [1] "Accuracy 0.869565217391304"