MBA Starting Salaries Case

  1. Reading data from a CSV file into R
# Read the survey data from its CSV file. The folder is joined to the file
# name with file.path() rather than set via setwd(), so loading the data
# does not change the session's working directory.
data_dir <- "C:/Users/Dell/Desktop/Project/Week 4"
mba_salary <- read.csv(file.path(data_dir, "MBA Starting Salaries Data.csv"))
# View() opens a data viewer window, so it is only called when a user is
# present to look at it.
if (interactive()) {
  View(mba_salary)
}
  2. Checking the data types of all the variables in the data set
# Print the structure of the data frame: number of observations, and each
# column's name, type and first few values.
str(mba_salary)
## 'data.frame':    274 obs. of  13 variables:
##  $ age     : int  23 24 24 24 24 24 25 25 25 25 ...
##  $ sex     : int  2 1 1 1 2 1 1 2 1 1 ...
##  $ gmat_tot: int  620 610 670 570 710 640 610 650 630 680 ...
##  $ gmat_qpc: int  77 90 99 56 93 82 89 88 79 99 ...
##  $ gmat_vpc: int  87 71 78 81 98 89 74 89 91 81 ...
##  $ gmat_tpc: int  87 87 95 75 98 91 87 92 89 96 ...
##  $ s_avg   : num  3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
##  $ f_avg   : num  3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
##  $ quarter : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ work_yrs: int  2 2 2 1 2 2 2 2 2 2 ...
##  $ frstlang: int  1 1 1 1 1 1 1 1 2 1 ...
##  $ salary  : int  0 0 0 0 999 0 0 0 999 998 ...
##  $ satis   : int  7 6 6 7 5 6 5 6 4 998 ...

We see that a few categorical variables have been read as integers instead of factors. Hence, we need to convert them.

  3. Conversion of data types of certain variables
# Recode the integer-coded categorical columns as factors, in place.
factor_cols <- c("sex", "frstlang")
mba_salary[factor_cols] <- lapply(mba_salary[factor_cols], factor)
  4. Checking the data types of all the variables in the data set AGAIN
# Print the structure again to confirm that sex and frstlang are now factors.
str(mba_salary)
## 'data.frame':    274 obs. of  13 variables:
##  $ age     : int  23 24 24 24 24 24 25 25 25 25 ...
##  $ sex     : Factor w/ 2 levels "1","2": 2 1 1 1 2 1 1 2 1 1 ...
##  $ gmat_tot: int  620 610 670 570 710 640 610 650 630 680 ...
##  $ gmat_qpc: int  77 90 99 56 93 82 89 88 79 99 ...
##  $ gmat_vpc: int  87 71 78 81 98 89 74 89 91 81 ...
##  $ gmat_tpc: int  87 87 95 75 98 91 87 92 89 96 ...
##  $ s_avg   : num  3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
##  $ f_avg   : num  3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
##  $ quarter : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ work_yrs: int  2 2 2 1 2 2 2 2 2 2 ...
##  $ frstlang: Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 2 1 ...
##  $ salary  : int  0 0 0 0 999 0 0 0 999 998 ...
##  $ satis   : int  7 6 6 7 5 6 5 6 4 998 ...
  5. Statistical summary of all the variables in the data set
# Per-column summary: quartiles and mean for numeric columns, level counts
# for factor columns.
summary(mba_salary)
##       age        sex        gmat_tot        gmat_qpc        gmat_vpc    
##  Min.   :22.00   1:206   Min.   :450.0   Min.   :28.00   Min.   :16.00  
##  1st Qu.:25.00   2: 68   1st Qu.:580.0   1st Qu.:72.00   1st Qu.:71.00  
##  Median :27.00           Median :620.0   Median :83.00   Median :81.00  
##  Mean   :27.36           Mean   :619.5   Mean   :80.64   Mean   :78.32  
##  3rd Qu.:29.00           3rd Qu.:660.0   3rd Qu.:93.00   3rd Qu.:91.00  
##  Max.   :48.00           Max.   :790.0   Max.   :99.00   Max.   :99.00  
##     gmat_tpc        s_avg           f_avg          quarter     
##  Min.   : 0.0   Min.   :2.000   Min.   :0.000   Min.   :1.000  
##  1st Qu.:78.0   1st Qu.:2.708   1st Qu.:2.750   1st Qu.:1.250  
##  Median :87.0   Median :3.000   Median :3.000   Median :2.000  
##  Mean   :84.2   Mean   :3.025   Mean   :3.062   Mean   :2.478  
##  3rd Qu.:94.0   3rd Qu.:3.300   3rd Qu.:3.250   3rd Qu.:3.000  
##  Max.   :99.0   Max.   :4.000   Max.   :4.000   Max.   :4.000  
##     work_yrs      frstlang     salary           satis      
##  Min.   : 0.000   1:242    Min.   :     0   Min.   :  1.0  
##  1st Qu.: 2.000   2: 32    1st Qu.:     0   1st Qu.:  5.0  
##  Median : 3.000            Median :   999   Median :  6.0  
##  Mean   : 3.872            Mean   : 39026   Mean   :172.2  
##  3rd Qu.: 4.000            3rd Qu.: 97000   3rd Qu.:  7.0  
##  Max.   :22.000            Max.   :220000   Max.   :998.0
library(psych)
# describe() prints one row of descriptive statistics per column (n, mean,
# sd, median, trimmed mean, mad, min, max, range, skew, kurtosis, se).
# Columns marked with * in the output are the factor columns.
describe(mba_salary)
##           vars   n     mean       sd median  trimmed     mad min    max
## age          1 274    27.36     3.71     27    26.76    2.97  22     48
## sex*         2 274     1.25     0.43      1     1.19    0.00   1      2
## gmat_tot     3 274   619.45    57.54    620   618.86   59.30 450    790
## gmat_qpc     4 274    80.64    14.87     83    82.31   14.83  28     99
## gmat_vpc     5 274    78.32    16.86     81    80.33   14.83  16     99
## gmat_tpc     6 274    84.20    14.02     87    86.12   11.86   0     99
## s_avg        7 274     3.03     0.38      3     3.03    0.44   2      4
## f_avg        8 274     3.06     0.53      3     3.09    0.37   0      4
## quarter      9 274     2.48     1.11      2     2.47    1.48   1      4
## work_yrs    10 274     3.87     3.23      3     3.29    1.48   0     22
## frstlang*   11 274     1.12     0.32      1     1.02    0.00   1      2
## salary      12 274 39025.69 50951.56    999 33607.86 1481.12   0 220000
## satis       13 274   172.18   371.61      6    91.50    1.48   1    998
##            range  skew kurtosis      se
## age           26  2.16     6.45    0.22
## sex*           1  1.16    -0.66    0.03
## gmat_tot     340 -0.01     0.06    3.48
## gmat_qpc      71 -0.92     0.30    0.90
## gmat_vpc      83 -1.04     0.74    1.02
## gmat_tpc      99 -2.28     9.02    0.85
## s_avg          2 -0.06    -0.38    0.02
## f_avg          4 -2.08    10.85    0.03
## quarter        3  0.02    -1.35    0.07
## work_yrs      22  2.78     9.80    0.20
## frstlang*      1  2.37     3.65    0.02
## salary    220000  0.70    -1.05 3078.10
## satis        997  1.77     1.13   22.45

6A. Visualizing the data given about the students

library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
library(gtable)
library(grid)
# One plot per student attribute: binned histograms for age and work_yrs,
# count bar charts for sex and frstlang.
f1 <- ggplot(mba_salary, aes(age)) +
  stat_bin(bins = 25, color = "gray")
f2 <- ggplot(mba_salary, aes(sex)) +
  stat_count(color = "gray")
f3 <- ggplot(mba_salary, aes(frstlang)) +
  stat_count(color = "gray")
f4 <- ggplot(mba_salary, aes(work_yrs)) +
  stat_bin(bins = 25, color = "gray")

# Build each plot into a gtable grob so it can be placed in a layout cell.
ff1 <- ggplotGrob(f1)
ff2 <- ggplotGrob(f2)
ff3 <- ggplotGrob(f3)
ff4 <- ggplotGrob(f4)

# 2 x 2 layout with relative ("null") units: the second column is twice as
# wide as the first and the first row twice as tall as the second.
ff <- gtable(
  widths = unit(c(1, 2), "null"),
  heights = unit(c(2, 1), "null")
)
ff <- gtable_add_grob(ff, ff1, t = 1, l = 1)
ff <- gtable_add_grob(ff, ff2, t = 1, l = 2)
ff <- gtable_add_grob(ff, ff3, t = 2, l = 1)
ff <- gtable_add_grob(ff, ff4, t = 2, l = 2)
grid.draw(ff)

6B. Visualizing the data given about the performance in GMAT examination

# Histograms (20 bins each) of the four GMAT measures.
a1 <- ggplot(mba_salary, aes(gmat_tot)) +
  stat_bin(bins = 20, color = "gray")
a2 <- ggplot(mba_salary, aes(gmat_qpc)) +
  stat_bin(bins = 20, color = "gray")
a3 <- ggplot(mba_salary, aes(gmat_vpc)) +
  stat_bin(bins = 20, color = "gray")
a4 <- ggplot(mba_salary, aes(gmat_tpc)) +
  stat_bin(bins = 20, color = "gray")

# Build each plot into a gtable grob so it can be placed in a layout cell.
aa1 <- ggplotGrob(a1)
aa2 <- ggplotGrob(a2)
aa3 <- ggplotGrob(a3)
aa4 <- ggplotGrob(a4)

# 2 x 2 layout with relative ("null") units: the second column is twice as
# wide as the first and the first row twice as tall as the second.
aa <- gtable(
  widths = unit(c(1, 2), "null"),
  heights = unit(c(2, 1), "null")
)
aa <- gtable_add_grob(aa, aa1, t = 1, l = 1)
aa <- gtable_add_grob(aa, aa2, t = 1, l = 2)
aa <- gtable_add_grob(aa, aa3, t = 2, l = 1)
aa <- gtable_add_grob(aa, aa4, t = 2, l = 2)
grid.draw(aa)

6C. Visualizing the data given about the performance in the MBA program

# Plots of performance in the MBA program: histograms for the spring and
# fall averages, count bar charts for quartile ranking and satisfaction.
b1 <- ggplot(mba_salary, aes(s_avg)) +
  stat_bin(bins = 20, color = "gray")
b2 <- ggplot(mba_salary, aes(f_avg)) +
  stat_bin(bins = 20, color = "gray")
b3 <- ggplot(mba_salary, aes(quarter)) +
  stat_count(color = "gray")

# satis contains the value 998 (its maximum, while the third quartile is 7),
# which appears to be a non-response code rather than a rating. Left in, it
# stretches the x axis to 998 and squeezes the real ratings into a sliver,
# so those rows are excluded from this panel only.
rated <- subset(mba_salary, satis != 998)
b4 <- ggplot(rated, aes(satis)) +
  stat_count(color = "gray")

# Build each plot into a gtable grob so it can be placed in a layout cell.
bb1 <- ggplotGrob(b1)
bb2 <- ggplotGrob(b2)
bb3 <- ggplotGrob(b3)
bb4 <- ggplotGrob(b4)

# 2 x 2 layout with relative ("null") units: the second column is twice as
# wide as the first and the first row twice as tall as the second.
bb <- gtable(
  widths = unit(c(1, 2), "null"),
  heights = unit(c(2, 1), "null")
)
bb <- gtable_add_grob(bb, bb1, t = 1, l = 1)
bb <- gtable_add_grob(bb, bb2, t = 1, l = 2)
bb <- gtable_add_grob(bb, bb3, t = 2, l = 1)
bb <- gtable_add_grob(bb, bb4, t = 2, l = 2)
grid.draw(bb)

We have been told that there are some people who did not answer the survey and some who did not disclose their salary. Hence, we need to treat the ‘missing value’ of unrevealed salaries by giving them the value of the average salary.

  7. Treating the missing value
# Rows with a recorded salary (1000 or more).
Salaried <- subset(mba_salary, salary >= 1000)
# Rows coded 999, treated here as placed students with an undisclosed salary.
Unrevealed <- subset(mba_salary, salary == 999)
# Replace the 999 code with the mean of the recorded salaries.
Unrevealed[["salary"]] <- mean(Salaried[["salary"]])
# Stack both groups into the data set used for the rest of the analysis.
Placed <- rbind(Unrevealed, Salaried)
  8. Visualizing salaries of all placed students
library(lattice)
# Lattice histogram of salary across the placed students.
histogram(
  ~salary,
  data = Placed,
  col = "gray",
  xlab = "Salary",
  main = "Histogram of Salary"
)

Salary of the placed students can be pictorially represented in relation with various variables as shown below:

9A. Scatterplot of Salary versus Age

library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
# Salary against age for the placed students.
scatterplot(
  salary ~ age,
  data = Placed,
  spread = FALSE,
  smoother.args = list(lty = 2),
  pch = 19,
  main = "Scatterplot of Salary vs Age",
  xlab = "Age",
  ylab = "Salary"
)

9B. Scatterplot of Salary vs Total GMAT Score

# Salary against total GMAT score for the placed students.
scatterplot(
  salary ~ gmat_tot,
  data = Placed,
  spread = FALSE,
  smoother.args = list(lty = 2),
  pch = 19,
  main = "Scatterplot of Salary vs Total GMAT Score",
  xlab = "Total GMAT Score",
  ylab = "Salary"
)

9C. Scatterplot of Salary vs Spring MBA Score

# Salary against spring MBA average for the placed students.
scatterplot(
  salary ~ s_avg,
  data = Placed,
  spread = FALSE,
  smoother.args = list(lty = 2),
  pch = 19,
  main = "Scatterplot of Salary vs Spring MBA Average",
  xlab = "Spring MBA Average",
  ylab = "Salary"
)

9D. Scatterplot of Salary vs Fall MBA Score

# Salary against fall MBA average for the placed students.
scatterplot(
  salary ~ f_avg,
  data = Placed,
  spread = FALSE,
  smoother.args = list(lty = 2),
  pch = 19,
  main = "Scatterplot of Salary vs Fall MBA Average",
  xlab = "Fall MBA Average",
  ylab = "Salary"
)

9E. Scatterplot of Salary vs Quartile Ranking

# Salary against quartile ranking for the placed students.
scatterplot(
  salary ~ quarter,
  data = Placed,
  spread = FALSE,
  smoother.args = list(lty = 2),
  pch = 19,
  main = "Scatterplot of Salary vs Quartile Ranking",
  xlab = "Quartile Ranking",
  ylab = "Salary"
)

9F. Scatterplot of Salary vs Years of Work Experience

# Salary against years of work experience for the placed students.
scatterplot(
  salary ~ work_yrs,
  data = Placed,
  spread = FALSE,
  smoother.args = list(lty = 2),
  pch = 19,
  main = "Scatterplot of Salary vs Years of Work Experience",
  xlab = "Years of Work Experience",
  ylab = "Salary"
)

A few points to be noted are:

a.) The Fall MBA Average as a variable is slightly negatively correlated with Salary of the student.

b.) Spring MBA Average is slightly positively correlated with Salary.

c.) On the other hand, Age and Years of Work Experience are strongly positively correlated with Salary.

d.) At the end, the salary is hardly correlated with the Total GMAT Score and Quartile Ranking.

The correlations can be further explored with the help of corrgram.

  10. Constructing a Corrgram to see the intercorrelations pictorially
library(corrgram)
# Corrgram of the Placed data in its original column order: shaded cells
# below the diagonal, pies above it, min/max values on the diagonal.
corrgram(
  Placed,
  order = FALSE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  diag.panel = panel.minmax,
  text.panel = panel.txt,
  main = "Corrgram of Salaries intercorrelations"
)

On studying the corrgram closely, we can infer the following:

a.) Age and Years of Work Experience are highly correlated. ‘Work Experience’ is the more relevant metric as far as the case is concerned.

b.) Total GMAT Score, Quantitative GMAT Percentile, Verbal GMAT Percentile and Overall GMAT Percentile are highly correlated as well. Hence, we choose to include ‘Total GMAT Score’ and reject the rest.

c.) Spring MBA Average, Fall MBA Average and Quartile Ranking are highly correlated too. Among them, we choose ‘Quartile Ranking’ to include in our regression further.

d.) There seem to exist some correlations between Salary and variables like Age, Total GMAT Score and Quartile Ranking. This can be seen clearly in the following scatter plots:

This doesn't give the complete picture, since categorical variables like Sex and First Language Spoken have not been considered. Here are some figures which will help us understand it better:

  11. Salaries aggregated by Sex of the student and First Language Spoken by the student
# Mean salary of placed students for each level of sex.
aggregate(
  Placed$salary,
  by = list(Sex = Placed$sex),
  FUN = mean
)
##   Sex         x
## 1   1 104400.32
## 2   2  99150.27
# Mean salary of placed students for each level of frstlang.
aggregate(
  Placed$salary,
  by = list(First_Language_Spoken = Placed$frstlang),
  FUN = mean
)
##   First_Language_Spoken        x
## 1                     1 102021.8
## 2                     2 110723.5

The average salary of a male student is more than that of a female and there seems to be no great difference in the salaries with respect to the native language spoken by the student.

If not on the salaries solely, these categorical variables can have an impact when in combination with other variables:

12A. Scatterplot of Salary vs Age|sex

# Salary against age, with points grouped by sex.
scatterplot(
  salary ~ age | sex,
  data = Placed,
  spread = FALSE,
  smoother.args = list(lty = 2),
  pch = 19,
  main = "Scatterplot of Salary vs Age",
  xlab = "Age",
  ylab = "Salary"
)

12B. Scatterplot of Salary vs Total GMAT Score|First language

# Salary against total GMAT score, with points grouped by first language.
scatterplot(
  salary ~ gmat_tot | frstlang,
  data = Placed,
  spread = FALSE,
  smoother.args = list(lty = 2),
  pch = 19,
  main = "Scatterplot of Salary vs Total GMAT Score",
  xlab = "Total GMAT Score",
  ylab = "Salary"
)

After visualizing the data, we need to formulate a hypothesis which needs to be tested with the help of a regression model:

“What factors influenced the starting salaries that the students got after graduating from the MBA program?”

In order to be able to answer this question, we need to develop a null hypothesis which states that

“There exist no factors that influence the starting salaries received by the students post MBA”

  13. Performing T-Test to test the hypothesis by taking Years of Work Experience as a factor
library(MASS)
# NOTE(review): called with two numeric vectors, t.test() runs a Welch
# two-sample test of mean(salary) against mean(work_yrs). That compares the
# averages of two variables measured on different scales; it does not test
# whether salary is associated with work experience. Consider
# cor.test(Placed$salary, Placed$work_yrs) or the lm() fits instead, and
# regenerate the printed output if this call is changed.
t.test(Placed$salary,Placed$work_yrs)
## 
##  Welch Two Sample t-test
## 
## data:  Placed$salary and Placed$work_yrs
## t = 78.497, df = 137, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  100431.7 105622.4
## sample estimates:
##    mean of x    mean of y 
## 1.030307e+05 3.666667e+00

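Since the p-value is less than 0.05, the test rejects the hypothesis that the two means are equal. Note, however, that this test compares the mean salary (about 103,031) with the mean number of years of work experience (about 3.67), two quantities measured on different scales. On its own it therefore does not show whether salaries differ with years of work experience; that relationship is examined by the regression models below.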

In order to validate this further, we need to develop a regression model as follows:

  14. Regression
# Full model: salary on the two factors plus work experience, satisfaction,
# quartile ranking and total GMAT score.
M1 <- lm(
  salary ~ sex + work_yrs + frstlang + satis + quarter + gmat_tot,
  data = Placed
)
summary(M1)
## 
## Call:
## lm(formula = salary ~ sex + work_yrs + frstlang + satis + quarter + 
##     gmat_tot, data = Placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -28423  -6666  -1437   3943  91498 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 97853.222  16352.878   5.984 1.96e-08 ***
## sex2        -4630.516   2719.460  -1.703    0.091 .  
## work_yrs     2187.366    445.426   4.911 2.65e-06 ***
## frstlang2    7685.888   3970.182   1.936    0.055 .  
## satis        -124.958   1105.669  -0.113    0.910    
## quarter     -1467.974   1059.401  -1.386    0.168    
## gmat_tot        2.808     22.885   0.123    0.903    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13860 on 131 degrees of freedom
## Multiple R-squared:  0.2278, Adjusted R-squared:  0.1924 
## F-statistic:  6.44 on 6 and 131 DF,  p-value: 5.719e-06

After removing the statistically insignificant variables,

# Reduced model: satis, quarter and gmat_tot dropped from M1.
M2 <- lm(
  salary ~ sex + work_yrs + frstlang,
  data = Placed
)
summary(M2)
## 
## Call:
## lm(formula = salary ~ sex + work_yrs + frstlang, data = Placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -31346  -7497  -1092   4654  88706 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  95113.7     2140.2  44.442  < 2e-16 ***
## sex2         -4272.1     2689.3  -1.589   0.1145    
## work_yrs      2252.3      433.1   5.201 7.26e-07 ***
## frstlang2     6668.9     3701.3   1.802   0.0738 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13800 on 134 degrees of freedom
## Multiple R-squared:  0.2161, Adjusted R-squared:  0.1985 
## F-statistic: 12.31 on 3 and 134 DF,  p-value: 3.661e-07

Relying on just three variables to explain the salaries is risky for the regression. Hence, we add back one of the previously dropped variables at a time and compare the resulting models.

# M2 plus quartile ranking.
M3 <- lm(
  salary ~ sex + work_yrs + frstlang + quarter,
  data = Placed
)
summary(M3)
## 
## Call:
## lm(formula = salary ~ sex + work_yrs + frstlang + quarter, data = Placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -28751  -6515  -1500   3769  91268 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    98984       3474  28.491  < 2e-16 ***
## sex2           -4660       2694  -1.730   0.0859 .  
## work_yrs        2174        435   4.998 1.79e-06 ***
## frstlang2       7717       3762   2.051   0.0422 *  
## quarter        -1480       1049  -1.411   0.1606    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13750 on 133 degrees of freedom
## Multiple R-squared:  0.2276, Adjusted R-squared:  0.2044 
## F-statistic: 9.798 on 4 and 133 DF,  p-value: 5.608e-07
# M2 plus total GMAT score.
M4 <- lm(
  salary ~ sex + work_yrs + frstlang + gmat_tot,
  data = Placed
)
summary(M4)
## 
## Call:
## lm(formula = salary ~ sex + work_yrs + frstlang + gmat_tot, data = Placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -31041  -7532  -1297   4533  88980 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 92114.887  14637.024   6.293 4.18e-09 ***
## sex2        -4252.827   2700.518  -1.575   0.1177    
## work_yrs     2265.501    439.319   5.157 8.90e-07 ***
## frstlang2    6811.001   3777.404   1.803   0.0736 .  
## gmat_tot        4.729     22.834   0.207   0.8362    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13850 on 133 degrees of freedom
## Multiple R-squared:  0.2163, Adjusted R-squared:  0.1927 
## F-statistic: 9.177 on 4 and 133 DF,  p-value: 1.405e-06
# M2 plus satisfaction with the MBA program.
M5 <- lm(
  salary ~ sex + work_yrs + frstlang + satis,
  data = Placed
)
summary(M5)
## 
## Call:
## lm(formula = salary ~ sex + work_yrs + frstlang + satis, data = Placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -31094  -7233  -1258   4672  88838 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  96134.6     6398.9  15.024  < 2e-16 ***
## sex2         -4249.6     2702.3  -1.573   0.1182    
## work_yrs      2259.3      436.7   5.174 8.24e-07 ***
## frstlang2     6508.9     3833.1   1.698   0.0918 .  
## satis         -187.0     1104.3  -0.169   0.8658    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13850 on 133 degrees of freedom
## Multiple R-squared:  0.2162, Adjusted R-squared:  0.1926 
## F-statistic: 9.173 on 4 and 133 DF,  p-value: 1.415e-06

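Hence, we obtained our regression equation as the one shown by M5. It says that “SEX”, “YEARS OF WORK EXPERIENCE”, “FIRST LANGUAGE SPOKEN” and “SATISFACTION LEVEL FROM MBA PROGRAM” affect the salaries. Note, however, that in M5 only years of work experience is significant at the 5% level (satisfaction has p = 0.87), and that of the three candidate models M3 has the highest adjusted R-squared (0.2044 versus 0.1927 for M4 and 0.1926 for M5).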