Reading the file MBA Starting Salaries Data.csv and generating summary

# Load the MBA starting-salary survey and print descriptive statistics for
# every column. The file name is a plain literal, so it is handed to
# read.csv() directly; the former paste(..., sep = "") wrapper around a single
# string returned that same string unchanged.
start_salary <- read.csv("MBA Starting Salaries Data.csv")
library(psych)
# describe() comes from psych and reports n, mean, sd, median, etc. per column.
describe(start_salary)
##          vars   n     mean       sd median  trimmed     mad min    max
## age         1 274    27.36     3.71     27    26.76    2.97  22     48
## sex         2 274     1.25     0.43      1     1.19    0.00   1      2
## gmat_tot    3 274   619.45    57.54    620   618.86   59.30 450    790
## gmat_qpc    4 274    80.64    14.87     83    82.31   14.83  28     99
## gmat_vpc    5 274    78.32    16.86     81    80.33   14.83  16     99
## gmat_tpc    6 274    84.20    14.02     87    86.12   11.86   0     99
## s_avg       7 274     3.03     0.38      3     3.03    0.44   2      4
## f_avg       8 274     3.06     0.53      3     3.09    0.37   0      4
## quarter     9 274     2.48     1.11      2     2.47    1.48   1      4
## work_yrs   10 274     3.87     3.23      3     3.29    1.48   0     22
## frstlang   11 274     1.12     0.32      1     1.02    0.00   1      2
## salary     12 274 39025.69 50951.56    999 33607.86 1481.12   0 220000
## satis      13 274   172.18   371.61      6    91.50    1.48   1    998
##           range  skew kurtosis      se
## age          26  2.16     6.45    0.22
## sex           1  1.16    -0.66    0.03
## gmat_tot    340 -0.01     0.06    3.48
## gmat_qpc     71 -0.92     0.30    0.90
## gmat_vpc     83 -1.04     0.74    1.02
## gmat_tpc     99 -2.28     9.02    0.85
## s_avg         2 -0.06    -0.38    0.02
## f_avg         4 -2.08    10.85    0.03
## quarter       3  0.02    -1.35    0.07
## work_yrs     22  2.78     9.80    0.20
## frstlang      1  2.37     3.65    0.02
## salary   220000  0.70    -1.05 3078.10
## satis       997  1.77     1.13   22.45

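**The summary values for Salary and Satisfaction are not meaningful at this stage: both columns still contain placeholder codes (998/999) for missing responses, which distort the statistics.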

Segregating different data frames

# Split the survey into analysis subsets using the sentinel codes that the
# filters below rely on:
#   salary == 0    student did not get placed
#   salary == 998  student did not answer the survey
#   salary == 999  student answered but did not disclose the salary
#   salary  > 999  an actual starting salary (student got placed)
# which() is kept on purpose: it drops rows whose comparison is NA instead of
# returning all-NA rows.

# Dataframe for MBA students who got placed
Placed <- start_salary[which(start_salary$salary > 999), ]

# Dataframe for MBA students who did not get placed
Not_Placed <- start_salary[which(start_salary$salary == 0), ]

# Dataframe for MBA students who did not disclose their starting salary
Salary_not_disclosed <- start_salary[which(start_salary$salary == 999), ]

# Dataframe for MBA students who did not answer the survey
Not_answered <- start_salary[which(start_salary$salary == 998), ]

# Dataframe for MBA students whose starting salary is known
# (either not placed, i.e. 0, or a real salary above the sentinel codes)
Salary_known <- start_salary[which((start_salary$salary == 0) |
                                     (start_salary$salary > 999)), ]

# Dataframe for MBA students whose satisfaction is known
# NOTE(review): assumes valid ratings are below 8 and larger values are
# missing-response codes - confirm against the data dictionary.
Satis_known <- start_salary[which(start_salary$satis < 8), ]

# View() opens a data viewer window, which is only meaningful in an
# interactive session; skip it when the script runs non-interactively
# (e.g. via Rscript) so the analysis does not stop on a missing display.
if (interactive()) {
  View(Placed)
  View(Not_Placed)
  View(Salary_not_disclosed)
  View(Not_answered)
  View(Salary_known)
  View(Satis_known)
}

______________________________________________________________________________________________________ ** Students who did not participate in the survey or did not disclose their starting salary have no usable salary value, so they are excluded from the Salary_known data frame.

** Students who did not participate in the survey have no satisfaction level, so they are excluded from the Satis_known data frame. ________________________________________________________________________________________________________

Box plot for Age

# Horizontal box plot of the age column for all surveyed students.
boxplot(
  start_salary[["age"]],
  main = "Box Plot for Age",
  xlab = "Age",
  col = "tomato",
  horizontal = TRUE
)

## Bar plot for Sex

# Count students per sex code and draw the counts as a bar plot.
counts_sex <- table(start_salary$sex)
# The y-axis limit is derived from the data. The previous fixed upper limit of
# 105 is lower than the larger group's count (the describe() summary implies
# roughly 206 of 274 students carry sex code 1), so the tallest bar ran past
# the end of the axis.
# NOTE(review): the bar labels assume code 1 = Male and 2 = Female - confirm.
barplot(counts_sex,
        width = 1, space = 1,
        main = "Gender Distribution", xlab = "Gender",
        col = c("darkblue", "pink"),
        names.arg = c("Male", "Female"),
        ylim = c(0, max(counts_sex) * 1.1),
        xlim = c(0, 10))

## Box plot for GMAT Total

# Horizontal box plot of the total GMAT score for all surveyed students.
boxplot(
  start_salary[["gmat_tot"]],
  main = "Box Plot for GMAT Total",
  xlab = "Score",
  col = "maroon",
  horizontal = TRUE
)

Box plot for Quantitative GMAT Percentile

# Horizontal box plot of the quantitative GMAT percentile.
boxplot(
  start_salary[["gmat_qpc"]],
  main = "Box Plot for Quantitative GMAT Percentile",
  xlab = "Percentile",
  col = "gold",
  horizontal = TRUE
)

Box plot for Verbal GMAT Percentile

# Horizontal box plot of the verbal GMAT percentile.
boxplot(
  start_salary[["gmat_vpc"]],
  main = "Box Plot for Verbal GMAT Percentile ",
  xlab = "Percentile",
  col = "dodgerblue",
  horizontal = TRUE
)

Box plot for Overall GMAT Percentile

# Horizontal box plot of the overall GMAT percentile.
boxplot(
  start_salary[["gmat_tpc"]],
  main = "Box Plot for Overall GMAT Percentile ",
  xlab = "Percentile",
  col = "tan",
  horizontal = TRUE
)

Box plot for Spring MBA Average

# Horizontal box plot of the Spring MBA average.
boxplot(
  start_salary[["s_avg"]],
  main = "Box Plot for Spring MBA Average ",
  xlab = "Average",
  col = "navy",
  horizontal = TRUE
)

Box plot for Fall MBA Average

# Horizontal box plot of the Fall MBA average.
boxplot(
  start_salary[["f_avg"]],
  main = "Box Plot for Fall MBA Average ",
  xlab = "Average",
  col = "peachpuff",
  horizontal = TRUE
)

Box plot for Years of Work Experience

# Horizontal box plot of the years of work experience.
boxplot(
  start_salary[["work_yrs"]],
  main = "Box Plot for Years of Work Experience ",
  xlab = "Years",
  col = "chocolate",
  horizontal = TRUE
)

Bar plot for First Language as English

# Count students per first-language code and draw the counts as a bar plot.
counts_lang <- table(start_salary$frstlang)
# The y-axis limit is derived from the data. The previous fixed upper limit of
# 105 is lower than the larger group's count (the describe() summary implies
# roughly 241 of 274 students carry frstlang code 1), so the tallest bar ran
# past the end of the axis.
# NOTE(review): the bar labels assume code 1 = English and 2 = Other - confirm.
barplot(counts_lang,
        width = 1, space = 1,
        main = "First Language of Students ", xlab = "Language",
        col = c("purple", "darkgreen"),
        names.arg = c("English", "Other"),
        ylim = c(0, max(counts_lang) * 1.1),
        xlim = c(0, 10))

Box plot for Starting Salary

# Horizontal box plot of starting salary, restricted to the rows where the
# salary is known (Salary_known).
boxplot(
  Salary_known[["salary"]],
  main = "Box Plot for Starting Salary ",
  xlab = "Salary ",
  col = "darksalmon",
  horizontal = TRUE
)

Histogram of Degree of Satisfaction from MBA Program

# Histogram of the satisfaction rating for students whose rating is known.
# The ratings are whole numbers, so the bins are centred on the integers
# (…, 0.5-1.5, 1.5-2.5, …). With the former breaks = 5 hint, hist() chose the
# integers themselves as break points, and because the first bin is closed on
# both ends the two lowest rating levels were merged into a single bar.
hist(Satis_known$satis,
     breaks = seq(floor(min(Satis_known$satis, na.rm = TRUE)) - 0.5,
                  ceiling(max(Satis_known$satis, na.rm = TRUE)) + 0.5,
                  by = 1),
     col = "orange",
     xlab = "Satisfaction level,1=low 7=high",
     main = "Satisfaction  distribution")

Scatterplot Matrix

library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
# Scatterplot matrix of salary against the main numeric predictors, grouped by
# sex. Only rows with a known salary are plotted (Salary_known): the full
# start_salary table still holds the 998/999 placeholder codes in the salary
# column, which would be drawn as if they were real salaries.
scatterplotMatrix(~ salary + age + gmat_tot + s_avg + f_avg + work_yrs | sex,
                  data = Salary_known,
                  main = "Variation of Salary with other variables")

Corrgram Generation

library(corrgram)
## Warning: replacing previous import by 'magrittr::%>%' when loading
## 'dendextend'
# Correlogram of every column in Salary_known: shaded cells below the
# diagonal, pie glyphs above it, variable names on the diagonal.
corrgram(
  Salary_known,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "MBA starting salary analysis Corrgram"
)

Corrplot Generation

library(corrplot)
## corrplot 0.84 loaded
# Keep the numeric columns of interest, compute their pairwise correlations
# and draw the resulting matrix as a circle-style corrplot.
corr <- Salary_known[c(
  "age", "work_yrs",
  "gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc",
  "s_avg", "f_avg", "quarter", "satis"
)]
corr_final <- cor(x = corr)
corrplot(corr_final, method = "circle")

Pearson Chi-square Test for Salary and Sex

# Contingency table of every distinct salary value against sex, followed by
# Pearson's chi-squared test of independence on that table.
# NOTE(review): salary is treated as a categorical variable here, which gives
# many sparse cells; R warns that the approximation may be incorrect.
chi1 <- xtabs(~ salary + sex, data = Salary_known)
chisq.test(x = chi1)
## Warning in chisq.test(chi1): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  chi1
## X-squared = 55.494, df = 42, p-value = 0.07929

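-> The p-value = 0.07929 is above the conventional 0.05 threshold, so the test does not show a statistically significant dependency of Salary on the Sex of the students (the result is significant only at the 10% level). Note also the warning above: with so many distinct salary values the chi-squared approximation may be unreliable.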

Pearson Chi-square Test for Salary and Quartile Ranking

# Contingency table of every distinct salary value against quartile ranking,
# followed by Pearson's chi-squared test of independence on that table.
# NOTE(review): salary is treated as a categorical variable here, which gives
# many sparse cells; R warns that the approximation may be incorrect.
chi2 <- xtabs(~ salary + quarter, data = Salary_known)
chisq.test(x = chi2)
## Warning in chisq.test(chi2): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  chi2
## X-squared = 132.64, df = 126, p-value = 0.3252

-> The p-value = 0.3252 which signifies that Salary has no significant dependency on the Quartile Ranking

Pearson Chi-square Test for Salary and First Language

# Contingency table of every distinct salary value against first language,
# followed by Pearson's chi-squared test of independence on that table.
# NOTE(review): salary is treated as a categorical variable here, which gives
# many sparse cells; R warns that the approximation may be incorrect.
chi3 <- xtabs(~ salary + frstlang, data = Salary_known)
chisq.test(x = chi3)
## Warning in chisq.test(chi3): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  chi3
## X-squared = 62.016, df = 42, p-value = 0.02384

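-> The p-value = 0.02384 is below 0.05, which indicates a statistically significant dependency of Salary on the First Language spoken by the student at the 5% level. Because of the warning above (the chi-squared approximation may be incorrect), this result should be interpreted with caution.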

Correlation Matrix

# Pairwise correlations between all columns of Salary_known, rounded to three
# decimal places for printing.
round(cor(x = Salary_known), digits = 3)
##             age    sex gmat_tot gmat_qpc gmat_vpc gmat_tpc  s_avg  f_avg
## age       1.000 -0.032   -0.126   -0.221   -0.007   -0.132  0.164 -0.034
## sex      -0.032  1.000   -0.044   -0.168    0.099   -0.013  0.073  0.043
## gmat_tot -0.126 -0.044    1.000    0.743    0.753    0.879  0.144  0.101
## gmat_qpc -0.221 -0.168    0.743    1.000    0.175    0.691  0.019  0.130
## gmat_vpc -0.007  0.099    0.753    0.175    1.000    0.688  0.191  0.033
## gmat_tpc -0.132 -0.013    0.879    0.691    0.688    1.000  0.189  0.110
## s_avg     0.164  0.073    0.144    0.019    0.191    0.189  1.000  0.521
## f_avg    -0.034  0.043    0.101    0.130    0.033    0.110  0.521  1.000
## quarter  -0.077 -0.087   -0.084    0.009   -0.139   -0.129 -0.735 -0.382
## work_yrs  0.872 -0.024   -0.174   -0.241   -0.041   -0.166  0.159 -0.048
## frstlang  0.098 -0.008   -0.096    0.095   -0.295   -0.108 -0.126 -0.056
## salary   -0.130  0.019    0.000    0.028    0.003    0.061  0.096  0.009
## satis    -0.074 -0.062    0.080   -0.020    0.195    0.133 -0.046 -0.115
##          quarter work_yrs frstlang salary  satis
## age       -0.077    0.872    0.098 -0.130 -0.074
## sex       -0.087   -0.024   -0.008  0.019 -0.062
## gmat_tot  -0.084   -0.174   -0.096  0.000  0.080
## gmat_qpc   0.009   -0.241    0.095  0.028 -0.020
## gmat_vpc  -0.139   -0.041   -0.295  0.003  0.195
## gmat_tpc  -0.129   -0.166   -0.108  0.061  0.133
## s_avg     -0.735    0.159   -0.126  0.096 -0.046
## f_avg     -0.382   -0.048   -0.056  0.009 -0.115
## quarter    1.000   -0.126    0.090 -0.147  0.068
## work_yrs  -0.126    1.000   -0.003 -0.053 -0.008
## frstlang   0.090   -0.003    1.000  0.007 -0.136
## salary    -0.147   -0.053    0.007  1.000  0.156
## satis      0.068   -0.008   -0.136  0.156  1.000

Regression Analysis

Model 1

# Model 1: linear regression of salary on demographics, GMAT percentiles,
# both MBA averages, work experience, first language and satisfaction,
# fitted on the rows with a known salary.
regress <- lm(
  formula = salary ~ age + sex + gmat_qpc + gmat_vpc + gmat_tpc +
    s_avg + f_avg + work_yrs + frstlang + satis,
  data = Salary_known
)
summary(regress)
## 
## Call:
## lm(formula = salary ~ age + sex + gmat_qpc + gmat_vpc + gmat_tpc + 
##     s_avg + f_avg + work_yrs + frstlang + satis, data = Salary_known)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -93924 -48912  20019  44376 179796 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  40696.2    70338.2   0.579   0.5636  
## age          -4376.4     1916.4  -2.284   0.0235 *
## sex           1749.0     8733.1   0.200   0.8415  
## gmat_qpc      -246.8      442.3  -0.558   0.5775  
## gmat_vpc      -381.8      406.9  -0.938   0.3494  
## gmat_tpc       571.8      652.3   0.876   0.3819  
## s_avg        22054.8    12502.0   1.764   0.0794 .
## f_avg        -5868.9     8794.0  -0.667   0.5054  
## work_yrs      3252.9     2171.5   1.498   0.1359  
## frstlang     13151.7    15529.4   0.847   0.3982  
## satis         9967.1     5137.4   1.940   0.0539 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 52430 on 182 degrees of freedom
## Multiple R-squared:  0.07776,    Adjusted R-squared:  0.02709 
## F-statistic: 1.535 on 10 and 182 DF,  p-value: 0.1301

Model 2

# Model 2: same as Model 1 but without gmat_vpc and s_avg.
# Note that this reuses the name `regress`, replacing the Model 1 fit.
regress <- lm(
  formula = salary ~ age + sex + gmat_qpc + gmat_tpc +
    f_avg + work_yrs + frstlang + satis,
  data = Salary_known
)
summary(regress)
## 
## Call:
## lm(formula = salary ~ age + sex + gmat_qpc + gmat_tpc + f_avg + 
##     work_yrs + frstlang + satis, data = Salary_known)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -80262 -52304  20427  44021 169869 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  68519.7    67901.0   1.009   0.3142  
## age          -4385.4     1915.6  -2.289   0.0232 *
## sex           1776.0     8764.8   0.203   0.8397  
## gmat_qpc      -153.5      382.5  -0.401   0.6886  
## gmat_tpc       294.5      417.2   0.706   0.4811  
## f_avg         2627.6     7477.6   0.351   0.7257  
## work_yrs      3671.7     2171.7   1.691   0.0926 .
## frstlang     14674.3    15160.5   0.968   0.3343  
## satis         9335.3     5152.1   1.812   0.0716 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 52720 on 184 degrees of freedom
## Multiple R-squared:  0.05736,    Adjusted R-squared:  0.01637 
## F-statistic:   1.4 on 8 and 184 DF,  p-value: 0.1992

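->Neither model is statistically significant overall (F-test p-values of 0.1301 and 0.1992 are both above 0.05), and both explain very little of the variation in salary. Of the two, Model 1 is the better choice for estimation: it has Multiple R-squared = 0.07776 and Adjusted R-squared = 0.02709, compared with 0.05736 and 0.01637 for Model 2.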

Comparison of placed students with non-placed students

# Flag each student with a known salary as placed (1) or not placed (0);
# a salary of 0 marks a student who did not get placed.
Salary_known$placed <- as.numeric(Salary_known[["salary"]] != 0)
counts_placed <- table(Salary_known$placed)
# Bar plot comparing the number of not-placed and placed students.
barplot(
  counts_placed,
  width = 1,
  space = 1,
  main = "Placed vs Not-placed",
  xlab = "Status",
  col = c("red", "green"),
  names.arg = c("Not Placed", "Placed"),
  ylim = c(0, 105),
  xlim = c(0, 10)
)

Pearson Chi-square Test for Placement Status and Sex

# Cross-tabulate placement status against sex, print the table with row and
# column totals, then run Pearson's chi-squared test of independence on it.
chi4 <- xtabs(~ placed + sex, data = Salary_known)
addmargins(A = chi4)
##       sex
## placed   1   2 Sum
##    0    67  23  90
##    1    72  31 103
##    Sum 139  54 193
chisq.test(x = chi4)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  chi4
## X-squared = 0.29208, df = 1, p-value = 0.5889

-> The p-value = 0.5889 which signifies that Placement Status has no significant dependency on Sex of the students.

Pearson Chi-square Test for Placement Status and Age

# Cross-tabulate placement status against age, print the table with row and
# column totals, then run Pearson's chi-squared test of independence on it.
# NOTE(review): many age cells hold very few students; R warns that the
# chi-squared approximation may be incorrect.
chi5 <- xtabs(~ placed + age, data = Salary_known)
addmargins(A = chi5)
##       age
## placed  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  39
##    0     1   3  13   9  10  14   6  11   2   2   5   0   3   3   2   1   1
##    1     1   5  16  23  14  14   8   6   6   4   1   1   1   0   0   0   1
##    Sum   2   8  29  32  24  28  14  17   8   6   6   1   4   3   2   1   2
##       age
## placed  40  42  43  48 Sum
##    0     0   1   2   1  90
##    1     2   0   0   0 103
##    Sum   2   1   2   1 193
chisq.test(x = chi5)
## Warning in chisq.test(chi5): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  chi5
## X-squared = 27.943, df = 20, p-value = 0.1108

-> The p-value = 0.1108 which signifies that Placement Status has no significant dependency on Age of the students.

Pearson Chi-square Test for Placement Status and Work Experience

# Cross-tabulate placement status against years of work experience, print the
# table with row and column totals, then run Pearson's chi-squared test of
# independence on it.
# NOTE(review): many work_yrs cells hold very few students; R warns that the
# chi-squared approximation may be incorrect.
chi6 <- xtabs(~ placed + work_yrs, data = Salary_known)
addmargins(A = chi6)
##       work_yrs
## placed   0   1   2   3   4   5   6   7   8   9  10  11  12  13  15  16  18
##    0     1  12  22  14   9  12   2   5   2   1   1   2   2   1   0   1   1
##    1     1   8  38  21  11   7   7   1   4   0   1   0   0   0   2   2   0
##    Sum   2  20  60  35  20  19   9   6   6   1   2   2   2   1   2   3   1
##       work_yrs
## placed  22 Sum
##    0     2  90
##    1     0 103
##    Sum   2 193
chisq.test(x = chi6)
## Warning in chisq.test(chi6): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  chi6
## X-squared = 24.663, df = 17, p-value = 0.1025

-> The p-value = 0.1025 which signifies that Placement Status has no significant dependency on Years of Work Experience for the students.

Pearson Chi-square Test for Placement Status and First Language

# Cross-tabulate placement status against first language, print the table
# with row and column totals, then run Pearson's chi-squared test of
# independence on it.
chi7 <- xtabs(~ placed + frstlang, data = Salary_known)
addmargins(A = chi7)
##       frstlang
## placed   1   2 Sum
##    0    82   8  90
##    1    96   7 103
##    Sum 178  15 193
chisq.test(x = chi7)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  chi7
## X-squared = 0.074127, df = 1, p-value = 0.7854

-> The p-value = 0.7854 which signifies that Placement Status has no significant dependency on First Language of the students.


Summary

The analysis was carried out in order to know how salary is dependent on other variables:

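  1. The variables age, satis and s_avg show the strongest association with the salary earned by MBA students in Model 1: age is significant at the 5% level, while satis and s_avg are significant only at the 10% level.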

  2. As the age variable has a negative coefficient, it implies that older students are offered less in comparison to younger students.

  3. s_avg has a large positive coefficient in Model 1 (about 22000 per unit of Spring MBA average), implying that the starting salary offered is higher for students with a higher Spring MBA average; the Fall MBA average (f_avg), in comparison, shows no significant effect.

  4. There is a difference of about 10000 in salary for each unit increment in the satisfaction level of the students.

  5. The status of being placed or unplaced shows no statistically significant dependency on the variables tested (sex, age, years of work experience and first language).