#title: "MBA_Starting_Salaries_Data"
#author: "Ayush Bose"
#date: "January 23, 2018"
#Reading the data set
# The file name is a single literal, so it is handed to read.csv() directly;
# the path is resolved relative to the current working directory.
mydata.df <- read.csv("MBA_Starting_Salaries_Data.csv")
# View() opens a spreadsheet-style viewer, which only makes sense in an
# interactive session; skip it when the script is run non-interactively.
if (interactive()) {
  View(mydata.df)
}
#Calculating summary statistics
# Prints the minimum, quartiles, mean and maximum of every column.
# NOTE(review): salary contains the values 998/999 and satis contains 998
# (see the str() output); those values are excluded as non-salaries when
# placed.df is built, so the means and maxima printed for these two columns
# presumably mix real values with placeholder codes -- confirm against the
# data dictionary.
summary(mydata.df)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:580.0 1st Qu.:72.00
## Median :27.00 Median :1.000 Median :620.0 Median :83.00
## Mean :27.36 Mean :1.248 Mean :619.5 Mean :80.64
## 3rd Qu.:29.00 3rd Qu.:1.000 3rd Qu.:660.0 3rd Qu.:93.00
## Max. :48.00 Max. :2.000 Max. :790.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :16.00 Min. : 0.0 Min. :2.000 Min. :0.000
## 1st Qu.:71.00 1st Qu.:78.0 1st Qu.:2.708 1st Qu.:2.750
## Median :81.00 Median :87.0 Median :3.000 Median :3.000
## Mean :78.32 Mean :84.2 Mean :3.025 Mean :3.062
## 3rd Qu.:91.00 3rd Qu.:94.0 3rd Qu.:3.300 3rd Qu.:3.250
## Max. :99.00 Max. :99.0 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0
## 1st Qu.:1.250 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 0
## Median :2.000 Median : 3.000 Median :1.000 Median : 999
## Mean :2.478 Mean : 3.872 Mean :1.117 Mean : 39026
## 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.:1.000 3rd Qu.: 97000
## Max. :4.000 Max. :22.000 Max. :2.000 Max. :220000
## satis
## Min. : 1.0
## 1st Qu.: 5.0
## Median : 6.0
## Mean :172.2
## 3rd Qu.: 7.0
## Max. :998.0
# Show the structure of the data frame: its dimensions, plus the type and
# first few values of each column.
str(mydata.df)
## 'data.frame': 274 obs. of 13 variables:
## $ age : int 23 24 24 24 24 24 25 25 25 25 ...
## $ sex : int 2 1 1 1 2 1 1 2 1 1 ...
## $ gmat_tot: int 620 610 670 570 710 640 610 650 630 680 ...
## $ gmat_qpc: int 77 90 99 56 93 82 89 88 79 99 ...
## $ gmat_vpc: int 87 71 78 81 98 89 74 89 91 81 ...
## $ gmat_tpc: int 87 87 95 75 98 91 87 92 89 96 ...
## $ s_avg : num 3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
## $ f_avg : num 3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
## $ quarter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ work_yrs: int 2 2 2 1 2 2 2 2 2 2 ...
## $ frstlang: int 1 1 1 1 1 1 1 1 2 1 ...
## $ salary : int 0 0 0 0 999 0 0 0 999 998 ...
## $ satis : int 7 6 6 7 5 6 5 6 4 998 ...
#Drawing histograms and box plots to visualize the distribution of each variable independently
# Each entry names a column of mydata.df together with the histogram title
# and x-axis label used for it. For every entry a histogram is drawn first,
# followed by an unlabelled box plot of the same column, in the order listed.
# local() keeps the helper variables out of the global workspace.
local({
  panels <- list(
    list(column = "age", title = "Visualization of Age", label = "Age"),
    list(column = "gmat_tot", title = "Visualization of GmatTotal",
         label = "gmat_tot"),
    list(column = "work_yrs", title = "Visualization of Work Experience",
         label = "work_yrs"),
    list(column = "salary", title = "Visualization of salary",
         label = "salary"),
    list(column = "satis", title = "Visualization of Satisfaction",
         label = "satis")
  )
  for (panel in panels) {
    values <- mydata.df[[panel$column]]
    hist(values, main = panel$title, xlab = panel$label, ylab = "Count",
         breaks = 10, col = "peachpuff")
    boxplot(values)
  }
  invisible(NULL)
})


#Drawing Scatter Plots to understand how the variables are correlated pair-wise
# Scatter-plot matrix of the ten listed columns of mydata.df; cex = 0.6
# shrinks the plotting symbols.
pairs(
  formula = ~ age + sex + gmat_tot + s_avg + f_avg +
    quarter + work_yrs + frstlang + salary + satis,
  data = mydata.df,
  cex = 0.6
)
#Drawing a Corrgram; Creating a Variance-Covariance Matrix
library(corrgram)
## Warning: package 'corrgram' was built under R version 3.4.3

# Correlogram of every column in mydata.df: pie glyphs in the upper
# triangle, shaded cells in the lower triangle, and the text panel for the
# variable labels. order = TRUE asks corrgram to reorder the variables.
corrgram(
  mydata.df,
  main = "Corrgram of Data",
  order = TRUE,
  upper.panel = panel.pie,
  lower.panel = panel.shade,
  text.panel = panel.txt
)

# Variance-covariance matrix of all columns (variances on the diagonal).
# NOTE(review): this is computed on the raw data, so the salary and satis
# rows/columns include the 998/999 values present in those columns --
# presumably placeholder codes rather than real measurements; confirm.
cov(mydata.df)
## age sex gmat_tot gmat_qpc
## age 1.376904e+01 -4.513248e-02 -3.115879e+01 -1.192655e+01
## sex -4.513248e-02 1.872677e-01 -1.328841e+00 -1.053769e+00
## gmat_tot -3.115879e+01 -1.328841e+00 3.310688e+03 6.200233e+02
## gmat_qpc -1.192655e+01 -1.053769e+00 6.200233e+02 2.210731e+02
## gmat_vpc -2.763643e+00 5.463758e-01 7.260006e+02 3.814826e+01
## gmat_tpc -8.839978e+00 -4.908960e-02 6.839911e+02 1.357997e+02
## s_avg 2.116874e-01 2.096227e-02 2.480257e+00 -1.691233e-01
## f_avg -3.399348e-02 2.082698e-02 3.154688e+00 5.753854e-01
## quarter -2.045935e-01 -6.414267e-02 -5.891153e+00 6.001979e-01
## work_yrs 1.029494e+01 -1.580172e-02 -3.391634e+01 -1.137186e+01
## frstlang 6.796610e-02 2.138980e-04 -2.499933e+00 6.646346e-01
## salary -1.183042e+04 1.518264e+03 -1.611600e+05 -3.335823e+04
## satis -1.763499e+02 -8.780808e+00 1.765263e+03 3.348371e+02
## gmat_vpc gmat_tpc s_avg f_avg
## age -2.7636427 -8.8399775 0.21168739 -0.03399348
## sex 0.5463758 -0.0490896 0.02096227 0.02082698
## gmat_tot 726.0006417 683.9910698 2.48025721 3.15468838
## gmat_qpc 38.1482581 135.7996845 -0.16912329 0.57538542
## gmat_vpc 284.2481217 157.4932488 1.31357023 0.67207000
## gmat_tpc 157.4932488 196.6057057 0.62710008 0.58698618
## s_avg 1.3135702 0.6271001 0.14521760 0.11016898
## f_avg 0.6720700 0.5869862 0.11016898 0.27567237
## quarter -3.2676666 -1.2923719 -0.32237213 -0.26080880
## work_yrs -3.6181653 -7.8575172 0.15926392 -0.06628700
## frstlang -2.1145691 -0.4663244 -0.01671372 -0.00626026
## salary -5273.8523836 3522.7500067 2831.60098580 787.65597177
## satis 392.3562739 484.2466779 -4.62884495 2.12532927
## quarter work_yrs frstlang salary
## age -2.045935e-01 10.29493864 6.796610e-02 -1.183042e+04
## sex -6.414267e-02 -0.01580172 2.138980e-04 1.518264e+03
## gmat_tot -5.891153e+00 -33.91633914 -2.499933e+00 -1.611600e+05
## gmat_qpc 6.001979e-01 -11.37186171 6.646346e-01 -3.335823e+04
## gmat_vpc -3.267667e+00 -3.61816529 -2.114569e+00 -5.273852e+03
## gmat_tpc -1.292372e+00 -7.85751718 -4.663244e-01 3.522750e+03
## s_avg -3.223721e-01 0.15926392 -1.671372e-02 2.831601e+03
## f_avg -2.608088e-01 -0.06628700 -6.260260e-03 7.876560e+02
## quarter 1.232119e+00 -0.30866822 3.553381e-02 -9.296214e+03
## work_yrs -3.086682e-01 10.44882490 -2.898318e-02 1.486147e+03
## frstlang 3.553381e-02 -0.02898318 1.035266e-01 -1.419586e+03
## salary -9.296214e+03 1486.14704152 -1.419586e+03 2.596062e+09
## satis -5.227133e-03 -131.24080907 9.484532e+00 -6.347115e+06
## satis
## age -1.763499e+02
## sex -8.780808e+00
## gmat_tot 1.765263e+03
## gmat_qpc 3.348371e+02
## gmat_vpc 3.923563e+02
## gmat_tpc 4.842467e+02
## s_avg -4.628845e+00
## f_avg 2.125329e+00
## quarter -5.227133e-03
## work_yrs -1.312408e+02
## frstlang 9.484532e+00
## salary -6.347115e+06
## satis 1.380974e+05
# var() applied to the whole data frame; the matrix printed below is
# identical to the cov(mydata.df) output above.
var(mydata.df)
## age sex gmat_tot gmat_qpc
## age 1.376904e+01 -4.513248e-02 -3.115879e+01 -1.192655e+01
## sex -4.513248e-02 1.872677e-01 -1.328841e+00 -1.053769e+00
## gmat_tot -3.115879e+01 -1.328841e+00 3.310688e+03 6.200233e+02
## gmat_qpc -1.192655e+01 -1.053769e+00 6.200233e+02 2.210731e+02
## gmat_vpc -2.763643e+00 5.463758e-01 7.260006e+02 3.814826e+01
## gmat_tpc -8.839978e+00 -4.908960e-02 6.839911e+02 1.357997e+02
## s_avg 2.116874e-01 2.096227e-02 2.480257e+00 -1.691233e-01
## f_avg -3.399348e-02 2.082698e-02 3.154688e+00 5.753854e-01
## quarter -2.045935e-01 -6.414267e-02 -5.891153e+00 6.001979e-01
## work_yrs 1.029494e+01 -1.580172e-02 -3.391634e+01 -1.137186e+01
## frstlang 6.796610e-02 2.138980e-04 -2.499933e+00 6.646346e-01
## salary -1.183042e+04 1.518264e+03 -1.611600e+05 -3.335823e+04
## satis -1.763499e+02 -8.780808e+00 1.765263e+03 3.348371e+02
## gmat_vpc gmat_tpc s_avg f_avg
## age -2.7636427 -8.8399775 0.21168739 -0.03399348
## sex 0.5463758 -0.0490896 0.02096227 0.02082698
## gmat_tot 726.0006417 683.9910698 2.48025721 3.15468838
## gmat_qpc 38.1482581 135.7996845 -0.16912329 0.57538542
## gmat_vpc 284.2481217 157.4932488 1.31357023 0.67207000
## gmat_tpc 157.4932488 196.6057057 0.62710008 0.58698618
## s_avg 1.3135702 0.6271001 0.14521760 0.11016898
## f_avg 0.6720700 0.5869862 0.11016898 0.27567237
## quarter -3.2676666 -1.2923719 -0.32237213 -0.26080880
## work_yrs -3.6181653 -7.8575172 0.15926392 -0.06628700
## frstlang -2.1145691 -0.4663244 -0.01671372 -0.00626026
## salary -5273.8523836 3522.7500067 2831.60098580 787.65597177
## satis 392.3562739 484.2466779 -4.62884495 2.12532927
## quarter work_yrs frstlang salary
## age -2.045935e-01 10.29493864 6.796610e-02 -1.183042e+04
## sex -6.414267e-02 -0.01580172 2.138980e-04 1.518264e+03
## gmat_tot -5.891153e+00 -33.91633914 -2.499933e+00 -1.611600e+05
## gmat_qpc 6.001979e-01 -11.37186171 6.646346e-01 -3.335823e+04
## gmat_vpc -3.267667e+00 -3.61816529 -2.114569e+00 -5.273852e+03
## gmat_tpc -1.292372e+00 -7.85751718 -4.663244e-01 3.522750e+03
## s_avg -3.223721e-01 0.15926392 -1.671372e-02 2.831601e+03
## f_avg -2.608088e-01 -0.06628700 -6.260260e-03 7.876560e+02
## quarter 1.232119e+00 -0.30866822 3.553381e-02 -9.296214e+03
## work_yrs -3.086682e-01 10.44882490 -2.898318e-02 1.486147e+03
## frstlang 3.553381e-02 -0.02898318 1.035266e-01 -1.419586e+03
## salary -9.296214e+03 1486.14704152 -1.419586e+03 2.596062e+09
## satis -5.227133e-03 -131.24080907 9.484532e+00 -6.347115e+06
## satis
## age -1.763499e+02
## sex -8.780808e+00
## gmat_tot 1.765263e+03
## gmat_qpc 3.348371e+02
## gmat_vpc 3.923563e+02
## gmat_tpc 4.842467e+02
## s_avg -4.628845e+00
## f_avg 2.125329e+00
## quarter -5.227133e-03
## work_yrs -1.312408e+02
## frstlang 9.484532e+00
## salary -6.347115e+06
## satis 1.380974e+05
#Taking a subset of the dataset consisting of only those people who actually got a job.
# Keep only rows with a positive salary that is not 998 or 999.
# NOTE(review): 998/999 are presumably codes for an unreported salary --
# confirm against the data dictionary.
placed.df <- subset(mydata.df, salary > 0 & !(salary %in% c(998, 999)))
# View() is only useful in an interactive session; skip it otherwise so the
# script can also be run non-interactively.
if (interactive()) {
  View(placed.df)
}
#Further analysis on placed.df
#Consider y = f(x). Where y = Starting Salary. Let us analyse impact of {gender; first language; prior work experience; GMAT performance; MBA performance} on y.
#Drawing contingency tables
# Number of placed students for each value of sex; naming the argument keeps
# "sex" as the header of the printed table.
mytable <- table(sex = placed.df$sex)
mytable
## sex
## 1 2
## 72 31
# Number of placed students for each value of frstlang (reuses mytable).
mytable <- table(frstlang = placed.df$frstlang)
mytable
## frstlang
## 1 2
## 96 7
# Mean starting salary of placed students for each value of sex.
aggregate(salary ~ sex, data = placed.df, FUN = mean)
## sex salary
## 1 1 104970.97
## 2 2 98524.39
# Mean starting salary of placed students for each value of frstlang.
aggregate(salary ~ frstlang, data = placed.df, FUN = mean)
## frstlang salary
## 1 1 101748.6
## 2 2 120614.3
# Mean starting salary for every distinct (s_avg, f_avg) combination.
# NOTE(review): this yields 56 groups for 103 placed students, so most of the
# means are over one or two observations.
aggregate(salary ~ s_avg + f_avg, data = placed.df, FUN = mean)
## s_avg f_avg salary
## 1 4.00 0.00 146000.00
## 2 2.20 2.00 105000.00
## 3 2.40 2.00 85000.00
## 4 2.40 2.25 90000.00
## 5 2.30 2.50 98000.00
## 6 2.50 2.50 96000.00
## 7 2.60 2.50 107700.00
## 8 2.70 2.50 90000.00
## 9 3.50 2.67 86000.00
## 10 2.40 2.75 99500.00
## 11 2.50 2.75 220000.00
## 12 2.60 2.75 114155.00
## 13 2.70 2.75 90750.00
## 14 2.80 2.75 104833.33
## 15 2.90 2.75 91085.33
## 16 3.00 2.75 97250.00
## 17 2.91 2.83 105000.00
## 18 2.50 3.00 77000.00
## 19 2.60 3.00 100000.00
## 20 2.70 3.00 98000.00
## 21 2.80 3.00 99700.00
## 22 2.90 3.00 101400.00
## 23 3.00 3.00 105000.00
## 24 3.09 3.00 100000.00
## 25 3.10 3.00 112450.00
## 26 3.20 3.00 109000.00
## 27 3.30 3.00 105000.00
## 28 3.40 3.00 100000.00
## 29 3.50 3.00 113000.00
## 30 2.80 3.25 98000.00
## 31 2.90 3.25 93000.00
## 32 3.00 3.25 107500.00
## 33 3.20 3.25 105166.67
## 34 3.27 3.25 95000.00
## 35 3.30 3.25 101416.67
## 36 3.40 3.25 90000.00
## 37 3.50 3.25 97333.33
## 38 3.10 3.33 82000.00
## 39 2.90 3.50 107300.00
## 40 3.09 3.50 107000.00
## 41 3.10 3.50 96500.00
## 42 3.20 3.50 95000.00
## 43 3.30 3.50 95750.00
## 44 3.45 3.50 105000.00
## 45 3.50 3.50 111500.00
## 46 3.60 3.50 110500.00
## 47 3.80 3.50 105000.00
## 48 3.50 3.60 85000.00
## 49 3.70 3.60 106000.00
## 50 3.40 3.67 100000.00
## 51 3.60 3.67 95000.00
## 52 3.40 3.75 93000.00
## 53 3.50 3.75 85000.00
## 54 3.60 3.75 162000.00
## 55 3.70 4.00 115000.00
## 56 3.80 4.00 120000.00
# Mean starting salary of placed students for each distinct GMAT total score.
aggregate(salary ~ gmat_tot, data = placed.df, FUN = mean)
## gmat_tot salary
## 1 500 158250.0
## 2 520 78256.0
## 3 530 99500.0
## 4 540 104000.0
## 5 550 112236.7
## 6 560 94000.0
## 7 570 103857.1
## 8 580 99875.0
## 9 590 97000.0
## 10 600 107666.7
## 11 610 96200.0
## 12 620 104108.3
## 13 630 105812.5
## 14 640 110000.0
## 15 650 101285.7
## 16 660 92480.0
## 17 670 100642.9
## 18 680 102166.7
## 19 700 122333.3
## 20 710 101250.0
## 21 720 85000.0
#Running t-tests
# Two-sample t-test of starting salary between the two sex groups of the
# placed students.
t.test(formula = salary ~ sex, data = placed.df)
##
## Welch Two Sample t-test
##
## data: salary by sex
## t = 1.3628, df = 38.115, p-value = 0.1809
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3128.55 16021.72
## sample estimates:
## mean in group 1 mean in group 2
## 104970.97 98524.39
# Two-sample t-test of starting salary between the two frstlang groups of the
# placed students. Group 2 holds only 7 of the 103 students (see the
# contingency table), so the interval for this comparison is very wide.
t.test(formula = salary ~ frstlang, data = placed.df)
##
## Welch Two Sample t-test
##
## data: salary by frstlang
## t = -1.1202, df = 6.0863, p-value = 0.3049
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -59933.62 22202.25
## sample estimates:
## mean in group 1 mean in group 2
## 101748.6 120614.3
#Regression Model 1 :
# Linear model of starting salary on prior work experience, GMAT total and
# the two MBA grade averages, fitted to the placed students only.
fit <- lm(
  formula = salary ~ work_yrs + gmat_tot + s_avg + f_avg,
  data = placed.df
)
summary(fit)
##
## Call:
## lm(formula = salary ~ work_yrs + gmat_tot + s_avg + f_avg, data = placed.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -36351 -8173 -1170 3864 87090
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 98635.48 22643.60 4.356 3.26e-05 ***
## work_yrs 2579.88 577.97 4.464 2.16e-05 ***
## gmat_tot -14.98 32.53 -0.460 0.646
## s_avg 2422.16 5033.78 0.481 0.631
## f_avg -1087.60 3889.90 -0.280 0.780
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16200 on 98 degrees of freedom
## Multiple R-squared: 0.2098, Adjusted R-squared: 0.1776
## F-statistic: 6.506 on 4 and 98 DF, p-value: 0.0001098
#Making Regression model 2 :
# Linear model of starting salary on work experience, age, spring grade
# average and first language, fitted to the placed students only.
# Note that this reassigns `fit`, replacing the Model 1 object.
fit <- lm(
  formula = salary ~ work_yrs + age + s_avg + frstlang,
  data = placed.df
)
summary(fit)
##
## Call:
## lm(formula = salary ~ work_yrs + age + s_avg + frstlang, data = placed.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32957 -9005 -1362 4613 76947
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34470.6 26189.2 1.316 0.1912
## work_yrs 746.2 1121.1 0.666 0.5072
## age 1833.3 1085.7 1.689 0.0945 .
## s_avg 2207.1 4233.5 0.521 0.6033
## frstlang 9270.9 6894.3 1.345 0.1818
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15630 on 98 degrees of freedom
## Multiple R-squared: 0.2647, Adjusted R-squared: 0.2347
## F-statistic: 8.818 on 4 and 98 DF, p-value: 4.008e-06
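#Model 2 is the better model by adjusted R-squared (0.2347 vs 0.1776 for Model 1), although none of its individual coefficients is significant at the 5% level.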
#Compare the people who did not get a job with those who did. Here, we are not analysing what drives a higher salary; instead, we are analysing the two groups: got a job / did not get a job.
# Students with salary == 0 are treated as not placed.
notplaced.df <- subset(mydata.df, salary == 0)
# View() is only useful in an interactive session; skip it otherwise so the
# script can also be run non-interactively.
if (interactive()) {
  View(notplaced.df)
}
# Two-sample t-test of GMAT total between the two frstlang groups within the
# not-placed students.
# NOTE(review): this compares language groups inside notplaced.df only; it
# does not compare placed with not-placed students as the heading describes.
t.test(gmat_tot ~ frstlang, data = notplaced.df)
##
## Welch Two Sample t-test
##
## data: gmat_tot by frstlang
## t = 0.51644, df = 7.9236, p-value = 0.6197
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -49.86769 78.58720
## sample estimates:
## mean in group 1 mean in group 2
## 615.6098 601.2500